From c449d55eaa5d00eb9d9e5a7ead3921819b4b1ef7 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Wed, 7 Jun 2023 10:55:43 -0600 Subject: [PATCH 01/16] adds bash script for syncing LENS reference files --- bin/mirror-lens-references.sh | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 bin/mirror-lens-references.sh diff --git a/bin/mirror-lens-references.sh b/bin/mirror-lens-references.sh new file mode 100644 index 00000000..3ee22047 --- /dev/null +++ b/bin/mirror-lens-references.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# This script assumes that `AWS_PROFILE` has been set to a AWS CLI +# profile that has write-access to the `s3://sage-igenomes` bucket +# (or using any other means of authenticating with the AWS CLI). + +prefixes=( + "antigen.garnish/" + "erv/" + "homo_sapiens/" + "mhcflurry/" + "mus_musculus/" + "viral/" +) + +echo "syncing LENS reference files" +mkdir -p "./lens/references" +wget -P ./lens https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/b0d8a24036628bd9e6c860eb531919c3/download_general_references.sh \ + https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/2238c7f4af5c5499f298315bf5080ad7/download_human_references.sh \ + https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/bf210753e3724edf6db2543eb54c3de4/download_mouse_references.sh +bash download_general_references.sh ./references +bash download_human_references.sh ./references +bash download_mouse_references.sh ./references + +for prefix in ${prefixes[*]}; do + aws s3 --region us-east-1 sync "./references/$prefix/" "s3://sage-igenomes/LENS/$prefix/" --acl public-read +done + +rm -r "./lens" From d19d1ae3fd634b0275a5c7b03f686df4c72dd330 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Wed, 7 Jun 2023 11:08:50 -0600 Subject: [PATCH 02/16] updates echo commands --- bin/mirror-lens-references.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/mirror-lens-references.sh b/bin/mirror-lens-references.sh index 3ee22047..82f08753 100644 --- a/bin/mirror-lens-references.sh +++ b/bin/mirror-lens-references.sh @@ -13,7 +13,7 @@ prefixes=( "viral/" ) -echo "syncing LENS reference files" +echo "Downloading LENS reference files" mkdir -p "./lens/references" wget -P ./lens https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/b0d8a24036628bd9e6c860eb531919c3/download_general_references.sh \ https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/2238c7f4af5c5499f298315bf5080ad7/download_human_references.sh \ @@ -23,6 +23,7 @@ bash download_human_references.sh ./references bash download_mouse_references.sh ./references for prefix in ${prefixes[*]}; do + echo "Syncing $prefix..." aws s3 --region us-east-1 sync "./references/$prefix/" "s3://sage-igenomes/LENS/$prefix/" --acl public-read done From 9d742eb691b1b2e60e74ab6946a3b2f2159ae27b Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Wed, 7 Jun 2023 11:12:47 -0600 Subject: [PATCH 03/16] updates echo commands --- bin/mirror-lens-references.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/mirror-lens-references.sh b/bin/mirror-lens-references.sh index 82f08753..8e381540 100644 --- a/bin/mirror-lens-references.sh +++ b/bin/mirror-lens-references.sh @@ -23,7 +23,7 @@ bash download_human_references.sh ./references bash download_mouse_references.sh ./references for prefix in ${prefixes[*]}; do - echo "Syncing $prefix..." + echo "Syncing $prefix..." aws s3 --region us-east-1 sync "./references/$prefix/" "s3://sage-igenomes/LENS/$prefix/" --acl public-read done From 7297a6815527b47edd533fde2acb08a24719a290 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Wed, 7 Jun 2023 11:12:54 -0600 Subject: [PATCH 04/16] updates echo commands --- bin/mirror-lens-references.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/mirror-lens-references.sh b/bin/mirror-lens-references.sh index 8e381540..82f08753 100644 --- a/bin/mirror-lens-references.sh +++ b/bin/mirror-lens-references.sh @@ -23,7 +23,7 @@ bash download_human_references.sh ./references bash download_mouse_references.sh ./references for prefix in ${prefixes[*]}; do - echo "Syncing $prefix..." + echo "Syncing $prefix..." aws s3 --region us-east-1 sync "./references/$prefix/" "s3://sage-igenomes/LENS/$prefix/" --acl public-read done From 828faa63ee5fde21389a3472ee072dfaee37cf62 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Wed, 7 Jun 2023 13:48:02 -0600 Subject: [PATCH 05/16] adds storage estimate --- bin/mirror-lens-references.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/mirror-lens-references.sh b/bin/mirror-lens-references.sh index 82f08753..3b52d998 100644 --- a/bin/mirror-lens-references.sh +++ b/bin/mirror-lens-references.sh @@ -4,6 +4,8 @@ # profile that has write-access to the `s3://sage-igenomes` bucket # (or using any other means of authenticating with the AWS CLI). +# Downloading the LENS reference files will temporarily take up ~72.1 GB of disk space. + prefixes=( "antigen.garnish/" "erv/" From 2dc008ee691d23876c4ca8748c97bd11777ce5ab Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Wed, 7 Jun 2023 13:52:39 -0600 Subject: [PATCH 06/16] address PR comments --- bin/mirror-lens-references.sh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/bin/mirror-lens-references.sh b/bin/mirror-lens-references.sh index 3b52d998..4974ea3b 100644 --- a/bin/mirror-lens-references.sh +++ b/bin/mirror-lens-references.sh @@ -17,16 +17,18 @@ prefixes=( echo "Downloading LENS reference files" mkdir -p "./lens/references" -wget -P ./lens https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/b0d8a24036628bd9e6c860eb531919c3/download_general_references.sh \ +cd ./lens +wget -P . https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/b0d8a24036628bd9e6c860eb531919c3/download_general_references.sh \ https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/2238c7f4af5c5499f298315bf5080ad7/download_human_references.sh \ https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/bf210753e3724edf6db2543eb54c3de4/download_mouse_references.sh -bash download_general_references.sh ./references -bash download_human_references.sh ./references -bash download_mouse_references.sh ./references +export RAFT_PATH="." +bash download_general_references.sh . +bash download_human_references.sh . +bash download_mouse_references.sh . for prefix in ${prefixes[*]}; do echo "Syncing $prefix..." aws s3 --region us-east-1 sync "./references/$prefix/" "s3://sage-igenomes/LENS/$prefix/" --acl public-read done - +cd .. rm -r "./lens" From 894d3cef38c0b364d6030a8e0234a68c971b7962 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Wed, 7 Jun 2023 14:46:56 -0600 Subject: [PATCH 07/16] adds link to LENS set up instructions --- bin/mirror-lens-references.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bin/mirror-lens-references.sh b/bin/mirror-lens-references.sh index 4974ea3b..14c20674 100644 --- a/bin/mirror-lens-references.sh +++ b/bin/mirror-lens-references.sh @@ -1,5 +1,8 @@ #!/bin/bash +# Refer to the Running LENS instructions and set up your environment accordingly before executing this script: +# https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/Running-LENS#populate-raft-global-references-directory + # This script assumes that `AWS_PROFILE` has been set to a AWS CLI # profile that has write-access to the `s3://sage-igenomes` bucket # (or using any other means of authenticating with the AWS CLI). From 9b0866e32ab44b59ffd0f1d205d61bcb15ecfd0d Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Thu, 29 Jun 2023 18:04:09 -0600 Subject: [PATCH 08/16] adds old ref file scripts --- lens/download_general_references.sh | 40 +++++++++ lens/download_human_references.sh | 128 ++++++++++++++++++++++++++++ lens/download_mouse_references.sh | 91 ++++++++++++++++++++ 3 files changed, 259 insertions(+) create mode 100644 lens/download_general_references.sh create mode 100644 lens/download_human_references.sh create mode 100644 lens/download_mouse_references.sh diff --git a/lens/download_general_references.sh b/lens/download_general_references.sh new file mode 100644 index 00000000..45c1ae56 --- /dev/null +++ b/lens/download_general_references.sh @@ -0,0 +1,40 @@ +cd $RAFT_PATH/references + +# mhcflurry data directory +mkdir -p mhcflurry +cd mhcflurry +mkdir -p tmp +cd tmp +wget https://github.com/openvax/mhcflurry/releases/download/pre-2.0/models_class1_presentation.20200611.tar.bz2 +tar xvf * +mv models/* ../ +cd .. +rm -rf tmp +cd .. + +# Viral reference +mkdir -p viral; cd viral +wget https://github.com/dmarron/virdetect/raw/master/reference/virus_masked_hg38.fa +wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/4dedb99984857905ee96ab1d148d7863/virdetect.cds.gff.gz +wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/ad52c657d06c12d7a3346f15b71390af/virus.cds.fa.gz +wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/9e3a49921bd325caa98dcd9211f8cdd9/virus.pep.fa.gz +gunzip * +cd .. + +# antigen.garnish data directory +# The gzip extracts to the desired directory, so no mkdir and cd required. +curl -fsSL "https://s3.amazonaws.com/get.rech.io/antigen.garnish-2.3.0.tar.gz" | tar -xvz +chmod -R 700 antigen.garnish + +# BLASTP binary (for antigen.garnish) +mkdir -p bin; cd bin +wget https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.13.0/ncbi-blast-2.13.0+-x64-linux.tar.gz +tar xvf *gz +mv ncbi*/bin/blastp . +rm -rf ncbi* +mv blastp ../antigen.garnish +cd .. +rm -rf bin + +# Make dummy_file +touch dummy_file diff --git a/lens/download_human_references.sh b/lens/download_human_references.sh new file mode 100644 index 00000000..af0cbb73 --- /dev/null +++ b/lens/download_human_references.sh @@ -0,0 +1,128 @@ +cd $1/references +mkdir -p homo_sapiens; cd homo_sapiens + +# Genomic reference +mkdir -p fasta; cd fasta +wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta +singularity pull docker://staphb/samtools:1.13 +singularity exec -B $PWD samtools*.s* samtools faidx Homo_sapiens_assembly38.fasta +# EBV removal strategy from https://bioinformatics.stackexchange.com/a/14421 +keep_ids=($(awk '{print $1}' Homo_sapiens_assembly38.fasta.fai | grep -v chrEBV)) +singularity exec -B $PWD samtools*.s* samtools faidx -o Homo_sapiens.assembly38.no_ebv.fa Homo_sapiens_assembly38.fasta "${keep_ids[@]}" +rm Homo_sapiens_assembly38.fasta +rm Homo_sapiens_assembly38.fasta.fai +rm samtools*.s* +cd .. + +export VERSION=3.6.0 && # adjust this as necessary \ + mkdir -p $GOPATH/src/github.com/sylabs && \ + cd $GOPATH/src/github.com/sylabs && \ + wget https://github.com/sylabs/singularity/releases/download/v${VERSION}/singularity-${VERSION}.tar.gz && \ + tar -xzf singularity-${VERSION}.tar.gz && \ + cd ./singularity && \ + ./mconfig + + + +# GTF/GFF3 +mkdir -p annot; cd annot +wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.annotation.gtf.gz +wget http://geve.med.u-tokai.ac.jp/download_data/gtf_m/Hsap38.geve.m_v1.gtf.bz2 +bzip2 -d Hsap38.geve.m_v1.gtf.bz2 +zcat gencode.v37.annotation.gtf.gz > gencode.v37.annotation.with.hervs.gtf; +cat Hsap38.geve.m_v1.gtf | sed 's/^/chr/g' | sed 's/CDS/transcript/g' >> gencode.v37.annotation.with.hervs.gtf +cat Hsap38.geve.m_v1.gtf | sed 's/^/chr/g' | sed 's/CDS/exon/g' >> gencode.v37.annotation.with.hervs.gtf +wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.annotation.gff3.gz +gunzip gencode.v37.annotation.gff3.gz +cd .. + +# Protein reference +mkdir -p protein; cd protein +wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.pc_translations.fa.gz +gunzip gencode.v37.pc_translations.fa.gz +cd .. + +# Reference VCFs +mkdir -p vcfs; cd vcfs +wget https://storage.googleapis.com/gatk-best-practices/somatic-hg38/1000g_pon.hg38.vcf.gz +wget https://storage.googleapis.com/gatk-best-practices/somatic-hg38/af-only-gnomad.hg38.vcf.gz +wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf +wget https://storage.googleapis.com/gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz +bgzip Homo_sapiens_assembly38.dbsnp138.vcf +cd .. + +# BEDs +# https://www.biostars.org/p/459269/#459274 +mkdir -p beds; cd beds +zgrep 'transcript_type "protein_coding"' ../annot/gencode.v37.annotation.gtf.gz | awk '($3=="exon") {printf("%s\t%s\t%s\n",$1,int($4)-1,$5);}' | sort -T . -t $'\t' -k1,1 -k2,2n | bedtools merge > hg38_exome.bed +cd .. + +# snpEff reference +mkdir -p snpeff; cd snpeff +singularity pull docker://resolwebio/snpeff:latest +mkdir -p GRCh38.GENCODEv37 +cd GRCh38.GENCODEv37 +wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/430f9d80c841721499fbcec937b0f721/snpEff.config +ln ../../annot/gencode.v37.annotation.gtf.gz genes.gtf.gz +ln ../../fasta/Homo_sapiens.assembly38.no_ebv.fa sequences.fa +cd .. +singularity exec -B $PWD snpeff*.s* /opt/snpeff/snpeff/bin/snpEff build -gtf22 -v GRCh38.GENCODEv37 -dataDir ${PWD} -c GRCh38.GENCODEv37/snpEff.config +rm GRCh38.GENCODEv37/sequences.fa +rm GRCh38.GENCODEv37/genes.gtf.gz +rm snpeff*.s* +cd .. + +# NeoSplice reference +mkdir -p neosplice; cd neosplice +### The steps below generates a peptidome specific to a GTF and reference +### FASTA which is ideal. The Python script is taxing though, and users may not +### be able to run the script. As an alternative, the "off-the-shelf" peptidome +### included with NeoSplice is provided by default. + +wget https://raw.githubusercontent.com/max555beyond/NeoSplice/master/generate_reference_peptidome.py +python -m pip install --user pyfaidx +python -m pip install --user bcbio-gff +sed 's/os.makedirs(path, 0777)/os.makedirs(path, "0777")/g' generate_reference_peptidome.py > generate_reference_peptidome.py.tmp +mv generate_reference_peptidome.py.tmp generate_reference_peptidome.py +python generate_reference_peptidome.py ../annot/gencode.v37.annotation.gff3 ../fasta/Homo_sapiens.assembly38.no_ebv.fa . +mv .peptidome_result/ peptidome.homo_sapiens +rm generate_reference_peptidome.py + +mkdir -p peptidome.homo_sapiens; cd peptidome.homo_sapiens + +wget https://github.com/Benjamin-Vincent-Lab/NeoSplice/blob/master/Reference_peptidome/reference_peptidome_8.txt.gz +wget https://github.com/Benjamin-Vincent-Lab/NeoSplice/blob/master/Reference_peptidome/reference_peptidome_9.txt.gz +wget https://github.com/Benjamin-Vincent-Lab/NeoSplice/blob/master/Reference_peptidome/reference_peptidome_10.txt.gz +wget https://github.com/Benjamin-Vincent-Lab/NeoSplice/blob/master/Reference_peptidome/reference_peptidome_11.txt.gz + +cd ../.. + +# CTA/Self-antigen reference +mkdir -p cta_self; cd cta_self +wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/5a9786203497b90c0cc0c0a6a251399b/cta_and_self_antigen.homo_sapiens.gene_list +cd .. + +# STARFusion reference +# Note: This file is quite large (31G), so ensure you have sufficient storage. +mkdir -p starfusion; cd starfusion +wget https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.10/GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz +tar -xvf GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz +cd GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play +mv ctat_genome_lib_build_dir/* .; rm -rf ctat_genome_lib_build_dir/; cd .. +rm -rf GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz +cd .. + +# ERV reference +mkdir -p erv; cd erv +wget http://geve.med.u-tokai.ac.jp/download_data/table/Hsap38.txt.bz2 +bzip2 -d Hsap38.txt.bz2 +cd .. + +# TCGA external reference +mkdir -p tcga; cd tcga +python3 -m pip install numpy --user +wget https://toil-xena-hub.s3.us-east-1.amazonaws.com/download/tcga_rsem_isoform_tpm.gz +wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/5e315a7217ff68ee2ced894e8a4a7246/tissue_source_site_codes +wget https://gitlab.com/landscape-of-effective-neoantigens-software/tcga2lens/-/raw/0e4ac67007b5e77b151162465b44003f555951a4/tcga2lens.py +python3 tcga2lens.py summarize-transcript-expression --tx-file tcga_rsem_isoform_tpm.gz --tumor-type-map tissue_source_site_codes --output tcga_transcript_tpm_summary.tsv +cd .. diff --git a/lens/download_mouse_references.sh b/lens/download_mouse_references.sh new file mode 100644 index 00000000..2d2fe7e9 --- /dev/null +++ b/lens/download_mouse_references.sh @@ -0,0 +1,91 @@ +cd $1/references +mkdir -p mus_musculus; cd mus_musculus + +# Genomic reference +mkdir -p fasta; cd fasta +wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/GRCm38.primary_assembly.genome.fa.gz +gunzip GRCm38.primary_assembly.genome.fa.gz +cd .. + +# GTF/GFF3 +mkdir -p annot; cd annot +wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gtf.gz +wget http://geve.med.u-tokai.ac.jp/download_data/gtf_m/Mmus38.geve.m_v1.gtf.bz2 +bzip2 -d Mmus38.geve.m_v1.gtf.bz2 +zcat gencode.vM25.annotation.gtf.gz | grep -v chrMG > gencode.vM25.annotation.with.mervs.gtf +cat Mmus38.geve.m_v1.gtf | sed 's/^/chr/g' | sed 's/CDS/transcript/g' >> gencode.vM25.annotation.with.mervs.gtf +cat Mmus38.geve.m_v1.gtf | sed 's/^/chr/g' | sed 's/CDS/exon/g' >> gencode.vM25.annotation.with.mervs.gtf +wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gff3.gz +gunzip gencode.vM25.annotation.gff3.gz +cd .. + +# Protein reference +mkdir -p fasta; cd fasta +wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.pc_translations.fa.gz +gunzip gencode.vM25.pc_translations.fa.gz +cd .. + +# Reference VCFs +mkdir -p vcfs; cd vcfs +# From https://github.com/igordot/genomics/blob/master/workflows/gatk-mouse-mm10.md +wget --recursive --no-parent --no-directories \ +--accept vcf*vcf.gz \ +ftp://ftp.ncbi.nih.gov/snp/organisms/archive/mouse_10090/VCF/ +rm *Alt* +rm *MT* +rm *Multi* +rm *NotOn* +rm *Un* +for vcf in $(ls -1 vcf_chr_*.vcf.gz) ; do + vcf_new=${vcf/.vcf.gz/.vcf} + echo $vcf + zcat $vcf | sed 's/^\([0-9XY]\)/chr\1/' > $vcf_new + rm -fv $vcf +done +for i in *vcf; do echo ${i}; bgzip ${i}; done +for i in *vcf.gz; do echo ${i}; tabix ${i}; done +bcftools merge -Oz -o mm10.dbsnp.vcf.gz *vcf.gz +rm vcf* +cd .. + +# BEDs +# https://www.biostars.org/p/459269/#459274 +mkdir -p beds; cd beds +zgrep 'transcript_type "protein_coding"' ../annot/gencode.vM25.annotation.gtf.gz | awk '($3=="exon") {printf("%s\t%s\t%s\n",$1,int($4)-1,$5);}' | sort -T . -t $'\t' -k1,1 -k2,2n | bedtools merge > mm10_exome.bed +cd .. + +# snpEff reference +mkdir -p snpeff; cd snpeff +singularity pull docker://resolwebio/snpeff:latest +singularity exec -B $PWD snpeff_latest.sif /opt/snpeff/snpeff/bin/snpEff download GRCm38.86 -dataDir ${PWD} +rm snpeff_latest.sif +wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/430f9d80c841721499fbcec937b0f721/snpEff.config +cd .. + +# NeoSplice reference +mkdir -p neosplice; cd neosplice +wget https://raw.githubusercontent.com/max555beyond/NeoSplice/master/generate_reference_peptidome.py +python generate_reference_peptidome.py ../annot/gencode.vM25.annotation.gff3 ../fasta/GRCm38.primary_assembly.genome.fa . +mv .peptidome_result/ peptidome.mus_musculus +rm generate_reference_peptidome.py +cd .. + +# CTA/Self-antigen reference +mkdir -p cta_self; cd cta_self +wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/7f7454717866c1a61fb505f8ac5446e0/cta_and_self_antigen.mus_musculus.gene_list +cd .. + +# STARFusion reference +mkdir -p starfusion; cd starfusion +wget https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.9/Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz +tar -xvf Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz +cd Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play +mv ctat_genome_lib_build_dir/* .; rm -rf ctat_genome_lib_build_dir/; cd .. +rm -rf Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz +cd .. + +# ERV reference +mkdir -p erv; cd erv +wget http://geve.med.u-tokai.ac.jp/download_data/table/Mmus38.txt.bz2 +bzip2 -d Mmus38.txt.bz2 +cd .. From bfd84bc51d4d3881312c653c12e972ced2ff893f Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Thu, 29 Jun 2023 18:05:14 -0600 Subject: [PATCH 09/16] updates general and human ref scripts --- lens/download_general_references.sh | 25 ++++---- lens/download_human_references.sh | 89 ++++++++++++++--------------- 2 files changed, 53 insertions(+), 61 deletions(-) diff --git a/lens/download_general_references.sh b/lens/download_general_references.sh index 45c1ae56..7db0eb09 100644 --- a/lens/download_general_references.sh +++ b/lens/download_general_references.sh @@ -1,16 +1,15 @@ -cd $RAFT_PATH/references +export RAFT_PATH=~/raft +export REFERENCES_PATH=$RAFT_PATH/references +cd $REFERENCES_PATH # mhcflurry data directory -mkdir -p mhcflurry -cd mhcflurry -mkdir -p tmp -cd tmp +mkdir -p mhcflurry/tmp +cd mhcflurry/tmp wget https://github.com/openvax/mhcflurry/releases/download/pre-2.0/models_class1_presentation.20200611.tar.bz2 tar xvf * mv models/* ../ -cd .. -rm -rf tmp -cd .. +cd $REFERENCES_PATH +rm -rf mhcflurry/tmp # Viral reference mkdir -p viral; cd viral @@ -19,7 +18,7 @@ wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/mod wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/ad52c657d06c12d7a3346f15b71390af/virus.cds.fa.gz wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/9e3a49921bd325caa98dcd9211f8cdd9/virus.pep.fa.gz gunzip * -cd .. +cd $REFERENCES_PATH # antigen.garnish data directory # The gzip extracts to the desired directory, so no mkdir and cd required. @@ -30,11 +29,9 @@ chmod -R 700 antigen.garnish mkdir -p bin; cd bin wget https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.13.0/ncbi-blast-2.13.0+-x64-linux.tar.gz tar xvf *gz -mv ncbi*/bin/blastp . -rm -rf ncbi* -mv blastp ../antigen.garnish -cd .. +mv ncbi*/bin/blastp $REFERENCES_PATH/antigen.garnish +cd $REFERENCES_PATH rm -rf bin -# Make dummy_file +# Make dummy_file - I have no idea why this was here in the first place touch dummy_file diff --git a/lens/download_human_references.sh b/lens/download_human_references.sh index af0cbb73..c7485055 100644 --- a/lens/download_human_references.sh +++ b/lens/download_human_references.sh @@ -1,46 +1,41 @@ -cd $1/references +export RAFT_PATH=~/raft +export REFERENCES_PATH=$RAFT_PATH/references +cd $REFERENCES_PATH + +# set up homo sapiens directory mkdir -p homo_sapiens; cd homo_sapiens +export HUMAN_REFERENCES_PATH=$REFERENCES_PATH/homo_sapiens # Genomic reference mkdir -p fasta; cd fasta wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta -singularity pull docker://staphb/samtools:1.13 -singularity exec -B $PWD samtools*.s* samtools faidx Homo_sapiens_assembly38.fasta +docker pull staphb/samtools:1.13 +docker run -v $PWD:/data staphb/samtools:1.13 samtools faidx Homo_sapiens_assembly38.fasta # EBV removal strategy from https://bioinformatics.stackexchange.com/a/14421 keep_ids=($(awk '{print $1}' Homo_sapiens_assembly38.fasta.fai | grep -v chrEBV)) -singularity exec -B $PWD samtools*.s* samtools faidx -o Homo_sapiens.assembly38.no_ebv.fa Homo_sapiens_assembly38.fasta "${keep_ids[@]}" -rm Homo_sapiens_assembly38.fasta -rm Homo_sapiens_assembly38.fasta.fai -rm samtools*.s* -cd .. - -export VERSION=3.6.0 && # adjust this as necessary \ - mkdir -p $GOPATH/src/github.com/sylabs && \ - cd $GOPATH/src/github.com/sylabs && \ - wget https://github.com/sylabs/singularity/releases/download/v${VERSION}/singularity-${VERSION}.tar.gz && \ - tar -xzf singularity-${VERSION}.tar.gz && \ - cd ./singularity && \ - ./mconfig - +docker run -v $PWD:/data staphb/samtools:1.13 samtools faidx -o Homo_sapiens.assembly38.no_ebv.fa Homo_sapiens_assembly38.fasta "${keep_ids[@]}" +rm -f *.fasta* +cd $HUMAN_REFERENCES_PATH +# stopped here # GTF/GFF3 mkdir -p annot; cd annot wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.annotation.gtf.gz wget http://geve.med.u-tokai.ac.jp/download_data/gtf_m/Hsap38.geve.m_v1.gtf.bz2 bzip2 -d Hsap38.geve.m_v1.gtf.bz2 -zcat gencode.v37.annotation.gtf.gz > gencode.v37.annotation.with.hervs.gtf; +zcat gencode.v37.annotation.gtf.gz > gencode.v37.annotation.with.hervs.gtf cat Hsap38.geve.m_v1.gtf | sed 's/^/chr/g' | sed 's/CDS/transcript/g' >> gencode.v37.annotation.with.hervs.gtf cat Hsap38.geve.m_v1.gtf | sed 's/^/chr/g' | sed 's/CDS/exon/g' >> gencode.v37.annotation.with.hervs.gtf wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.annotation.gff3.gz gunzip gencode.v37.annotation.gff3.gz -cd .. +cd $HUMAN_REFERENCES_PATH # Protein reference mkdir -p protein; cd protein wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.pc_translations.fa.gz gunzip gencode.v37.pc_translations.fa.gz -cd .. +cd $HUMAN_REFERENCES_PATH # Reference VCFs mkdir -p vcfs; cd vcfs @@ -49,28 +44,28 @@ wget https://storage.googleapis.com/gatk-best-practices/somatic-hg38/af-only-gno wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf wget https://storage.googleapis.com/gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz bgzip Homo_sapiens_assembly38.dbsnp138.vcf -cd .. +cd $HUMAN_REFERENCES_PATH + # BEDs # https://www.biostars.org/p/459269/#459274 mkdir -p beds; cd beds -zgrep 'transcript_type "protein_coding"' ../annot/gencode.v37.annotation.gtf.gz | awk '($3=="exon") {printf("%s\t%s\t%s\n",$1,int($4)-1,$5);}' | sort -T . -t $'\t' -k1,1 -k2,2n | bedtools merge > hg38_exome.bed -cd .. +zgrep 'transcript_type "protein_coding"' $HUMAN_REFERENCES_PATH/annot/gencode.v37.annotation.gtf.gz | awk '($3=="exon") {printf("%s\t%s\t%s\n",$1,int($4)-1,$5);}' | sort -T . -t $'\t' -k1,1 -k2,2n | bedtools merge > hg38_exome.bed +cd $HUMAN_REFERENCES_PATH + # snpEff reference mkdir -p snpeff; cd snpeff -singularity pull docker://resolwebio/snpeff:latest -mkdir -p GRCh38.GENCODEv37 -cd GRCh38.GENCODEv37 +docker pull resolwebio/snpeff:2.0.0 +mkdir -p GRCh38.GENCODEv37; cd GRCh38.GENCODEv37 wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/430f9d80c841721499fbcec937b0f721/snpEff.config -ln ../../annot/gencode.v37.annotation.gtf.gz genes.gtf.gz -ln ../../fasta/Homo_sapiens.assembly38.no_ebv.fa sequences.fa -cd .. -singularity exec -B $PWD snpeff*.s* /opt/snpeff/snpeff/bin/snpEff build -gtf22 -v GRCh38.GENCODEv37 -dataDir ${PWD} -c GRCh38.GENCODEv37/snpEff.config -rm GRCh38.GENCODEv37/sequences.fa -rm GRCh38.GENCODEv37/genes.gtf.gz -rm snpeff*.s* -cd .. +ln $HUMAN_REFERENCES_PATH/annot/gencode.v37.annotation.gtf.gz genes.gtf.gz +sudo ln $HUMAN_REFERENCES_PATH/fasta/Homo_sapiens.assembly38.no_ebv.fa sequences.fa +cd $HUMAN_REFERENCES_PATH/snpeff +docker run -v $PWD:/data resolwebio/snpeff:2.0.0 /opt/snpeff/snpeff/bin/snpEff build -gtf22 -v GRCh38.GENCODEv37 -dataDir ${PWD} -c GRCh38.GENCODEv37/snpEff.config +rm -f GRCh38.GENCODEv37/sequences.fa GRCh38.GENCODEv37/genes.gtf.gz +cd $HUMAN_REFERENCES_PATH + # NeoSplice reference mkdir -p neosplice; cd neosplice @@ -79,14 +74,14 @@ mkdir -p neosplice; cd neosplice ### be able to run the script. As an alternative, the "off-the-shelf" peptidome ### included with NeoSplice is provided by default. -wget https://raw.githubusercontent.com/max555beyond/NeoSplice/master/generate_reference_peptidome.py -python -m pip install --user pyfaidx -python -m pip install --user bcbio-gff -sed 's/os.makedirs(path, 0777)/os.makedirs(path, "0777")/g' generate_reference_peptidome.py > generate_reference_peptidome.py.tmp -mv generate_reference_peptidome.py.tmp generate_reference_peptidome.py -python generate_reference_peptidome.py ../annot/gencode.v37.annotation.gff3 ../fasta/Homo_sapiens.assembly38.no_ebv.fa . -mv .peptidome_result/ peptidome.homo_sapiens -rm generate_reference_peptidome.py +# wget https://raw.githubusercontent.com/max555beyond/NeoSplice/master/generate_reference_peptidome.py +# python3 -m pip install --user pyfaidx +# python3 -m pip install --user bcbio-gff +# sed 's/os.makedirs(path, 0777)/os.makedirs(path, 0777)/g' generate_reference_peptidome.py > generate_reference_peptidome.py.tmp +# mv generate_reference_peptidome.py.tmp generate_reference_peptidome.py +# python3 generate_reference_peptidome.py $HUMAN_REFERENCES_PATH/annot/gencode.v37.annotation.gff3 $HUMAN_REFERENCES_PATH/fasta/Homo_sapiens.assembly38.no_ebv.fa . +# mv .peptidome_result/ peptidome.homo_sapiens +# rm generate_reference_peptidome.py mkdir -p peptidome.homo_sapiens; cd peptidome.homo_sapiens @@ -95,12 +90,12 @@ wget https://github.com/Benjamin-Vincent-Lab/NeoSplice/blob/master/Reference_pep wget https://github.com/Benjamin-Vincent-Lab/NeoSplice/blob/master/Reference_peptidome/reference_peptidome_10.txt.gz wget https://github.com/Benjamin-Vincent-Lab/NeoSplice/blob/master/Reference_peptidome/reference_peptidome_11.txt.gz -cd ../.. +cd $HUMAN_REFERENCES_PATH # CTA/Self-antigen reference mkdir -p cta_self; cd cta_self wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/5a9786203497b90c0cc0c0a6a251399b/cta_and_self_antigen.homo_sapiens.gene_list -cd .. +cd $HUMAN_REFERENCES_PATH # STARFusion reference # Note: This file is quite large (31G), so ensure you have sufficient storage. @@ -110,13 +105,13 @@ tar -xvf GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz cd GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play mv ctat_genome_lib_build_dir/* .; rm -rf ctat_genome_lib_build_dir/; cd .. rm -rf GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz -cd .. +cd $HUMAN_REFERENCES_PATH # ERV reference mkdir -p erv; cd erv wget http://geve.med.u-tokai.ac.jp/download_data/table/Hsap38.txt.bz2 bzip2 -d Hsap38.txt.bz2 -cd .. +cd $HUMAN_REFERENCES_PATH # TCGA external reference mkdir -p tcga; cd tcga @@ -125,4 +120,4 @@ wget https://toil-xena-hub.s3.us-east-1.amazonaws.com/download/tcga_rsem_isoform wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/5e315a7217ff68ee2ced894e8a4a7246/tissue_source_site_codes wget https://gitlab.com/landscape-of-effective-neoantigens-software/tcga2lens/-/raw/0e4ac67007b5e77b151162465b44003f555951a4/tcga2lens.py python3 tcga2lens.py summarize-transcript-expression --tx-file tcga_rsem_isoform_tpm.gz --tumor-type-map tissue_source_site_codes --output tcga_transcript_tpm_summary.tsv -cd .. +cd $HUMAN_REFERENCES_PATH From 5a212573318ba42c9aafcdc80d1e11586fe188ba Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Wed, 5 Jul 2023 12:49:02 -0600 Subject: [PATCH 10/16] updates ref file scripts --- lens/download_general_references.sh | 9 ----- lens/download_human_references.sh | 10 ++++++ lens/download_mouse_references.sh | 52 +++++++++++++++-------------- 3 files changed, 37 insertions(+), 34 deletions(-) diff --git a/lens/download_general_references.sh b/lens/download_general_references.sh index 7db0eb09..5fc7f5b2 100644 --- a/lens/download_general_references.sh +++ b/lens/download_general_references.sh @@ -2,15 +2,6 @@ export RAFT_PATH=~/raft export REFERENCES_PATH=$RAFT_PATH/references cd $REFERENCES_PATH -# mhcflurry data directory -mkdir -p mhcflurry/tmp -cd mhcflurry/tmp -wget https://github.com/openvax/mhcflurry/releases/download/pre-2.0/models_class1_presentation.20200611.tar.bz2 -tar xvf * -mv models/* ../ -cd $REFERENCES_PATH -rm -rf mhcflurry/tmp - # Viral reference mkdir -p viral; cd viral wget https://github.com/dmarron/virdetect/raw/master/reference/virus_masked_hg38.fa diff --git a/lens/download_human_references.sh b/lens/download_human_references.sh index c7485055..687e2144 100644 --- a/lens/download_human_references.sh +++ b/lens/download_human_references.sh @@ -6,6 +6,16 @@ cd $REFERENCES_PATH mkdir -p homo_sapiens; cd homo_sapiens export HUMAN_REFERENCES_PATH=$REFERENCES_PATH/homo_sapiens +#mhcflurry +mkdir -p mhcflurry/tmp +cd mhcflurry/tmp +wget https://github.com/openvax/mhcflurry/releases/download/pre-2.0/models_class1_presentation.20200611.tar.bz2 +tar xvf * +mv models/* ../ +cd $HUMAN_REFERENCES_PATH +rm -rf tmp + + # Genomic reference mkdir -p fasta; cd fasta wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta diff --git a/lens/download_mouse_references.sh b/lens/download_mouse_references.sh index 2d2fe7e9..9d11c70d 100644 --- a/lens/download_mouse_references.sh +++ b/lens/download_mouse_references.sh @@ -1,11 +1,21 @@ -cd $1/references +export RAFT_PATH=~/raft +export REFERENCES_PATH=$RAFT_PATH/references +cd $REFERENCES_PATH + mkdir -p mus_musculus; cd mus_musculus +export MOUSE_REFERENCES_PATH=$REFERENCES_PATH/mus_musculus + # Genomic reference mkdir -p fasta; cd fasta wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/GRCm38.primary_assembly.genome.fa.gz gunzip GRCm38.primary_assembly.genome.fa.gz -cd .. +cd $MOUSE_REFERENCES_PATH + +# Protein reference +wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.pc_translations.fa.gz +gunzip gencode.vM25.pc_translations.fa.gz +cd $MOUSE_REFERENCES_PATH # GTF/GFF3 mkdir -p annot; cd annot @@ -17,13 +27,8 @@ cat Mmus38.geve.m_v1.gtf | sed 's/^/chr/g' | sed 's/CDS/transcript/g' >> gencode cat Mmus38.geve.m_v1.gtf | sed 's/^/chr/g' | sed 's/CDS/exon/g' >> gencode.vM25.annotation.with.mervs.gtf wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gff3.gz gunzip gencode.vM25.annotation.gff3.gz -cd .. +cd $MOUSE_REFERENCES_PATH -# Protein reference -mkdir -p fasta; cd fasta -wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.pc_translations.fa.gz -gunzip gencode.vM25.pc_translations.fa.gz -cd .. # Reference VCFs mkdir -p vcfs; cd vcfs @@ -31,11 +36,7 @@ mkdir -p vcfs; cd vcfs wget --recursive --no-parent --no-directories \ --accept vcf*vcf.gz \ ftp://ftp.ncbi.nih.gov/snp/organisms/archive/mouse_10090/VCF/ -rm *Alt* -rm *MT* -rm *Multi* -rm *NotOn* -rm *Un* +rm *Alt* *MT* *Multi* *NotOn* *Un* for vcf in $(ls -1 vcf_chr_*.vcf.gz) ; do vcf_new=${vcf/.vcf.gz/.vcf} echo $vcf @@ -46,34 +47,35 @@ for i in *vcf; do echo ${i}; bgzip ${i}; done for i in *vcf.gz; do echo ${i}; tabix ${i}; done bcftools merge -Oz -o mm10.dbsnp.vcf.gz *vcf.gz rm vcf* -cd .. +cd $MOUSE_REFERENCES_PATH # BEDs # https://www.biostars.org/p/459269/#459274 mkdir -p beds; cd beds -zgrep 'transcript_type "protein_coding"' ../annot/gencode.vM25.annotation.gtf.gz | awk '($3=="exon") {printf("%s\t%s\t%s\n",$1,int($4)-1,$5);}' | sort -T . -t $'\t' -k1,1 -k2,2n | bedtools merge > mm10_exome.bed -cd .. +zgrep 'transcript_type "protein_coding"' $MOUSE_REFERENCES_PATH/annot/gencode.vM25.annotation.gtf.gz | awk '($3=="exon") {printf("%s\t%s\t%s\n",$1,int($4)-1,$5);}' | sort -T . -t $'\t' -k1,1 -k2,2n | bedtools merge > mm10_exome.bed +cd $MOUSE_REFERENCES_PATH # snpEff reference mkdir -p snpeff; cd snpeff -singularity pull docker://resolwebio/snpeff:latest -singularity exec -B $PWD snpeff_latest.sif /opt/snpeff/snpeff/bin/snpEff download GRCm38.86 -dataDir ${PWD} -rm snpeff_latest.sif +docker pull resolwebio/snpeff:2.0.0 +docker run -v $PWD:/data resolwebio/snpeff:2.0.0 /opt/snpeff/snpeff/bin/snpEff download GRCm38.86 -dataDir ${PWD} wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/430f9d80c841721499fbcec937b0f721/snpEff.config -cd .. +cd $MOUSE_REFERENCES_PATH # NeoSplice reference mkdir -p neosplice; cd neosplice wget https://raw.githubusercontent.com/max555beyond/NeoSplice/master/generate_reference_peptidome.py -python generate_reference_peptidome.py ../annot/gencode.vM25.annotation.gff3 ../fasta/GRCm38.primary_assembly.genome.fa . +python3 generate_reference_peptidome.py $MOUSE_REFERENCES_PATH/annot/gencode.vM25.annotation.gff3 $MOUSE_REFERENCES_PATH/fasta/GRCm38.primary_assembly.genome.fa . mv .peptidome_result/ peptidome.mus_musculus rm generate_reference_peptidome.py -cd .. +cd $MOUSE_REFERENCES_PATH + +#stopped here # CTA/Self-antigen reference mkdir -p cta_self; cd cta_self wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/7f7454717866c1a61fb505f8ac5446e0/cta_and_self_antigen.mus_musculus.gene_list -cd .. +cd $MOUSE_REFERENCES_PATH # STARFusion reference mkdir -p starfusion; cd starfusion @@ -82,10 +84,10 @@ tar -xvf Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz cd Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play mv ctat_genome_lib_build_dir/* .; rm -rf ctat_genome_lib_build_dir/; cd .. rm -rf Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz -cd .. +cd $MOUSE_REFERENCES_PATH # ERV reference mkdir -p erv; cd erv wget http://geve.med.u-tokai.ac.jp/download_data/table/Mmus38.txt.bz2 bzip2 -d Mmus38.txt.bz2 -cd .. +cd $MOUSE_REFERENCES_PATH From 77ef5f49dfef8671e780a3a8c5a875a90e005a39 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Wed, 5 Jul 2023 12:52:00 -0600 Subject: [PATCH 11/16] pre-commit fixes --- lens/download_human_references.sh | 2 +- lens/download_mouse_references.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lens/download_human_references.sh b/lens/download_human_references.sh index 687e2144..bd846b8e 100644 --- a/lens/download_human_references.sh +++ b/lens/download_human_references.sh @@ -4,7 +4,7 @@ cd $REFERENCES_PATH # set up homo sapiens directory mkdir -p homo_sapiens; cd homo_sapiens -export HUMAN_REFERENCES_PATH=$REFERENCES_PATH/homo_sapiens +export HUMAN_REFERENCES_PATH=$REFERENCES_PATH/homo_sapiens #mhcflurry mkdir -p mhcflurry/tmp diff --git a/lens/download_mouse_references.sh b/lens/download_mouse_references.sh index 9d11c70d..bb6da99e 100644 --- a/lens/download_mouse_references.sh +++ b/lens/download_mouse_references.sh @@ -3,7 +3,7 @@ export REFERENCES_PATH=$RAFT_PATH/references cd $REFERENCES_PATH mkdir -p mus_musculus; cd mus_musculus -export MOUSE_REFERENCES_PATH=$REFERENCES_PATH/mus_musculus +export MOUSE_REFERENCES_PATH=$REFERENCES_PATH/mus_musculus # Genomic reference From 84a595661c009ef9395ee0b07df5cc0bf598e750 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Wed, 5 Jul 2023 13:06:16 -0600 Subject: [PATCH 12/16] updates final bash and mouse file --- .../lens}/download_general_references.sh | 0 .../lens}/download_human_references.sh | 0 .../lens}/download_mouse_references.sh | 20 +++++++++---------- bin/{ => lens}/mirror-lens-references.sh | 10 +++------- 4 files changed, 13 insertions(+), 17 deletions(-) rename {lens => bin/lens}/download_general_references.sh (100%) rename {lens => bin/lens}/download_human_references.sh (100%) rename {lens => bin/lens}/download_mouse_references.sh (83%) rename bin/{ => lens}/mirror-lens-references.sh (59%) diff --git a/lens/download_general_references.sh b/bin/lens/download_general_references.sh similarity index 100% rename from lens/download_general_references.sh rename to bin/lens/download_general_references.sh diff --git a/lens/download_human_references.sh b/bin/lens/download_human_references.sh similarity index 100% rename from lens/download_human_references.sh rename to bin/lens/download_human_references.sh diff --git a/lens/download_mouse_references.sh b/bin/lens/download_mouse_references.sh similarity index 83% rename from lens/download_mouse_references.sh rename to bin/lens/download_mouse_references.sh index bb6da99e..5364c24b 100644 --- a/lens/download_mouse_references.sh +++ b/bin/lens/download_mouse_references.sh @@ -3,7 +3,7 @@ export REFERENCES_PATH=$RAFT_PATH/references cd $REFERENCES_PATH mkdir -p mus_musculus; cd mus_musculus -export MOUSE_REFERENCES_PATH=$REFERENCES_PATH/mus_musculus +export MOUSE_REFERENCES_PATH=$REFERENCES_PATH/mus_musculus # Genomic reference @@ -64,7 +64,7 @@ cd $MOUSE_REFERENCES_PATH # NeoSplice reference mkdir -p neosplice; cd neosplice -wget https://raw.githubusercontent.com/max555beyond/NeoSplice/master/generate_reference_peptidome.py +wget https://raw.githubusercontent.com/BWMac/NeoSplice/master/generate_reference_peptidome.py python3 generate_reference_peptidome.py $MOUSE_REFERENCES_PATH/annot/gencode.vM25.annotation.gff3 $MOUSE_REFERENCES_PATH/fasta/GRCm38.primary_assembly.genome.fa . mv .peptidome_result/ peptidome.mus_musculus rm generate_reference_peptidome.py @@ -77,14 +77,14 @@ mkdir -p cta_self; cd cta_self wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/7f7454717866c1a61fb505f8ac5446e0/cta_and_self_antigen.mus_musculus.gene_list cd $MOUSE_REFERENCES_PATH -# STARFusion reference -mkdir -p starfusion; cd starfusion -wget https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.9/Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz -tar -xvf Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz -cd Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play -mv ctat_genome_lib_build_dir/* .; rm -rf ctat_genome_lib_build_dir/; cd .. -rm -rf Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz -cd $MOUSE_REFERENCES_PATH +# STARFusion reference - waiting for response on 404 error +# mkdir -p starfusion; cd starfusion +# wget https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.9/Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz +# tar -xvf Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz +# cd Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play +# mv ctat_genome_lib_build_dir/* .; rm -rf ctat_genome_lib_build_dir/; cd .. +# rm -rf Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz +# cd $MOUSE_REFERENCES_PATH # ERV reference mkdir -p erv; cd erv diff --git a/bin/mirror-lens-references.sh b/bin/lens/mirror-lens-references.sh similarity index 59% rename from bin/mirror-lens-references.sh rename to bin/lens/mirror-lens-references.sh index 14c20674..06abee05 100644 --- a/bin/mirror-lens-references.sh +++ b/bin/lens/mirror-lens-references.sh @@ -21,13 +21,9 @@ prefixes=( echo "Downloading LENS reference files" mkdir -p "./lens/references" cd ./lens -wget -P . https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/b0d8a24036628bd9e6c860eb531919c3/download_general_references.sh \ - https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/2238c7f4af5c5499f298315bf5080ad7/download_human_references.sh \ - https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/bf210753e3724edf6db2543eb54c3de4/download_mouse_references.sh -export RAFT_PATH="." -bash download_general_references.sh . -bash download_human_references.sh . -bash download_mouse_references.sh . +bash download_general_references.sh +bash download_human_references.sh +bash download_mouse_references.sh for prefix in ${prefixes[*]}; do echo "Syncing $prefix..." From cbb29c5589a3aef325a4da425049b382dc814450 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Wed, 5 Jul 2023 13:08:02 -0600 Subject: [PATCH 13/16] fix pre-commit --- bin/lens/download_mouse_references.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/lens/download_mouse_references.sh b/bin/lens/download_mouse_references.sh index 5364c24b..e7c38bda 100644 --- a/bin/lens/download_mouse_references.sh +++ b/bin/lens/download_mouse_references.sh @@ -3,7 +3,7 @@ export REFERENCES_PATH=$RAFT_PATH/references cd $REFERENCES_PATH mkdir -p mus_musculus; cd mus_musculus -export MOUSE_REFERENCES_PATH=$REFERENCES_PATH/mus_musculus +export MOUSE_REFERENCES_PATH=$REFERENCES_PATH/mus_musculus # Genomic reference From 54400936cce9e74f327dc6c5ec910e589bc53d6b Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Thu, 6 Jul 2023 14:23:17 -0600 Subject: [PATCH 14/16] updates scripts --- bin/lens/download_human_references.sh | 19 +++++++++---------- bin/lens/mirror-lens-references.sh | 7 +++---- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/bin/lens/download_human_references.sh b/bin/lens/download_human_references.sh index bd846b8e..5ce85ce1 100644 --- a/bin/lens/download_human_references.sh +++ b/bin/lens/download_human_references.sh @@ -66,14 +66,15 @@ cd $HUMAN_REFERENCES_PATH # snpEff reference mkdir -p snpeff; cd snpeff -docker pull resolwebio/snpeff:2.0.0 +docker pull resolwebio/snpeff:latest mkdir -p GRCh38.GENCODEv37; cd GRCh38.GENCODEv37 wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/430f9d80c841721499fbcec937b0f721/snpEff.config -ln $HUMAN_REFERENCES_PATH/annot/gencode.v37.annotation.gtf.gz genes.gtf.gz +cp $HUMAN_REFERENCES_PATH/annot/gencode.v37.annotation.gtf.gz genes.gtf.gz +gunzip genes.gtf.gz sudo ln $HUMAN_REFERENCES_PATH/fasta/Homo_sapiens.assembly38.no_ebv.fa sequences.fa cd $HUMAN_REFERENCES_PATH/snpeff -docker run -v $PWD:/data resolwebio/snpeff:2.0.0 /opt/snpeff/snpeff/bin/snpEff build -gtf22 -v GRCh38.GENCODEv37 -dataDir ${PWD} -c GRCh38.GENCODEv37/snpEff.config -rm -f GRCh38.GENCODEv37/sequences.fa GRCh38.GENCODEv37/genes.gtf.gz +docker run -v $PWD:/data -w /data resolwebio/snpeff:latest /opt/snpeff/snpeff/bin/snpEff build -gtf22 -v GRCh38.GENCODEv37 -dataDir /data -c GRCh38.GENCODEv37/snpEff.config +rm -f GRCh38.GENCODEv37/sequences.fa GRCh38.GENCODEv37/genes.gtf cd $HUMAN_REFERENCES_PATH @@ -94,12 +95,10 @@ mkdir -p neosplice; cd neosplice # rm generate_reference_peptidome.py mkdir -p peptidome.homo_sapiens; cd peptidome.homo_sapiens - -wget https://github.com/Benjamin-Vincent-Lab/NeoSplice/blob/master/Reference_peptidome/reference_peptidome_8.txt.gz -wget https://github.com/Benjamin-Vincent-Lab/NeoSplice/blob/master/Reference_peptidome/reference_peptidome_9.txt.gz -wget https://github.com/Benjamin-Vincent-Lab/NeoSplice/blob/master/Reference_peptidome/reference_peptidome_10.txt.gz -wget https://github.com/Benjamin-Vincent-Lab/NeoSplice/blob/master/Reference_peptidome/reference_peptidome_11.txt.gz - +wget https://github.com/Benjamin-Vincent-Lab/NeoSplice/raw/master/Reference_peptidome/reference_peptidome_8.txt.gz +wget https://github.com/Benjamin-Vincent-Lab/NeoSplice/raw/master/Reference_peptidome/reference_peptidome_9.txt.gz +wget https://github.com/Benjamin-Vincent-Lab/NeoSplice/raw/master/Reference_peptidome/reference_peptidome_10.txt.gz +wget https://github.com/Benjamin-Vincent-Lab/NeoSplice/raw/master/Reference_peptidome/reference_peptidome_11.txt.gz cd $HUMAN_REFERENCES_PATH # CTA/Self-antigen reference diff --git a/bin/lens/mirror-lens-references.sh b/bin/lens/mirror-lens-references.sh index 06abee05..dd84ed06 100644 --- a/bin/lens/mirror-lens-references.sh +++ b/bin/lens/mirror-lens-references.sh @@ -11,10 +11,9 @@ prefixes=( "antigen.garnish/" - "erv/" + # "erv/" "homo_sapiens/" - "mhcflurry/" - "mus_musculus/" + # "mus_musculus/" "viral/" ) @@ -23,7 +22,7 @@ mkdir -p "./lens/references" cd ./lens bash download_general_references.sh bash download_human_references.sh -bash download_mouse_references.sh +# bash download_mouse_references.sh for prefix in ${prefixes[*]}; do echo "Syncing $prefix..." From 88464a283b7c89b387b2ede893ea20a15b69c031 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Thu, 6 Jul 2023 14:35:03 -0600 Subject: [PATCH 15/16] remove stopped comment --- bin/lens/download_human_references.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/bin/lens/download_human_references.sh b/bin/lens/download_human_references.sh index 5ce85ce1..99a69659 100644 --- a/bin/lens/download_human_references.sh +++ b/bin/lens/download_human_references.sh @@ -27,8 +27,6 @@ docker run -v $PWD:/data staphb/samtools:1.13 samtools faidx -o Homo_sapiens.ass rm -f *.fasta* cd $HUMAN_REFERENCES_PATH -# stopped here - # GTF/GFF3 mkdir -p annot; cd annot wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.annotation.gtf.gz From 58a2e6db0bb7f15fbcd729c7cd06cf4a04dab909 Mon Sep 17 00:00:00 2001 From: Brad Macdonald Date: Thu, 6 Jul 2023 14:37:32 -0600 Subject: [PATCH 16/16] remove comment --- bin/lens/download_general_references.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/lens/download_general_references.sh b/bin/lens/download_general_references.sh index 5fc7f5b2..bd441eed 100644 --- a/bin/lens/download_general_references.sh +++ b/bin/lens/download_general_references.sh @@ -24,5 +24,5 @@ mv ncbi*/bin/blastp $REFERENCES_PATH/antigen.garnish cd $REFERENCES_PATH rm -rf bin -# Make dummy_file - I have no idea why this was here in the first place +# Make dummy_file touch dummy_file