diff --git a/bin/lens/download_general_references.sh b/bin/lens/download_general_references.sh
new file mode 100644
index 00000000..bd441eed
--- /dev/null
+++ b/bin/lens/download_general_references.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+#
+# Download the species-independent LENS references (viral reference,
+# antigen.garnish data directory and the BLASTP binary it uses) into
+# $RAFT_PATH/references.
+#
+# Environment:
+#   RAFT_PATH  RAFT directory holding references/ (default: ~/raft).
+set -euo pipefail
+
+export RAFT_PATH="${RAFT_PATH:-$HOME/raft}"
+export REFERENCES_PATH="$RAFT_PATH/references"
+mkdir -p "$REFERENCES_PATH"
+cd "$REFERENCES_PATH"
+
+# Viral reference
+mkdir -p viral; cd viral
+wget https://github.com/dmarron/virdetect/raw/master/reference/virus_masked_hg38.fa
+wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/4dedb99984857905ee96ab1d148d7863/virdetect.cds.gff.gz
+wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/ad52c657d06c12d7a3346f15b71390af/virus.cds.fa.gz
+wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/9e3a49921bd325caa98dcd9211f8cdd9/virus.pep.fa.gz
+# Decompress only the *.gz downloads: `gunzip *` would also be handed the
+# uncompressed virus_masked_hg38.fa and exit non-zero.
+gunzip -- *.gz
+cd "$REFERENCES_PATH"
+
+# antigen.garnish data directory
+# The tarball extracts to the desired directory, so no mkdir and cd required.
+curl -fsSL "https://s3.amazonaws.com/get.rech.io/antigen.garnish-2.3.0.tar.gz" | tar -xvz
+chmod -R 700 antigen.garnish
+
+# BLASTP binary (for antigen.garnish)
+mkdir -p bin; cd bin
+wget https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.13.0/ncbi-blast-2.13.0+-x64-linux.tar.gz
+tar xvf ncbi-blast-2.13.0+-x64-linux.tar.gz
+mv ncbi*/bin/blastp "$REFERENCES_PATH/antigen.garnish"
+cd "$REFERENCES_PATH"
+rm -rf bin
+
+# Make dummy_file
+touch dummy_file
diff --git a/bin/lens/download_human_references.sh b/bin/lens/download_human_references.sh
new file mode 100644
index 00000000..99a69659
--- /dev/null
+++ b/bin/lens/download_human_references.sh
@@ -0,0 +1,146 @@
+#!/usr/bin/env bash
+#
+# Download and build the human (hg38 / GENCODE v37) LENS references into
+# $RAFT_PATH/references/homo_sapiens.
+#
+# Environment:
+#   RAFT_PATH  RAFT directory holding references/ (default: ~/raft).
+set -euo pipefail
+
+export RAFT_PATH="${RAFT_PATH:-$HOME/raft}"
+export REFERENCES_PATH="$RAFT_PATH/references"
+mkdir -p "$REFERENCES_PATH"
+cd "$REFERENCES_PATH"
+
+# set up homo sapiens directory
+mkdir -p homo_sapiens; cd homo_sapiens
+export HUMAN_REFERENCES_PATH="$REFERENCES_PATH/homo_sapiens"
+
+# mhcflurry
+mkdir -p mhcflurry/tmp
+cd mhcflurry/tmp
+wget https://github.com/openvax/mhcflurry/releases/download/pre-2.0/models_class1_presentation.20200611.tar.bz2
+tar xvf models_class1_presentation.20200611.tar.bz2
+mv models/* ../
+cd "$HUMAN_REFERENCES_PATH"
+# The scratch directory is mhcflurry/tmp relative to homo_sapiens/.
+rm -rf mhcflurry/tmp
+
+# Genomic reference
+mkdir -p fasta; cd fasta
+wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta
+docker pull staphb/samtools:1.13
+docker run -v "$PWD:/data" staphb/samtools:1.13 samtools faidx Homo_sapiens_assembly38.fasta
+# EBV removal strategy from https://bioinformatics.stackexchange.com/a/14421
+# mapfile stores one contig name per array element without word splitting or
+# glob expansion of the names.
+mapfile -t keep_ids < <(awk 'NF && $1 !~ /chrEBV/ {print $1}' Homo_sapiens_assembly38.fasta.fai)
+docker run -v "$PWD:/data" staphb/samtools:1.13 samtools faidx -o Homo_sapiens.assembly38.no_ebv.fa Homo_sapiens_assembly38.fasta "${keep_ids[@]}"
+rm -f -- *.fasta*
+cd "$HUMAN_REFERENCES_PATH"
+
+# GTF/GFF3
+mkdir -p annot; cd annot
+wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.annotation.gtf.gz
+wget http://geve.med.u-tokai.ac.jp/download_data/gtf_m/Hsap38.geve.m_v1.gtf.bz2
+bzip2 -d Hsap38.geve.m_v1.gtf.bz2
+zcat gencode.v37.annotation.gtf.gz > gencode.v37.annotation.with.hervs.gtf
+# Append the gEVE records twice, "chr"-prefixed: once with CDS rewritten to
+# transcript and once with CDS rewritten to exon.
+sed -e 's/^/chr/' -e 's/CDS/transcript/g' Hsap38.geve.m_v1.gtf >> gencode.v37.annotation.with.hervs.gtf
+sed -e 's/^/chr/' -e 's/CDS/exon/g' Hsap38.geve.m_v1.gtf >> gencode.v37.annotation.with.hervs.gtf
+wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.annotation.gff3.gz
+gunzip gencode.v37.annotation.gff3.gz
+cd "$HUMAN_REFERENCES_PATH"
+
+# Protein reference
+mkdir -p protein; cd protein
+wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.pc_translations.fa.gz
+gunzip gencode.v37.pc_translations.fa.gz
+cd "$HUMAN_REFERENCES_PATH"
+
+# Reference VCFs
+mkdir -p vcfs; cd vcfs
+wget https://storage.googleapis.com/gatk-best-practices/somatic-hg38/1000g_pon.hg38.vcf.gz
+wget https://storage.googleapis.com/gatk-best-practices/somatic-hg38/af-only-gnomad.hg38.vcf.gz
+wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf
+wget https://storage.googleapis.com/gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz
+bgzip Homo_sapiens_assembly38.dbsnp138.vcf
+cd "$HUMAN_REFERENCES_PATH"
+
+# BEDs
+# https://www.biostars.org/p/459269/#459274
+mkdir -p beds; cd beds
+zgrep 'transcript_type "protein_coding"' "$HUMAN_REFERENCES_PATH/annot/gencode.v37.annotation.gtf.gz" \
+  | awk '($3=="exon") {printf("%s\t%s\t%s\n",$1,int($4)-1,$5);}' \
+  | sort -T . -t $'\t' -k1,1 -k2,2n \
+  | bedtools merge > hg38_exome.bed
+cd "$HUMAN_REFERENCES_PATH"
+
+# snpEff reference
+mkdir -p snpeff; cd snpeff
+docker pull resolwebio/snpeff:latest
+mkdir -p GRCh38.GENCODEv37; cd GRCh38.GENCODEv37
+wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/430f9d80c841721499fbcec937b0f721/snpEff.config
+cp "$HUMAN_REFERENCES_PATH/annot/gencode.v37.annotation.gtf.gz" genes.gtf.gz
+gunzip genes.gtf.gz
+# NOTE(review): sudo is presumably needed because the FASTA was written by the
+# samtools container (as root) - confirm.
+sudo ln "$HUMAN_REFERENCES_PATH/fasta/Homo_sapiens.assembly38.no_ebv.fa" sequences.fa
+cd "$HUMAN_REFERENCES_PATH/snpeff"
+docker run -v "$PWD:/data" -w /data resolwebio/snpeff:latest /opt/snpeff/snpeff/bin/snpEff build -gtf22 -v GRCh38.GENCODEv37 -dataDir /data -c GRCh38.GENCODEv37/snpEff.config
+rm -f GRCh38.GENCODEv37/sequences.fa GRCh38.GENCODEv37/genes.gtf
+cd "$HUMAN_REFERENCES_PATH"
+
+# NeoSplice reference
+mkdir -p neosplice; cd neosplice
+### The steps below generate a peptidome specific to a GTF and reference
+### FASTA which is ideal. The Python script is taxing though, and users may not
+### be able to run the script. As an alternative, the "off-the-shelf" peptidome
+### included with NeoSplice is provided by default.
+
+# wget https://raw.githubusercontent.com/max555beyond/NeoSplice/master/generate_reference_peptidome.py
+# python3 -m pip install --user pyfaidx
+# python3 -m pip install --user bcbio-gff
+# sed 's/os.makedirs(path, 0777)/os.makedirs(path, 0o777)/g' generate_reference_peptidome.py > generate_reference_peptidome.py.tmp
+# mv generate_reference_peptidome.py.tmp generate_reference_peptidome.py
+# python3 generate_reference_peptidome.py $HUMAN_REFERENCES_PATH/annot/gencode.v37.annotation.gff3 $HUMAN_REFERENCES_PATH/fasta/Homo_sapiens.assembly38.no_ebv.fa .
+# mv .peptidome_result/ peptidome.homo_sapiens
+# rm generate_reference_peptidome.py
+
+mkdir -p peptidome.homo_sapiens; cd peptidome.homo_sapiens
+for k in 8 9 10 11; do
+  wget "https://github.com/Benjamin-Vincent-Lab/NeoSplice/raw/master/Reference_peptidome/reference_peptidome_${k}.txt.gz"
+done
+cd "$HUMAN_REFERENCES_PATH"
+
+# CTA/Self-antigen reference
+mkdir -p cta_self; cd cta_self
+wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/5a9786203497b90c0cc0c0a6a251399b/cta_and_self_antigen.homo_sapiens.gene_list
+cd "$HUMAN_REFERENCES_PATH"
+
+# STARFusion reference
+# Note: This file is quite large (31G), so ensure you have sufficient storage.
+mkdir -p starfusion; cd starfusion
+wget https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.10/GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz
+tar -xvf GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz
+cd GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play
+mv ctat_genome_lib_build_dir/* .; rm -rf ctat_genome_lib_build_dir/; cd ..
+rm -rf GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz
+cd "$HUMAN_REFERENCES_PATH"
+
+# ERV reference
+mkdir -p erv; cd erv
+wget http://geve.med.u-tokai.ac.jp/download_data/table/Hsap38.txt.bz2
+bzip2 -d Hsap38.txt.bz2
+cd "$HUMAN_REFERENCES_PATH"
+
+# TCGA external reference
+mkdir -p tcga; cd tcga
+# Install numpy only when it is not already importable.
+python3 -c 'import numpy' 2>/dev/null || python3 -m pip install numpy --user
+wget https://toil-xena-hub.s3.us-east-1.amazonaws.com/download/tcga_rsem_isoform_tpm.gz
+wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/5e315a7217ff68ee2ced894e8a4a7246/tissue_source_site_codes
+wget https://gitlab.com/landscape-of-effective-neoantigens-software/tcga2lens/-/raw/0e4ac67007b5e77b151162465b44003f555951a4/tcga2lens.py
+python3 tcga2lens.py summarize-transcript-expression --tx-file tcga_rsem_isoform_tpm.gz --tumor-type-map tissue_source_site_codes --output tcga_transcript_tpm_summary.tsv
+cd "$HUMAN_REFERENCES_PATH"
diff --git a/bin/lens/download_mouse_references.sh b/bin/lens/download_mouse_references.sh
new file mode 100644
index 00000000..e7c38bda
--- /dev/null
+++ b/bin/lens/download_mouse_references.sh
@@ -0,0 +1,112 @@
+#!/usr/bin/env bash
+#
+# Download and build the mouse (GRCm38 / GENCODE vM25) LENS references into
+# $RAFT_PATH/references/mus_musculus.
+#
+# Environment:
+#   RAFT_PATH  RAFT directory holding references/ (default: ~/raft).
+set -euo pipefail
+
+export RAFT_PATH="${RAFT_PATH:-$HOME/raft}"
+export REFERENCES_PATH="$RAFT_PATH/references"
+mkdir -p "$REFERENCES_PATH"
+cd "$REFERENCES_PATH"
+
+mkdir -p mus_musculus; cd mus_musculus
+export MOUSE_REFERENCES_PATH="$REFERENCES_PATH/mus_musculus"
+
+# Genomic reference
+mkdir -p fasta; cd fasta
+wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/GRCm38.primary_assembly.genome.fa.gz
+gunzip GRCm38.primary_assembly.genome.fa.gz
+cd "$MOUSE_REFERENCES_PATH"
+
+# Protein reference
+# Kept in its own protein/ directory, matching the human reference layout.
+mkdir -p protein; cd protein
+wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.pc_translations.fa.gz
+gunzip gencode.vM25.pc_translations.fa.gz
+cd "$MOUSE_REFERENCES_PATH"
+
+# GTF/GFF3
+mkdir -p annot; cd annot
+wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gtf.gz
+wget http://geve.med.u-tokai.ac.jp/download_data/gtf_m/Mmus38.geve.m_v1.gtf.bz2
+bzip2 -d Mmus38.geve.m_v1.gtf.bz2
+zcat gencode.vM25.annotation.gtf.gz | grep -v chrMG > gencode.vM25.annotation.with.mervs.gtf
+# Append the gEVE records twice, "chr"-prefixed: once with CDS rewritten to
+# transcript and once with CDS rewritten to exon.
+sed -e 's/^/chr/' -e 's/CDS/transcript/g' Mmus38.geve.m_v1.gtf >> gencode.vM25.annotation.with.mervs.gtf
+sed -e 's/^/chr/' -e 's/CDS/exon/g' Mmus38.geve.m_v1.gtf >> gencode.vM25.annotation.with.mervs.gtf
+wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gff3.gz
+gunzip gencode.vM25.annotation.gff3.gz
+cd "$MOUSE_REFERENCES_PATH"
+
+# Reference VCFs
+mkdir -p vcfs; cd vcfs
+# From https://github.com/igordot/genomics/blob/master/workflows/gatk-mouse-mm10.md
+# The accept pattern is quoted so wget, not the shell, interprets the glob.
+wget --recursive --no-parent --no-directories \
+  --accept 'vcf*vcf.gz' \
+  ftp://ftp.ncbi.nih.gov/snp/organisms/archive/mouse_10090/VCF/
+rm -f -- *Alt* *MT* *Multi* *NotOn* *Un*
+for vcf in vcf_chr_*.vcf.gz; do
+  [[ -e "$vcf" ]] || continue
+  vcf_new="${vcf%.vcf.gz}.vcf"
+  echo "$vcf"
+  zcat "$vcf" | sed 's/^\([0-9XY]\)/chr\1/' > "$vcf_new"
+  rm -fv "$vcf"
+done
+for i in *vcf; do echo "${i}"; bgzip "${i}"; done
+for i in *vcf.gz; do echo "${i}"; tabix "${i}"; done
+bcftools merge -Oz -o mm10.dbsnp.vcf.gz *vcf.gz
+rm -f -- vcf*
+cd "$MOUSE_REFERENCES_PATH"
+
+# BEDs
+# https://www.biostars.org/p/459269/#459274
+mkdir -p beds; cd beds
+zgrep 'transcript_type "protein_coding"' "$MOUSE_REFERENCES_PATH/annot/gencode.vM25.annotation.gtf.gz" \
+  | awk '($3=="exon") {printf("%s\t%s\t%s\n",$1,int($4)-1,$5);}' \
+  | sort -T . -t $'\t' -k1,1 -k2,2n \
+  | bedtools merge > mm10_exome.bed
+cd "$MOUSE_REFERENCES_PATH"
+
+# snpEff reference
+mkdir -p snpeff; cd snpeff
+docker pull resolwebio/snpeff:2.0.0
+# -dataDir must be the container-side mount point (/data); the host path in
+# $PWD is not mounted inside the container.
+docker run -v "$PWD:/data" resolwebio/snpeff:2.0.0 /opt/snpeff/snpeff/bin/snpEff download GRCm38.86 -dataDir /data
+wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/430f9d80c841721499fbcec937b0f721/snpEff.config
+cd "$MOUSE_REFERENCES_PATH"
+
+# NeoSplice reference
+mkdir -p neosplice; cd neosplice
+wget https://raw.githubusercontent.com/BWMac/NeoSplice/master/generate_reference_peptidome.py
+python3 generate_reference_peptidome.py "$MOUSE_REFERENCES_PATH/annot/gencode.vM25.annotation.gff3" "$MOUSE_REFERENCES_PATH/fasta/GRCm38.primary_assembly.genome.fa" .
+mv .peptidome_result/ peptidome.mus_musculus
+rm generate_reference_peptidome.py
+cd "$MOUSE_REFERENCES_PATH"
+
+#stopped here
+
+# CTA/Self-antigen reference
+mkdir -p cta_self; cd cta_self
+wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/7f7454717866c1a61fb505f8ac5446e0/cta_and_self_antigen.mus_musculus.gene_list
+cd "$MOUSE_REFERENCES_PATH"
+
+# STARFusion reference - waiting for response on 404 error
+# mkdir -p starfusion; cd starfusion
+# wget https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.9/Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz
+# tar -xvf Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz
+# cd Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play
+# mv ctat_genome_lib_build_dir/* .; rm -rf ctat_genome_lib_build_dir/; cd ..
+# rm -rf Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz
+# cd $MOUSE_REFERENCES_PATH
+
+# ERV reference
+mkdir -p erv; cd erv
+wget http://geve.med.u-tokai.ac.jp/download_data/table/Mmus38.txt.bz2
+bzip2 -d Mmus38.txt.bz2
+cd "$MOUSE_REFERENCES_PATH"
diff --git a/bin/lens/mirror-lens-references.sh b/bin/lens/mirror-lens-references.sh
new file mode 100644
index 00000000..dd84ed06
--- /dev/null
+++ b/bin/lens/mirror-lens-references.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# Refer to the Running LENS instructions and set up your environment accordingly before executing this script:
+# https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/Running-LENS#populate-raft-global-references-directory
+
+# This script assumes that `AWS_PROFILE` has been set to an AWS CLI
+# profile that has write-access to the `s3://sage-igenomes` bucket
+# (or using any other means of authenticating with the AWS CLI).
+
+# Downloading the LENS reference files will temporarily take up ~72.1 GB of disk space.
+
+set -euo pipefail
+
+# Prefixes (relative to the references directory) to mirror. No trailing
+# slash here: the sync paths below append it.
+prefixes=(
+  "antigen.garnish"
+  # "erv"
+  "homo_sapiens"
+  # "mus_musculus"
+  "viral"
+)
+
+# The download scripts live next to this script, not in the working directory.
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Point the download scripts' RAFT_PATH at the temporary ./lens directory so
+# the references land in ./lens/references.
+work_dir="$PWD/lens"
+export RAFT_PATH="$work_dir"
+
+echo "Downloading LENS reference files"
+mkdir -p "$RAFT_PATH/references"
+bash "$script_dir/download_general_references.sh"
+bash "$script_dir/download_human_references.sh"
+# bash "$script_dir/download_mouse_references.sh"
+
+for prefix in "${prefixes[@]}"; do
+  echo "Syncing $prefix/..."
+  aws s3 --region us-east-1 sync "$RAFT_PATH/references/$prefix/" "s3://sage-igenomes/LENS/$prefix/" --acl public-read
+done
+rm -r "$work_dir"