-
Notifications
You must be signed in to change notification settings - Fork 22
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #198 from Sage-Bionetworks-Workflows/bwmac/IBCDPE-…
…575/lens_ref_files [IBCDPE-587] adds bash script for syncing LENS reference files
- Loading branch information
Showing
4 changed files
with
283 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# Download the species-independent LENS reference files into
# $RAFT_PATH/references: the viral references, the antigen.garnish data
# directory (plus a BLASTP binary placed inside it), and an empty dummy_file.
# Needs wget, curl, tar and gunzip on PATH.
export RAFT_PATH=~/raft
export REFERENCES_PATH=$RAFT_PATH/references
# Stop if the references directory is missing; otherwise every download
# below would land in whatever directory the script was started from.
cd "$REFERENCES_PATH" || exit 1

# Viral reference
mkdir -p viral
cd viral || exit 1
wget https://github.com/dmarron/virdetect/raw/master/reference/virus_masked_hg38.fa
wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/4dedb99984857905ee96ab1d148d7863/virdetect.cds.gff.gz
wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/ad52c657d06c12d7a3346f15b71390af/virus.cds.fa.gz
wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/9e3a49921bd325caa98dcd9211f8cdd9/virus.pep.fa.gz
# Only the .gz downloads need decompressing; virus_masked_hg38.fa is fetched
# uncompressed, and a bare `gunzip *` reports an error for it.
gunzip ./*.gz
cd "$REFERENCES_PATH" || exit 1

# antigen.garnish data directory
# The tarball extracts to the desired directory, so no mkdir and cd required.
curl -fsSL "https://s3.amazonaws.com/get.rech.io/antigen.garnish-2.3.0.tar.gz" | tar -xvz
chmod -R 700 antigen.garnish

# BLASTP binary (for antigen.garnish)
# Unpack in a scratch bin/ directory, keep only blastp, then drop the rest.
mkdir -p bin
cd bin || exit 1
wget https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.13.0/ncbi-blast-2.13.0+-x64-linux.tar.gz
tar xvf ncbi-blast-2.13.0+-x64-linux.tar.gz
mv ncbi*/bin/blastp "$REFERENCES_PATH/antigen.garnish"
cd "$REFERENCES_PATH" || exit 1
rm -rf bin

# Make dummy_file
touch dummy_file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
# Download the human LENS reference files into
# $RAFT_PATH/references/homo_sapiens. This first part sets up the directory
# variables and fetches the mhcflurry models.
export RAFT_PATH=~/raft
export REFERENCES_PATH=$RAFT_PATH/references
# Abort rather than download into the caller's working directory.
cd "$REFERENCES_PATH" || exit 1

# set up homo sapiens directory
export HUMAN_REFERENCES_PATH=$REFERENCES_PATH/homo_sapiens
mkdir -p homo_sapiens
cd homo_sapiens || exit 1

# mhcflurry
# Unpack the model archive in a scratch directory and keep only the contents
# of its models/ directory, moved up into mhcflurry/.
mkdir -p mhcflurry/tmp
cd mhcflurry/tmp || exit 1
wget https://github.com/openvax/mhcflurry/releases/download/pre-2.0/models_class1_presentation.20200611.tar.bz2
tar xvf models_class1_presentation.20200611.tar.bz2
mv models/* ../
cd "$HUMAN_REFERENCES_PATH" || exit 1
# The scratch directory is mhcflurry/tmp; a bare `rm -rf tmp` from here
# removed nothing and left the downloaded archive behind.
rm -rf mhcflurry/tmp
|
||
|
||
# Genomic reference
# Downloads the hg38 assembly FASTA and writes Homo_sapiens.assembly38.no_ebv.fa,
# a copy containing every contig except chrEBV. samtools is run from the
# staphb/samtools:1.13 image with the current directory mounted at /data.
mkdir -p fasta
cd fasta || exit 1
wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta
docker pull staphb/samtools:1.13
docker run -v "$PWD":/data staphb/samtools:1.13 samtools faidx Homo_sapiens_assembly38.fasta
# EBV removal strategy from https://bioinformatics.stackexchange.com/a/14421
# Read one contig name per line from the .fai index so that names are never
# word-split or glob-expanded by the shell.
keep_ids=()
while IFS= read -r contig_name; do
  keep_ids+=("$contig_name")
done < <(awk '{print $1}' Homo_sapiens_assembly38.fasta.fai | grep -v chrEBV)
docker run -v "$PWD":/data staphb/samtools:1.13 samtools faidx -o Homo_sapiens.assembly38.no_ebv.fa Homo_sapiens_assembly38.fasta "${keep_ids[@]}"
# Remove the original FASTA and its index; only the no_ebv copy is kept.
rm -f -- ./*.fasta*
cd "$HUMAN_REFERENCES_PATH" || exit 1
|
||
# GTF/GFF3
# Builds gencode.v37.annotation.with.hervs.gtf: the GENCODE v37 GTF followed
# by two rewritten copies of the gEVE GTF. Both copies get "chr" prefixed to
# every line; one has CDS replaced by "transcript", the other by "exon".
# The GENCODE v37 GFF3 is also downloaded and decompressed.
mkdir -p annot
cd annot || exit 1
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.annotation.gtf.gz
wget http://geve.med.u-tokai.ac.jp/download_data/gtf_m/Hsap38.geve.m_v1.gtf.bz2
bzip2 -d Hsap38.geve.m_v1.gtf.bz2
zcat gencode.v37.annotation.gtf.gz > gencode.v37.annotation.with.hervs.gtf
sed -e 's/^/chr/g' -e 's/CDS/transcript/g' Hsap38.geve.m_v1.gtf >> gencode.v37.annotation.with.hervs.gtf
sed -e 's/^/chr/g' -e 's/CDS/exon/g' Hsap38.geve.m_v1.gtf >> gencode.v37.annotation.with.hervs.gtf
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.annotation.gff3.gz
gunzip gencode.v37.annotation.gff3.gz
cd "$HUMAN_REFERENCES_PATH" || exit 1
|
||
# Protein reference
# GENCODE v37 protein-coding translations, decompressed in protein/.
mkdir -p protein
cd protein || exit 1
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.pc_translations.fa.gz
gunzip gencode.v37.pc_translations.fa.gz
cd "$HUMAN_REFERENCES_PATH" || exit 1

# Reference VCFs
# Three of the four files arrive already compressed; the dbSNP file is
# downloaded as plain .vcf and compressed here with bgzip.
mkdir -p vcfs
cd vcfs || exit 1
wget https://storage.googleapis.com/gatk-best-practices/somatic-hg38/1000g_pon.hg38.vcf.gz
wget https://storage.googleapis.com/gatk-best-practices/somatic-hg38/af-only-gnomad.hg38.vcf.gz
wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf
wget https://storage.googleapis.com/gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz
bgzip Homo_sapiens_assembly38.dbsnp138.vcf
cd "$HUMAN_REFERENCES_PATH" || exit 1
|
||
|
||
# BEDs
# https://www.biostars.org/p/459269/#459274
# Writes hg38_exome.bed from the exon records of protein-coding transcripts
# in the GENCODE v37 GTF: chromosome, start minus one, end; sorted by
# chromosome then numeric start, and passed through `bedtools merge`.
mkdir -p beds
cd beds || exit 1
zgrep 'transcript_type "protein_coding"' "$HUMAN_REFERENCES_PATH/annot/gencode.v37.annotation.gtf.gz" \
  | awk '($3=="exon") {printf("%s\t%s\t%s\n",$1,int($4)-1,$5);}' \
  | sort -T . -t $'\t' -k1,1 -k2,2n \
  | bedtools merge > hg38_exome.bed
cd "$HUMAN_REFERENCES_PATH" || exit 1
|
||
|
||
# snpEff reference
# Builds a snpEff database named GRCh38.GENCODEv37 from the GENCODE v37 GTF
# and the EBV-free FASTA, using the resolwebio/snpeff:latest image with the
# snpeff/ directory mounted at /data.
mkdir -p snpeff
cd snpeff || exit 1
docker pull resolwebio/snpeff:latest
mkdir -p GRCh38.GENCODEv37
cd GRCh38.GENCODEv37 || exit 1
wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/430f9d80c841721499fbcec937b0f721/snpEff.config
cp "$HUMAN_REFERENCES_PATH/annot/gencode.v37.annotation.gtf.gz" genes.gtf.gz
gunzip genes.gtf.gz
# `ln` without -s makes a hard link.
# NOTE(review): presumably a hard link is used so the file resolves inside
# the container's bind mount, and sudo because the FASTA was written by a
# container running as root - confirm.
sudo ln "$HUMAN_REFERENCES_PATH/fasta/Homo_sapiens.assembly38.no_ebv.fa" sequences.fa
cd "$HUMAN_REFERENCES_PATH/snpeff" || exit 1
docker run -v "$PWD":/data -w /data resolwebio/snpeff:latest /opt/snpeff/snpeff/bin/snpEff build -gtf22 -v GRCh38.GENCODEv37 -dataDir /data -c GRCh38.GENCODEv37/snpEff.config
# The build inputs are no longer needed once the database exists.
rm -f GRCh38.GENCODEv37/sequences.fa GRCh38.GENCODEv37/genes.gtf
cd "$HUMAN_REFERENCES_PATH" || exit 1
|
||
|
||
# NeoSplice reference
mkdir -p neosplice
cd neosplice || exit 1
### The steps below generate a peptidome specific to a GTF and reference
### FASTA, which is ideal. The Python script is taxing though, and users may
### not be able to run it. As an alternative, the "off-the-shelf" peptidome
### included with NeoSplice is downloaded by default.

# NOTE(review): the sed line previously replaced `0777` with the identical
# text `0777`, i.e. it did nothing. `0o777` (the Python 3 octal literal) is
# presumably what was intended - confirm before re-enabling these steps.
# wget https://raw.githubusercontent.com/max555beyond/NeoSplice/master/generate_reference_peptidome.py
# python3 -m pip install --user pyfaidx
# python3 -m pip install --user bcbio-gff
# sed 's/os.makedirs(path, 0777)/os.makedirs(path, 0o777)/g' generate_reference_peptidome.py > generate_reference_peptidome.py.tmp
# mv generate_reference_peptidome.py.tmp generate_reference_peptidome.py
# python3 generate_reference_peptidome.py $HUMAN_REFERENCES_PATH/annot/gencode.v37.annotation.gff3 $HUMAN_REFERENCES_PATH/fasta/Homo_sapiens.assembly38.no_ebv.fa .
# mv .peptidome_result/ peptidome.homo_sapiens
# rm generate_reference_peptidome.py

# Pre-built reference peptidomes for peptide lengths 8 to 11; the files are
# left gzip-compressed.
mkdir -p peptidome.homo_sapiens
cd peptidome.homo_sapiens || exit 1
wget https://github.com/Benjamin-Vincent-Lab/NeoSplice/raw/master/Reference_peptidome/reference_peptidome_8.txt.gz
wget https://github.com/Benjamin-Vincent-Lab/NeoSplice/raw/master/Reference_peptidome/reference_peptidome_9.txt.gz
wget https://github.com/Benjamin-Vincent-Lab/NeoSplice/raw/master/Reference_peptidome/reference_peptidome_10.txt.gz
wget https://github.com/Benjamin-Vincent-Lab/NeoSplice/raw/master/Reference_peptidome/reference_peptidome_11.txt.gz
cd "$HUMAN_REFERENCES_PATH" || exit 1
|
||
# CTA/Self-antigen reference
mkdir -p cta_self
cd cta_self || exit 1
wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/5a9786203497b90c0cc0c0a6a251399b/cta_and_self_antigen.homo_sapiens.gene_list
cd "$HUMAN_REFERENCES_PATH" || exit 1

# STARFusion reference
# Note: This file is quite large (31G), so ensure you have sufficient storage.
mkdir -p starfusion
cd starfusion || exit 1
wget https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.10/GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz
tar -xvf GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz
# Flatten the archive: move the contents of ctat_genome_lib_build_dir/ up one
# level, then remove the emptied directory. The cd is checked so the mv and
# rm -rf can never run in the wrong directory.
cd GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play || exit 1
mv ctat_genome_lib_build_dir/* .
rm -rf ctat_genome_lib_build_dir/
cd .. || exit 1
rm -rf GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz
cd "$HUMAN_REFERENCES_PATH" || exit 1

# ERV reference
mkdir -p erv
cd erv || exit 1
wget http://geve.med.u-tokai.ac.jp/download_data/table/Hsap38.txt.bz2
bzip2 -d Hsap38.txt.bz2
cd "$HUMAN_REFERENCES_PATH" || exit 1
|
||
# TCGA external reference
# Downloads the TCGA isoform TPM matrix, the tissue source site code table
# and a pinned revision of tcga2lens.py, then runs its
# summarize-transcript-expression command to write
# tcga_transcript_tpm_summary.tsv. numpy is installed for the current user
# first.
mkdir -p tcga
cd tcga || exit 1
python3 -m pip install numpy --user
wget https://toil-xena-hub.s3.us-east-1.amazonaws.com/download/tcga_rsem_isoform_tpm.gz
wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/5e315a7217ff68ee2ced894e8a4a7246/tissue_source_site_codes
wget https://gitlab.com/landscape-of-effective-neoantigens-software/tcga2lens/-/raw/0e4ac67007b5e77b151162465b44003f555951a4/tcga2lens.py
python3 tcga2lens.py summarize-transcript-expression \
  --tx-file tcga_rsem_isoform_tpm.gz \
  --tumor-type-map tissue_source_site_codes \
  --output tcga_transcript_tpm_summary.tsv
cd "$HUMAN_REFERENCES_PATH" || exit 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
# Download the mouse LENS reference files into
# $RAFT_PATH/references/mus_musculus. This first part sets up the directory
# variables and fetches the genome FASTA and the protein translations.
export RAFT_PATH=~/raft
export REFERENCES_PATH=$RAFT_PATH/references
# Abort rather than download into the caller's working directory.
cd "$REFERENCES_PATH" || exit 1

export MOUSE_REFERENCES_PATH=$REFERENCES_PATH/mus_musculus
mkdir -p mus_musculus
cd mus_musculus || exit 1

# Genomic reference
mkdir -p fasta
cd fasta || exit 1
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/GRCm38.primary_assembly.genome.fa.gz
gunzip GRCm38.primary_assembly.genome.fa.gz
cd "$MOUSE_REFERENCES_PATH" || exit 1

# Protein reference
# Kept in its own protein/ directory like every other reference type here;
# without the mkdir/cd the file was written to the mus_musculus root.
mkdir -p protein
cd protein || exit 1
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.pc_translations.fa.gz
gunzip gencode.vM25.pc_translations.fa.gz
cd "$MOUSE_REFERENCES_PATH" || exit 1
|
||
# GTF/GFF3
# Builds gencode.vM25.annotation.with.mervs.gtf: the GENCODE vM25 GTF with
# every line containing "chrMG" dropped, followed by two rewritten copies of
# the gEVE GTF. Both copies get "chr" prefixed to every line; one has CDS
# replaced by "transcript", the other by "exon".
# The GENCODE vM25 GFF3 is also downloaded and decompressed.
mkdir -p annot
cd annot || exit 1
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gtf.gz
wget http://geve.med.u-tokai.ac.jp/download_data/gtf_m/Mmus38.geve.m_v1.gtf.bz2
bzip2 -d Mmus38.geve.m_v1.gtf.bz2
zcat gencode.vM25.annotation.gtf.gz | grep -v chrMG > gencode.vM25.annotation.with.mervs.gtf
sed -e 's/^/chr/g' -e 's/CDS/transcript/g' Mmus38.geve.m_v1.gtf >> gencode.vM25.annotation.with.mervs.gtf
sed -e 's/^/chr/g' -e 's/CDS/exon/g' Mmus38.geve.m_v1.gtf >> gencode.vM25.annotation.with.mervs.gtf
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gff3.gz
gunzip gencode.vM25.annotation.gff3.gz
cd "$MOUSE_REFERENCES_PATH" || exit 1
|
||
|
||
# Reference VCFs
# Downloads the per-chromosome mouse dbSNP VCFs, prefixes "chr" to records
# whose line starts with a digit, X or Y, recompresses and indexes each file,
# and merges them into mm10.dbsnp.vcf.gz. Needs bgzip, tabix and bcftools.
mkdir -p vcfs
cd vcfs || exit 1
# From https://github.com/igordot/genomics/blob/master/workflows/gatk-mouse-mm10.md
# The accept pattern is quoted so wget receives it verbatim even when
# matching files already exist in this directory.
wget --recursive --no-parent --no-directories \
  --accept 'vcf*vcf.gz' \
  ftp://ftp.ncbi.nih.gov/snp/organisms/archive/mouse_10090/VCF/
rm -- ./*Alt* ./*MT* ./*Multi* ./*NotOn* ./*Un*
# Iterate over the glob directly instead of parsing `ls` output.
for vcf in vcf_chr_*.vcf.gz; do
  [[ -e "$vcf" ]] || continue
  vcf_new=${vcf/.vcf.gz/.vcf}
  echo "$vcf"
  zcat "$vcf" | sed 's/^\([0-9XY]\)/chr\1/' > "$vcf_new"
  rm -fv -- "$vcf"
done
for plain_vcf in *vcf; do
  [[ -e "$plain_vcf" ]] || continue
  echo "${plain_vcf}"
  bgzip "${plain_vcf}"
done
for packed_vcf in *vcf.gz; do
  [[ -e "$packed_vcf" ]] || continue
  echo "${packed_vcf}"
  tabix "${packed_vcf}"
done
bcftools merge -Oz -o mm10.dbsnp.vcf.gz ./*vcf.gz
# Drop the per-chromosome files (names start with "vcf"); the merged
# mm10.dbsnp.vcf.gz does not match this pattern and is kept.
rm -- ./vcf*
cd "$MOUSE_REFERENCES_PATH" || exit 1
|
||
# BEDs
# https://www.biostars.org/p/459269/#459274
# Writes mm10_exome.bed from the exon records of protein-coding transcripts
# in the GENCODE vM25 GTF: chromosome, start minus one, end; sorted by
# chromosome then numeric start, and passed through `bedtools merge`.
mkdir -p beds
cd beds || exit 1
zgrep 'transcript_type "protein_coding"' "$MOUSE_REFERENCES_PATH/annot/gencode.vM25.annotation.gtf.gz" \
  | awk '($3=="exon") {printf("%s\t%s\t%s\n",$1,int($4)-1,$5);}' \
  | sort -T . -t $'\t' -k1,1 -k2,2n \
  | bedtools merge > mm10_exome.bed
cd "$MOUSE_REFERENCES_PATH" || exit 1
|
||
# snpEff reference
# Downloads the prebuilt GRCm38.86 snpEff database using the
# resolwebio/snpeff:2.0.0 image, plus the LENS snpEff.config.
mkdir -p snpeff
cd snpeff || exit 1
docker pull resolwebio/snpeff:2.0.0
# The current directory is mounted at /data inside the container, so the
# data directory must be given as /data. The host path in ${PWD} is not
# mounted in the container, and anything written there never reaches the host.
docker run -v "$PWD":/data resolwebio/snpeff:2.0.0 /opt/snpeff/snpeff/bin/snpEff download GRCm38.86 -dataDir /data
wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/430f9d80c841721499fbcec937b0f721/snpEff.config
cd "$MOUSE_REFERENCES_PATH" || exit 1
|
||
# NeoSplice reference
# Generates the mouse reference peptidome from the GENCODE vM25 GFF3 and the
# GRCm38 FASTA with generate_reference_peptidome.py, then renames the
# script's .peptidome_result/ output directory to peptidome.mus_musculus.
mkdir -p neosplice
cd neosplice || exit 1
wget https://raw.githubusercontent.com/BWMac/NeoSplice/master/generate_reference_peptidome.py
python3 generate_reference_peptidome.py \
  "$MOUSE_REFERENCES_PATH/annot/gencode.vM25.annotation.gff3" \
  "$MOUSE_REFERENCES_PATH/fasta/GRCm38.primary_assembly.genome.fa" \
  .
mv .peptidome_result/ peptidome.mus_musculus
rm generate_reference_peptidome.py
cd "$MOUSE_REFERENCES_PATH" || exit 1
|
||
#stopped here | ||
|
||
# CTA/Self-antigen reference
mkdir -p cta_self
cd cta_self || exit 1
wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/7f7454717866c1a61fb505f8ac5446e0/cta_and_self_antigen.mus_musculus.gene_list
cd "$MOUSE_REFERENCES_PATH" || exit 1

# STARFusion reference - disabled: the download URL returns a 404 and a
# response about it is still pending.
# mkdir -p starfusion; cd starfusion
# wget https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.9/Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz
# tar -xvf Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz
# cd Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play
# mv ctat_genome_lib_build_dir/* .; rm -rf ctat_genome_lib_build_dir/; cd ..
# rm -rf Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz
# cd $MOUSE_REFERENCES_PATH

# ERV reference
mkdir -p erv
cd erv || exit 1
wget http://geve.med.u-tokai.ac.jp/download_data/table/Mmus38.txt.bz2
bzip2 -d Mmus38.txt.bz2
cd "$MOUSE_REFERENCES_PATH" || exit 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
#!/bin/bash

# Downloads the LENS reference files and syncs the selected reference
# directories to s3://sage-igenomes/LENS/, then removes the local copy.

# Refer to the Running LENS instructions and set up your environment accordingly before executing this script:
# https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/Running-LENS#populate-raft-global-references-directory

# This script assumes that `AWS_PROFILE` has been set to an AWS CLI
# profile that has write-access to the `s3://sage-igenomes` bucket
# (or using any other means of authenticating with the AWS CLI).

# Downloading the LENS reference files will temporarily take up ~72.1 GB of disk space.

# Reference directories to upload. Entries keep their trailing slash; it is
# stripped where the paths are built so no path contains a doubled slash.
prefixes=(
  "antigen.garnish/"
  # "erv/"
  "homo_sapiens/"
  # "mus_musculus/"
  "viral/"
)

echo "Downloading LENS reference files"
mkdir -p "./lens/references"
# Abort if the working directory cannot be entered: the final `rm -r` must
# only ever run relative to the directory this script started in.
cd ./lens || exit 1
# NOTE(review): confirm that the download_*.sh scripts write into
# ./lens/references (i.e. that their RAFT_PATH resolves to ./lens); only that
# directory is synced below.
bash download_general_references.sh
bash download_human_references.sh
# bash download_mouse_references.sh

for prefix in "${prefixes[@]}"; do
  echo "Syncing $prefix..."
  # ${prefix%/} drops the entry's trailing slash; previously the source and
  # destination ended in "//", which puts an empty path component in the
  # S3 key prefix.
  aws s3 --region us-east-1 sync "./references/${prefix%/}/" "s3://sage-igenomes/LENS/${prefix%/}/" --acl public-read
done
cd .. || exit 1
rm -r "./lens"