Skip to content

Commit

Permalink
Merge pull request #198 from Sage-Bionetworks-Workflows/bwmac/IBCDPE-…
Browse files Browse the repository at this point in the history
…575/lens_ref_files

[IBCDPE-587] adds bash script for syncing LENS reference files
  • Loading branch information
BWMac authored Jul 7, 2023
2 parents 85f0623 + 58a2e6d commit 50917ec
Show file tree
Hide file tree
Showing 4 changed files with 283 additions and 0 deletions.
28 changes: 28 additions & 0 deletions bin/lens/download_general_references.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash
#
# Downloads the species-independent LENS reference files into
# $REFERENCES_PATH: the viral references, the antigen.garnish data directory,
# the BLASTP binary placed inside it, and an empty `dummy_file`.
#
# Requires: wget, curl, tar, gunzip.

export RAFT_PATH=~/raft
export REFERENCES_PATH=$RAFT_PATH/references

#######################################
# Create a directory if needed and make it the working directory.
# Exits the script on failure so that later downloads never land in (and the
# later `rm -rf` never runs from) an unexpected directory.
# Arguments: $1 - directory to create and enter
#######################################
enter_dir() {
  if ! mkdir -p -- "$1" || ! cd -- "$1"; then
    printf 'error: cannot enter directory: %s\n' "$1" >&2
    exit 1
  fi
}

enter_dir "$REFERENCES_PATH"

# Viral reference
enter_dir "$REFERENCES_PATH/viral"
wget https://github.com/dmarron/virdetect/raw/master/reference/virus_masked_hg38.fa
wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/4dedb99984857905ee96ab1d148d7863/virdetect.cds.gff.gz
wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/ad52c657d06c12d7a3346f15b71390af/virus.cds.fa.gz
wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/9e3a49921bd325caa98dcd9211f8cdd9/virus.pep.fa.gz
# Only the *.gz downloads are compressed; a bare `gunzip *` would also be
# handed the plain FASTA and report an error for it.
gunzip -- *.gz
enter_dir "$REFERENCES_PATH"

# antigen.garnish data directory
# The tarball extracts into ./antigen.garnish, so no mkdir and cd required.
curl -fsSL "https://s3.amazonaws.com/get.rech.io/antigen.garnish-2.3.0.tar.gz" | tar -xvz -f -
chmod -R 700 antigen.garnish

# BLASTP binary (for antigen.garnish)
# `bin` is only a scratch directory: the binary is moved out and the rest of
# the BLAST+ distribution is deleted.
enter_dir "$REFERENCES_PATH/bin"
wget https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.13.0/ncbi-blast-2.13.0+-x64-linux.tar.gz
tar xvf ncbi-blast-2.13.0+-x64-linux.tar.gz
mv ncbi*/bin/blastp "$REFERENCES_PATH/antigen.garnish"
enter_dir "$REFERENCES_PATH"
rm -rf bin

# Make dummy_file
touch dummy_file
130 changes: 130 additions & 0 deletions bin/lens/download_human_references.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#!/bin/bash
#
# Downloads and prepares the human (hg38 / GENCODE v37) LENS reference files
# under $REFERENCES_PATH/homo_sapiens. Each section below fills one
# sub-directory and then returns to $HUMAN_REFERENCES_PATH.
#
# Requires: wget, tar, bzip2, gunzip, zcat, zgrep, awk, sort, bgzip, bedtools,
# docker, python3 (with pip) and sudo.

export RAFT_PATH=~/raft
export REFERENCES_PATH=$RAFT_PATH/references
export HUMAN_REFERENCES_PATH=$REFERENCES_PATH/homo_sapiens

#######################################
# Create a directory if needed and make it the working directory.
# Exits the script on failure so that later downloads never land in (and later
# `rm -rf` calls never run from) an unexpected directory.
# Arguments: $1 - directory to create and enter
#######################################
enter_dir() {
  if ! mkdir -p -- "$1" || ! cd -- "$1"; then
    printf 'error: cannot enter directory: %s\n' "$1" >&2
    exit 1
  fi
}

# set up homo sapiens directory
enter_dir "$HUMAN_REFERENCES_PATH"

# mhcflurry
# The tarball is unpacked in a scratch directory and its models/ contents are
# moved up into mhcflurry/.
enter_dir "$HUMAN_REFERENCES_PATH/mhcflurry/tmp"
wget https://github.com/openvax/mhcflurry/releases/download/pre-2.0/models_class1_presentation.20200611.tar.bz2
tar xvf models_class1_presentation.20200611.tar.bz2
mv models/* ../
enter_dir "$HUMAN_REFERENCES_PATH"
# The scratch directory lives inside mhcflurry/, so it has to be removed by
# that path (a bare `rm -rf tmp` from here would leave it behind).
rm -rf mhcflurry/tmp


# Genomic reference
enter_dir "$HUMAN_REFERENCES_PATH/fasta"
wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta
docker pull staphb/samtools:1.13
docker run -v "$PWD":/data staphb/samtools:1.13 samtools faidx Homo_sapiens_assembly38.fasta
# EBV removal strategy from https://bioinformatics.stackexchange.com/a/14421
# Read the contig names one per line instead of word-splitting a command
# substitution, so a name containing glob characters such as `*` is never
# expanded against files in the working directory.
keep_ids=()
while IFS= read -r contig_id; do
  keep_ids+=("$contig_id")
done < <(awk '{print $1}' Homo_sapiens_assembly38.fasta.fai | grep -v chrEBV)
docker run -v "$PWD":/data staphb/samtools:1.13 samtools faidx -o Homo_sapiens.assembly38.no_ebv.fa Homo_sapiens_assembly38.fasta "${keep_ids[@]}"
# Drop the original FASTA and its index; only the EBV-free copy is kept.
rm -f -- *.fasta*
enter_dir "$HUMAN_REFERENCES_PATH"

# GTF/GFF3
enter_dir "$HUMAN_REFERENCES_PATH/annot"
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.annotation.gtf.gz
wget http://geve.med.u-tokai.ac.jp/download_data/gtf_m/Hsap38.geve.m_v1.gtf.bz2
bzip2 -d Hsap38.geve.m_v1.gtf.bz2
zcat gencode.v37.annotation.gtf.gz > gencode.v37.annotation.with.hervs.gtf
# Append the gEVE records twice, once with "CDS" rewritten to "transcript" and
# once rewritten to "exon"; every line is prefixed with "chr" in both passes.
sed -e 's/^/chr/' -e 's/CDS/transcript/g' Hsap38.geve.m_v1.gtf >> gencode.v37.annotation.with.hervs.gtf
sed -e 's/^/chr/' -e 's/CDS/exon/g' Hsap38.geve.m_v1.gtf >> gencode.v37.annotation.with.hervs.gtf
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.annotation.gff3.gz
gunzip gencode.v37.annotation.gff3.gz
enter_dir "$HUMAN_REFERENCES_PATH"

# Protein reference
enter_dir "$HUMAN_REFERENCES_PATH/protein"
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.pc_translations.fa.gz
gunzip gencode.v37.pc_translations.fa.gz
enter_dir "$HUMAN_REFERENCES_PATH"

# Reference VCFs
enter_dir "$HUMAN_REFERENCES_PATH/vcfs"
wget https://storage.googleapis.com/gatk-best-practices/somatic-hg38/1000g_pon.hg38.vcf.gz
wget https://storage.googleapis.com/gatk-best-practices/somatic-hg38/af-only-gnomad.hg38.vcf.gz
wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf
wget https://storage.googleapis.com/gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz
# The dbSNP file is the only one downloaded uncompressed.
bgzip Homo_sapiens_assembly38.dbsnp138.vcf
enter_dir "$HUMAN_REFERENCES_PATH"


# BEDs
# https://www.biostars.org/p/459269/#459274
# Exons of protein-coding transcripts -> 0-based BED -> sorted -> merged.
enter_dir "$HUMAN_REFERENCES_PATH/beds"
zgrep 'transcript_type "protein_coding"' "$HUMAN_REFERENCES_PATH/annot/gencode.v37.annotation.gtf.gz" \
  | awk '($3=="exon") {printf("%s\t%s\t%s\n",$1,int($4)-1,$5);}' \
  | sort -T . -t $'\t' -k1,1 -k2,2n \
  | bedtools merge > hg38_exome.bed
enter_dir "$HUMAN_REFERENCES_PATH"


# snpEff reference
enter_dir "$HUMAN_REFERENCES_PATH/snpeff"
docker pull resolwebio/snpeff:latest
enter_dir "$HUMAN_REFERENCES_PATH/snpeff/GRCh38.GENCODEv37"
wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/430f9d80c841721499fbcec937b0f721/snpEff.config
cp "$HUMAN_REFERENCES_PATH/annot/gencode.v37.annotation.gtf.gz" genes.gtf.gz
gunzip genes.gtf.gz
# NOTE(review): sudo is presumably needed because the FASTA was written by the
# samtools container rather than by the current user -- confirm before
# removing it.
sudo ln "$HUMAN_REFERENCES_PATH/fasta/Homo_sapiens.assembly38.no_ebv.fa" sequences.fa
enter_dir "$HUMAN_REFERENCES_PATH/snpeff"
docker run -v "$PWD":/data -w /data resolwebio/snpeff:latest /opt/snpeff/snpeff/bin/snpEff build -gtf22 -v GRCh38.GENCODEv37 -dataDir /data -c GRCh38.GENCODEv37/snpEff.config
# The build inputs are no longer needed once the database has been built.
rm -f GRCh38.GENCODEv37/sequences.fa GRCh38.GENCODEv37/genes.gtf
enter_dir "$HUMAN_REFERENCES_PATH"


# NeoSplice reference
enter_dir "$HUMAN_REFERENCES_PATH/neosplice"
### The steps below generate a peptidome specific to a GTF and reference
### FASTA, which is ideal. The Python script is taxing though, and users may
### not be able to run the script. As an alternative, the "off-the-shelf"
### peptidome included with NeoSplice is provided by default.
### (The sed rewrites the octal literal 0777 as 0o777 for Python 3.)

# wget https://raw.githubusercontent.com/max555beyond/NeoSplice/master/generate_reference_peptidome.py
# python3 -m pip install --user pyfaidx
# python3 -m pip install --user bcbio-gff
# sed 's/os.makedirs(path, 0777)/os.makedirs(path, 0o777)/g' generate_reference_peptidome.py > generate_reference_peptidome.py.tmp
# mv generate_reference_peptidome.py.tmp generate_reference_peptidome.py
# python3 generate_reference_peptidome.py $HUMAN_REFERENCES_PATH/annot/gencode.v37.annotation.gff3 $HUMAN_REFERENCES_PATH/fasta/Homo_sapiens.assembly38.no_ebv.fa .
# mv .peptidome_result/ peptidome.homo_sapiens
# rm generate_reference_peptidome.py

enter_dir "$HUMAN_REFERENCES_PATH/neosplice/peptidome.homo_sapiens"
for peptide_length in 8 9 10 11; do
  wget "https://github.com/Benjamin-Vincent-Lab/NeoSplice/raw/master/Reference_peptidome/reference_peptidome_${peptide_length}.txt.gz"
done
enter_dir "$HUMAN_REFERENCES_PATH"

# CTA/Self-antigen reference
enter_dir "$HUMAN_REFERENCES_PATH/cta_self"
wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/5a9786203497b90c0cc0c0a6a251399b/cta_and_self_antigen.homo_sapiens.gene_list
enter_dir "$HUMAN_REFERENCES_PATH"

# STARFusion reference
# Note: This file is quite large (31G), so ensure you have sufficient storage.
enter_dir "$HUMAN_REFERENCES_PATH/starfusion"
wget https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.10/GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz
tar -xvf GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz
# Plain cd (no mkdir): if extraction failed, stop instead of moving and
# deleting things in the wrong directory.
cd GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play || exit 1
# Flatten ctat_genome_lib_build_dir/ into the library directory itself.
mv ctat_genome_lib_build_dir/* .
rm -rf ctat_genome_lib_build_dir/
cd .. || exit 1
rm -rf GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz
enter_dir "$HUMAN_REFERENCES_PATH"

# ERV reference
enter_dir "$HUMAN_REFERENCES_PATH/erv"
wget http://geve.med.u-tokai.ac.jp/download_data/table/Hsap38.txt.bz2
bzip2 -d Hsap38.txt.bz2
enter_dir "$HUMAN_REFERENCES_PATH"

# TCGA external reference
enter_dir "$HUMAN_REFERENCES_PATH/tcga"
python3 -m pip install numpy --user
wget https://toil-xena-hub.s3.us-east-1.amazonaws.com/download/tcga_rsem_isoform_tpm.gz
wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/5e315a7217ff68ee2ced894e8a4a7246/tissue_source_site_codes
wget https://gitlab.com/landscape-of-effective-neoantigens-software/tcga2lens/-/raw/0e4ac67007b5e77b151162465b44003f555951a4/tcga2lens.py
python3 tcga2lens.py summarize-transcript-expression --tx-file tcga_rsem_isoform_tpm.gz --tumor-type-map tissue_source_site_codes --output tcga_transcript_tpm_summary.tsv
enter_dir "$HUMAN_REFERENCES_PATH"
93 changes: 93 additions & 0 deletions bin/lens/download_mouse_references.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#!/bin/bash
#
# Downloads and prepares the mouse (GRCm38 / GENCODE M25) LENS reference files
# under $REFERENCES_PATH/mus_musculus. Each section below fills one
# sub-directory and then returns to $MOUSE_REFERENCES_PATH.
#
# Requires: wget, gunzip, bzip2, zcat, zgrep, awk, sort, bgzip, tabix,
# bcftools, bedtools, docker and python3.

export RAFT_PATH=~/raft
export REFERENCES_PATH=$RAFT_PATH/references
export MOUSE_REFERENCES_PATH=$REFERENCES_PATH/mus_musculus

#######################################
# Create a directory if needed and make it the working directory.
# Exits the script on failure so that later downloads never land in (and later
# `rm` calls never run from) an unexpected directory.
# Arguments: $1 - directory to create and enter
#######################################
enter_dir() {
  if ! mkdir -p -- "$1" || ! cd -- "$1"; then
    printf 'error: cannot enter directory: %s\n' "$1" >&2
    exit 1
  fi
}

enter_dir "$MOUSE_REFERENCES_PATH"


# Genomic reference
enter_dir "$MOUSE_REFERENCES_PATH/fasta"
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/GRCm38.primary_assembly.genome.fa.gz
gunzip GRCm38.primary_assembly.genome.fa.gz
enter_dir "$MOUSE_REFERENCES_PATH"

# Protein reference
# Use a protein/ sub-directory like the human script does; without it the
# file was dropped directly into the species root.
enter_dir "$MOUSE_REFERENCES_PATH/protein"
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.pc_translations.fa.gz
gunzip gencode.vM25.pc_translations.fa.gz
enter_dir "$MOUSE_REFERENCES_PATH"

# GTF/GFF3
enter_dir "$MOUSE_REFERENCES_PATH/annot"
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gtf.gz
wget http://geve.med.u-tokai.ac.jp/download_data/gtf_m/Mmus38.geve.m_v1.gtf.bz2
bzip2 -d Mmus38.geve.m_v1.gtf.bz2
zcat gencode.vM25.annotation.gtf.gz | grep -v chrMG > gencode.vM25.annotation.with.mervs.gtf
# Append the gEVE records twice, once with "CDS" rewritten to "transcript" and
# once rewritten to "exon"; every line is prefixed with "chr" in both passes.
sed -e 's/^/chr/' -e 's/CDS/transcript/g' Mmus38.geve.m_v1.gtf >> gencode.vM25.annotation.with.mervs.gtf
sed -e 's/^/chr/' -e 's/CDS/exon/g' Mmus38.geve.m_v1.gtf >> gencode.vM25.annotation.with.mervs.gtf
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gff3.gz
gunzip gencode.vM25.annotation.gff3.gz
enter_dir "$MOUSE_REFERENCES_PATH"


# Reference VCFs
enter_dir "$MOUSE_REFERENCES_PATH/vcfs"
# From https://github.com/igordot/genomics/blob/master/workflows/gatk-mouse-mm10.md
# The accept pattern is quoted so wget receives it verbatim instead of the
# shell expanding it against files already present in this directory.
wget --recursive --no-parent --no-directories \
  --accept 'vcf*vcf.gz' \
  ftp://ftp.ncbi.nih.gov/snp/organisms/archive/mouse_10090/VCF/
rm -f -- *Alt* *MT* *Multi* *NotOn* *Un*
# Prefix chromosome names that start with a digit, X or Y with "chr".
for vcf in vcf_chr_*.vcf.gz; do
  [[ -e "$vcf" ]] || continue # glob matched nothing
  vcf_new=${vcf/.vcf.gz/.vcf}
  echo "$vcf"
  zcat "$vcf" | sed 's/^\([0-9XY]\)/chr\1/' > "$vcf_new"
  rm -fv "$vcf"
done
for i in *vcf; do
  [[ -e "$i" ]] || continue
  echo "${i}"
  bgzip "${i}"
done
for i in *vcf.gz; do
  [[ -e "$i" ]] || continue
  echo "${i}"
  tabix "${i}"
done
# NOTE(review): per-chromosome files are more commonly combined with
# `bcftools concat`; confirm that `merge` is what is intended here.
bcftools merge -Oz -o mm10.dbsnp.vcf.gz *vcf.gz
# Removes the per-chromosome inputs (and their indexes); the combined
# mm10.dbsnp.vcf.gz does not match this pattern and is kept.
rm vcf*
enter_dir "$MOUSE_REFERENCES_PATH"

# BEDs
# https://www.biostars.org/p/459269/#459274
# Exons of protein-coding transcripts -> 0-based BED -> sorted -> merged.
enter_dir "$MOUSE_REFERENCES_PATH/beds"
zgrep 'transcript_type "protein_coding"' "$MOUSE_REFERENCES_PATH/annot/gencode.vM25.annotation.gtf.gz" \
  | awk '($3=="exon") {printf("%s\t%s\t%s\n",$1,int($4)-1,$5);}' \
  | sort -T . -t $'\t' -k1,1 -k2,2n \
  | bedtools merge > mm10_exome.bed
enter_dir "$MOUSE_REFERENCES_PATH"

# snpEff reference
enter_dir "$MOUSE_REFERENCES_PATH/snpeff"
docker pull resolwebio/snpeff:2.0.0
# -dataDir must name the path as seen inside the container. The working
# directory is mounted at /data; the host's $PWD is not mounted there, so
# pointing -dataDir at it would leave the download inside the container.
docker run -v "$PWD":/data resolwebio/snpeff:2.0.0 /opt/snpeff/snpeff/bin/snpEff download GRCm38.86 -dataDir /data
wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/430f9d80c841721499fbcec937b0f721/snpEff.config
enter_dir "$MOUSE_REFERENCES_PATH"

# NeoSplice reference
# NOTE(review): the human script's commented-out variant of this step installs
# pyfaidx and bcbio-gff with pip first; presumably they are required here too.
enter_dir "$MOUSE_REFERENCES_PATH/neosplice"
wget https://raw.githubusercontent.com/BWMac/NeoSplice/master/generate_reference_peptidome.py
python3 generate_reference_peptidome.py "$MOUSE_REFERENCES_PATH/annot/gencode.vM25.annotation.gff3" "$MOUSE_REFERENCES_PATH/fasta/GRCm38.primary_assembly.genome.fa" .
mv .peptidome_result/ peptidome.mus_musculus
rm generate_reference_peptidome.py
enter_dir "$MOUSE_REFERENCES_PATH"

#stopped here

# CTA/Self-antigen reference
enter_dir "$MOUSE_REFERENCES_PATH/cta_self"
wget https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/uploads/7f7454717866c1a61fb505f8ac5446e0/cta_and_self_antigen.mus_musculus.gene_list
enter_dir "$MOUSE_REFERENCES_PATH"

# STARFusion reference - waiting for response on 404 error
# mkdir -p starfusion; cd starfusion
# wget https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.9/Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz
# tar -xvf Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz
# cd Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play
# mv ctat_genome_lib_build_dir/* .; rm -rf ctat_genome_lib_build_dir/; cd ..
# rm -rf Mouse_gencode_M24_CTAT_lib_Apr062020.plug-n-play.tar.gz
# cd $MOUSE_REFERENCES_PATH

# ERV reference
enter_dir "$MOUSE_REFERENCES_PATH/erv"
wget http://geve.med.u-tokai.ac.jp/download_data/table/Mmus38.txt.bz2
bzip2 -d Mmus38.txt.bz2
enter_dir "$MOUSE_REFERENCES_PATH"
32 changes: 32 additions & 0 deletions bin/lens/mirror-lens-references.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash

# Downloads the LENS reference files with the download_*_references.sh
# scripts and syncs the selected sub-directories to s3://sage-igenomes/LENS/.

# Refer to the Running LENS instructions and set up your environment accordingly before executing this script:
# https://gitlab.com/landscape-of-effective-neoantigens-software/nextflow/modules/tools/lens/-/wikis/Running-LENS#populate-raft-global-references-directory

# This script assumes that `AWS_PROFILE` has been set to an AWS CLI
# profile that has write-access to the `s3://sage-igenomes` bucket
# (or using any other means of authenticating with the AWS CLI).

# Downloading the LENS reference files will temporarily take up ~72.1 GB of disk space.

# NOTE(review): the download scripts set REFERENCES_PATH to ~/raft/references
# themselves, while the sync below reads ./lens/references. Confirm that both
# resolve to the same directory in the intended setup.
# NOTE(review): the download scripts are looked up relative to ./lens, and
# ./lens is removed at the end -- confirm that deleting that directory
# (scripts included) is intended.

# Sub-directories of the references directory to mirror. Commented-out
# entries are skipped.
prefixes=(
  "antigen.garnish/"
  # "erv/"
  "homo_sapiens/"
  # "mus_musculus/"
  "viral/"
)

echo "Downloading LENS reference files"
mkdir -p "./lens/references"
# Stop here if ./lens cannot be entered; otherwise the `cd ..` / `rm -r`
# at the end would operate relative to the wrong directory.
cd ./lens || { echo "error: cannot enter ./lens" >&2; exit 1; }
# Do not sync or delete anything when a download script could not be run.
bash download_general_references.sh \
  || { echo "error: download_general_references.sh failed" >&2; exit 1; }
bash download_human_references.sh \
  || { echo "error: download_human_references.sh failed" >&2; exit 1; }
# bash download_mouse_references.sh

for prefix in "${prefixes[@]}"; do
  echo "Syncing $prefix..."
  # The entries already end in "/": strip it before appending one. S3 keys are
  # literal strings, so avoid producing a "prefix//" destination.
  aws s3 --region us-east-1 sync "./references/${prefix%/}/" "s3://sage-igenomes/LENS/${prefix%/}/" --acl public-read
done
cd .. || exit 1
rm -r "./lens"

0 comments on commit 50917ec

Please sign in to comment.