Skip to content

Commit

Permalink
bump v1.6.0; update ITS databases
Browse files Browse the repository at this point in the history
  • Loading branch information
Jon Palmer committed Mar 24, 2023
1 parent 28d817d commit 1eb9631
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 47 deletions.
2 changes: 1 addition & 1 deletion amptk/__version__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
VERSION = (1, 5, 5)
VERSION = (1, 6, 0)

__version__ = ".".join(map(str, VERSION))
2 changes: 1 addition & 1 deletion amptk/downloads.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
"LSU": "https://osf.io/sqn5r/download?version=4",
"COI": "https://osf.io/pax79/download?version=5",
"PR2": "https://osf.io/6hjdq/download?version=2"
}
}
10 changes: 10 additions & 0 deletions amptk/downloadsv1.6.0.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"ITS": [
"https://osf.io/kse82/download?version=2",
"https://osf.io/mx5gd/download?version=2"
],
"16S": "https://osf.io/m7v5q/download?version=4",
"LSU": "https://osf.io/sqn5r/download?version=4",
"COI": "https://osf.io/pax79/download?version=5",
"PR2": "https://osf.io/6hjdq/download?version=2"
}
103 changes: 75 additions & 28 deletions amptk/install.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/usr/bin/env python

from __future__ import (absolute_import, division,
print_function, unicode_literals)
from __future__ import absolute_import, division, print_function, unicode_literals
import sys
import os
import argparse
Expand All @@ -10,69 +9,117 @@
import json
import requests
import shutil
import subprocess
from amptk import amptklib

try:
from urllib.request import urlopen
except ImportError:
from urllib2 import urlopen


class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
def __init__(self,prog):
super(MyFormatter,self).__init__(prog,max_help_position=50)
def __init__(self, prog):
super(MyFormatter, self).__init__(prog, max_help_position=50)


def main(args):
parser=argparse.ArgumentParser(prog='amptk-install.py',
description='''Script to download preformatted databases''',
parser = argparse.ArgumentParser(
prog="amptk-install.py",
description="""Script to download preformatted databases""",
epilog="""Written by Jon Palmer (2019) [email protected]""",
formatter_class=MyFormatter)
parser.add_argument('-i','--input', nargs='+', required=True, choices=['ITS', '16S', 'LSU', 'COI', 'PR2'], help='Install Databases')
parser.add_argument('-f','--force', action='store_true', help='Overwrite existing databases')
parser.add_argument('-l','--local', action='store_true', help='Use local downloads.json for links')
args=parser.parse_args(args)
formatter_class=MyFormatter,
)
parser.add_argument(
"-i",
"--input",
nargs="+",
required=True,
choices=["ITS", "16S", "LSU", "COI", "PR2"],
help="Install Databases",
)
parser.add_argument(
"-f", "--force", action="store_true", help="Overwrite existing databases"
)
parser.add_argument(
"-l", "--local", action="store_true", help="Use local downloads.json for links"
)
args = parser.parse_args(args)

parentdir = os.path.join(os.path.dirname(amptklib.__file__))

# download the links from GitHub to get the most recent databases
if not args.local:
try:
print('Retrieving download links from GitHub Repo')
URL = json.loads(requests.get("https://raw.githubusercontent.com/nextgenusfs/amptk/master/amptk/downloads.json").text)
print("Retrieving download links from GitHub Repo")
URL = json.loads(
requests.get(
"https://raw.githubusercontent.com/nextgenusfs/amptk/master/amptk/downloadsv1.6.0.json"
).text
)
except:
print('Unable to download links from GitHub, using funannotate version specific links')
with open(os.path.join(os.path.dirname(__file__), 'downloads.json')) as infile:
print(
"Unable to download links from GitHub, using funannotate version specific links"
)
with open(
os.path.join(os.path.dirname(__file__), "downloadsv1.6.0.json")
) as infile:
URL = json.load(infile)
else:
with open(os.path.join(os.path.dirname(__file__), 'downloads.json')) as infile:
with open(
os.path.join(os.path.dirname(__file__), "downloadsv1.6.0.json")
) as infile:
URL = json.load(infile)

for x in args.input:
udbfile = os.path.join(parentdir, 'DB', x+'.udb')
udbfile = os.path.join(parentdir, "DB", x + ".udb")
if os.path.isfile(udbfile):
if not args.force:
print("A formated database was found, to overwrite use '--force'. You can add more custom databases by using the `amptk database` command.")
print(
"A formated database was found, to overwrite use '--force'. You can add more custom databases by using the `amptk database` command."
)
sys.exit(1)
#download
# download
if not x in URL:
if args.force:
continue
print("%s not valid, choices are ITS, 16S, LSU, COI, PR2" % x)
sys.exit(1)
print("Downloading %s pre-formatted database" % x)
# getting where some files need to be split, so check here if is a list or not
# list of tar.gz files must be in proper order
address = URL.get(x)
if not os.path.isfile(x+'.amptk.tar.gz'):
amptklib.download(address, x+'.amptk.tar.gz')
tfile = tarfile.open(x+'.amptk.tar.gz', 'r:gz')
if isinstance(address, list):
dloads = []
for i, addy in enumerate(address):
dloadname = "{}.part{}.tar.gz".format(x, i + 1)
if not os.path.isfile(dloadname):
amptklib.download(addy, dloadname)
dloads.append(dloadname)
concat_cmd = ["cat"] + dloads
with open(x + ".amptk.tar.gz", "wb") as outfile:
subprocess.call(concat_cmd, stdout=outfile)
for f in dloads:
os.remove(f)
elif isinstance(address, str):
if not os.path.isfile(x + ".amptk.tar.gz"):
amptklib.download(address, x + ".amptk.tar.gz")
# now extract and install
tfile = tarfile.open(x + ".amptk.tar.gz", "r:gz")
tfile.extractall(x)
for file in os.listdir(x):
shutil.move(os.path.join(x,file), os.path.join(parentdir, 'DB', file))
shutil.move(os.path.join(x, file), os.path.join(parentdir, "DB", file))
shutil.rmtree(x)
os.remove(x+'.amptk.tar.gz')
print('Extracting FASTA files for {:}'.format(x))
extracted = os.path.join(parentdir, 'DB', x+'.extracted.fa')
cmd = ['vsearch', '--udb2fasta', udbfile, '--output', extracted]
os.remove(x + ".amptk.tar.gz")
print("Extracting FASTA files for {:}".format(x))
extracted = os.path.join(parentdir, "DB", x + ".extracted.fa")
cmd = ["vsearch", "--udb2fasta", udbfile, "--output", extracted]
amptklib.runSubprocess5(cmd)
print("{:} taxonomy database installed to {:}".format(x, os.path.join(parentdir, 'DB')))
print(
"{:} taxonomy database installed to {:}".format(
x, os.path.join(parentdir, "DB")
)
)


if __name__ == "__main__":
Expand Down
33 changes: 16 additions & 17 deletions docs/taxonomy.rst
Original file line number Diff line number Diff line change
Expand Up @@ -86,31 +86,30 @@ Taxonomy databases are built with the ``amptk database`` command. This command

**Fungal ITS DB**

These databases were created from Unite v8.0, first downloading two databases from the UNITE website. First the General FASTA release of the DB `here <https://unite.ut.ee/sh_files/sh_general_release_28.06.2017.zip>`_, and `here <https://unite.ut.ee/sh_files/sh_general_release_s_28.06.2017.zip>`_. Then the Full UNITE+INSD database `here <https://unite.ut.ee/sh_files/UNITE_public_28.06.2017.fasta.zip>`_. For the general FASTA releases, the 'developer' fasta files are used. The taxonomy information is then reformated and databases produced as follows:
These databases were created from UNITE v9.3 (March 2023) by first downloading two databases from the `UNITE website <https://unite.ut.ee/repository.php>`_: the General FASTA release of the DB and the complete UNITE+INSD database. For the general FASTA releases, the 'developer' fasta files are used. The taxonomy information is then reformatted and the databases are produced as follows:

.. code-block:: none
#Create full length ITS USEARCH Database, convert taxonomy, and create USEARCH database
amptk database -i UNITE_public_all_02.02.2019.fasta -f ITS1-F -r ITS4 \
--primer_required none -o ITS --create_db usearch --install --source UNITE:8.0
# Create full length ITS USEARCH Database, convert taxonomy, and create USEARCH database
amptk database -i UNITE_public_all_29.11.2022.fasta -f ITS1-F -r ITS4 --derep_fulllength \
--primer_required none -o ITS --create_db usearch --install --source UNITE:9.3
#create SINTAX database
amptk database -i sh_general_release_dynamic_all_02.02.2019_dev.fasta \
# create SINTAX databases
amptk database -i sh_general_release_dynamic_s_all_29.11.2022_dev.fasta \
-o ITS_SINTAX --create_db sintax -f ITS1-F -r ITS4 --derep_fulllength \
--install --source UNITE:8.0 --primer_required none
--install --source UNITE:9.3 --primer_required none
#Create UTAX Databases
amptk database -i sh_general_release_dynamic_all_02.02.2019_dev.fasta \
-o ITS_UTAX --create_db utax -f ITS1-F -r ITS4 \
--derep_fulllength --install --source UNITE:8.0 --primer_required none
amptk database -i sh_general_release_dynamic_s_all_29.11.2022_dev.fasta \
-o ITS1_SINTAX --create_db sintax -f ITS1-F -r ITS2 --derep_fulllength \
--install --source UNITE:9.3 --primer_required rev
amptk database -i sh_general_release_dynamic_all_02.02.2019_dev.fasta \
-o ITS1_UTAX -f ITS1-F -r ITS2 --primer_required rev --derep_fulllength \
--create_db utax --install --subsample 65000 --source UNITE:8.0
amptk database -i sh_general_release_dynamic_s_all_29.11.2022_dev.fasta \
-o ITS2_SINTAX --create_db sintax -f fITS7 -r ITS4 --derep_fulllength \
--install --source UNITE:9.3 --primer_required for
# the OSF repository limits file size, so tar/gzip the databases and split the archive into parts of up to 4 GB (macOS)
tar cvzf - ITS.udb* ITS_SINTAX.udb* ITS1_SINTAX.udb* ITS2_SINTAX.udb* | split -b 4000m - ITS.amptk.tar.gz.
amptk database -i sh_general_release_dynamic_all_02.02.2019_dev.fasta \
-o ITS2_UTAX --create_db utax -f fITS7 -r ITS4 --derep_fulllength \
--install --source UNITE:8.0 --primer_required for
**Arthropod/Chordate mtCOI DB**

Expand Down

0 comments on commit 1eb9631

Please sign in to comment.