Skip to content

Commit

Permalink
Support multiple width values, one per FASTA file
Browse files Browse the repository at this point in the history
  • Loading branch information
olgatsiouri1996 committed Dec 23, 2021
1 parent 48825a6 commit 5ede056
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 3 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# biomisc [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5795517.svg)](https://doi.org/10.5281/zenodo.5795517)
# biomisc
collection of miscellaneous command line bioinformatic scripts (see the wiki page for documentation and dependencies)
21 changes: 19 additions & 2 deletions fasta_manipulation/fasta_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
import os
import argparse
from Bio import SeqIO
import pandas as pd
import sys
# input parameters
ap = argparse.ArgumentParser(description="changes the width of sequences line in 1 or many FASTA files")
ap.add_argument("-in", "--input", required=False, help="input fasta file")
ap.add_argument("-txt", "--txt", required=False, help="input txt file with 2 columns 1) file name (without extension), 2) width")
ap.add_argument("-out", "--output", required=False, help="output fasta file")
ap.add_argument("-width", "--width", required=False, type=int, default=80, help="number of characters per line. Default 80")
ap.add_argument("-pro", "--program", required=False, type=int, default=1, help="program to choose. 1) one input/output fasta file, 2) many input/output fasta files. Default is 1")
ap.add_argument("-pro", "--program", required=False, type=int, default=1, help="program to choose. 1) one input/output fasta file, 2) many input/output fasta files, 3) .txt file with fasta file names and width for each file. Default is 1")
args = vars(ap.parse_args())
# main
# create function to split the input sequence based on a specific number of characters
Expand All @@ -21,7 +23,7 @@ def split_every_width(s,w): return [s[i:i+w] for i in range(0,len(s),w)]
print(">"+record.id)
print('\n'.join(split_every_width(str(record.seq), args['width']))) # add characters in new line after the number of characters surpasses the input width
sys.stdout.close()
else:
elif args['program'] == 2:
# import each fasta file from the working directory
for filename in sorted(os.listdir(str(os.getcwd()))):
if filename.endswith(".fa") or filename.endswith(".fasta"):
Expand All @@ -31,4 +33,19 @@ def split_every_width(s,w): return [s[i:i+w] for i in range(0,len(s),w)]
print(">"+record.id)
print('\n'.join(split_every_width(str(record.seq), args['width']))) # add characters in new line after the number of characters surpasses the input width
sys.stdout.close()
else:
df = pd.read_csv(args['txt'], header=None, sep="\t")
# select ids and widths columns, convert to lists
headers = df.iloc[:,0].values.tolist()
widths = df.iloc[:,1].values.tolist()
# iterate over (file name, width) pairs, exporting one reformatted fasta file per pair
for (a,b) in zip(headers, widths):
# export to new fasta files with the user imported width value
sys.stdout = open(''.join([str(a),"_","w",str(b),".fasta"]), 'a')
for record in SeqIO.parse(''.join([str(a),".fasta"]),'fasta'):
print(">"+record.id)
print('\n'.join(split_every_width(str(record.seq), int(b)))) # add characters in new line after the number of characters surpasses the input width
sys.stdout.close()



0 comments on commit 5ede056

Please sign in to comment.