-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_algorithms.sh
109 lines (80 loc) · 3.28 KB
/
run_algorithms.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/bin/bash
# Runs MEME and STREME on every *.txt dataset found in $dataDir.
# Each dataset is processed 11 times (i = 0..10); the text report of every
# run is written to $resultsDir/meme/<dataset>_meme<i>.txt and
# $resultsDir/streme/<dataset>_streme<i>.txt respectively.
projectDir="$HOME/emdproject"
dataDir="$projectDir/datasets/ECRDB61B/margin200"
resultsDir="$projectDir/results"
bgmodel="$projectDir/rsat_meme_bg_ecoli_complete_genome.txt"

# Stop early if the dataset directory is missing; otherwise the glob below
# would be evaluated in whatever directory the script was started from.
cd "$dataDir" || { echo "cannot cd to $dataDir" >&2; exit 1; }
pwd

# The redirections below fail if the per-tool result directories are absent.
mkdir -p "$resultsDir/meme" "$resultsDir/streme" || exit 1

for file in *.txt
do
  # An unmatched glob is left as the literal string "*.txt"; skip it.
  [ -e "$file" ] || continue

  # Dataset name without the ".txt" suffix.
  filename=${file%.txt}
  echo "Processing $file ..."

  # Counting loop written without brace expansion so that it also works when
  # the script is started explicitly through a plain POSIX sh.
  i=0
  while [ "$i" -le 10 ]
  do
    # MEME's maximum motif width and STREME's seed both take the values
    # 10..20. Computed with arithmetic expansion: a bare 10+$i would be
    # passed to the tools as the literal text "10+0", "10+1", ...
    offset=$((10 + i))
    meme "$dataDir/$file" -dna -maxsize 100000000 -bfile "$bgmodel" -time 3600 -p 4 -mod tcm -minsites 1 -nostatus -nmotifs 5 -text -maxw "$offset" > "$resultsDir/meme/${filename}_meme$i.txt"
    streme --p "$dataDir/$file" -bfile "$bgmodel" --verbosity 1 --dna --totallength 100000000 --time 14400 --w 15 --seed "$offset" --thresh 0.05 --text > "$resultsDir/streme/${filename}_streme$i.txt"
    i=$((i + 1))
  done
done
echo "Done"
# MotifSampler
# We made the following adjustments to the default parameter values.
# We searched for five different motifs of width fifteen, and the number of
# repeated runs was set to five. The background frequency model was
# generated using the intergenic region sequences of the whole E. coli
# genome, and a third-order Markov model was used. We used the consensus
# score as the statistical measure for the quality of the predicted motifs.
#cd /home/spyros/metaptyxiako/2nd_semester/Algorithms_in_Molecular_Biology/ALGORITHMS/MotifSampler
# Abort if the dataset directory is missing; otherwise the conversion loop
# and MotifSampler would run against the wrong working directory.
cd "$projectDir/datasets/ECRDB61B/" || { echo "cannot cd to $projectDir/datasets/ECRDB61B/" >&2; exit 1; }

# Write a .fasta copy of every margin200/*.txt file: each ">" header line is
# put on its own line (preceded by a newline), the lines that follow it are
# joined into one line, and trailing whitespace is stripped.
for file in margin200/*.txt; do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -e "$file" ] || continue
  awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' "$file" \
    | sed 's/[[:space:]]*$//' > "${file%.txt}.fasta"
done

# -n 5 / -r 5 / -w 15 correspond to the motif count, run count and motif
# width described above.
# NOTE(review): the glob expands to every .fasta file in margin200, so with
# more than one dataset only the first path directly follows -f and the rest
# become extra arguments, while -o/-m name a single output pair. Confirm this
# is intended or run MotifSampler once per file.
./MotifSampler -f margin200/*.fasta -b background.bg -n 5 -r 5 -w 15 -o output_200.gff -m matrix_200.txt
# BioProspector
# Build ecoli.fasta from ecoli.genome: the first line is kept as-is and all
# remaining lines are stripped of whitespace and joined into a single line.
#
# Two earlier attempts are kept below for reference only. They wrote to the
# same ecoli.fasta and were overwritten by the final command anyway, and the
# bare words "or" / "correct" between them were notes, not commands.
#   awk '{if(NR==1) print ">ecoli\n"$0; else print}' ecoli.genome > ecoli.fasta
# or
#   awk '/^>/ {if (seq != "") {print seq}; printf("%s ",$0); seq=""; next;} { seq = seq $0 } END { print seq }' ecoli.txt > ecoli.fasta
# correct:
sed -n '1p; 2,$ {s/\s//g;H}; ${x;s/\n//g;p}' ecoli.genome > ecoli.fasta

# Command templates with placeholder file names, kept as notes rather than
# executed (myfile.txt / yourfile.txt are not real inputs of this script):
#   check the number of lines:
#     wc -l myfile.txt
#   create a txt with 2 lines (first line, then all other lines joined):
#     sed -n '1p; 2,$ {H; $ {g;s/\n//g;p;q}}' yourfile.txt > newfile.txt

# Convert every BioProspector input file to a two-line .fasta file, written
# to the current directory, and print the name of each file produced.
for file in /home/spyros/metaptyxiako/2nd_semester/Algorithms_in_Molecular_Biology/ALGORITHMS/Bioprospector/files/file_*.txt; do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -e "$file" ] || continue
  filename=$(basename -- "$file")
  # Read from the full path: the bare file name only resolves when the
  # current directory happens to be the input directory.
  sed -n '1p; 2,$ {H; $ {g;s/\n//g;p;q}}' "$file" > "${filename%.txt}.fasta"
  echo "${filename%.txt}.fasta"
done
# To run BioProspector
# Runs BioProspector 10 times on every txt file in the margin200 directory.
# The output of run <i> for dataset <name> is written to
# results_bioprospector/<name>_<i>.txt.
#
# Outputs are written straight into the results directory instead of being
# collected afterwards with "mv *.txt", because that glob would also move
# background.txt and every other .txt file in the working directory.
mkdir -p results_bioprospector || exit 1

for file in margin200/*.txt; do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -e "$file" ] || continue

  # Get the filename without the directory and without the extension
  filename=$(basename -- "$file")
  filename="${filename%.*}"

  # 10 iterations, counted without brace expansion so the loop behaves the
  # same under sh and bash.
  i=1
  while [ "$i" -le 10 ]; do
    output="results_bioprospector/${filename}_${i}.txt"
    ./BioProspector.linux -i "$file" -f background.txt -W 15 -o "$output"
    i=$((i + 1))
  done
done
# To run MDscan
# Runs MDscan 10 times on every txt file in the margin200 directory.
# The output of run <i> for dataset <name> is written to
# results_md/<name>_<i>.txt.
#
# Outputs are written straight into the results directory instead of being
# collected afterwards with "mv *.txt", because that glob would also move
# background.txt and every other .txt file in the working directory.
mkdir -p results_md || exit 1

for file in margin200/*.txt; do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -e "$file" ] || continue

  # Get the filename without the directory and without the extension
  filename=$(basename -- "$file")
  filename="${filename%.*}"

  # 10 iterations, counted without brace expansion so the loop behaves the
  # same under sh and bash.
  i=1
  while [ "$i" -le 10 ]; do
    output="results_md/${filename}_${i}.txt"
    # NOTE(review): -s is given without a value and is directly followed by
    # -o; confirm against the MDscan usage text that -s is a plain switch.
    ./MDscan.linux -i "$file" -f background.txt -w 15 -s -o "$output"
    i=$((i + 1))
  done
done