-
Notifications
You must be signed in to change notification settings - Fork 2
/
config.yaml
85 lines (70 loc) · 2.55 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# Input datasets and output location.
train_set: "path/to/train/set"  # UK dataset
# Additional testing sets.
# NOTE(review): `__set_name__` looks like a placeholder for the name of
# the external set, mapped to that set's path - confirm against the code
# that reads this file.
external_set:
  __set_name__: "path/to/external/set"  # Africa dataset
metadata: "path/to/train/set/metadata"  # UK metadata
results_dir: "path/to/where/you/want/your/results"
# Subtypes to train on separately.
subtype:
  - "B"    # train on UK B, test on UK C
  - "C"    # train on UK C, test on UK B
  - "ALL"  # train on UK B and C, test on the external test sets
# Models to train.
model:
  # Python scikit-learn models
  - "RF"        # RandomForestClassifier
  - "Bayes"     # MultinomialNB
  - "Logistic"  # LogisticRegressionCV
  # Custom Fisher classifiers.
  # "Bonf" = Bonferroni correction, "BH" = Benjamini-Hochberg.
  # The trailing number is how many significant mutations a sample must
  # have in order to be classified as positive.
  - "FisherBonf1"
  - "FisherBH1"
  - "FisherBonf2"
  - "FisherBH2"
# Categorical target features to classify on; each entry is the name of
# a feature in the dataset.
target:
  - "encoded_label"  # RTI status
  - "hasDRM"         # presence / absence of known DRMs
# Parameters for the models to train, grouped by model name.
# The available parameters are listed in the scikit-learn documentation.
# NOTE(review): presumably each group is passed as keyword arguments to
# the matching scikit-learn estimator - confirm against the loading code.
parameters:
  RF:
    n_jobs: 4
    n_estimators: 5000
  Logistic:
    n_jobs: 4
    cv: 10
    Cs: 100
    penalty: "l1"
    multi_class: "multinomial"
    solver: "saga"
    scoring: "balanced_accuracy"
# How many times the final models are trained on the whole training data.
num_final_repeats: 20

# Whether to remove features corresponding to subtype reference AAs from
# the training set.
remove_consensus: true
# Which mutations to remove from the sequences (case-insensitive):
#   'SDRM'        removes all surveillance DRMs
#   'DRM'         removes non-SDRM DRMs
#   'ALL'         removes all DRMs
#   'ACCESSORY'   removes accessory DRMs
#   'STANDALONE'  removes non-accessory DRMs
#   'NRTI'        removes NRTI-caused DRMs
#   'NNRTI'       removes NNRTI-caused DRMs
#   'OTHER'       removes non-NRTI/NNRTI-caused DRMs
# Any other value removes no features.
remove_drms: "SDRM"
# Which sequences to remove from the dataset (case-insensitive):
#   'DRM'     removes sequences that have at least 1 known DRM
#   'NO DRM'  removes sequences that have no known DRM
# Any other value removes no sequences.
remove_sequences: "None"
# Whether to remove naive sequences with at least 1 DRM from the training
# set and put them in the testing set.
remove_naive_DRMS: false

# Whether to remove gaps, STOP codons and X AAs.
deep_clean: true

# Whether to use subsampling to balance labels in the dataset.
balance: false

# Metric used to evaluate the classifiers.
metric: "balanced"