-
Notifications
You must be signed in to change notification settings - Fork 0
/
step_04.R
116 lines (99 loc) · 5.13 KB
/
step_04.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# A script that reduces to genes with either sufficient variation or average
# expression level. Uses a threshold on A and/or SD.
# Thresholds comes in as a variables from the JSON input file.
args = base::commandArgs(trailingOnly = TRUE)
print(args)
path2_json_file = args[1]
# **********************************************************************
# Load the necessary libraries
print("Loading libraries")
options(stringsAsFactors = FALSE)
options(bitmapType='quartz')
library(jsonlite)
library(ggplot2)
# Read in input files:
# JSON input file with SD and AVG thresholds
print("*** Reading the input files ***")
json = read_json(path2_json_file)
# Extracting the file paths from the JSON file
parent_folder = json$folders$output_folder
experiment = json$experiment_name
input_Z = file.path(parent_folder, "results", paste0(experiment, "_Z_normalized.txt"))
path2_count_means = file.path(parent_folder, "results", paste0(experiment, "_genecounts_means.txt"))
path2_count_sd = file.path(parent_folder, "results", paste0(experiment, "_genecounts_sd.txt"))
# Extracting the mean and sd thresholds
# If they do not exist in the json, set them at 0.25%
# mean
if (is.na(json$"input_variables"$"mean_precentage_threshold")){
mean_thr = 0.25
} else if (!is.numeric(json$"input_variables"$"mean_precentage_threshold")){
print("Invalid mean percentage threshold. Setting default threshold = 0.25")
mean_thr = 0.25
} else if (json$"input_variables"$"mean_precentage_threshold" <0 ){
print("Percentage mean threshold should be >0 and <1. Setting default threshold = 0.25")
mean_thr = 0.25
} else if (json$"input_variables"$"mean_precentage_threshold" >1 ){
print("Percentage mean threshold should be >0 and <1. Setting default threshold = 0.25")
mean_thr = 0.25
} else {mean_thr = json$"input_variables"$"mean_precentage_threshold"}
# sd
if (is.na(json$"input_variables"$"sd_precentage_threshold")){
sd_thr = 0.25
} else if (!is.numeric(json$"input_variables"$"sd_precentage_threshold")){
print("Invalid sd percentage threshold. Setting default threshold = 0.25")
sd_thr = 0.25
} else if (json$"input_variables"$"sd_precentage_threshold" <0 ){
print("Percentage sd threshold should be >0 and <1. Setting default threshold = 0.25")
sd_thr = 0.25
} else if (json$"input_variables"$"sd_precentage_threshold" >1 ){
print("Percentage sd threshold should be >0 and <1. Setting default threshold = 0.25")
sd_thr = 0.25
} else {sd_thr = json$"input_variables"$"sd_precentage_threshold"}
# Read in the Z table, avg, and sd tables
print("*** Loading the normalized matrix, sd, and mean matrix from previous step ***")
Z = as.matrix(read.table(input_Z, sep = '\t', header = TRUE, row.names = 1,check.names=FALSE))
raw_means = as.matrix(read.table(path2_count_means, sep = '\t', row.names = 1,header=TRUE))
raw_sd = as.matrix(read.table(path2_count_sd, sep = '\t', row.names = 1, header=TRUE))
mean_quantile <- quantile(raw_means[,1] , probs = mean_thr)
sd_quantile <- quantile(raw_sd[,1], probs = sd_thr)
mean_subset <- subset(raw_means, raw_means[,1]>mean_quantile)
sd_subset <- subset(raw_sd, raw_sd[,1] > sd_quantile)
# Subsetting the data set for a min average and variance in expression levels
gene_id_mn = intersect(rownames(Z), rownames(mean_subset))
Z = Z[gene_id_mn,]
stopifnot(rownames(Z) == rownames(mean_subset))
gene_id_sd = intersect(rownames(Z), rownames(sd_subset))
Z = Z[gene_id_sd,]
gene_ids = c(gene_id_mn, gene_id_sd)
gene_ids = unique(gene_ids)
stopifnot(rownames(Z) == (rownames(gene_ids)))
outputfile_path = file.path(parent_folder, "results", paste0(experiment , "_Z_threshold.txt"))
write.table(Z, file = outputfile_path, sep = '\t',col.names=NA,row.names=TRUE,quote=FALSE)
# Plots for the final report:
figure4 = file.path(parent_folder, "figures", paste0(experiment, "sd_histogram.png"))
png(figure4)
ggplot()+
geom_histogram(aes(log(raw_sd)), binwidth = 0.1, col ="black", fill = "white")+
geom_vline(xintercept = log(sd_quantile), linetype = "dashed", col = 'blue')+
labs(title = "Est. counts standard deviation threshold",
caption = "Histogram of the standard deviation of the raw counts. The blue line represents the threshold used for filtering in step 04.")+
xlab("Standard deviation (log scale)")+
ylab("counts")
dev.off()
figure5 = file.path(parent_folder, "figures", paste0(experiment, "mean_histogram.png"))
png(figure5)
ggplot()+
geom_histogram(aes(log(raw_means)), binwidth = 0.1, col ="black", fill = "white")+
geom_vline(xintercept = log(sd_quantile), linetype = "dashed", col = 'blue')+
labs(title = "Est. counts means threshold",
caption = "Histogram of the means of the raw counts. The blue line represents the threshold used for filtering in step 04.")+
xlab("Mean (log scale)")+
ylab("counts")
dev.off()
# Save the file paths into the json copy for the final report:
path_2_json_copy = file.path(parent_folder, "results", paste0(experiment, "_json_copy.json"))
json_copy <- read_json(path_2_json_copy)
json_copy$path_2_results$Z_threshold = as.character(outputfile_path)
json_copy$figures$sd_histogram = as.character(figure4)
json_copy$figures$mean_histogram = as.character(figure5)
write_json(json_copy, path_2_json_copy, auto_unbox = TRUE)