-
Notifications
You must be signed in to change notification settings - Fork 0
/
step_06.R
200 lines (167 loc) · 8.48 KB
/
step_06.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
## A script that performs linear regression on log10 value of % Eigenvalues vs component number
## for 1->N, 2->N, until (N-2)->N and finds the meaningful PCs with a maximum threshold on the
## R-squared value for each regression. N max = variable in JSON. Default = 9.
## It adds the coordinate of individual observations for each sample on each meaningful
## PC to the design file as new columns (PC1, PC2...PCN where N = last meaningful PC)
args = base::commandArgs(trailingOnly = TRUE)
print(args)
path2_json_file = args[1]
# **********************************************************************
## Load in the necessary libraries:
print("*** Loading libraries ***")
options(stringsAsFactors = FALSE)
options(bitmapType='quartz')
library(genefilter)
library(jsonlite)
library(readr)
library(ggplot2)
# Read in input files:
# JSON input file with SD and AVG thresholds
print("*** Reading the input files ***")
json = read_json(path2_json_file)
parent_folder = json$folders$output_folder
experiment = json$experiment_name
path2_design = file.path(parent_folder, "results", paste0(experiment, "_design.txt"))
path_2_eigenvalues = file.path(parent_folder, "results", paste0(experiment, "_pca_eigenvalues.txt"))
path_2_pca = file.path(parent_folder, "results", paste0(experiment, "_pca_object.rds"))
path_2_json_copy = file.path(parent_folder, "results", paste0(experiment, "_json_copy.json"))
json_copy <- read_json(path_2_json_copy)
# Extract the R-squared value from the JSON. Default R-squared threshold = 0.95.
print("*** Extracting adjusted R-squared threshold value from the JSON ***")
if (!is.numeric(json$input_variables$R_squared_threshold)){
max_Rsqured = 0.95
} else if (json$input_variables$R_squared_threshold > 1){
max_Rsqured = 0.95
} else if (json$input_variables$R_squared_threshold <= 0){
max_Rsqured = 0.95
} else {max_Rsqured = json$input_variables$R_squared_threshold}
# Extract the max number of PC for regression. Default = 10.
print("*** Extracting the max number of PC to be used for regression ***")
if (!is.numeric(json$input_variables$max_number_PC_regression)){
max_PC_regression = 10
} else if(json$input_variables$max_number_PC_regression<1){
max_PC_regression = 10
} else {max_PC_regression = json$input_variables$max_number_PC_regression}
# Load files
design = read.table(path2_design, sep = "\t", header = TRUE, row.names = 1)
pca_eigenvalue = read.table(path_2_eigenvalues, sep = "\t", header = TRUE, row.names = 1)
pca = read_rds(path_2_pca)
# **************** Start of the program **********************
options(scipen = 999) #to avoid display in scientific notation
##6/22/21 Edit: removing principal components that explain
##less than 1% of the variance so that they do not impact the
##regression used to determine meaningful principal components
pca_eigenvalue=pca_eigenvalue[pca_eigenvalue$variance.percent>1,]
# Visualize percent variation and decide which PCs matter
pca_eigenvalue$dimension = 1:nrow(pca_eigenvalue)
pca_variation = round(pca_eigenvalue$variance.percent, 3)
# convert to log10
pca_var_log = log10(pca_variation)
# Create an empty table to save coefficients
number_PC = nrow(pca_eigenvalue)
res = data.frame(model_num = rep(0, number_PC-3),
slope = rep(0,number_PC-3),
intercept = rep(0, number_PC-3),
R_squared = rep(0, number_PC-3),
adj_R_squared = rep(0, number_PC-3))
# Performing linear regression to find meaningful components
# Warning: doing up to (N-2)->N or up to JSON parameter for mac PC number (default = 10)
##6/22/21 Edit: using all principal components to fit regression, as the earlier
##re-assignment of pca_eigenvalue removes any principal components that would
##skew the regression by having a small variance percent
print("*** Performing linear regression of log10 eigenvalues vs. PC number ***")
for (i in 1:(number_PC-3)){
if (i>max_PC_regression){
break
}
#Before 6/22/21, linear = lm(pca_var_log[i:(number_PC-2)] ~ pca_eigenvalue$dimension[i:(number_PC-2)])
linear = lm(pca_var_log[i:(number_PC)] ~ pca_eigenvalue$dimension[i:(number_PC)])
res$model_num[i] = i
res$intercept[i] = coef(linear)[1]
res$slope[i] = coef(linear)[2]
res$R_squared[i] = summary(linear)$r.squared
res$adj_R_squared[i] = summary(linear)$adj.r.squared
}
### Save the regression table
output_pc_eigen = file.path(parent_folder, "results", paste0(experiment, "_regression_pc_eigen.txt"))
write.table(res, file = output_pc_eigen, sep = '\t',col.names=NA,row.names=TRUE,quote=FALSE)
json_copy$path_2_results$pc_vs_eigen = as.character(output_pc_eigen)
# Find the significant PCs according to a max R_squared threshold
# Establish the cutoff line and save that line as last_meaningful:
for (i in 1:((nrow(res))-1)){
if (res$adj_R_squared[i] > max_Rsqured){
break()
}
last_meaningful = i
}
if (!exists("last_meaningful")){
print("Error: The program could not identify any meaningful components")
print("This is due to the adjusted R_squared threshold for the regression between Eigenvalue and PC number")
##6/22/2021 Edit: replacing the stop() command that breaks the script with a reset of the last_meaningful
## variable to automatically keep two principal components as meaningful regardless of the result
## of the regression
print("Resetting to keep the first two principal components as meaningful")
last_meaningful = 2
}
##6/22/2021 Edit: if only 1 principal component is kept as meaningful, increase to two
if (last_meaningful<2){
print("Regression analysis only recognizes one principal component")
print("Resetting to keep the first two principal components as meaningful")
last_meaningful = 2
}
print("*** Adding the meaningful PCs coordinates to a copy of the design file ***")
# add the meaningful components values to the design file:
design_meaningful_PC = design
for (i in 1:last_meaningful){
variable = paste0("PC", i)
design_meaningful_PC = cbind(design_meaningful_PC, variable = pca$x[,i])
# assign the right column name to the column just created:
colnames(design_meaningful_PC)[length(colnames(design_meaningful_PC))] <- variable
}
# Save the copy of the design file that includes all “meaningful” component values as new columns:
output_design_meaningful = file.path(parent_folder, "results", paste0(experiment, "_design_meaningful.txt"))
write.table(design_meaningful_PC, file = output_design_meaningful, sep = '\t',col.names=NA,row.names=TRUE,quote=FALSE)
print("*** Generating plots for the final report ***")
# Plots for the final report:
pca_var_log = as.data.frame(pca_var_log)
pca_var_log$dimensions = 1:nrow(pca_var_log)
figure9 = file.path(parent_folder, "figures", paste0(experiment, "_log10scree_plot.png"))
png(figure9)
ggplot()+
geom_point(aes(x = pca_var_log$dimensions, y = pca_var_log$pca_var_log))+
xlab("PC number")+
ylab("Eigenvalue log10")+
scale_x_continuous(breaks=c(1:nrow(pca_var_log)))+
ggtitle("Log 10 of the Eigenvalues vs PC number")
dev.off()
# create a color palette
colfunc <- colorRampPalette(c("blue",
"violet",
"red",
"orange",
"yellow",
"green",
"brown",
"black"))
figure10 = file.path(parent_folder, "figures", paste0(experiment, "_regression_plot.png"))
png(figure10)
colors = colfunc(nrow(res))
plot(pca_var_log$dimensions, pca_var_log$pca_var_log,
xlab = "PC number", ylab = "Eigenvalue log10",
main = "log10 of eigenvalues with the regression lines")
for (i in 1:(nrow(res))){
abline(a = res$intercept[i], b = res$slope[i], col = colors[i])
text(x=number_PC-2, y=(i * 0.1)+0.6, labels=paste0("model # ", i, "->", "N-3"), col = colors[i])
}
dev.off()
### Generate a loading scores table only for meaningful PCs ##
loadings_meaningful = pca$rotation[,1:last_meaningful]
### Save the loadings for meaningful PC into a file ###
output_loadings_meaningful = file.path(parent_folder, "results", paste0(experiment, "_meaningful_pc_loading_scores.txt"))
write.table(loadings_meaningful, file = output_loadings_meaningful, sep = '\t',col.names=NA,row.names=TRUE,quote=FALSE)
# Updating the json copy
json_copy$path_2_results$design_meaningful = as.character(output_design_meaningful)
json_copy$path_2_results$meaningful_loading_scores = as.character(output_loadings_meaningful)
json_copy$figures$scree_plot_log10 = as.character(figure9)
json_copy$figures$regression_plot = as.character(figure10)
write_json(json_copy, path_2_json_copy, auto_unbox = TRUE)