From eeaa13503443f4afed163c1fa74b951353b31a3b Mon Sep 17 00:00:00 2001 From: Animesh Sharma Date: Thu, 16 Nov 2023 15:36:23 +0100 Subject: [PATCH] merge median intensities and write complete-cases table, t-test tech reps --- diffExprTestRank.r | 46 +++++++++++++++----- diffExprTestT.r | 106 ++++++++++++++++++++++++++++----------------- 2 files changed, 102 insertions(+), 50 deletions(-) diff --git a/diffExprTestRank.r b/diffExprTestRank.r index 01d887c..0fd1dc2 100644 --- a/diffExprTestRank.r +++ b/diffExprTestRank.r @@ -1,4 +1,4 @@ -#F:\R-4.3.1\bin\Rscript.exe diffExprTestRank.r "L:\promec\TIMSTOF\LARS\2023\231027_plasma_beads\combined\txt\proteinGroups.txt" L:\promec\TIMSTOF\LARS\2023\231027_plasma_beads\combined\txt\Groups.txt "Intensity." "Bead" "Rem" +#F:\R-4.3.1\bin\Rscript.exe diffExprTestRank.r "L:\promec\TIMSTOF\LARS\2023\231025_Kamila_zub\combined\txt\proteinGroups.txt" "L:\promec\TIMSTOF\LARS\2023\231025_Kamila_zub\combined\txt\Groups.txt" "Group" "Rem" #setup#### #install.packages(c("readxl","writexl","svglite","ggplot2","BiocManager"),repos="http://cran.us.r-project.org",lib=.libPaths()) #BiocManager::install(c("limma","pheatmap"),repos="http://cran.us.r-project.org",lib=.libPaths()) @@ -8,21 +8,20 @@ print("USAGE:Rscript diffExprTestRank.r selThrFC) @@ -176,7 +176,7 @@ testWilcox <- function(log2LFQmm,sel1,sel2,fName){ #install.packages("svglite") ggplot2::ggsave(paste0(inpF,fName,selection,scale,lGroup,sCol,sel1,mCol,paste(sel2,collapse=""),eCol,selThr,selThrFC,cvThr,rGroup,lName,"WilcoxTestBH.svg"), p) print(p) - return(sum(Significance)) + return(wilcoxTest.results.ret) } } #log2selection#### @@ -227,8 +227,32 @@ rownames(log2LFQselScaleimpCorr)<-colnames(log2LFQselMM) svgPHC<-pheatmap::pheatmap(log2LFQselScaleimpCorr,annotation_row = annoR,annotation_col = annoR,clustering_distance_rows = "euclidean",clustering_distance_cols = "euclidean",fontsize_row=4,cluster_cols=T,cluster_rows=T,fontsize_col=4) ggplot2::ggsave(paste0(inpF,lName,lGroup,selection,scale,"heatmap.minMax.pearson.svg"), svgPHC) #compare#### +colnames(log2LFQ) +log2LFQsel=log2LFQ[,gsub("-",".",rownames(label[is.na(label$removed)|label$removed==" "|label$removed=='',]))] +colnames(log2LFQsel) +dim(log2LFQsel) +label=label[is.na(label$removed)|label$removed==" "|label$removed=='',] +table(label$pair2test) for(i in (names(table(label[,lGroup])))){ print(i) print(label[label[,lGroup]==i,]) - print(testWilcox(log2LFQselMM,i,setdiff(names(table(label[,lGroup])),i),"minMaxLog2LFQ")) + assign(i,testWilcox(log2LFQsel,i,setdiff(names(table(label[,lGroup])),i),"log2Int")) } +#merge#### +dataFC<-data.frame(rowSums(log2LFQsel,na.rm=T)) +colnames(dataFC)<-"sum" +dataFC["RowGeneUniProtScorePeps"]<-row.names(data) +dim(dataFC) +summary(dataFC) +for (obj in (names(table(label[,lGroup])))) { + print(obj) + dataT<-get(obj) + dataC<-data.frame(cbind(dataT$RowGeneUniProtScorePeps,dataT$logFCmedianGrp1)) + colnames(dataC)<-c("RowGeneUniProtScorePeps",paste0(obj,"log2MedInt")) + dataFC<-merge(dataFC,dataC,by='RowGeneUniProtScorePeps',all=T) +} +writexl::write_xlsx(dataFC,paste0(inpF,fName,selection,selThr,selThrFC,cvThr,rGroup,lName,"merge.xlsx")) +#completeCases#### +dataFCnar<-dataFC[complete.cases(dataFC), ] +dataFCnar<-dataFCnar[order(dataFCnar$sum,decreasing=F),] +writexl::write_xlsx(dataFCnar,paste0(inpF,fName,selection,selThr,selThrFC,cvThr,rGroup,lName,"completeCase.xlsx")) diff --git a/diffExprTestT.r b/diffExprTestT.r index c2d2fbf..f2be728 100644 --- a/diffExprTestT.r +++ b/diffExprTestT.r @@ -1,4 +1,4 @@ -#F:\R-4.3.1\bin\Rscript.exe diffExprTestT.r "F:\OneDrive - NTNU\IRD\proteinGroups.txt" "f:\OneDrive - NTNU\IRD\Groups.txt" Group Rem +#F:\R-4.3.1\bin\Rscript.exe diffExprTestT.r "L:\promec\TIMSTOF\LARS\2023\231025_Kamila_zub\combined\txt\proteinGroups.txt" "L:\promec\TIMSTOF\LARS\2023\231025_Kamila_zub\combined\txt\Groups.txt" Group Rem #setup #install.packages(c("readxl","writexl","svglite","ggplot2","BiocManager"),repos="http://cran.us.r-project.org",lib=.libPaths()) #BiocManager::install(c("limma","pheatmap"),repos="http://cran.us.r-project.org",lib=.libPaths()) @@ -13,17 +13,17 @@ if (length(args) != 4) {stop("\n\nNeeds FOUR arguments, the full path of the dir c:/Users/animeshs/R-4.2.1-win/bin/Rscript.exe diffExprTestT.r \"L:/promec/TIMSTOF/Data/combined/txt/proteinGroups.txt\" \"L:/promec/TIMSTOF/Data/combined/txt/Groups.txt\" Groups Remove ", call.=FALSE)} inpF <- args[1] -#inpF <-"L:/promec/HF/Lars/2023/231006 IRD DDA Kristine Solveig/txt/matchedFeatures.txt" +#inpF <-"L:/promec/TIMSTOF/LARS/2023/231025_Kamila_zub/combined/txt/proteinGroups.txt" inpL <- args[2] -#inpL <-"L:/promec/HF/Lars/2023/231006 IRD DDA Kristine Solveig/txt/Groups.txt" +#inpL <-"L:/promec/TIMSTOF/LARS/2023/231025_Kamila_zub/combined/txt/Groups.txt" lGroup <- args[3] #lGroup<-"Group" rGroup <- args[4] -#rGroup<-"Remove" +#rGroup<-"Rem" inpD<-dirname(inpF) fName<-basename(inpF) lName<-basename(inpL) -selection<-"Intensity." +selection<-"LFQ.intensity." thr=0.0#count selThr=0.05#pValue-tTest selThrFC=0.5#log2-MedianDifference @@ -37,23 +37,48 @@ data <- read.table(inpF,stringsAsFactors = FALSE, header = TRUE, quote = "", com #data = data[!data$Reverse=="+",] #data = data[!data$Potential.contaminant=="+",] #data = data[!data$Only.identified.by.site=="+",] -#row.names(data)<-paste(row.names(data),data[,c(1:(grep(selection,colnames(data))[1]-1))],sep=";;") +row.names(data)<-paste(row.names(data),data$Fasta.headers,data$Protein.IDs,data$Protein.names,data$Gene.names,data$Score,data$Peptide.counts..unique.,sep=";;") summary(data) dim(data) +log2Int<-as.matrix(log2(data[,grep("Intensity",colnames(data))])) +log2Int[log2Int==-Inf]=NA +hist(log2Int,main=paste("Mean:",mean(log2Int,na.rm=T),"SD:",sd(log2Int,na.rm=T)),breaks=round(max(log2Int,na.rm=T)),xlim=range(min(log2Int,na.rm=T),max(log2Int,na.rm=T))) +summary(log2(data[,grep("Intensity",colnames(data))])) +par(mar=c(12,3,1,1)) +boxplot(log2Int,las=2) +#corHCint#### +scale=3 +log2Intimp<-matrix(rnorm(dim(log2Int)[1]*dim(log2Int)[2],mean=mean(log2Int,na.rm = T)-scale,sd=sd(log2Int,na.rm = T)/(scale)), dim(log2Int)[1],dim(log2Int)[2]) +log2Intimp[log2Intimp<0]<-0 +boxplot(log2Intimp,las=2) +bk1 <- c(seq(-3,-0.01,by=0.01)) +bk2 <- c(seq(0.01,3,by=0.01)) +bk <- c(bk1,bk2) #combine the break limits for purpose of graphing +my_palette <- c(colorRampPalette(colors = c("darkblue", "white"))(n = length(bk1)-1),"gray", "gray",c(colorRampPalette(colors = c("white","darkred"))(n = length(bk2)-1))) +colnames(log2Intimp)<-colnames(log2Int) +log2IntimpCorr<-cor(log2Int,use="pairwise.complete.obs",method="spearman") +colnames(log2IntimpCorr)<-colnames(log2Int) +rownames(log2IntimpCorr)<-colnames(log2Int) +svgPHC<-pheatmap::pheatmap(log2IntimpCorr,clustering_distance_rows = "euclidean",clustering_distance_cols = "euclidean",fontsize_row=8,cluster_cols=T,cluster_rows=T,fontsize_col = 8) +#maxLFQ#### LFQ<-as.matrix(data[,grep(selection,colnames(data))]) +#protNum<-1:ncol(LFQ) +#protNum<-"LFQ intensity"#1:ncol(LFQ) +#colnames(LFQ)=paste(protNum,sub(selection,"",colnames(LFQ)),sep=";") colnames(LFQ)=sub(selection,"",colnames(LFQ)) dim(LFQ) log2LFQ<-log2(LFQ) log2LFQ[log2LFQ==-Inf]=NA log2LFQ[log2LFQ==0]=NA summary(log2LFQ) -hist(log2LFQ,main=paste("Mean:",mean(log2LFQ,na.rm=T),"SD:",sd(log2LFQ,na.rm=T)),breaks=round(max(log2LFQ,na.rm=T)),xlim=range(min(log2LFQ,na.rm=T),max(log2LFQ,na.rm=T))) -par(mar=c(12,3,1,1)) +hist(log2Int,main=paste("Mean:",mean(log2Int,na.rm=T),"SD:",sd(log2Int,na.rm=T)),breaks=round(max(log2Int,na.rm=T)),xlim=range(min(log2Int,na.rm=T),max(log2Int,na.rm=T))) +hist(log2LFQ,main=paste("Mean:",mean(log2LFQ,na.rm=T),"SD:",sd(log2LFQ,na.rm=T)),breaks=round(max(log2Int,na.rm=T)),xlim=range(min(log2Int,na.rm=T),max(log2Int,na.rm=T))) +boxplot(log2Int,las=2) boxplot(log2LFQ,las=2) -rowName<-paste(data$Mass,data$Calibrated.retention.time,data$m.z,data$Charge,data$Intensity,data$Sequence,sep=";;") -#writexl::write_xlsx(as.data.frame(cbind(rowName,log2LFQ,rownames(data))),paste0(inpD,"log2LFQ.xlsx")) -data$geneName<-paste(sapply(strsplit(paste(sapply(strsplit(data$Raw.files, ";",fixed=T), "[", 1)), " "), "[", 1)) -data$uniprotID<-paste(sapply(strsplit(paste(sapply(strsplit(data$Multiplet.ids, ";",fixed=T), "[", 1)), "-"), "[", 1)) +rowName<-paste(sapply(strsplit(paste(sapply(strsplit(data$Fasta.headers, "|",fixed=T), "[", 2)), "-"), "[", 1)) +writexl::write_xlsx(as.data.frame(cbind(rowName,log2LFQ,rownames(data))),paste0(inpD,"log2LFQ.xlsx")) +data$geneName<-paste(sapply(strsplit(paste(sapply(strsplit(data$Gene.names, ";",fixed=T), "[", 1)), " "), "[", 1)) +data$uniprotID<-paste(sapply(strsplit(paste(sapply(strsplit(data$Protein.IDs, ";",fixed=T), "[", 1)), "-"), "[", 1)) data[data$geneName=="NA","geneName"]=data[data$geneName=="NA","uniprotID"] #label#### label<-read.table(inpL,header=T,sep="\t",row.names=1)#, colClasses=c(rep("factor",3))) @@ -73,30 +98,36 @@ row.names(annoR)<-gsub("\\-","\\.",rownames(label[is.na(label$removed)|label$rem names(annoR)<-lGroup summary(annoR) #corHClfq#### +log2LFQimp<-matrix(rnorm(dim(log2LFQ)[1]*dim(log2LFQ)[2],mean=mean(log2LFQ,na.rm = T)-scale,sd=sd(log2LFQ,na.rm = T)/(scale)), dim(log2LFQ)[1],dim(log2LFQ)[2]) +log2LFQimp[log2LFQimp<0]<-0 +par(mar=c(12,3,1,1)) +boxplot(log2LFQimp,las=2) +colnames(log2LFQimp)<-colnames(log2LFQ) log2LFQimpCorr<-cor(log2LFQ,use="pairwise.complete.obs",method="pearson") colnames(log2LFQimpCorr)<-colnames(log2LFQ) rownames(log2LFQimpCorr)<-colnames(log2LFQ) svgPHC<-pheatmap::pheatmap(log2LFQimpCorr,clustering_distance_rows = "euclidean",clustering_distance_cols = "euclidean",fontsize_row=8,cluster_cols=T,cluster_rows=T,fontsize_col = 8,annotation_row = annoR,annotation_col = annoR) #test#### -testT <- function(log2LFQ,sel1,sel2,cvThr){ - #sel1<-"D11N" - #sel2<-"D20N" - #log2LFQ<-log2LFQsel#[,gsub("-",".",rownames(label[label$Remove!="Y",]))] - #log2LFQ<-sapply(log2LFQ, as.numeric) - #colnames(log2LFQ) - d1<-as.matrix(log2LFQ[,gsub("-",".",rownames(label[label$pair2test==sel1,]))]) +testT <- function(log2LFQselT,sel1,sel2,cvThr){ + #sel1<-"S2" + #sel2<-"S4" + #log2LFQselT<-log2LFQ#[,gsub("-",".",rownames(label[label$Remove!="Y",]))] + #colnames(log2LFQselT) + d1<-as.matrix(log2LFQselT[,gsub("-",".",rownames(label[label$pair2test==sel1,]))]) colnames(d1)<-gsub("-",".",rownames(label[label$pair2test==sel1,])) - d2<-as.matrix(log2LFQ[,gsub("-",".",rownames(label[label$pair2test==sel2,]))]) + d2<-as.matrix(log2LFQselT[,gsub("-",".",rownames(label[label$pair2test==sel2,]))]) colnames(d2)<-gsub("-",".",rownames(label[label$pair2test==sel2,])) dataSellog2grpTtest<-as.matrix(cbind(d1,d2)) + dataSellog2grpTtest[dataSellog2grpTtest==0]=NA + #summary(dataSellog2grpTtest) + hist(dataSellog2grpTtest,breaks=round(max(dataSellog2grpTtest,na.rm=T))) + #dataSellog2grpTtest<-as.numeric(dataSellog2grpTtest) + row.names(dataSellog2grpTtest)<-row.names(data) if(sum(!is.na(d1))>1&sum(!is.na(d2))>1){ hist(d1,breaks=round(max(dataSellog2grpTtest,na.rm=T))) hist(d2,breaks=round(max(dataSellog2grpTtest,na.rm=T))) #assign(paste0("hda",sel1,sel2),dataSellog2grpTtest) #get(paste0("hda",sel1,sel2)) - dataSellog2grpTtest[dataSellog2grpTtest==0]=NA - hist(dataSellog2grpTtest,breaks=round(max(dataSellog2grpTtest,na.rm=T))) - row.names(dataSellog2grpTtest)<-row.names(data) comp<-paste0(sel1,sel2) sCol<-1 eCol<-ncol(dataSellog2grpTtest) @@ -106,17 +137,14 @@ testT <- function(log2LFQ,sel1,sel2,cvThr){ pValNA = apply( dataSellog2grpTtest, 1, function(x) if(sum(!is.na(x[c(sCol:mCol)]))<2&sum(!is.na(x[c((mCol+1):eCol)]))<2){NA} - else if((sum(x[c(sCol:mCol)],na.rm=T)-sum(x[c((mCol+1):eCol)],na.rm=T))==0){NA} - else if(!is.na(sd(x[c(sCol:mCol)],na.rm=T))&sd(x[c(sCol:mCol)],na.rm=T)==0){NA} - else if(!is.na(sd(x[c((mCol+1):eCol)],na.rm=T))&sd(x[c((mCol+1):eCol)],na.rm=T)==0){NA} - else if(sum(is.na(x[c(sCol:mCol)]))==0&sum(is.na(x[c((mCol+1):eCol)]))==0){ - t.test(as.numeric(x[c(sCol:mCol)]),as.numeric(x[c((mCol+1):eCol)]),var.equal=T)$p.value} + else if(sum(!is.na(x[c(sCol:mCol)]))>=1&sum(!is.na(x[c((mCol+1):eCol)]))==0){0} + else if(sum(!is.na(x[c(sCol:mCol)]))==0&sum(!is.na(x[c((mCol+1):eCol)]))>=1){0} + else if(sum(!is.na(x[c(sCol:mCol)]))==1&sum(!is.na(x[c((mCol+1):eCol)]))==1){1} + else if(sum(is.na(x[c(sCol:mCol)]))==0&sum(is.na(x[c((mCol+1):eCol)]))==0){t.test(as.numeric(x[c(sCol:mCol)]),as.numeric(x[c((mCol+1):eCol)]),var.equal=T)$p.value} else if(sum(!is.na(x[c(sCol:mCol)]))>1&sum(!is.na(x[c((mCol+1):eCol)]))<1&(sd(x[c(sCol:mCol)],na.rm=T)/mean(x[c(sCol:mCol)],na.rm=T))1&(sd(x[c((mCol+1):eCol)],na.rm=T)/mean(x[c((mCol+1):eCol)],na.rm=T))=2&sum(!is.na(x[c((mCol+1):eCol)]))>=1){ - t.test(as.numeric(x[c(sCol:mCol)]),as.numeric(x[c((mCol+1):eCol)]),na.rm=T,var.equal=T)$p.value} - else if(sum(!is.na(x[c(sCol:mCol)]))>=1&sum(!is.na(x[c((mCol+1):eCol)]))>=2){ - t.test(as.numeric(x[c(sCol:mCol)]),as.numeric(x[c((mCol+1):eCol)]),na.rm=T,var.equal=T)$p.value} + #else if(sum(!is.na(x[c(sCol:mCol)]))>=2&sum(!is.na(x[c((mCol+1):eCol)]))>=1){t.test(as.numeric(x[c(sCol:mCol)]),as.numeric(x[c((mCol+1):eCol)]),na.rm=T,var.equal=T)$p.value} + #else if(sum(!is.na(x[c(sCol:mCol)]))>=1&sum(!is.na(x[c((mCol+1):eCol)]))>=2){t.test(as.numeric(x[c(sCol:mCol)]),as.numeric(x[c((mCol+1):eCol)]),na.rm=T,var.equal=T)$p.value} else{NA} ) summary(warnings()) @@ -143,7 +171,7 @@ testT <- function(log2LFQ,sel1,sel2,cvThr){ logFCmedianGrp1[is.na(logFCmedianGrp1)]=0 logFCmedianGrp2[is.na(logFCmedianGrp2)]=0 hda<-cbind(logFCmedianGrp1,logFCmedianGrp2) - #plot(hda) + plot(hda) limma::vennDiagram(hda>0) logFCmedian = logFCmedianGrp1-logFCmedianGrp2 logFCmedianFC = 2^(logFCmedian+.Machine$double.xmin) @@ -151,16 +179,16 @@ testT <- function(log2LFQ,sel1,sel2,cvThr){ hist(logFCmedianFC) log2FCmedianFC=log2(logFCmedianFC) hist(log2FCmedianFC) - ttest.results = data.frame(Uniprot=rowName,Gene=data$geneName,Protein=data$Sequence,logFCmedianGrp1,logFCmedianGrp2,PValueMinusLog10=pValNAminusLog10,FoldChanglog2median=logFCmedianFC,CorrectedPValueBH=pValBHna,TtestPval=pValNA,dataSellog2grpTtest,Log2MedianChange=logFCmedian,grp1CV,grp2CV,RowGeneUniProtScorePeps=rownames(dataSellog2grpTtest)) + ttest.results = data.frame(Uniprot=rowName,Gene=data$Gene.names,Protein=data$Protein.names,logFCmedianGrp1,logFCmedianGrp2,PValueMinusLog10=pValNAminusLog10,FoldChanglog2median=logFCmedianFC,CorrectedPValueBH=pValBHna,TtestPval=pValNA,dataSellog2grpTtest,Log2MedianChange=logFCmedian,grp1CV,grp2CV,RowGeneUniProtScorePeps=rownames(dataSellog2grpTtest)) + writexl::write_xlsx(ttest.results,paste0(inpF,selection,sCol,eCol,comp,selThr,selThrFC,cvThr,lGroup,rGroup,lName,"tTestBH.xlsx")) + write.csv(ttest.results,paste0(inpF,selection,sCol,eCol,comp,selThr,selThrFC,cvThr,lGroup,rGroup,lName,"tTestBH.csv"),row.names = F) + ttest.results.return<-ttest.results + #volcano + ttest.results$RowGeneUniProtScorePeps<-data$geneName ttest.results[is.na(ttest.results)]=selThr Significance=ttest.results$CorrectedPValueBH0&abs(ttest.results$Log2MedianChange)>selThrFC sum(Significance) dsub <- subset(ttest.results,Significance) - writexl::write_xlsx(dsub,paste0(inpF,selection,sCol,eCol,comp,selThr,selThrFC,cvThr,lGroup,rGroup,lName,"tTestBH.xlsx")) - write.csv(dsub,paste0(inpF,selection,sCol,eCol,comp,selThr,selThrFC,cvThr,lGroup,rGroup,lName,"tTestBH.csv"),row.names = F) - ttest.results.return<-ttest.results - #volcano - ttest.results$RowGeneUniProtScorePeps<-data$geneName p <- ggplot2::ggplot(ttest.results,ggplot2::aes(Log2MedianChange,PValueMinusLog10))+ ggplot2::geom_point(ggplot2::aes(color=Significance)) p<-p + ggplot2::theme_bw(base_size=8) + ggplot2::geom_text(data=dsub,ggplot2::aes(label=RowGeneUniProtScorePeps),hjust=0, vjust=0,size=1,position=ggplot2::position_jitter(width=0.5,height=0.1)) + ggplot2::scale_fill_gradient(low="white", high="darkblue") + ggplot2::xlab("Log2 Median Change") + ggplot2::ylab("-Log10 P-value") #f=paste(file,proc.time()[3],".jpg")