-
Notifications
You must be signed in to change notification settings - Fork 0
/
3_gene_selection.R
executable file
·103 lines (86 loc) · 5.69 KB
/
3_gene_selection.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
library("Boruta")
library(randomForest)
set.seed(17022019)
mothertable = read.csv("./data/input/curated_mothertable20200515.lfs.txt.gz", sep = "\t")
# Remove any genes are not differentially expressed
mothertable = mothertable[mothertable$differentialinatleastoneconstrast == "TRUE", ]
row.names(mothertable) = mothertable$target_id
# We're only using the TPMs
mothertable = mothertable[43:81]
colnames(mothertable) <- sub("_tpm$", "", colnames(mothertable))
# Transpose it so that we have "HarvestX_repY" --> "Gene 1", "Gene 2", ...
mothertable.transposed = as.data.frame(t(mothertable))
mothertable.transposed$Mothertable.ID = row.names(mothertable.transposed)
# Load the traits we want to look at
important.traits = read.csv("data/results/trait_selection_result_roughFixed.csv")
important.traits = sub("\\_\\d+$", "", important.traits[important.traits$decision == "Confirmed", ]$trait)
important.traits = unique(important.traits)
# This tells us which plants belong to which harvest and rep
mothertable.mapping = read.csv("data/input/MothertableToPlantID_mapping.csv")
# Load Phenotpying data
pheno = read.csv("data/intermediary/1745AJ_Phenotyping_nafixed.csv")
# We only need the important traits, plant name, treatment and DAS
pheno = pheno[c(names(pheno)[c(1, 2, 3)], important.traits)]
pheno$Day = paste("d", pheno$DAS, sep = "")
# Merge by Plant.ID and Day (and Treatment, but that doesn't matter)
# We lose a number of samples here because we only have comparable phenotyping
# data between DAS 20 and 35. Even some samples at DAS 20 are lost because
# they were apparently not phenotyped with the new configuration before sampling.
pheno.merged = merge(mothertable.mapping, pheno)
options("expressions" = 50000)
# Runs Boruta of specified trait ~ all genes and returns only the confirmed genes
runBoruta <- function(trait, dataframe) {
# For speed/performance optimization, we're creating a y vector (trait) and an x dataframe containing all the
# possible predictors (dataframe-trait)
y = dataframe[, trait]
x = dataframe[, !names(dataframe) %in% c(trait)]
boruta_result = Boruta(x, y, pValue = 0.01, maxRuns = 800, doTrace = 2, ntree = 1000)
trait_importance = attStats(boruta_result)
trait_importance = cbind(gene = rownames(trait_importance), trait_importance)
trait_importance_roughfixed = attStats(TentativeRoughFix(boruta_result))
trait_importance_roughfixed = cbind(gene = rownames(trait_importance_roughfixed), trait_importance_roughfixed)
return(trait_importance_roughfixed[trait_importance_roughfixed$decision == "Confirmed", ])
}
important.genes = data.frame(trait = character(), gene = character(), medianImp = double())
for (trait in important.traits) {
print(paste("Analyzing trait", trait))
# Samples were taken from two plants so we need to average the phenotypic data
plant.means = aggregate(as.formula(paste(trait, "~ Mothertable.ID")), data = pheno.merged, FUN = mean)
# Join Trait and transposed mothertable
mothertable.forBoruta = merge(plant.means, mothertable.transposed)[, -1] # remove Mothertable.ID so it's all numbers
# Make one Boruta run and another one and keep only the genes that were confirmed in both runs.
# Keep going making more runs until no more genes are being thrown out.
confirmed.genes.allRuns = runBoruta(trait, mothertable.forBoruta)
# Throw out all genes from Boruta input that have been rejected
mothertable.forBoruta = mothertable.forBoruta[c(trait, as.character(confirmed.genes.allRuns$gene))]
confirmed.genes.currentRun = runBoruta(trait, mothertable.forBoruta)
# While there are genes in the allRuns dataframe that are not confirmed in the current run, do another run!
while(length(setdiff(unique(as.character(confirmed.genes.allRuns$gene)), unique(as.character(confirmed.genes.currentRun$gene)))) > 0) {
genes.intersect = intersect(unique(as.character(confirmed.genes.allRuns$gene)), unique(as.character(confirmed.genes.currentRun$gene)))
# Remove genes from both datasets that were not present in both
confirmed.genes.currentRun = confirmed.genes.currentRun[confirmed.genes.currentRun$gene %in% genes.intersect, ]
confirmed.genes.allRuns = confirmed.genes.allRuns[confirmed.genes.allRuns$gene %in% genes.intersect, ]
confirmed.genes.allRuns = rbind(confirmed.genes.allRuns, confirmed.genes.currentRun)
# Do another run!
print(paste(" Doing another run now (", length(genes.intersect), " genes left)", sep = ""))
# Throw out all genes from Boruta input that have been rejected again
mothertable.forBoruta = mothertable.forBoruta[c(trait, as.character(confirmed.genes.allRuns$gene))]
confirmed.genes.currentRun = runBoruta(trait, mothertable.forBoruta)
}
selected_genes = data.frame(aggregate(medianImp ~ gene, data = confirmed.genes.allRuns, FUN = median))
selected_genes$trait = trait
important.genes = rbind(important.genes, selected_genes)
# Do Leave-one-out cross-validation
mothertable.forBoruta = mothertable.forBoruta[c(trait, as.character(selected_genes$gene))]
write("i,Value,Prediction",file=paste0("data/results/cv/loocv_genes_",trait,".csv"))
for (i in 1:nrow(mothertable.forBoruta)) {
data.train = mothertable.forBoruta[-i, ]
data.test = mothertable.forBoruta[i, ]
data.train.x = subset(data.train, select = selected_genes$gene)
data.train.y = data.train[, trait]
model = randomForest(data.train.x, data.train.y, ntree = 1000)
write(paste(i, data.test[, trait], predict(model, newdata = subset(data.test, select = selected_genes$gene)), sep=","),file=paste0("data/results/cv/loocv_genes_",trait,".csv"),append = T)
}
}
important.genes = important.genes[order(important.genes$trait, important.genes$gene), ]
write.csv(important.genes, "data/results/gene_selection_result_roughFixed.csv", row.names = FALSE)