forked from heyuan7676/COVID-19
-
Notifications
You must be signed in to change notification settings - Fork 0
/
LR_confounders_allTissues.R
82 lines (64 loc) · 3.42 KB
/
LR_confounders_allTissues.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
source("utils.R")
### Didn't work out!
dir.create(file.path(datadir, "SVs/"), showWarnings = FALSE)
samples_used_in_GTEx <- function(){
samples_used = c()
for (tissue in sort(unique(samples$SMTSD))){
sample_in_the_tissue = samples %>% filter(SMTSD == tissue)
sample_in_the_tissue = as.data.frame(sample_in_the_tissue)
# restrict to the samples used in GTEx
tissue_name = gsub(" ", "_", gsub('\\)', '', gsub(' \\(', '_', gsub(' - ', '_', tissue))))
genotype_PCs = tryCatch(read.table(paste0(datadir, 'GTEx_Analysis_v8_eQTL_covariates/',tissue_name,'.v8.covariates.txt'),
sep='\t', header = T, stringsAsFactors = F, row.names = 1),
warning = function (w) {print(paste("No data available for tissue type", tissue_name))}, error = function(f) {return("failed")})
if(inherits(genotype_PCs, "character")){
print(paste(" ", "Skipping tissue", tissue_name))
next
}
sample_in_the_tissue = sample_in_the_tissue %>% filter(SUBJID %in% gsub("\\.", "-", colnames(genotype_PCs)))
samples_used = c(samples_used, sample_in_the_tissue$SAMPID)
}
return(samples_used)
}
readin_tpm <- function(){
tpm.whole = fread(paste0(datadir, 'gene_pc_lc_tpm.txt'))
all.genes = as.character(tpm.whole %>% pull(Name))
all.genes = sapply(all.genes, function(x) strsplit(x, '\\.')[[1]][1])
## subset columns
samples.used = samples_used_in_GTEx()
samples = samples %>% filter(SAMPID %in% samples.used)
tpm.exp = t(tpm.whole[, ..samples.used])
## subset rows
colnames(tpm.exp) = all.genes
idxxx = apply(tpm.exp, 2, function(x) sum(x>0.1) > 0.2 * nrow(tpm.exp))
print(paste0('Remove ', ncol(tpm.exp) - sum(idxxx), ' genes with low expression'))
tpm.exp = tpm.exp[, idxxx]
## log transformation
tpm.exp = log10(tpm.exp + 1)
return(tpm.exp)
}
#This outputs a table listing each "Tissue", "Gene", "Variable", "Median_TPM","coefficient", "pvalue", FDR
corrected_TestGene <- function(Test_gene, Test_gene_name){
Test_gene_tpm = data.frame("SAMPID" = rownames(tpm.exp.matrix), "geneExp" = tpm.exp.matrix[, Test_gene])
Test_gene_tpm_cov = merge(Test_gene_tpm, samples, by = 'SAMPID')
### remove missing data
cols = c("SAMPID", "AGE", "SEX", "DTHHRDY", "SMRIN", "SMTSISCH", "SMEXNCRT", "COHORT", "RACE", "SMTSD", "geneExp")
Test_gene_tpm_cov = Test_gene_tpm_cov[, cols]
Test_gene_tpm_cov.complete = Test_gene_tpm_cov[complete.cases(Test_gene_tpm_cov), ]
#print(paste0("Removed ", nrow(Test_gene_tpm_cov)-nrow(Test_gene_tpm_cov.complete), " data points with missing data"))
#print(paste0("Test with ", nrow(Test_gene_tpm_cov.complete), " data points"))
### fit on geneEXP
#model = lm(geneExp~AGE_GROUP+SEX+factor(DTHHRDY)+SMRIN+SMTSISCH+SMEXNCRT,
# data = Test_gene_tpm_cov.complete)
model = lm(geneExp~AGE+SEX+factor(DTHHRDY)+SMRIN+SMTSISCH+SMEXNCRT+factor(COHORT)+factor(RACE),
data = Test_gene_tpm_cov.complete)
Test_gene_tpm_cov.complete$corrected_geneEXP = residuals(model)
write.table(Test_gene_tpm_cov.complete, paste0(datadir, 'corrected_gene_exp/', Test_gene_name, '.csv'), sep=',', row.names = F, quote = F)
}
datadir_PPI = './PPI_data/'
dir.create(file.path(outdir, 'PPI_results'), showWarnings = FALSE)
dir.create(file.path(datadir, 'corrected_gene_exp'), showWarnings = FALSE)
## read in gene TPM
tpm.exp.matrix = readin_tpm()
corrected_TestGene('ENSG00000130234', 'ACE2')
corrected_TestGene('ENSG00000184012', 'TMPRSS2')