-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
273 lines (237 loc) · 15.5 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
# This is a sample Python script.
# Press Shift+F10 to execute it or replace it with your code.
# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
# "Write a function to evaluate whether parentheses are matching across a string.
# In other words each open bracket has a matching closing bracket and are correctly ordered.
# The input should be a string and the return should be a boolean - `True` if the brackets are correct `False` if they are not.
# The only characters used will be `'{}[]()'`"
# `samples` and `features` tables contain metadata about the rows and columns of the `data` table.
# Follow the instructions in your preferred programming language to answer the following questions."
# Create a variable named `data` by reading in a dataframe from the file `data.txt`
# which contains the sample (col) by feature counts (rows). The first column named `_id` can be used as index.**"
# Create a variable named `samples` by reading in a dataframe from the file `samples.txt` with the sample metadata.
# Add a checkpoint to verify that the `#SampleID` column has unique values.**"
# Create a variable named `features` by reading in a dataframe from the file `features.txt` with the features metadata.
# The first column named `_id` can be used as index.**"
import argparse
# "Imagine a grocery store with multiple aisles and multiple people waiting in each line.
# You quickly look at the items in each customers basket and know exactly how long it will take them to check out.
# If each customer always chooses the shortest line, calculate how long it will take for all customers to go through checkout.
# For example if you have 3 customers who will take 5, 10, and 3 minutes each to go through checkout and there is only one lane available it will take 18 minutes,
# if there are two aisles available it will take ten minutes"
# Write a method to reverse translate the string “ACCTGGCCGTACCT”.
# Results should be “AGGTACGGCCAGGT”.
# Add a validation step to show that the results is what expected.\n",
from sklearn.preprocessing import LabelEncoder
from associationbetweenfeatureconfidenceandcount.associationbetweenconfidenceandcount import \
findassociationbetweentranscriptconfidenceandtranscriptcount
from associationbetweenlengthanddaysinpool.associationbetweenlengthanddaysinpool import \
findassociationbetweensequencelengthanddaysinpool
from checkfornumberofperfectsquares.countperfectsquares import findperfectsquares
from read_sample_feature_expr_metadata.read_sample_feature_expression_values import \
readmetadataaboutsamplesfeaturesandexpressionvalues
from genesrankedbypathwayrank.generankingbasedonpathwayrank import rankgenesbasedonpathways
from genexpressiondataimputation.imputegeneexpressiondata_using_mean import imputegenexpressiondata_using_mean
from normalizeexpressionvalues.normalizeexpressionmeasures import normalizeexpressionmeasures
from reversecomplementDNA.reversecomplementDNA import reversecomplementDNA, validationofreversecomplement
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
import rpy2
from scipy.stats import beta
from sortedcontainers import SortedList
# Load data and introduce missing values
# Explore the `samples` metadata. Calculate how many sequenced samples are available
# for each subject (\"Subject\" column) across treatment usage (\"Treated_with_drug\" column).
# Hint: You can use a \"groupby\" function or a \"table\" function.**"
# Transform the `data` table to relative abundances - this is done by dividing the counts in each sample and
# each feature by the sum of the counts for all features in each sample. Name this new dataframe `data_ra`.**"
# Determine the top 10 genes with the highest mean relative abundance found across both subjects.
# The Gene label can be found in the `features` dataframe.**"
def findgenesofsignificance(datafilename: str, featuresfilename: str, samplesfilename: str, sequencesfilename: str):
sequence_header_to_fnasequence_association, sampleid_to_length_association_normalizedoverrmaxlength, \
sample_expressedfeature_or_transcript_expressionvalues_matrix_allvals, \
meanexpression_per_sample_for_all_features_or_transcripts_sorted, \
expressedtranscriptspersample_sampletotranscriptassoc_noexprvals, persample_expressed_transcript_to_expressionval_association, \
pertranscript_sampleidentifier_expressionvals_across_samples, \
samplid_to_subject_association, \
subjects_treatedwithdrugs_to_sample_association, samples_with_missing_expression_values, \
sampleid_to_dayssinceexperimentstarted_normalized, sample_treatedwithdrugs_association, features_metadata, \
transcript_or_feature_to_gene_association, transcript_or_feature_to_confidence_normalized_association, \
transcript_or_feature_to_count_normalized_association, gene_to_transcript_or_feature_association, \
pathways_ranked_by_positivecorrelnbetnnummodules_and_numgenes, \
gene_to_pathway_association, pathway_to_gene_association, sample_to_gene_association, \
gene_to_sample_association, geneid_to_all_transcriptexprvals_acrosssamples_association = \
readmetadataaboutsamplesfeaturesandexpressionvalues(
datafilename, featuresfilename, samplesfilename, sequencesfilename)
genesrankedbypathwayexpression = rankgenesbasedonpathways(gene_to_transcript_or_feature_association,
gene_to_pathway_association,
pathways_ranked_by_positivecorrelnbetnnummodules_and_numgenes)
print(gene_to_transcript_or_feature_association.keys())
genesmissingpathwayassociations: list[str] = []
for gene in gene_to_transcript_or_feature_association.keys():
# Multiply by confidence to normalize
# Use counts to normalize
####################################
# Also return gene - transcript - measure
# dict of transcript id and expression values (un-normalized)
transcriptandexprmeasure_unnormalized: dict = geneid_to_all_transcriptexprvals_acrosssamples_association.get(gene)
# return value from function is normalized
geneid_to_all_transcriptexprvals_acrosssamples_association[gene] = \
normalizeexpressionmeasures(gene, transcriptandexprmeasure_unnormalized,
sampleid_to_dayssinceexperimentstarted_normalized,
sampleid_to_length_association_normalizedoverrmaxlength,
genesrankedbypathwayexpression)
# TBD:
# 1. Use numpy and panda to find correlations between
# sequence length and number of expressed transcripts
# number of expressed trancripts / features and days in pool
# ???? anything between sequence length and days in pool (positive correln?)
# transcript length and confidence measure (positive correln expected)
# transcript count????
# correct normaliztion algo based on the correln coefficients
# Sample length from fna file
# check for correlation between sample length and number of expressed transcripts / features
# assuming positive correlation
# normalize over sample length (gene abundance can come from multiple transcripts from different samples)
# Pathway ranking (done)
# build pathway module gene association
# A gene can be implicated in multiple pathways and a pathway should have mutiple modules
# A gene from a well expressed pathway -> HOW to normalize
# to check: functional association and gene abundance expression
# 11/19
# Impute sequence length using correlation between length and num expressed transcripts / features
print("Gene ", gene,
" Sorted dict of transcripts and expr vals ", transcriptandexprmeasure_unnormalized)
print("Gene ", gene, " Relative abundances ", geneid_to_all_transcriptexprvals_acrosssamples_association.get(gene))
# Determine the top 10 genes with the highest mean relative abundance found across both subjects.
# The Gene label can be found in the `features` dataframe.**"
##################################################################################################
##############################################
# gene expression values is 1: many
# transcript to gene sanity check should be 1: 1
# gene to trancript sanit check should be 1: many
#
combined_dict: dict = {}
for gene in gene_to_transcript_or_feature_association.keys():
if gene not in combined_dict.keys():
combined_dict[gene] = {}
combined_dict[gene]["NormalizedExprVals"] = geneid_to_all_transcriptexprvals_acrosssamples_association.get(gene)
combined_dict[gene]["NormalizedPathwayRank"] = genesrankedbypathwayexpression.get(gene)
combined_dict[gene]["NormalizedSampleLength"] = []
combined_dict[gene]["NormalizedDaysInPool"] = []
for s in gene_to_sample_association.get(gene):
combined_dict[gene]["NormalizedSampleLength"].append(
sampleid_to_length_association_normalizedoverrmaxlength.get(s))
combined_dict[gene]["NormalizedDaysInPool"].append(sampleid_to_dayssinceexperimentstarted_normalized.get(s))
if gene_to_transcript_or_feature_association.get(s) is not None:
combined_dict[gene]["NormalizedConfidence"] = []
combined_dict[gene]["NormalizedCount"] = []
combined_dict[gene]["TreatedWithDrugs"] = None
for t in gene_to_transcript_or_feature_association.get(s):
combined_dict[gene]["NormalizedConfidence"].append(
transcript_or_feature_to_confidence_normalized_association.get(t))
combined_dict[gene]["NormalizedCount"].append(
transcript_or_feature_to_count_normalized_association.get(t))
combined_dict[gene]["TreatedWithDrugs"] = sample_treatedwithdrugs_association.get(s)
combined_dict[gene]["NormalizedSampleLength"] = list(set(combined_dict[gene]["NormalizedSampleLength"]))
##############################################
# Dump of combined dict
import json
print(json.dumps(combined_dict, sort_keys=True, indent=2))
return sequence_header_to_fnasequence_association, features_metadata, \
sample_expressedfeature_or_transcript_expressionvalues_matrix_allvals, samples_with_missing_expression_values, \
subjects_treatedwithdrugs_to_sample_association, sampleid_to_length_association_normalizedoverrmaxlength, \
sampleid_to_dayssinceexperimentstarted_normalized, geneid_to_all_transcriptexprvals_acrosssamples_association, \
transcript_or_feature_to_confidence_normalized_association, \
transcript_or_feature_to_count_normalized_association, combined_dict
def print_hi(name):
# Use a breakpoint in the code line below to debug your script.
print(f'Hi, {name}') # Press Ctrl+F8 to toggle the breakpoint.
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
print_hi('PyCharm')
# open sequences.fna and print the first 15 lines
# See PyCharm help at https://www.jetbrains.com/help/pycharm/
counter = 1
parser = argparse.ArgumentParser(description='Process some integers.')
# parser.add_argument('revcomplementstring', metavar='N', type=str, nargs='+',
# help='string to be reverse complement')
# parser.add_argument('--sum', dest='accumulate', action='store_const',
# const=sum, default=max,
# help='sum the integers (default: find the max)')
args = parser.parse_args()
# print(args.accumulate(args.integers))
try:
#with open("data/sequences.fna") as file:
# while line := file.readline().rstrip():
# # print(line)
# counter = counter + 1
# if counter == 15:
# break
#print("Before calling reverse complement ")
#DNAString = "ATGC"
#revcomplement = reversecomplementDNA(DNAString)
#print(validationofreversecomplement(DNAString, revcomplement))
#print("After printing reverse complement validation")
print("Before finding perfect squares")
findperfectsquares(4, 9)
findperfectsquares(100, 1748937)
#findperfectsquares(1341, 74189027341)
datafilename = "data/data.txt"
featuresfilename = "data/features.txt"
samplesfilename = "data/samples.txt"
sequencesfilename = "data/sequences.fna"
print("Before finding genes of significance")
sequencedict, featureinfodict2d, datainfodict2d, missingsamples, subjectstreatedwithdrugs, \
sampleid_to_length_association_normalizedoverrmaxlength, sampleid_to_dayssinceexperimentstarted_normalized, \
geneid_expressionvalues, transcript_or_feature_to_confidence_association, \
transcript_or_feature_to_count_association, combined_dict = findgenesofsignificance(
datafilename, featuresfilename, samplesfilename, sequencesfilename)
# 8 samples have both days in pool and seq length
# for those 8 samples,
findassociationbetweensequencelengthanddaysinpool(sampleid_to_length_association_normalizedoverrmaxlength,
sampleid_to_dayssinceexperimentstarted_normalized) # positive correlation
findassociationbetweentranscriptconfidenceandtranscriptcount(transcript_or_feature_to_confidence_association,
transcript_or_feature_to_count_association)
exit(1)
# convert gene transcript expression vals to panda df
df_genetranscript_exprvals = imputegenexpressiondata_using_mean(geneid_expressionvalues)
np.random.seed(42)
# kf = KFold(n_splits=8)
scores = []
# for train_index, test_index in kf.split(df_genetranscript_exprvals):
# X_train, X_test = df_genetranscript_exprvals[train_index], df_genetranscript_exprvals[test_index]
# y_train, y_test = df_genetranscript_exprvals[train_index], df_genetranscript_exprvals[test_index]
# clf = LinearRegression()
# clf.fit(X_train, y_train)
# y_test_pred = clf.predict(X_test)=-
# scores.append(r2_score(y_test, y_test_pred))
# test_size = 0.3
# X_train, X_test, y_train, y_test = train_test_split(df_genetranscript_exprvals[train_index],
# df_genetranscript_exprvals[test_index])
# print(" training data ", X_train)
# print(" y axis training data ", y_train)
# lr = LogisticRegression()
# lr.fit(X_train, y_train)
# print("After lr.fit")
# pred = lr.predict(X_test)
# print(metrics.accuracy_score(pred, y_test))
print("Mice forest algorithm scores ", scores)
# le = LabelEncoder()
import rpy2.robjects as robjects
r = robjects.r
m = r.matrix(r.rnorm(100), ncol=5)
pca = r.princomp(m)
r.plot(pca, main="Eigen values")
r.biplot(pca, main="biplot")
exit(1)
finally:
print("Exit ")
# print(list(checkforvalidparentheses("{([((()])))[][[()]]}")))
#print("After printing the first 15 lines")