Merge pull request #225 from PNNL-CompBio/dev

removed dependency on cptac package at runtime
PNNL-CompBio · Sep 20, 2023 · 560577e · 560577e
2 parents 6fd97ef + 0c59640
commit 560577e
Show file tree

Hide file tree

Showing 5 changed files with 21 additions and 87 deletions.
diff --git a/mRNAData/getAllDatasets.py b/mRNAData/getAllDatasets.py
@@ -4,7 +4,8 @@
 '''
 
 import cptac
-
+import os
+os.mkdir('/data/')
 
 def getCancerObj(cancertype):
    # cptac.download(dataset=cancertype,source='harmonized',)
@@ -47,4 +48,8 @@ def getCancerObj(cancertype):
     dat.get_clinical(cs)
     tsource = dat_list['transcriptomics']
     res = dat.get_transcriptomics(tsource[0])
+    if res.columns.nlevels == 2:
+        res.columns = res.columns.droplevel(1)
+
     print(ds+':',res.shape)
+    res.to_csv('/data/'+ds+'.csv')
diff --git a/mRNAData/mRNADataSetsCLI.py b/mRNAData/mRNADataSetsCLI.py
@@ -3,7 +3,8 @@
 Basic CLI to import CPTAC proteomic data
 '''
 import argparse
-import cptac
+#import cptac
+import pandas
 
 
 def main():
@@ -15,42 +16,8 @@ def main():
                         to be collected')
     opts = parser.parse_args()
 
-    if opts.type.lower() == 'brca':
-        dat = cptac.Brca()
-    elif opts.type.lower() == 'ccrcc':
-        dat = cptac.Ccrcc()
-    elif opts.type.lower() == 'coad':
-        dat = cptac.Coad()
-    elif opts.type.lower() == 'ucec':
-        dat = cptac.Ucec()
-    elif opts.type.lower() == 'gbm':
-        dat = cptac.Gbm()
-    elif opts.type.lower() == 'hnscc':
-        dat = cptac.Hnscc()
-    elif opts.type.lower() == 'lscc':
-        dat = cptac.Lscc()
-    elif opts.type.lower() == 'luad':
-        dat = cptac.Luad()
-    elif opts.type.lower() == 'ovarian':
-        dat = cptac.Ov()
-    elif opts.type.lower() == 'pdac':
-        dat = cptac.Pdac()        
-    else:
-        exit()
-        #this call changed in recent version
-    dat_list = dat.list_data_sources().set_index('Data type').to_dict()['Available sources']
-    clinsource = dat_list['clinical']
-    if 'harmonized' in clinsource:
-        cs = 'harmonized'
-    else:
-        cs = clinsource[0]
-    dat.get_clinical(cs)
-    tsource = dat_list['transcriptomics']
-    df = dat.get_transcriptomics(tsource[0])
+    df=pandas.read_csv("/data/"+opts.type+'.csv',index_col=0)
 
-    if df.columns.nlevels == 2:
-        df.columns = df.columns.droplevel(1)
-
     # Get the sample type specific dataframe
     # if opts.sample.lower() != 'all':
     #     meta = dat.get_clinical()

diff --git a/metrics/mrna-prot/mrna-prot-comparison.cwl b/metrics/mrna-prot/mrna-prot-comparison.cwl
@@ -37,21 +37,12 @@ outputs:
    prot-files:
       type: File[]
       outputSource: run-all-algs-by-sig/prot-file
-#   dist-files:
-#      type: File[]
-#      outputSource: run-all-algs-by-sig/mat-dist-file
-#   dist-fig:
-#      type: File[]
-#      outputSource: get-distances/fig
-#   dist-tab:
-#      type: File
-#      outputSource: get-distances/table
 
 
 steps:
    run-all-algs-by-sig:
       run: call-deconv-and-cor.cwl
-      scatter: [signature,mrna-alg,prot-alg,cancerType,tissueType]
+      scatter: [signature,mrna-alg,prot-alg,tissueType,cancerType]
       scatterMethod: flat_crossproduct
       in:
         signature: signatures

diff --git a/protData/getAllDatasets.py b/protData/getAllDatasets.py
@@ -4,7 +4,8 @@
 '''
 
 import cptac
-
+import os
+os.mkdir('/data/')
 
 def getCancerObj(cancertype):
    # cptac.download(dataset=cancertype,source='harmonized',)
@@ -46,5 +47,9 @@ def getCancerObj(cancertype):
         cs = clinsource[0]
     dat.get_clinical(cs)
     tsource = dat_list['proteomics']
-    df = dat.get_proteomics(tsource[0])
-    print(ds+':',df.shape)
+    res = dat.get_proteomics(tsource[0])
+    if res.columns.nlevels == 2:
+        res.columns = res.columns.droplevel(1)
+
+    print(ds+':',res.shape)
+    res.to_csv('/data/'+ds+'.csv')
diff --git a/protData/protDataSetsCLI.py b/protData/protDataSetsCLI.py
@@ -3,8 +3,7 @@
 Basic CLI to import CPTAC proteomic data
 '''
 import argparse
-import cptac
-
+import pandas
 
 def main():
     parser = argparse.ArgumentParser()
@@ -14,42 +13,9 @@ def main():
                         help='Sample type, tumor vs normal vs all (default), \
                         to be collected')
     opts = parser.parse_args()
+    df=pandas.read_csv("/data/"+opts.type+'.csv',index_col=0)
+#    df = df.reset_index()
 
-    if opts.type.lower() == 'brca':
-        dat = cptac.Brca()
-    elif opts.type.lower() == 'ccrcc':
-        dat = cptac.Ccrcc()
-    elif opts.type.lower() == 'coad':
-        dat = cptac.Coad()
-    elif opts.type.lower() == 'ucec':
-        dat = cptac.Ucec()
-    elif opts.type.lower() == 'gbm':
-        dat = cptac.Gbm()
-    elif opts.type.lower() == 'hnscc':
-        dat = cptac.Hnscc()
-    elif opts.type.lower() == 'lscc':
-        dat = cptac.Lscc()
-    elif opts.type.lower() == 'luad':
-        dat = cptac.Luad()
-    elif opts.type.lower() == 'ovarian':
-        dat = cptac.Ov()
-    elif opts.type.lower() == 'pdac':
-        dat = cptac.Pdac()        
-    else:
-        exit()
-        #this call changed in recent version
-    dat_list = dat.list_data_sources().set_index('Data type').to_dict()['Available sources']
-    clinsource = dat_list['clinical']
-    if 'harmonized' in clinsource:
-        cs = 'harmonized'
-    else:
-        cs = clinsource[0]
-    dat.get_clinical(cs)
-    tsource = dat_list['proteomics']
-    df = dat.get_proteomics(tsource[0])
-
-    if df.columns.nlevels == 2:
-        df.columns = df.columns.droplevel(1)
 
     # Get the sample type specific dataframe
     # if opts.sample.lower() != 'all':