Merge pull request #60 from usegalaxy-eu/remote-check

Automatic check if an object already exists on ENA
usegalaxy-eu · Jan 19, 2022 · cb88385 · cb88385
2 parents 5d27dce + 0c74f43
commit cb88385
Show file tree

Hide file tree

Showing 5 changed files with 114 additions and 45 deletions.
diff --git a/README.md b/README.md
@@ -60,16 +60,17 @@ All supported arguments:
   --experiment EXPERIMENT
                         table of EXPERIMENT object
   --run RUN             table of RUN object
-  --data [FILE ...]     data for submission
+  --data [FILE [FILE ...]]
+                        data for submission
   --center CENTER_NAME  specific to your Webin account
   --checklist CHECKLIST
                         specify the sample checklist with following pattern: ERC0000XX, Default: ERC000011
-  --xlsx XLSX           Excel table with metadata
+  --xlsx XLSX           filled in excel template with metadata
+  --auto_action         BETA: detect automatically which action (add or modify) to apply when the action column is not given
   --tool TOOL_NAME      specify the name of the tool this submission is done with. Default: ena-upload-cli
   --tool_version TOOL_VERSION
                         specify the version of the tool this submission is done with
-  --no_data_upload      indicate if no upload should be performed and you like to submit a RUN object (e.g. if uploaded     
-                        was done separately).
+  --no_data_upload      indicate if no upload should be performed and you like to submit a RUN object (e.g. if uploaded was done separately).
   --draft               indicate if no submission should be performed
   --secret SECRET       .secret.yml file containing the password and Webin ID of your ENA account
   -d, --dev             flag to use the dev/sandbox endpoint of ENA
@@ -161,7 +162,10 @@ Use the *--dev* flag if you want to do a test submission using the tool by the s
 
 ### Submitting a selection of rows to ENA
 
-Optionally you can add a status column to every table that contains the action you want to apply during this submission. If you chose to add only the first 2 samples to ENA, you specify `--action add` as parameter in the command and you add the `add` value to the status column of the rows you want to submit as demonstrated below. Same holds for the action `modify`, `release` and `cancel`.
+There are two ways of submitting only a selection of objects to ENA. This is handy for reoccurring submissions, especially when they belong to the same study.
+
+- Manual: you can add an optional `status` column to every table/sheet that contains the action you want to apply during this submission. If you choose to add only the first 2 samples to ENA, you specify `--action add` as parameter in the command and you add the `add` value to the status column of the rows you want to submit as demonstrated below. The same holds for the actions `modify`, `release` and `cancel`.
+- Automatic (BETA): using the `--auto_action` flag, the tool detects whether an object (using its alias) is already present on ENA and fills in the specified action (`--action` parameter) accordingly. In practice, this means that if a user chooses to add objects and an object with the same alias already exists on ENA, this object will not be added. On the other hand, if the command is used to modify objects, the action is applied solely to objects that already exist on ENA. The detection only works with ENA objects that are published and findable on the website through the search function (both the dev and live website). If the tool does not correctly detect the presence of your ENA object, we suggest using the more robust manual approach described above.
 
 **Example with modify as seen in the [example sample modify table](example_tables/ENA_template_samples_modify.tsv)**
 

diff --git a/ena_upload/_version.py b/ena_upload/_version.py
@@ -1 +1 @@
-__version__ = "0.5.1"
+__version__ = "0.5.2"
diff --git a/ena_upload/check_remote.py b/ena_upload/check_remote.py
@@ -0,0 +1,37 @@
+import json
+import requests
+
+URL = "https://www.ebi.ac.uk/ena/portal/api/search"
+DEV_URL = "https://wwwdev.ebi.ac.uk/ena/portal/api/search"
+
+def identify_action(entry_type, alias, dev):
+    ''' report whether an entry with this alias is already present on ENA
+
+    entry_type = [study | sample | experiment | run]
+    alias      = alias of the object to look up
+    dev        = passed through to check_remote_entry to select the endpoint
+
+    returns True when the remote lookup yields a non-empty list, else False
+    '''
+    hits = check_remote_entry(entry_type, {entry_type + '_alias': alias}, dev)
+    # anything that is not a non-empty list counts as "not present"
+    found = isinstance(hits, list) and len(hits) > 0
+    if found:
+        print(f'\tFound: {entry_type} entry with alias {alias}')
+    else:
+        print(f'\tNo {entry_type} entry found with alias {alias}')
+    return found
+
+
+def check_remote_entry(entry_type, query_dict, dev, timeout=60):
+    '''
+    Query the ENA portal search API for entries matching the given fields
+
+    entry_type = [study | sample | experiment | run]
+    query_dict = {field_name: value}; the pairs are combined with AND
+    dev        = truthy to query DEV_URL instead of URL
+    timeout    = seconds handed to requests so a stalled server cannot
+                 hang the submission indefinitely
+
+    returns the decoded JSON body, or an empty list when the response
+    body is empty
+    raises ValueError for an unsupported entry_type
+    '''
+    # explicit check instead of assert: asserts are stripped under python -O
+    if entry_type not in ('study', 'sample', 'experiment', 'run'):
+        raise ValueError(f"Unsupported ENA entry type: {entry_type!r}")
+    query_str = ' AND '.join(
+        f'{key}="{value}"' for key, value in query_dict.items())
+    params_dict = {
+        'query': query_str,
+        'result': 'read_' + entry_type,
+        'fields': entry_type + '_alias',
+        'format': 'json',
+    }
+    # one request; only the endpoint differs between dev and live
+    response = requests.post(DEV_URL if dev else URL,
+                             data=params_dict, timeout=timeout)
+    if response.content != b'':
+        return json.loads(response.content)
+    return []
diff --git a/ena_upload/ena_upload.py b/ena_upload/ena_upload.py
@@ -20,9 +20,12 @@
 import pandas as pd
 import tempfile
 from ena_upload._version import __version__
+from ena_upload.check_remote import identify_action
 
 SCHEMA_TYPES = ['study', 'experiment', 'run', 'sample']
 
+STATUS_CHANGES = {'ADD': 'ADDED', 'MODIFY': 'MODIFIED',
+              'CANCEL': 'CANCELLED', 'RELEASE': 'RELEASED'}
 
 class MyFTP_TLS(ftplib.FTP_TLS):
     """Explicit FTPS, with shared TLS session"""
@@ -36,7 +39,7 @@ def ntransfercmd(self, cmd, rest=None):
         return conn, size
 
 
-def create_dataframe(schema_tables, action):
+def create_dataframe(schema_tables, action, dev, auto_action):
     '''create pandas dataframe from the tables in schema_tables
        and return schema_dataframe
 
@@ -54,7 +57,7 @@ def create_dataframe(schema_tables, action):
     for schema, table in schema_tables.items():
         df = pd.read_csv(table, sep='\t', comment='#', dtype=str)
         df = df.dropna(how='all')
-        df = check_columns(df, schema, action)
+        df = check_columns(df, schema, action, dev, auto_action)
         schema_dataframe[schema] = df
 
     return schema_dataframe
@@ -80,7 +83,7 @@ def extract_targets(action, schema_dataframe):
     return schema_targets
 
 
-def check_columns(df, schema, action):
+def check_columns(df, schema, action, dev, auto_action):
     # checking for optional columns and if not present, adding them
     if schema == 'sample':
         optional_columns = ['accession', 'submission_date',
@@ -94,10 +97,32 @@ def check_columns(df, schema, action):
     for header in optional_columns:
         if not header in df.columns:
             if header == 'status':
-                # status column contain action keywords
-                # for xml rendering, keywords require uppercase
-                # according to scheme definition of submission
-                df[header] = str(action).upper()
+                if auto_action:
+                    for index, row in df.iterrows():
+                        remote_present = np.nan
+                        try:
+                            remote_present = str(identify_action(
+                                schema, str(df['alias'][index]), dev)).upper()
+
+                        except Exception as e:
+                            print(e)
+                            print(
+                                f"Something went wrong with detecting the ENA object {df['alias'][index]} on the servers of ENA. This object will be skipped.")
+                        if remote_present == np.nan:
+                            df.at[index, header] = np.nan
+                        elif remote_present and action == 'MODIFY':
+                            df.at[index, header] = action
+                            print(
+                                f"\t'{df['alias'][index]}' gets '{remote_present}' as action in the status column")
+                        elif not remote_present and action in ['ADD', 'CANCEL', 'RELEASE']:
+                            df.at[index, header] = action
+                            print(
+                                f"\t'{df['alias'][index]}' gets '{remote_present}' as action in the status column")
+                else:
+                    # status column contain action keywords
+                    # for xml rendering, keywords require uppercase
+                    # according to scheme definition of submission
+                    df[header] = str(action).upper()
             else:
                 df[header] = np.nan
         else:
@@ -106,6 +131,7 @@ def check_columns(df, schema, action):
 
     return df
 
+
 def check_filenames(file_paths, run_df):
     """Compare data filenames from command line and from RUN table.
 
@@ -462,16 +488,12 @@ def process_receipt(receipt, action):
         errors = '\nOops:\n' + '\n'.join(errors)
         sys.exit(errors)
 
-    # define expected status based on action
-    status = {'ADD': 'added', 'MODIFY': 'modified',
-              'CANCEL': 'cancelled', 'RELEASE': 'released'}
-
     def make_update(update, ena_type):
         update_list = []
         print(f"\n{ena_type.capitalize()} accession details:")
         for element in update:
             extract = (element.get('alias'), element.get(
-                'accession'), receiptDate, status[action])
+                'accession'), receiptDate, STATUS_CHANGES[action])
             print("\t".join(extract))
             update_list.append(extract)
         # used for labelling dataframe
@@ -515,7 +537,7 @@ def make_update(update, ena_type):
             print(f"\n{ena_type.capitalize()} accession details:")
             update_list = []
             for accession in accessions:
-                extract = (accession, receiptDate, status[action])
+                extract = (accession, receiptDate, STATUS_CHANGES[action])
                 update_list.append(extract)
                 print("\t".join(extract))
 
@@ -587,9 +609,6 @@ def update_table_simple(schema_dataframe, schema_targets, action):
     :return schema_dataframe: a dictionary - {schema:dataframe}
                               dataframe -- updated status
     """
-    # define expected status based on action
-    status = {'ADD': 'added', 'MODIFY': 'modified',
-              'CANCEL': 'cancelled', 'RELEASE': 'released'}
 
     for schema in schema_targets.keys():
         dataframe = schema_dataframe[schema]
@@ -599,7 +618,7 @@ def update_table_simple(schema_dataframe, schema_targets, action):
         targets.set_index('alias', inplace=True)
 
         for index in targets.index:
-            dataframe.loc[index, 'status'] = status[action]
+            dataframe.loc[index, 'status'] = STATUS_CHANGES[action]
 
     return schema_dataframe
 
@@ -687,10 +706,15 @@ def process_args():
 
     parser.add_argument('--checklist', help="specify the sample checklist with following pattern: ERC0000XX, Default: ERC000011", dest='checklist',
                         default='ERC000011')
-    
+
     parser.add_argument('--xlsx',
-                        help='excel table with metadata')
-
+                        help='filled in excel template with metadata')
+
+    parser.add_argument('--auto_action',
+                        action="store_true",
+                        default=False,
+                        help='BETA: detect automatically which action (add or modify) to apply when the action column is not given')
+
     parser.add_argument('--tool',
                         dest='tool_name',
                         default='ena-upload-cli',
@@ -730,15 +754,15 @@ def process_args():
         if not os.path.isfile(args.secret):
             msg = f"Oops, the file {args.secret} does not exist"
             parser.error(msg)
-    
+
     # check if xlsx file exists
     if args.xlsx:
         if not os.path.isfile(args.xlsx):
             msg = f"Oops, the file {args.xlsx} does not exist"
             parser.error(msg)
 
     # check if data is given when adding a 'run' table
-    if (not args.no_data_upload and args.run and args.action.upper() not in ['RELEASE','CANCEL']) or (not args.no_data_upload and args.xlsx and args.action.upper() not in ['RELEASE','CANCEL']):
+    if (not args.no_data_upload and args.run and args.action.upper() not in ['RELEASE', 'CANCEL']) or (not args.no_data_upload and args.xlsx and args.action.upper() not in ['RELEASE', 'CANCEL']):
         if args.data is None:
             parser.error('Oops, requires data for submitting RUN object')
 
@@ -767,6 +791,7 @@ def collect_tables(args):
 
     return schema_tables
 
+
 def update_date(date):
     if pd.isnull(date) or isinstance(date, str):
         return date
@@ -788,6 +813,7 @@ def main():
     secret = args.secret
     draft = args.draft
     xlsx = args.xlsx
+    auto_action = args.auto_action
 
     with open(secret, 'r') as secret_file:
         credentials = yaml.load(secret_file, Loader=yaml.FullLoader)
@@ -812,16 +838,19 @@ def main():
             elif f"ENA_{schema}" in xl_workbook.book.sheetnames:
                 xl_sheet = xl_workbook.parse(f"ENA_{schema}", header=0)
             else:
-                sys.exit(f"The sheet '{schema}' is not present in the excel sheet {xlsx}")
+                sys.exit(
+                    f"The sheet '{schema}' is not present in the excel sheet {xlsx}")
             xl_sheet = xl_sheet.drop(0).dropna(how='all')
             for column_name in list(xl_sheet.columns.values):
                 if 'date' in column_name:
-                    xl_sheet[column_name] = xl_sheet[column_name].apply(update_date)
+                    xl_sheet[column_name] = xl_sheet[column_name].apply(
+                        update_date)
 
             if True in xl_sheet.columns.duplicated():
                 sys.exit("Duplicated columns found")
 
-            xl_sheet = check_columns(xl_sheet, schema, action)
+            xl_sheet = check_columns(
+                xl_sheet, schema, action, dev, auto_action)
             schema_dataframe[schema] = xl_sheet
             path = os.path.dirname(os.path.abspath(xlsx))
             schema_tables[schema] = f"{path}/ENA_template_{schema}.tsv"
@@ -830,7 +859,8 @@ def main():
         schema_tables = collect_tables(args)
 
         # create dataframe from table
-        schema_dataframe = create_dataframe(schema_tables, action)
+        schema_dataframe = create_dataframe(
+            schema_tables, action, dev, auto_action)
 
     # ? add a function to sanitize characters
     # ? print 'validate table for specific action'
@@ -854,11 +884,11 @@ def main():
             file_paths = {}
             if args.data:
                 for path in args.data:
-                    file_paths[os.path.basename(path)] =  os.path.abspath(path) 
+                    file_paths[os.path.basename(path)] = os.path.abspath(path)
                 # check if file names identical between command line and table
                 # if not, system exits
                 check_filenames(file_paths, df)
-            
+
             # generate MD5 sum if not supplied in table
             if file_paths and not check_file_checksum(df):
                 print("No valid checksums found, generate now...", end=" ")
@@ -953,18 +983,16 @@ def main():
             print("There was an ERROR during submission:")
             sys.exit(receipt)
 
-        if action in ['ADD', 'MODIFY']:
-            schema_dataframe = update_table(schema_dataframe,
+    if action in ['ADD', 'MODIFY'] and not draft:
+        schema_dataframe = update_table(schema_dataframe,
                                             schema_targets,
                                             schema_update)
-            # save updates in new tables
-            save_update(schema_tables, schema_dataframe)
-        elif action in ['CANCEL', 'RELEASE']:
-            schema_dataframe = update_table_simple(schema_dataframe,
-                                                   schema_targets,
-                                                   action)
-            # save updates in new tables
-            save_update(schema_tables, schema_dataframe)
+    else:
+        schema_dataframe = update_table_simple(schema_dataframe,
+                                               schema_targets,
+                                               action)
+    # save updates in new tables
+    save_update(schema_tables, schema_dataframe)
 
 
 if __name__ == "__main__":

diff --git a/setup.py b/setup.py
@@ -1,4 +1,4 @@
-from setuptools import setup, find_packages
+from setuptools import setup
 from ena_upload._version import __version__
 
 with open("README.md", 'r') as f: