From 239dc5112dbc0883076cc47706b7a86dcaca0a18 Mon Sep 17 00:00:00 2001 From: AyeshaSanadi <167299982+AyeshaSanadi@users.noreply.github.com> Date: Thu, 12 Dec 2024 06:32:29 +0530 Subject: [PATCH] Regression inside command line tools [cmf artifact/execution list command] (#218) * Modified code * return data instead of assigned it to var * Added comment and optimized code * Modified command descriptions * Handled all testing scenario inside cmf artifact list cmd * Enhancement of -j option in cmf metadata export cmd * Updated help descriptions for every command. * Added proper comment, descriptions and warnings. * Added proper comments inside code. * Modified description of command and update document. * Updated description for cmf artifact/execution list command * Made changes inside cmf artifact/execution list code * Added 'name' as a properties inside execution list cmd. * Removed long option. --- cmflib/cmf.py | 28 ++- cmflib/cmf_commands_wrapper.py | 12 +- cmflib/commands/artifact/list.py | 245 +++++++++++++++++++++----- cmflib/commands/execution/__init__.py | 6 +- cmflib/commands/execution/list.py | 195 ++++++++++++++------ cmflib/commands/metadata/export.py | 130 ++++++++++---- cmflib/commands/pipeline/__init__.py | 4 +- cmflib/commands/pipeline/list.py | 37 ++-- docs/cmf_client/cmf_client.md | 42 +++-- requirements.txt | 2 + 10 files changed, 505 insertions(+), 196 deletions(-) diff --git a/cmflib/cmf.py b/cmflib/cmf.py index cb653dd6..a0af65f3 100644 --- a/cmflib/cmf.py +++ b/cmflib/cmf.py @@ -2305,7 +2305,7 @@ def non_related_args(type : str, args : dict): def pipeline_list(filepath = "./mlmd"): - """ Display list of pipline for current mlmd. + """ Display a list of pipeline name(s) from the available mlmd file. Example: ```python @@ -2323,43 +2323,41 @@ def pipeline_list(filepath = "./mlmd"): return output -def execution_list(pipeline_name: str, filepath = "./mlmd", execution_id: str = "", long = True): - """ Display list of execution for given pipeline. 
+def execution_list(pipeline_name: str, filepath = "./mlmd", execution_id: str = ""): + """Displays executions from the MLMD file with a few properties in a 7-column table, limited to 20 records per page. Example: ```python - result = _execution_list("example_pipeline", "./mlmd_directory", "example_execution_id", "long") + result = _execution_list("example_pipeline", "./mlmd_directory", "example_execution_id") ``` Args: pipeline_name: Name of the pipeline. filepath: Path to store the mlmd file. - execution_id: Executions for particular execution id. - long: Detailed summary regarding execution. + execution_id: Executions for particular execution id. Returns: Output from the _execution_list function. """ # Required arguments: pipeline_name - # Optional arguments: filepath( path to store mlmd file), execution_id, long - output = _execution_list(pipeline_name, filepath, execution_id, long) + # Optional arguments: filepath( path to store mlmd file), execution_id + output = _execution_list(pipeline_name, filepath, execution_id) return output -def artifact_list(pipeline_name: str, filepath = "./mlmd", artifact_name: str = "", long = True): - """ Display list of artifact for given pipeline. +def artifact_list(pipeline_name: str, filepath = "./mlmd", artifact_name: str = ""): + """ Displays artifacts from the MLMD file with a few properties in a 7-column table, limited to 20 records per page. Example: ```python - result = _artifact_list("example_pipeline", "./mlmd_directory", "example_artifact_name", "long") + result = _artifact_list("example_pipeline", "./mlmd_directory", "example_artifact_name") ``` Args: pipeline_name: Name of the pipeline. filepath: Path to store the mlmd file. - artifact_name: Artifacts for particular artifact name. - long: Detailed summary regarding artifact. + artifact_name: Artifacts for particular artifact name. Returns: Output from the _artifact_list function. 
""" # Required arguments: pipeline_name - # Optional arguments: filepath( path to store mlmd file), artifact_name, long - output = _artifact_list(pipeline_name, filepath, artifact_name, long) + # Optional arguments: filepath( path to store mlmd file), artifact_name + output = _artifact_list(pipeline_name, filepath, artifact_name) return output diff --git a/cmflib/cmf_commands_wrapper.py b/cmflib/cmf_commands_wrapper.py index a5d9a420..6a76b562 100644 --- a/cmflib/cmf_commands_wrapper.py +++ b/cmflib/cmf_commands_wrapper.py @@ -282,7 +282,7 @@ def _init_osdfremote(path, key_id, key_path, key_issuer, git_remote_url, cmf_ser print(msg) return msg -def _artifact_list(pipeline_name, file_name, artifact_name, long): +def _artifact_list(pipeline_name, file_name, artifact_name): cli_args = cli.parse_args( [ "artifact", @@ -292,9 +292,7 @@ def _artifact_list(pipeline_name, file_name, artifact_name, long): "-f", file_name, "-a", - artifact_name, - "-l", - long + artifact_name ] ) cmd = cli_args.func(cli_args) @@ -316,7 +314,7 @@ def _pipeline_list(file_name): print(msg) return msg -def _execution_list(pipeline_name, file_name, execution_id, long): +def _execution_list(pipeline_name, file_name, execution_id): cli_args = cli.parse_args( [ "execution", @@ -326,9 +324,7 @@ def _execution_list(pipeline_name, file_name, execution_id, long): "-f", file_name, "-e", - execution_id, - "-l", - long + execution_id ] ) cmd = cli_args.func(cli_args) diff --git a/cmflib/commands/artifact/list.py b/cmflib/commands/artifact/list.py index 536e66f1..cedf3164 100644 --- a/cmflib/commands/artifact/list.py +++ b/cmflib/commands/artifact/list.py @@ -17,69 +17,225 @@ import argparse import os import pandas as pd +import textwrap +from tabulate import tabulate from cmflib.cli.command import CmdBase from cmflib import cmfquery +from cmflib.dvc_wrapper import dvc_get_config +from typing import Union, List class CmdArtifactsList(CmdBase): - def update_dataframe(self, df): - for c in df.columns: - if 
c.startswith('custom_properties_'): - df.rename(columns = {c:c.replace('custom_properties_','')}, inplace = True) - else: - df = df.drop(c, axis = 1) + def convert_to_datetime(self, df: pd.DataFrame, col_name: str) -> pd.DataFrame: + """ + Function to convert a column to datetime format. + Parameters: + - df: The DataFrame containing the data. + - col_name: The name of the column to convert to datetime. + Returns: + - The updated DataFrame with the specified column converted to datetime format. + """ + # Convert the col_name column to UTC datetime format. + # The datetime is formatted as "Day DD Mon YYYY HH:MM:SS GMT" + df=df.copy() + df[col_name] = pd.to_datetime(df[col_name], unit='ms', utc=True).dt.strftime("%a %d %b %Y %H:%M:%S GMT") + return df + + def display_table(self, df: pd.DataFrame) -> None: + """ + Display the DataFrame in a paginated table format with text wrapping for better readability. + Parameters: + - df: The DataFrame to display. + """ + # Rearranging columns + updated_columns = ["id", "name", "type", "create_time_since_epoch", "url", "Commit", "uri"] + df = df[updated_columns] + df = df.copy() + + # Wrap text in object-type columns to a width of 14 characters. + # This ensures that long strings are displayed neatly within the table. + for col in df.select_dtypes(include=["object"]).columns: + df[col] = df[col].apply(lambda x: textwrap.fill(x, width=14) if isinstance(x, str) else x) + + total_records = len(df) + start_index = 0 + + # Display up to 20 records per page for better readability. + # This avoids overwhelming the user with too much data at once, especially for larger mlmd files. + while True: + end_index = start_index + 20 + records_per_page = df.iloc[start_index:end_index] + + # Display the table. + table = tabulate( + records_per_page, + headers=df.columns, + tablefmt="grid", + showindex=False, + ) + print(table) + + # Check if we've reached the end of the records. 
+ if end_index >= total_records: + print("\nEnd of records.") + break + + # Ask the user for input to navigate pages. + user_input = input("Press Enter to see more or 'q' to quit: ").strip().lower() + if user_input == 'q': + break + + # Update start index for the next page. + start_index = end_index + + def search_artifact(self, df: pd.DataFrame) -> Union[int, List[int]]: + """ + Searches for the specified 'artifact_name' in the DataFrame and returns matching IDs. - def search_artifact(self, df): + Parameters: + - df: DataFrame to search within. + + Returns: + - List of matching IDs or -1 if no matches are found. + """ + # Example of a given sample 'artifact_name' --> artifacts/parsed/train.tsv:12345 + # These are the combinations we are implementing: + # 1. artifacts/parsed/train.tsv + # 2. train.tsv + # 3. artifacts/parsed/train.tsv:12345 + # 4. train.tsv:12345 + + # In case of multiple occurrences of 'artifact_name', we need to store the IDs of all matching names. + # For example, if "metrics" appears multiple times, we store all its IDs. + matched_ids = [] + artifact_name = self.args.artifact_name[0].strip() for index, row in df.iterrows(): - name = row['name'].split(":")[0] - file_name = name.split('/')[-1] - if file_name == self.args.artifact_name: - return row['id'] + # Extract the base name from the row. + # eg. 
artifacts/parsed/train.tsv:12345 --> artifacts/parsed/train.tsv + name = row['name'].split(":")[0] + if artifact_name == name: # Match the full path: artifacts/parsed/train.tsv + matched_ids.append(row['id']) + elif artifact_name == name.split('/')[-1]: # Match only the file name: train.tsv + matched_ids.append(row['id']) + elif artifact_name == row['name']: # Match the full path with hash: artifacts/parsed/train.tsv:12345 + matched_ids.append(row['id']) + elif artifact_name == row["name"].split('/')[-1]: # Match only the file name with hash: train.tsv:12345 + matched_ids.append(row['id']) + + if len(matched_ids) != 0: + return matched_ids return -1 def run(self): + # Check if 'cmf' is configured. + msg = "'cmf' is not configured.\nExecute 'cmf init' command." + result = dvc_get_config() + if len(result) == 0: + return msg + current_directory = os.getcwd() - # default path for mlmd file name - mlmd_file_name = "./mlmd" - if self.args.file_name: - mlmd_file_name = self.args.file_name + if not self.args.file_name: # If self.args.file_name is None or an empty list ([]). + mlmd_file_name = "./mlmd" # Default path for mlmd file name. + elif len(self.args.file_name) > 1: # If the user provided more than one file name. + return "Error: You can only provide one file name using the -f flag." + elif not self.args.file_name[0]: # self.args.file_name[0] is an empty string (""). + return "Error: Missing File name" + else: + mlmd_file_name = self.args.file_name[0].strip() if mlmd_file_name == "mlmd": mlmd_file_name = "./mlmd" - current_directory = os.path.dirname(mlmd_file_name) + + current_directory = os.path.dirname(mlmd_file_name) if not os.path.exists(mlmd_file_name): - return f"ERROR: {mlmd_file_name} doesn't exists in {current_directory} directory." + return f"Error: {mlmd_file_name} doesn't exists in {current_directory} directory." - # Creating cmfquery object + # Creating cmfquery object. query = cmfquery.CmfQuery(mlmd_file_name) + + # Check if pipeline exists in mlmd. 
+ if self.args.pipeline_name is not None and len(self.args.pipeline_name) > 1: + return "Error: You can only provide one pipeline name using the -p flag." + elif not self.args.pipeline_name[0]: # self.args.pipeline_name[0] is an empty string (""). + return "Error: Missing pipeline name" + else: + pipeline_name = self.args.pipeline_name[0] + + df = query.get_all_artifacts_by_context(pipeline_name) + + if df.empty: + return "Pipeline name doesn't exists..." + else: + if not self.args.artifact_name: # If self.args.artifact_name is None or an empty list ([]). + pass + elif len(self.args.artifact_name) > 1: # If the user provided more than one artifact_name. + return "Error: You can only provide one artifact name using the -a flag." + elif not self.args.artifact_name[0]: # self.args.artifact_name[0] is an empty string (""). + return "Error: Missing artifact name" + else: + artifact_ids = self.search_artifact(df) + if(artifact_ids != -1): + # Multiple/Single artifact names exist with the same name. + for artifact_id in artifact_ids: + # Filter the DataFrame to retrieve rows corresponding to the current ID. + filtered_data = df.loc[df['id'] == artifact_id] + + # Converting "create_time_since_epoch" and "last_update_time_since_epoch" to datetime format. + filtered_data = self.convert_to_datetime(filtered_data, "create_time_since_epoch") + filtered_data = self.convert_to_datetime(filtered_data, "last_update_time_since_epoch") + + # Rearranging columns: Start with fixed columns and appending the remaining columns. 
+ updated_columns = ["id", "name", "type", "create_time_since_epoch", "url", "Commit", "uri", "last_update_time_since_epoch"] + updated_columns += [ col for col in filtered_data.columns if col not in updated_columns] + + filtered_data = filtered_data[updated_columns] - df = query.get_all_artifacts_by_context(self.args.pipeline_name) + # Drop columns that start with 'custom_properties_' and that contains NaN values + columns_to_drop = [col for col in filtered_data.columns if col.startswith('custom_properties_') and df[col].isna().any()] + filtered_data = filtered_data.drop(columns=columns_to_drop) - if not df.empty: - if self.args.artifact_name: - artifact_id = self.search_artifact(df) - if(artifact_id != -1): - df = df.query(f'id == {int(artifact_id)}') + # Wrap text in object-type columns to a width of 30 characters for better readability. + for col in filtered_data.select_dtypes(include=['object']).columns: + filtered_data[col] = filtered_data[col].apply(lambda x: textwrap.fill(x, width=30) if isinstance(x, str) else x) + + # For a single artifact name, display the table in a horizontal format: + # Set 'id' as the index. + filtered_data.set_index("id", inplace=True) + # Transpose the DataFrame to make rows into columns. + filtered_data = filtered_data.T.reset_index() + # Rename the first column back to 'id' for consistency. + filtered_data.columns.values[0] = 'id' + + # Display the formatted and transposed table using the 'tabulate' library. + table = tabulate( + filtered_data, + headers=filtered_data.columns, # Use column names as headers. + tablefmt="grid", # Use grid format for table borders. + showindex=False, # Do not display the default index. + ) + print(table) + print() + + user_input = input("Press Enter to see more records if exists or 'q' to quit: ").strip().lower() + if user_input == 'q': + break + return "End of records.." else: - df = "Artifact name does not exist.." - else: - df = "Pipeline does not exist..." 
+ return "Artifact name does not exist.." + + df = self.convert_to_datetime(df, "create_time_since_epoch") + self.display_table(df) + + return "Done." - if not isinstance(df, str): - if self.args.long: - pd.set_option('display.max_rows', None) # Set to None to display all rows - pd.set_option('display.max_columns', None) # Set to None to display all columns - else: - df = self.update_dataframe(df) - return df def add_parser(subparsers, parent_parser): - ARTIFACT_LIST_HELP = "Display list of artifact as present in current mlmd" + ARTIFACT_LIST_HELP = "Displays artifacts from the MLMD file with a few properties in a 7-column table, limited to 20 records per page." parser = subparsers.add_parser( "list", parents=[parent_parser], - description="Display artifact list", + description="Displays artifacts from the MLMD file with a few properties in a 7-column table, limited to 20 records per page.", help=ARTIFACT_LIST_HELP, formatter_class=argparse.RawDescriptionHelpFormatter, ) @@ -90,26 +246,25 @@ def add_parser(subparsers, parent_parser): "-p", "--pipeline_name", required=True, + action="append", help="Specify pipeline name.", metavar="", ) parser.add_argument( - "-f", "--file_name", help="Specify mlmd file name.", metavar="", + "-f", + "--file_name", + action="append", + help="Specify the absolute or relative path for the input MLMD file.", + metavar="", ) parser.add_argument( "-a", "--artifact_name", - help="Specify artifact name.", + action="append", + help="Specify the artifact name to display detailed information about the given artifact name.", metavar="", ) - parser.add_argument( - "-l", - "--long", - action='store_true', - help="Display detailed summary of artifact", - ) - parser.set_defaults(func=CmdArtifactsList) \ No newline at end of file diff --git a/cmflib/commands/execution/__init__.py b/cmflib/commands/execution/__init__.py index 0639b0cc..063adb9b 100644 --- a/cmflib/commands/execution/__init__.py +++ b/cmflib/commands/execution/__init__.py @@ -23,12 
+23,12 @@ # This parser adds positional argumets to the main parser def add_parser(subparsers, parent_parser): - LIST_HELP = "Command to list executions." + LIST_HELP = "Display all executions with detailed information from the specified MLMD file." list_parser = subparsers.add_parser( - "executions", + "execution", parents=[parent_parser], - description="Display list of executions as present in current mlmd", + description="Display all executions with detailed information from the specified MLMD file.", help=LIST_HELP, formatter_class=argparse.RawDescriptionHelpFormatter, ) diff --git a/cmflib/commands/execution/list.py b/cmflib/commands/execution/list.py index 262d39a8..5816d108 100644 --- a/cmflib/commands/execution/list.py +++ b/cmflib/commands/execution/list.py @@ -16,71 +16,161 @@ import argparse import os +import textwrap import pandas as pd from cmflib.cli.command import CmdBase from cmflib import cmfquery +from tabulate import tabulate +from cmflib.dvc_wrapper import dvc_get_config class CmdExecutionList(CmdBase): - def update_dataframe(self, df): - # This function return dataframe with custom_properties_ only. - for c in df.columns: - if c.startswith('custom_properties_'): - df.rename(columns = {c:c.replace('custom_properties_','')}, inplace = True) - else: - df = df.drop(c, axis = 1) - return df - + + def display_table(self, df: pd.DataFrame) -> None: + """ + Display the DataFrame in a paginated table format with text wrapping for better readability. + Parameters: + - df: The DataFrame to display. + """ + # Rearranging columns + updated_columns = ["id", "Context_Type", "Execution", "Execution_uuid", "name", "Pipeline_Type", "Git_Repo"] + df = df[updated_columns] + df = df.copy() + + # Wrap text in object-type columns to a width of 14 characters. + # This ensures that long strings are displayed neatly within the table. 
+ for col in df.select_dtypes(include=["object"]).columns: + df[col] = df[col].apply(lambda x: textwrap.fill(x, width=14) if isinstance(x, str) else x) + + total_records = len(df) + start_index = 0 + + # Display up to 20 records per page for better readability. + # This avoids overwhelming the user with too much data at once, especially for larger mlmd files. + while True: + end_index = start_index + 20 + records_per_page = df.iloc[start_index:end_index] + + # Display the table. + table = tabulate( + records_per_page, + headers=df.columns, + tablefmt="grid", + showindex=False, + ) + print(table) + + # Check if we've reached the end of the records. + if end_index >= total_records: + print("\nEnd of records.") + break + + # Ask the user for input to navigate pages. + user_input = input("Press Enter to see more or 'q' to quit: ").strip().lower() + if user_input == 'q': + break + + # Update start index for the next page. + start_index = end_index + def run(self): + # Check if 'cmf' is configured + msg = "'cmf' is not configured.\nExecute 'cmf init' command." + result = dvc_get_config() + if len(result) == 0: + return msg + current_directory = os.getcwd() - # default path for mlmd file name - mlmd_file_name = "./mlmd" - if self.args.file_name: - mlmd_file_name = self.args.file_name + if not self.args.file_name: # If self.args.file_name is None or an empty list ([]). + mlmd_file_name = "./mlmd" # Default path for mlmd file name. + elif len(self.args.file_name) > 1: # If the user provided more than one file name. + return "Error: You can only provide one file name using the -f flag." + elif not self.args.file_name[0]: # self.args.file_name[0] is an empty string (""). 
+ return "Error: Missing File name" + else: + mlmd_file_name = self.args.file_name[0].strip() if mlmd_file_name == "mlmd": mlmd_file_name = "./mlmd" - current_directory = os.path.dirname(mlmd_file_name) - if not os.path.exists(mlmd_file_name): - return f"ERROR: {mlmd_file_name} doesn't exists in {current_directory} directory." + + current_directory = os.path.dirname(mlmd_file_name) + if not os.path.exists(mlmd_file_name): + return f"Error: {mlmd_file_name} doesn't exists in {current_directory} directory." - # Creating cmfquery object + # Creating cmfquery object. query = cmfquery.CmfQuery(mlmd_file_name) - df = query.get_all_executions_in_pipeline(self.args.pipeline_name) + # Check if pipeline exists in mlmd. + if self.args.pipeline_name is not None and len(self.args.pipeline_name) > 1: + return "Error: You can only provide one pipeline name using the -p flag." + elif not self.args.pipeline_name[0]: # self.args.pipeline_name[0] is an empty string (""). + return "Error: Missing pipeline name" + else: + pipeline_name = self.args.pipeline_name[0] + + df = query.get_all_executions_in_pipeline(pipeline_name) - # If dataframe is empty that means pipeline name is not exist - if df.empty: - df = "Pipeline does not exist.." + # Check if the DataFrame is empty, indicating the pipeline name does not exist. + if df.empty: + return "Pipeline does not exist.." else: - # If the new mlmd came[not in the case of Test-env] which is not pushed inside server, - # it doesn't exist column named with "Python_Env" + # Drop the 'Python_Env' column if it exists in the DataFrame. if "Python_Env" in df.columns: - # Dropping Python_Env column - df = df.drop(['Python_Env'], axis=1) # Type of df is series of integers - if self.args.execution_id: - try: - if int(self.args.execution_id) in list(df['id']): # Converting series to list - df = df.query(f'id == {int(self.args.execution_id)}') - else: - df = "Execution id does not exist.." - except: - df = "Execution id does not exist.." 
- - if not isinstance(df, str): - if self.args.long: - pd.set_option('display.max_rows', None) # Set to None to display all rows - pd.set_option('display.max_columns', None) # Set to None to display all columns - else: - df = self.update_dataframe(df) - return df + df = df.drop(['Python_Env'], axis=1) # Type of df is series of integers. + + # Process execution ID if provided + if not self.args.execution_id: # If self.args.execution_id is None or an empty list ([]). + pass + elif len(self.args.execution_id) > 1: # If the user provided more than one execution_id. + return "Error: You can only provide one execution id using the -e flag." + elif not self.args.execution_id[0]: # self.args.execution_id[0] is an empty string (""). + return "Error: Missing execution id" + else: + if self.args.execution_id[0].isdigit(): + if int(self.args.execution_id[0]) in list(df['id']): # Converting series to list. + df = df.query(f'id == {int(self.args.execution_id[0])}') # Used dataframe based on execution id + + # Rearranging columns: Start with fixed columns and appending the remaining columns. + updated_columns = ["id", "Context_Type", "Execution", "Execution_uuid", "name", "Pipeline_Type", "Git_Repo"] + updated_columns += [ col for col in df.columns if col not in updated_columns] + + df = df[updated_columns] + + # Drop columns that start with 'custom_properties_' and that contains NaN values + columns_to_drop = [col for col in df.columns if col.startswith('custom_properties_') and df[col].isna().any()] + df = df.drop(columns=columns_to_drop) + + # Wrap text in object-type columns to a width of 30 characters. + for col in df.select_dtypes(include=['object']).columns: + df[col] = df[col].apply(lambda x: textwrap.fill(x, width=30) if isinstance(x, str) else x) + + # Set 'id' as the DataFrame index and transpose it for display horizontally. + df.set_index("id", inplace=True) + df = df.T.reset_index() + df.columns.values[0] = 'id' # Rename the first column back to 'id'. 
+ + # Display the updated DataFrame as a formatted table. + table = tabulate( + df, + headers=df.columns, + tablefmt="grid", + showindex=False, + ) + print(table) + print() + return "Done" + return "Execution id does not exist.." + + self.display_table(df) + return "Done" + def add_parser(subparsers, parent_parser): - EXECUTION_LIST_HELP = "Display list of executions as present in current mlmd" + EXECUTION_LIST_HELP = "Displays executions from the MLMD file with a few properties in a 7-column table, limited to 20 records per page." parser = subparsers.add_parser( "list", parents=[parent_parser], - description="Display list of executions", + description="Displays executions from the MLMD file with a few properties in a 7-column table, limited to 20 records per page.", help=EXECUTION_LIST_HELP, formatter_class=argparse.RawDescriptionHelpFormatter, ) @@ -89,28 +179,27 @@ def add_parser(subparsers, parent_parser): required_argumets.add_argument( "-p", - "--pipeline_name", + "--pipeline_name", + action="append", required=True, help="Specify pipeline name.", metavar="", ) parser.add_argument( - "-f", "--file_name", help="Specify mlmd file name.", metavar="", + "-f", + "--file_name", + action="append", + help="Specify the absolute or relative path for the input MLMD file.", + metavar="", ) parser.add_argument( "-e", "--execution_id", - help="Specify execution id.", + action="append", + help="Specify the execution id to retrieve execution.", metavar="", ) - - parser.add_argument( - "-l", - "--long", - action='store_true', - help="Display detailed summary of executions.", - ) parser.set_defaults(func=CmdExecutionList) \ No newline at end of file diff --git a/cmflib/commands/metadata/export.py b/cmflib/commands/metadata/export.py index fb8317a5..332febc6 100644 --- a/cmflib/commands/metadata/export.py +++ b/cmflib/commands/metadata/export.py @@ -18,60 +18,108 @@ import argparse import json import os + from cmflib import cmfquery from cmflib.cli.command import CmdBase - +from 
cmflib.dvc_wrapper import dvc_get_config # This class export local mlmd data to a json file class CmdMetadataExport(CmdBase): + def create_full_path(self, current_directory: str, json_file_name: str) -> str: + if not os.path.isdir(json_file_name): + temp = os.path.dirname(json_file_name) + current_directory = './' + if temp != "": + current_directory = temp + if os.path.exists(current_directory): + full_path_to_dump = json_file_name + return full_path_to_dump + else: + return f"{current_directory} doesn't exists." + else: + return "Provide path with file name." + def run(self): - + # Check if 'cmf' is configured. + msg = "'cmf' is not configured.\nExecute 'cmf init' command." + result = dvc_get_config() + if len(result) == 0: + return msg + current_directory = os.getcwd() full_path_to_dump = "" - mlmd_file_name = "./mlmd" - - # checks if mlmd filepath is given - if self.args.file_name: - mlmd_file_name = self.args.file_name - current_directory = os.path.dirname(self.args.file_name) - - # checks if mlmd file is present in current directory or given directory - if not os.path.exists(mlmd_file_name): - return f"ERROR: {mlmd_file_name} doesn't exists in the {current_directory}." - - - # setting directory where mlmd file will be dumped - if self.args.json_file_name: - if not os.path.isdir(self.args.json_file_name): - temp = os.path.dirname(self.args.json_file_name) - if temp != "": - current_directory = temp - if os.path.exists(current_directory): - full_path_to_dump = self.args.json_file_name - else: - return f"{current_directory} doesn't exists." - else: - return "Provide path with file name." + if not self.args.file_name: # If self.args.file_name is None or an empty list ([]). + mlmd_file_name = "./mlmd" # Default path for mlmd file name. + elif len(self.args.file_name) > 1: # If the user provided more than one file name. + return "Error: You can only provide one file name using the -f flag." 
+ elif not self.args.file_name[0]: # self.args.file_name[0] is an empty string (""). + return "Error: Missing File name" else: - full_path_to_dump = os.getcwd() + f"/{self.args.pipeline_name}.json" - - # initialising cmfquery class + mlmd_file_name = self.args.file_name[0].strip() # Removing starting and ending whitespaces. + if mlmd_file_name == "mlmd": + mlmd_file_name = "./mlmd" + + current_directory = os.path.dirname(mlmd_file_name) + if not os.path.exists(mlmd_file_name): + return f"Error: {mlmd_file_name} doesn't exists in {current_directory} directory." + + # Initialising cmfquery class. query = cmfquery.CmfQuery(mlmd_file_name) - # check if pipeline exists in mlmd - pipeline = query.get_pipeline_id(self.args.pipeline_name) + # Check if pipeline exists in mlmd . + if self.args.pipeline_name is not None and len(self.args.pipeline_name) > 1: + return "Error: You can only provide one pipeline name using the -p flag." + elif not self.args.pipeline_name[0]: # self.args.pipeline_name[0] is an empty string (""). + return "Error: Missing pipeline name" + else: + pipeline_name = self.args.pipeline_name[0] + + pipeline = query.get_pipeline_id(pipeline_name) if pipeline > 0: - # pulling data from local mlmd file - json_payload = query.dumptojson(self.args.pipeline_name,None) - - # write metadata into json file + if not self.args.json_file_name: # If self.args.json_file_name is None or an empty list ([]). + json_file_name = self.args.json_file_name + elif len(self.args.json_file_name) > 1: # If the user provided more than one json file name. + return "Error: You can provide only one json file name using the -j flag." + elif not self.args.json_file_name[0]: # self.args.json_file_name[0] is an empty string (""). + return "Error: Missing Json file name" + else: + json_file_name = self.args.json_file_name[0].strip() + + # Setting directory where mlmd file will be dumped. 
+ if json_file_name: + if not json_file_name.endswith(".json"): + json_file_name = json_file_name+".json" # Added .json extention to json file name. + if os.path.exists(json_file_name): + userRespone = input("File name already exists do you want to continue press yes/no: ") + if userRespone.lower() == "yes": # Overwrite file. + full_path_to_dump = self.create_full_path(current_directory, json_file_name) + else: + return "No changes made to the file. Operation aborted." + else: + full_path_to_dump = self.create_full_path(current_directory, json_file_name) + else: + # Checking whether a json file exists in the directory based on pipeline name. + if os.path.exists(f"{pipeline_name}.json"): + userRespone = input("File name already exists do you want to continue press yes/no: ") + if userRespone.lower() == "yes": + full_path_to_dump = os.getcwd() + f"/{pipeline_name}.json" + else: + return "No changes made to the file. Operation aborted." + else: + full_path_to_dump = os.getcwd() + f"/{pipeline_name}.json" + + # Pulling data from local mlmd file. + json_payload = query.dumptojson(pipeline_name,None) + + # Write metadata into json file. with open(full_path_to_dump, 'w') as f: f.write(json.dumps(json.loads(json_payload),indent=2)) return f"SUCCESS: metadata successfully exported in {full_path_to_dump}." else: - return f"{self.args.pipeline_name} doesn't exists in {mlmd_file_name}!!" + return f"{pipeline_name} doesn't exists in {mlmd_file_name}!!" 
+ def add_parser(subparsers, parent_parser): @@ -89,6 +137,7 @@ def add_parser(subparsers, parent_parser): required_arguments.add_argument( "-p", "--pipeline_name", + action="append", required=True, help="Specify Pipeline name.", metavar="", @@ -97,12 +146,17 @@ def add_parser(subparsers, parent_parser): parser.add_argument( "-j", "--json_file_name", - help="Specify json file name with full path.", + action="append", + help="Specify output json file name with full path.", metavar="", ) parser.add_argument( - "-f", "--file_name", help="Specify mlmd file name.", metavar="" + "-f", + "--file_name", + action="append", + help="Specify the absolute or relative path for the input MLMD file.", + metavar="", ) parser.set_defaults(func=CmdMetadataExport) diff --git a/cmflib/commands/pipeline/__init__.py b/cmflib/commands/pipeline/__init__.py index 9b1a2066..75df2f5b 100644 --- a/cmflib/commands/pipeline/__init__.py +++ b/cmflib/commands/pipeline/__init__.py @@ -23,12 +23,12 @@ # This parser adds positional argumets to the main parser def add_parser(subparsers, parent_parser): - LIST_HELP = "Command to list pipeline." + LIST_HELP = "Display a list of pipeline name(s) from the available mlmd file." list_parser = subparsers.add_parser( "pipeline", parents=[parent_parser], - description="Display list of pipelines as present in current mlmd", + description="Display a list of pipeline name(s) from the available mlmd file.", help=LIST_HELP, formatter_class=argparse.RawDescriptionHelpFormatter, ) diff --git a/cmflib/commands/pipeline/list.py b/cmflib/commands/pipeline/list.py index 206aa4a9..fe597870 100644 --- a/cmflib/commands/pipeline/list.py +++ b/cmflib/commands/pipeline/list.py @@ -19,38 +19,55 @@ from cmflib.cli.command import CmdBase from cmflib import cmfquery +from cmflib.dvc_wrapper import dvc_get_config class CmdPipelineList(CmdBase): def run(self): + # Check if 'cmf' is configured. + msg = "'cmf' is not configured.\nExecute 'cmf init' command." 
+ result = dvc_get_config() + if len(result) == 0: + return msg + current_directory = os.getcwd() - # default path for mlmd file name - mlmd_file_name = "./mlmd" - if self.args.file_name: - mlmd_file_name = self.args.file_name + if not self.args.file_name: # If self.args.file_name is None or an empty list ([]). + mlmd_file_name = "./mlmd" # Default path for mlmd file name. + elif len(self.args.file_name) > 1: # If the user provided more than one file name. + return "Error: You can only provide one file name using the -f flag." + elif not self.args.file_name[0]: # self.args.file_name[0] is an empty string (""). + return "Error: Missing File name" + else: + mlmd_file_name = self.args.file_name[0].strip() if mlmd_file_name == "mlmd": mlmd_file_name = "./mlmd" - current_directory = os.path.dirname(mlmd_file_name) + + current_directory = os.path.dirname(mlmd_file_name) if not os.path.exists(mlmd_file_name): - return f"ERROR: {mlmd_file_name} doesn't exists in {current_directory} directory." + return f"Error: {mlmd_file_name} doesn't exists in {current_directory} directory." - # Creating cmfquery object + # Creating cmfquery object. query = cmfquery.CmfQuery(mlmd_file_name) return [pipeline.name for pipeline in query._get_pipelines()] + def add_parser(subparsers, parent_parser): - PIPELINE_LIST_HELP = "Display list of pipelines as present in current mlmd" + PIPELINE_LIST_HELP = "Display a list of pipeline name(s) from the available mlmd file." 
parser = subparsers.add_parser( "list", parents=[parent_parser], - description="Display list of pipeline", + description="Display a list of pipeline name(s) from the available mlmd file.", help=PIPELINE_LIST_HELP, formatter_class=argparse.RawDescriptionHelpFormatter, ) parser.add_argument( - "-f", "--file_name", help="Specify mlmd file name.", metavar="", + "-f", + "--file_name", + action="append", + help="Specify the absolute or relative path for the input MLMD file.", + metavar="", ) parser.set_defaults(func=CmdPipelineList) diff --git a/docs/cmf_client/cmf_client.md b/docs/cmf_client/cmf_client.md index bffe8161..a5a89ea8 100644 --- a/docs/cmf_client/cmf_client.md +++ b/docs/cmf_client/cmf_client.md @@ -2,9 +2,9 @@ # cmf ``` -Usage: cmf [-h] {init, artifact, metadata, executions, pipeline} +Usage: cmf [-h] {init, artifact, metadata, execution, pipeline} ``` -The `cmf` command is a comprehensive tool designed to initialize an artifact repository and perform various operations on artifacts, executions, pipeline and metadata. +The `cmf` command is a comprehensive tool designed to initialize an artifact repository and perform various operations on artifacts, execution, pipeline and metadata. ## cmf init ``` @@ -258,11 +258,11 @@ Optional Arguments ``` ### cmf artifact list ``` -Usage: cmf artifact list [-h] -p [pipeline_name] -f [file_name] -a [artifact_name] -l +Usage: cmf artifact list [-h] -p [pipeline_name] -f [file_name] -a [artifact_name] ``` -`cmf artifact list` command display list of artifacts. +`cmf artifact list` command displays artifacts from the MLMD file with a few properties in a 7-column table, limited to 20 records per page. ``` -cmf artifact list -p 'pipeline_name' -f '/path/to/mlmd-file-name' -a 'artifact_name' -l +cmf artifact list -p 'pipeline_name' -f '/path/to/mlmd-file-name' -a 'artifact_name' ``` Required Arguments ``` @@ -271,9 +271,8 @@ Required Arguments Optional Arguments ``` -h, --help show this help message and exit. 
- -f [file_name], --file-name [file_name] Specify mlmd file name. - -a [artifact_name], --artifact_name [artifact_name] Specify artifact name. - -l, --long Specify in which format you want to saw artifacts[By default short]. + -f [file_name], --file-name [file_name] Specify the absolute or relative path for the input MLMD file. + -a [artifact_name], --artifact_name [artifact_name] Specify the artifact name to display detailed information about the given artifact name. ``` ## cmf metadata @@ -333,22 +332,22 @@ Required Arguments Optional Arguments ``` -h, --help show this help message and exit. - -f [file_name], --file_name [file_name] Specify mlmd file name. - -j [json_file_name], --json_file_name [json_file_name] Specify json file name with full path. + -f [file_name], --file_name [file_name] Specify the absolute or relative path for the input MLMD file. + -j [json_file_name], --json_file_name [json_file_name] Specify output json file name with full path. ``` -## cmf executions +## cmf execution ``` -Usage: cmf executions [-h] {list} +Usage: cmf execution [-h] {list} ``` -`cmf executions` list executions from or to the user configured repository. +`cmf execution` command displays executions from the MLMD file. ### cmf executions list ``` -Usage: cmf executions list [-h] -p [pipeline_name] -f [file_name] -e [execution_id] -l +Usage: cmf execution list [-h] -p [pipeline_name] -f [file_name] -e [execution_id] ``` -`cmf executions list` command display list of executions in current cmf configuration. +`cmf execution list` command displays executions from the MLMD file with a few properties in a 7-column table, limited to 20 records per page. ``` -cmf executions list -p 'pipeline_name' -f '/path/to/mlmd-file-name' -e 'execution_id' -l +cmf execution list -p 'pipeline_name' -f '/path/to/mlmd-file-name' -e 'execution_id' ``` Required Arguments ``` @@ -357,26 +356,25 @@ Required Arguments Optional Arguments ``` -h, --help show this help message and exit.
- -f [file_name], --file-name [file_name] Specify mlmd file name. - -e [exe_id], --execution_id [exe_id] Specify execution id. - -l, --long Specify in which format you want to saw execution[By default short]. + -f [file_name], --file-name [file_name] Specify the absolute or relative path for the input MLMD file. + -e [exe_id], --execution_id [exe_id] Specify the execution id to retrieve execution. ``` ## cmf pipeline ``` Usage: cmf pipeline [-h] {list} ``` -`cmf pipeline` command to display list of pipelines. +`cmf pipeline` command displays a list of pipeline name(s) from the available mlmd file. ### cmf pipeline list ``` Usage: cmf pipeline list [-h] -f [file_name] ``` -`cmf pipeline list` command display list of pipelines in current cmf configuration. +`cmf pipeline list` command displays a list of pipeline name(s) from the available mlmd file. ``` cmf pipeline list -f '/path/to/mlmd-file-name' ``` Optional Arguments ``` -h, --help show this help message and exit. - -f [file_name], --file-name [file_name] Specify mlmd file name. + -f [file_name], --file_name [file_name] Specify the absolute or relative path for the input MLMD file. ``` diff --git a/requirements.txt b/requirements.txt index e842e8ae..356aefb6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,5 @@ minio paramiko scikit_learn boto3 +textwrap +typing