From 5cfbe98b4c15470e20d05c96f776303214b3090f Mon Sep 17 00:00:00 2001 From: Varkha Sharma Date: Wed, 16 Oct 2024 03:23:05 -0700 Subject: [PATCH 01/15] Initial code for making python_env output an artifact --- cmflib/cmf.py | 237 +++++++++++++++++++++++++++++-- cmflib/cmf_merger.py | 7 +- cmflib/graph_wrapper.py | 32 +++++ cmflib/utils/helper_functions.py | 107 +++++++++----- 4 files changed, 334 insertions(+), 49 deletions(-) diff --git a/cmflib/cmf.py b/cmflib/cmf.py index cb653dd6..c6de4852 100644 --- a/cmflib/cmf.py +++ b/cmflib/cmf.py @@ -20,6 +20,7 @@ import re import os import sys +import yaml import pandas as pd import typing as t @@ -54,7 +55,7 @@ link_execution_to_input_artifact, ) from cmflib.utils.cmf_config import CmfConfig -from cmflib.utils.helper_functions import get_python_env, change_dir +from cmflib.utils.helper_functions import get_python_env, change_dir, get_md5_hash from cmflib.cmf_commands_wrapper import ( _metadata_push, _metadata_pull, @@ -104,15 +105,15 @@ class Cmf: # pylint: disable=too-many-instance-attributes # Reading CONFIG_FILE variable - cmf_config = os.environ.get("CONFIG_FILE", ".cmfconfig") + #cmf_config = os.environ.get("CONFIG_FILE", ".cmfconfig") ARTIFACTS_PATH = "cmf_artifacts" DATASLICE_PATH = "dataslice" METRICS_PATH = "metrics" - if os.path.exists(cmf_config): - attr_dict = CmfConfig.read_config(cmf_config) - __neo4j_uri = attr_dict.get("neo4j-uri", "") - __neo4j_password = attr_dict.get("neo4j-password", "") - __neo4j_user = attr_dict.get("neo4j-user", "") + #if os.path.exists(cmf_config): + # attr_dict = CmfConfig.read_config(cmf_config) + # __neo4j_uri = attr_dict.get("neo4j-uri", "") + # __neo4j_password = attr_dict.get("neo4j-password", "") + # __neo4j_user = attr_dict.get("neo4j-user", "") def __init__( self, @@ -419,7 +420,7 @@ def create_execution( git_repo = git_get_repo() git_start_commit = git_get_commit() cmd = str(sys.argv) if cmd is None else cmd - python_env=get_python_env() + self.execution = create_new_execution_in_existing_run_context( store=self.store, # Type field when re-using executions @@ -433,7 +434,6 @@ def create_execution( pipeline_type=self.parent_context.name, git_repo=git_repo, git_start_commit=git_start_commit, - python_env=python_env, custom_properties=custom_props, create_new_execution=create_new_execution, ) @@ -441,7 +441,8 @@ def create_execution( if uuids: self.execution.properties["Execution_uuid"].string_value = uuids+","+str(uuid.uuid1()) else: - self.execution.properties["Execution_uuid"].string_value = str(uuid.uuid1()) + self.execution.properties["Execution_uuid"].string_value = str(uuid.uuid1()) + self.store.put_executions([self.execution]) self.execution_name = str(self.execution.id) + "," + execution_type self.execution_command = cmd @@ -451,7 +452,7 @@ def create_execution( self.execution_label_props["Execution_Name"] = ( execution_type + ":" + str(self.execution.id) ) - + self.execution_label_props["execution_command"] = cmd if self.graph: self.driver.create_execution_node( @@ -462,6 +463,35 @@ def create_execution( self.execution.id, custom_props, ) + + directory_path = self.ARTIFACTS_PATH + os.makedirs(directory_path, exist_ok=True) + packages = get_python_env() + if isinstance(packages, list): + output = f"{packages}\n" + md5_hash = get_md5_hash(output) + print(md5_hash) + python_env_file_path = os.path.join(directory_path, f"{md5_hash}_python_env.txt") + # create file if it doesn't exists + if not os.path.exists(python_env_file_path): + print(f"{python_env_file_path} doesn't exists!!") + with 
open(python_env_file_path, 'w') as file: + for package in packages: + file.write(f"{package}\n") + + else: + # in case output is dict + env_output = yaml.dump(packages, sort_keys=False) + md5_hash = get_md5_hash(env_output) + python_env_file_path = os.path.join(directory_path, f"{md5_hash}_python_env.yaml") + # create file if it doesn't exists + if not os.path.exists(python_env_file_path): + print(f"{python_env_file_path} doesn't exists!!") + with open(python_env_file_path, 'w') as file: + file.write(env_output) + + # link the artifact to execution if it exists and creates artifact if it doesn't + self.log_python_env(python_env_file_path) os.chdir(logging_dir) return self.execution @@ -602,7 +632,6 @@ def merge_created_execution( # print(custom_props) git_repo = properties.get("Git_Repo", "") git_start_commit = properties.get("Git_Start_Commit", "") - python_env = properties.get("Python_Env", "") #name = properties.get("Name", "") create_new_execution = True execution_name = execution_type @@ -623,7 +652,6 @@ def merge_created_execution( pipeline_type=self.parent_context.name, git_repo=git_repo, git_start_commit=git_start_commit, - python_env=python_env, custom_properties=custom_props, create_new_execution=create_new_execution ) @@ -658,13 +686,195 @@ def merge_created_execution( self.execution.id, custom_props, ) + + # link the artifact to execution if it exists and creates artifact if it doesn't return self.execution + # what is the reason behind creating this function def log_dvc_lock(self, file_path: str): """Used to update the dvc lock file created with dvc run command.""" print("Entered dvc lock file commit") return commit_dvc_lock_file(file_path, self.execution.id) + def log_python_env( + self, + url: str, + ) -> mlpb.Artifact: + "Used to log the python packages involved in the current execution" + + git_repo = git_get_repo() + name = re.split("/", url)[-1] + existing_artifact = [] + + commit_output(url, self.execution.id) + c_hash = dvc_get_hash(url) + + if c_hash == "": + print("Error in getting the dvc hash,return without logging") + return + + dataset_commit = c_hash + dvc_url = dvc_get_url(url) + dvc_url_with_pipeline = f"{self.parent_context.name}:{dvc_url}" + url = url + ":" + c_hash + if c_hash and c_hash.strip: + existing_artifact.extend(self.store.get_artifacts_by_uri(c_hash)) + + if existing_artifact and len(existing_artifact) != 0: + existing_artifact = existing_artifact[0] + uri = c_hash + print("i am here") + artifact = link_execution_to_artifact( + store=self.store, + execution_id=self.execution.id, + uri=uri, + input_name=url, + event_type=mlpb.Event.Type.OUTPUT, + ) + else: + uri = c_hash if c_hash and c_hash.strip() else str(uuid.uuid1()) + artifact = create_new_artifact_event_and_attribution( + store=self.store, + execution_id=self.execution.id, + context_id=self.child_context.id, + uri=uri, + name=url, + type_name="Environment", + event_type=mlpb.Event.Type.OUTPUT, + properties={ + "git_repo": str(git_repo), + # passing c_hash value to commit + "Commit": str(dataset_commit), + "url": str(dvc_url_with_pipeline), + }, + artifact_type_properties={ + "git_repo": mlpb.STRING, + "Commit": mlpb.STRING, + "url": mlpb.STRING, + }, + milliseconds_since_epoch=int(time.time() * 1000), + ) + self.execution_label_props["git_repo"] = git_repo + self.execution_label_props["Commit"] = dataset_commit + + + if self.graph: + self.driver.create_env_node( + name, + url, + uri, + "output", + self.execution.id, + self.parent_context, + ) + + child_artifact = { + "Name": name, + "Path": 
url, + "URI": uri, + "Event": "output", + "Execution_Name": self.execution_name, + "Type": "Environment", + "Execution_Command": self.execution_command, + "Pipeline_Id": self.parent_context.id, + "Pipeline_Name": self.parent_context.name, + } + self.driver.create_artifact_relationships( + self.input_artifacts, child_artifact, self.execution_label_props + ) + + return artifact + + def log_python_env_on_server( + self, + url: str, + uri: str, + ) -> mlpb.Artifact: + "Used to log the python packages involved in the current execution" + + git_repo = git_get_repo() + name = re.split("/", url)[-1] + existing_artifact = [] + + commit_output(url, self.execution.id) + c_hash = dvc_get_hash(url) + + if c_hash == "": + print("Error in getting the dvc hash,return without logging") + return + + dataset_commit = c_hash + dvc_url = dvc_get_url(url) + dvc_url_with_pipeline = f"{self.parent_context.name}:{dvc_url}" + url = url + ":" + c_hash + if c_hash and c_hash.strip: + existing_artifact.extend(self.store.get_artifacts_by_uri(c_hash)) + + if existing_artifact and len(existing_artifact) != 0: + existing_artifact = existing_artifact[0] + uri = c_hash + print("i am here") + artifact = link_execution_to_artifact( + store=self.store, + execution_id=self.execution.id, + uri=uri, + input_name=url, + event_type=mlpb.Event.Type.OUTPUT, + ) + else: + uri = c_hash if c_hash and c_hash.strip() else str(uuid.uuid1()) + artifact = create_new_artifact_event_and_attribution( + store=self.store, + execution_id=self.execution.id, + context_id=self.child_context.id, + uri=uri, + name=url, + type_name="Environment", + event_type=mlpb.Event.Type.OUTPUT, + properties={ + "git_repo": str(git_repo), + # passing c_hash value to commit + "Commit": str(dataset_commit), + "url": str(dvc_url_with_pipeline), + }, + artifact_type_properties={ + "git_repo": mlpb.STRING, + "Commit": mlpb.STRING, + "url": mlpb.STRING, + }, + milliseconds_since_epoch=int(time.time() * 1000), + ) + self.execution_label_props["git_repo"] = git_repo + self.execution_label_props["Commit"] = dataset_commit + + + if self.graph: + self.driver.create_env_node( + name, + url, + uri, + "output", + self.execution.id, + self.parent_context, + ) + + child_artifact = { + "Name": name, + "Path": url, + "URI": uri, + "Event": "output", + "Execution_Name": self.execution_name, + "Type": "Environment", + "Execution_Command": self.execution_command, + "Pipeline_Id": self.parent_context.id, + "Pipeline_Name": self.parent_context.name, + } + self.driver.create_artifact_relationships( + self.input_artifacts, child_artifact, self.execution_label_props + ) + + return artifact + def log_dataset( self, url: str, @@ -1996,6 +2206,7 @@ def commit_existing(self, uri: str, props: t.Optional[t.Dict] = None, custom_pro # print(last) # os.symlink(str(index), slicedir + "/ " + last) + def metadata_push(pipeline_name: str, filepath = "./mlmd", tensorboard_path: str = "", execution_id: str = ""): """ Pushes MLMD file to CMF-server. 
Example: diff --git a/cmflib/cmf_merger.py b/cmflib/cmf_merger.py index 82c6082d..757b72c9 100644 --- a/cmflib/cmf_merger.py +++ b/cmflib/cmf_merger.py @@ -138,6 +138,8 @@ def parse_json_to_mlmd(mlmd_json, path_to_store: str, cmd: str, exec_id: Union[s dataslice.commit_existing(uri, custom_props) elif artifact_type == "Step_Metrics": cmf_class.commit_existing_metrics(event["artifact"]["name"], uri, custom_props) + elif artifact_type == "Environment": + cmf_class.log_python_env_on_server(artifact_name, uri) else: pass except AlreadyExistsError as e: @@ -194,7 +196,4 @@ def create_original_time_since_epoch(mlmd_data): artifact.append(k["artifact"]["create_time_since_epoch"]) # print(k['artifact']['custom_properties']['original_create_time_since_epoch']) - return mlmd_data - - - + return mlmd_data \ No newline at end of file diff --git a/cmflib/graph_wrapper.py b/cmflib/graph_wrapper.py index 29b5ab41..08b7a1f9 100644 --- a/cmflib/graph_wrapper.py +++ b/cmflib/graph_wrapper.py @@ -88,6 +88,20 @@ def create_dataset_node(self, name: str, path: str, uri: str, event: str, execut "Execution", "Dataset", self.execution_id, node_id, event) _ = session.write_transaction(self._run_transaction, pc_syntax) + def create_env_node(self, name: str, path: str, uri: str, event: str, execution_id: int, + pipeline_context: mlpb.Context): + pipeline_id = pipeline_context.id + pipeline_name = pipeline_context.name + dataset_syntax = self._create_env_syntax( + name, path, uri, pipeline_id, pipeline_name) + with self.driver.session() as session: + node = session.write_transaction( + self._run_transaction, dataset_syntax) + node_id = node[0]["node_id"] + pc_syntax = self._create_execution_artifacts_link_syntax( + "Execution", "Environment", self.execution_id, node_id, event) + _ = session.write_transaction(self._run_transaction, pc_syntax) + def create_dataslice_node(self, name: str, path: str, uri: str, parent_name:str, custom_properties=None): if custom_properties is None: @@ -285,6 +299,24 @@ def _create_dataset_syntax(name: str, path: str, uri: str, pipeline_id: int, pip syntax_str = syntax_str + " RETURN ID(a) as node_id" return syntax_str + @staticmethod + def _create_env_syntax(name: str, path: str, uri: str, pipeline_id: int, pipeline_name: str): + custom_properties = {} + custom_properties["Name"] = name + custom_properties["Path"] = path + custom_properties["pipeline_id"] = str(pipeline_id) + custom_properties["pipeline_name"] = pipeline_name + syntax_str = "MERGE (a:Environment {uri:\"" + uri + "\"}) SET " + # props_str = "" + for k, v in custom_properties.items(): + k = re.sub('\W+', '', k) + props_str = "a." + k + \ + " = coalesce([x in a." 
+ k + " where x <>\"" + str(v) + "\"], []) + \"" + str(v) + "\"," + syntax_str = syntax_str + props_str + syntax_str = syntax_str.rstrip(",") + syntax_str = syntax_str + " RETURN ID(a) as node_id" + return syntax_str + @staticmethod def _create_dataslice_syntax(name: str, path: str, uri: str, custom_properties): diff --git a/cmflib/utils/helper_functions.py b/cmflib/utils/helper_functions.py index 9dc12ea3..ce38bc03 100644 --- a/cmflib/utils/helper_functions.py +++ b/cmflib/utils/helper_functions.py @@ -18,6 +18,7 @@ import sys import subprocess import json +import yaml def is_url(url)-> bool: from urllib.parse import urlparse @@ -36,56 +37,98 @@ def is_git_repo(): else: return +def get_python_env(env_name='cmf'): + # what this is supposed to return + try: + # Check if the environment is conda + if is_conda_installed(): # If conda is installed and the command succeeds -def get_python_env()-> str: - installed_packages = "" - python_version = sys.version - packages = "" - # check if conda is installed - if is_conda_installed(): - import conda - # List all installed packages and their versions - data = list_conda_packages_json() - transformed_result = [f"{entry['name']}=={entry['version']}" for entry in data] - installed_packages = transformed_result - packages = f"Conda: Python {python_version}: {installed_packages}" - else: - # pip - try: - from pip._internal.operations import freeze - - # List all installed packages and their versions - installed_packages_generator = freeze.freeze() - installed_packages = list(installed_packages_generator) - packages = f"Python {python_version}: {installed_packages}" - except ImportError: - print("Pip is not installed.") - return packages + # Step 1: Get the list of conda packages + conda_packages = subprocess.check_output(['conda', 'list', '--export']).decode('utf-8').splitlines() + + # Step 2: Get the list of pip packages + pip_packages = subprocess.check_output(['pip', 'freeze']).decode('utf-8').splitlines() + + # Step 3: Get the list of channels from the current conda environment + channels_raw = subprocess.check_output(['conda', 'config', '--show', 'channels']).decode('utf-8').splitlines() + + # Filter out lines that start with 'channels:' and any empty or commented lines + channels = [line.strip().lstrip('- ').strip() for line in channels_raw if line and not line.startswith('channels:') and not line.startswith('#')] + + # Step 4: Create a YAML structure for the environment + env_data = { + 'name': env_name, # Name the environment -- don't provide the name + 'channels': channels, # Add the cleaned channels list + 'dependencies': [], + } + + # Add conda packages to dependencies + for package in conda_packages: + if not package.startswith('#') and len(package.strip()) > 0: + env_data['dependencies'].append(package) + + # Add pip packages under a pip section in dependencies + if pip_packages: + pip_section = {'pip': pip_packages} + env_data['dependencies'].append(pip_section) + + return env_data + + else: + # If not conda, assume virtualenv/pip + print("Detected virtualenv/pip environment. 
Exporting requirements.txt...") + # Step 1: Get the list of pip packages + pip_packages = subprocess.check_output(['pip', 'freeze']).decode('utf-8').splitlines() + + return pip_packages + + except Exception as e: + print(f"An error occurred: {e}") + + return + +def get_md5_hash(output): + import hashlib + + # Convert the string to bytes (utf-8 encoding) + byte_content = output.encode('utf-8') + + # Create an MD5 hash object + md5_hash = hashlib.md5() + + # Update the hash with the byte content + md5_hash.update(byte_content) + + # Return the hexadecimal digest + hash_for_op = md5_hash.hexdigest() + + return hash_for_op + def change_dir(cmf_init_path): logging_dir = os.getcwd() if not logging_dir == cmf_init_path: os.chdir(cmf_init_path) return logging_dir -def is_conda_installed(): + +def is_conda_installed() -> bool: + """Check if Conda is installed by running 'conda --version'.""" try: - import conda # Run the 'conda --version' command and capture the output subprocess.run(['conda', '--version'], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) return True - except subprocess.CalledProcessError: - return False - except ImportError: + except (subprocess.CalledProcessError, FileNotFoundError): return False -def list_conda_packages_json(): +def list_conda_packages_json() -> list: + """Return a list of installed Conda packages and their versions.""" try: result = subprocess.run(['conda', 'list', '--json'], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) return json.loads(result.stdout) - except subprocess.CalledProcessError as e: - return f"Error: {e.stderr}" + except (subprocess.CalledProcessError, json.JSONDecodeError): + return [] # Generate SciToken dynamically From ecbe106dc633faf93859e7e876dacf96b784e16d Mon Sep 17 00:00:00 2001 From: AyeshaSanadi Date: Thu, 17 Oct 2024 18:50:49 +0530 Subject: [PATCH 02/15] Resolved external artifact path conflict occured during artifact pull cmd --- cmflib/commands/artifact/pull.py | 8 +++++--- cmflib/commands/artifact/push.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/cmflib/commands/artifact/pull.py b/cmflib/commands/artifact/pull.py index 0cbdded0..7b38f701 100644 --- a/cmflib/commands/artifact/pull.py +++ b/cmflib/commands/artifact/pull.py @@ -99,7 +99,11 @@ def extract_repo_args(self, type: str, name: str, url: str, current_directory: s elif type == "local": token_length = len(token) - download_loc = current_directory + "/" + name + if name.startswith("cmf_artifacts") or name.startswith("artifacts"): + download_loc = current_directory + "/" + name + else: + # Incase of external artifact + download_loc = current_directory + "/" + name.split("/")[-1] # local artifact repo path = local-storage/files/md5/23/69v2uu3jeejjeiw. 
@@ -255,7 +259,6 @@ def run(self):
                 print(stmt)
             else:
                 for name, url in name_url_dict.items():
-                    #print(name, url)
                     if not isinstance(url, str):
                         continue
                     local_args = self.extract_repo_args("local", name, url, current_directory)
@@ -286,7 +289,6 @@ def run(self):
                 print(stmt)
             else:
                 for name, url in name_url_dict.items():
-                    #print(name, url)
                     if not isinstance(url, str):
                         continue
                     args = self.extract_repo_args("ssh", name, url, current_directory)
diff --git a/cmflib/commands/artifact/push.py b/cmflib/commands/artifact/push.py
index 6efd5afb..5f1bdcb3 100644
--- a/cmflib/commands/artifact/push.py
+++ b/cmflib/commands/artifact/push.py
@@ -94,7 +94,7 @@ def run(self):
         if not artifacts.empty:
             artifacts = artifacts[artifacts['type'] != 'Metrics']
             # adding .dvc at the end of every file as it is needed for pull
-            artifacts['name'] = artifacts['name'].apply(lambda name: f"{name.split(':')[0]}.dvc")
+            artifacts['name'] = artifacts['name'].apply(lambda name: name.split(':')[0])
             names.extend(artifacts['name'].tolist())
             file_set = set(names)
             result = dvc_push(list(file_set))

From 1bfe81b244de0769dc1e51fafaccc4fe64a9d812 Mon Sep 17 00:00:00 2001
From: Varkha Sharma
Date: Wed, 6 Nov 2024 16:29:52 -0800
Subject: [PATCH 03/15] removing some unrelated changes which have been done in other PRs

---
 cmflib/commands/artifact/pull.py | 8 +++-----
 cmflib/commands/artifact/push.py | 2 +-
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/cmflib/commands/artifact/pull.py b/cmflib/commands/artifact/pull.py
index 7b38f701..0cbdded0 100644
--- a/cmflib/commands/artifact/pull.py
+++ b/cmflib/commands/artifact/pull.py
@@ -99,11 +99,7 @@ def extract_repo_args(self, type: str, name: str, url: str, current_directory: s
         elif type == "local":
             token_length = len(token)
-            if name.startswith("cmf_artifacts") or name.startswith("artifacts"):
-                download_loc = current_directory + "/" + name
-            else:
-                # Incase of external artifact
-                download_loc = current_directory + "/" + name.split("/")[-1]
+            download_loc = current_directory + "/" + name
             # local artifact repo path = local-storage/files/md5/23/69v2uu3jeejjeiw.
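For context on the push.py half of these two patches: artifact names recorded in MLMD carry a colon-separated hash suffix, and the hunk's own comment notes that the .dvc metafile name is what the pull side needs, which is why PATCH 03 restores the suffix that PATCH 02 dropped. A small illustration with a made-up artifact name, showing only what the quoted lambdas do:

name = "cmf_artifacts/model/model.pkl:4b6e"    # hypothetical "<path>:<hash>" entry
plain_path = name.split(':')[0]                # value pushed after PATCH 02
dvc_metafile = f"{name.split(':')[0]}.dvc"     # value PATCH 03 restores
# plain_path   == "cmf_artifacts/model/model.pkl"
# dvc_metafile == "cmf_artifacts/model/model.pkl.dvc"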
@@ -259,6 +255,7 @@ def run(self): print(stmt) else: for name, url in name_url_dict.items(): + #print(name, url) if not isinstance(url, str): continue local_args = self.extract_repo_args("local", name, url, current_directory) @@ -289,6 +286,7 @@ def run(self): print(stmt) else: for name, url in name_url_dict.items(): + #print(name, url) if not isinstance(url, str): continue args = self.extract_repo_args("ssh", name, url, current_directory) diff --git a/cmflib/commands/artifact/push.py b/cmflib/commands/artifact/push.py index 5f1bdcb3..6efd5afb 100644 --- a/cmflib/commands/artifact/push.py +++ b/cmflib/commands/artifact/push.py @@ -94,7 +94,7 @@ def run(self): if not artifacts.empty: artifacts = artifacts[artifacts['type'] != 'Metrics'] # adding .dvc at the end of every file as it is needed for pull - artifacts['name'] = artifacts['name'].apply(lambda name: name.split(':')[0]) + artifacts['name'] = artifacts['name'].apply(lambda name: f"{name.split(':')[0]}.dvc") names.extend(artifacts['name'].tolist()) file_set = set(names) result = dvc_push(list(file_set)) From a050883d006c979a6fdcac4336f27ba23562621f Mon Sep 17 00:00:00 2001 From: Varkha Sharma Date: Thu, 7 Nov 2024 05:41:01 -0800 Subject: [PATCH 04/15] fixing neo4j code --- cmflib/cmf.py | 99 +++++++++++++++++--------------- cmflib/cmf_merger.py | 2 +- cmflib/graph_wrapper.py | 45 +++++++++++++-- cmflib/utils/helper_functions.py | 2 - docker-compose-server.yml | 8 +-- 5 files changed, 100 insertions(+), 56 deletions(-) diff --git a/cmflib/cmf.py b/cmflib/cmf.py index c6de4852..93407b78 100644 --- a/cmflib/cmf.py +++ b/cmflib/cmf.py @@ -172,6 +172,7 @@ def __init__( os.chdir(logging_dir) @staticmethod + # function used to load neo4j params for cmf client def __load_neo4j_params(): cmf_config = os.environ.get("CONFIG_FILE", ".cmfconfig") if os.path.exists(cmf_config): @@ -182,6 +183,7 @@ def __load_neo4j_params(): @staticmethod + # function used to load neo4j params for cmf-server def __get_neo4j_server_config(): Cmf.__neo4j_uri = os.getenv('NEO4J_URI', "") Cmf.__neo4j_user = os.getenv('NEO4J_USER_NAME', "") @@ -470,11 +472,10 @@ def create_execution( if isinstance(packages, list): output = f"{packages}\n" md5_hash = get_md5_hash(output) - print(md5_hash) - python_env_file_path = os.path.join(directory_path, f"{md5_hash}_python_env.txt") + python_env_file_path = os.path.join(directory_path, f"python_env_{md5_hash}.txt") # create file if it doesn't exists if not os.path.exists(python_env_file_path): - print(f"{python_env_file_path} doesn't exists!!") + #print(f"{python_env_file_path} doesn't exists!!") with open(python_env_file_path, 'w') as file: for package in packages: file.write(f"{package}\n") @@ -483,10 +484,10 @@ def create_execution( # in case output is dict env_output = yaml.dump(packages, sort_keys=False) md5_hash = get_md5_hash(env_output) - python_env_file_path = os.path.join(directory_path, f"{md5_hash}_python_env.yaml") + python_env_file_path = os.path.join(directory_path, f"python_env_{md5_hash}.yaml") # create file if it doesn't exists if not os.path.exists(python_env_file_path): - print(f"{python_env_file_path} doesn't exists!!") + #print(f"{python_env_file_path} doesn't exists!!") with open(python_env_file_path, 'w') as file: file.write(env_output) @@ -690,7 +691,6 @@ def merge_created_execution( # link the artifact to execution if it exists and creates artifact if it doesn't return self.execution - # what is the reason behind creating this function def log_dvc_lock(self, file_path: str): """Used to update the dvc lock 
file created with dvc run command.""" print("Entered dvc lock file commit") @@ -713,7 +713,7 @@ def log_python_env( print("Error in getting the dvc hash,return without logging") return - dataset_commit = c_hash + commit = c_hash dvc_url = dvc_get_url(url) dvc_url_with_pipeline = f"{self.parent_context.name}:{dvc_url}" url = url + ":" + c_hash @@ -723,7 +723,6 @@ def log_python_env( if existing_artifact and len(existing_artifact) != 0: existing_artifact = existing_artifact[0] uri = c_hash - print("i am here") artifact = link_execution_to_artifact( store=self.store, execution_id=self.execution.id, @@ -744,7 +743,7 @@ def log_python_env( properties={ "git_repo": str(git_repo), # passing c_hash value to commit - "Commit": str(dataset_commit), + "Commit": str(commit), "url": str(dvc_url_with_pipeline), }, artifact_type_properties={ @@ -754,8 +753,11 @@ def log_python_env( }, milliseconds_since_epoch=int(time.time() * 1000), ) + custom_props = {} + custom_props["git_repo"] = git_repo + custom_props["Commit"] = commit self.execution_label_props["git_repo"] = git_repo - self.execution_label_props["Commit"] = dataset_commit + self.execution_label_props["Commit"] = commit if self.graph: @@ -766,6 +768,7 @@ def log_python_env( "output", self.execution.id, self.parent_context, + custom_props, ) child_artifact = { @@ -789,31 +792,21 @@ def log_python_env_on_server( self, url: str, uri: str, + props: t.Optional[t.Dict] = None, ) -> mlpb.Artifact: "Used to log the python packages involved in the current execution" - git_repo = git_get_repo() - name = re.split("/", url)[-1] + git_repo = props.get("git_repo", "") + name = url existing_artifact = [] - - commit_output(url, self.execution.id) - c_hash = dvc_get_hash(url) - - if c_hash == "": - print("Error in getting the dvc hash,return without logging") - return - - dataset_commit = c_hash - dvc_url = dvc_get_url(url) - dvc_url_with_pipeline = f"{self.parent_context.name}:{dvc_url}" + c_hash = uri + commit = props.get("Commit", "") url = url + ":" + c_hash if c_hash and c_hash.strip: existing_artifact.extend(self.store.get_artifacts_by_uri(c_hash)) if existing_artifact and len(existing_artifact) != 0: existing_artifact = existing_artifact[0] - uri = c_hash - print("i am here") artifact = link_execution_to_artifact( store=self.store, execution_id=self.execution.id, @@ -834,8 +827,8 @@ def log_python_env_on_server( properties={ "git_repo": str(git_repo), # passing c_hash value to commit - "Commit": str(dataset_commit), - "url": str(dvc_url_with_pipeline), + "Commit": str(commit), + "url": props.get("url", ""), }, artifact_type_properties={ "git_repo": mlpb.STRING, @@ -844,10 +837,12 @@ def log_python_env_on_server( }, milliseconds_since_epoch=int(time.time() * 1000), ) + custom_props = {} + custom_props["git_repo"] = git_repo + custom_props["Commit"] = commit self.execution_label_props["git_repo"] = git_repo - self.execution_label_props["Commit"] = dataset_commit + self.execution_label_props["Commit"] = commit - if self.graph: self.driver.create_env_node( name, @@ -856,6 +851,7 @@ def log_python_env_on_server( "output", self.execution.id, self.parent_context, + custom_props, ) child_artifact = { @@ -1348,7 +1344,7 @@ def log_model( custom_properties=custom_props, milliseconds_since_epoch=int(time.time() * 1000), ) - # custom_properties["Commit"] = model_commit + custom_properties["Commit"] = model_commit self.execution_label_props["Commit"] = model_commit #To DO model nodes should be similar to dataset nodes when we create neo4j if self.graph: @@ -1493,8 
+1489,8 @@ def log_model_with_version( custom_properties=custom_props, milliseconds_since_epoch=int(time.time() * 1000), ) - # custom_properties["Commit"] = model_commit - # custom_props["url"] = url + custom_properties["Commit"] = props.get("Commit", "") + custom_props["url"] = url self.execution_label_props["Commit"] = props.get("Commit", "") if self.graph: self.driver.create_model_node( @@ -1571,6 +1567,8 @@ def log_execution_metrics_from_client(self, metrics_name: str, existing_artifacts = self.store.get_artifacts_by_uri(uri) existing_artifact = existing_artifacts[0] if existing_artifacts else None + # Didn't understand this, + # and in case of step_metrics should we follow this logic or dataset's logic or does it even matter if not existing_artifact or \ ((existing_artifact) and not (existing_artifact.name == new_metrics_name)): #we need to add the artifact otherwise its already there @@ -1587,6 +1585,7 @@ def log_execution_metrics_from_client(self, metrics_name: str, custom_properties=custom_props, milliseconds_since_epoch=int(time.time() * 1000), ) + if self.graph: # To do create execution_links self.driver.create_metrics_node( @@ -1797,8 +1796,12 @@ def commit_metrics(self, metrics_name: str): custom_properties=custom_props, milliseconds_since_epoch=int(time.time() * 1000), ) + + custom_props["Commit"] = metrics_commit + self.execution_label_props["Commit"] = metrics_commit + if self.graph: - self.driver.create_metrics_node( + self.driver.create_step_metrics_node( name, uri, "output", @@ -1811,7 +1814,7 @@ def commit_metrics(self, metrics_name: str): "URI": uri, "Event": "output", "Execution_Name": self.execution_name, - "Type": "Metrics", + "Type": "Step_Metrics", "Execution_Command": self.execution_command, "Pipeline_Id": self.parent_context.id, } @@ -1872,8 +1875,13 @@ def commit_existing_metrics(self, metrics_name: str, uri: str, props: t.Optional custom_properties=custom_props, milliseconds_since_epoch=int(time.time() * 1000), ) + + metrics_commit = props.get("Commit", "") + custom_props["Commit"] = metrics_commit + self.execution_label_props["Commit"] = metrics_commit + if self.graph: - self.driver.create_metrics_node( + self.driver.create_step_metrics_node( metrics_name, uri, "output", @@ -1886,7 +1894,7 @@ def commit_existing_metrics(self, metrics_name: str, uri: str, props: t.Optional "URI": uri, "Event": "output", "Execution_Name": self.execution_name, - "Type": "Metrics", + "Type": "Step_Metrics", "Execution_Command": self.execution_command, "Pipeline_Id": self.parent_context.id, } @@ -2112,12 +2120,6 @@ def commit(self, custom_properties: t.Optional[t.Dict] = None) -> None: input_name=dataslice_path + ":" + c_hash, ) else: - props={ - "git_repo": str(git_repo), - # passing c_hash value to commit - "Commit": str(dataslice_commit), - "url": str(dvc_url_with_pipeline), - }, slice = create_new_artifact_event_and_attribution( store=self.writer.store, execution_id=self.writer.execution.id, @@ -2140,9 +2142,14 @@ def commit(self, custom_properties: t.Optional[t.Dict] = None) -> None: custom_properties=custom_props, milliseconds_since_epoch=int(time.time() * 1000), ) + + custom_props["git_repo"] = git_repo + custom_props["Commit"] = dataslice_commit + self.writer.execution_label_props["git_repo"] = git_repo + self.writer.execution_label_props["Commit"] = dataslice_commit if self.writer.graph: self.writer.driver.create_dataslice_node( - self.name, dataslice_path + ":" + c_hash, c_hash, self.data_parent, props + self.name, dataslice_path + ":" + c_hash, c_hash, self.data_parent, 
custom_props ) os.chdir(logging_dir) return slice @@ -2184,12 +2191,14 @@ def commit_existing(self, uri: str, props: t.Optional[t.Dict] = None, custom_pro "Commit": mlpb.STRING, "url": mlpb.STRING, }, - custom_properties=custom_properties, + custom_properties=custom_props, milliseconds_since_epoch=int(time.time() * 1000), ) + custom_props["git_repo"] = props.get("git_repo", "") + custom_props["Commit"] = props.get("Commit", "") if self.writer.graph: self.writer.driver.create_dataslice_node( - self.name, self.name, c_hash, self.data_parent, custom_properties + self.name, self.name, c_hash, self.data_parent, custom_props ) return slice diff --git a/cmflib/cmf_merger.py b/cmflib/cmf_merger.py index 757b72c9..bb5b6d85 100644 --- a/cmflib/cmf_merger.py +++ b/cmflib/cmf_merger.py @@ -139,7 +139,7 @@ def parse_json_to_mlmd(mlmd_json, path_to_store: str, cmd: str, exec_id: Union[s elif artifact_type == "Step_Metrics": cmf_class.commit_existing_metrics(event["artifact"]["name"], uri, custom_props) elif artifact_type == "Environment": - cmf_class.log_python_env_on_server(artifact_name, uri) + cmf_class.log_python_env_on_server(artifact_name, uri, props) else: pass except AlreadyExistsError as e: diff --git a/cmflib/graph_wrapper.py b/cmflib/graph_wrapper.py index 08b7a1f9..de5e585c 100644 --- a/cmflib/graph_wrapper.py +++ b/cmflib/graph_wrapper.py @@ -76,6 +76,7 @@ def create_dataset_node(self, name: str, path: str, uri: str, event: str, execut custom_properties=None): if custom_properties is None: custom_properties = {} + print("custom_properties = ", custom_properties) pipeline_id = pipeline_context.id pipeline_name = pipeline_context.name dataset_syntax = self._create_dataset_syntax( @@ -89,11 +90,14 @@ def create_dataset_node(self, name: str, path: str, uri: str, event: str, execut _ = session.write_transaction(self._run_transaction, pc_syntax) def create_env_node(self, name: str, path: str, uri: str, event: str, execution_id: int, - pipeline_context: mlpb.Context): + pipeline_context: mlpb.Context, custom_properties=None): + if custom_properties is None: + custom_properties = {} + print("custom_properties = ", custom_properties) pipeline_id = pipeline_context.id pipeline_name = pipeline_context.name dataset_syntax = self._create_env_syntax( - name, path, uri, pipeline_id, pipeline_name) + name, path, uri, pipeline_id, pipeline_name, custom_properties) with self.driver.session() as session: node = session.write_transaction( self._run_transaction, dataset_syntax) @@ -158,6 +162,22 @@ def create_metrics_node(self, name: str, uri: str, event: str, execution_id: int "Execution", "Metrics", self.execution_id, node_id, event) _ = session.write_transaction(self._run_transaction, pc_syntax) + def create_step_metrics_node(self, name: str, uri: str, event: str, execution_id: int, pipeline_context: mlpb.Context, + custom_properties=None): + if custom_properties is None: + custom_properties = {} + pipeline_id = pipeline_context.id + pipeline_name = pipeline_context.name + metrics_syntax = self._create_step_metrics_syntax( + name, uri, event, execution_id, pipeline_id, pipeline_name, custom_properties) + with self.driver.session() as session: + node = session.write_transaction( + self._run_transaction, metrics_syntax) + node_id = node[0]["node_id"] + pc_syntax = self._create_execution_artifacts_link_syntax( + "Execution", "Step_Metrics", self.execution_id, node_id, event) + _ = session.write_transaction(self._run_transaction, pc_syntax) + def create_artifact_relationships( self, parent_artifacts, @@ -300,8 
+320,8 @@ def _create_dataset_syntax(name: str, path: str, uri: str, pipeline_id: int, pip return syntax_str @staticmethod - def _create_env_syntax(name: str, path: str, uri: str, pipeline_id: int, pipeline_name: str): - custom_properties = {} + def _create_env_syntax(name: str, path: str, uri: str, pipeline_id: int, pipeline_name: str, + custom_properties): custom_properties["Name"] = name custom_properties["Path"] = path custom_properties["pipeline_id"] = str(pipeline_id) @@ -365,6 +385,23 @@ def _create_metrics_syntax(name: str, uri: str, event: str, execution_id: int, p syntax_str = syntax_str + "})" syntax_str = syntax_str + " RETURN ID(a) as node_id" return syntax_str + + @staticmethod + def _create_step_metrics_syntax(name: str, uri: str, event: str, execution_id: int, pipeline_id: int, + pipeline_name: str, custom_properties): + custom_properties["Name"] = name + custom_properties["uri"] = uri + # custom_properties["execution_id"] = str(execution_id) + custom_properties["pipeline_id"] = str(pipeline_id) + custom_properties["pipeline_name"] = pipeline_name + syntax_str = "MERGE (a:Step_Metrics {" # + str(props) + ")" + for k, v in custom_properties.items(): + k = re.sub('\W+', '', k) + syntax_str = syntax_str + k + ":" + "\"" + str(v) + "\"" + "," + syntax_str = syntax_str.rstrip(syntax_str[-1]) + syntax_str = syntax_str + "})" + syntax_str = syntax_str + " RETURN ID(a) as node_id" + return syntax_str @staticmethod def _create_stage_syntax(name: str, props: t.Dict, uri: int, pipeline_id: int, pipeline_name: str) -> str: diff --git a/cmflib/utils/helper_functions.py b/cmflib/utils/helper_functions.py index ce38bc03..3371bb15 100644 --- a/cmflib/utils/helper_functions.py +++ b/cmflib/utils/helper_functions.py @@ -76,8 +76,6 @@ def get_python_env(env_name='cmf'): else: # If not conda, assume virtualenv/pip - print("Detected virtualenv/pip environment. 
Exporting requirements.txt...") - # Step 1: Get the list of pip packages pip_packages = subprocess.check_output(['pip', 'freeze']).decode('utf-8').splitlines() diff --git a/docker-compose-server.yml b/docker-compose-server.yml index f5a267af..8f19de7f 100644 --- a/docker-compose-server.yml +++ b/docker-compose-server.yml @@ -22,15 +22,15 @@ services: - "6006:6006" volumes: # directory path should be updated as per user's environment - - /home/xxxx/cmf-server/data/tensorboard-logs:/logs + - /home/sharvark/cmf-server/data/tensorboard-logs:/logs container_name: tensorboard server: image: server:latest # both the directory paths should be updated as per user's environment volumes: - - /home/xxxx/cmf-server/data:/cmf-server/data - - /home/xxxx/cmf-server/data/static:/cmf-server/data/static - - /home/xxxx/cmf-server/data/tensorboard-logs:/cmf-server/data/tensorboard-logs + - /home/sharvark/cmf-server/data:/cmf-server/data + - /home/sharvark/cmf-server/data/static:/cmf-server/data/static + - /home/sharvark/cmf-server/data/tensorboard-logs:/cmf-server/data/tensorboard-logs container_name: cmf-server build: context: ./ From d995b2f5e2ac400e09f6588bd362b62d6444827a Mon Sep 17 00:00:00 2001 From: Varkha Sharma Date: Thu, 7 Nov 2024 05:42:50 -0800 Subject: [PATCH 05/15] reverting docker-compose-server.yml --- docker-compose-server.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker-compose-server.yml b/docker-compose-server.yml index 8f19de7f..f5a267af 100644 --- a/docker-compose-server.yml +++ b/docker-compose-server.yml @@ -22,15 +22,15 @@ services: - "6006:6006" volumes: # directory path should be updated as per user's environment - - /home/sharvark/cmf-server/data/tensorboard-logs:/logs + - /home/xxxx/cmf-server/data/tensorboard-logs:/logs container_name: tensorboard server: image: server:latest # both the directory paths should be updated as per user's environment volumes: - - /home/sharvark/cmf-server/data:/cmf-server/data - - /home/sharvark/cmf-server/data/static:/cmf-server/data/static - - /home/sharvark/cmf-server/data/tensorboard-logs:/cmf-server/data/tensorboard-logs + - /home/xxxx/cmf-server/data:/cmf-server/data + - /home/xxxx/cmf-server/data/static:/cmf-server/data/static + - /home/xxxx/cmf-server/data/tensorboard-logs:/cmf-server/data/tensorboard-logs container_name: cmf-server build: context: ./ From 4d3b28e22686db53b0d2c3eb67cacccdecf5298f Mon Sep 17 00:00:00 2001 From: Varkha Sharma Date: Thu, 7 Nov 2024 05:58:53 -0800 Subject: [PATCH 06/15] removing commented and print statements --- cmflib/cmf.py | 7 ------- cmflib/graph_wrapper.py | 1 - 2 files changed, 8 deletions(-) diff --git a/cmflib/cmf.py b/cmflib/cmf.py index 93407b78..a191c2a3 100644 --- a/cmflib/cmf.py +++ b/cmflib/cmf.py @@ -104,16 +104,9 @@ class Cmf: """ # pylint: disable=too-many-instance-attributes - # Reading CONFIG_FILE variable - #cmf_config = os.environ.get("CONFIG_FILE", ".cmfconfig") ARTIFACTS_PATH = "cmf_artifacts" DATASLICE_PATH = "dataslice" METRICS_PATH = "metrics" - #if os.path.exists(cmf_config): - # attr_dict = CmfConfig.read_config(cmf_config) - # __neo4j_uri = attr_dict.get("neo4j-uri", "") - # __neo4j_password = attr_dict.get("neo4j-password", "") - # __neo4j_user = attr_dict.get("neo4j-user", "") def __init__( self, diff --git a/cmflib/graph_wrapper.py b/cmflib/graph_wrapper.py index de5e585c..6ebd8ed5 100644 --- a/cmflib/graph_wrapper.py +++ b/cmflib/graph_wrapper.py @@ -93,7 +93,6 @@ def create_env_node(self, name: str, path: str, uri: str, event: str, execution_ 
pipeline_context: mlpb.Context, custom_properties=None): if custom_properties is None: custom_properties = {} - print("custom_properties = ", custom_properties) pipeline_id = pipeline_context.id pipeline_name = pipeline_context.name dataset_syntax = self._create_env_syntax( From ccbf9ffe2429cc8936080068b6b559fc44f50655 Mon Sep 17 00:00:00 2001 From: Varkha Sharma Date: Sun, 10 Nov 2024 04:54:05 -0800 Subject: [PATCH 07/15] added code to make Environment an input artifact instead of an output --- cmflib/cmf.py | 72 ++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 38 deletions(-) diff --git a/cmflib/cmf.py b/cmflib/cmf.py index a191c2a3..629c1cb2 100644 --- a/cmflib/cmf.py +++ b/cmflib/cmf.py @@ -139,6 +139,7 @@ def __init__( self.execution_name = "" self.execution_command = "" self.metrics = {} + # why have we created this list self.input_artifacts = [] self.execution_label_props = {} self.graph = graph @@ -721,7 +722,7 @@ def log_python_env( execution_id=self.execution.id, uri=uri, input_name=url, - event_type=mlpb.Event.Type.OUTPUT, + event_type=mlpb.Event.Type.INPUT, ) else: uri = c_hash if c_hash and c_hash.strip() else str(uuid.uuid1()) @@ -732,7 +733,7 @@ def log_python_env( uri=uri, name=url, type_name="Environment", - event_type=mlpb.Event.Type.OUTPUT, + event_type=mlpb.Event.Type.INPUT, properties={ "git_repo": str(git_repo), # passing c_hash value to commit @@ -752,33 +753,30 @@ def log_python_env( self.execution_label_props["git_repo"] = git_repo self.execution_label_props["Commit"] = commit - if self.graph: self.driver.create_env_node( name, url, uri, - "output", + "input", self.execution.id, self.parent_context, custom_props, ) - - child_artifact = { - "Name": name, - "Path": url, - "URI": uri, - "Event": "output", - "Execution_Name": self.execution_name, - "Type": "Environment", - "Execution_Command": self.execution_command, - "Pipeline_Id": self.parent_context.id, - "Pipeline_Name": self.parent_context.name, - } - self.driver.create_artifact_relationships( - self.input_artifacts, child_artifact, self.execution_label_props + self.input_artifacts.append( + { + "Name": name, + "Path": url, + "URI": uri, + "Event": "input", + "Execution_Name": self.execution_name, + "Type": "Environment", + "Execution_Command": self.execution_command, + "Pipeline_Id": self.parent_context.id, + "Pipeline_Name": self.parent_context.name, + } ) - + self.driver.create_execution_links(uri, name, "Environment") return artifact def log_python_env_on_server( @@ -805,7 +803,7 @@ def log_python_env_on_server( execution_id=self.execution.id, uri=uri, input_name=url, - event_type=mlpb.Event.Type.OUTPUT, + event_type=mlpb.Event.Type.INPUT, ) else: uri = c_hash if c_hash and c_hash.strip() else str(uuid.uuid1()) @@ -816,7 +814,7 @@ def log_python_env_on_server( uri=uri, name=url, type_name="Environment", - event_type=mlpb.Event.Type.OUTPUT, + event_type=mlpb.Event.Type.INPUT, properties={ "git_repo": str(git_repo), # passing c_hash value to commit @@ -835,33 +833,31 @@ def log_python_env_on_server( custom_props["Commit"] = commit self.execution_label_props["git_repo"] = git_repo self.execution_label_props["Commit"] = commit - + if self.graph: self.driver.create_env_node( name, url, uri, - "output", + "input", self.execution.id, self.parent_context, custom_props, ) - - child_artifact = { - "Name": name, - "Path": url, - "URI": uri, - "Event": "output", - "Execution_Name": self.execution_name, - "Type": "Environment", - "Execution_Command": self.execution_command, - 
"Pipeline_Id": self.parent_context.id, - "Pipeline_Name": self.parent_context.name, - } - self.driver.create_artifact_relationships( - self.input_artifacts, child_artifact, self.execution_label_props + self.input_artifacts.append( + { + "Name": name, + "Path": url, + "URI": uri, + "Event": "input", + "Execution_Name": self.execution_name, + "Type": "Environment", + "Execution_Command": self.execution_command, + "Pipeline_Id": self.parent_context.id, + "Pipeline_Name": self.parent_context.name, + } ) - + self.driver.create_execution_links(uri, name, "Environment") return artifact def log_dataset( From 0ce7327cf368adb17217141936e2af07e59e96b6 Mon Sep 17 00:00:00 2001 From: Varkha Sharma Date: Fri, 15 Nov 2024 01:19:06 -0800 Subject: [PATCH 08/15] pushing latest changes for tracking --- cmflib/cmf.py | 1267 +++----------------------------- cmflib/cmf_commands_wrapper.py | 446 ++++++++--- cmflib/cmf_server_methods.py | 768 +++++++++++++++++++ 3 files changed, 1224 insertions(+), 1257 deletions(-) create mode 100644 cmflib/cmf_server_methods.py diff --git a/cmflib/cmf.py b/cmflib/cmf.py index 629c1cb2..2eef4ed2 100644 --- a/cmflib/cmf.py +++ b/cmflib/cmf.py @@ -56,22 +56,29 @@ ) from cmflib.utils.cmf_config import CmfConfig from cmflib.utils.helper_functions import get_python_env, change_dir, get_md5_hash +from cmflib.cmf_server_methods import ( + merge_created_context, + merge_created_execution, + log_python_env_on_server, + log_dataset_with_version, + log_model_with_version, + log_execution_metrics_from_client, + commit_existing_metrics, + commit_existing, + commit_existing +) + from cmflib.cmf_commands_wrapper import ( - _metadata_push, - _metadata_pull, - _metadata_export, - _artifact_pull, - _artifact_push, - _artifact_pull_single, - _cmf_cmd_init, - _init_local, - _init_minioS3, - _init_amazonS3, - _init_sshremote, - _init_osdfremote, - _artifact_list, - _pipeline_list, - _execution_list, + metadata_push, + metadata_pull, + cmf_init_show, + metadata_export, + artifact_pull, + artifact_push, + artifact_pull_single, + artifact_list, + pipeline_list, + execution_list, ) class Cmf: @@ -273,47 +280,6 @@ def create_context( ) return ctx - def merge_created_context( - self, pipeline_stage: str, custom_properties: t.Optional[t.Dict] = None - ) -> mlpb.Context: - """Merge created context. - Every call creates a unique pipeline stage. - Created for metadata push purpose. - Example: - - ```python - #Create context - # Import CMF - from cmflib.cmf import Cmf - from ml_metadata.proto import metadata_store_pb2 as mlpb - # Create CMF logger - cmf = Cmf(filepath="mlmd", pipeline_name="test_pipeline") - # Create context - context: mlmd.proto.Context = cmf.merge_created_context( - pipeline_stage="Test-env/prepare", - custom_properties ={"user-metadata1": "metadata_value"} - ``` - Args: - Pipeline_stage: Pipeline_Name/Stage_name. - custom_properties: Developers can provide key value pairs with additional properties of the execution that - need to be stored. - Returns: - Context object from ML Metadata library associated with the new context for this stage. 
- """ - - custom_props = {} if custom_properties is None else custom_properties - ctx = get_or_create_run_context( - self.store, pipeline_stage, custom_props) - self.child_context = ctx - associate_child_to_parent_context( - store=self.store, parent_context=self.parent_context, child_context=ctx - ) - if self.graph: - self.driver.create_stage_node( - pipeline_stage, self.parent_context, ctx.id, custom_props - ) - return ctx - def update_context( self, type_name: str, @@ -564,127 +530,6 @@ def update_execution( ) return self.execution - def merge_created_execution( - self, - execution_type: str, - execution_cmd: str, - properties: t.Optional[t.Dict] = None, - custom_properties: t.Optional[t.Dict] = None, - orig_execution_name:str = "", - create_new_execution:bool = True - ) -> mlpb.Execution: - """Merge Created execution. - Every call creates a unique execution. Execution can only be created within a context, so - [create_context][cmflib.cmf.Cmf.create_context] must be called first. - Every call occurs when metadata push or pull is processed. Data from pre-existing executions is used - to create new executions with additional data(Required on cmf-server). - Example: - ```python - # Import CMF - from cmflib.cmf import Cmf - from ml_metadata.proto import metadata_store_pb2 as mlpb - # Create CMF logger - cmf = Cmf(filepath="mlmd", pipeline_name="test_pipeline") - # Create or reuse context for this stage - context: mlmd.proto.Context = cmf.merge_created_context( - pipeline_stage="prepare", - custom_properties ={"user-metadata1": "metadata_value"} - ) - # Create a new execution for this stage run - execution: mlmd.proto.Execution = cmf.merge_created_execution( - execution_type="Prepare", - properties={"Context_Type":""}, - custom_properties = {"split": split, "seed": seed}, - orig_execution_name=execution_name - ) - ``` - Args: - execution_type: Type of the execution.(when create_new_execution is False, this is the name of execution) - properties: Properties of Execution. - custom_properties: Developers can provide key value pairs with additional properties of the execution that - need to be stored. - - cmd: command used to run this execution. - - create_new_execution:bool = True, This can be used by advanced users to re-use executions - This is applicable, when working with framework code like mmdet, pytorch lightning etc, where the - custom call-backs are used to log metrics. - if create_new_execution is True(Default), execution_type parameter will be used as the name of the execution type. - if create_new_execution is False, if existing execution exist with the same name as execution_type. - it will be reused. - Only executions created with create_new_execution as False will have "name" as a property. - - - Returns: - Execution object from ML Metadata library associated with the execution for this stage. - """ - # Initializing the execution related fields - properties = {} if properties is None else properties - self.metrics = {} - self.input_artifacts = [] - self.execution_label_props = {} - custom_props = {} if custom_properties is None else custom_properties - # print(custom_props) - git_repo = properties.get("Git_Repo", "") - git_start_commit = properties.get("Git_Start_Commit", "") - #name = properties.get("Name", "") - create_new_execution = True - execution_name = execution_type - #exe.name property is passed as the orig_execution_name. 
- #if name is not an empty string then we are re-using executions - if orig_execution_name != "": - create_new_execution = False - execution_name = orig_execution_name - - self.execution = create_new_execution_in_existing_run_context( - store=self.store, - execution_type_name=execution_type, # Type field when re-using executions - execution_name=execution_name, #Name field if we are re-using executionsname - #Type field , if creating new executions always - context_id=self.child_context.id, - execution=execution_cmd, - pipeline_id=self.parent_context.id, - pipeline_type=self.parent_context.name, - git_repo=git_repo, - git_start_commit=git_start_commit, - custom_properties=custom_props, - create_new_execution=create_new_execution - ) - - uuids = "" - - uuids = self.execution.properties["Execution_uuid"].string_value - if uuids: - self.execution.properties["Execution_uuid"].string_value = uuids +\ - ","+properties["Execution_uuid"] - else: - self.execution.properties["Execution_uuid"].string_value =\ - properties["Execution_uuid"] - - - self.store.put_executions([self.execution]) - self.execution_name = str(self.execution.id) + "," + execution_type - self.execution_command = execution_cmd - for k, v in custom_props.items(): - k = re.sub("-", "_", k) - self.execution_label_props[k] = v - self.execution_label_props["Execution_Name"] = ( - execution_type + ":" + str(self.execution.id) - ) - self.execution_label_props["execution_command"] = execution_cmd - if self.graph: - self.driver.create_execution_node( - self.execution_name, - self.child_context.id, - self.parent_context, - execution_cmd, - self.execution.id, - custom_props, - ) - - # link the artifact to execution if it exists and creates artifact if it doesn't - return self.execution - def log_dvc_lock(self, file_path: str): """Used to update the dvc lock file created with dvc run command.""" print("Entered dvc lock file commit") @@ -779,87 +624,6 @@ def log_python_env( self.driver.create_execution_links(uri, name, "Environment") return artifact - def log_python_env_on_server( - self, - url: str, - uri: str, - props: t.Optional[t.Dict] = None, - ) -> mlpb.Artifact: - "Used to log the python packages involved in the current execution" - - git_repo = props.get("git_repo", "") - name = url - existing_artifact = [] - c_hash = uri - commit = props.get("Commit", "") - url = url + ":" + c_hash - if c_hash and c_hash.strip: - existing_artifact.extend(self.store.get_artifacts_by_uri(c_hash)) - - if existing_artifact and len(existing_artifact) != 0: - existing_artifact = existing_artifact[0] - artifact = link_execution_to_artifact( - store=self.store, - execution_id=self.execution.id, - uri=uri, - input_name=url, - event_type=mlpb.Event.Type.INPUT, - ) - else: - uri = c_hash if c_hash and c_hash.strip() else str(uuid.uuid1()) - artifact = create_new_artifact_event_and_attribution( - store=self.store, - execution_id=self.execution.id, - context_id=self.child_context.id, - uri=uri, - name=url, - type_name="Environment", - event_type=mlpb.Event.Type.INPUT, - properties={ - "git_repo": str(git_repo), - # passing c_hash value to commit - "Commit": str(commit), - "url": props.get("url", ""), - }, - artifact_type_properties={ - "git_repo": mlpb.STRING, - "Commit": mlpb.STRING, - "url": mlpb.STRING, - }, - milliseconds_since_epoch=int(time.time() * 1000), - ) - custom_props = {} - custom_props["git_repo"] = git_repo - custom_props["Commit"] = commit - self.execution_label_props["git_repo"] = git_repo - self.execution_label_props["Commit"] = commit - - if 
self.graph: - self.driver.create_env_node( - name, - url, - uri, - "input", - self.execution.id, - self.parent_context, - custom_props, - ) - self.input_artifacts.append( - { - "Name": name, - "Path": url, - "URI": uri, - "Event": "input", - "Execution_Name": self.execution_name, - "Type": "Environment", - "Execution_Command": self.execution_command, - "Pipeline_Id": self.parent_context.id, - "Pipeline_Name": self.parent_context.name, - } - ) - self.driver.create_execution_links(uri, name, "Environment") - return artifact - def log_dataset( self, url: str, @@ -1067,213 +831,76 @@ def update_model_url(self, dup_artifact: list, updated_url: str): put_artifact(self.store, dup_art) return dup_artifact - def log_dataset_with_version( + # Add the model to dvc do a git commit and store the commit id in MLMD + def log_model( self, - url: str, - version: str, + path: str, event: str, - props: t.Optional[t.Dict] = None, + model_framework: str = "Default", + model_type: str = "Default", + model_name: str = "Default", custom_properties: t.Optional[t.Dict] = None, ) -> mlpb.Artifact: - """Logs a dataset when the version (hash) is known. - Example: - ```python - artifact: mlpb.Artifact = cmf.log_dataset_with_version( - url="path/to/dataset", - version="abcdef", - event="output", - props={ "git_repo": "https://github.com/example/repo", - "url": "/path/in/repo", }, - custom_properties={ "custom_key": "custom_value", }, - ) - ``` - Args: - url: Path to the dataset. - version: Hash or version identifier for the dataset. - event: Takes arguments `INPUT` or `OUTPUT`. - props: Optional properties for the dataset (e.g., git_repo, url). - custom_properties: Optional custom properties for the dataset. - Returns: - Artifact object from the ML Protocol Buffers library associated with the new dataset artifact. + """Logs a model. + The model is added to dvc and the metadata file (.dvc) gets committed to git. + Example: + ```python + artifact: mlmd.proto.Artifact= cmf.log_model( + path="path/to/model.pkl", + event="output", + model_framework="SKlearn", + model_type="RandomForestClassifier", + model_name="RandomForestClassifier:default" + ) + ``` + Args: + path: Path to the model file. + event: Takes arguments `INPUT` OR `OUTPUT`. + model_framework: Framework used to create the model. + model_type: Type of model algorithm used. + model_name: Name of the algorithm used. + custom_properties: The model properties. + Returns: + Artifact object from ML Metadata library associated with the new model artifact. """ - props = {} if props is None else props + logging_dir = change_dir(self.cmf_init_path) + # Assigning current file name as stage and execution name + current_script = sys.argv[0] + file_name = os.path.basename(current_script) + name_without_extension = os.path.splitext(file_name)[0] + # create context if not already created + if not self.child_context: + self.create_context(pipeline_stage=name_without_extension) + assert self.child_context is not None, f"Failed to create context for {self.pipeline_name}!!" + + # create execution if not already created + if not self.execution: + self.create_execution(execution_type=name_without_extension) + assert self.execution is not None, f"Failed to create execution for {self.pipeline_name}!!" + + + # To Do : Technical Debt. + # If the model already exist , then we just link the existing model to the execution + # We do not update the model properties . 
+ # We need to append the new properties to the existing model properties + if custom_properties is None: + custom_properties = {} custom_props = {} if custom_properties is None else custom_properties - git_repo = props.get("git_repo", "") - name = url + # name = re.split('/', path)[-1] event_type = mlpb.Event.Type.OUTPUT existing_artifact = [] - c_hash = version if event.lower() == "input": event_type = mlpb.Event.Type.INPUT - # dataset_commit = commit_output(url, self.execution.id) + commit_output(path, self.execution.id) + c_hash = dvc_get_hash(path) - dataset_commit = version - url = url + ":" + c_hash - if c_hash and c_hash.strip: - existing_artifact.extend(self.store.get_artifacts_by_uri(c_hash)) + if c_hash == "": + print("Error in getting the dvc hash,return without logging") + return - # To Do - What happens when uri is the same but names are different - if existing_artifact and len(existing_artifact) != 0: - existing_artifact = existing_artifact[0] - - # Quick fix- Updating only the name - if custom_properties is not None: - self.update_existing_artifact( - existing_artifact, custom_properties) - uri = c_hash - # update url for existing artifact - self.update_dataset_url(existing_artifact, props.get("url", "")) - artifact = link_execution_to_artifact( - store=self.store, - execution_id=self.execution.id, - uri=uri, - input_name=url, - event_type=event_type, - ) - else: - # if((existing_artifact and len(existing_artifact )!= 0) and c_hash != ""): - # url = url + ":" + str(self.execution.id) - uri = c_hash if c_hash and c_hash.strip() else str(uuid.uuid1()) - artifact = create_new_artifact_event_and_attribution( - store=self.store, - execution_id=self.execution.id, - context_id=self.child_context.id, - uri=uri, - name=url, - type_name="Dataset", - event_type=event_type, - properties={ - "git_repo": str(git_repo), - "Commit": str(dataset_commit), - "url": props.get("url", " "), - }, - artifact_type_properties={ - "git_repo": mlpb.STRING, - "Commit": mlpb.STRING, - "url": mlpb.STRING, - }, - custom_properties=custom_props, - milliseconds_since_epoch=int(time.time() * 1000), - ) - custom_props["git_repo"] = git_repo - custom_props["Commit"] = dataset_commit - self.execution_label_props["git_repo"] = git_repo - self.execution_label_props["Commit"] = dataset_commit - - if self.graph: - self.driver.create_dataset_node( - name, - url, - uri, - event, - self.execution.id, - self.parent_context, - custom_props, - ) - if event.lower() == "input": - self.input_artifacts.append( - { - "Name": name, - "Path": url, - "URI": uri, - "Event": event.lower(), - "Execution_Name": self.execution_name, - "Type": "Dataset", - "Execution_Command": self.execution_command, - "Pipeline_Id": self.parent_context.id, - "Pipeline_Name": self.parent_context.name, - } - ) - self.driver.create_execution_links(uri, name, "Dataset") - else: - child_artifact = { - "Name": name, - "Path": url, - "URI": uri, - "Event": event.lower(), - "Execution_Name": self.execution_name, - "Type": "Dataset", - "Execution_Command": self.execution_command, - "Pipeline_Id": self.parent_context.id, - "Pipeline_Name": self.parent_context.name, - } - self.driver.create_artifact_relationships( - self.input_artifacts, child_artifact, self.execution_label_props - ) - return artifact - - # Add the model to dvc do a git commit and store the commit id in MLMD - def log_model( - self, - path: str, - event: str, - model_framework: str = "Default", - model_type: str = "Default", - model_name: str = "Default", - custom_properties: t.Optional[t.Dict] 
= None, - ) -> mlpb.Artifact: - """Logs a model. - The model is added to dvc and the metadata file (.dvc) gets committed to git. - Example: - ```python - artifact: mlmd.proto.Artifact= cmf.log_model( - path="path/to/model.pkl", - event="output", - model_framework="SKlearn", - model_type="RandomForestClassifier", - model_name="RandomForestClassifier:default" - ) - ``` - Args: - path: Path to the model file. - event: Takes arguments `INPUT` OR `OUTPUT`. - model_framework: Framework used to create the model. - model_type: Type of model algorithm used. - model_name: Name of the algorithm used. - custom_properties: The model properties. - Returns: - Artifact object from ML Metadata library associated with the new model artifact. - """ - - logging_dir = change_dir(self.cmf_init_path) - # Assigning current file name as stage and execution name - current_script = sys.argv[0] - file_name = os.path.basename(current_script) - name_without_extension = os.path.splitext(file_name)[0] - # create context if not already created - if not self.child_context: - self.create_context(pipeline_stage=name_without_extension) - assert self.child_context is not None, f"Failed to create context for {self.pipeline_name}!!" - - # create execution if not already created - if not self.execution: - self.create_execution(execution_type=name_without_extension) - assert self.execution is not None, f"Failed to create execution for {self.pipeline_name}!!" - - - # To Do : Technical Debt. - # If the model already exist , then we just link the existing model to the execution - # We do not update the model properties . - # We need to append the new properties to the existing model properties - if custom_properties is None: - custom_properties = {} - custom_props = {} if custom_properties is None else custom_properties - # name = re.split('/', path)[-1] - event_type = mlpb.Event.Type.OUTPUT - existing_artifact = [] - if event.lower() == "input": - event_type = mlpb.Event.Type.INPUT - - commit_output(path, self.execution.id) - c_hash = dvc_get_hash(path) - - if c_hash == "": - print("Error in getting the dvc hash,return without logging") - return - - model_commit = c_hash + model_commit = c_hash # If connecting to an existing artifact - The name of the artifact is # used as path/steps/key @@ -1377,229 +1004,6 @@ def log_model( os.chdir(logging_dir) return artifact - # Add the model to dvc do a git commit and store the commit id in MLMD - def log_model_with_version( - self, - path: str, - event: str, - props=None, - custom_properties: t.Optional[t.Dict] = None, - ) -> object: - """Logs a model when the version(hash) is known - The model is added to dvc and the metadata file (.dvc) gets committed to git. - Example: - ```python - artifact: mlmd.proto.Artifact= cmf.log_model_with_version( - path="path/to/model.pkl", - event="output", - props={ - "url": "/home/user/local-storage/bf/629ccd5cd008066b72c04f9a918737", - "model_type": "RandomForestClassifier", - "model_name": "RandomForestClassifier:default", - "Commit": "commit 1146dad8b74cae205db6a3132ea403db1e4032e5", - "model_framework": "SKlearn", - }, - custom_properties={ - "uri": "bf629ccd5cd008066b72c04f9a918737", - }, - - ) - ``` - Args: - path: Path to the model file. - event: Takes arguments `INPUT` OR `OUTPUT`. - props: Model artifact properties. - custom_properties: The model properties. - Returns: - Artifact object from ML Metadata library associated with the new model artifact. 
- """ - - if custom_properties is None: - custom_properties = {} - custom_props = {} if custom_properties is None else custom_properties - name = re.split("/", path)[-1] - event_type = mlpb.Event.Type.OUTPUT - existing_artifact = [] - if event.lower() == "input": - event_type = mlpb.Event.Type.INPUT - - # props["commit"] = "" # To do get from incoming data - c_hash = props.get("uri", " ") - # If connecting to an existing artifact - The name of the artifact is used as path/steps/key - model_uri = path + ":" + c_hash - # dvc_url = dvc_get_url(path, False) - url = props.get("url", "") - # uri = "" - if c_hash and c_hash.strip(): - uri = c_hash.strip() - existing_artifact.extend(self.store.get_artifacts_by_uri(uri)) - else: - raise RuntimeError("Model commit failed, Model uri empty") - - if ( - existing_artifact - and len(existing_artifact) != 0 - ): - # update url for existing artifact - existing_artifact = self.update_model_url(existing_artifact, url) - artifact = link_execution_to_artifact( - store=self.store, - execution_id=self.execution.id, - uri=c_hash, - input_name=model_uri, - event_type=event_type, - ) - model_uri = artifact.name - else: - uri = c_hash if c_hash and c_hash.strip() else str(uuid.uuid1()) - model_uri = model_uri + ":" + str(self.execution.id) - artifact = create_new_artifact_event_and_attribution( - store=self.store, - execution_id=self.execution.id, - context_id=self.child_context.id, - uri=uri, - name=model_uri, - type_name="Model", - event_type=event_type, - properties={ - "model_framework": props.get("model_framework", ""), - "model_type": props.get("model_type", ""), - "model_name": props.get("model_name", ""), - "Commit": props.get("Commit", ""), - "url": str(url), - }, - artifact_type_properties={ - "model_framework": mlpb.STRING, - "model_type": mlpb.STRING, - "model_name": mlpb.STRING, - "Commit": mlpb.STRING, - "url": mlpb.STRING, - }, - custom_properties=custom_props, - milliseconds_since_epoch=int(time.time() * 1000), - ) - custom_properties["Commit"] = props.get("Commit", "") - custom_props["url"] = url - self.execution_label_props["Commit"] = props.get("Commit", "") - if self.graph: - self.driver.create_model_node( - model_uri, - uri, - event, - self.execution.id, - self.parent_context, - custom_props, - ) - if event.lower() == "input": - self.input_artifacts.append( - { - "Name": model_uri, - "URI": uri, - "Event": event.lower(), - "Execution_Name": self.execution_name, - "Type": "Model", - "Execution_Command": self.execution_command, - "Pipeline_Id": self.parent_context.id, - "Pipeline_Name": self.parent_context.name, - } - ) - self.driver.create_execution_links(uri, model_uri, "Model") - else: - child_artifact = { - "Name": model_uri, - "URI": uri, - "Event": event.lower(), - "Execution_Name": self.execution_name, - "Type": "Model", - "Execution_Command": self.execution_command, - "Pipeline_Id": self.parent_context.id, - "Pipeline_Name": self.parent_context.name, - } - self.driver.create_artifact_relationships( - self.input_artifacts, child_artifact, self.execution_label_props - ) - - return artifact - - def log_execution_metrics_from_client(self, metrics_name: str, - custom_properties: t.Optional[t.Dict] = None) -> mlpb.Artifact: - """ Logs execution metrics from a client. - Data from pre-existing metrics from client side is used to create identical metrics on server side. 
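The metrics_name passed in is expected to follow the "name:uri:execution_id" convention, and the server re-keys it with its own execution id before storing the artifact. A small illustrative sketch of that convention (all values are placeholders):
```python
# Illustrative values only: how "name:uri:execution_id" is split and re-keyed server-side.
metrics_name = "training_metrics:4b6ac3f8e6cb4d32a27b4a527f75fc64:12"
name, uri, client_execution_id = metrics_name.split(":")[:3]

server_execution_id = 7                      # id of the execution created on the server
new_metrics_name = f"{name}:{uri}:{server_execution_id}"
# -> "training_metrics:4b6ac3f8e6cb4d32a27b4a527f75fc64:7"
```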
- Example: - ```python - artifact: mlpb.Artifact = cmf.log_execution_metrics_from_client( - metrics_name="example_metrics:uri:123", - custom_properties={"custom_key": "custom_value"}, - ) - ``` - Args: - metrics_name: Name of the metrics in the format "name:uri:execution_id". - custom_properties: Optional custom properties for the metrics. - Returns: - Artifact object from the ML Protocol Buffers library associated with the metrics artifact. - """ - - metrics = None - custom_props = {} if custom_properties is None else custom_properties - existing_artifact = [] - name_tokens = metrics_name.split(":") - if name_tokens and len(name_tokens) > 2: - name = name_tokens[0] - uri = name_tokens[1] - execution_id = name_tokens[2] - else: - print(f"Error : metrics name {metrics_name} is not in the correct format") - return - - #we need to add the execution id to the metrics name - new_metrics_name = f"{name}:{uri}:{str(self.execution.id)}" - existing_artifacts = self.store.get_artifacts_by_uri(uri) - - existing_artifact = existing_artifacts[0] if existing_artifacts else None - # Didn't understand this, - # and in case of step_metrics should we follow this logic or dataset's logic or does it even matter - if not existing_artifact or \ - ((existing_artifact) and not - (existing_artifact.name == new_metrics_name)): #we need to add the artifact otherwise its already there - metrics = create_new_artifact_event_and_attribution( - store=self.store, - execution_id=self.execution.id, - context_id=self.child_context.id, - uri=uri, - name=new_metrics_name, - type_name="Metrics", - event_type=mlpb.Event.Type.OUTPUT, - properties={"metrics_name": metrics_name}, - artifact_type_properties={"metrics_name": mlpb.STRING}, - custom_properties=custom_props, - milliseconds_since_epoch=int(time.time() * 1000), - ) - - if self.graph: - # To do create execution_links - self.driver.create_metrics_node( - metrics_name, - uri, - "output", - self.execution.id, - self.parent_context, - custom_props, - ) - child_artifact = { - "Name": metrics_name, - "URI": uri, - "Event": "output", - "Execution_Name": self.execution_name, - "Type": "Metrics", - "Execution_Command": self.execution_command, - "Pipeline_Id": self.parent_context.id, - "Pipeline_Name": self.parent_context.name, - } - self.driver.create_artifact_relationships( - self.input_artifacts, child_artifact, self.execution_label_props - ) - return metrics - def log_execution_metrics( self, metrics_name: str, custom_properties: t.Optional[t.Dict] = None @@ -1814,84 +1218,6 @@ def commit_metrics(self, metrics_name: str): os.chdir(logging_dir) return metrics - def commit_existing_metrics(self, metrics_name: str, uri: str, props: t.Optional[t.Dict] = None, custom_properties: t.Optional[t.Dict] = None): - """ - Commits existing metrics associated with the given URI to MLMD. - Example: - ```python - artifact: mlpb.Artifact = cmf.commit_existing_metrics("existing_metrics", "abc123", - {"custom_key": "custom_value"}) - ``` - Args: - metrics_name: Name of the metrics. - uri: Unique identifier associated with the metrics. - custom_properties: Optional custom properties for the metrics. - Returns: - Artifact object from the ML Protocol Buffers library associated with the existing metrics artifact. 
- """ - - custom_props = {} if custom_properties is None else custom_properties - c_hash = uri.strip() - existing_artifact = [] - existing_artifact.extend(self.store.get_artifacts_by_uri(c_hash)) - if (existing_artifact - and len(existing_artifact) != 0 ): - metrics = link_execution_to_artifact( - store=self.store, - execution_id=self.execution.id, - uri=c_hash, - input_name=metrics_name, - event_type=mlpb.Event.Type.OUTPUT, - ) - else: - metrics = create_new_artifact_event_and_attribution( - store=self.store, - execution_id=self.execution.id, - context_id=self.child_context.id, - uri=uri, - name=metrics_name, - type_name="Step_Metrics", - event_type=mlpb.Event.Type.OUTPUT, - properties={ - # passing uri value to commit - "Commit": props.get("Commit", ""), - "url": props.get("url", ""), - }, - artifact_type_properties={ - "Commit": mlpb.STRING, - "url": mlpb.STRING, - }, - custom_properties=custom_props, - milliseconds_since_epoch=int(time.time() * 1000), - ) - - metrics_commit = props.get("Commit", "") - custom_props["Commit"] = metrics_commit - self.execution_label_props["Commit"] = metrics_commit - - if self.graph: - self.driver.create_step_metrics_node( - metrics_name, - uri, - "output", - self.execution.id, - self.parent_context, - custom_props, - ) - child_artifact = { - "Name": metrics_name, - "URI": uri, - "Event": "output", - "Execution_Name": self.execution_name, - "Type": "Step_Metrics", - "Execution_Command": self.execution_command, - "Pipeline_Id": self.parent_context.id, - } - self.driver.create_artifact_relationships( - self.input_artifacts, child_artifact, self.execution_label_props - ) - return metrics - def log_validation_output( self, version: str, custom_properties: t.Optional[t.Dict] = None @@ -2143,53 +1469,6 @@ def commit(self, custom_properties: t.Optional[t.Dict] = None) -> None: os.chdir(logging_dir) return slice - # commit existing dataslice to server - def commit_existing(self, uri: str, props: t.Optional[t.Dict] = None, custom_properties: t.Optional[t.Dict] = None) -> None: - custom_props = {} if custom_properties is None else custom_properties - c_hash = uri.strip() - dataslice_commit = c_hash - existing_artifact = [] - if c_hash and c_hash.strip(): - existing_artifact.extend( - self.writer.store.get_artifacts_by_uri(c_hash)) - if existing_artifact and len(existing_artifact) != 0: - print("Adding to existing data slice") - # Haven't added event type in this if cond, is it not needed?? 
- slice = link_execution_to_input_artifact( - store=self.writer.store, - execution_id=self.writer.execution.id, - uri=c_hash, - input_name=self.name, - ) - else: - slice = create_new_artifact_event_and_attribution( - store=self.writer.store, - execution_id=self.writer.execution.id, - context_id=self.writer.child_context.id, - uri=c_hash, - name=self.name, - type_name="Dataslice", - event_type=mlpb.Event.Type.OUTPUT, - properties={ - "git_repo": props.get("git_repo", ""), - "Commit": props.get("Commit", ""), - "url": props.get("url", " "), - }, - artifact_type_properties={ - "git_repo": mlpb.STRING, - "Commit": mlpb.STRING, - "url": mlpb.STRING, - }, - custom_properties=custom_props, - milliseconds_since_epoch=int(time.time() * 1000), - ) - custom_props["git_repo"] = props.get("git_repo", "") - custom_props["Commit"] = props.get("Commit", "") - if self.writer.graph: - self.writer.driver.create_dataslice_node( - self.name, self.name, c_hash, self.data_parent, custom_props - ) - return slice # """Temporary code""" @@ -2204,371 +1483,27 @@ def commit_existing(self, uri: str, props: t.Optional[t.Dict] = None, custom_pro # print(last) # os.symlink(str(index), slicedir + "/ " + last) +# these are cmf logging api needed for server and defined in cmf_server_methods.py file + +Cmf.merge_created_context = merge_created_context +Cmf.merge_created_execution = merge_created_execution +Cmf.log_python_env_on_server = log_python_env_on_server +Cmf.log_dataset_with_version = log_dataset_with_version +Cmf.log_model_with_version = log_model_with_version +Cmf.log_execution_metrics_from_client = log_execution_metrics_from_client +Cmf.commit_existing_metrics = commit_existing_metrics +#log_metrics_from_client +Cmf.DataSlice.commit_existing = commit_existing +# log_dataslice_from_client + +Cmf.metadata_push = metadata_push +Cmf.metadata_pull = metadata_pull +Cmf.metadata_export = metadata_export +Cmf.artifact_pull = artifact_pull +Cmf.artifact_pull_single = artifact_pull_single +Cmf.artifact_push = artifact_push +Cmf.cmf_init_show = cmf_init_show +Cmf.pipeline_list = pipeline_list +Cmf.execution_list = execution_list +Cmf.artifact_list = artifact_list -def metadata_push(pipeline_name: str, filepath = "./mlmd", tensorboard_path: str = "", execution_id: str = ""): - """ Pushes MLMD file to CMF-server. - Example: - ```python - result = metadata_push("example_pipeline", "mlmd_file", "3") - ``` - Args: - pipeline_name: Name of the pipeline. - filepath: Path to the MLMD file. - execution_id: Optional execution ID. - tensorboard_path: Path to tensorboard logs. - - Returns: - Response output from the _metadata_push function. - """ - # Required arguments: pipeline_name - # Optional arguments: Execution_ID, filepath (mlmd file path, tensorboard_path - output = _metadata_push(pipeline_name, filepath, execution_id, tensorboard_path) - return output - -def metadata_pull(pipeline_name: str, filepath = "./mlmd", execution_id: str = ""): - """ Pulls MLMD file from CMF-server. - Example: - ```python - result = metadata_pull("example_pipeline", "./mlmd_directory", "execution_123") - ``` - Args: - pipeline_name: Name of the pipeline. - filepath: File path to store the MLMD file. - execution_id: Optional execution ID. - Returns: - Message from the _metadata_pull function. 
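The `Cmf.<name> = <helper>` assignments above re-attach the relocated helpers to the Cmf class so existing call sites keep working. A minimal sketch of the resulting call pattern, with placeholder pipeline names and paths: the CLI-backed wrappers are reached through the class, while the server-side helpers take `self` and are called on an instance.
```python
# Sketch only; the pipeline name and paths below are placeholders.
from cmflib.cmf import Cmf

# CLI-backed wrappers attached as plain functions: call them through the class.
Cmf.metadata_push("example_pipeline", "./mlmd")
Cmf.artifact_pull("example_pipeline", "./mlmd")

# Server-side helpers take `self`, so they are called on a Cmf instance.
cmf = Cmf(filepath="mlmd", pipeline_name="example_pipeline")
ctx = cmf.merge_created_context(pipeline_stage="example_pipeline/prepare")
```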
- """ - # Required arguments: pipeline_name - #Optional arguments: Execution_ID, filepath(file path to store mlmd file) - output = _metadata_pull(pipeline_name, filepath, execution_id) - return output - -def metadata_export(pipeline_name: str, jsonfilepath: str = "", filepath = "./mlmd"): - """ Export local mlmd's metadata in json format to a json file. - Example: - ```python - result = metadata_pull("example_pipeline", "./jsonfile", "./mlmd_directory") - ``` - Args: - pipeline_name: Name of the pipeline. - jsonfilepath: File path of json file. - filepath: File path to store the MLMD file. - Returns: - Message from the _metadata_pull function. - """ - # Required arguments: pipeline_name - #Optional arguments: jsonfilepath, filepath(file path to store mlmd file) - output = _metadata_export(pipeline_name, jsonfilepath, filepath) - return output - -def artifact_pull(pipeline_name: str, filepath = "./mlmd"): - """ Pulls artifacts from the initialized repository. - - Example: - ```python - result = artifact_pull("example_pipeline", "./mlmd_directory") - ``` - - Args: - pipeline_name: Name of the pipeline. - filepath: Path to store artifacts. - Returns: - Output from the _artifact_pull function. - """ - - # Required arguments: Pipeline_name - # Optional arguments: filepath( path to store artifacts) - output = _artifact_pull(pipeline_name, filepath) - return output - -def artifact_pull_single(pipeline_name: str, filepath: str, artifact_name: str): - """ Pulls a single artifact from the initialized repository. - Example: - ```python - result = artifact_pull_single("example_pipeline", "./mlmd_directory", "example_artifact") - ``` - Args: - pipeline_name: Name of the pipeline. - filepath: Path to store the artifact. - artifact_name: Name of the artifact. - Returns: - Output from the _artifact_pull_single function. - """ - - # Required arguments: Pipeline_name - # Optional arguments: filepath( path to store artifacts), artifact_name - output = _artifact_pull_single(pipeline_name, filepath, artifact_name) - return output - -def artifact_push(pipeline_name: str, filepath = "./mlmd"): - """ Pushes artifacts to the initialized repository. - - Example: - ```python - result = artifact_push("example_pipeline", "./mlmd_directory") - ``` - Args: - pipeline_name: Name of the pipeline. - filepath: Path to store the artifact. - Returns: - Output from the _artifact_push function. - """ - - output = _artifact_push(pipeline_name, filepath) - return output - -def cmf_init_show(): - """ Initializes and shows details of the CMF command. - Example: - ```python - result = cmf_init_show() - ``` - Returns: - Output from the _cmf_cmd_init function. - """ - - output=_cmf_cmd_init() - return output - -def cmf_init(type: str = "", - path: str = "", - git_remote_url: str = "", - cmf_server_url: str = "", - neo4j_user: str = "", - neo4j_password: str = "", - neo4j_uri: str = "", - url: str = "", - endpoint_url: str = "", - access_key_id: str = "", - secret_key: str = "", - session_token: str = "", - user: str = "", - password: str = "", - port: int = 0, - osdf_path: str = "", - key_id: str = "", - key_path: str = "", - key_issuer: str = "", - ): - - """ Initializes the CMF configuration based on the provided parameters. 
- Example: - ```python - cmf_init( type="local", - path="/path/to/re", - git_remote_url="git@github.com:user/repo.git", - cmf_server_url="http://cmf-server" - neo4j_user", - neo4j_password="password", - neo4j_uri="bolt://localhost:76" - ) - ``` - Args: - type: Type of repository ("local", "minioS3", "amazonS3", "sshremote") - path: Path for the local repository. - git_remote_url: Git remote URL for version control. - cmf_server_url: CMF server URL. - neo4j_user: Neo4j database username. - neo4j_password: Neo4j database password. - neo4j_uri: Neo4j database URI. - url: URL for MinioS3 or AmazonS3. - endpoint_url: Endpoint URL for MinioS3. - access_key_id: Access key ID for MinioS3 or AmazonS3. - secret_key: Secret key for MinioS3 or AmazonS3. - session_token: Session token for AmazonS3. - user: SSH remote username. - password: SSH remote password. - port: SSH remote port - Returns: - Output based on the initialized repository type. - """ - - if type == "": - return print("Error: Type is not provided") - if type not in ["local","minioS3","amazonS3","sshremote","osdfremote"]: - return print("Error: Type value is undefined"+ " "+type+".Expected: "+",".join(["local","minioS3","amazonS3","sshremote","osdfremote"])) - - if neo4j_user != "" and neo4j_password != "" and neo4j_uri != "": - pass - elif neo4j_user == "" and neo4j_password == "" and neo4j_uri == "": - pass - else: - return print("Error: Enter all neo4j parameters.") - - args={'path': path, - 'git_remote_url': git_remote_url, - 'url': url, - 'endpoint_url': endpoint_url, - 'access_key_id': access_key_id, - 'secret_key': secret_key, - 'session_token': session_token, - 'user': user, - 'password': password, - 'osdf_path': osdf_path, - 'key_id': key_id, - 'key_path': key_path, - 'key-issuer': key_issuer, - } - - status_args=non_related_args(type, args) - - if type == "local" and path != "" and git_remote_url != "" : - """Initialize local repository""" - output = _init_local( - path, git_remote_url, cmf_server_url, neo4j_user, neo4j_password, neo4j_uri - ) - if status_args != []: - print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") - return output - - elif type == "minioS3" and url != "" and endpoint_url != "" and access_key_id != "" and secret_key != "" and git_remote_url != "": - """Initialize minioS3 repository""" - output = _init_minioS3( - url, - endpoint_url, - access_key_id, - secret_key, - git_remote_url, - cmf_server_url, - neo4j_user, - neo4j_password, - neo4j_uri, - ) - if status_args != []: - print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") - return output - - elif type == "amazonS3" and url != "" and access_key_id != "" and secret_key != "" and git_remote_url != "": - """Initialize amazonS3 repository""" - output = _init_amazonS3( - url, - access_key_id, - secret_key, - session_token, - git_remote_url, - cmf_server_url, - neo4j_user, - neo4j_password, - neo4j_uri, - ) - if status_args != []: - print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") - - return output - - elif type == "sshremote" and path != "" and user != "" and port != 0 and password != "" and git_remote_url != "": - """Initialize sshremote repository""" - output = _init_sshremote( - path, - user, - port, - password, - git_remote_url, - cmf_server_url, - neo4j_user, - neo4j_password, - neo4j_uri, - ) - if status_args != []: - print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") - - return output - - elif type == 
"osdfremote" and osdf_path != "" and key_id != "" and key_path != 0 and key_issuer != "" and git_remote_url != "": - """Initialize osdfremote repository""" - output = _init_osdfremote( - osdf_path, - key_id, - key_path, - key_issuer, - git_remote_url, - cmf_server_url, - neo4j_user, - neo4j_password, - neo4j_uri, - ) - if status_args != []: - print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") - - return output - - else: - print("Error: Enter all arguments") - - -def non_related_args(type : str, args : dict): - available_args=[i for i, j in args.items() if j != ""] - local=["path", "git_remote_url"] - minioS3=["url", "endpoint_url", "access_key_id", "secret_key", "git_remote_url"] - amazonS3=["url", "access_key_id", "secret_key", "session_token", "git_remote_url"] - sshremote=["path", "user", "port", "password", "git_remote_url"] - osdfremote=["osdf_path", "key_id", "key_path", "key-issuer", "git_remote_url"] - - - dict_repository_args={"local" : local, "minioS3" : minioS3, "amazonS3" : amazonS3, "sshremote" : sshremote} - - for repo,arg in dict_repository_args.items(): - if repo ==type: - non_related_args=list(set(available_args)-set(dict_repository_args[repo])) - return non_related_args - - -def pipeline_list(filepath = "./mlmd"): - """ Display list of pipline for current mlmd. - - Example: - ```python - result = _pipeline_list("./mlmd_directory") - ``` - - Args: - filepath: File path to store the MLMD file. - Returns: - Output from the _pipeline_list function. - """ - - # Optional arguments: filepath( path to store the MLMD file) - output = _pipeline_list(filepath) - return output - - -def execution_list(pipeline_name: str, filepath = "./mlmd", execution_id: str = "", long = True): - """ Display list of execution for given pipeline. - Example: - ```python - result = _execution_list("example_pipeline", "./mlmd_directory", "example_execution_id", "long") - ``` - Args: - pipeline_name: Name of the pipeline. - filepath: Path to store the mlmd file. - execution_id: Executions for particular execution id. - long: Detailed summary regarding execution. - Returns: - Output from the _execution_list function. - """ - - # Required arguments: pipeline_name - # Optional arguments: filepath( path to store mlmd file), execution_id, long - output = _execution_list(pipeline_name, filepath, execution_id, long) - return output - - -def artifact_list(pipeline_name: str, filepath = "./mlmd", artifact_name: str = "", long = True): - """ Display list of artifact for given pipeline. - Example: - ```python - result = _artifact_list("example_pipeline", "./mlmd_directory", "example_artifact_name", "long") - ``` - Args: - pipeline_name: Name of the pipeline. - filepath: Path to store the mlmd file. - artifact_name: Artifacts for particular artifact name. - long: Detailed summary regarding artifact. - Returns: - Output from the _artifact_list function. 
- """ - - # Required arguments: pipeline_name - # Optional arguments: filepath( path to store mlmd file), artifact_name, long - output = _artifact_list(pipeline_name, filepath, artifact_name, long) - return output diff --git a/cmflib/cmf_commands_wrapper.py b/cmflib/cmf_commands_wrapper.py index a5d9a420..80d92752 100644 --- a/cmflib/cmf_commands_wrapper.py +++ b/cmflib/cmf_commands_wrapper.py @@ -17,7 +17,23 @@ from cmflib import cli -def _metadata_push(pipeline_name, file_name, execution_id, tensorboard): +def metadata_push(pipeline_name: str, filepath = "./mlmd", tensorboard_path: str = "", execution_id: str = ""): + """ Pushes MLMD file to CMF-server. + Example: + ```python + result = metadata_push("example_pipeline", "mlmd_file", "3") + ``` + Args: + pipeline_name: Name of the pipeline. + filepath: Path to the MLMD file. + tensorboard_path: Path to tensorboard logs. + execution_id: Optional execution ID. + + Returns: + Pending + """ + # Required arguments: pipeline_name + # Optional arguments: Execution_ID, filepath (mlmd file path, tensorboard_path) cli_args = cli.parse_args( [ "metadata", @@ -25,11 +41,11 @@ def _metadata_push(pipeline_name, file_name, execution_id, tensorboard): "-p", pipeline_name, "-f", - file_name, + filepath, "-e", execution_id, "-t", - tensorboard + tensorboard_path ] ) cmd = cli_args.func(cli_args) @@ -37,50 +53,97 @@ def _metadata_push(pipeline_name, file_name, execution_id, tensorboard): print(msg) return msg -def _metadata_pull(pipeline_name, file_name, execution_id): + +def metadata_pull(pipeline_name: str, filepath = "./mlmd", execution_id: str = ""): + """ Pulls MLMD file from CMF-server. + Example: + ```python + result = metadata_pull("example_pipeline", "./mlmd_directory", "execution_123") + ``` + Args: + pipeline_name: Name of the pipeline. + filepath: File path to store the MLMD file. + execution_id: Optional execution ID. + Returns: + Pending + """ + # Required arguments: pipeline_name + #Optional arguments: Execution_ID, filepath(file path to store mlmd file) cli_args = cli.parse_args( - [ - "metadata", - "pull", - "-p", - pipeline_name, - "-f", - file_name, - "-e", - execution_id, - ] - ) + [ + "metadata", + "pull", + "-p", + pipeline_name, + "-f", + filepath, + "-e", + execution_id, + ] + ) cmd = cli_args.func(cli_args) msg = cmd.do_run() print(msg) + # i don't understand why we are both printing and returning the output + return msg -def _metadata_export(pipeline_name, json_file_name, file_name): +def metadata_export(pipeline_name: str, jsonfilepath: str = "", filepath = "./mlmd"): + """ Export local mlmd's metadata in json format to a json file. + Example: + ```python + result = metadata_pull("example_pipeline", "./jsonfile", "./mlmd_directory") + ``` + Args: + pipeline_name: Name of the pipeline. + jsonfilepath: File path of json file. + filepath: File path to store the MLMD file. + Returns: + Pending + """ + # Required arguments: pipeline_name + #Optional arguments: jsonfilepath, filepath(file path to store mlmd file) cli_args = cli.parse_args( - [ - "metadata", - "export", - "-p", - pipeline_name, - "-j", - json_file_name, - "-f", - file_name, - ] - ) + [ + "metadata", + "export", + "-p", + pipeline_name, + "-j", + jsonfilepath, + "-f", + filepath, + ] + ) cmd = cli_args.func(cli_args) msg = cmd.do_run() print(msg) return msg -def _artifact_push(pipeline_name, file_name): +def artifact_pull(pipeline_name: str, filepath = "./mlmd"): + """ Pulls artifacts from the initialized repository. 
+ + Example: + ```python + result = artifact_pull("example_pipeline", "./mlmd_directory") + ``` + + Args: + pipeline_name: Name of the pipeline. + filepath: Path to store artifacts. + Returns: + Pending + """ + + # Required arguments: Pipeline_name + # Optional arguments: filepath( path to store artifacts) cli_args = cli.parse_args( [ "artifact", - "push", + "pull", "-p", pipeline_name, "-f", - file_name, + filepath, ] ) cmd = cli_args.func(cli_args) @@ -89,8 +152,21 @@ def _artifact_push(pipeline_name, file_name): return msg -def _artifact_pull(pipeline_name, file_name): - +def artifact_pull_single(pipeline_name: str, filepath: str, artifact_name: str): + """ Pulls a single artifact from the initialized repository. + Example: + ```python + result = artifact_pull_single("example_pipeline", "./mlmd_directory", "example_artifact") + ``` + Args: + pipeline_name: Name of the pipeline. + filepath: Path to store the artifact. + artifact_name: Name of the artifact. + Returns: + Pending + """ + # Required arguments: Pipeline_name + # Optional arguments: filepath( path to store artifacts), artifact_name cli_args = cli.parse_args( [ "artifact", @@ -98,7 +174,9 @@ def _artifact_pull(pipeline_name, file_name): "-p", pipeline_name, "-f", - file_name, + filepath, + "-a", + artifact_name, ] ) cmd = cli_args.func(cli_args) @@ -106,17 +184,28 @@ def _artifact_pull(pipeline_name, file_name): print(msg) return msg -def _artifact_pull_single(pipeline_name, file_name, artifact_name): + +def artifact_push(pipeline_name: str, filepath = "./mlmd"): + """ Pushes artifacts to the initialized repository. + + Example: + ```python + result = artifact_push("example_pipeline", "./mlmd_directory") + ``` + Args: + pipeline_name: Name of the pipeline. + filepath: Path to store the artifact. + Returns: + Pending + """ cli_args = cli.parse_args( [ "artifact", - "pull", + "push", "-p", pipeline_name, "-f", - file_name, - "-a", - artifact_name, + filepath, ] ) cmd = cli_args.func(cli_args) @@ -125,7 +214,15 @@ def _artifact_pull_single(pipeline_name, file_name, artifact_name): return msg -def _cmf_cmd_init(): +def cmf_init_show(): + """ Initializes and shows details of the CMF command. + Example: + ```python + result = cmf_init_show() + ``` + Returns: + Pending + """ cli_args = cli.parse_args( [ "init", @@ -137,8 +234,92 @@ def _cmf_cmd_init(): print(msg) return msg -def _init_local(path, git_remote_url, cmf_server_url, neo4j_user, neo4j_password, neo4j_uri): - cli_args = cli.parse_args( + +def cmf_init(type: str = "", + path: str = "", + git_remote_url: str = "", + cmf_server_url: str = "", + neo4j_user: str = "", + neo4j_password: str = "", + neo4j_uri: str = "", + url: str = "", + endpoint_url: str = "", + access_key_id: str = "", + secret_key: str = "", + session_token: str = "", + user: str = "", + password: str = "", + port: int = 0, + osdf_path: str = "", + key_id: str = "", + key_path: str = "", + key_issuer: str = "", + ): + + """ Initializes the CMF configuration based on the provided parameters. + Example: + ```python + cmf_init( type="local", + path="/path/to/re", + git_remote_url="git@github.com:user/repo.git", + cmf_server_url="http://cmf-server" + neo4j_user", + neo4j_password="password", + neo4j_uri="bolt://localhost:76" + ) + ``` + Args: + type: Type of repository ("local", "minioS3", "amazonS3", "sshremote") + path: Path for the local repository. + git_remote_url: Git remote URL for version control. + cmf_server_url: CMF server URL. + neo4j_user: Neo4j database username. 
+ neo4j_password: Neo4j database password. + neo4j_uri: Neo4j database URI. + url: URL for MinioS3 or AmazonS3. + endpoint_url: Endpoint URL for MinioS3. + access_key_id: Access key ID for MinioS3 or AmazonS3. + secret_key: Secret key for MinioS3 or AmazonS3. + session_token: Session token for AmazonS3. + user: SSH remote username. + password: SSH remote password. + port: SSH remote port + Returns: + Output based on the initialized repository type. + """ + + if type == "": + return print("Error: Type is not provided") + if type not in ["local","minioS3","amazonS3","sshremote","osdfremote"]: + return print("Error: Type value is undefined"+ " "+type+".Expected: "+",".join(["local","minioS3","amazonS3","sshremote","osdfremote"])) + + if neo4j_user != "" and neo4j_password != "" and neo4j_uri != "": + pass + elif neo4j_user == "" and neo4j_password == "" and neo4j_uri == "": + pass + else: + return print("Error: Enter all neo4j parameters.") + + args={'path': path, + 'git_remote_url': git_remote_url, + 'url': url, + 'endpoint_url': endpoint_url, + 'access_key_id': access_key_id, + 'secret_key': secret_key, + 'session_token': session_token, + 'user': user, + 'password': password, + 'osdf_path': osdf_path, + 'key_id': key_id, + 'key_path': key_path, + 'key-issuer': key_issuer, + } + + status_args=non_related_args(type, args) + + if type == "local" and path != "" and git_remote_url != "" : + """Initialize local repository""" + cli_args = cli.parse_args( [ "init", "local", @@ -156,14 +337,16 @@ def _init_local(path, git_remote_url, cmf_server_url, neo4j_user, neo4j_password neo4j_uri ] ) - cmd = cli_args.func(cli_args) - msg = cmd.do_run() - print(msg) - return msg - - -def _init_minioS3(url, endpoint_url, access_key_id, secret_key, git_remote_url, cmf_server_url, neo4j_user, neo4j_password, neo4j_uri): - cli_args = cli.parse_args( + cmd = cli_args.func(cli_args) + msg = cmd.do_run() + print(msg) + if status_args != []: + print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") + return msg + + elif type == "minioS3" and url != "" and endpoint_url != "" and access_key_id != "" and secret_key != "" and git_remote_url != "": + """Initialize minioS3 repository""" + cli_args = cli.parse_args( [ "init", "minioS3", @@ -187,13 +370,16 @@ def _init_minioS3(url, endpoint_url, access_key_id, secret_key, git_remote_url, neo4j_uri ] ) - cmd = cli_args.func(cli_args) - msg = cmd.do_run() - print(msg) - return msg - -def _init_amazonS3(url, access_key_id, secret_key, session_token, git_remote_url, cmf_server_url, neo4j_user, neo4j_password, neo4j_uri): - cli_args = cli.parse_args( + cmd = cli_args.func(cli_args) + msg = cmd.do_run() + print(msg) + if status_args != []: + print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") + return msg + + elif type == "amazonS3" and url != "" and access_key_id != "" and secret_key != "" and git_remote_url != "": + """Initialize amazonS3 repository""" + cli_args = cli.parse_args( [ "init", "amazonS3", @@ -217,13 +403,18 @@ def _init_amazonS3(url, access_key_id, secret_key, session_token, git_remote_url neo4j_uri ] ) - cmd = cli_args.func(cli_args) - msg = cmd.do_run() - print(msg) - return msg + cmd = cli_args.func(cli_args) + msg = cmd.do_run() + print(msg) + + if status_args != []: + print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") -def _init_sshremote(path,user, port, password, git_remote_url, cmf_server_url, neo4j_user, neo4j_password, neo4j_uri): - cli_args = 
cli.parse_args( + return msg + + elif type == "sshremote" and path != "" and user != "" and port != 0 and password != "" and git_remote_url != "": + """Initialize sshremote repository""" + cli_args = cli.parse_args( [ "init", "sshremote", @@ -247,13 +438,17 @@ def _init_sshremote(path,user, port, password, git_remote_url, cmf_server_url, n neo4j_uri ] ) - cmd = cli_args.func(cli_args) - msg = cmd.do_run() - print(msg) - return msg + cmd = cli_args.func(cli_args) + msg = cmd.do_run() + print(msg) + if status_args != []: + print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") -def _init_osdfremote(path, key_id, key_path, key_issuer, git_remote_url, cmf_server_url, neo4j_user, neo4j_password, neo4j_uri): - cli_args = cli.parse_args( + return msg + + elif type == "osdfremote" and osdf_path != "" and key_id != "" and key_path != 0 and key_issuer != "" and git_remote_url != "": + """Initialize osdfremote repository""" + cli_args = cli.parse_args( [ "init", "osdf", @@ -277,24 +472,56 @@ def _init_osdfremote(path, key_id, key_path, key_issuer, git_remote_url, cmf_ser neo4j_uri ] ) - cmd = cli_args.func(cli_args) - msg = cmd.do_run() - print(msg) - return msg + cmd = cli_args.func(cli_args) + msg = cmd.do_run() + print(msg) + if status_args != []: + print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") + + return msg + + else: + print("Error: Enter all arguments") + + +def non_related_args(type : str, args : dict): + available_args=[i for i, j in args.items() if j != ""] + local=["path", "git_remote_url"] + minioS3=["url", "endpoint_url", "access_key_id", "secret_key", "git_remote_url"] + amazonS3=["url", "access_key_id", "secret_key", "session_token", "git_remote_url"] + sshremote=["path", "user", "port", "password", "git_remote_url"] + osdfremote=["osdf_path", "key_id", "key_path", "key-issuer", "git_remote_url"] + + + dict_repository_args={"local" : local, "minioS3" : minioS3, "amazonS3" : amazonS3, "sshremote" : sshremote} -def _artifact_list(pipeline_name, file_name, artifact_name, long): + for repo,arg in dict_repository_args.items(): + if repo ==type: + non_related_args=list(set(available_args)-set(dict_repository_args[repo])) + return non_related_args + + +def pipeline_list(filepath = "./mlmd"): + """ Display list of pipline for current mlmd. + + Example: + ```python + result = _pipeline_list("./mlmd_directory") + ``` + + Args: + filepath: File path to store the MLMD file. + Returns: + Pending + """ + + # Optional arguments: filepath( path to store the MLMD file) cli_args = cli.parse_args( [ - "artifact", + "pipeline", "list", - "-p", - pipeline_name, "-f", - file_name, - "-a", - artifact_name, - "-l", - long + filepath ] ) cmd = cli_args.func(cli_args) @@ -302,13 +529,35 @@ def _artifact_list(pipeline_name, file_name, artifact_name, long): print(msg) return msg -def _pipeline_list(file_name): + +def execution_list(pipeline_name: str, filepath = "./mlmd", execution_id: str = "", long = True): + """ Display list of execution for given pipeline. + Example: + ```python + result = _execution_list("example_pipeline", "./mlmd_directory", "example_execution_id", "long") + ``` + Args: + pipeline_name: Name of the pipeline. + filepath: Path to store the mlmd file. + execution_id: Executions for particular execution id. + long: Detailed summary regarding execution. 
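Returning to `cmf_init` above: each repository type checks for its own subset of non-empty arguments, and the neo4j settings are all-or-nothing. A hedged sketch of a minioS3 initialization; every value below is a placeholder.
```python
# Placeholder values; the minioS3 branch requires url, endpoint_url, access_key_id,
# secret_key and git_remote_url to be non-empty.
cmf_init(
    type="minioS3",
    url="s3://dvc-artifacts",
    endpoint_url="http://localhost:9000",
    access_key_id="minioadmin",
    secret_key="minioadmin",
    git_remote_url="git@github.com:user/repo.git",
    cmf_server_url="http://127.0.0.1:8080",
)
# Supplying only some of neo4j_user / neo4j_password / neo4j_uri is rejected with
# "Error: Enter all neo4j parameters."
```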
+ Returns: + Pending + """ + # Required arguments: pipeline_name + # Optional arguments: filepath( path to store mlmd file), execution_id, long cli_args = cli.parse_args( [ - "pipeline", + "execution", "list", + "-p", + pipeline_name, "-f", - file_name + filepath, + "-e", + execution_id, + "-l", + long ] ) cmd = cli_args.func(cli_args) @@ -316,17 +565,33 @@ def _pipeline_list(file_name): print(msg) return msg -def _execution_list(pipeline_name, file_name, execution_id, long): + +def artifact_list(pipeline_name: str, filepath = "./mlmd", artifact_name: str = "", long = True): + """ Display list of artifact for given pipeline. + Example: + ```python + result = _artifact_list("example_pipeline", "./mlmd_directory", "example_artifact_name", "long") + ``` + Args: + pipeline_name: Name of the pipeline. + filepath: Path to store the mlmd file. + artifact_name: Artifacts for particular artifact name. + long: Detailed summary regarding artifact. + Returns: + Pending + """ + # Required arguments: pipeline_name + # Optional arguments: filepath( path to store mlmd file), artifact_name, long cli_args = cli.parse_args( [ - "execution", + "artifact", "list", "-p", pipeline_name, "-f", - file_name, - "-e", - execution_id, + filepath, + "-a", + artifact_name, "-l", long ] @@ -334,5 +599,4 @@ def _execution_list(pipeline_name, file_name, execution_id, long): cmd = cli_args.func(cli_args) msg = cmd.do_run() print(msg) - return msg - + return msg \ No newline at end of file diff --git a/cmflib/cmf_server_methods.py b/cmflib/cmf_server_methods.py new file mode 100644 index 00000000..2518f461 --- /dev/null +++ b/cmflib/cmf_server_methods.py @@ -0,0 +1,768 @@ +"""This module contains all the public API for CMF""" +### +# Copyright (2022) Hewlett Packard Enterprise Development LP +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +### + +import time +import uuid +import re +import typing as t + +# This import is needed for jupyterlab environment +from ml_metadata.proto import metadata_store_pb2 as mlpb +from cmflib.metadata_helper import ( + get_or_create_run_context, + associate_child_to_parent_context, + create_new_execution_in_existing_run_context, + link_execution_to_artifact, + create_new_artifact_event_and_attribution, + link_execution_to_input_artifact, +) + +def merge_created_context( + self, pipeline_stage: str, custom_properties: t.Optional[t.Dict] = None +) -> mlpb.Context: + """Merge created context. + Every call creates a unique pipeline stage. + Created for metadata push purpose. + Example: + + ```python + #Create context + # Import CMF + from cmflib.cmf import Cmf + from ml_metadata.proto import metadata_store_pb2 as mlpb + # Create CMF logger + cmf = Cmf(filepath="mlmd", pipeline_name="test_pipeline") + # Create context + context: mlmd.proto.Context = cmf.merge_created_context( + pipeline_stage="Test-env/prepare", + custom_properties ={"user-metadata1": "metadata_value"} + ``` + Args: + Pipeline_stage: Pipeline_Name/Stage_name. 
+ custom_properties: Developers can provide key value pairs with additional properties of the execution that + need to be stored. + Returns: + Context object from ML Metadata library associated with the new context for this stage. + """ + + custom_props = {} if custom_properties is None else custom_properties + ctx = get_or_create_run_context( + self.store, pipeline_stage, custom_props) + self.child_context = ctx + associate_child_to_parent_context( + store=self.store, parent_context=self.parent_context, child_context=ctx + ) + if self.graph: + self.driver.create_stage_node( + pipeline_stage, self.parent_context, ctx.id, custom_props + ) + return ctx + + +def merge_created_execution( + self, + execution_type: str, + execution_cmd: str, + properties: t.Optional[t.Dict] = None, + custom_properties: t.Optional[t.Dict] = None, + orig_execution_name:str = "", + create_new_execution:bool = True +) -> mlpb.Execution: + """Merge Created execution. + Every call creates a unique execution. Execution can only be created within a context, so + [create_context][cmflib.cmf.Cmf.create_context] must be called first. + Every call occurs when metadata push or pull is processed. Data from pre-existing executions is used + to create new executions with additional data(Required on cmf-server). + Example: + ```python + # Import CMF + from cmflib.cmf import Cmf + from ml_metadata.proto import metadata_store_pb2 as mlpb + # Create CMF logger + cmf = Cmf(filepath="mlmd", pipeline_name="test_pipeline") + # Create or reuse context for this stage + context: mlmd.proto.Context = cmf.merge_created_context( + pipeline_stage="prepare", + custom_properties ={"user-metadata1": "metadata_value"} + ) + # Create a new execution for this stage run + execution: mlmd.proto.Execution = cmf.merge_created_execution( + execution_type="Prepare", + properties={"Context_Type":""}, + custom_properties = {"split": split, "seed": seed}, + orig_execution_name=execution_name + ) + ``` + Args: + execution_type: Type of the execution.(when create_new_execution is False, this is the name of execution) + properties: Properties of Execution. + custom_properties: Developers can provide key value pairs with additional properties of the execution that + need to be stored. + + cmd: command used to run this execution. + + create_new_execution:bool = True, This can be used by advanced users to re-use executions + This is applicable, when working with framework code like mmdet, pytorch lightning etc, where the + custom call-backs are used to log metrics. + if create_new_execution is True(Default), execution_type parameter will be used as the name of the execution type. + if create_new_execution is False, if existing execution exist with the same name as execution_type. + it will be reused. + Only executions created with create_new_execution as False will have "name" as a property. + + + Returns: + Execution object from ML Metadata library associated with the execution for this stage. + """ + # Initializing the execution related fields + properties = {} if properties is None else properties + self.metrics = {} + self.input_artifacts = [] + self.execution_label_props = {} + custom_props = {} if custom_properties is None else custom_properties + # print(custom_props) + git_repo = properties.get("Git_Repo", "") + git_start_commit = properties.get("Git_Start_Commit", "") + #name = properties.get("Name", "") + create_new_execution = True + execution_name = execution_type + #exe.name property is passed as the orig_execution_name. 
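+    # (orig_execution_name comes from the client-side execution's `name` property;
+    #  when it is non-empty the execution is re-used rather than newly created)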
+ #if name is not an empty string then we are re-using executions + if orig_execution_name != "": + create_new_execution = False + execution_name = orig_execution_name + + self.execution = create_new_execution_in_existing_run_context( + store=self.store, + execution_type_name=execution_type, # Type field when re-using executions + execution_name=execution_name, #Name field if we are re-using executionsname + #Type field , if creating new executions always + context_id=self.child_context.id, + execution=execution_cmd, + pipeline_id=self.parent_context.id, + pipeline_type=self.parent_context.name, + git_repo=git_repo, + git_start_commit=git_start_commit, + custom_properties=custom_props, + create_new_execution=create_new_execution + ) + + uuids = "" + + uuids = self.execution.properties["Execution_uuid"].string_value + if uuids: + self.execution.properties["Execution_uuid"].string_value = uuids +\ + ","+properties["Execution_uuid"] + else: + self.execution.properties["Execution_uuid"].string_value =\ + properties["Execution_uuid"] + + + self.store.put_executions([self.execution]) + self.execution_name = str(self.execution.id) + "," + execution_type + self.execution_command = execution_cmd + for k, v in custom_props.items(): + k = re.sub("-", "_", k) + self.execution_label_props[k] = v + self.execution_label_props["Execution_Name"] = ( + execution_type + ":" + str(self.execution.id) + ) + self.execution_label_props["execution_command"] = execution_cmd + if self.graph: + self.driver.create_execution_node( + self.execution_name, + self.child_context.id, + self.parent_context, + execution_cmd, + self.execution.id, + custom_props, + ) + + # link the artifact to execution if it exists and creates artifact if it doesn't + return self.execution + + +def log_python_env_on_server( + self, + url: str, + uri: str, + props: t.Optional[t.Dict] = None, + ) -> mlpb.Artifact: + "Used to log the python packages involved in the current execution" + + git_repo = props.get("git_repo", "") + name = url + existing_artifact = [] + c_hash = uri + commit = props.get("Commit", "") + url = url + ":" + c_hash + if c_hash and c_hash.strip: + existing_artifact.extend(self.store.get_artifacts_by_uri(c_hash)) + + if existing_artifact and len(existing_artifact) != 0: + existing_artifact = existing_artifact[0] + artifact = link_execution_to_artifact( + store=self.store, + execution_id=self.execution.id, + uri=uri, + input_name=url, + event_type=mlpb.Event.Type.INPUT, + ) + else: + uri = c_hash if c_hash and c_hash.strip() else str(uuid.uuid1()) + artifact = create_new_artifact_event_and_attribution( + store=self.store, + execution_id=self.execution.id, + context_id=self.child_context.id, + uri=uri, + name=url, + type_name="Environment", + event_type=mlpb.Event.Type.INPUT, + properties={ + "git_repo": str(git_repo), + # passing c_hash value to commit + "Commit": str(commit), + "url": props.get("url", ""), + }, + artifact_type_properties={ + "git_repo": mlpb.STRING, + "Commit": mlpb.STRING, + "url": mlpb.STRING, + }, + milliseconds_since_epoch=int(time.time() * 1000), + ) + custom_props = {} + custom_props["git_repo"] = git_repo + custom_props["Commit"] = commit + self.execution_label_props["git_repo"] = git_repo + self.execution_label_props["Commit"] = commit + + if self.graph: + self.driver.create_env_node( + name, + url, + uri, + "input", + self.execution.id, + self.parent_context, + custom_props, + ) + self.input_artifacts.append( + { + "Name": name, + "Path": url, + "URI": uri, + "Event": "input", + "Execution_Name": 
self.execution_name, + "Type": "Environment", + "Execution_Command": self.execution_command, + "Pipeline_Id": self.parent_context.id, + "Pipeline_Name": self.parent_context.name, + } + ) + self.driver.create_execution_links(uri, name, "Environment") + return artifact + + +def log_dataset_with_version( + self, + url: str, + version: str, + event: str, + props: t.Optional[t.Dict] = None, + custom_properties: t.Optional[t.Dict] = None, +) -> mlpb.Artifact: + """Logs a dataset when the version (hash) is known. + Example: + ```python + artifact: mlpb.Artifact = cmf.log_dataset_with_version( + url="path/to/dataset", + version="abcdef", + event="output", + props={ "git_repo": "https://github.com/example/repo", + "url": "/path/in/repo", }, + custom_properties={ "custom_key": "custom_value", }, + ) + ``` + Args: + url: Path to the dataset. + version: Hash or version identifier for the dataset. + event: Takes arguments `INPUT` or `OUTPUT`. + props: Optional properties for the dataset (e.g., git_repo, url). + custom_properties: Optional custom properties for the dataset. + Returns: + Artifact object from the ML Protocol Buffers library associated with the new dataset artifact. + """ + + props = {} if props is None else props + custom_props = {} if custom_properties is None else custom_properties + git_repo = props.get("git_repo", "") + name = url + event_type = mlpb.Event.Type.OUTPUT + existing_artifact = [] + c_hash = version + if event.lower() == "input": + event_type = mlpb.Event.Type.INPUT + + # dataset_commit = commit_output(url, self.execution.id) + + dataset_commit = version + url = url + ":" + c_hash + if c_hash and c_hash.strip: + existing_artifact.extend(self.store.get_artifacts_by_uri(c_hash)) + + # To Do - What happens when uri is the same but names are different + if existing_artifact and len(existing_artifact) != 0: + existing_artifact = existing_artifact[0] + + # Quick fix- Updating only the name + if custom_properties is not None: + self.update_existing_artifact( + existing_artifact, custom_properties) + uri = c_hash + # update url for existing artifact + self.update_dataset_url(existing_artifact, props.get("url", "")) + artifact = link_execution_to_artifact( + store=self.store, + execution_id=self.execution.id, + uri=uri, + input_name=url, + event_type=event_type, + ) + else: + # if((existing_artifact and len(existing_artifact )!= 0) and c_hash != ""): + # url = url + ":" + str(self.execution.id) + uri = c_hash if c_hash and c_hash.strip() else str(uuid.uuid1()) + artifact = create_new_artifact_event_and_attribution( + store=self.store, + execution_id=self.execution.id, + context_id=self.child_context.id, + uri=uri, + name=url, + type_name="Dataset", + event_type=event_type, + properties={ + "git_repo": str(git_repo), + "Commit": str(dataset_commit), + "url": props.get("url", " "), + }, + artifact_type_properties={ + "git_repo": mlpb.STRING, + "Commit": mlpb.STRING, + "url": mlpb.STRING, + }, + custom_properties=custom_props, + milliseconds_since_epoch=int(time.time() * 1000), + ) + custom_props["git_repo"] = git_repo + custom_props["Commit"] = dataset_commit + self.execution_label_props["git_repo"] = git_repo + self.execution_label_props["Commit"] = dataset_commit + + if self.graph: + self.driver.create_dataset_node( + name, + url, + uri, + event, + self.execution.id, + self.parent_context, + custom_props, + ) + if event.lower() == "input": + self.input_artifacts.append( + { + "Name": name, + "Path": url, + "URI": uri, + "Event": event.lower(), + "Execution_Name": 
self.execution_name, + "Type": "Dataset", + "Execution_Command": self.execution_command, + "Pipeline_Id": self.parent_context.id, + "Pipeline_Name": self.parent_context.name, + } + ) + self.driver.create_execution_links(uri, name, "Dataset") + else: + child_artifact = { + "Name": name, + "Path": url, + "URI": uri, + "Event": event.lower(), + "Execution_Name": self.execution_name, + "Type": "Dataset", + "Execution_Command": self.execution_command, + "Pipeline_Id": self.parent_context.id, + "Pipeline_Name": self.parent_context.name, + } + self.driver.create_artifact_relationships( + self.input_artifacts, child_artifact, self.execution_label_props + ) + return artifact + + + +# Add the model to dvc do a git commit and store the commit id in MLMD +def log_model_with_version( + self, + path: str, + event: str, + props=None, + custom_properties: t.Optional[t.Dict] = None, +) -> object: + """Logs a model when the version(hash) is known + The model is added to dvc and the metadata file (.dvc) gets committed to git. + Example: + ```python + artifact: mlmd.proto.Artifact= cmf.log_model_with_version( + path="path/to/model.pkl", + event="output", + props={ + "url": "/home/user/local-storage/bf/629ccd5cd008066b72c04f9a918737", + "model_type": "RandomForestClassifier", + "model_name": "RandomForestClassifier:default", + "Commit": "commit 1146dad8b74cae205db6a3132ea403db1e4032e5", + "model_framework": "SKlearn", + }, + custom_properties={ + "uri": "bf629ccd5cd008066b72c04f9a918737", + }, + + ) + ``` + Args: + path: Path to the model file. + event: Takes arguments `INPUT` OR `OUTPUT`. + props: Model artifact properties. + custom_properties: The model properties. + Returns: + Artifact object from ML Metadata library associated with the new model artifact. + """ + + if custom_properties is None: + custom_properties = {} + custom_props = {} if custom_properties is None else custom_properties + name = re.split("/", path)[-1] + event_type = mlpb.Event.Type.OUTPUT + existing_artifact = [] + if event.lower() == "input": + event_type = mlpb.Event.Type.INPUT + + # props["commit"] = "" # To do get from incoming data + c_hash = props.get("uri", " ") + # If connecting to an existing artifact - The name of the artifact is used as path/steps/key + model_uri = path + ":" + c_hash + # dvc_url = dvc_get_url(path, False) + url = props.get("url", "") + # uri = "" + if c_hash and c_hash.strip(): + uri = c_hash.strip() + existing_artifact.extend(self.store.get_artifacts_by_uri(uri)) + else: + raise RuntimeError("Model commit failed, Model uri empty") + + if ( + existing_artifact + and len(existing_artifact) != 0 + ): + # update url for existing artifact + existing_artifact = self.update_model_url(existing_artifact, url) + artifact = link_execution_to_artifact( + store=self.store, + execution_id=self.execution.id, + uri=c_hash, + input_name=model_uri, + event_type=event_type, + ) + model_uri = artifact.name + else: + uri = c_hash if c_hash and c_hash.strip() else str(uuid.uuid1()) + model_uri = model_uri + ":" + str(self.execution.id) + artifact = create_new_artifact_event_and_attribution( + store=self.store, + execution_id=self.execution.id, + context_id=self.child_context.id, + uri=uri, + name=model_uri, + type_name="Model", + event_type=event_type, + properties={ + "model_framework": props.get("model_framework", ""), + "model_type": props.get("model_type", ""), + "model_name": props.get("model_name", ""), + "Commit": props.get("Commit", ""), + "url": str(url), + }, + artifact_type_properties={ + "model_framework": 
mlpb.STRING, + "model_type": mlpb.STRING, + "model_name": mlpb.STRING, + "Commit": mlpb.STRING, + "url": mlpb.STRING, + }, + custom_properties=custom_props, + milliseconds_since_epoch=int(time.time() * 1000), + ) + custom_properties["Commit"] = props.get("Commit", "") + custom_props["url"] = url + self.execution_label_props["Commit"] = props.get("Commit", "") + if self.graph: + self.driver.create_model_node( + model_uri, + uri, + event, + self.execution.id, + self.parent_context, + custom_props, + ) + if event.lower() == "input": + self.input_artifacts.append( + { + "Name": model_uri, + "URI": uri, + "Event": event.lower(), + "Execution_Name": self.execution_name, + "Type": "Model", + "Execution_Command": self.execution_command, + "Pipeline_Id": self.parent_context.id, + "Pipeline_Name": self.parent_context.name, + } + ) + self.driver.create_execution_links(uri, model_uri, "Model") + else: + child_artifact = { + "Name": model_uri, + "URI": uri, + "Event": event.lower(), + "Execution_Name": self.execution_name, + "Type": "Model", + "Execution_Command": self.execution_command, + "Pipeline_Id": self.parent_context.id, + "Pipeline_Name": self.parent_context.name, + } + self.driver.create_artifact_relationships( + self.input_artifacts, child_artifact, self.execution_label_props + ) + + return artifact + +def log_execution_metrics_from_client(self, metrics_name: str, + custom_properties: t.Optional[t.Dict] = None) -> mlpb.Artifact: + """ Logs execution metrics from a client. + Data from pre-existing metrics from client side is used to create identical metrics on server side. + Example: + ```python + artifact: mlpb.Artifact = cmf.log_execution_metrics_from_client( + metrics_name="example_metrics:uri:123", + custom_properties={"custom_key": "custom_value"}, + ) + ``` + Args: + metrics_name: Name of the metrics in the format "name:uri:execution_id". + custom_properties: Optional custom properties for the metrics. + Returns: + Artifact object from the ML Protocol Buffers library associated with the metrics artifact. 
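`log_execution_metrics_from_client` expects the incoming name to be packed as `"name:uri:execution_id"` and re-keys it to the server-side execution. A small, self-contained sketch of that renaming (the sample name and id are hypothetical):

```python
def rekey_metrics_name(metrics_name: str, server_execution_id: int) -> str:
    """Rebuild 'name:uri:execution_id' so it points at the server-side execution."""
    tokens = metrics_name.split(":")
    if len(tokens) <= 2:
        raise ValueError(f"metrics name {metrics_name} is not in the correct format")
    name, uri = tokens[0], tokens[1]
    return f"{name}:{uri}:{server_execution_id}"

# Client-side name re-keyed to execution 7 on the server.
print(rekey_metrics_name("training_metrics:4b7a9cf2:2", 7))  # training_metrics:4b7a9cf2:7
```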
+ """ + + metrics = None + custom_props = {} if custom_properties is None else custom_properties + existing_artifact = [] + name_tokens = metrics_name.split(":") + if name_tokens and len(name_tokens) > 2: + name = name_tokens[0] + uri = name_tokens[1] + execution_id = name_tokens[2] + else: + print(f"Error : metrics name {metrics_name} is not in the correct format") + return + + #we need to add the execution id to the metrics name + new_metrics_name = f"{name}:{uri}:{str(self.execution.id)}" + existing_artifacts = self.store.get_artifacts_by_uri(uri) + + existing_artifact = existing_artifacts[0] if existing_artifacts else None + # Didn't understand this, + # and in case of step_metrics should we follow this logic or dataset's logic or does it even matter + if not existing_artifact or \ + ((existing_artifact) and not + (existing_artifact.name == new_metrics_name)): #we need to add the artifact otherwise its already there + metrics = create_new_artifact_event_and_attribution( + store=self.store, + execution_id=self.execution.id, + context_id=self.child_context.id, + uri=uri, + name=new_metrics_name, + type_name="Metrics", + event_type=mlpb.Event.Type.OUTPUT, + properties={"metrics_name": metrics_name}, + artifact_type_properties={"metrics_name": mlpb.STRING}, + custom_properties=custom_props, + milliseconds_since_epoch=int(time.time() * 1000), + ) + + if self.graph: + # To do create execution_links + self.driver.create_metrics_node( + metrics_name, + uri, + "output", + self.execution.id, + self.parent_context, + custom_props, + ) + child_artifact = { + "Name": metrics_name, + "URI": uri, + "Event": "output", + "Execution_Name": self.execution_name, + "Type": "Metrics", + "Execution_Command": self.execution_command, + "Pipeline_Id": self.parent_context.id, + "Pipeline_Name": self.parent_context.name, + } + self.driver.create_artifact_relationships( + self.input_artifacts, child_artifact, self.execution_label_props + ) + return metrics + + + +def commit_existing_metrics(self, metrics_name: str, uri: str, props: t.Optional[t.Dict] = None, custom_properties: t.Optional[t.Dict] = None): + """ + Commits existing metrics associated with the given URI to MLMD. + Example: + ```python + artifact: mlpb.Artifact = cmf.commit_existing_metrics("existing_metrics", "abc123", + {"custom_key": "custom_value"}) + ``` + Args: + metrics_name: Name of the metrics. + uri: Unique identifier associated with the metrics. + custom_properties: Optional custom properties for the metrics. + Returns: + Artifact object from the ML Protocol Buffers library associated with the existing metrics artifact. 
+ """ + + custom_props = {} if custom_properties is None else custom_properties + c_hash = uri.strip() + existing_artifact = [] + existing_artifact.extend(self.store.get_artifacts_by_uri(c_hash)) + if (existing_artifact + and len(existing_artifact) != 0 ): + metrics = link_execution_to_artifact( + store=self.store, + execution_id=self.execution.id, + uri=c_hash, + input_name=metrics_name, + event_type=mlpb.Event.Type.OUTPUT, + ) + else: + metrics = create_new_artifact_event_and_attribution( + store=self.store, + execution_id=self.execution.id, + context_id=self.child_context.id, + uri=uri, + name=metrics_name, + type_name="Step_Metrics", + event_type=mlpb.Event.Type.OUTPUT, + properties={ + # passing uri value to commit + "Commit": props.get("Commit", ""), + "url": props.get("url", ""), + }, + artifact_type_properties={ + "Commit": mlpb.STRING, + "url": mlpb.STRING, + }, + custom_properties=custom_props, + milliseconds_since_epoch=int(time.time() * 1000), + ) + + metrics_commit = props.get("Commit", "") + custom_props["Commit"] = metrics_commit + self.execution_label_props["Commit"] = metrics_commit + + if self.graph: + self.driver.create_step_metrics_node( + metrics_name, + uri, + "output", + self.execution.id, + self.parent_context, + custom_props, + ) + child_artifact = { + "Name": metrics_name, + "URI": uri, + "Event": "output", + "Execution_Name": self.execution_name, + "Type": "Step_Metrics", + "Execution_Command": self.execution_command, + "Pipeline_Id": self.parent_context.id, + } + self.driver.create_artifact_relationships( + self.input_artifacts, child_artifact, self.execution_label_props + ) + return metrics + +# commit existing dataslice to server +def commit_existing(self, uri: str, props: t.Optional[t.Dict] = None, custom_properties: t.Optional[t.Dict] = None) -> None: + custom_props = {} if custom_properties is None else custom_properties + c_hash = uri.strip() + dataslice_commit = c_hash + existing_artifact = [] + if c_hash and c_hash.strip(): + existing_artifact.extend( + self.writer.store.get_artifacts_by_uri(c_hash)) + if existing_artifact and len(existing_artifact) != 0: + print("Adding to existing data slice") + # Haven't added event type in this if cond, is it not needed?? 
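A usage sketch for `commit_existing_metrics`, assuming the same server-side `cmf` logger as in the earlier examples; the URI, commit id, and URL come from the client payload and are hypothetical here:

```python
step_metrics = cmf.commit_existing_metrics(
    metrics_name="training_metrics:4b7a9cf2:2",
    uri="4b7a9cf2",
    props={"Commit": "1146dad8b74cae205db6a3132ea403db1e4032e5",
           "url": "Test-env:cmf_artifacts/metrics/training_metrics"},
    custom_properties={"avg_prec": 0.93},
)
```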
+ slice = link_execution_to_input_artifact( + store=self.writer.store, + execution_id=self.writer.execution.id, + uri=c_hash, + input_name=self.name, + ) + else: + slice = create_new_artifact_event_and_attribution( + store=self.writer.store, + execution_id=self.writer.execution.id, + context_id=self.writer.child_context.id, + uri=c_hash, + name=self.name, + type_name="Dataslice", + event_type=mlpb.Event.Type.OUTPUT, + properties={ + "git_repo": props.get("git_repo", ""), + "Commit": props.get("Commit", ""), + "url": props.get("url", " "), + }, + artifact_type_properties={ + "git_repo": mlpb.STRING, + "Commit": mlpb.STRING, + "url": mlpb.STRING, + }, + custom_properties=custom_props, + milliseconds_since_epoch=int(time.time() * 1000), + ) + custom_props["git_repo"] = props.get("git_repo", "") + custom_props["Commit"] = props.get("Commit", "") + if self.writer.graph: + self.writer.driver.create_dataslice_node( + self.name, self.name, c_hash, self.data_parent, custom_props + ) + return slice \ No newline at end of file From eac1b53e1630a61ec416ae0f038194dd0f5d1ae5 Mon Sep 17 00:00:00 2001 From: Varkha Sharma Date: Thu, 21 Nov 2024 09:27:28 -0800 Subject: [PATCH 09/15] removing cmf_commands_wrapper changes as this issue is becoming too big to test --- cmflib/cmf.py | 1374 ++++++++++++++++++++++++++------ cmflib/cmf_commands_wrapper.py | 446 +++-------- cmflib/cmf_merger.py | 6 +- cmflib/cmf_server_methods.py | 8 +- cmflib/graph_wrapper.py | 35 +- server/app/main.py | 2 + 6 files changed, 1232 insertions(+), 639 deletions(-) diff --git a/cmflib/cmf.py b/cmflib/cmf.py index 2eef4ed2..cb653dd6 100644 --- a/cmflib/cmf.py +++ b/cmflib/cmf.py @@ -20,7 +20,6 @@ import re import os import sys -import yaml import pandas as pd import typing as t @@ -55,30 +54,23 @@ link_execution_to_input_artifact, ) from cmflib.utils.cmf_config import CmfConfig -from cmflib.utils.helper_functions import get_python_env, change_dir, get_md5_hash -from cmflib.cmf_server_methods import ( - merge_created_context, - merge_created_execution, - log_python_env_on_server, - log_dataset_with_version, - log_model_with_version, - log_execution_metrics_from_client, - commit_existing_metrics, - commit_existing, - commit_existing -) - +from cmflib.utils.helper_functions import get_python_env, change_dir from cmflib.cmf_commands_wrapper import ( - metadata_push, - metadata_pull, - cmf_init_show, - metadata_export, - artifact_pull, - artifact_push, - artifact_pull_single, - artifact_list, - pipeline_list, - execution_list, + _metadata_push, + _metadata_pull, + _metadata_export, + _artifact_pull, + _artifact_push, + _artifact_pull_single, + _cmf_cmd_init, + _init_local, + _init_minioS3, + _init_amazonS3, + _init_sshremote, + _init_osdfremote, + _artifact_list, + _pipeline_list, + _execution_list, ) class Cmf: @@ -111,9 +103,16 @@ class Cmf: """ # pylint: disable=too-many-instance-attributes + # Reading CONFIG_FILE variable + cmf_config = os.environ.get("CONFIG_FILE", ".cmfconfig") ARTIFACTS_PATH = "cmf_artifacts" DATASLICE_PATH = "dataslice" METRICS_PATH = "metrics" + if os.path.exists(cmf_config): + attr_dict = CmfConfig.read_config(cmf_config) + __neo4j_uri = attr_dict.get("neo4j-uri", "") + __neo4j_password = attr_dict.get("neo4j-password", "") + __neo4j_user = attr_dict.get("neo4j-user", "") def __init__( self, @@ -146,7 +145,6 @@ def __init__( self.execution_name = "" self.execution_command = "" self.metrics = {} - # why have we created this list self.input_artifacts = [] self.execution_label_props = {} self.graph = graph @@ -173,7 
+171,6 @@ def __init__( os.chdir(logging_dir) @staticmethod - # function used to load neo4j params for cmf client def __load_neo4j_params(): cmf_config = os.environ.get("CONFIG_FILE", ".cmfconfig") if os.path.exists(cmf_config): @@ -184,7 +181,6 @@ def __load_neo4j_params(): @staticmethod - # function used to load neo4j params for cmf-server def __get_neo4j_server_config(): Cmf.__neo4j_uri = os.getenv('NEO4J_URI', "") Cmf.__neo4j_user = os.getenv('NEO4J_USER_NAME', "") @@ -280,6 +276,47 @@ def create_context( ) return ctx + def merge_created_context( + self, pipeline_stage: str, custom_properties: t.Optional[t.Dict] = None + ) -> mlpb.Context: + """Merge created context. + Every call creates a unique pipeline stage. + Created for metadata push purpose. + Example: + + ```python + #Create context + # Import CMF + from cmflib.cmf import Cmf + from ml_metadata.proto import metadata_store_pb2 as mlpb + # Create CMF logger + cmf = Cmf(filepath="mlmd", pipeline_name="test_pipeline") + # Create context + context: mlmd.proto.Context = cmf.merge_created_context( + pipeline_stage="Test-env/prepare", + custom_properties ={"user-metadata1": "metadata_value"} + ``` + Args: + Pipeline_stage: Pipeline_Name/Stage_name. + custom_properties: Developers can provide key value pairs with additional properties of the execution that + need to be stored. + Returns: + Context object from ML Metadata library associated with the new context for this stage. + """ + + custom_props = {} if custom_properties is None else custom_properties + ctx = get_or_create_run_context( + self.store, pipeline_stage, custom_props) + self.child_context = ctx + associate_child_to_parent_context( + store=self.store, parent_context=self.parent_context, child_context=ctx + ) + if self.graph: + self.driver.create_stage_node( + pipeline_stage, self.parent_context, ctx.id, custom_props + ) + return ctx + def update_context( self, type_name: str, @@ -382,7 +419,7 @@ def create_execution( git_repo = git_get_repo() git_start_commit = git_get_commit() cmd = str(sys.argv) if cmd is None else cmd - + python_env=get_python_env() self.execution = create_new_execution_in_existing_run_context( store=self.store, # Type field when re-using executions @@ -396,6 +433,7 @@ def create_execution( pipeline_type=self.parent_context.name, git_repo=git_repo, git_start_commit=git_start_commit, + python_env=python_env, custom_properties=custom_props, create_new_execution=create_new_execution, ) @@ -403,8 +441,7 @@ def create_execution( if uuids: self.execution.properties["Execution_uuid"].string_value = uuids+","+str(uuid.uuid1()) else: - self.execution.properties["Execution_uuid"].string_value = str(uuid.uuid1()) - + self.execution.properties["Execution_uuid"].string_value = str(uuid.uuid1()) self.store.put_executions([self.execution]) self.execution_name = str(self.execution.id) + "," + execution_type self.execution_command = cmd @@ -414,7 +451,7 @@ def create_execution( self.execution_label_props["Execution_Name"] = ( execution_type + ":" + str(self.execution.id) ) - + self.execution_label_props["execution_command"] = cmd if self.graph: self.driver.create_execution_node( @@ -425,34 +462,6 @@ def create_execution( self.execution.id, custom_props, ) - - directory_path = self.ARTIFACTS_PATH - os.makedirs(directory_path, exist_ok=True) - packages = get_python_env() - if isinstance(packages, list): - output = f"{packages}\n" - md5_hash = get_md5_hash(output) - python_env_file_path = os.path.join(directory_path, f"python_env_{md5_hash}.txt") - # create file if it 
doesn't exists - if not os.path.exists(python_env_file_path): - #print(f"{python_env_file_path} doesn't exists!!") - with open(python_env_file_path, 'w') as file: - for package in packages: - file.write(f"{package}\n") - - else: - # in case output is dict - env_output = yaml.dump(packages, sort_keys=False) - md5_hash = get_md5_hash(env_output) - python_env_file_path = os.path.join(directory_path, f"python_env_{md5_hash}.yaml") - # create file if it doesn't exists - if not os.path.exists(python_env_file_path): - #print(f"{python_env_file_path} doesn't exists!!") - with open(python_env_file_path, 'w') as file: - file.write(env_output) - - # link the artifact to execution if it exists and creates artifact if it doesn't - self.log_python_env(python_env_file_path) os.chdir(logging_dir) return self.execution @@ -530,99 +539,131 @@ def update_execution( ) return self.execution - def log_dvc_lock(self, file_path: str): - """Used to update the dvc lock file created with dvc run command.""" - print("Entered dvc lock file commit") - return commit_dvc_lock_file(file_path, self.execution.id) + def merge_created_execution( + self, + execution_type: str, + execution_cmd: str, + properties: t.Optional[t.Dict] = None, + custom_properties: t.Optional[t.Dict] = None, + orig_execution_name:str = "", + create_new_execution:bool = True + ) -> mlpb.Execution: + """Merge Created execution. + Every call creates a unique execution. Execution can only be created within a context, so + [create_context][cmflib.cmf.Cmf.create_context] must be called first. + Every call occurs when metadata push or pull is processed. Data from pre-existing executions is used + to create new executions with additional data(Required on cmf-server). + Example: + ```python + # Import CMF + from cmflib.cmf import Cmf + from ml_metadata.proto import metadata_store_pb2 as mlpb + # Create CMF logger + cmf = Cmf(filepath="mlmd", pipeline_name="test_pipeline") + # Create or reuse context for this stage + context: mlmd.proto.Context = cmf.merge_created_context( + pipeline_stage="prepare", + custom_properties ={"user-metadata1": "metadata_value"} + ) + # Create a new execution for this stage run + execution: mlmd.proto.Execution = cmf.merge_created_execution( + execution_type="Prepare", + properties={"Context_Type":""}, + custom_properties = {"split": split, "seed": seed}, + orig_execution_name=execution_name + ) + ``` + Args: + execution_type: Type of the execution.(when create_new_execution is False, this is the name of execution) + properties: Properties of Execution. + custom_properties: Developers can provide key value pairs with additional properties of the execution that + need to be stored. - def log_python_env( - self, - url: str, - ) -> mlpb.Artifact: - "Used to log the python packages involved in the current execution" + cmd: command used to run this execution. - git_repo = git_get_repo() - name = re.split("/", url)[-1] - existing_artifact = [] + create_new_execution:bool = True, This can be used by advanced users to re-use executions + This is applicable, when working with framework code like mmdet, pytorch lightning etc, where the + custom call-backs are used to log metrics. + if create_new_execution is True(Default), execution_type parameter will be used as the name of the execution type. + if create_new_execution is False, if existing execution exist with the same name as execution_type. + it will be reused. + Only executions created with create_new_execution as False will have "name" as a property. 
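A minimal sketch of the environment snapshot written above, assuming `get_python_env()` (from `cmflib.utils.helper_functions`) returns either a list of `pkg==version` strings for pip environments or a dict for conda environments, and that `get_md5_hash` behaves like an MD5 hexdigest over the given string; the helper below reimplements the hashing with `hashlib` so it can run on its own:

```python
import hashlib
import os

import yaml

def write_python_env_file(packages, directory="cmf_artifacts"):
    """Write the captured environment to cmf_artifacts/python_env_<md5>.{txt,yaml}.

    The md5 is taken over the serialized environment, so identical environments
    map to the same file name and are only written once.
    """
    os.makedirs(directory, exist_ok=True)
    if isinstance(packages, list):
        serialized = f"{packages}\n"                       # hashed form (list repr)
        body = "".join(f"{pkg}\n" for pkg in packages)     # stored form, one per line
        suffix = "txt"
    else:
        serialized = yaml.dump(packages, sort_keys=False)  # conda env as YAML
        body = serialized
        suffix = "yaml"
    md5 = hashlib.md5(serialized.encode("utf-8")).hexdigest()
    path = os.path.join(directory, f"python_env_{md5}.{suffix}")
    if not os.path.exists(path):
        with open(path, "w") as fh:
            fh.write(body)
    return path

print(write_python_env_file(["numpy==1.26.4", "pandas==2.2.2"]))
```

The returned path is what `log_python_env` receives, so the captured environment ends up tracked like any other artifact of the execution.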
- commit_output(url, self.execution.id) - c_hash = dvc_get_hash(url) - if c_hash == "": - print("Error in getting the dvc hash,return without logging") - return + Returns: + Execution object from ML Metadata library associated with the execution for this stage. + """ + # Initializing the execution related fields + properties = {} if properties is None else properties + self.metrics = {} + self.input_artifacts = [] + self.execution_label_props = {} + custom_props = {} if custom_properties is None else custom_properties + # print(custom_props) + git_repo = properties.get("Git_Repo", "") + git_start_commit = properties.get("Git_Start_Commit", "") + python_env = properties.get("Python_Env", "") + #name = properties.get("Name", "") + create_new_execution = True + execution_name = execution_type + #exe.name property is passed as the orig_execution_name. + #if name is not an empty string then we are re-using executions + if orig_execution_name != "": + create_new_execution = False + execution_name = orig_execution_name - commit = c_hash - dvc_url = dvc_get_url(url) - dvc_url_with_pipeline = f"{self.parent_context.name}:{dvc_url}" - url = url + ":" + c_hash - if c_hash and c_hash.strip: - existing_artifact.extend(self.store.get_artifacts_by_uri(c_hash)) + self.execution = create_new_execution_in_existing_run_context( + store=self.store, + execution_type_name=execution_type, # Type field when re-using executions + execution_name=execution_name, #Name field if we are re-using executionsname + #Type field , if creating new executions always + context_id=self.child_context.id, + execution=execution_cmd, + pipeline_id=self.parent_context.id, + pipeline_type=self.parent_context.name, + git_repo=git_repo, + git_start_commit=git_start_commit, + python_env=python_env, + custom_properties=custom_props, + create_new_execution=create_new_execution + ) - if existing_artifact and len(existing_artifact) != 0: - existing_artifact = existing_artifact[0] - uri = c_hash - artifact = link_execution_to_artifact( - store=self.store, - execution_id=self.execution.id, - uri=uri, - input_name=url, - event_type=mlpb.Event.Type.INPUT, - ) - else: - uri = c_hash if c_hash and c_hash.strip() else str(uuid.uuid1()) - artifact = create_new_artifact_event_and_attribution( - store=self.store, - execution_id=self.execution.id, - context_id=self.child_context.id, - uri=uri, - name=url, - type_name="Environment", - event_type=mlpb.Event.Type.INPUT, - properties={ - "git_repo": str(git_repo), - # passing c_hash value to commit - "Commit": str(commit), - "url": str(dvc_url_with_pipeline), - }, - artifact_type_properties={ - "git_repo": mlpb.STRING, - "Commit": mlpb.STRING, - "url": mlpb.STRING, - }, - milliseconds_since_epoch=int(time.time() * 1000), - ) - custom_props = {} - custom_props["git_repo"] = git_repo - custom_props["Commit"] = commit - self.execution_label_props["git_repo"] = git_repo - self.execution_label_props["Commit"] = commit + uuids = "" - if self.graph: - self.driver.create_env_node( - name, - url, - uri, - "input", - self.execution.id, - self.parent_context, - custom_props, - ) - self.input_artifacts.append( - { - "Name": name, - "Path": url, - "URI": uri, - "Event": "input", - "Execution_Name": self.execution_name, - "Type": "Environment", - "Execution_Command": self.execution_command, - "Pipeline_Id": self.parent_context.id, - "Pipeline_Name": self.parent_context.name, - } - ) - self.driver.create_execution_links(uri, name, "Environment") - return artifact + uuids = 
self.execution.properties["Execution_uuid"].string_value + if uuids: + self.execution.properties["Execution_uuid"].string_value = uuids +\ + ","+properties["Execution_uuid"] + else: + self.execution.properties["Execution_uuid"].string_value =\ + properties["Execution_uuid"] + + + self.store.put_executions([self.execution]) + self.execution_name = str(self.execution.id) + "," + execution_type + self.execution_command = execution_cmd + for k, v in custom_props.items(): + k = re.sub("-", "_", k) + self.execution_label_props[k] = v + self.execution_label_props["Execution_Name"] = ( + execution_type + ":" + str(self.execution.id) + ) + self.execution_label_props["execution_command"] = execution_cmd + if self.graph: + self.driver.create_execution_node( + self.execution_name, + self.child_context.id, + self.parent_context, + execution_cmd, + self.execution.id, + custom_props, + ) + return self.execution + + def log_dvc_lock(self, file_path: str): + """Used to update the dvc lock file created with dvc run command.""" + print("Entered dvc lock file commit") + return commit_dvc_lock_file(file_path, self.execution.id) def log_dataset( self, @@ -831,103 +872,240 @@ def update_model_url(self, dup_artifact: list, updated_url: str): put_artifact(self.store, dup_art) return dup_artifact - # Add the model to dvc do a git commit and store the commit id in MLMD - def log_model( + def log_dataset_with_version( self, - path: str, + url: str, + version: str, event: str, - model_framework: str = "Default", - model_type: str = "Default", - model_name: str = "Default", + props: t.Optional[t.Dict] = None, custom_properties: t.Optional[t.Dict] = None, ) -> mlpb.Artifact: - """Logs a model. - The model is added to dvc and the metadata file (.dvc) gets committed to git. - Example: - ```python - artifact: mlmd.proto.Artifact= cmf.log_model( - path="path/to/model.pkl", - event="output", - model_framework="SKlearn", - model_type="RandomForestClassifier", - model_name="RandomForestClassifier:default" - ) - ``` - Args: - path: Path to the model file. - event: Takes arguments `INPUT` OR `OUTPUT`. - model_framework: Framework used to create the model. - model_type: Type of model algorithm used. - model_name: Name of the algorithm used. - custom_properties: The model properties. - Returns: - Artifact object from ML Metadata library associated with the new model artifact. + """Logs a dataset when the version (hash) is known. + Example: + ```python + artifact: mlpb.Artifact = cmf.log_dataset_with_version( + url="path/to/dataset", + version="abcdef", + event="output", + props={ "git_repo": "https://github.com/example/repo", + "url": "/path/in/repo", }, + custom_properties={ "custom_key": "custom_value", }, + ) + ``` + Args: + url: Path to the dataset. + version: Hash or version identifier for the dataset. + event: Takes arguments `INPUT` or `OUTPUT`. + props: Optional properties for the dataset (e.g., git_repo, url). + custom_properties: Optional custom properties for the dataset. + Returns: + Artifact object from the ML Protocol Buffers library associated with the new dataset artifact. 
""" - logging_dir = change_dir(self.cmf_init_path) - # Assigning current file name as stage and execution name - current_script = sys.argv[0] - file_name = os.path.basename(current_script) - name_without_extension = os.path.splitext(file_name)[0] - # create context if not already created - if not self.child_context: - self.create_context(pipeline_stage=name_without_extension) - assert self.child_context is not None, f"Failed to create context for {self.pipeline_name}!!" - - # create execution if not already created - if not self.execution: - self.create_execution(execution_type=name_without_extension) - assert self.execution is not None, f"Failed to create execution for {self.pipeline_name}!!" - - - # To Do : Technical Debt. - # If the model already exist , then we just link the existing model to the execution - # We do not update the model properties . - # We need to append the new properties to the existing model properties - if custom_properties is None: - custom_properties = {} + props = {} if props is None else props custom_props = {} if custom_properties is None else custom_properties - # name = re.split('/', path)[-1] + git_repo = props.get("git_repo", "") + name = url event_type = mlpb.Event.Type.OUTPUT existing_artifact = [] + c_hash = version if event.lower() == "input": event_type = mlpb.Event.Type.INPUT - commit_output(path, self.execution.id) - c_hash = dvc_get_hash(path) - - if c_hash == "": - print("Error in getting the dvc hash,return without logging") - return + # dataset_commit = commit_output(url, self.execution.id) - model_commit = c_hash + dataset_commit = version + url = url + ":" + c_hash + if c_hash and c_hash.strip: + existing_artifact.extend(self.store.get_artifacts_by_uri(c_hash)) - # If connecting to an existing artifact - The name of the artifact is - # used as path/steps/key - model_uri = path + ":" + c_hash - dvc_url = dvc_get_url(path, False) - url = dvc_url - url_with_pipeline = f"{self.parent_context.name}:{url}" - uri = "" - if c_hash and c_hash.strip(): - uri = c_hash.strip() - existing_artifact.extend(self.store.get_artifacts_by_uri(uri)) - else: - raise RuntimeError("Model commit failed, Model uri empty") + # To Do - What happens when uri is the same but names are different + if existing_artifact and len(existing_artifact) != 0: + existing_artifact = existing_artifact[0] - if ( - existing_artifact - and len(existing_artifact) != 0 - ): + # Quick fix- Updating only the name + if custom_properties is not None: + self.update_existing_artifact( + existing_artifact, custom_properties) + uri = c_hash # update url for existing artifact - existing_artifact = self.update_model_url( - existing_artifact, url_with_pipeline - ) + self.update_dataset_url(existing_artifact, props.get("url", "")) artifact = link_execution_to_artifact( store=self.store, execution_id=self.execution.id, - uri=c_hash, - input_name=model_uri, + uri=uri, + input_name=url, + event_type=event_type, + ) + else: + # if((existing_artifact and len(existing_artifact )!= 0) and c_hash != ""): + # url = url + ":" + str(self.execution.id) + uri = c_hash if c_hash and c_hash.strip() else str(uuid.uuid1()) + artifact = create_new_artifact_event_and_attribution( + store=self.store, + execution_id=self.execution.id, + context_id=self.child_context.id, + uri=uri, + name=url, + type_name="Dataset", + event_type=event_type, + properties={ + "git_repo": str(git_repo), + "Commit": str(dataset_commit), + "url": props.get("url", " "), + }, + artifact_type_properties={ + "git_repo": mlpb.STRING, + "Commit": 
mlpb.STRING, + "url": mlpb.STRING, + }, + custom_properties=custom_props, + milliseconds_since_epoch=int(time.time() * 1000), + ) + custom_props["git_repo"] = git_repo + custom_props["Commit"] = dataset_commit + self.execution_label_props["git_repo"] = git_repo + self.execution_label_props["Commit"] = dataset_commit + + if self.graph: + self.driver.create_dataset_node( + name, + url, + uri, + event, + self.execution.id, + self.parent_context, + custom_props, + ) + if event.lower() == "input": + self.input_artifacts.append( + { + "Name": name, + "Path": url, + "URI": uri, + "Event": event.lower(), + "Execution_Name": self.execution_name, + "Type": "Dataset", + "Execution_Command": self.execution_command, + "Pipeline_Id": self.parent_context.id, + "Pipeline_Name": self.parent_context.name, + } + ) + self.driver.create_execution_links(uri, name, "Dataset") + else: + child_artifact = { + "Name": name, + "Path": url, + "URI": uri, + "Event": event.lower(), + "Execution_Name": self.execution_name, + "Type": "Dataset", + "Execution_Command": self.execution_command, + "Pipeline_Id": self.parent_context.id, + "Pipeline_Name": self.parent_context.name, + } + self.driver.create_artifact_relationships( + self.input_artifacts, child_artifact, self.execution_label_props + ) + return artifact + + # Add the model to dvc do a git commit and store the commit id in MLMD + def log_model( + self, + path: str, + event: str, + model_framework: str = "Default", + model_type: str = "Default", + model_name: str = "Default", + custom_properties: t.Optional[t.Dict] = None, + ) -> mlpb.Artifact: + """Logs a model. + The model is added to dvc and the metadata file (.dvc) gets committed to git. + Example: + ```python + artifact: mlmd.proto.Artifact= cmf.log_model( + path="path/to/model.pkl", + event="output", + model_framework="SKlearn", + model_type="RandomForestClassifier", + model_name="RandomForestClassifier:default" + ) + ``` + Args: + path: Path to the model file. + event: Takes arguments `INPUT` OR `OUTPUT`. + model_framework: Framework used to create the model. + model_type: Type of model algorithm used. + model_name: Name of the algorithm used. + custom_properties: The model properties. + Returns: + Artifact object from ML Metadata library associated with the new model artifact. + """ + + logging_dir = change_dir(self.cmf_init_path) + # Assigning current file name as stage and execution name + current_script = sys.argv[0] + file_name = os.path.basename(current_script) + name_without_extension = os.path.splitext(file_name)[0] + # create context if not already created + if not self.child_context: + self.create_context(pipeline_stage=name_without_extension) + assert self.child_context is not None, f"Failed to create context for {self.pipeline_name}!!" + + # create execution if not already created + if not self.execution: + self.create_execution(execution_type=name_without_extension) + assert self.execution is not None, f"Failed to create execution for {self.pipeline_name}!!" + + + # To Do : Technical Debt. + # If the model already exist , then we just link the existing model to the execution + # We do not update the model properties . 
+ # We need to append the new properties to the existing model properties + if custom_properties is None: + custom_properties = {} + custom_props = {} if custom_properties is None else custom_properties + # name = re.split('/', path)[-1] + event_type = mlpb.Event.Type.OUTPUT + existing_artifact = [] + if event.lower() == "input": + event_type = mlpb.Event.Type.INPUT + + commit_output(path, self.execution.id) + c_hash = dvc_get_hash(path) + + if c_hash == "": + print("Error in getting the dvc hash,return without logging") + return + + model_commit = c_hash + + # If connecting to an existing artifact - The name of the artifact is + # used as path/steps/key + model_uri = path + ":" + c_hash + dvc_url = dvc_get_url(path, False) + url = dvc_url + url_with_pipeline = f"{self.parent_context.name}:{url}" + uri = "" + if c_hash and c_hash.strip(): + uri = c_hash.strip() + existing_artifact.extend(self.store.get_artifacts_by_uri(uri)) + else: + raise RuntimeError("Model commit failed, Model uri empty") + + if ( + existing_artifact + and len(existing_artifact) != 0 + ): + # update url for existing artifact + existing_artifact = self.update_model_url( + existing_artifact, url_with_pipeline + ) + artifact = link_execution_to_artifact( + store=self.store, + execution_id=self.execution.id, + uri=c_hash, + input_name=model_uri, event_type=event_type, ) model_uri = model_uri + ":" + str(self.execution.id) @@ -960,7 +1138,7 @@ def log_model( custom_properties=custom_props, milliseconds_since_epoch=int(time.time() * 1000), ) - custom_properties["Commit"] = model_commit + # custom_properties["Commit"] = model_commit self.execution_label_props["Commit"] = model_commit #To DO model nodes should be similar to dataset nodes when we create neo4j if self.graph: @@ -1004,6 +1182,226 @@ def log_model( os.chdir(logging_dir) return artifact + # Add the model to dvc do a git commit and store the commit id in MLMD + def log_model_with_version( + self, + path: str, + event: str, + props=None, + custom_properties: t.Optional[t.Dict] = None, + ) -> object: + """Logs a model when the version(hash) is known + The model is added to dvc and the metadata file (.dvc) gets committed to git. + Example: + ```python + artifact: mlmd.proto.Artifact= cmf.log_model_with_version( + path="path/to/model.pkl", + event="output", + props={ + "url": "/home/user/local-storage/bf/629ccd5cd008066b72c04f9a918737", + "model_type": "RandomForestClassifier", + "model_name": "RandomForestClassifier:default", + "Commit": "commit 1146dad8b74cae205db6a3132ea403db1e4032e5", + "model_framework": "SKlearn", + }, + custom_properties={ + "uri": "bf629ccd5cd008066b72c04f9a918737", + }, + + ) + ``` + Args: + path: Path to the model file. + event: Takes arguments `INPUT` OR `OUTPUT`. + props: Model artifact properties. + custom_properties: The model properties. + Returns: + Artifact object from ML Metadata library associated with the new model artifact. 
+ """ + + if custom_properties is None: + custom_properties = {} + custom_props = {} if custom_properties is None else custom_properties + name = re.split("/", path)[-1] + event_type = mlpb.Event.Type.OUTPUT + existing_artifact = [] + if event.lower() == "input": + event_type = mlpb.Event.Type.INPUT + + # props["commit"] = "" # To do get from incoming data + c_hash = props.get("uri", " ") + # If connecting to an existing artifact - The name of the artifact is used as path/steps/key + model_uri = path + ":" + c_hash + # dvc_url = dvc_get_url(path, False) + url = props.get("url", "") + # uri = "" + if c_hash and c_hash.strip(): + uri = c_hash.strip() + existing_artifact.extend(self.store.get_artifacts_by_uri(uri)) + else: + raise RuntimeError("Model commit failed, Model uri empty") + + if ( + existing_artifact + and len(existing_artifact) != 0 + ): + # update url for existing artifact + existing_artifact = self.update_model_url(existing_artifact, url) + artifact = link_execution_to_artifact( + store=self.store, + execution_id=self.execution.id, + uri=c_hash, + input_name=model_uri, + event_type=event_type, + ) + model_uri = artifact.name + else: + uri = c_hash if c_hash and c_hash.strip() else str(uuid.uuid1()) + model_uri = model_uri + ":" + str(self.execution.id) + artifact = create_new_artifact_event_and_attribution( + store=self.store, + execution_id=self.execution.id, + context_id=self.child_context.id, + uri=uri, + name=model_uri, + type_name="Model", + event_type=event_type, + properties={ + "model_framework": props.get("model_framework", ""), + "model_type": props.get("model_type", ""), + "model_name": props.get("model_name", ""), + "Commit": props.get("Commit", ""), + "url": str(url), + }, + artifact_type_properties={ + "model_framework": mlpb.STRING, + "model_type": mlpb.STRING, + "model_name": mlpb.STRING, + "Commit": mlpb.STRING, + "url": mlpb.STRING, + }, + custom_properties=custom_props, + milliseconds_since_epoch=int(time.time() * 1000), + ) + # custom_properties["Commit"] = model_commit + # custom_props["url"] = url + self.execution_label_props["Commit"] = props.get("Commit", "") + if self.graph: + self.driver.create_model_node( + model_uri, + uri, + event, + self.execution.id, + self.parent_context, + custom_props, + ) + if event.lower() == "input": + self.input_artifacts.append( + { + "Name": model_uri, + "URI": uri, + "Event": event.lower(), + "Execution_Name": self.execution_name, + "Type": "Model", + "Execution_Command": self.execution_command, + "Pipeline_Id": self.parent_context.id, + "Pipeline_Name": self.parent_context.name, + } + ) + self.driver.create_execution_links(uri, model_uri, "Model") + else: + child_artifact = { + "Name": model_uri, + "URI": uri, + "Event": event.lower(), + "Execution_Name": self.execution_name, + "Type": "Model", + "Execution_Command": self.execution_command, + "Pipeline_Id": self.parent_context.id, + "Pipeline_Name": self.parent_context.name, + } + self.driver.create_artifact_relationships( + self.input_artifacts, child_artifact, self.execution_label_props + ) + + return artifact + + def log_execution_metrics_from_client(self, metrics_name: str, + custom_properties: t.Optional[t.Dict] = None) -> mlpb.Artifact: + """ Logs execution metrics from a client. + Data from pre-existing metrics from client side is used to create identical metrics on server side. 
+ Example: + ```python + artifact: mlpb.Artifact = cmf.log_execution_metrics_from_client( + metrics_name="example_metrics:uri:123", + custom_properties={"custom_key": "custom_value"}, + ) + ``` + Args: + metrics_name: Name of the metrics in the format "name:uri:execution_id". + custom_properties: Optional custom properties for the metrics. + Returns: + Artifact object from the ML Protocol Buffers library associated with the metrics artifact. + """ + + metrics = None + custom_props = {} if custom_properties is None else custom_properties + existing_artifact = [] + name_tokens = metrics_name.split(":") + if name_tokens and len(name_tokens) > 2: + name = name_tokens[0] + uri = name_tokens[1] + execution_id = name_tokens[2] + else: + print(f"Error : metrics name {metrics_name} is not in the correct format") + return + + #we need to add the execution id to the metrics name + new_metrics_name = f"{name}:{uri}:{str(self.execution.id)}" + existing_artifacts = self.store.get_artifacts_by_uri(uri) + + existing_artifact = existing_artifacts[0] if existing_artifacts else None + if not existing_artifact or \ + ((existing_artifact) and not + (existing_artifact.name == new_metrics_name)): #we need to add the artifact otherwise its already there + metrics = create_new_artifact_event_and_attribution( + store=self.store, + execution_id=self.execution.id, + context_id=self.child_context.id, + uri=uri, + name=new_metrics_name, + type_name="Metrics", + event_type=mlpb.Event.Type.OUTPUT, + properties={"metrics_name": metrics_name}, + artifact_type_properties={"metrics_name": mlpb.STRING}, + custom_properties=custom_props, + milliseconds_since_epoch=int(time.time() * 1000), + ) + if self.graph: + # To do create execution_links + self.driver.create_metrics_node( + metrics_name, + uri, + "output", + self.execution.id, + self.parent_context, + custom_props, + ) + child_artifact = { + "Name": metrics_name, + "URI": uri, + "Event": "output", + "Execution_Name": self.execution_name, + "Type": "Metrics", + "Execution_Command": self.execution_command, + "Pipeline_Id": self.parent_context.id, + "Pipeline_Name": self.parent_context.name, + } + self.driver.create_artifact_relationships( + self.input_artifacts, child_artifact, self.execution_label_props + ) + return metrics + def log_execution_metrics( self, metrics_name: str, custom_properties: t.Optional[t.Dict] = None @@ -1189,12 +1587,8 @@ def commit_metrics(self, metrics_name: str): custom_properties=custom_props, milliseconds_since_epoch=int(time.time() * 1000), ) - - custom_props["Commit"] = metrics_commit - self.execution_label_props["Commit"] = metrics_commit - if self.graph: - self.driver.create_step_metrics_node( + self.driver.create_metrics_node( name, uri, "output", @@ -1207,7 +1601,7 @@ def commit_metrics(self, metrics_name: str): "URI": uri, "Event": "output", "Execution_Name": self.execution_name, - "Type": "Step_Metrics", + "Type": "Metrics", "Execution_Command": self.execution_command, "Pipeline_Id": self.parent_context.id, } @@ -1218,6 +1612,79 @@ def commit_metrics(self, metrics_name: str): os.chdir(logging_dir) return metrics + def commit_existing_metrics(self, metrics_name: str, uri: str, props: t.Optional[t.Dict] = None, custom_properties: t.Optional[t.Dict] = None): + """ + Commits existing metrics associated with the given URI to MLMD. + Example: + ```python + artifact: mlpb.Artifact = cmf.commit_existing_metrics("existing_metrics", "abc123", + {"custom_key": "custom_value"}) + ``` + Args: + metrics_name: Name of the metrics. 
+ uri: Unique identifier associated with the metrics. + custom_properties: Optional custom properties for the metrics. + Returns: + Artifact object from the ML Protocol Buffers library associated with the existing metrics artifact. + """ + + custom_props = {} if custom_properties is None else custom_properties + c_hash = uri.strip() + existing_artifact = [] + existing_artifact.extend(self.store.get_artifacts_by_uri(c_hash)) + if (existing_artifact + and len(existing_artifact) != 0 ): + metrics = link_execution_to_artifact( + store=self.store, + execution_id=self.execution.id, + uri=c_hash, + input_name=metrics_name, + event_type=mlpb.Event.Type.OUTPUT, + ) + else: + metrics = create_new_artifact_event_and_attribution( + store=self.store, + execution_id=self.execution.id, + context_id=self.child_context.id, + uri=uri, + name=metrics_name, + type_name="Step_Metrics", + event_type=mlpb.Event.Type.OUTPUT, + properties={ + # passing uri value to commit + "Commit": props.get("Commit", ""), + "url": props.get("url", ""), + }, + artifact_type_properties={ + "Commit": mlpb.STRING, + "url": mlpb.STRING, + }, + custom_properties=custom_props, + milliseconds_since_epoch=int(time.time() * 1000), + ) + if self.graph: + self.driver.create_metrics_node( + metrics_name, + uri, + "output", + self.execution.id, + self.parent_context, + custom_props, + ) + child_artifact = { + "Name": metrics_name, + "URI": uri, + "Event": "output", + "Execution_Name": self.execution_name, + "Type": "Metrics", + "Execution_Command": self.execution_command, + "Pipeline_Id": self.parent_context.id, + } + self.driver.create_artifact_relationships( + self.input_artifacts, child_artifact, self.execution_label_props + ) + return metrics + def log_validation_output( self, version: str, custom_properties: t.Optional[t.Dict] = None @@ -1435,6 +1902,12 @@ def commit(self, custom_properties: t.Optional[t.Dict] = None) -> None: input_name=dataslice_path + ":" + c_hash, ) else: + props={ + "git_repo": str(git_repo), + # passing c_hash value to commit + "Commit": str(dataslice_commit), + "url": str(dvc_url_with_pipeline), + }, slice = create_new_artifact_event_and_attribution( store=self.writer.store, execution_id=self.writer.execution.id, @@ -1457,18 +1930,58 @@ def commit(self, custom_properties: t.Optional[t.Dict] = None) -> None: custom_properties=custom_props, milliseconds_since_epoch=int(time.time() * 1000), ) - - custom_props["git_repo"] = git_repo - custom_props["Commit"] = dataslice_commit - self.writer.execution_label_props["git_repo"] = git_repo - self.writer.execution_label_props["Commit"] = dataslice_commit if self.writer.graph: self.writer.driver.create_dataslice_node( - self.name, dataslice_path + ":" + c_hash, c_hash, self.data_parent, custom_props + self.name, dataslice_path + ":" + c_hash, c_hash, self.data_parent, props ) os.chdir(logging_dir) return slice + # commit existing dataslice to server + def commit_existing(self, uri: str, props: t.Optional[t.Dict] = None, custom_properties: t.Optional[t.Dict] = None) -> None: + custom_props = {} if custom_properties is None else custom_properties + c_hash = uri.strip() + dataslice_commit = c_hash + existing_artifact = [] + if c_hash and c_hash.strip(): + existing_artifact.extend( + self.writer.store.get_artifacts_by_uri(c_hash)) + if existing_artifact and len(existing_artifact) != 0: + print("Adding to existing data slice") + # Haven't added event type in this if cond, is it not needed?? 
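A usage sketch for the server-side `DataSlice.commit_existing`, assuming the slice object is obtained through the usual `create_dataslice` call on the logger (not shown in this diff) and that the hash and repository details arrive from the client; all values are hypothetical:

```python
dataslice = cmf.create_dataslice("slice-1")
dataslice.commit_existing(
    uri="adf342dd5cd008066b72c04f9a918737",
    props={"git_repo": "git@github.com:user/repo.git",
           "Commit": "1146dad8b74cae205db6a3132ea403db1e4032e5",
           "url": "Test-env:cmf_artifacts/dataslice/slice-1"},
)
```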
+ slice = link_execution_to_input_artifact( + store=self.writer.store, + execution_id=self.writer.execution.id, + uri=c_hash, + input_name=self.name, + ) + else: + slice = create_new_artifact_event_and_attribution( + store=self.writer.store, + execution_id=self.writer.execution.id, + context_id=self.writer.child_context.id, + uri=c_hash, + name=self.name, + type_name="Dataslice", + event_type=mlpb.Event.Type.OUTPUT, + properties={ + "git_repo": props.get("git_repo", ""), + "Commit": props.get("Commit", ""), + "url": props.get("url", " "), + }, + artifact_type_properties={ + "git_repo": mlpb.STRING, + "Commit": mlpb.STRING, + "url": mlpb.STRING, + }, + custom_properties=custom_properties, + milliseconds_since_epoch=int(time.time() * 1000), + ) + if self.writer.graph: + self.writer.driver.create_dataslice_node( + self.name, self.name, c_hash, self.data_parent, custom_properties + ) + return slice # """Temporary code""" @@ -1483,27 +1996,370 @@ def commit(self, custom_properties: t.Optional[t.Dict] = None) -> None: # print(last) # os.symlink(str(index), slicedir + "/ " + last) -# these are cmf logging api needed for server and defined in cmf_server_methods.py file - -Cmf.merge_created_context = merge_created_context -Cmf.merge_created_execution = merge_created_execution -Cmf.log_python_env_on_server = log_python_env_on_server -Cmf.log_dataset_with_version = log_dataset_with_version -Cmf.log_model_with_version = log_model_with_version -Cmf.log_execution_metrics_from_client = log_execution_metrics_from_client -Cmf.commit_existing_metrics = commit_existing_metrics -#log_metrics_from_client -Cmf.DataSlice.commit_existing = commit_existing -# log_dataslice_from_client - -Cmf.metadata_push = metadata_push -Cmf.metadata_pull = metadata_pull -Cmf.metadata_export = metadata_export -Cmf.artifact_pull = artifact_pull -Cmf.artifact_pull_single = artifact_pull_single -Cmf.artifact_push = artifact_push -Cmf.cmf_init_show = cmf_init_show -Cmf.pipeline_list = pipeline_list -Cmf.execution_list = execution_list -Cmf.artifact_list = artifact_list +def metadata_push(pipeline_name: str, filepath = "./mlmd", tensorboard_path: str = "", execution_id: str = ""): + """ Pushes MLMD file to CMF-server. + Example: + ```python + result = metadata_push("example_pipeline", "mlmd_file", "3") + ``` + Args: + pipeline_name: Name of the pipeline. + filepath: Path to the MLMD file. + execution_id: Optional execution ID. + tensorboard_path: Path to tensorboard logs. + + Returns: + Response output from the _metadata_push function. + """ + # Required arguments: pipeline_name + # Optional arguments: Execution_ID, filepath (mlmd file path, tensorboard_path + output = _metadata_push(pipeline_name, filepath, execution_id, tensorboard_path) + return output + +def metadata_pull(pipeline_name: str, filepath = "./mlmd", execution_id: str = ""): + """ Pulls MLMD file from CMF-server. + Example: + ```python + result = metadata_pull("example_pipeline", "./mlmd_directory", "execution_123") + ``` + Args: + pipeline_name: Name of the pipeline. + filepath: File path to store the MLMD file. + execution_id: Optional execution ID. + Returns: + Message from the _metadata_pull function. + """ + # Required arguments: pipeline_name + #Optional arguments: Execution_ID, filepath(file path to store mlmd file) + output = _metadata_pull(pipeline_name, filepath, execution_id) + return output + +def metadata_export(pipeline_name: str, jsonfilepath: str = "", filepath = "./mlmd"): + """ Export local mlmd's metadata in json format to a json file. 
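The module-level wrappers above mirror the `cmf metadata` CLI. A usage sketch, assuming a configured cmf-server and an `mlmd` file produced by a prior run (pipeline name and paths are hypothetical):

```python
from cmflib import cmf

# Push the pipeline's metadata to the cmf-server, then pull it into a fresh copy.
cmf.metadata_push(pipeline_name="Test-env", filepath="./mlmd")
cmf.metadata_pull(pipeline_name="Test-env", filepath="./mlmd_pulled")
```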
+ Example: + ```python + result = metadata_pull("example_pipeline", "./jsonfile", "./mlmd_directory") + ``` + Args: + pipeline_name: Name of the pipeline. + jsonfilepath: File path of json file. + filepath: File path to store the MLMD file. + Returns: + Message from the _metadata_pull function. + """ + # Required arguments: pipeline_name + #Optional arguments: jsonfilepath, filepath(file path to store mlmd file) + output = _metadata_export(pipeline_name, jsonfilepath, filepath) + return output + +def artifact_pull(pipeline_name: str, filepath = "./mlmd"): + """ Pulls artifacts from the initialized repository. + + Example: + ```python + result = artifact_pull("example_pipeline", "./mlmd_directory") + ``` + + Args: + pipeline_name: Name of the pipeline. + filepath: Path to store artifacts. + Returns: + Output from the _artifact_pull function. + """ + + # Required arguments: Pipeline_name + # Optional arguments: filepath( path to store artifacts) + output = _artifact_pull(pipeline_name, filepath) + return output + +def artifact_pull_single(pipeline_name: str, filepath: str, artifact_name: str): + """ Pulls a single artifact from the initialized repository. + Example: + ```python + result = artifact_pull_single("example_pipeline", "./mlmd_directory", "example_artifact") + ``` + Args: + pipeline_name: Name of the pipeline. + filepath: Path to store the artifact. + artifact_name: Name of the artifact. + Returns: + Output from the _artifact_pull_single function. + """ + + # Required arguments: Pipeline_name + # Optional arguments: filepath( path to store artifacts), artifact_name + output = _artifact_pull_single(pipeline_name, filepath, artifact_name) + return output + +def artifact_push(pipeline_name: str, filepath = "./mlmd"): + """ Pushes artifacts to the initialized repository. + + Example: + ```python + result = artifact_push("example_pipeline", "./mlmd_directory") + ``` + Args: + pipeline_name: Name of the pipeline. + filepath: Path to store the artifact. + Returns: + Output from the _artifact_push function. + """ + + output = _artifact_push(pipeline_name, filepath) + return output + +def cmf_init_show(): + """ Initializes and shows details of the CMF command. + Example: + ```python + result = cmf_init_show() + ``` + Returns: + Output from the _cmf_cmd_init function. + """ + + output=_cmf_cmd_init() + return output + +def cmf_init(type: str = "", + path: str = "", + git_remote_url: str = "", + cmf_server_url: str = "", + neo4j_user: str = "", + neo4j_password: str = "", + neo4j_uri: str = "", + url: str = "", + endpoint_url: str = "", + access_key_id: str = "", + secret_key: str = "", + session_token: str = "", + user: str = "", + password: str = "", + port: int = 0, + osdf_path: str = "", + key_id: str = "", + key_path: str = "", + key_issuer: str = "", + ): + + """ Initializes the CMF configuration based on the provided parameters. + Example: + ```python + cmf_init( type="local", + path="/path/to/re", + git_remote_url="git@github.com:user/repo.git", + cmf_server_url="http://cmf-server" + neo4j_user", + neo4j_password="password", + neo4j_uri="bolt://localhost:76" + ) + ``` + Args: + type: Type of repository ("local", "minioS3", "amazonS3", "sshremote") + path: Path for the local repository. + git_remote_url: Git remote URL for version control. + cmf_server_url: CMF server URL. + neo4j_user: Neo4j database username. + neo4j_password: Neo4j database password. + neo4j_uri: Neo4j database URI. + url: URL for MinioS3 or AmazonS3. + endpoint_url: Endpoint URL for MinioS3. 
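A usage sketch for `cmf_init` with the `local` repository type; all values are placeholders, and the three neo4j settings may be omitted together if a graph store is not used:

```python
from cmflib.cmf import cmf_init

cmf_init(
    type="local",
    path="/home/user/local-storage",
    git_remote_url="git@github.com:user/experiment-repo.git",
    cmf_server_url="http://127.0.0.1:8080",
    neo4j_user="neo4j",
    neo4j_password="password",
    neo4j_uri="bolt://localhost:7687",
)
```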
+ access_key_id: Access key ID for MinioS3 or AmazonS3. + secret_key: Secret key for MinioS3 or AmazonS3. + session_token: Session token for AmazonS3. + user: SSH remote username. + password: SSH remote password. + port: SSH remote port + Returns: + Output based on the initialized repository type. + """ + + if type == "": + return print("Error: Type is not provided") + if type not in ["local","minioS3","amazonS3","sshremote","osdfremote"]: + return print("Error: Type value is undefined"+ " "+type+".Expected: "+",".join(["local","minioS3","amazonS3","sshremote","osdfremote"])) + + if neo4j_user != "" and neo4j_password != "" and neo4j_uri != "": + pass + elif neo4j_user == "" and neo4j_password == "" and neo4j_uri == "": + pass + else: + return print("Error: Enter all neo4j parameters.") + + args={'path': path, + 'git_remote_url': git_remote_url, + 'url': url, + 'endpoint_url': endpoint_url, + 'access_key_id': access_key_id, + 'secret_key': secret_key, + 'session_token': session_token, + 'user': user, + 'password': password, + 'osdf_path': osdf_path, + 'key_id': key_id, + 'key_path': key_path, + 'key-issuer': key_issuer, + } + + status_args=non_related_args(type, args) + + if type == "local" and path != "" and git_remote_url != "" : + """Initialize local repository""" + output = _init_local( + path, git_remote_url, cmf_server_url, neo4j_user, neo4j_password, neo4j_uri + ) + if status_args != []: + print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") + return output + + elif type == "minioS3" and url != "" and endpoint_url != "" and access_key_id != "" and secret_key != "" and git_remote_url != "": + """Initialize minioS3 repository""" + output = _init_minioS3( + url, + endpoint_url, + access_key_id, + secret_key, + git_remote_url, + cmf_server_url, + neo4j_user, + neo4j_password, + neo4j_uri, + ) + if status_args != []: + print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") + return output + + elif type == "amazonS3" and url != "" and access_key_id != "" and secret_key != "" and git_remote_url != "": + """Initialize amazonS3 repository""" + output = _init_amazonS3( + url, + access_key_id, + secret_key, + session_token, + git_remote_url, + cmf_server_url, + neo4j_user, + neo4j_password, + neo4j_uri, + ) + if status_args != []: + print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") + + return output + + elif type == "sshremote" and path != "" and user != "" and port != 0 and password != "" and git_remote_url != "": + """Initialize sshremote repository""" + output = _init_sshremote( + path, + user, + port, + password, + git_remote_url, + cmf_server_url, + neo4j_user, + neo4j_password, + neo4j_uri, + ) + if status_args != []: + print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") + + return output + + elif type == "osdfremote" and osdf_path != "" and key_id != "" and key_path != 0 and key_issuer != "" and git_remote_url != "": + """Initialize osdfremote repository""" + output = _init_osdfremote( + osdf_path, + key_id, + key_path, + key_issuer, + git_remote_url, + cmf_server_url, + neo4j_user, + neo4j_password, + neo4j_uri, + ) + if status_args != []: + print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") + + return output + + else: + print("Error: Enter all arguments") + + +def non_related_args(type : str, args : dict): + available_args=[i for i, j in args.items() if j != ""] + local=["path", "git_remote_url"] + 
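A worked example of the argument screening performed by `non_related_args`: every argument that was supplied but is not in the allowed list for the chosen repository type is reported back so `cmf_init` can warn about it. The values below are hypothetical:

```python
supplied = {
    "path": "/home/user/local-storage",
    "git_remote_url": "git@github.com:user/repo.git",
    "url": "",                   # not used for type="local"
    "access_key_id": "AKIA...",  # passed by mistake
    "secret_key": "",
}
available_args = [arg for arg, value in supplied.items() if value != ""]
allowed_for_local = {"path", "git_remote_url"}
extra = sorted(set(available_args) - allowed_for_local)
print(extra)  # ['access_key_id'] -> "There are non-related arguments: access_key_id..."
```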
minioS3=["url", "endpoint_url", "access_key_id", "secret_key", "git_remote_url"]
+    amazonS3=["url", "access_key_id", "secret_key", "session_token", "git_remote_url"]
+    sshremote=["path", "user", "port", "password", "git_remote_url"]
+    osdfremote=["osdf_path", "key_id", "key_path", "key-issuer", "git_remote_url"]
+
+
+    dict_repository_args={"local" : local, "minioS3" : minioS3, "amazonS3" : amazonS3, "sshremote" : sshremote, "osdfremote" : osdfremote}
+
+    for repo,arg in dict_repository_args.items():
+        if repo == type:
+            non_related_args=list(set(available_args)-set(dict_repository_args[repo]))
+    return non_related_args
+
+
+def pipeline_list(filepath = "./mlmd"):
+    """ Display the list of pipelines in the current mlmd file.
+
+    Example:
+    ```python
+    result = pipeline_list("./mlmd_directory")
+    ```
+
+    Args:
+        filepath: File path to store the MLMD file.
+    Returns:
+        Output from the _pipeline_list function.
+    """
+
+    # Optional arguments: filepath (path to store the MLMD file)
+    output = _pipeline_list(filepath)
+    return output
+
+
+def execution_list(pipeline_name: str, filepath = "./mlmd", execution_id: str = "", long = True):
+    """ Display the list of executions for the given pipeline.
+    Example:
+    ```python
+    result = execution_list("example_pipeline", "./mlmd_directory", "example_execution_id", True)
+    ```
+    Args:
+        pipeline_name: Name of the pipeline.
+        filepath: Path to store the mlmd file.
+        execution_id: Show only the execution with this execution id.
+        long: Detailed summary regarding execution.
+    Returns:
+        Output from the _execution_list function.
+    """
+
+    # Required arguments: pipeline_name
+    # Optional arguments: filepath (path to store mlmd file), execution_id, long
+    output = _execution_list(pipeline_name, filepath, execution_id, long)
+    return output
+
+
+def artifact_list(pipeline_name: str, filepath = "./mlmd", artifact_name: str = "", long = True):
+    """ Display the list of artifacts for the given pipeline.
+    Example:
+    ```python
+    result = artifact_list("example_pipeline", "./mlmd_directory", "example_artifact_name", True)
+    ```
+    Args:
+        pipeline_name: Name of the pipeline.
+        filepath: Path to store the mlmd file.
+        artifact_name: Show only artifacts with this artifact name.
+        long: Detailed summary regarding artifact.
+    Returns:
+        Output from the _artifact_list function.
+    """
+    # Required arguments: pipeline_name
+    # Optional arguments: filepath (path to store mlmd file), artifact_name, long
+    output = _artifact_list(pipeline_name, filepath, artifact_name, long)
+    return output
diff --git a/cmflib/cmf_commands_wrapper.py b/cmflib/cmf_commands_wrapper.py
index 80d92752..a5d9a420 100644
--- a/cmflib/cmf_commands_wrapper.py
+++ b/cmflib/cmf_commands_wrapper.py
@@ -17,23 +17,7 @@
 
 from cmflib import cli
 
-def metadata_push(pipeline_name: str, filepath = "./mlmd", tensorboard_path: str = "", execution_id: str = ""):
-    """ Pushes MLMD file to CMF-server.
-    Example:
-    ```python
-    result = metadata_push("example_pipeline", "mlmd_file", "3")
-    ```
-    Args:
-        pipeline_name: Name of the pipeline.
-        filepath: Path to the MLMD file.
-        tensorboard_path: Path to tensorboard logs.
-        execution_id: Optional execution ID.
- - Returns: - Pending - """ - # Required arguments: pipeline_name - # Optional arguments: Execution_ID, filepath (mlmd file path, tensorboard_path) +def _metadata_push(pipeline_name, file_name, execution_id, tensorboard): cli_args = cli.parse_args( [ "metadata", @@ -41,11 +25,11 @@ def metadata_push(pipeline_name: str, filepath = "./mlmd", tensorboard_path: str "-p", pipeline_name, "-f", - filepath, + file_name, "-e", execution_id, "-t", - tensorboard_path + tensorboard ] ) cmd = cli_args.func(cli_args) @@ -53,97 +37,50 @@ def metadata_push(pipeline_name: str, filepath = "./mlmd", tensorboard_path: str print(msg) return msg - -def metadata_pull(pipeline_name: str, filepath = "./mlmd", execution_id: str = ""): - """ Pulls MLMD file from CMF-server. - Example: - ```python - result = metadata_pull("example_pipeline", "./mlmd_directory", "execution_123") - ``` - Args: - pipeline_name: Name of the pipeline. - filepath: File path to store the MLMD file. - execution_id: Optional execution ID. - Returns: - Pending - """ - # Required arguments: pipeline_name - #Optional arguments: Execution_ID, filepath(file path to store mlmd file) +def _metadata_pull(pipeline_name, file_name, execution_id): cli_args = cli.parse_args( - [ - "metadata", - "pull", - "-p", - pipeline_name, - "-f", - filepath, - "-e", - execution_id, - ] - ) + [ + "metadata", + "pull", + "-p", + pipeline_name, + "-f", + file_name, + "-e", + execution_id, + ] + ) cmd = cli_args.func(cli_args) msg = cmd.do_run() print(msg) - # i don't understand why we are both printing and returning the output - return msg -def metadata_export(pipeline_name: str, jsonfilepath: str = "", filepath = "./mlmd"): - """ Export local mlmd's metadata in json format to a json file. - Example: - ```python - result = metadata_pull("example_pipeline", "./jsonfile", "./mlmd_directory") - ``` - Args: - pipeline_name: Name of the pipeline. - jsonfilepath: File path of json file. - filepath: File path to store the MLMD file. - Returns: - Pending - """ - # Required arguments: pipeline_name - #Optional arguments: jsonfilepath, filepath(file path to store mlmd file) +def _metadata_export(pipeline_name, json_file_name, file_name): cli_args = cli.parse_args( - [ - "metadata", - "export", - "-p", - pipeline_name, - "-j", - jsonfilepath, - "-f", - filepath, - ] - ) + [ + "metadata", + "export", + "-p", + pipeline_name, + "-j", + json_file_name, + "-f", + file_name, + ] + ) cmd = cli_args.func(cli_args) msg = cmd.do_run() print(msg) return msg -def artifact_pull(pipeline_name: str, filepath = "./mlmd"): - """ Pulls artifacts from the initialized repository. - - Example: - ```python - result = artifact_pull("example_pipeline", "./mlmd_directory") - ``` - - Args: - pipeline_name: Name of the pipeline. - filepath: Path to store artifacts. - Returns: - Pending - """ - - # Required arguments: Pipeline_name - # Optional arguments: filepath( path to store artifacts) +def _artifact_push(pipeline_name, file_name): cli_args = cli.parse_args( [ "artifact", - "pull", + "push", "-p", pipeline_name, "-f", - filepath, + file_name, ] ) cmd = cli_args.func(cli_args) @@ -152,21 +89,8 @@ def artifact_pull(pipeline_name: str, filepath = "./mlmd"): return msg -def artifact_pull_single(pipeline_name: str, filepath: str, artifact_name: str): - """ Pulls a single artifact from the initialized repository. - Example: - ```python - result = artifact_pull_single("example_pipeline", "./mlmd_directory", "example_artifact") - ``` - Args: - pipeline_name: Name of the pipeline. 
- filepath: Path to store the artifact. - artifact_name: Name of the artifact. - Returns: - Pending - """ - # Required arguments: Pipeline_name - # Optional arguments: filepath( path to store artifacts), artifact_name +def _artifact_pull(pipeline_name, file_name): + cli_args = cli.parse_args( [ "artifact", @@ -174,9 +98,7 @@ def artifact_pull_single(pipeline_name: str, filepath: str, artifact_name: str): "-p", pipeline_name, "-f", - filepath, - "-a", - artifact_name, + file_name, ] ) cmd = cli_args.func(cli_args) @@ -184,28 +106,17 @@ def artifact_pull_single(pipeline_name: str, filepath: str, artifact_name: str): print(msg) return msg - -def artifact_push(pipeline_name: str, filepath = "./mlmd"): - """ Pushes artifacts to the initialized repository. - - Example: - ```python - result = artifact_push("example_pipeline", "./mlmd_directory") - ``` - Args: - pipeline_name: Name of the pipeline. - filepath: Path to store the artifact. - Returns: - Pending - """ +def _artifact_pull_single(pipeline_name, file_name, artifact_name): cli_args = cli.parse_args( [ "artifact", - "push", + "pull", "-p", pipeline_name, "-f", - filepath, + file_name, + "-a", + artifact_name, ] ) cmd = cli_args.func(cli_args) @@ -214,15 +125,7 @@ def artifact_push(pipeline_name: str, filepath = "./mlmd"): return msg -def cmf_init_show(): - """ Initializes and shows details of the CMF command. - Example: - ```python - result = cmf_init_show() - ``` - Returns: - Pending - """ +def _cmf_cmd_init(): cli_args = cli.parse_args( [ "init", @@ -234,92 +137,8 @@ def cmf_init_show(): print(msg) return msg - -def cmf_init(type: str = "", - path: str = "", - git_remote_url: str = "", - cmf_server_url: str = "", - neo4j_user: str = "", - neo4j_password: str = "", - neo4j_uri: str = "", - url: str = "", - endpoint_url: str = "", - access_key_id: str = "", - secret_key: str = "", - session_token: str = "", - user: str = "", - password: str = "", - port: int = 0, - osdf_path: str = "", - key_id: str = "", - key_path: str = "", - key_issuer: str = "", - ): - - """ Initializes the CMF configuration based on the provided parameters. - Example: - ```python - cmf_init( type="local", - path="/path/to/re", - git_remote_url="git@github.com:user/repo.git", - cmf_server_url="http://cmf-server" - neo4j_user", - neo4j_password="password", - neo4j_uri="bolt://localhost:76" - ) - ``` - Args: - type: Type of repository ("local", "minioS3", "amazonS3", "sshremote") - path: Path for the local repository. - git_remote_url: Git remote URL for version control. - cmf_server_url: CMF server URL. - neo4j_user: Neo4j database username. - neo4j_password: Neo4j database password. - neo4j_uri: Neo4j database URI. - url: URL for MinioS3 or AmazonS3. - endpoint_url: Endpoint URL for MinioS3. - access_key_id: Access key ID for MinioS3 or AmazonS3. - secret_key: Secret key for MinioS3 or AmazonS3. - session_token: Session token for AmazonS3. - user: SSH remote username. - password: SSH remote password. - port: SSH remote port - Returns: - Output based on the initialized repository type. 
- """ - - if type == "": - return print("Error: Type is not provided") - if type not in ["local","minioS3","amazonS3","sshremote","osdfremote"]: - return print("Error: Type value is undefined"+ " "+type+".Expected: "+",".join(["local","minioS3","amazonS3","sshremote","osdfremote"])) - - if neo4j_user != "" and neo4j_password != "" and neo4j_uri != "": - pass - elif neo4j_user == "" and neo4j_password == "" and neo4j_uri == "": - pass - else: - return print("Error: Enter all neo4j parameters.") - - args={'path': path, - 'git_remote_url': git_remote_url, - 'url': url, - 'endpoint_url': endpoint_url, - 'access_key_id': access_key_id, - 'secret_key': secret_key, - 'session_token': session_token, - 'user': user, - 'password': password, - 'osdf_path': osdf_path, - 'key_id': key_id, - 'key_path': key_path, - 'key-issuer': key_issuer, - } - - status_args=non_related_args(type, args) - - if type == "local" and path != "" and git_remote_url != "" : - """Initialize local repository""" - cli_args = cli.parse_args( +def _init_local(path, git_remote_url, cmf_server_url, neo4j_user, neo4j_password, neo4j_uri): + cli_args = cli.parse_args( [ "init", "local", @@ -337,16 +156,14 @@ def cmf_init(type: str = "", neo4j_uri ] ) - cmd = cli_args.func(cli_args) - msg = cmd.do_run() - print(msg) - if status_args != []: - print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") - return msg - - elif type == "minioS3" and url != "" and endpoint_url != "" and access_key_id != "" and secret_key != "" and git_remote_url != "": - """Initialize minioS3 repository""" - cli_args = cli.parse_args( + cmd = cli_args.func(cli_args) + msg = cmd.do_run() + print(msg) + return msg + + +def _init_minioS3(url, endpoint_url, access_key_id, secret_key, git_remote_url, cmf_server_url, neo4j_user, neo4j_password, neo4j_uri): + cli_args = cli.parse_args( [ "init", "minioS3", @@ -370,16 +187,13 @@ def cmf_init(type: str = "", neo4j_uri ] ) - cmd = cli_args.func(cli_args) - msg = cmd.do_run() - print(msg) - if status_args != []: - print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") - return msg - - elif type == "amazonS3" and url != "" and access_key_id != "" and secret_key != "" and git_remote_url != "": - """Initialize amazonS3 repository""" - cli_args = cli.parse_args( + cmd = cli_args.func(cli_args) + msg = cmd.do_run() + print(msg) + return msg + +def _init_amazonS3(url, access_key_id, secret_key, session_token, git_remote_url, cmf_server_url, neo4j_user, neo4j_password, neo4j_uri): + cli_args = cli.parse_args( [ "init", "amazonS3", @@ -403,18 +217,13 @@ def cmf_init(type: str = "", neo4j_uri ] ) - cmd = cli_args.func(cli_args) - msg = cmd.do_run() - print(msg) - - if status_args != []: - print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") - - return msg + cmd = cli_args.func(cli_args) + msg = cmd.do_run() + print(msg) + return msg - elif type == "sshremote" and path != "" and user != "" and port != 0 and password != "" and git_remote_url != "": - """Initialize sshremote repository""" - cli_args = cli.parse_args( +def _init_sshremote(path,user, port, password, git_remote_url, cmf_server_url, neo4j_user, neo4j_password, neo4j_uri): + cli_args = cli.parse_args( [ "init", "sshremote", @@ -438,17 +247,13 @@ def cmf_init(type: str = "", neo4j_uri ] ) - cmd = cli_args.func(cli_args) - msg = cmd.do_run() - print(msg) - if status_args != []: - print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") - - return 
msg + cmd = cli_args.func(cli_args) + msg = cmd.do_run() + print(msg) + return msg - elif type == "osdfremote" and osdf_path != "" and key_id != "" and key_path != 0 and key_issuer != "" and git_remote_url != "": - """Initialize osdfremote repository""" - cli_args = cli.parse_args( +def _init_osdfremote(path, key_id, key_path, key_issuer, git_remote_url, cmf_server_url, neo4j_user, neo4j_password, neo4j_uri): + cli_args = cli.parse_args( [ "init", "osdf", @@ -472,56 +277,24 @@ def cmf_init(type: str = "", neo4j_uri ] ) - cmd = cli_args.func(cli_args) - msg = cmd.do_run() - print(msg) - if status_args != []: - print("There are non-related arguments: "+",".join(status_args)+".Please remove them.") - - return msg - - else: - print("Error: Enter all arguments") - - -def non_related_args(type : str, args : dict): - available_args=[i for i, j in args.items() if j != ""] - local=["path", "git_remote_url"] - minioS3=["url", "endpoint_url", "access_key_id", "secret_key", "git_remote_url"] - amazonS3=["url", "access_key_id", "secret_key", "session_token", "git_remote_url"] - sshremote=["path", "user", "port", "password", "git_remote_url"] - osdfremote=["osdf_path", "key_id", "key_path", "key-issuer", "git_remote_url"] - - - dict_repository_args={"local" : local, "minioS3" : minioS3, "amazonS3" : amazonS3, "sshremote" : sshremote} + cmd = cli_args.func(cli_args) + msg = cmd.do_run() + print(msg) + return msg - for repo,arg in dict_repository_args.items(): - if repo ==type: - non_related_args=list(set(available_args)-set(dict_repository_args[repo])) - return non_related_args - - -def pipeline_list(filepath = "./mlmd"): - """ Display list of pipline for current mlmd. - - Example: - ```python - result = _pipeline_list("./mlmd_directory") - ``` - - Args: - filepath: File path to store the MLMD file. - Returns: - Pending - """ - - # Optional arguments: filepath( path to store the MLMD file) +def _artifact_list(pipeline_name, file_name, artifact_name, long): cli_args = cli.parse_args( [ - "pipeline", + "artifact", "list", + "-p", + pipeline_name, "-f", - filepath + file_name, + "-a", + artifact_name, + "-l", + long ] ) cmd = cli_args.func(cli_args) @@ -529,35 +302,13 @@ def pipeline_list(filepath = "./mlmd"): print(msg) return msg - -def execution_list(pipeline_name: str, filepath = "./mlmd", execution_id: str = "", long = True): - """ Display list of execution for given pipeline. - Example: - ```python - result = _execution_list("example_pipeline", "./mlmd_directory", "example_execution_id", "long") - ``` - Args: - pipeline_name: Name of the pipeline. - filepath: Path to store the mlmd file. - execution_id: Executions for particular execution id. - long: Detailed summary regarding execution. - Returns: - Pending - """ - # Required arguments: pipeline_name - # Optional arguments: filepath( path to store mlmd file), execution_id, long +def _pipeline_list(file_name): cli_args = cli.parse_args( [ - "execution", + "pipeline", "list", - "-p", - pipeline_name, "-f", - filepath, - "-e", - execution_id, - "-l", - long + file_name ] ) cmd = cli_args.func(cli_args) @@ -565,33 +316,17 @@ def execution_list(pipeline_name: str, filepath = "./mlmd", execution_id: str = print(msg) return msg - -def artifact_list(pipeline_name: str, filepath = "./mlmd", artifact_name: str = "", long = True): - """ Display list of artifact for given pipeline. - Example: - ```python - result = _artifact_list("example_pipeline", "./mlmd_directory", "example_artifact_name", "long") - ``` - Args: - pipeline_name: Name of the pipeline. 
- filepath: Path to store the mlmd file. - artifact_name: Artifacts for particular artifact name. - long: Detailed summary regarding artifact. - Returns: - Pending - """ - # Required arguments: pipeline_name - # Optional arguments: filepath( path to store mlmd file), artifact_name, long +def _execution_list(pipeline_name, file_name, execution_id, long): cli_args = cli.parse_args( [ - "artifact", + "execution", "list", "-p", pipeline_name, "-f", - filepath, - "-a", - artifact_name, + file_name, + "-e", + execution_id, "-l", long ] @@ -599,4 +334,5 @@ def artifact_list(pipeline_name: str, filepath = "./mlmd", artifact_name: str = cmd = cli_args.func(cli_args) msg = cmd.do_run() print(msg) - return msg \ No newline at end of file + return msg + diff --git a/cmflib/cmf_merger.py b/cmflib/cmf_merger.py index bb5b6d85..2f828a5e 100644 --- a/cmflib/cmf_merger.py +++ b/cmflib/cmf_merger.py @@ -135,11 +135,11 @@ def parse_json_to_mlmd(mlmd_json, path_to_store: str, cmd: str, exec_id: Union[s cmf_class.log_execution_metrics_from_client(event["artifact"]["name"], custom_props) elif artifact_type == "Dataslice": dataslice = cmf_class.create_dataslice(event["artifact"]["name"]) - dataslice.commit_existing(uri, custom_props) + dataslice.log_dataslice_from_client(uri, custom_props) elif artifact_type == "Step_Metrics": - cmf_class.commit_existing_metrics(event["artifact"]["name"], uri, custom_props) + cmf_class.log_metrics_from_client(event["artifact"]["name"], uri, custom_props) elif artifact_type == "Environment": - cmf_class.log_python_env_on_server(artifact_name, uri, props) + cmf_class.log_python_env_from_client(artifact_name, uri, props) else: pass except AlreadyExistsError as e: diff --git a/cmflib/cmf_server_methods.py b/cmflib/cmf_server_methods.py index 2518f461..f60a387c 100644 --- a/cmflib/cmf_server_methods.py +++ b/cmflib/cmf_server_methods.py @@ -195,7 +195,7 @@ def merge_created_execution( return self.execution -def log_python_env_on_server( +def log_python_env_from_client( self, url: str, uri: str, @@ -641,12 +641,12 @@ def log_execution_metrics_from_client(self, metrics_name: str, -def commit_existing_metrics(self, metrics_name: str, uri: str, props: t.Optional[t.Dict] = None, custom_properties: t.Optional[t.Dict] = None): +def log_metrics_from_client(self, metrics_name: str, uri: str, props: t.Optional[t.Dict] = None, custom_properties: t.Optional[t.Dict] = None): """ Commits existing metrics associated with the given URI to MLMD. 
Example: ```python - artifact: mlpb.Artifact = cmf.commit_existing_metrics("existing_metrics", "abc123", + artifact: mlpb.Artifact = cmf.log_metrics_from_client("existing_metrics", "abc123", {"custom_key": "custom_value"}) ``` Args: @@ -720,7 +720,7 @@ def commit_existing_metrics(self, metrics_name: str, uri: str, props: t.Optional return metrics # commit existing dataslice to server -def commit_existing(self, uri: str, props: t.Optional[t.Dict] = None, custom_properties: t.Optional[t.Dict] = None) -> None: +def log_dataslice_from_client(self, uri: str, props: t.Optional[t.Dict] = None, custom_properties: t.Optional[t.Dict] = None) -> None: custom_props = {} if custom_properties is None else custom_properties c_hash = uri.strip() dataslice_commit = c_hash diff --git a/cmflib/graph_wrapper.py b/cmflib/graph_wrapper.py index 6ebd8ed5..8d73d29c 100644 --- a/cmflib/graph_wrapper.py +++ b/cmflib/graph_wrapper.py @@ -76,7 +76,6 @@ def create_dataset_node(self, name: str, path: str, uri: str, event: str, execut custom_properties=None): if custom_properties is None: custom_properties = {} - print("custom_properties = ", custom_properties) pipeline_id = pipeline_context.id pipeline_name = pipeline_context.name dataset_syntax = self._create_dataset_syntax( @@ -218,10 +217,10 @@ def create_execution_links( parent_execution_query = "MATCH (n:{}".format( parent_artifact_type) + "{uri: '" + parent_artifact_uri + "'}) " \ - "<-[:output]-(f:Execution) Return ID(f)as id, f.uri as uri" + "<-[:output]-(f:Execution) Return ELEMENTID(f) as id, f.uri as uri" already_linked_execution_query = "MATCH (f)-[r:linked]->(e2:Execution) " \ - "WHERE r.uri = '{}' RETURN ID(f)as id, f.uri as uri".format(parent_artifact_uri) + "WHERE r.uri = '{}' RETURN ELEMENTID(f) as id, f.uri as uri".format(parent_artifact_uri) with self.driver.session() as session: execution_parent = session.read_transaction( @@ -252,7 +251,7 @@ def create_execution_links( def _get_node(self, node_label: str, node_name: str)->int: #Match(n:Metrics) where n.Name contains 'metrics_1' return n search_syntax = "MATCH (n:{}) where '{}' in n.Name \ - return ID(n) as node_id".format(node_label, node_name) + return ELEMENTID(n) as node_id".format(node_label, node_name) print(search_syntax) node_id = None with self.driver.session() as session: @@ -265,7 +264,7 @@ def _get_node(self, node_label: str, node_name: str)->int: def _get_node_with_path(self, node_label: str, node_path: str)->int: #Match(n:Metrics) where n.Path contains 'metrics_1' return n search_syntax = "MATCH (n:{}) where '{}' in n.Path \ - return ID(n) as node_id".format(node_label, node_path) + return ELEMENTID(n) as node_id".format(node_label, node_path) print(search_syntax) node_id = None with self.driver.session() as session: @@ -294,7 +293,7 @@ def _create_pipeline_syntax(name: str, props: t.Dict, uri: int) -> str: k = re.sub('\W+', '', k) syntax_str = syntax_str + k + ":" + "\"" + v + "\"" + "," syntax_str = syntax_str.rstrip(syntax_str[-1]) - syntax_str = syntax_str + "}) RETURN ID(a) as node_id" + syntax_str = syntax_str + "}) RETURN ELEMENTID(a) as node_id" return syntax_str # Todo - Verify what is considered as unique node . is it a combination of @@ -315,7 +314,7 @@ def _create_dataset_syntax(name: str, path: str, uri: str, pipeline_id: int, pip " = coalesce([x in a." 
+ k + " where x <>\"" + str(v) + "\"], []) + \"" + str(v) + "\"," syntax_str = syntax_str + props_str syntax_str = syntax_str.rstrip(",") - syntax_str = syntax_str + " RETURN ID(a) as node_id" + syntax_str = syntax_str + " RETURN ELEMENTID(a) as node_id" return syntax_str @staticmethod @@ -333,7 +332,7 @@ def _create_env_syntax(name: str, path: str, uri: str, pipeline_id: int, pipelin " = coalesce([x in a." + k + " where x <>\"" + str(v) + "\"], []) + \"" + str(v) + "\"," syntax_str = syntax_str + props_str syntax_str = syntax_str.rstrip(",") - syntax_str = syntax_str + " RETURN ID(a) as node_id" + syntax_str = syntax_str + " RETURN ELEMENTID(a) as node_id" return syntax_str @staticmethod @@ -349,7 +348,7 @@ def _create_dataslice_syntax(name: str, path: str, uri: str, " = coalesce([x in a." + k + " where x <>\"" + str(v) + "\"], []) + \"" + str(v) + "\"," syntax_str = syntax_str + props_str syntax_str = syntax_str.rstrip(",") - syntax_str = syntax_str + " RETURN ID(a) as node_id" + syntax_str = syntax_str + " RETURN ELEMENTID(a) as node_id" return syntax_str @staticmethod @@ -365,7 +364,7 @@ def _create_model_syntax(name: str, uri: str, pipeline_id: int, pipeline_name: s #syntax_str = syntax_str + k + ":" + "\"" + str(v) + "\"" + "," syntax_str = syntax_str + props_str syntax_str = syntax_str.rstrip(",") - syntax_str = syntax_str + " RETURN ID(a) as node_id" + syntax_str = syntax_str + " RETURN ELEMENTID(a) as node_id" return syntax_str @staticmethod @@ -382,7 +381,7 @@ def _create_metrics_syntax(name: str, uri: str, event: str, execution_id: int, p syntax_str = syntax_str + k + ":" + "\"" + str(v) + "\"" + "," syntax_str = syntax_str.rstrip(syntax_str[-1]) syntax_str = syntax_str + "})" - syntax_str = syntax_str + " RETURN ID(a) as node_id" + syntax_str = syntax_str + " RETURN ELEMENTID(a) as node_id" return syntax_str @staticmethod @@ -399,7 +398,7 @@ def _create_step_metrics_syntax(name: str, uri: str, event: str, execution_id: i syntax_str = syntax_str + k + ":" + "\"" + str(v) + "\"" + "," syntax_str = syntax_str.rstrip(syntax_str[-1]) syntax_str = syntax_str + "})" - syntax_str = syntax_str + " RETURN ID(a) as node_id" + syntax_str = syntax_str + " RETURN ELEMENTID(a) as node_id" return syntax_str @staticmethod @@ -414,13 +413,13 @@ def _create_stage_syntax(name: str, props: t.Dict, uri: int, pipeline_id: int, p syntax_str = syntax_str + k + ":" + "\"" + str(v) + "\"" + "," syntax_str = syntax_str.rstrip(syntax_str[-1]) - syntax_str = syntax_str + "}) RETURN ID(a) as node_id" + syntax_str = syntax_str + "}) RETURN ELEMENTID(a) as node_id" return syntax_str @staticmethod def _create_parent_child_syntax(parent_label: str, child_label: str, parent_id: int, child_id: int, relation: str): - parent_child_syntax = "MATCH (a:{}), (b:{}) where ID(a) = {} AND ID(b) = {} MERGE (a)-[r:{}]->(b) \ + parent_child_syntax = "MATCH (a:{}), (b:{}) where ELEMENTID(a) = '{}' AND ELEMENTID(b) = '{}' MERGE (a)-[r:{}]->(b) \ return type(r)".format(parent_label, child_label, parent_id, child_id, relation) return parent_child_syntax @@ -428,10 +427,10 @@ def _create_parent_child_syntax(parent_label: str, child_label: str, parent_id: def _create_execution_artifacts_link_syntax(parent_label: str, child_label: str, parent_id: int, child_id: int, relation: str): if relation.lower() == "input": - parent_child_syntax = "MATCH (a:{}), (b:{}) where ID(a) = {} AND ID(b) = {} MERGE (a)<-[r:{}]-(b) \ + parent_child_syntax = "MATCH (a:{}), (b:{}) where ELEMENTID(a) = '{}' AND ELEMENTID(b) = '{}' MERGE (a)<-[r:{}]-(b) \ 
return type(r)".format(parent_label, child_label, parent_id, child_id, relation) else: - parent_child_syntax = "MATCH (a:{}), (b:{}) where ID(a) = {} AND ID(b) = {} MERGE (a)-[r:{}]->(b) \ + parent_child_syntax = "MATCH (a:{}), (b:{}) where ELEMENTID(a) = '{}' AND ELEMENTID(b) = '{}' MERGE (a)-[r:{}]->(b) \ return type(r)".format(parent_label, child_label, parent_id, child_id, relation) return parent_child_syntax @@ -448,7 +447,7 @@ def _create_execution_link_syntax(parent_label: str, child_label: str, parent_ur CREATE (a)-[r:RELTYPE]->(b) RETURN type(r) """ - parent_child_syntax_1 = "MATCH (a:{}), (b:{}) WHERE a.uri = '{}' AND ID(a) = {} AND ID(b) = {} ".format( + parent_child_syntax_1 = "MATCH (a:{}), (b:{}) WHERE a.uri = '{}' AND ELEMENTID(a) = '{}' AND ELEMENTID(b) = '{}' ".format( parent_label, child_label, parent_uri, parent_id, child_id) parent_child_syntax_2 = "MERGE (a)-[r:{}".format(relation) parent_child_syntax_3 = "{" @@ -502,5 +501,5 @@ def _create_execution_syntax(name: str, command: str, props: t.Dict, uri: int, p syntax_str = syntax_str + k + ":" + "\"" + v + "\"" + "," syntax_str = syntax_str.rstrip(syntax_str[-1]) - syntax_str = syntax_str + "}) RETURN ID(a) as node_id" + syntax_str = syntax_str + "}) RETURN ELEMENTID(a) as node_id" return syntax_str diff --git a/server/app/main.py b/server/app/main.py index 30df9c5a..2a116157 100644 --- a/server/app/main.py +++ b/server/app/main.py @@ -325,6 +325,8 @@ async def artifact_types(request: Request): # checks if mlmd file exists on server if os.path.exists(server_store_path): artifact_types = await async_api(get_artifact_types, server_store_path) + if "Environment" in artifact_types: + artifact_types.remove("Environment") return artifact_types else: artifact_types = "" From 834a9530eef80ac03ea894d8c1532ecf27e444a5 Mon Sep 17 00:00:00 2001 From: Varkha Sharma Date: Thu, 21 Nov 2024 09:50:27 -0800 Subject: [PATCH 10/15] re-added changes in cmf.py --- cmflib/cmf.py | 797 ++++++++++---------------------------------------- 1 file changed, 150 insertions(+), 647 deletions(-) diff --git a/cmflib/cmf.py b/cmflib/cmf.py index cb653dd6..7ccb7eb8 100644 --- a/cmflib/cmf.py +++ b/cmflib/cmf.py @@ -20,6 +20,7 @@ import re import os import sys +import yaml import pandas as pd import typing as t @@ -54,7 +55,18 @@ link_execution_to_input_artifact, ) from cmflib.utils.cmf_config import CmfConfig -from cmflib.utils.helper_functions import get_python_env, change_dir +from cmflib.utils.helper_functions import get_python_env, change_dir, get_md5_hash +from cmflib.cmf_server_methods import ( + merge_created_context, + merge_created_execution, + log_python_env_from_client, + log_dataset_with_version, + log_model_with_version, + log_execution_metrics_from_client, + log_metrics_from_client, + log_dataslice_from_client, +) + from cmflib.cmf_commands_wrapper import ( _metadata_push, _metadata_pull, @@ -103,16 +115,9 @@ class Cmf: """ # pylint: disable=too-many-instance-attributes - # Reading CONFIG_FILE variable - cmf_config = os.environ.get("CONFIG_FILE", ".cmfconfig") ARTIFACTS_PATH = "cmf_artifacts" DATASLICE_PATH = "dataslice" METRICS_PATH = "metrics" - if os.path.exists(cmf_config): - attr_dict = CmfConfig.read_config(cmf_config) - __neo4j_uri = attr_dict.get("neo4j-uri", "") - __neo4j_password = attr_dict.get("neo4j-password", "") - __neo4j_user = attr_dict.get("neo4j-user", "") def __init__( self, @@ -170,6 +175,7 @@ def __init__( ) os.chdir(logging_dir) + # function used to load neo4j params for cmf client @staticmethod def 
__load_neo4j_params(): cmf_config = os.environ.get("CONFIG_FILE", ".cmfconfig") @@ -179,7 +185,7 @@ def __load_neo4j_params(): Cmf.__neo4j_password = attr_dict.get("neo4j-password", "") Cmf.__neo4j_user = attr_dict.get("neo4j-user", "") - + # function used to load neo4j params for cmf-server @staticmethod def __get_neo4j_server_config(): Cmf.__neo4j_uri = os.getenv('NEO4J_URI', "") @@ -276,47 +282,7 @@ def create_context( ) return ctx - def merge_created_context( - self, pipeline_stage: str, custom_properties: t.Optional[t.Dict] = None - ) -> mlpb.Context: - """Merge created context. - Every call creates a unique pipeline stage. - Created for metadata push purpose. - Example: - - ```python - #Create context - # Import CMF - from cmflib.cmf import Cmf - from ml_metadata.proto import metadata_store_pb2 as mlpb - # Create CMF logger - cmf = Cmf(filepath="mlmd", pipeline_name="test_pipeline") - # Create context - context: mlmd.proto.Context = cmf.merge_created_context( - pipeline_stage="Test-env/prepare", - custom_properties ={"user-metadata1": "metadata_value"} - ``` - Args: - Pipeline_stage: Pipeline_Name/Stage_name. - custom_properties: Developers can provide key value pairs with additional properties of the execution that - need to be stored. - Returns: - Context object from ML Metadata library associated with the new context for this stage. - """ - - custom_props = {} if custom_properties is None else custom_properties - ctx = get_or_create_run_context( - self.store, pipeline_stage, custom_props) - self.child_context = ctx - associate_child_to_parent_context( - store=self.store, parent_context=self.parent_context, child_context=ctx - ) - if self.graph: - self.driver.create_stage_node( - pipeline_stage, self.parent_context, ctx.id, custom_props - ) - return ctx - + def update_context( self, type_name: str, @@ -462,6 +428,34 @@ def create_execution( self.execution.id, custom_props, ) + + directory_path = self.ARTIFACTS_PATH + os.makedirs(directory_path, exist_ok=True) + packages = get_python_env() + if isinstance(packages, list): + output = f"{packages}\n" + md5_hash = get_md5_hash(output) + python_env_file_path = os.path.join(directory_path, f"python_env_{md5_hash}.txt") + # create file if it doesn't exists + if not os.path.exists(python_env_file_path): + #print(f"{python_env_file_path} doesn't exists!!") + with open(python_env_file_path, 'w') as file: + for package in packages: + file.write(f"{package}\n") + + else: + # in case output is dict + env_output = yaml.dump(packages, sort_keys=False) + md5_hash = get_md5_hash(env_output) + python_env_file_path = os.path.join(directory_path, f"python_env_{md5_hash}.yaml") + # create file if it doesn't exists + if not os.path.exists(python_env_file_path): + #print(f"{python_env_file_path} doesn't exists!!") + with open(python_env_file_path, 'w') as file: + file.write(env_output) + + # link the artifact to execution if it exists and creates artifact if it doesn't + self.log_python_env(python_env_file_path) os.chdir(logging_dir) return self.execution @@ -539,126 +533,95 @@ def update_execution( ) return self.execution - def merge_created_execution( - self, - execution_type: str, - execution_cmd: str, - properties: t.Optional[t.Dict] = None, - custom_properties: t.Optional[t.Dict] = None, - orig_execution_name:str = "", - create_new_execution:bool = True - ) -> mlpb.Execution: - """Merge Created execution. - Every call creates a unique execution. 
Execution can only be created within a context, so - [create_context][cmflib.cmf.Cmf.create_context] must be called first. - Every call occurs when metadata push or pull is processed. Data from pre-existing executions is used - to create new executions with additional data(Required on cmf-server). - Example: - ```python - # Import CMF - from cmflib.cmf import Cmf - from ml_metadata.proto import metadata_store_pb2 as mlpb - # Create CMF logger - cmf = Cmf(filepath="mlmd", pipeline_name="test_pipeline") - # Create or reuse context for this stage - context: mlmd.proto.Context = cmf.merge_created_context( - pipeline_stage="prepare", - custom_properties ={"user-metadata1": "metadata_value"} - ) - # Create a new execution for this stage run - execution: mlmd.proto.Execution = cmf.merge_created_execution( - execution_type="Prepare", - properties={"Context_Type":""}, - custom_properties = {"split": split, "seed": seed}, - orig_execution_name=execution_name - ) - ``` - Args: - execution_type: Type of the execution.(when create_new_execution is False, this is the name of execution) - properties: Properties of Execution. - custom_properties: Developers can provide key value pairs with additional properties of the execution that - need to be stored. - - cmd: command used to run this execution. + def log_python_env( + self, + url: str, + ) -> mlpb.Artifact: + "Used to log the python packages involved in the current execution" - create_new_execution:bool = True, This can be used by advanced users to re-use executions - This is applicable, when working with framework code like mmdet, pytorch lightning etc, where the - custom call-backs are used to log metrics. - if create_new_execution is True(Default), execution_type parameter will be used as the name of the execution type. - if create_new_execution is False, if existing execution exist with the same name as execution_type. - it will be reused. - Only executions created with create_new_execution as False will have "name" as a property. + git_repo = git_get_repo() + name = re.split("/", url)[-1] + existing_artifact = [] + commit_output(url, self.execution.id) + c_hash = dvc_get_hash(url) - Returns: - Execution object from ML Metadata library associated with the execution for this stage. - """ - # Initializing the execution related fields - properties = {} if properties is None else properties - self.metrics = {} - self.input_artifacts = [] - self.execution_label_props = {} - custom_props = {} if custom_properties is None else custom_properties - # print(custom_props) - git_repo = properties.get("Git_Repo", "") - git_start_commit = properties.get("Git_Start_Commit", "") - python_env = properties.get("Python_Env", "") - #name = properties.get("Name", "") - create_new_execution = True - execution_name = execution_type - #exe.name property is passed as the orig_execution_name. 
- #if name is not an empty string then we are re-using executions - if orig_execution_name != "": - create_new_execution = False - execution_name = orig_execution_name + if c_hash == "": + print("Error in getting the dvc hash,return without logging") + return - self.execution = create_new_execution_in_existing_run_context( - store=self.store, - execution_type_name=execution_type, # Type field when re-using executions - execution_name=execution_name, #Name field if we are re-using executionsname - #Type field , if creating new executions always - context_id=self.child_context.id, - execution=execution_cmd, - pipeline_id=self.parent_context.id, - pipeline_type=self.parent_context.name, - git_repo=git_repo, - git_start_commit=git_start_commit, - python_env=python_env, - custom_properties=custom_props, - create_new_execution=create_new_execution - ) + commit = c_hash + dvc_url = dvc_get_url(url) + dvc_url_with_pipeline = f"{self.parent_context.name}:{dvc_url}" + url = url + ":" + c_hash + if c_hash and c_hash.strip: + existing_artifact.extend(self.store.get_artifacts_by_uri(c_hash)) - uuids = "" + if existing_artifact and len(existing_artifact) != 0: + existing_artifact = existing_artifact[0] + uri = c_hash + artifact = link_execution_to_artifact( + store=self.store, + execution_id=self.execution.id, + uri=uri, + input_name=url, + event_type=mlpb.Event.Type.INPUT, + ) + else: + uri = c_hash if c_hash and c_hash.strip() else str(uuid.uuid1()) + artifact = create_new_artifact_event_and_attribution( + store=self.store, + execution_id=self.execution.id, + context_id=self.child_context.id, + uri=uri, + name=url, + type_name="Environment", + event_type=mlpb.Event.Type.INPUT, + properties={ + "git_repo": str(git_repo), + # passing c_hash value to commit + "Commit": str(commit), + "url": str(dvc_url_with_pipeline), + }, + artifact_type_properties={ + "git_repo": mlpb.STRING, + "Commit": mlpb.STRING, + "url": mlpb.STRING, + }, + milliseconds_since_epoch=int(time.time() * 1000), + ) + custom_props = {} + custom_props["git_repo"] = git_repo + custom_props["Commit"] = commit + self.execution_label_props["git_repo"] = git_repo + self.execution_label_props["Commit"] = commit - uuids = self.execution.properties["Execution_uuid"].string_value - if uuids: - self.execution.properties["Execution_uuid"].string_value = uuids +\ - ","+properties["Execution_uuid"] - else: - self.execution.properties["Execution_uuid"].string_value =\ - properties["Execution_uuid"] + if self.graph: + self.driver.create_env_node( + name, + url, + uri, + "input", + self.execution.id, + self.parent_context, + custom_props, + ) + self.input_artifacts.append( + { + "Name": name, + "Path": url, + "URI": uri, + "Event": "input", + "Execution_Name": self.execution_name, + "Type": "Environment", + "Execution_Command": self.execution_command, + "Pipeline_Id": self.parent_context.id, + "Pipeline_Name": self.parent_context.name, + } + ) + self.driver.create_execution_links(uri, name, "Environment") + return artifact - - self.store.put_executions([self.execution]) - self.execution_name = str(self.execution.id) + "," + execution_type - self.execution_command = execution_cmd - for k, v in custom_props.items(): - k = re.sub("-", "_", k) - self.execution_label_props[k] = v - self.execution_label_props["Execution_Name"] = ( - execution_type + ":" + str(self.execution.id) - ) - self.execution_label_props["execution_command"] = execution_cmd - if self.graph: - self.driver.create_execution_node( - self.execution_name, - self.child_context.id, - 
self.parent_context, - execution_cmd, - self.execution.id, - custom_props, - ) - return self.execution def log_dvc_lock(self, file_path: str): """Used to update the dvc lock file created with dvc run command.""" @@ -872,142 +835,6 @@ def update_model_url(self, dup_artifact: list, updated_url: str): put_artifact(self.store, dup_art) return dup_artifact - def log_dataset_with_version( - self, - url: str, - version: str, - event: str, - props: t.Optional[t.Dict] = None, - custom_properties: t.Optional[t.Dict] = None, - ) -> mlpb.Artifact: - """Logs a dataset when the version (hash) is known. - Example: - ```python - artifact: mlpb.Artifact = cmf.log_dataset_with_version( - url="path/to/dataset", - version="abcdef", - event="output", - props={ "git_repo": "https://github.com/example/repo", - "url": "/path/in/repo", }, - custom_properties={ "custom_key": "custom_value", }, - ) - ``` - Args: - url: Path to the dataset. - version: Hash or version identifier for the dataset. - event: Takes arguments `INPUT` or `OUTPUT`. - props: Optional properties for the dataset (e.g., git_repo, url). - custom_properties: Optional custom properties for the dataset. - Returns: - Artifact object from the ML Protocol Buffers library associated with the new dataset artifact. - """ - - props = {} if props is None else props - custom_props = {} if custom_properties is None else custom_properties - git_repo = props.get("git_repo", "") - name = url - event_type = mlpb.Event.Type.OUTPUT - existing_artifact = [] - c_hash = version - if event.lower() == "input": - event_type = mlpb.Event.Type.INPUT - - # dataset_commit = commit_output(url, self.execution.id) - - dataset_commit = version - url = url + ":" + c_hash - if c_hash and c_hash.strip: - existing_artifact.extend(self.store.get_artifacts_by_uri(c_hash)) - - # To Do - What happens when uri is the same but names are different - if existing_artifact and len(existing_artifact) != 0: - existing_artifact = existing_artifact[0] - - # Quick fix- Updating only the name - if custom_properties is not None: - self.update_existing_artifact( - existing_artifact, custom_properties) - uri = c_hash - # update url for existing artifact - self.update_dataset_url(existing_artifact, props.get("url", "")) - artifact = link_execution_to_artifact( - store=self.store, - execution_id=self.execution.id, - uri=uri, - input_name=url, - event_type=event_type, - ) - else: - # if((existing_artifact and len(existing_artifact )!= 0) and c_hash != ""): - # url = url + ":" + str(self.execution.id) - uri = c_hash if c_hash and c_hash.strip() else str(uuid.uuid1()) - artifact = create_new_artifact_event_and_attribution( - store=self.store, - execution_id=self.execution.id, - context_id=self.child_context.id, - uri=uri, - name=url, - type_name="Dataset", - event_type=event_type, - properties={ - "git_repo": str(git_repo), - "Commit": str(dataset_commit), - "url": props.get("url", " "), - }, - artifact_type_properties={ - "git_repo": mlpb.STRING, - "Commit": mlpb.STRING, - "url": mlpb.STRING, - }, - custom_properties=custom_props, - milliseconds_since_epoch=int(time.time() * 1000), - ) - custom_props["git_repo"] = git_repo - custom_props["Commit"] = dataset_commit - self.execution_label_props["git_repo"] = git_repo - self.execution_label_props["Commit"] = dataset_commit - - if self.graph: - self.driver.create_dataset_node( - name, - url, - uri, - event, - self.execution.id, - self.parent_context, - custom_props, - ) - if event.lower() == "input": - self.input_artifacts.append( - { - "Name": name, - "Path": 
url, - "URI": uri, - "Event": event.lower(), - "Execution_Name": self.execution_name, - "Type": "Dataset", - "Execution_Command": self.execution_command, - "Pipeline_Id": self.parent_context.id, - "Pipeline_Name": self.parent_context.name, - } - ) - self.driver.create_execution_links(uri, name, "Dataset") - else: - child_artifact = { - "Name": name, - "Path": url, - "URI": uri, - "Event": event.lower(), - "Execution_Name": self.execution_name, - "Type": "Dataset", - "Execution_Command": self.execution_command, - "Pipeline_Id": self.parent_context.id, - "Pipeline_Name": self.parent_context.name, - } - self.driver.create_artifact_relationships( - self.input_artifacts, child_artifact, self.execution_label_props - ) - return artifact # Add the model to dvc do a git commit and store the commit id in MLMD def log_model( @@ -1138,7 +965,7 @@ def log_model( custom_properties=custom_props, milliseconds_since_epoch=int(time.time() * 1000), ) - # custom_properties["Commit"] = model_commit + custom_properties["Commit"] = model_commit self.execution_label_props["Commit"] = model_commit #To DO model nodes should be similar to dataset nodes when we create neo4j if self.graph: @@ -1182,226 +1009,6 @@ def log_model( os.chdir(logging_dir) return artifact - # Add the model to dvc do a git commit and store the commit id in MLMD - def log_model_with_version( - self, - path: str, - event: str, - props=None, - custom_properties: t.Optional[t.Dict] = None, - ) -> object: - """Logs a model when the version(hash) is known - The model is added to dvc and the metadata file (.dvc) gets committed to git. - Example: - ```python - artifact: mlmd.proto.Artifact= cmf.log_model_with_version( - path="path/to/model.pkl", - event="output", - props={ - "url": "/home/user/local-storage/bf/629ccd5cd008066b72c04f9a918737", - "model_type": "RandomForestClassifier", - "model_name": "RandomForestClassifier:default", - "Commit": "commit 1146dad8b74cae205db6a3132ea403db1e4032e5", - "model_framework": "SKlearn", - }, - custom_properties={ - "uri": "bf629ccd5cd008066b72c04f9a918737", - }, - - ) - ``` - Args: - path: Path to the model file. - event: Takes arguments `INPUT` OR `OUTPUT`. - props: Model artifact properties. - custom_properties: The model properties. - Returns: - Artifact object from ML Metadata library associated with the new model artifact. 
- """ - - if custom_properties is None: - custom_properties = {} - custom_props = {} if custom_properties is None else custom_properties - name = re.split("/", path)[-1] - event_type = mlpb.Event.Type.OUTPUT - existing_artifact = [] - if event.lower() == "input": - event_type = mlpb.Event.Type.INPUT - - # props["commit"] = "" # To do get from incoming data - c_hash = props.get("uri", " ") - # If connecting to an existing artifact - The name of the artifact is used as path/steps/key - model_uri = path + ":" + c_hash - # dvc_url = dvc_get_url(path, False) - url = props.get("url", "") - # uri = "" - if c_hash and c_hash.strip(): - uri = c_hash.strip() - existing_artifact.extend(self.store.get_artifacts_by_uri(uri)) - else: - raise RuntimeError("Model commit failed, Model uri empty") - - if ( - existing_artifact - and len(existing_artifact) != 0 - ): - # update url for existing artifact - existing_artifact = self.update_model_url(existing_artifact, url) - artifact = link_execution_to_artifact( - store=self.store, - execution_id=self.execution.id, - uri=c_hash, - input_name=model_uri, - event_type=event_type, - ) - model_uri = artifact.name - else: - uri = c_hash if c_hash and c_hash.strip() else str(uuid.uuid1()) - model_uri = model_uri + ":" + str(self.execution.id) - artifact = create_new_artifact_event_and_attribution( - store=self.store, - execution_id=self.execution.id, - context_id=self.child_context.id, - uri=uri, - name=model_uri, - type_name="Model", - event_type=event_type, - properties={ - "model_framework": props.get("model_framework", ""), - "model_type": props.get("model_type", ""), - "model_name": props.get("model_name", ""), - "Commit": props.get("Commit", ""), - "url": str(url), - }, - artifact_type_properties={ - "model_framework": mlpb.STRING, - "model_type": mlpb.STRING, - "model_name": mlpb.STRING, - "Commit": mlpb.STRING, - "url": mlpb.STRING, - }, - custom_properties=custom_props, - milliseconds_since_epoch=int(time.time() * 1000), - ) - # custom_properties["Commit"] = model_commit - # custom_props["url"] = url - self.execution_label_props["Commit"] = props.get("Commit", "") - if self.graph: - self.driver.create_model_node( - model_uri, - uri, - event, - self.execution.id, - self.parent_context, - custom_props, - ) - if event.lower() == "input": - self.input_artifacts.append( - { - "Name": model_uri, - "URI": uri, - "Event": event.lower(), - "Execution_Name": self.execution_name, - "Type": "Model", - "Execution_Command": self.execution_command, - "Pipeline_Id": self.parent_context.id, - "Pipeline_Name": self.parent_context.name, - } - ) - self.driver.create_execution_links(uri, model_uri, "Model") - else: - child_artifact = { - "Name": model_uri, - "URI": uri, - "Event": event.lower(), - "Execution_Name": self.execution_name, - "Type": "Model", - "Execution_Command": self.execution_command, - "Pipeline_Id": self.parent_context.id, - "Pipeline_Name": self.parent_context.name, - } - self.driver.create_artifact_relationships( - self.input_artifacts, child_artifact, self.execution_label_props - ) - - return artifact - - def log_execution_metrics_from_client(self, metrics_name: str, - custom_properties: t.Optional[t.Dict] = None) -> mlpb.Artifact: - """ Logs execution metrics from a client. - Data from pre-existing metrics from client side is used to create identical metrics on server side. 
- Example: - ```python - artifact: mlpb.Artifact = cmf.log_execution_metrics_from_client( - metrics_name="example_metrics:uri:123", - custom_properties={"custom_key": "custom_value"}, - ) - ``` - Args: - metrics_name: Name of the metrics in the format "name:uri:execution_id". - custom_properties: Optional custom properties for the metrics. - Returns: - Artifact object from the ML Protocol Buffers library associated with the metrics artifact. - """ - - metrics = None - custom_props = {} if custom_properties is None else custom_properties - existing_artifact = [] - name_tokens = metrics_name.split(":") - if name_tokens and len(name_tokens) > 2: - name = name_tokens[0] - uri = name_tokens[1] - execution_id = name_tokens[2] - else: - print(f"Error : metrics name {metrics_name} is not in the correct format") - return - - #we need to add the execution id to the metrics name - new_metrics_name = f"{name}:{uri}:{str(self.execution.id)}" - existing_artifacts = self.store.get_artifacts_by_uri(uri) - - existing_artifact = existing_artifacts[0] if existing_artifacts else None - if not existing_artifact or \ - ((existing_artifact) and not - (existing_artifact.name == new_metrics_name)): #we need to add the artifact otherwise its already there - metrics = create_new_artifact_event_and_attribution( - store=self.store, - execution_id=self.execution.id, - context_id=self.child_context.id, - uri=uri, - name=new_metrics_name, - type_name="Metrics", - event_type=mlpb.Event.Type.OUTPUT, - properties={"metrics_name": metrics_name}, - artifact_type_properties={"metrics_name": mlpb.STRING}, - custom_properties=custom_props, - milliseconds_since_epoch=int(time.time() * 1000), - ) - if self.graph: - # To do create execution_links - self.driver.create_metrics_node( - metrics_name, - uri, - "output", - self.execution.id, - self.parent_context, - custom_props, - ) - child_artifact = { - "Name": metrics_name, - "URI": uri, - "Event": "output", - "Execution_Name": self.execution_name, - "Type": "Metrics", - "Execution_Command": self.execution_command, - "Pipeline_Id": self.parent_context.id, - "Pipeline_Name": self.parent_context.name, - } - self.driver.create_artifact_relationships( - self.input_artifacts, child_artifact, self.execution_label_props - ) - return metrics - def log_execution_metrics( self, metrics_name: str, custom_properties: t.Optional[t.Dict] = None @@ -1587,6 +1194,10 @@ def commit_metrics(self, metrics_name: str): custom_properties=custom_props, milliseconds_since_epoch=int(time.time() * 1000), ) + + custom_props["Commit"] = metrics_commit + self.execution_label_props["Commit"] = metrics_commit + if self.graph: self.driver.create_metrics_node( name, @@ -1601,7 +1212,7 @@ def commit_metrics(self, metrics_name: str): "URI": uri, "Event": "output", "Execution_Name": self.execution_name, - "Type": "Metrics", + "Type": "Step_Metrics", "Execution_Command": self.execution_command, "Pipeline_Id": self.parent_context.id, } @@ -1612,79 +1223,6 @@ def commit_metrics(self, metrics_name: str): os.chdir(logging_dir) return metrics - def commit_existing_metrics(self, metrics_name: str, uri: str, props: t.Optional[t.Dict] = None, custom_properties: t.Optional[t.Dict] = None): - """ - Commits existing metrics associated with the given URI to MLMD. - Example: - ```python - artifact: mlpb.Artifact = cmf.commit_existing_metrics("existing_metrics", "abc123", - {"custom_key": "custom_value"}) - ``` - Args: - metrics_name: Name of the metrics. - uri: Unique identifier associated with the metrics. 
- custom_properties: Optional custom properties for the metrics. - Returns: - Artifact object from the ML Protocol Buffers library associated with the existing metrics artifact. - """ - - custom_props = {} if custom_properties is None else custom_properties - c_hash = uri.strip() - existing_artifact = [] - existing_artifact.extend(self.store.get_artifacts_by_uri(c_hash)) - if (existing_artifact - and len(existing_artifact) != 0 ): - metrics = link_execution_to_artifact( - store=self.store, - execution_id=self.execution.id, - uri=c_hash, - input_name=metrics_name, - event_type=mlpb.Event.Type.OUTPUT, - ) - else: - metrics = create_new_artifact_event_and_attribution( - store=self.store, - execution_id=self.execution.id, - context_id=self.child_context.id, - uri=uri, - name=metrics_name, - type_name="Step_Metrics", - event_type=mlpb.Event.Type.OUTPUT, - properties={ - # passing uri value to commit - "Commit": props.get("Commit", ""), - "url": props.get("url", ""), - }, - artifact_type_properties={ - "Commit": mlpb.STRING, - "url": mlpb.STRING, - }, - custom_properties=custom_props, - milliseconds_since_epoch=int(time.time() * 1000), - ) - if self.graph: - self.driver.create_metrics_node( - metrics_name, - uri, - "output", - self.execution.id, - self.parent_context, - custom_props, - ) - child_artifact = { - "Name": metrics_name, - "URI": uri, - "Event": "output", - "Execution_Name": self.execution_name, - "Type": "Metrics", - "Execution_Command": self.execution_command, - "Pipeline_Id": self.parent_context.id, - } - self.driver.create_artifact_relationships( - self.input_artifacts, child_artifact, self.execution_label_props - ) - return metrics - def log_validation_output( self, version: str, custom_properties: t.Optional[t.Dict] = None @@ -1902,12 +1440,6 @@ def commit(self, custom_properties: t.Optional[t.Dict] = None) -> None: input_name=dataslice_path + ":" + c_hash, ) else: - props={ - "git_repo": str(git_repo), - # passing c_hash value to commit - "Commit": str(dataslice_commit), - "url": str(dvc_url_with_pipeline), - }, slice = create_new_artifact_event_and_attribution( store=self.writer.store, execution_id=self.writer.execution.id, @@ -1930,57 +1462,16 @@ def commit(self, custom_properties: t.Optional[t.Dict] = None) -> None: custom_properties=custom_props, milliseconds_since_epoch=int(time.time() * 1000), ) - if self.writer.graph: - self.writer.driver.create_dataslice_node( - self.name, dataslice_path + ":" + c_hash, c_hash, self.data_parent, props - ) - os.chdir(logging_dir) - return slice - # commit existing dataslice to server - def commit_existing(self, uri: str, props: t.Optional[t.Dict] = None, custom_properties: t.Optional[t.Dict] = None) -> None: - custom_props = {} if custom_properties is None else custom_properties - c_hash = uri.strip() - dataslice_commit = c_hash - existing_artifact = [] - if c_hash and c_hash.strip(): - existing_artifact.extend( - self.writer.store.get_artifacts_by_uri(c_hash)) - if existing_artifact and len(existing_artifact) != 0: - print("Adding to existing data slice") - # Haven't added event type in this if cond, is it not needed?? 
- slice = link_execution_to_input_artifact( - store=self.writer.store, - execution_id=self.writer.execution.id, - uri=c_hash, - input_name=self.name, - ) - else: - slice = create_new_artifact_event_and_attribution( - store=self.writer.store, - execution_id=self.writer.execution.id, - context_id=self.writer.child_context.id, - uri=c_hash, - name=self.name, - type_name="Dataslice", - event_type=mlpb.Event.Type.OUTPUT, - properties={ - "git_repo": props.get("git_repo", ""), - "Commit": props.get("Commit", ""), - "url": props.get("url", " "), - }, - artifact_type_properties={ - "git_repo": mlpb.STRING, - "Commit": mlpb.STRING, - "url": mlpb.STRING, - }, - custom_properties=custom_properties, - milliseconds_since_epoch=int(time.time() * 1000), - ) + custom_props["git_repo"] = git_repo + custom_props["Commit"] = dataslice_commit + self.writer.execution_label_props["git_repo"] = git_repo + self.writer.execution_label_props["Commit"] = dataslice_commit if self.writer.graph: self.writer.driver.create_dataslice_node( - self.name, self.name, c_hash, self.data_parent, custom_properties + self.name, dataslice_path + ":" + c_hash, c_hash, self.data_parent, custom_props ) + os.chdir(logging_dir) return slice @@ -1996,6 +1487,18 @@ def commit_existing(self, uri: str, props: t.Optional[t.Dict] = None, custom_pro # print(last) # os.symlink(str(index), slicedir + "/ " + last) +# Binding cmf_server_methods to Cmf class +Cmf.merge_created_context = merge_created_context +Cmf.merge_created_execution = merge_created_execution +Cmf.log_python_env_from_client = log_python_env_from_client +Cmf.log_dataset_with_version = log_dataset_with_version +Cmf.log_model_with_version = log_model_with_version +Cmf.log_execution_metrics_from_client = log_execution_metrics_from_client +#Cmf.commit_existing_metrics = commit_existing_metrics +Cmf.log_metrics_from_client = log_metrics_from_client +#Cmf.DataSlice.commit_existing = commit_existing +Cmf.log_dataslice_from_client = log_dataslice_from_client + def metadata_push(pipeline_name: str, filepath = "./mlmd", tensorboard_path: str = "", execution_id: str = ""): """ Pushes MLMD file to CMF-server. 
Example: From 997c9bd89153e9038a483ed77908f08fdc8025a8 Mon Sep 17 00:00:00 2001 From: Varkha Sharma Date: Mon, 2 Dec 2024 02:54:49 -0800 Subject: [PATCH 11/15] pushing intermediate changes --- cmflib/cmf.py | 8 +- server/app/main.py | 51 ++++- ui/src/client.js | 16 ++ ui/src/components/ArtifactTable/index.jsx | 4 +- ui/src/components/ExecutionTable/index.jsx | 41 ++++ .../{Popup => ModelCardPopup}/index.css | 0 .../{Popup => ModelCardPopup}/index.jsx | 4 +- ui/src/components/PythonEnvPopup/index.css | 130 ++++++++++++ ui/src/components/PythonEnvPopup/index.jsx | 192 ++++++++++++++++++ 9 files changed, 439 insertions(+), 7 deletions(-) rename ui/src/components/{Popup => ModelCardPopup}/index.css (100%) rename ui/src/components/{Popup => ModelCardPopup}/index.jsx (98%) create mode 100644 ui/src/components/PythonEnvPopup/index.css create mode 100644 ui/src/components/PythonEnvPopup/index.jsx diff --git a/cmflib/cmf.py b/cmflib/cmf.py index 7ccb7eb8..530e7610 100644 --- a/cmflib/cmf.py +++ b/cmflib/cmf.py @@ -385,7 +385,7 @@ def create_execution( git_repo = git_get_repo() git_start_commit = git_get_commit() cmd = str(sys.argv) if cmd is None else cmd - python_env=get_python_env() + self.execution = create_new_execution_in_existing_run_context( store=self.store, # Type field when re-using executions @@ -399,7 +399,6 @@ def create_execution( pipeline_type=self.parent_context.name, git_repo=git_repo, git_start_commit=git_start_commit, - python_env=python_env, custom_properties=custom_props, create_new_execution=create_new_execution, ) @@ -456,6 +455,11 @@ def create_execution( # link the artifact to execution if it exists and creates artifact if it doesn't self.log_python_env(python_env_file_path) + new_custom_props = {} + new_custom_props["Python_env"] = python_env_file_path + print("new custom props = ", new_custom_props ) + self.update_execution(self.execution.id, new_custom_props) + print(self.execution) os.chdir(logging_dir) return self.execution diff --git a/server/app/main.py b/server/app/main.py index 2a116157..085eed89 100644 --- a/server/app/main.py +++ b/server/app/main.py @@ -1,8 +1,9 @@ # cmf-server api's from fastapi import FastAPI, Request, HTTPException, Query, UploadFile, File from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import HTMLResponse +from fastapi.responses import HTMLResponse, PlainTextResponse from fastapi.staticfiles import StaticFiles +from typing import Literal from contextlib import asynccontextmanager import pandas as pd from typing import List, Dict, Any @@ -397,6 +398,54 @@ async def artifact_execution_lineage(request: Request, pipeline_name: str): response = await query_visualization_artifact_execution(server_store_path, pipeline_name, dict_of_art_ids, dict_of_exe_ids) return response +# Rest api is for pushing python env to upload python env +@app.post("/python-env") +async def upload_python_env(request:Request, pipeline_name: str = Query(..., description="Pipeline name"), + file: UploadFile = File(..., description="The file to upload")): + try: + file_path = os.path.join("/cmf-server/data/env/", pipeline_name, file.filename) + os.makedirs(os.path.dirname(file_path), exist_ok=True) + with open(file_path, "wb") as buffer: + buffer.write(await file.read()) + return {"message": f"File '{file.filename}' uploaded successfully"} + except Exception as e: + return {"error": f"Failed to up load file: {e}"} + +# Rest api to fetch the data from the +@app.get("/python-env", response_class=PlainTextResponse) +async def get_python_env(file_name: 
str) -> str: + """ + API endpoint to fetch the content of a requirements file. + + Args: + file_name (str): The name of the file to be fetched. Must end with .txt or .yaml. + + Returns: + str: The content of the file as plain text. + + Raises: + HTTPException: If the file does not exist or the extension is unsupported. + """ + # Validate file extension + if not (file_name.endswith(".txt") or file_name.endswith(".yaml")): + raise HTTPException( + status_code=400, detail="Unsupported file extension. Use .txt or .yaml" + ) + + # Check if the file exists + file_path = os.path.join("/cmf-server/data/env/", os.path.basename(file_name)) + if not os.path.exists(file_path): + raise HTTPException(status_code=404, detail="File not found") + + # Read and return the file content as plain text + try: + with open(file_path, "r") as file: + content = file.read() + return content + except Exception as e: + raise HTTPException(status_code=500, detail=f"Error reading file: {str(e)}") + + async def update_global_art_dict(pipeline_name): global dict_of_art_ids diff --git a/ui/src/client.js b/ui/src/client.js index bf5d5b29..cba76743 100644 --- a/ui/src/client.js +++ b/ui/src/client.js @@ -152,6 +152,22 @@ class FastAPIClient { return data; }); } + + async getPythonEnv(file_name) { + return this.apiClient + .get(`/python-env`, { + params: { + file_name: file_name + }, + responseType: "text", // Explicitly specify response type as text + }) + .then(( response ) => { + return response.data; + }); + } + } + + export default FastAPIClient; diff --git a/ui/src/components/ArtifactTable/index.jsx b/ui/src/components/ArtifactTable/index.jsx index 51fd441d..9ff336ff 100644 --- a/ui/src/components/ArtifactTable/index.jsx +++ b/ui/src/components/ArtifactTable/index.jsx @@ -17,7 +17,7 @@ // ArtifactTable.jsx import React, { useState, useEffect } from "react"; import "./index.css"; -import Popup from "../../components/Popup"; +import ModelCardPopup from "../../components/ModelCardPopup"; import FastAPIClient from "../../client"; import config from "../../config"; @@ -181,7 +181,7 @@ const ArtifactTable = ({ artifacts, ArtifactType, onSort }) => { > Open Model Card - { // Default sorting order @@ -26,6 +31,9 @@ const ExecutionTable = ({ executions, onSort, onFilter }) => { const [filterValue, setFilterValue] = useState(""); const [expandedRow, setExpandedRow] = useState(null); + const [showPopup, setShowPopup] = useState(false); + const [popupData, setPopupData] = useState(""); + const consistentColumns = []; useEffect(() => { @@ -61,6 +69,20 @@ const ExecutionTable = ({ executions, onSort, onFilter }) => { } }; + + const handleLinkClick = (file_name) => { + setShowPopup(true); + client.getPythonEnv("/home/sharvark/cmf-server/data/env/python_env_4619b71f780f0c6f369de6b3d1872289.txt").then((data) => { + console.log(data); + setPopupData(data); + setShowPopup(true); + }); + }; + + const handleClosePopup = () => { + setShowPopup(false); + }; + const renderArrow = () => { if (sortOrder === "desc") { return ( @@ -168,6 +190,9 @@ const ExecutionTable = ({ executions, onSort, onFilter }) => { Execution + + Python Env + Git Repo @@ -192,6 +217,22 @@ const ExecutionTable = ({ executions, onSort, onFilter }) => { {data.Context_Type} {data.Execution} + + { + e.preventDefault(); + handleLinkClick(""); + }} + > + Click for Env Details + + + {data.Git_Repo} {data.Git_Start_Commit} {data.Pipeline_Type} diff --git a/ui/src/components/Popup/index.css b/ui/src/components/ModelCardPopup/index.css similarity index 100% rename from 
ui/src/components/Popup/index.css rename to ui/src/components/ModelCardPopup/index.css diff --git a/ui/src/components/Popup/index.jsx b/ui/src/components/ModelCardPopup/index.jsx similarity index 98% rename from ui/src/components/Popup/index.jsx rename to ui/src/components/ModelCardPopup/index.jsx index 9f44df4a..ac387f8b 100644 --- a/ui/src/components/Popup/index.jsx +++ b/ui/src/components/ModelCardPopup/index.jsx @@ -1,7 +1,7 @@ import React from "react"; import "./index.css"; // Optional: For styling the popup -const Popup = ({ show, model_data, onClose }) => { +const ModelCardPopup = ({ show, model_data, onClose }) => { if (!show) { return null; } @@ -198,4 +198,4 @@ const Popup = ({ show, model_data, onClose }) => { ); }; -export default Popup; +export default ModelCardPopup; diff --git a/ui/src/components/PythonEnvPopup/index.css b/ui/src/components/PythonEnvPopup/index.css new file mode 100644 index 00000000..66f3c3cb --- /dev/null +++ b/ui/src/components/PythonEnvPopup/index.css @@ -0,0 +1,130 @@ +.popup-overlay { + position: fixed; + top: 0; + left: 0; + width: 100%; + height: 100%; + background-color: rgba(0, 0, 0, 0.5); + display: flex; + justify-content: center; + align-items: center; +} + +.popup-border { + position: sticky; + top: 0; + right: 0; + z-index: 10; /* Ensure it stays on top of the popup */ +} + +.popup { + background-color: white; + padding: 20px; + border-radius: 8px; + max-width: 1100px; + width: 100%; + position: relative; + box-shadow: 0 0 10px rgba(0, 0, 0, 0.1); + max-height: 90vh; /* Ensuring it doesn't exceed the viewport */ + overflow-y: auto; /* Ensure the popup can scroll if it still overflows */ +} + +.close-button { + background: gray; + color: white; + border: 2px solid black; + border-radius: 50%; + padding: 5px 10px; + cursor: pointer; + position: absolute; + top: -18px; + right: -19px; + z-index: 10; +} + +.download-button { + background: white; + color: dark gray; + border: none; + cursor: pointer; + width: 84px; + height: 55px; + float: right; +} + +.popup-content { + margin-top: 20px; +} + +.popup-row { + display: flex; + justify-content: space-between; + margin-bottom: 10px; + text-align: left; +} + +.popup-labels, +.popup-data { + display: flex; + flex-direction: column; +} + +.popup-labels { + flex: 1; + font-weight: bold; + background-color: #f0f0f0; /* Light gray background for labels */ + padding: 10px; + border-radius: 4px 0 0 4px; /* Rounded corners for left section */ +} + +.popup-data { + flex: 2; + background-color: #e0f7fa; /* Light blue background for data */ + padding: 10px; + border-radius: 0 4px 4px 0; /* Rounded corners for right section */ + text-align: left; +} + +.table-container { + max-height: 400px; /* Adjust this height as needed */ + max-width: 100%; + overflow-x: auto; /* Horizontal scrollbar */ + overflow-y: auto; /* Vertical scrollbar */ + margin-top: 20px; +} + +.table { + width: 100%; + border-collapse: collapse; + overflow: auto; + background-color: #e0f7fa; /* Light blue background for data */ +} + +.table th, +.table td { + border: 1px solid #ddd; + padding: 8px; + text-align: left; /* Align text to the left */ +} + +.table th { + background-color: #f2f2f2; /* Light gray background for headers */ + font-weight: bold; +} + +.tbody tr:nth-child(even) { + background-color: #f9f9f9; /* Light gray background for even rows */ +} + +.tbody tr:hover { + background-color: #f1f1f1; /* Light gray background for hovered rows */ +} + +hr { + margin: 20px 0; +} + +p { + font-weight: bold; + margin: 10px 0; +} diff --git 
a/ui/src/components/PythonEnvPopup/index.jsx b/ui/src/components/PythonEnvPopup/index.jsx new file mode 100644 index 00000000..2e63a404 --- /dev/null +++ b/ui/src/components/PythonEnvPopup/index.jsx @@ -0,0 +1,192 @@ +import React from "react"; +import "./index.css"; // Optional: For styling the popup + +const PythonEnvPopup = ({ show, python_env, onClose }) => { + if (!show) { + return null; + } + + /* find the uri value from artifacts + const findUri = () => { + const item = model_data[0].find((entry) => entry.uri); + return item ? item.uri : "default"; + }; + + // create filename based on uri + const createFilename = (uri) => { + return `model_card_${uri}.json`; + }; + + const downloadJSON = () => { + const uri = findUri(); + const filename = createFilename(uri); + + const jsonString = JSON.stringify(model_data, null, 2); + const blob = new Blob([jsonString], { type: "application/json" }); + + const url = URL.createObjectURL(blob); + const link = document.createElement("a"); + link.href = url; + link.download = filename; + document.body.appendChild(link); + link.click(); + document.body.removeChild(link); + URL.revokeObjectURL(url); + }; + + const excludeColumns = ["create_time_since_epoch"]; + + const renameKey = (key) => { + const prefix = "custom_properties_"; + if (key.startsWith(prefix)) { + return key.slice(prefix.length); + } + return key; + }; + + const renderContent = (item, index) => { + switch (index) { + case 0: + return ( +
+          <div className="popup-content">
+            <p>Model's Data</p>
+            <hr />
+            {item.length > 0 &&
+              item.map((data, i) => (
+                <div className="popup-row" key={i}>
+                  <div className="popup-labels">
+                    {Object.keys(data)
+                      .filter((key) => !excludeColumns.includes(key))
+                      .map((key, idx) => (
+                        <p key={idx}>{renameKey(key)}:</p>
+                      ))}
+                  </div>
+                  <div className="popup-data">
+                    {Object.entries(data)
+                      .filter(([key]) => !excludeColumns.includes(key))
+                      .map(([key, value], idx) => (
+                        <p key={idx}>{value ? value : "Null"}</p>
+                      ))}
+                  </div>
+                </div>
+              ))}
+          </div>
+        );
+      case 1:
+        const exe_headers = item.length > 0 ? Object.keys(item[0]) : [];
+        return (
+          <div className="popup-content">
+            <p>List of executions in which model has been used</p>
+            <hr />
+            <div className="table-container">
+              <table className="table">
+                <thead>
+                  <tr>
+                    {exe_headers.map((header, index) => (
+                      <th key={index}>{renameKey(header)}</th>
+                    ))}
+                  </tr>
+                </thead>
+                <tbody>
+                  {item.length > 0 &&
+                    item.map((data, i) => (
+                      <tr key={i}>
+                        {exe_headers.map((header, index) => (
+                          <td key={index}>{data[header]}</td>
+                        ))}
+                      </tr>
+                    ))}
+                </tbody>
+              </table>
+            </div>
+          </div>
+        );
+      case 2:
+        return (
+          <div className="popup-content">
+            <p>List of input artifacts for the model</p>
+            <hr />
+            {item.length > 0 &&
+              item.map((data, i) => (
+                <div className="popup-row" key={i}>
+                  <div className="popup-labels">
+                    {Object.keys(data)
+                      .filter((key) => !excludeColumns.includes(key))
+                      .map((key, idx) => (
+                        <p key={idx}>{renameKey(key)}:</p>
+                      ))}
+                  </div>
+                  <div className="popup-data">
+                    {Object.entries(data)
+                      .filter(([key]) => !excludeColumns.includes(key))
+                      .map(([key, value], idx) => (
+                        <p key={idx}>{value ? value : "Null"}</p>
+                      ))}
+                  </div>
+                </div>
+              ))}
+          </div>
+        );
+      case 3:
+        return (
+          <div className="popup-content">
+            <p>List of output artifacts for the model</p>
+            <hr />
+            {item.length > 0 &&
+              item.map((data, i) => (
+                <div className="popup-row" key={i}>
+                  <div className="popup-labels">
+                    {Object.keys(data)
+                      .filter((key) => !excludeColumns.includes(key))
+                      .map((key, idx) => (
+                        <p key={idx}>{renameKey(key)}:</p>
+                      ))}
+                  </div>
+                  <div className="popup-data">
+                    {Object.entries(data)
+                      .filter(([key]) => !excludeColumns.includes(key))
+                      .map(([key, value], idx) => (
+                        <p key={idx}>{value ? value : "Null"}</p>
+                      ))}
+                  </div>
+                </div>
+              ))}
+          </div>
+        );
+      default:
+        return (
+          <div className="popup-content">
+            <p>Unknown item</p>
+          </div>
+        );
+    }
+  };*/
+
+  return (
+    <>
+      <div className="popup-overlay">
+        <div className="popup">
+          <div className="popup-border">
+            <button className="close-button" onClick={onClose}>
+              X
+            </button>
+          </div>
+          <div className="popup-content">
+            {python_env}
+          </div>
+        </div>
+      </div>
+ + ); +}; + +export default PythonEnvPopup; From 08bdf53680489a78bbec34593511e8e8a2f3cbb3 Mon Sep 17 00:00:00 2001 From: Varkha Sharma Date: Tue, 3 Dec 2024 16:56:53 -0800 Subject: [PATCH 12/15] intermediate changes --- ui/src/components/ExecutionTable/index.jsx | 11 +- ui/src/components/PythonEnvPopup/index.css | 126 ++++----------- ui/src/components/PythonEnvPopup/index.jsx | 170 +-------------------- 3 files changed, 43 insertions(+), 264 deletions(-) diff --git a/ui/src/components/ExecutionTable/index.jsx b/ui/src/components/ExecutionTable/index.jsx index a1f25377..a5f805ec 100644 --- a/ui/src/components/ExecutionTable/index.jsx +++ b/ui/src/components/ExecutionTable/index.jsx @@ -72,7 +72,7 @@ const ExecutionTable = ({ executions, onSort, onFilter }) => { const handleLinkClick = (file_name) => { setShowPopup(true); - client.getPythonEnv("/home/sharvark/cmf-server/data/env/python_env_4619b71f780f0c6f369de6b3d1872289.txt").then((data) => { + client.getPythonEnv(file_name).then((data) => { console.log(data); setPopupData(data); setShowPopup(true); @@ -190,8 +190,10 @@ const ExecutionTable = ({ executions, onSort, onFilter }) => { Execution - - Python Env + + + Python Env + Git Repo @@ -222,7 +224,8 @@ const ExecutionTable = ({ executions, onSort, onFilter }) => { href="#" onClick={(e) => { e.preventDefault(); - handleLinkClick(""); + handleLinkClick(data.custom_properties_Python_env); + }} > Click for Env Details diff --git a/ui/src/components/PythonEnvPopup/index.css b/ui/src/components/PythonEnvPopup/index.css index 66f3c3cb..2570ca31 100644 --- a/ui/src/components/PythonEnvPopup/index.css +++ b/ui/src/components/PythonEnvPopup/index.css @@ -1,3 +1,4 @@ +/* Overlay for popup */ .popup-overlay { position: fixed; top: 0; @@ -10,121 +11,58 @@ align-items: center; } -.popup-border { - position: sticky; - top: 0; - right: 0; - z-index: 10; /* Ensure it stays on top of the popup */ -} - +/* Popup container */ .popup { + position: relative; background-color: white; padding: 20px; border-radius: 8px; - max-width: 1100px; - width: 100%; - position: relative; + width: 90%; + max-width: 800px; /* Adjusted for better responsiveness */ + max-height: 90vh; /* Prevent overflow beyond the viewport */ box-shadow: 0 0 10px rgba(0, 0, 0, 0.1); - max-height: 90vh; /* Ensuring it doesn't exceed the viewport */ - overflow-y: auto; /* Ensure the popup can scroll if it still overflows */ + overflow-y: auto; /* Scroll if content exceeds height */ } +/* Close button */ .close-button { + position: absolute; + top: -18px; + right: -19px; background: gray; color: white; border: 2px solid black; border-radius: 50%; padding: 5px 10px; + font-size: 12px; /* Unified font size */ cursor: pointer; - position: absolute; - top: -18px; - right: -19px; z-index: 10; } -.download-button { - background: white; - color: dark gray; - border: none; - cursor: pointer; - width: 84px; - height: 55px; - float: right; -} - -.popup-content { - margin-top: 20px; -} - -.popup-row { - display: flex; - justify-content: space-between; - margin-bottom: 10px; - text-align: left; -} - -.popup-labels, -.popup-data { - display: flex; - flex-direction: column; -} - -.popup-labels { - flex: 1; +/* Popup heading */ +.popup-heading { + font-size: 20px; font-weight: bold; - background-color: #f0f0f0; /* Light gray background for labels */ - padding: 10px; - border-radius: 4px 0 0 4px; /* Rounded corners for left section */ + text-align: center; + color: #333; + margin: 0; + padding: 10px 0; + background-color: #f9f9f9; /* Subtle background for 
heading */ + border-bottom: 1px solid #ddd; } -.popup-data { - flex: 2; - background-color: #e0f7fa; /* Light blue background for data */ - padding: 10px; - border-radius: 0 4px 4px 0; /* Rounded corners for right section */ - text-align: left; -} - -.table-container { - max-height: 400px; /* Adjust this height as needed */ - max-width: 100%; - overflow-x: auto; /* Horizontal scrollbar */ - overflow-y: auto; /* Vertical scrollbar */ +/* Popup content */ +.popup-content { margin-top: 20px; -} - -.table { - width: 100%; - border-collapse: collapse; - overflow: auto; - background-color: #e0f7fa; /* Light blue background for data */ -} - -.table th, -.table td { + padding: 15px; + font-family: 'Courier New', Courier, monospace; + font-size: 14px; /* Default font size */ + background-color: #f8f9fa; /* Light gray background */ + color: #333; border: 1px solid #ddd; - padding: 8px; - text-align: left; /* Align text to the left */ + border-radius: 4px; + max-height: 400px; + overflow-y: auto; /* Scrollable content */ + white-space: pre-wrap; /* Preserve whitespace and line breaks */ } -.table th { - background-color: #f2f2f2; /* Light gray background for headers */ - font-weight: bold; -} - -.tbody tr:nth-child(even) { - background-color: #f9f9f9; /* Light gray background for even rows */ -} - -.tbody tr:hover { - background-color: #f1f1f1; /* Light gray background for hovered rows */ -} - -hr { - margin: 20px 0; -} - -p { - font-weight: bold; - margin: 10px 0; -} diff --git a/ui/src/components/PythonEnvPopup/index.jsx b/ui/src/components/PythonEnvPopup/index.jsx index 2e63a404..a6f7d1c9 100644 --- a/ui/src/components/PythonEnvPopup/index.jsx +++ b/ui/src/components/PythonEnvPopup/index.jsx @@ -6,171 +6,8 @@ const PythonEnvPopup = ({ show, python_env, onClose }) => { return null; } - /* find the uri value from artifacts - const findUri = () => { - const item = model_data[0].find((entry) => entry.uri); - return item ? item.uri : "default"; - }; - - // create filename based on uri - const createFilename = (uri) => { - return `model_card_${uri}.json`; - }; - - const downloadJSON = () => { - const uri = findUri(); - const filename = createFilename(uri); - - const jsonString = JSON.stringify(model_data, null, 2); - const blob = new Blob([jsonString], { type: "application/json" }); - - const url = URL.createObjectURL(blob); - const link = document.createElement("a"); - link.href = url; - link.download = filename; - document.body.appendChild(link); - link.click(); - document.body.removeChild(link); - URL.revokeObjectURL(url); - }; - - const excludeColumns = ["create_time_since_epoch"]; - - const renameKey = (key) => { - const prefix = "custom_properties_"; - if (key.startsWith(prefix)) { - return key.slice(prefix.length); - } - return key; - }; - - const renderContent = (item, index) => { - switch (index) { - case 0: - return ( -
-          <div className="popup-content">
-            <p>Model's Data</p>
-            <hr />
-            {item.length > 0 &&
-              item.map((data, i) => (
-                <div className="popup-row" key={i}>
-                  <div className="popup-labels">
-                    {Object.keys(data)
-                      .filter((key) => !excludeColumns.includes(key))
-                      .map((key, idx) => (
-                        <p key={idx}>{renameKey(key)}:</p>
-                      ))}
-                  </div>
-                  <div className="popup-data">
-                    {Object.entries(data)
-                      .filter(([key]) => !excludeColumns.includes(key))
-                      .map(([key, value], idx) => (
-                        <p key={idx}>{value ? value : "Null"}</p>
-                      ))}
-                  </div>
-                </div>
-              ))}
-          </div>
-        );
-      case 1:
-        const exe_headers = item.length > 0 ? Object.keys(item[0]) : [];
-        return (
-          <div className="popup-content">
-            <p>List of executions in which model has been used</p>
-            <hr />
-            <div className="table-container">
-              <table className="table">
-                <thead>
-                  <tr>
-                    {exe_headers.map((header, index) => (
-                      <th key={index}>{renameKey(header)}</th>
-                    ))}
-                  </tr>
-                </thead>
-                <tbody>
-                  {item.length > 0 &&
-                    item.map((data, i) => (
-                      <tr key={i}>
-                        {exe_headers.map((header, index) => (
-                          <td key={index}>{data[header]}</td>
-                        ))}
-                      </tr>
-                    ))}
-                </tbody>
-              </table>
-            </div>
-          </div>
-        );
-      case 2:
-        return (
-          <div className="popup-content">
-            <p>List of input artifacts for the model</p>
-            <hr />
-            {item.length > 0 &&
-              item.map((data, i) => (
-                <div className="popup-row" key={i}>
-                  <div className="popup-labels">
-                    {Object.keys(data)
-                      .filter((key) => !excludeColumns.includes(key))
-                      .map((key, idx) => (
-                        <p key={idx}>{renameKey(key)}:</p>
-                      ))}
-                  </div>
-                  <div className="popup-data">
-                    {Object.entries(data)
-                      .filter(([key]) => !excludeColumns.includes(key))
-                      .map(([key, value], idx) => (
-                        <p key={idx}>{value ? value : "Null"}</p>
-                      ))}
-                  </div>
-                </div>
-              ))}
-          </div>
-        );
-      case 3:
-        return (
-          <div className="popup-content">
-            <p>List of output artifacts for the model</p>
-            <hr />
-            {item.length > 0 &&
-              item.map((data, i) => (
-                <div className="popup-row" key={i}>
-                  <div className="popup-labels">
-                    {Object.keys(data)
-                      .filter((key) => !excludeColumns.includes(key))
-                      .map((key, idx) => (
-                        <p key={idx}>{renameKey(key)}:</p>
-                      ))}
-                  </div>
-                  <div className="popup-data">
-                    {Object.entries(data)
-                      .filter(([key]) => !excludeColumns.includes(key))
-                      .map(([key, value], idx) => (
-                        <p key={idx}>{value ? value : "Null"}</p>
-                      ))}
-                  </div>
-                </div>
-              ))}
-          </div>
-        );
-      default:
-        return (
-          <div className="popup-content">
-            <p>Unknown item</p>
-          </div>
- ); - } - };*/ - return ( <> -
@@ -178,10 +15,11 @@ const PythonEnvPopup = ({ show, python_env, onClose }) => { X
+          <h3 className="popup-heading">Environment Configuration</h3>
-          <div className="popup-content">
-            {python_env}
-          </div>
+          <div className="popup-content">
+            <pre>
+              {python_env}
+            </pre>
From 9864d98c61f96a731a7306edaa35c9f58213f050 Mon Sep 17 00:00:00 2001 From: Varkha Sharma Date: Mon, 9 Dec 2024 04:17:42 -0800 Subject: [PATCH 13/15] pushing intermediate changes --- cmflib/cmf.py | 4 +--- cmflib/metadata_helper.py | 4 ---- cmflib/server_interface/server_interface.py | 9 ++++++++- docker-compose-server.yml | 1 + server/app/main.py | 9 ++++----- ui/src/components/ExecutionTable/index.jsx | 2 +- ui/src/components/PythonEnvPopup/index.jsx | 3 ++- .../PythonEnvPopup/{index.css => index.module.css} | 0 8 files changed, 17 insertions(+), 15 deletions(-) rename ui/src/components/PythonEnvPopup/{index.css => index.module.css} (100%) diff --git a/cmflib/cmf.py b/cmflib/cmf.py index 530e7610..88da94b9 100644 --- a/cmflib/cmf.py +++ b/cmflib/cmf.py @@ -456,10 +456,8 @@ def create_execution( # link the artifact to execution if it exists and creates artifact if it doesn't self.log_python_env(python_env_file_path) new_custom_props = {} - new_custom_props["Python_env"] = python_env_file_path - print("new custom props = ", new_custom_props ) + new_custom_props["Python_Env"] = python_env_file_path self.update_execution(self.execution.id, new_custom_props) - print(self.execution) os.chdir(logging_dir) return self.execution diff --git a/cmflib/metadata_helper.py b/cmflib/metadata_helper.py index 91f3b6d1..73feeedf 100644 --- a/cmflib/metadata_helper.py +++ b/cmflib/metadata_helper.py @@ -316,7 +316,6 @@ def create_new_execution_in_existing_context( EXECUTION_REPO = "Git_Repo" EXECUTION_START_COMMIT = "Git_Start_Commit" EXECUTION_END_COMMIT = "Git_End_Commit" -EXECUTION_PYTHON_ENV= "Python_Env" EXECUTION_PIPELINE_TYPE = "Pipeline_Type" EXECUTION_PIPELINE_ID = "Pipeline_id" @@ -393,7 +392,6 @@ def create_new_execution_in_existing_run_context( git_repo: str = None, git_start_commit: str = None, git_end_commit: str = "", - python_env: str = "", custom_properties: dict = None, create_new_execution:bool = True ) -> metadata_store_pb2.Execution: @@ -418,7 +416,6 @@ def create_new_execution_in_existing_run_context( EXECUTION_REPO: metadata_store_pb2.STRING, EXECUTION_START_COMMIT: metadata_store_pb2.STRING, EXECUTION_END_COMMIT: metadata_store_pb2.STRING, - EXECUTION_PYTHON_ENV: metadata_store_pb2.STRING, }, properties={ @@ -433,7 +430,6 @@ def create_new_execution_in_existing_run_context( EXECUTION_REPO: metadata_store_pb2.Value(string_value=git_repo), EXECUTION_START_COMMIT: metadata_store_pb2.Value(string_value=git_start_commit), EXECUTION_END_COMMIT: metadata_store_pb2.Value(string_value=git_end_commit), - EXECUTION_PYTHON_ENV: metadata_store_pb2.Value(string_value=python_env), # should set to task ID, not component ID }, custom_properties=mlmd_custom_properties, diff --git a/cmflib/server_interface/server_interface.py b/cmflib/server_interface/server_interface.py index 3b3730c6..75a65214 100644 --- a/cmflib/server_interface/server_interface.py +++ b/cmflib/server_interface/server_interface.py @@ -17,7 +17,7 @@ import requests import json -# This function posts mlmd data to mlmd_push api on cmf-server +# This function posts mlmd data and env files on cmf-server using mlmd_push rest api def call_mlmd_push(json_payload, url, exec_id, pipeline_name): url_to_pass = f"{url}/mlmd_push" json_data = {"id": exec_id, "json_payload": json_payload, "pipeline_name": pipeline_name} @@ -40,3 +40,10 @@ def call_tensorboard(url, pipeline_name, file_name, file_path): params = {'pipeline_name': pipeline_name} response = requests.post(url_to_pass, files=files, params=params) return response + +# This function 
posts env file to cmf-server +def call_python_env(url, file_name, file_path): + url_to_pass = f"{url}/python-env" + files = {'file': (file_name, open(file_path, 'rb'))} + response = requests.post(url_to_pass, files=files) + return response \ No newline at end of file diff --git a/docker-compose-server.yml b/docker-compose-server.yml index f5a267af..626b678a 100644 --- a/docker-compose-server.yml +++ b/docker-compose-server.yml @@ -29,6 +29,7 @@ services: # both the directory paths should be updated as per user's environment volumes: - /home/xxxx/cmf-server/data:/cmf-server/data + - /home/xxxx/cmf-server/data/env:/cmf-server/data/env - /home/xxxx/cmf-server/data/static:/cmf-server/data/static - /home/xxxx/cmf-server/data/tensorboard-logs:/cmf-server/data/tensorboard-logs container_name: cmf-server diff --git a/server/app/main.py b/server/app/main.py index 085eed89..78e0bd4c 100644 --- a/server/app/main.py +++ b/server/app/main.py @@ -400,10 +400,9 @@ async def artifact_execution_lineage(request: Request, pipeline_name: str): # Rest api is for pushing python env to upload python env @app.post("/python-env") -async def upload_python_env(request:Request, pipeline_name: str = Query(..., description="Pipeline name"), - file: UploadFile = File(..., description="The file to upload")): +async def upload_python_env(request:Request, file: UploadFile = File(..., description="The file to upload")): try: - file_path = os.path.join("/cmf-server/data/env/", pipeline_name, file.filename) + file_path = os.path.join("/cmf-server/data/env/", os.path.basename(file.filename)) os.makedirs(os.path.dirname(file_path), exist_ok=True) with open(file_path, "wb") as buffer: buffer.write(await file.read()) @@ -411,7 +410,7 @@ async def upload_python_env(request:Request, pipeline_name: str = Query(..., des except Exception as e: return {"error": f"Failed to up load file: {e}"} -# Rest api to fetch the data from the +# Rest api to fetch the env data from the /cmf-server/data/env folder @app.get("/python-env", response_class=PlainTextResponse) async def get_python_env(file_name: str) -> str: """ @@ -434,6 +433,7 @@ async def get_python_env(file_name: str) -> str: # Check if the file exists file_path = os.path.join("/cmf-server/data/env/", os.path.basename(file_name)) + print("file_path = ", file_path) if not os.path.exists(file_path): raise HTTPException(status_code=404, detail="File not found") @@ -446,7 +446,6 @@ async def get_python_env(file_name: str) -> str: raise HTTPException(status_code=500, detail=f"Error reading file: {str(e)}") - async def update_global_art_dict(pipeline_name): global dict_of_art_ids output_dict = await async_api(get_all_artifact_ids, server_store_path, dict_of_exe_ids, pipeline_name) diff --git a/ui/src/components/ExecutionTable/index.jsx b/ui/src/components/ExecutionTable/index.jsx index a5f805ec..17c297fb 100644 --- a/ui/src/components/ExecutionTable/index.jsx +++ b/ui/src/components/ExecutionTable/index.jsx @@ -224,7 +224,7 @@ const ExecutionTable = ({ executions, onSort, onFilter }) => { href="#" onClick={(e) => { e.preventDefault(); - handleLinkClick(data.custom_properties_Python_env); + handleLinkClick(data.custom_properties_Python_Env); }} > diff --git a/ui/src/components/PythonEnvPopup/index.jsx b/ui/src/components/PythonEnvPopup/index.jsx index a6f7d1c9..1c694194 100644 --- a/ui/src/components/PythonEnvPopup/index.jsx +++ b/ui/src/components/PythonEnvPopup/index.jsx @@ -1,11 +1,12 @@ import React from "react"; -import "./index.css"; // Optional: For styling the popup +import 
"./index.module.css"; // Optional: For styling the popup const PythonEnvPopup = ({ show, python_env, onClose }) => { if (!show) { return null; } + console.log(python_env) return ( <>
diff --git a/ui/src/components/PythonEnvPopup/index.css b/ui/src/components/PythonEnvPopup/index.module.css similarity index 100% rename from ui/src/components/PythonEnvPopup/index.css rename to ui/src/components/PythonEnvPopup/index.module.css From d49a74bfef8dfa18ce4984a5d2b0562d9f10e41e Mon Sep 17 00:00:00 2001 From: Varkha Sharma Date: Mon, 9 Dec 2024 06:14:28 -0800 Subject: [PATCH 14/15] pushing intermediate changes --- cmflib/cmf_server_methods.py | 2 - cmflib/commands/metadata/push.py | 150 ++++++++++++-------- cmflib/server_interface/server_interface.py | 2 +- examples/example-get-started/src/query.py | 2 +- examples/nano-cmf/src/query.py | 2 +- server/app/main.py | 2 - 6 files changed, 92 insertions(+), 68 deletions(-) diff --git a/cmflib/cmf_server_methods.py b/cmflib/cmf_server_methods.py index f60a387c..6fc52aa6 100644 --- a/cmflib/cmf_server_methods.py +++ b/cmflib/cmf_server_methods.py @@ -595,8 +595,6 @@ def log_execution_metrics_from_client(self, metrics_name: str, existing_artifacts = self.store.get_artifacts_by_uri(uri) existing_artifact = existing_artifacts[0] if existing_artifacts else None - # Didn't understand this, - # and in case of step_metrics should we follow this logic or dataset's logic or does it even matter if not existing_artifact or \ ((existing_artifact) and not (existing_artifact.name == new_metrics_name)): #we need to add the artifact otherwise its already there diff --git a/cmflib/commands/metadata/push.py b/cmflib/commands/metadata/push.py index bd630397..71672821 100644 --- a/cmflib/commands/metadata/push.py +++ b/cmflib/commands/metadata/push.py @@ -26,18 +26,31 @@ # This class pushes mlmd file to cmf-server class CmdMetadataPush(CmdBase): + + # Create a function to search for files into multiple directories + def search_files(self, file_list, *directories): + found_files = {} + for directory in directories: + abs_dir = os.path.abspath(directory) + for file_name in file_list: + if isinstance(file_name, str): + file_path = os.path.join(abs_dir, file_name) + if os.path.isfile(file_path): + found_files[file_name] = file_path + return found_files + def run(self): - current_directory = os.getcwd() + current_directory = mlmd_directory = os.getcwd() mlmd_file_name = "./mlmd" # checks if mlmd filepath is given if self.args.file_name: mlmd_file_name = self.args.file_name - current_directory = os.path.dirname(self.args.file_name) + mlmd_directory = os.path.dirname(self.args.file_name) # checks if mlmd file is present in current directory or given directory if not os.path.exists(mlmd_file_name): - return f"ERROR: {mlmd_file_name} doesn't exists in the {current_directory}." + return f"ERROR: {mlmd_file_name} doesn't exists in the {mlmd_directory}." 
query = cmfquery.CmfQuery(mlmd_file_name) # print(json.dumps(json.loads(json_payload), indent=4, sort_keys=True)) @@ -63,74 +76,89 @@ def run(self): # Checks if pipeline name exists if self.args.pipeline_name in query.get_pipeline_names(): - # converts mlmd file to json format - json_payload = query.dumptojson(self.args.pipeline_name, None) - # checks if execution_id is given by user + execution = None + exec_id = None if self.args.execution: - exec_id = self.args.execution - mlmd_data = json.loads(json_payload)["Pipeline"] - # checks if given execution_id present in mlmd - for i in mlmd_data[0]["stages"]: - for j in i["executions"]: - if j["id"] == int(exec_id): - execution_flag = 1 - # calling mlmd_push api to push mlmd file to cmf-server - response = server_interface.call_mlmd_push( - json_payload, url, exec_id, self.args.pipeline_name - ) - break - if execution_flag == 0: + execution = cmfquery.get_all_executions_by_ids_list([self.args.execution]) + if execution.empty: return "Given execution is not found in mlmd." - else: - exec_id = None - response = server_interface.call_mlmd_push(json_payload, url, exec_id, self.args.pipeline_name) + exec_id = self.args.execution + # converts mlmd file to json format + json_payload = query.dumptojson(self.args.pipeline_name, None) + response = server_interface.call_mlmd_push(json_payload, url, exec_id, self.args.pipeline_name) status_code = response.status_code - if status_code == 200 and response.json()['status']=="success": - print("mlmd is successfully pushed.") - elif status_code==200 and response.json()["status"]=="exists": - print("Executions already exists.") - elif status_code==422 and response.json()["status"]=="version_update": + + # we need to push the python env files only after the mlmd push has succeded + # otherwise there is no use of those python env files on cmf-server + + if status_code==422 and response.json()["status"]=="version_update": return "ERROR: You need to update cmf to the latest version. Unable to push metadata file." elif status_code == 404: return "ERROR: cmf-server is not available." elif status_code == 500: return "ERROR: Internal server error." - else: - return "ERROR: Status Code = {status_code}. Unable to push mlmd." - - if self.args.tensorboard: - # /tensorboard api call is done only if mlmd push is successfully completed - # tensorboard parameter is passed - print("......................................") - print("tensorboard logs upload started!!") - print("......................................") - - # check if the path provided is for a file - if os.path.isfile(self.args.tensorboard): - file_name = os.path.basename(self.args.tensorboard) - tresponse = server_interface.call_tensorboard(url, self.args.pipeline_name, file_name, self.args.tensorboard) - tstatus_code = tresponse.status_code - if tstatus_code == 200: - return "tensorboard logs: file {file_name} pushed successfully" - else: - return "ERROR: Failed to upload file {file_name}. 
Server response: {response.text}" - # If path provided is a directory - elif os.path.isdir(self.args.tensorboard): - # Recursively push all files and subdirectories - for root, dirs, files in os.walk(self.args.tensorboard): - for file_name in files: - file_path = os.path.join(root, file_name) - relative_path = os.path.relpath(file_path, self.args.tensorboard) - tresponse = server_interface.call_tensorboard(url, self.args.pipeline_name, relative_path, file_path) - if tresponse.status_code == 200: - print(f"tensorboard logs: File {file_name} uploaded successfully.") - else: - return f"ERROR: Failed to upload file {file_name}. Server response: {tresponse.text}" - return f"tensorboard logs: {self.args.tensorboard} uploaded successfully!!" + elif status_code == 200: + # the only question remains how we want to percieve the failure of upload of the python env files + # for now, it is considered as non-consequential. + # that means it's failure/success doesn't matter. + # however, we will be keeping the record of the status code. + + # Getting all executions df to get the custom property 'Python_Env' + executions = query.get_all_executions_in_pipeline(self.args.pipeline_name) + if not executions.empty: + if 'custom_properties_Python_Env' in executions.columns: + list_of_env_files = executions['custom_properties_Python_Env'].drop_duplicates().tolist() + # This is a validation step to suppress errors in case user is pushing the mlmd + # from a directory in which 'cmf_artifacts/Python_Env_hash.txt' is not present. + # Find the valid file paths. + found_files = self.search_files(list_of_env_files, current_directory, mlmd_directory) + + # push valid files on cmf-server + if found_files: + for name, path in found_files.items(): + env_response = server_interface.call_python_env(url, name, path) + # keeping record of status but this won't affect the mlmd success. + print(env_response.json()) + + output = response.json()['status'] + if output =="success": + output = "mlmd is successfully pushed." + elif output =="exists": + output = "Executions already exists." + if not self.args.tensorboard: + return output else: - return "ERROR: Invalid data path. Provide valid file/folder path for tensorboard logs!!" + print(output) + # /tensorboard api call is done only if mlmd push is successfully completed + # tensorboard parameter is passed + print("......................................") + print("tensorboard logs upload started!!") + print("......................................") + # check if the path provided is for a file + if os.path.isfile(self.args.tensorboard): + file_name = os.path.basename(self.args.tensorboard) + tresponse = server_interface.call_tensorboard(url, self.args.pipeline_name, file_name, self.args.tensorboard) + tstatus_code = tresponse.status_code + if tstatus_code == 200: + return "tensorboard logs: file {file_name} pushed successfully" + else: + return "ERROR: Failed to upload file {file_name}. Server response: {response.text}" + elif os.path.isdir(self.args.tensorboard): + # Recursively push all files and subdirectories + for root, dirs, files in os.walk(self.args.tensorboard): + for file_name in files: + file_path = os.path.join(root, file_name) + relative_path = os.path.relpath(file_path, self.args.tensorboard) + tresponse = server_interface.call_tensorboard(url, self.args.pipeline_name, relative_path, file_path) + if tresponse.status_code == 200: + print(f"tensorboard logs: File {file_name} uploaded successfully.") + else: + return f"ERROR: Failed to upload file {file_name}. 
Server response: {tresponse.text}" + return f"tensorboard logs: {self.args.tensorboard} uploaded successfully!!" + else: + return "ERROR: Invalid data path. Provide valid file/folder path for tensorboard logs!!" else: - return "SUCCESS!!" + return "ERROR: Status Code = {status_code}. Unable to push mlmd." else: return "Pipeline name " + self.args.pipeline_name + " doesn't exists." diff --git a/cmflib/server_interface/server_interface.py b/cmflib/server_interface/server_interface.py index 75a65214..521d53cb 100644 --- a/cmflib/server_interface/server_interface.py +++ b/cmflib/server_interface/server_interface.py @@ -17,7 +17,7 @@ import requests import json -# This function posts mlmd data and env files on cmf-server using mlmd_push rest api +# This function posts mlmd data on cmf-server using mlmd_push rest api def call_mlmd_push(json_payload, url, exec_id, pipeline_name): url_to_pass = f"{url}/mlmd_push" json_data = {"id": exec_id, "json_payload": json_payload, "pipeline_name": pipeline_name} diff --git a/examples/example-get-started/src/query.py b/examples/example-get-started/src/query.py index 4b4b47ba..521ff518 100644 --- a/examples/example-get-started/src/query.py +++ b/examples/example-get-started/src/query.py @@ -12,7 +12,7 @@ def _print_executions_in_stage(cmf_query: cmfquery.CmfQuery, stage_name: str) -> print('\n') df: pd.DataFrame = cmf_query.get_all_executions_in_stage(stage_name) # dropping Python_Env value in query output as it is very big in size most of the time - df.drop(columns=['Git_Start_Commit', 'Git_End_Commit', 'Python_Env'], inplace=True, axis=1) + df.drop(columns=['Git_Start_Commit', 'Git_End_Commit'], inplace=True, axis=1) print(tabulate(df, headers='keys', tablefmt='psql')) diff --git a/examples/nano-cmf/src/query.py b/examples/nano-cmf/src/query.py index 8f6c41c6..97f31590 100644 --- a/examples/nano-cmf/src/query.py +++ b/examples/nano-cmf/src/query.py @@ -11,7 +11,7 @@ def _print_executions_in_stage(cmf_query: cmfquery.CmfQuery, stage_name: str) -> print('\n') print('\n') df: pd.DataFrame = cmf_query.get_all_executions_in_stage(stage_name) - df.drop(columns=['Git_Start_Commit', 'Git_End_Commit', 'Python_Env'], inplace=True, axis=1) + df.drop(columns=['Git_Start_Commit', 'Git_End_Commit'], inplace=True, axis=1) print(tabulate(df, headers='keys', tablefmt='psql')) diff --git a/server/app/main.py b/server/app/main.py index 78e0bd4c..d417fb1a 100644 --- a/server/app/main.py +++ b/server/app/main.py @@ -3,7 +3,6 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import HTMLResponse, PlainTextResponse from fastapi.staticfiles import StaticFiles -from typing import Literal from contextlib import asynccontextmanager import pandas as pd from typing import List, Dict, Any @@ -433,7 +432,6 @@ async def get_python_env(file_name: str) -> str: # Check if the file exists file_path = os.path.join("/cmf-server/data/env/", os.path.basename(file_name)) - print("file_path = ", file_path) if not os.path.exists(file_path): raise HTTPException(status_code=404, detail="File not found") From 240cc0f87711dc590143b334f886c701260db0c8 Mon Sep 17 00:00:00 2001 From: Varkha Sharma Date: Mon, 9 Dec 2024 09:43:09 -0800 Subject: [PATCH 15/15] fixing errors found in dev testing --- cmflib/cmf.py | 6 +++--- cmflib/cmf_merger.py | 5 +++-- cmflib/cmf_server_methods.py | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cmflib/cmf.py b/cmflib/cmf.py index 88da94b9..f4f7a68b 100644 --- a/cmflib/cmf.py +++ b/cmflib/cmf.py @@ -63,7 +63,7 @@ 
log_dataset_with_version, log_model_with_version, log_execution_metrics_from_client, - log_metrics_from_client, + log_step_metrics_from_client, log_dataslice_from_client, ) @@ -1497,9 +1497,9 @@ def commit(self, custom_properties: t.Optional[t.Dict] = None) -> None: Cmf.log_model_with_version = log_model_with_version Cmf.log_execution_metrics_from_client = log_execution_metrics_from_client #Cmf.commit_existing_metrics = commit_existing_metrics -Cmf.log_metrics_from_client = log_metrics_from_client +Cmf.log_step_metrics_from_client = log_step_metrics_from_client #Cmf.DataSlice.commit_existing = commit_existing -Cmf.log_dataslice_from_client = log_dataslice_from_client +Cmf.DataSlice.log_dataslice_from_client = log_dataslice_from_client def metadata_push(pipeline_name: str, filepath = "./mlmd", tensorboard_path: str = "", execution_id: str = ""): """ Pushes MLMD file to CMF-server. diff --git a/cmflib/cmf_merger.py b/cmflib/cmf_merger.py index 2f828a5e..449946fd 100644 --- a/cmflib/cmf_merger.py +++ b/cmflib/cmf_merger.py @@ -135,9 +135,10 @@ def parse_json_to_mlmd(mlmd_json, path_to_store: str, cmd: str, exec_id: Union[s cmf_class.log_execution_metrics_from_client(event["artifact"]["name"], custom_props) elif artifact_type == "Dataslice": dataslice = cmf_class.create_dataslice(event["artifact"]["name"]) - dataslice.log_dataslice_from_client(uri, custom_props) + dataslice.log_dataslice_from_client(uri, props, custom_props) elif artifact_type == "Step_Metrics": - cmf_class.log_metrics_from_client(event["artifact"]["name"], uri, custom_props) + cmf_class.log_step_metrics_from_client(event["artifact"]["name"], uri, props, + custom_props) elif artifact_type == "Environment": cmf_class.log_python_env_from_client(artifact_name, uri, props) else: diff --git a/cmflib/cmf_server_methods.py b/cmflib/cmf_server_methods.py index 6fc52aa6..5e955508 100644 --- a/cmflib/cmf_server_methods.py +++ b/cmflib/cmf_server_methods.py @@ -639,7 +639,7 @@ def log_execution_metrics_from_client(self, metrics_name: str, -def log_metrics_from_client(self, metrics_name: str, uri: str, props: t.Optional[t.Dict] = None, custom_properties: t.Optional[t.Dict] = None): +def log_step_metrics_from_client(self, metrics_name: str, uri: str, props: t.Optional[t.Dict] = None, custom_properties: t.Optional[t.Dict] = None): """ Commits existing metrics associated with the given URI to MLMD. Example: