Skip to content

Commit

Permalink
Merge branch 'HewlettPackard:master' into Tensorflow
Browse files Browse the repository at this point in the history
  • Loading branch information
varkha-d-sharma authored Feb 21, 2024
2 parents 7ab2b03 + 72734c8 commit 867c5fb
Show file tree
Hide file tree
Showing 12 changed files with 386 additions and 35 deletions.
317 changes: 297 additions & 20 deletions cmflib/cmf.py

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion cmflib/cmf_merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,6 @@ def parse_json_to_mlmd(mlmd_json, path_to_store, cmd, exec_id):
)
elif artifact_type == "Metrics":
# print(props,'parse')
# cmf_class.log_execution_metrics_with_uuid(props, custom_props)
cmf_class.log_execution_metrics_from_client(event["artifact"]["name"], custom_props)
elif artifact_type == "Dataslice":
dataslice = cmf_class.create_dataslice(event["artifact"]["name"])
Expand Down
32 changes: 26 additions & 6 deletions cmflib/cmfquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,6 @@ class CmfQuery(object):
by users via CMF API. When methods in this class accept `name` parameters, it is expected that values of these
parameters are fully-qualified names of respective entities.
TODO: (sergey) need to provide concrete examples and detailed description on how to actually use methods of this
class correctly, e.g., how to determine these fully-qualified names.
Args:
filepath: Path to the MLMD database file.
Expand Down Expand Up @@ -648,7 +646,12 @@ def get_all_child_artifacts(self, artifact_name: str) -> pd.DataFrame:
return df

def get_one_hop_parent_artifacts(self, artifact_name: str) -> pd.DataFrame:
"""Return input artifacts for the execution that produced the given artifact."""
"""Return input artifacts for the execution that produced the given artifact.
Args:
artifact_name: Artifact name.
Returns:
Data frame containing immediate parent artifacts of given artifact.
"""
artifact: t.Optional = self._get_artifact(artifact_name)
if not artifact:
return pd.DataFrame()
Expand All @@ -660,7 +663,12 @@ def get_one_hop_parent_artifacts(self, artifact_name: str) -> pd.DataFrame:
)

def get_all_parent_artifacts(self, artifact_name: str) -> pd.DataFrame:
"""Return all upstream artifacts."""
"""Return all upstream artifacts.
Args:
artifact_name: Artifact name.
Returns:
Data frame containing all parent artifacts.
"""
df = pd.DataFrame()
d1 = self.get_one_hop_parent_artifacts(artifact_name)
# df = df.append(d1, sort=True, ignore_index=True)
Expand All @@ -673,7 +681,12 @@ def get_all_parent_artifacts(self, artifact_name: str) -> pd.DataFrame:
return df

def get_all_parent_executions(self, artifact_name: str) -> pd.DataFrame:
"""Return all executions that produced upstream artifacts for the given artifact."""
"""Return all executions that produced upstream artifacts for the given artifact.
Args:
artifact_name: Artifact name.
Returns:
Data frame containing all parent executions.
"""
parent_artifacts: pd.DataFrame = self.get_all_parent_artifacts(artifact_name)
if parent_artifacts.shape[0] == 0:
# If it's empty, there's no `id` column and the code below raises an exception.
Expand Down Expand Up @@ -728,7 +741,12 @@ def find_producer_execution(self, artifact_name: str) -> t.Optional[mlpb.Executi
get_producer_execution = find_producer_execution

def get_metrics(self, metrics_name: str) -> t.Optional[pd.DataFrame]:
"""Return metric data frame."""
"""Return metric data frame.
Args:
metrics_name: Metrics name.
Returns:
Data frame containing all metrics.
"""
for metric in self.store.get_artifacts_by_type("Step_Metrics"):
if metric.name == metrics_name:
name: t.Optional[str] = metric.custom_properties.get("Name", None)
Expand All @@ -749,6 +767,8 @@ def dumptojson(self, pipeline_name: str, exec_id: t.Optional[int] = None) -> t.O
Args:
pipeline_name: Name of an AI pipeline.
exec_id: Optional stage execution ID - filter stages by this execution ID.
Returns:
Pipeline in JSON format.
"""
if exec_id is not None:
exec_id = int(exec_id)
Expand Down
3 changes: 2 additions & 1 deletion cmflib/commands/artifact/pull.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ def extract_repo_args(self, type: str, name: str, url: str, current_directory: s
# url_with_bucket = varkha-test/23/6d9502e0283d91f689d7038b8508a2
# Splitting the string using '/' as the delimiter
bucket_name, object_name = url_with_bucket.split('/', 1)
download_loc = current_directory + "/" + name
download_loc = current_directory + "/" + name if current_directory != "" else name
print(download_loc)
return bucket_name, object_name, download_loc
else:
# returning bucket_name, object_name and download_loc returning as empty
Expand Down
4 changes: 2 additions & 2 deletions cmflib/commands/init/amazonS3.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,12 +127,12 @@ def add_parser(subparsers, parent_parser):
default=argparse.SUPPRESS,
)

required_arguments.add_argument(
parser.add_argument(
"--session-token",
required=True,
help="Specify Session Token.",
metavar="<session_token>",
default=argparse.SUPPRESS,
default="",
)

required_arguments.add_argument(
Expand Down
2 changes: 1 addition & 1 deletion cmflib/dvc_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,7 @@ def dvc_push() -> str:
process = subprocess.Popen(['dvc', 'push'],
stdout=subprocess.PIPE,
universal_newlines=True)
output, errs = process.communicate(timeout=60)
output, errs = process.communicate()
commit = output.strip()

except Exception as err:
Expand Down
4 changes: 3 additions & 1 deletion cmflib/storage_backends/amazonS3_artifacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ def download_artifacts(
aws_session_token=session_token
)
s3.head_bucket(Bucket=bucket_name)
dir_path, _ = download_loc.rsplit("/", 1)
dir_path = ""
if "/" in download_loc:
dir_path, _ = download_loc.rsplit("/", 1)
if dir_path != "":
os.makedirs(dir_path, mode=0o777, exist_ok=True) # creating subfolders if needed
response = s3.download_file(bucket_name, object_name, download_loc)
Expand Down
24 changes: 23 additions & 1 deletion docs/api/public/cmf.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,18 @@
# cmflib.cmf.Cmf
# cmflib.cmf

::: cmflib.cmf
options:
show_root_toc_entry: false
merge_init_into_class: true
docstring_style: google
members:
- metadata_push
- metadata_pull
- artifact_pull
- artifact_pull_single
- artifact_push
- cmf_init_show
- cmf_init

::: cmflib.cmf.Cmf
options:
Expand All @@ -8,9 +22,17 @@
members:
- __init__
- create_context
- merge_created_context
- create_execution
- update_execution
- merge_created_execution
- log_dataset
- log_dataset_with_version
- log_model
- log_model_with_version
- log_execution_metrics_from_client
- log_execution_metrics
- log_metric
- commit_existing_metrics
- create_dataslice
- update_dataslice
29 changes: 29 additions & 0 deletions docs/api/public/cmfquery.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# cmflib.cmfquery.CmfQuery

::: cmflib.cmfquery.CmfQuery
options:
show_root_toc_entry: false
merge_init_into_class: true
docstring_style: google
members:
- get_pipeline_names
- get_pipeline_id
- get_pipeline_stages
- get_all_exe_in_stage
- get_all_executions_by_ids_list
- get_all_artifacts_by_context
- get_all_artifacts_by_ids_list
- get_all_executions_in_stage
- get_artifact_df
- get_all_artifacts
- get_artifact
- get_all_artifacts_for_execution
- get_all_artifact_types
- get_all_executions_for_artifact
- get_one_hop_child_artifacts
- get_all_child_artifacts
- get_one_hop_parent_artifacts
- get_all_parent_artifacts
- get_all_parent_executions
- get_metrics
- dumptojson
1 change: 1 addition & 0 deletions mkdocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ nav:
- Public API:
- CMF: api/public/cmf.md
- DataSlice: api/public/dataslice.md
- CmfQuery: api/public/cmfquery.md
- Ontology:
- Ontology: common-metadata-ontology/readme.md

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "cmflib"
version = "0.0.7"
version = "0.0.8"
dependencies = [
"ml-metadata==1.11.0",
"dvc[ssh,s3]==2.27.0",
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from setuptools import setup, find_packages

VERSION = '0.0.7'
VERSION = '0.0.8'
DESCRIPTION = 'Metadata Python Package'
LONG_DESCRIPTION = 'Metadata framework storing AI metadata into MLMD'

Expand Down

0 comments on commit 867c5fb

Please sign in to comment.