
Commit

Issue #27: Migrate from os.path to Pathlib (#64)
* Issue #27: Migrate from os.path to Pathlib

Updates code to use pathlib.Path instead of os.path
Also remove `import os`

* formatting

* version bump

Co-authored-by: Ian Preston <[email protected]>
azimj and ianepreston authored Feb 13, 2021
1 parent 789acf0 commit 63da8ef
Showing 3 changed files with 90 additions and 72 deletions.
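Every change in sc.py below follows the same small mapping from os.path functions to pathlib.Path methods. As a quick reference, here is a minimal standalone sketch of those equivalences (not code from the repo); the directory handling mirrors the `pathlib.Path(path) if path else pathlib.Path()` idiom used throughout, and the file names are made up for illustration:

```python
import pathlib

# Stand-in for the `path` argument used throughout sc.py;
# None falls back to the current directory.
path = None
path = pathlib.Path(path) if path else pathlib.Path()

# os.path.join(path, name)          ->  path / name
zip_file = path / "12100001-eng.zip"  # made-up product id

# os.path.isfile(zip_file)          ->  zip_file.is_file()
needs_download = not zip_file.is_file()

# os.listdir(path) + str.endswith() ->  path.glob("*.json")
jsons = list(path.glob("*.json"))

# os.path.basename(zip_file)        ->  zip_file.name
print(zip_file.name, needs_download, len(jsons))

# os.remove(zip_file)               ->  zip_file.unlink()  (not run in this sketch)
```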
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "stats_can"
-version = "2.4.0"
+version = "2.5.0"
 description = "Read StatsCan data into python, mostly pandas dataframes"
 authors = ["Ian Preston"]
 license = "GPL-3.0-or-later"
79 changes: 40 additions & 39 deletions src/stats_can/sc.py
@@ -8,7 +8,7 @@
 within a date range
 """
 import json
-import os
+import pathlib
 import zipfile
 
 import h5py
@@ -79,7 +79,7 @@ def download_tables(tables, path=None, csv=True):
     ----------
     tables: list of str
         tables to be downloaded
-    path: str, default: None (will do current directory)
+    path: str or path object, default: None (will do current directory)
         Where to download the table and json
     csv: boolean, default True
         download in CSV format, if not download SDMX
@@ -89,20 +89,21 @@ def download_tables(tables, path=None, csv=True):
     downloaded: list
         list of tables that were downloaded
     """
+    path = pathlib.Path(path) if path else pathlib.Path()
     metas = get_cube_metadata(tables)
     for meta in metas:
         product_id = meta["productId"]
         zip_url = get_full_table_download(product_id, csv=csv)
-        zip_file = product_id + "-eng.zip" if csv else product_id + ".zip"
-        json_file = product_id + ".json"
-        if path:
-            zip_file = os.path.join(path, zip_file)
-            json_file = os.path.join(path, json_file)
+        zip_file_name = product_id + ("-eng.zip" if csv else ".zip")
+        json_file_name = product_id + ".json"
+        zip_file = path / zip_file_name
+        json_file = path / json_file_name
 
         # Thanks http://evanhahn.com/python-requests-library-useragent/
         response = requests.get(zip_url, stream=True, headers={"user-agent": None})
 
         progress_bar = tqdm(
-            desc=os.path.basename(zip_file),
+            desc=zip_file_name,
             total=int(response.headers.get("content-length", 0)),
             unit="B",
             unit_scale=True,
@@ -132,7 +133,7 @@ def zip_update_tables(path=None, csv=True):
     Parameters
     ----------
-    path: str, default: None
+    path: str or pathlib.Path, default: None
         where to look for tables to update
     csv: boolean, default: True
         Downloads updates in CSV form by default, SDMX if false
@@ -165,20 +166,20 @@ def zip_table_to_dataframe(table, path=None):
     ----------
     table: str
         the table to load to dataframe from zipped csv
-    path: str, default: current working directory when module is loaded
+    path: str or pathlib.Path, default: current working directory when module is loaded
         where to download the tables or load them
 
     Returns
     -------
     df: pandas.DataFrame
         the table as a dataframe
     """
+    path = pathlib.Path(path) if path else pathlib.Path()
     # Parse tables returns a list, can only do one table at a time here though
     table = parse_tables(table)[0]
     table_zip = table + "-eng.zip"
-    if path:
-        table_zip = os.path.join(path, table_zip)
-    if not os.path.isfile(table_zip):
+    table_zip = path / table_zip
+    if not table_zip.is_file():
         download_tables([table], path)
     csv_file = table + ".csv"
     with zipfile.ZipFile(table_zip) as myzip:
@@ -237,9 +238,8 @@ def list_zipped_tables(path=None):
         list of available tables json data
     """
     # Find json files
-    jsons = [f for f in os.listdir(path) if f.endswith(".json")]
-    if path:
-        jsons = [os.path.join(path, j) for j in jsons]
+    path = pathlib.Path(path) if path else pathlib.Path.cwd()
+    jsons = path.glob("*.json")
     tables = []
     for j in jsons:
         try:
@@ -270,18 +270,19 @@ def tables_to_h5(tables, h5file="stats_can.h5", path=None):
     tables: list
         list of tables loaded into the file
     """
-    if path:
-        h5file = os.path.join(path, h5file)
+    path = pathlib.Path(path) if path else pathlib.Path()
+    h5file = path / h5file
     tables = parse_tables(tables)
+    path = h5file.parent
+
     for table in tables:
         hkey = "table_" + table
         jkey = "json_" + table
         zip_file = table + "-eng.zip"
         json_file = table + ".json"
-        if path:
-            zip_file = os.path.join(path, zip_file)
-            json_file = os.path.join(path, json_file)
-        if not os.path.isfile(json_file):
+        zip_file = path / zip_file
+        json_file = path / json_file
+        if not json_file.is_file():
             download_tables([table], path)
         df = zip_table_to_dataframe(table, path=path)
         with open(json_file) as f_name:
@@ -292,8 +293,8 @@ def tables_to_h5(tables, h5file="stats_can.h5", path=None):
             if jkey in hfile.keys():
                 del hfile[jkey]
             hfile.create_dataset(jkey, data=json.dumps(df_json))
-        os.remove(zip_file)
-        os.remove(json_file)
+        zip_file.unlink()
+        json_file.unlink()
     return tables


@@ -314,8 +315,9 @@ def table_from_h5(table, h5file="stats_can.h5", path=None):
     df: pd.DataFrame
         table in dataframe format
     """
+    path = pathlib.Path(path) if path else pathlib.Path()
     table = "table_" + parse_tables(table)[0]
-    h5 = os.path.join(path, h5file) if path else h5file
+    h5 = path / h5file
     try:
         with pd.HDFStore(h5, "r") as store:
             df = pd.read_hdf(store, key=table)
@@ -343,8 +345,8 @@ def metadata_from_h5(tables, h5file="stats_can.h5", path=None):
     -------
     list of local table metadata
     """
-    if path:
-        h5file = os.path.join(path, h5file)
+    path = pathlib.Path(path) if path else pathlib.Path()
+    h5file = path / h5file
     tables = ["json_" + tbl for tbl in parse_tables(tables)]
     jsons = []
     try:
@@ -425,7 +427,7 @@ def h5_update_tables(h5file="stats_can.h5", path=None, tables=None):
     if tables:
         local_jsons = metadata_from_h5(tables, h5file=h5file, path=path)
     else:
-        h5 = os.path.join(path, h5file) if path else h5file
+        h5 = path / h5file if path else pathlib.Path(h5file)
         with h5py.File(h5, "r") as f:
             keys = [key for key in f.keys() if key.startswith("json")]
             local_jsons = [json.loads(f[key][()]) for key in keys]
@@ -488,8 +490,7 @@ def h5_included_keys(h5file="stats_can.h5", path=None):
     keys: list
         list of keys in the hdf5 file
     """
-    if path:
-        h5file = os.path.join(path, h5file)
+    h5file = path / h5file if path else pathlib.Path(h5file)
     with h5py.File(h5file, "r") as f:
         keys = [key for key in f.keys()]
     return keys
@@ -502,7 +503,7 @@ def delete_tables(tables, path=None, h5file="stats_can.h5", csv=True):
     ----------
     tables: list
         list of tables to delete
-    path: str or os path object, default None
+    path: str or path object, default None
         where to look for the tables to delete
     h5file: str default stats_can.h5
         h5file to remove from, set to None to remove zips
@@ -514,6 +515,7 @@ def delete_tables(tables, path=None, h5file="stats_can.h5", csv=True):
     to_delete: list
         list of deleted tables
     """
+    path = pathlib.Path(path) if path else pathlib.Path()
     clean_tables = parse_tables(tables)
     available_tables_jsons = list_downloaded_tables(path=path, h5file=h5file)
     available_tables = [j["productId"] for j in available_tables_jsons]
@@ -525,23 +527,22 @@ def delete_tables(tables, path=None, h5file="stats_can.h5", csv=True):
             tbl_to_del = "table_" + td
             keys_to_del.append(json_to_del)
             keys_to_del.append(tbl_to_del)
-        if path:
-            h5file = os.path.join(path, h5file)
+        h5file = path / h5file
         with h5py.File(h5file, "a") as f:
             for k in keys_to_del:
                 del f[k]
     else:
         files_to_del = []
         for td in to_delete:
             json_to_del = td + ".json"
-            zip_to_del = td + "-eng.zip" if csv else td + ".zip"
+            zip_to_del = td + ("-eng.zip" if csv else ".zip")
             files_to_del.append(zip_to_del)
             files_to_del.append(json_to_del)
-        if path:
-            files_to_del = [os.path.join(path, f) for f in files_to_del]
         for file in files_to_del:
-            if os.path.exists(file):
-                os.remove(file)
+            p = path / file
+            if p.is_file():
+                p.unlink()
 
     return to_delete


@@ -624,7 +625,7 @@ def vectors_to_df_local(vectors, path=None, start_date=None, h5file="stats_can.h5"):
     ----------
     vectors: list
         list of vectors to be read in
-    path: str or os path, default None
+    path: str or path object, default None
         path to StatsCan tables
     start_date: datetime, optional, default None
         optional earliest reference date to include
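With this migration, every function that takes a `path` argument accepts either a string or a path object, as the updated docstrings note. A minimal usage sketch, assuming stats_can 2.5.0 is installed; the import path matches src/stats_can/sc.py shown above, and the table id is a placeholder rather than anything from this commit:

```python
import pathlib

from stats_can import sc  # sc module from src/stats_can/sc.py above

# A pathlib.Path now works anywhere a directory string used to be required.
data_dir = pathlib.Path("statscan_data")
data_dir.mkdir(exist_ok=True)

sc.download_tables(["18100004"], path=data_dir)           # placeholder table id
df = sc.zip_table_to_dataframe("18100004", path=data_dir)
print(df.head())
```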
