
Commit

Issue #27: Migrate from os.path to Pathlib (#64)
* Issue #27: Migrate from os.path to Pathlib

Updates code to use pathlib.Path instead of os.path
Also remove `import os`

* formatting

* version bump

Co-authored-by: Ian Preston <[email protected]>
azimj and ianepreston authored Feb 13, 2021
1 parent 789acf0 commit 63da8ef
Showing 3 changed files with 90 additions and 72 deletions.
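Every change in sc.py below follows the same small mapping from os.path functions to pathlib.Path methods. As a quick reference, here is a minimal standalone sketch of those equivalences (not code from the repo); the directory handling mirrors the `pathlib.Path(path) if path else pathlib.Path()` idiom used throughout, and the file names are made up for illustration:

```python
import pathlib

# Stand-in for the `path` argument used throughout sc.py;
# None falls back to the current directory.
path = None
path = pathlib.Path(path) if path else pathlib.Path()

# os.path.join(path, name)          ->  path / name
zip_file = path / "12100001-eng.zip"  # made-up product id

# os.path.isfile(zip_file)          ->  zip_file.is_file()
needs_download = not zip_file.is_file()

# os.listdir(path) + str.endswith() ->  path.glob("*.json")
jsons = list(path.glob("*.json"))

# os.path.basename(zip_file)        ->  zip_file.name
print(zip_file.name, needs_download, len(jsons))

# os.remove(zip_file)               ->  zip_file.unlink()  (not run in this sketch)
```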
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "stats_can"
-version = "2.4.0"
+version = "2.5.0"
 description = "Read StatsCan data into python, mostly pandas dataframes"
 authors = ["Ian Preston"]
 license = "GPL-3.0-or-later"
79 changes: 40 additions & 39 deletions src/stats_can/sc.py
@@ -8,7 +8,7 @@
 within a date range
 """
 import json
-import os
+import pathlib
 import zipfile
 
 import h5py
@@ -79,7 +79,7 @@ def download_tables(tables, path=None, csv=True):
     ----------
     tables: list of str
         tables to be downloaded
-    path: str, default: None (will do current directory)
+    path: str or path object, default: None (will do current directory)
         Where to download the table and json
     csv: boolean, default True
         download in CSV format, if not download SDMX
@@ -89,20 +89,21 @@ def download_tables(tables, path=None, csv=True):
     downloaded: list
         list of tables that were downloaded
     """
+    path = pathlib.Path(path) if path else pathlib.Path()
     metas = get_cube_metadata(tables)
     for meta in metas:
         product_id = meta["productId"]
         zip_url = get_full_table_download(product_id, csv=csv)
-        zip_file = product_id + "-eng.zip" if csv else product_id + ".zip"
-        json_file = product_id + ".json"
-        if path:
-            zip_file = os.path.join(path, zip_file)
-            json_file = os.path.join(path, json_file)
+        zip_file_name = product_id + ("-eng.zip" if csv else ".zip")
+        json_file_name = product_id + ".json"
+        zip_file = path / zip_file_name
+        json_file = path / json_file_name
 
         # Thanks http://evanhahn.com/python-requests-library-useragent/
         response = requests.get(zip_url, stream=True, headers={"user-agent": None})
 
         progress_bar = tqdm(
-            desc=os.path.basename(zip_file),
+            desc=zip_file_name,
             total=int(response.headers.get("content-length", 0)),
             unit="B",
             unit_scale=True,
@@ -132,7 +133,7 @@ def zip_update_tables(path=None, csv=True):
     Parameters
     ----------
-    path: str, default: None
+    path: str or pathlib.Path, default: None
         where to look for tables to update
     csv: boolean, default: True
         Downloads updates in CSV form by default, SDMX if false
@@ -165,20 +166,20 @@ def zip_table_to_dataframe(table, path=None):
     ----------
     table: str
         the table to load to dataframe from zipped csv
-    path: str, default: current working directory when module is loaded
+    path: str or pathlib.Path, default: current working directory when module is loaded
         where to download the tables or load them
 
     Returns
     -------
     df: pandas.DataFrame
         the table as a dataframe
     """
+    path = pathlib.Path(path) if path else pathlib.Path()
     # Parse tables returns a list, can only do one table at a time here though
     table = parse_tables(table)[0]
     table_zip = table + "-eng.zip"
-    if path:
-        table_zip = os.path.join(path, table_zip)
-    if not os.path.isfile(table_zip):
+    table_zip = path / table_zip
+    if not table_zip.is_file():
         download_tables([table], path)
     csv_file = table + ".csv"
     with zipfile.ZipFile(table_zip) as myzip:
@@ -237,9 +238,8 @@ def list_zipped_tables(path=None):
         list of available tables json data
     """
     # Find json files
-    jsons = [f for f in os.listdir(path) if f.endswith(".json")]
-    if path:
-        jsons = [os.path.join(path, j) for j in jsons]
+    path = pathlib.Path(path) if path else pathlib.Path.cwd()
+    jsons = path.glob("*.json")
     tables = []
     for j in jsons:
         try:
@@ -270,18 +270,19 @@ def tables_to_h5(tables, h5file="stats_can.h5", path=None):
     tables: list
         list of tables loaded into the file
     """
-    if path:
-        h5file = os.path.join(path, h5file)
+    path = pathlib.Path(path) if path else pathlib.Path()
+    h5file = path / h5file
     tables = parse_tables(tables)
+    path = h5file.parent
+
     for table in tables:
         hkey = "table_" + table
         jkey = "json_" + table
         zip_file = table + "-eng.zip"
         json_file = table + ".json"
-        if path:
-            zip_file = os.path.join(path, zip_file)
-            json_file = os.path.join(path, json_file)
-        if not os.path.isfile(json_file):
+        zip_file = path / zip_file
+        json_file = path / json_file
+        if not json_file.is_file():
             download_tables([table], path)
         df = zip_table_to_dataframe(table, path=path)
         with open(json_file) as f_name:
@@ -292,8 +293,8 @@ def tables_to_h5(tables, h5file="stats_can.h5", path=None):
             if jkey in hfile.keys():
                 del hfile[jkey]
             hfile.create_dataset(jkey, data=json.dumps(df_json))
-        os.remove(zip_file)
-        os.remove(json_file)
+        zip_file.unlink()
+        json_file.unlink()
     return tables


@@ -314,8 +315,9 @@ def table_from_h5(table, h5file="stats_can.h5", path=None):
     df: pd.DataFrame
         table in dataframe format
     """
+    path = pathlib.Path(path) if path else pathlib.Path()
     table = "table_" + parse_tables(table)[0]
-    h5 = os.path.join(path, h5file) if path else h5file
+    h5 = path / h5file
     try:
         with pd.HDFStore(h5, "r") as store:
             df = pd.read_hdf(store, key=table)
@@ -343,8 +345,8 @@ def metadata_from_h5(tables, h5file="stats_can.h5", path=None):
     -------
     list of local table metadata
     """
-    if path:
-        h5file = os.path.join(path, h5file)
+    path = pathlib.Path(path) if path else pathlib.Path()
+    h5file = path / h5file
     tables = ["json_" + tbl for tbl in parse_tables(tables)]
     jsons = []
     try:
@@ -425,7 +427,7 @@ def h5_update_tables(h5file="stats_can.h5", path=None, tables=None):
     if tables:
         local_jsons = metadata_from_h5(tables, h5file=h5file, path=path)
     else:
-        h5 = os.path.join(path, h5file) if path else h5file
+        h5 = path / h5file if path else pathlib.Path(h5file)
         with h5py.File(h5, "r") as f:
             keys = [key for key in f.keys() if key.startswith("json")]
             local_jsons = [json.loads(f[key][()]) for key in keys]
@@ -488,8 +490,7 @@ def h5_included_keys(h5file="stats_can.h5", path=None):
     keys: list
         list of keys in the hdf5 file
     """
-    if path:
-        h5file = os.path.join(path, h5file)
+    h5file = path / h5file if path else pathlib.Path(h5file)
     with h5py.File(h5file, "r") as f:
         keys = [key for key in f.keys()]
     return keys
@@ -502,7 +503,7 @@ def delete_tables(tables, path=None, h5file="stats_can.h5", csv=True):
     ----------
     tables: list
         list of tables to delete
-    path: str or os path object, default None
+    path: str or path object, default None
         where to look for the tables to delete
     h5file: str default stats_can.h5
         h5file to remove from, set to None to remove zips
@@ -514,6 +515,7 @@ def delete_tables(tables, path=None, h5file="stats_can.h5", csv=True):
     to_delete: list
         list of deleted tables
     """
+    path = pathlib.Path(path) if path else pathlib.Path()
     clean_tables = parse_tables(tables)
     available_tables_jsons = list_downloaded_tables(path=path, h5file=h5file)
     available_tables = [j["productId"] for j in available_tables_jsons]
@@ -525,23 +527,22 @@ def delete_tables(tables, path=None, h5file="stats_can.h5", csv=True):
             tbl_to_del = "table_" + td
             keys_to_del.append(json_to_del)
             keys_to_del.append(tbl_to_del)
-        if path:
-            h5file = os.path.join(path, h5file)
+        h5file = path / h5file
         with h5py.File(h5file, "a") as f:
             for k in keys_to_del:
                 del f[k]
     else:
         files_to_del = []
         for td in to_delete:
             json_to_del = td + ".json"
-            zip_to_del = td + "-eng.zip" if csv else td + ".zip"
+            zip_to_del = td + ("-eng.zip" if csv else ".zip")
             files_to_del.append(zip_to_del)
             files_to_del.append(json_to_del)
-        if path:
-            files_to_del = [os.path.join(path, f) for f in files_to_del]
         for file in files_to_del:
-            if os.path.exists(file):
-                os.remove(file)
+            p = path / file
+            if p.is_file():
+                p.unlink()
 
     return to_delete


@@ -624,7 +625,7 @@ def vectors_to_df_local(vectors, path=None, start_date=None, h5file="stats_can.h5"):
     ----------
     vectors: list
         list of vectors to be read in
-    path: str or os path, default None
+    path: str or path object, default None
         path to StatsCan tables
     start_date: datetime, optional, default None
         optional earliest reference date to include
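With this migration, every function that takes a `path` argument accepts either a string or a path object, as the updated docstrings note. A minimal usage sketch, assuming stats_can 2.5.0 is installed; the import path matches src/stats_can/sc.py shown above, and the table id is a placeholder rather than anything from this commit:

```python
import pathlib

from stats_can import sc  # sc module from src/stats_can/sc.py above

# A pathlib.Path now works anywhere a directory string used to be required.
data_dir = pathlib.Path("statscan_data")
data_dir.mkdir(exist_ok=True)

sc.download_tables(["18100004"], path=data_dir)           # placeholder table id
df = sc.zip_table_to_dataframe("18100004", path=data_dir)
print(df.head())
```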
