-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #116 from palewire/no-io
Remove disk I/O from the downloader
- Loading branch information
Showing
8 changed files
with
155 additions
and
56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
.DS_Store | ||
docs/_build | ||
*.pyc | ||
cpi/data/ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
include LICENSE | ||
include README.md | ||
include cpi/cpi.db | ||
recursive-include notebooks * | ||
recursive-include notebooks * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
"""Download the latest annual Consumer Price Index (CPI) dataset.""" | ||
import csv | ||
import io | ||
import logging | ||
import sqlite3 | ||
import typing | ||
|
@@ -45,87 +45,70 @@ class Downloader: | |
"cu.data.20.USCommoditiesServicesSpecial", | ||
] | ||
|
||
def get_data_dir(self) -> Path: | ||
"""Return the directory Path where data will be stored.""" | ||
data_dir = self.THIS_DIR / "data" | ||
data_dir.mkdir(exist_ok=True, parents=True) | ||
return data_dir | ||
|
||
def rm(self): | ||
def rm(self) -> None: | ||
"""Remove any existing files.""" | ||
db_path = self.THIS_DIR / "cpi.db" | ||
if db_path.exists(): | ||
logger.debug(f"Deleting {db_path}") | ||
db_path.unlink() | ||
data_dir = self.get_data_dir() | ||
for f in data_dir.glob(".csv"): | ||
logger.debug(f"Deleting {f}") | ||
f.unlink() | ||
for f in data_dir.glob(".tsv"): | ||
logger.debug(f"Deleting {f}") | ||
f.unlink() | ||
|
||
def update(self): | ||
|
||
def update(self) -> None: | ||
"""Update the Consumer Price Index dataset that powers this library.""" | ||
# Delete existing files | ||
self.rm() | ||
|
||
# Download the TSVs | ||
logger.debug(f"Downloading {len(self.FILE_LIST)} files from the BLS") | ||
[self.get_tsv(file) for file in self.FILE_LIST] | ||
df_list = {name: self.get_df(name) for name in self.FILE_LIST} | ||
|
||
# Insert the TSVs | ||
logger.debug("Loading data into SQLite database") | ||
[self.insert_tsv(file) for file in self.FILE_LIST] | ||
|
||
def insert_tsv(self, file: str): | ||
"""Load the provided TSV file.""" | ||
# Connect to db | ||
db_path = self.THIS_DIR / "cpi.db" | ||
conn = sqlite3.connect(db_path) | ||
|
||
# Read file | ||
logger.debug(f" - {file}") | ||
csv_path = self.get_data_dir() / f"{file}.csv" | ||
csv_reader = list(csv.DictReader(open(csv_path))) | ||
|
||
# Convert it to a DataFrame | ||
df = pd.DataFrame(csv_reader) | ||
df.drop([None], axis=1, inplace=True, errors="ignore") | ||
|
||
# Write file to db | ||
df.to_sql(file, conn, if_exists="replace", index=False) | ||
# Load them one by one | ||
for name, df in df_list.items(): | ||
logger.debug(f"- {name}") | ||
df.to_sql(name, conn, if_exists="replace", index=False) | ||
|
||
# Close connection | ||
conn.close() | ||
|
||
def get_tsv(self, file: str): | ||
def get_df(self, file: str) -> pd.DataFrame: | ||
"""Download TSV file from the BLS.""" | ||
# Download it | ||
url = f"https://download.bls.gov/pub/time.series/cu/{file}" | ||
logger.debug(f" - {url}") | ||
tsv_path = self.get_data_dir() / f"{file}.tsv" | ||
headers = { | ||
"User-Agent": "[email protected]", | ||
} | ||
response = requests.get(url, headers=headers, timeout=30) | ||
|
||
# Make sure the response is legit | ||
try: | ||
assert response.ok | ||
except AssertionError: | ||
logger.error(f"Error downloading {url}") | ||
logger.error(f"Response: {response.text}") | ||
raise AssertionError(f"Error downloading {url}") | ||
with open(tsv_path, "w") as fp: | ||
fp.write(response.text) | ||
|
||
# Convert it to csv | ||
with open(tsv_path) as in_file: | ||
reader = csv.reader(in_file, delimiter="\t") | ||
csv_path = self.get_data_dir() / f"{file}.csv" | ||
with open(csv_path, "w") as out_file: | ||
writer = csv.writer(out_file) | ||
for row in reader: | ||
writer.writerow([cell.strip() for cell in row]) | ||
raise AssertionError(f"Error downloading {url} - {response.text}") | ||
|
||
# Parse the response text into a DataFrame via an in-memory io.StringIO buffer | |
df = pd.read_csv(io.StringIO(response.text), sep="\t") | ||
|
||
# .strip() every value in the dataframe | ||
df_obj = df.select_dtypes("object") | ||
df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip()) | ||
|
||
# .strip() every column name | |
df.columns = [c.strip() for c in df.columns] | ||
|
||
# Drop any column labelled None, if one is present | |
df.drop([None], axis=1, inplace=True, errors="ignore") | ||
|
||
# Pass it back | ||
return df | ||
|
||
|
||
if __name__ == "__main__": | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
sphinx | ||
myst-parser | ||
sphinx-palewire-theme | ||
sphinx-palewire-theme |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
{"all": {"latest_month": "2024-07-01", "latest_change": 0.2, "previous_month": "2024-06-01", "previous_change": -0.1}, "food": {"latest_month": "2024-07-01", "latest_change": 0.2, "previous_month": "2024-06-01", "previous_change": 0.2}, "energy": {"latest_month": "2024-07-01", "latest_change": 0.0, "previous_month": "2024-06-01", "previous_change": -2.0}, "less_food_and_energy": {"latest_month": "2024-07-01", "latest_change": 0.2, "previous_month": "2024-06-01", "previous_change": 0.1}, "yoy_change": 2.9} | ||
{"all": {"latest_month": "2024-07-01", "latest_change": 0.2, "previous_month": "2024-06-01", "previous_change": -0.1}, "food": {"latest_month": "2024-07-01", "latest_change": 0.2, "previous_month": "2024-06-01", "previous_change": 0.2}, "energy": {"latest_month": "2024-07-01", "latest_change": 0.0, "previous_month": "2024-06-01", "previous_change": -2.0}, "less_food_and_energy": {"latest_month": "2024-07-01", "latest_change": 0.2, "previous_month": "2024-06-01", "previous_change": 0.1}, "yoy_change": 2.9} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "a5b27483-80ce-49cd-b363-f38e66b3b84f", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"import sys\n", | ||
"this_dir = os.path.dirname(os.getcwd())\n", | ||
"sys.path.insert(0, this_dir)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "16a4f9b1-616e-4f27-ab4b-c91a5bc3459d", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"ename": "OperationalError", | ||
"evalue": "no such table: cu.area", | ||
"output_type": "error", | ||
"traceback": [ | ||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | ||
"\u001b[0;31mOperationalError\u001b[0m Traceback (most recent call last)", | ||
"Cell \u001b[0;32mIn[5], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mcpi\u001b[39;00m\n", | ||
"File \u001b[0;32m~/Code/cpi/cpi/__init__.py:21\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# Parse data for use\u001b[39;00m\n\u001b[1;32m 20\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mParsing data files from the BLS\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 21\u001b[0m areas \u001b[38;5;241m=\u001b[39m \u001b[43mparsers\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mParseArea\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 22\u001b[0m items \u001b[38;5;241m=\u001b[39m parsers\u001b[38;5;241m.\u001b[39mParseItem()\u001b[38;5;241m.\u001b[39mparse()\n\u001b[1;32m 23\u001b[0m periods \u001b[38;5;241m=\u001b[39m parsers\u001b[38;5;241m.\u001b[39mParsePeriod()\u001b[38;5;241m.\u001b[39mparse()\n", | ||
"File \u001b[0;32m~/Code/cpi/cpi/parsers.py:60\u001b[0m, in \u001b[0;36mParseArea.parse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 58\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mParsing area file\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 59\u001b[0m object_list \u001b[38;5;241m=\u001b[39m MappingList()\n\u001b[0;32m---> 60\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m row \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_file\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcu.area\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 61\u001b[0m obj \u001b[38;5;241m=\u001b[39m Area(row[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124marea_code\u001b[39m\u001b[38;5;124m\"\u001b[39m], row[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124marea_name\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 62\u001b[0m object_list\u001b[38;5;241m.\u001b[39mappend(obj)\n", | ||
"File \u001b[0;32m~/Code/cpi/cpi/parsers.py:37\u001b[0m, in \u001b[0;36mBaseParser.get_file\u001b[0;34m(self, file)\u001b[0m\n\u001b[1;32m 34\u001b[0m cursor \u001b[38;5;241m=\u001b[39m conn\u001b[38;5;241m.\u001b[39mcursor()\n\u001b[1;32m 36\u001b[0m \u001b[38;5;66;03m# Query this file\u001b[39;00m\n\u001b[0;32m---> 37\u001b[0m query \u001b[38;5;241m=\u001b[39m \u001b[43mcursor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mSELECT * FROM \u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mfile\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 38\u001b[0m columns \u001b[38;5;241m=\u001b[39m [d[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m d \u001b[38;5;129;01min\u001b[39;00m query\u001b[38;5;241m.\u001b[39mdescription]\n\u001b[1;32m 39\u001b[0m result_list \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mdict\u001b[39m(\u001b[38;5;28mzip\u001b[39m(columns, r)) \u001b[38;5;28;01mfor\u001b[39;00m r \u001b[38;5;129;01min\u001b[39;00m query\u001b[38;5;241m.\u001b[39mfetchall()]\n", | ||
"\u001b[0;31mOperationalError\u001b[0m: no such table: cu.area" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import cpi" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"id": "588fdeea-b7b5-4be0-a590-13f2ed8a1e35", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"33.3 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"%timeit -r 1 cpi.update()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "ff4d1a59-46db-4a76-af45-38b9771831bc", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"ename": "NameError", | ||
"evalue": "name 'cpi' is not defined", | ||
"output_type": "error", | ||
"traceback": [ | ||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | ||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", | ||
"Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mget_ipython\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_line_magic\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtimeit\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m-r 1 cpi.update()\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n", | ||
"File \u001b[0;32m~/.local/share/virtualenvs/cpi-bt44Qfrb/lib/python3.9/site-packages/IPython/core/interactiveshell.py:2456\u001b[0m, in \u001b[0;36mInteractiveShell.run_line_magic\u001b[0;34m(self, magic_name, line, _stack_depth)\u001b[0m\n\u001b[1;32m 2454\u001b[0m kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlocal_ns\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_local_scope(stack_depth)\n\u001b[1;32m 2455\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuiltin_trap:\n\u001b[0;32m-> 2456\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2458\u001b[0m \u001b[38;5;66;03m# The code below prevents the output from being displayed\u001b[39;00m\n\u001b[1;32m 2459\u001b[0m \u001b[38;5;66;03m# when using magics with decorator @output_can_be_silenced\u001b[39;00m\n\u001b[1;32m 2460\u001b[0m \u001b[38;5;66;03m# when the last Python token in the expression is a ';'.\u001b[39;00m\n\u001b[1;32m 2461\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(fn, magic\u001b[38;5;241m.\u001b[39mMAGIC_OUTPUT_CAN_BE_SILENCED, \u001b[38;5;28;01mFalse\u001b[39;00m):\n", | ||
"File \u001b[0;32m~/.local/share/virtualenvs/cpi-bt44Qfrb/lib/python3.9/site-packages/IPython/core/magics/execution.py:1185\u001b[0m, in \u001b[0;36mExecutionMagics.timeit\u001b[0;34m(self, line, cell, local_ns)\u001b[0m\n\u001b[1;32m 1183\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m index \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m10\u001b[39m):\n\u001b[1;32m 1184\u001b[0m number \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m10\u001b[39m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m index\n\u001b[0;32m-> 1185\u001b[0m time_number \u001b[38;5;241m=\u001b[39m \u001b[43mtimer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtimeit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnumber\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1186\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m time_number \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.2\u001b[39m:\n\u001b[1;32m 1187\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n", | ||
"File \u001b[0;32m~/.local/share/virtualenvs/cpi-bt44Qfrb/lib/python3.9/site-packages/IPython/core/magics/execution.py:173\u001b[0m, in \u001b[0;36mTimer.timeit\u001b[0;34m(self, number)\u001b[0m\n\u001b[1;32m 171\u001b[0m gc\u001b[38;5;241m.\u001b[39mdisable()\n\u001b[1;32m 172\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 173\u001b[0m timing \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minner\u001b[49m\u001b[43m(\u001b[49m\u001b[43mit\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtimer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 174\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 175\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m gcold:\n", | ||
"File \u001b[0;32m<magic-timeit>:1\u001b[0m, in \u001b[0;36minner\u001b[0;34m(_it, _timer)\u001b[0m\n", | ||
"\u001b[0;31mNameError\u001b[0m: name 'cpi' is not defined" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"%timeit -r 1 cpi.update()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "dd70c615-81dd-4d66-8498-c3ae2b37b10d", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.16" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |