Skip to content

Commit

Permalink
Merge pull request #116 from palewire/no-io
Browse files Browse the repository at this point in the history
Remove disk I/O from the downloader
  • Loading branch information
palewire authored Aug 31, 2024
2 parents a591764 + 8cc94b8 commit 0363ba6
Show file tree
Hide file tree
Showing 8 changed files with 155 additions and 56 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.DS_Store
docs/_build
*.pyc
cpi/data/
Expand Down
2 changes: 1 addition & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
include LICENSE
include README.md
include cpi/cpi.db
recursive-include notebooks *
recursive-include notebooks *
75 changes: 29 additions & 46 deletions cpi/download.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Download the latest annual Consumer Price Index (CPI) dataset."""
import csv
import io
import logging
import sqlite3
import typing
Expand Down Expand Up @@ -45,87 +45,70 @@ class Downloader:
"cu.data.20.USCommoditiesServicesSpecial",
]

def get_data_dir(self) -> Path:
"""Return the directory Path where data will be stored."""
data_dir = self.THIS_DIR / "data"
data_dir.mkdir(exist_ok=True, parents=True)
return data_dir

def rm(self):
def rm(self) -> None:
"""Remove any existing files."""
db_path = self.THIS_DIR / "cpi.db"
if db_path.exists():
logger.debug(f"Deleting {db_path}")
db_path.unlink()
data_dir = self.get_data_dir()
for f in data_dir.glob("*.csv"):
logger.debug(f"Deleting {f}")
f.unlink()
for f in data_dir.glob("*.tsv"):
logger.debug(f"Deleting {f}")
f.unlink()

def update(self):

def update(self) -> None:
"""Update the Consumer Price Index dataset that powers this library."""
# Delete existing files
self.rm()

# Download the TSVs
logger.debug(f"Downloading {len(self.FILE_LIST)} files from the BLS")
[self.get_tsv(file) for file in self.FILE_LIST]
df_list = {name: self.get_df(name) for name in self.FILE_LIST}

# Insert the TSVs
logger.debug("Loading data into SQLite database")
[self.insert_tsv(file) for file in self.FILE_LIST]

def insert_tsv(self, file: str):
"""Load the provided TSV file."""
# Connect to db
db_path = self.THIS_DIR / "cpi.db"
conn = sqlite3.connect(db_path)

# Read file
logger.debug(f" - {file}")
csv_path = self.get_data_dir() / f"{file}.csv"
csv_reader = list(csv.DictReader(open(csv_path)))

# Convert it to a DataFrame
df = pd.DataFrame(csv_reader)
df.drop([None], axis=1, inplace=True, errors="ignore")

# Write file to db
df.to_sql(file, conn, if_exists="replace", index=False)
# Load them one by one
for name, df in df_list.items():
logger.debug(f"- {name}")
df.to_sql(name, conn, if_exists="replace", index=False)

# Close connection
conn.close()

def get_tsv(self, file: str):
def get_df(self, file: str) -> pd.DataFrame:
"""Download TSV file from the BLS."""
# Download it
url = f"https://download.bls.gov/pub/time.series/cu/{file}"
logger.debug(f" - {url}")
tsv_path = self.get_data_dir() / f"{file}.tsv"
headers = {
"User-Agent": "[email protected]",
}
response = requests.get(url, headers=headers, timeout=30)

# Make sure the response is legit
try:
assert response.ok
except AssertionError:
logger.error(f"Error downloading {url}")
logger.error(f"Response: {response.text}")
raise AssertionError(f"Error downloading {url}")
with open(tsv_path, "w") as fp:
fp.write(response.text)

# Convert it to csv
with open(tsv_path) as in_file:
reader = csv.reader(in_file, delimiter="\t")
csv_path = self.get_data_dir() / f"{file}.csv"
with open(csv_path, "w") as out_file:
writer = csv.writer(out_file)
for row in reader:
writer.writerow([cell.strip() for cell in row])
raise AssertionError(f"Error downloading {url} - {response.text}")

# Read in the contents as an io.StringIO object
df = pd.read_csv(io.StringIO(response.text), sep="\t")

# .strip() every value in the dataframe
df_obj = df.select_dtypes("object")
df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())

# .strip every column name
df.columns = [c.strip() for c in df.columns]

# Clean file
df.drop([None], axis=1, inplace=True, errors="ignore")

# Pass it back
return df


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion docs/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ help:
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

livehtml:
sphinx-autobuild -b html $(SOURCEDIR) $(BUILDDIR)/html
sphinx-autobuild -b html $(SOURCEDIR) $(BUILDDIR)/html
2 changes: 1 addition & 1 deletion docs/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
sphinx
myst-parser
sphinx-palewire-theme
sphinx-palewire-theme
12 changes: 6 additions & 6 deletions notebooks/dataframes.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -13981,23 +13981,23 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python2"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
2 changes: 1 addition & 1 deletion notebooks/latest.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"all": {"latest_month": "2024-07-01", "latest_change": 0.2, "previous_month": "2024-06-01", "previous_change": -0.1}, "food": {"latest_month": "2024-07-01", "latest_change": 0.2, "previous_month": "2024-06-01", "previous_change": 0.2}, "energy": {"latest_month": "2024-07-01", "latest_change": 0.0, "previous_month": "2024-06-01", "previous_change": -2.0}, "less_food_and_energy": {"latest_month": "2024-07-01", "latest_change": 0.2, "previous_month": "2024-06-01", "previous_change": 0.1}, "yoy_change": 2.9}
{"all": {"latest_month": "2024-07-01", "latest_change": 0.2, "previous_month": "2024-06-01", "previous_change": -0.1}, "food": {"latest_month": "2024-07-01", "latest_change": 0.2, "previous_month": "2024-06-01", "previous_change": 0.2}, "energy": {"latest_month": "2024-07-01", "latest_change": 0.0, "previous_month": "2024-06-01", "previous_change": -2.0}, "less_food_and_energy": {"latest_month": "2024-07-01", "latest_change": 0.2, "previous_month": "2024-06-01", "previous_change": 0.1}, "yoy_change": 2.9}
115 changes: 115 additions & 0 deletions notebooks/speed-test.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"id": "a5b27483-80ce-49cd-b363-f38e66b3b84f",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"this_dir = os.path.dirname(os.getcwd())\n",
"sys.path.insert(0, this_dir)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "16a4f9b1-616e-4f27-ab4b-c91a5bc3459d",
"metadata": {},
"outputs": [
{
"ename": "OperationalError",
"evalue": "no such table: cu.area",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mOperationalError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[5], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mcpi\u001b[39;00m\n",
"File \u001b[0;32m~/Code/cpi/cpi/__init__.py:21\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# Parse data for use\u001b[39;00m\n\u001b[1;32m 20\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mParsing data files from the BLS\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 21\u001b[0m areas \u001b[38;5;241m=\u001b[39m \u001b[43mparsers\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mParseArea\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 22\u001b[0m items \u001b[38;5;241m=\u001b[39m parsers\u001b[38;5;241m.\u001b[39mParseItem()\u001b[38;5;241m.\u001b[39mparse()\n\u001b[1;32m 23\u001b[0m periods \u001b[38;5;241m=\u001b[39m parsers\u001b[38;5;241m.\u001b[39mParsePeriod()\u001b[38;5;241m.\u001b[39mparse()\n",
"File \u001b[0;32m~/Code/cpi/cpi/parsers.py:60\u001b[0m, in \u001b[0;36mParseArea.parse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 58\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mParsing area file\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 59\u001b[0m object_list \u001b[38;5;241m=\u001b[39m MappingList()\n\u001b[0;32m---> 60\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m row \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_file\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcu.area\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 61\u001b[0m obj \u001b[38;5;241m=\u001b[39m Area(row[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124marea_code\u001b[39m\u001b[38;5;124m\"\u001b[39m], row[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124marea_name\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 62\u001b[0m object_list\u001b[38;5;241m.\u001b[39mappend(obj)\n",
"File \u001b[0;32m~/Code/cpi/cpi/parsers.py:37\u001b[0m, in \u001b[0;36mBaseParser.get_file\u001b[0;34m(self, file)\u001b[0m\n\u001b[1;32m 34\u001b[0m cursor \u001b[38;5;241m=\u001b[39m conn\u001b[38;5;241m.\u001b[39mcursor()\n\u001b[1;32m 36\u001b[0m \u001b[38;5;66;03m# Query this file\u001b[39;00m\n\u001b[0;32m---> 37\u001b[0m query \u001b[38;5;241m=\u001b[39m \u001b[43mcursor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mSELECT * FROM \u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mfile\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 38\u001b[0m columns \u001b[38;5;241m=\u001b[39m [d[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m d \u001b[38;5;129;01min\u001b[39;00m query\u001b[38;5;241m.\u001b[39mdescription]\n\u001b[1;32m 39\u001b[0m result_list \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mdict\u001b[39m(\u001b[38;5;28mzip\u001b[39m(columns, r)) \u001b[38;5;28;01mfor\u001b[39;00m r \u001b[38;5;129;01min\u001b[39;00m query\u001b[38;5;241m.\u001b[39mfetchall()]\n",
"\u001b[0;31mOperationalError\u001b[0m: no such table: cu.area"
]
}
],
"source": [
"import cpi"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "588fdeea-b7b5-4be0-a590-13f2ed8a1e35",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"33.3 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
]
}
],
"source": [
"%timeit -r 1 cpi.update()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ff4d1a59-46db-4a76-af45-38b9771831bc",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'cpi' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mget_ipython\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_line_magic\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtimeit\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m-r 1 cpi.update()\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/.local/share/virtualenvs/cpi-bt44Qfrb/lib/python3.9/site-packages/IPython/core/interactiveshell.py:2456\u001b[0m, in \u001b[0;36mInteractiveShell.run_line_magic\u001b[0;34m(self, magic_name, line, _stack_depth)\u001b[0m\n\u001b[1;32m 2454\u001b[0m kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlocal_ns\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_local_scope(stack_depth)\n\u001b[1;32m 2455\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuiltin_trap:\n\u001b[0;32m-> 2456\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2458\u001b[0m \u001b[38;5;66;03m# The code below prevents the output from being displayed\u001b[39;00m\n\u001b[1;32m 2459\u001b[0m \u001b[38;5;66;03m# when using magics with decorator @output_can_be_silenced\u001b[39;00m\n\u001b[1;32m 2460\u001b[0m \u001b[38;5;66;03m# when the last Python token in the expression is a ';'.\u001b[39;00m\n\u001b[1;32m 2461\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(fn, magic\u001b[38;5;241m.\u001b[39mMAGIC_OUTPUT_CAN_BE_SILENCED, \u001b[38;5;28;01mFalse\u001b[39;00m):\n",
"File \u001b[0;32m~/.local/share/virtualenvs/cpi-bt44Qfrb/lib/python3.9/site-packages/IPython/core/magics/execution.py:1185\u001b[0m, in \u001b[0;36mExecutionMagics.timeit\u001b[0;34m(self, line, cell, local_ns)\u001b[0m\n\u001b[1;32m 1183\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m index \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m10\u001b[39m):\n\u001b[1;32m 1184\u001b[0m number \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m10\u001b[39m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m index\n\u001b[0;32m-> 1185\u001b[0m time_number \u001b[38;5;241m=\u001b[39m \u001b[43mtimer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtimeit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnumber\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1186\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m time_number \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.2\u001b[39m:\n\u001b[1;32m 1187\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n",
"File \u001b[0;32m~/.local/share/virtualenvs/cpi-bt44Qfrb/lib/python3.9/site-packages/IPython/core/magics/execution.py:173\u001b[0m, in \u001b[0;36mTimer.timeit\u001b[0;34m(self, number)\u001b[0m\n\u001b[1;32m 171\u001b[0m gc\u001b[38;5;241m.\u001b[39mdisable()\n\u001b[1;32m 172\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 173\u001b[0m timing \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minner\u001b[49m\u001b[43m(\u001b[49m\u001b[43mit\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtimer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 174\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 175\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m gcold:\n",
"File \u001b[0;32m<magic-timeit>:1\u001b[0m, in \u001b[0;36minner\u001b[0;34m(_it, _timer)\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'cpi' is not defined"
]
}
],
"source": [
"%timeit -r 1 cpi.update()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd70c615-81dd-4d66-8498-c3ae2b37b10d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit 0363ba6

Please sign in to comment.