diff --git a/.gitignore b/.gitignore index 3ea49d5..8df0779 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.DS_Store docs/_build *.pyc cpi/data/ diff --git a/MANIFEST.in b/MANIFEST.in index 992bb65..e08203e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,4 @@ include LICENSE include README.md include cpi/cpi.db -recursive-include notebooks * \ No newline at end of file +recursive-include notebooks * diff --git a/cpi/download.py b/cpi/download.py index 447c49f..8963606 100644 --- a/cpi/download.py +++ b/cpi/download.py @@ -1,5 +1,5 @@ """Download the latest annual Consumer Price Index (CPI) dataset.""" -import csv +import io import logging import sqlite3 import typing @@ -45,87 +45,70 @@ class Downloader: "cu.data.20.USCommoditiesServicesSpecial", ] - def get_data_dir(self) -> Path: - """Return the directory Path where data will be stored.""" - data_dir = self.THIS_DIR / "data" - data_dir.mkdir(exist_ok=True, parents=True) - return data_dir - - def rm(self): + def rm(self) -> None: """Remove any existing files.""" db_path = self.THIS_DIR / "cpi.db" if db_path.exists(): logger.debug(f"Deleting {db_path}") db_path.unlink() - data_dir = self.get_data_dir() - for f in data_dir.glob(".csv"): - logger.debug(f"Deleting {f}") - f.unlink() - for f in data_dir.glob(".tsv"): - logger.debug(f"Deleting {f}") - f.unlink() - - def update(self): + + def update(self) -> None: """Update the Consumer Price Index dataset that powers this library.""" # Delete existing files self.rm() # Download the TSVs logger.debug(f"Downloading {len(self.FILE_LIST)} files from the BLS") - [self.get_tsv(file) for file in self.FILE_LIST] + df_list = {name: self.get_df(name) for name in self.FILE_LIST} # Insert the TSVs logger.debug("Loading data into SQLite database") - [self.insert_tsv(file) for file in self.FILE_LIST] - def insert_tsv(self, file: str): - """Load the provided TSV file.""" # Connect to db db_path = self.THIS_DIR / "cpi.db" conn = sqlite3.connect(db_path) - # Read file - logger.debug(f" - {file}") - csv_path = self.get_data_dir() / f"{file}.csv" - csv_reader = list(csv.DictReader(open(csv_path))) - - # Convert it to a DataFrame - df = pd.DataFrame(csv_reader) - df.drop([None], axis=1, inplace=True, errors="ignore") - - # Write file to db - df.to_sql(file, conn, if_exists="replace", index=False) + # Load them one by one + for name, df in df_list.items(): + logger.debug(f"- {name}") + df.to_sql(name, conn, if_exists="replace", index=False) # Close connection conn.close() - def get_tsv(self, file: str): + def get_df(self, file: str) -> pd.DataFrame: """Download TSV file from the BLS.""" # Download it url = f"https://download.bls.gov/pub/time.series/cu/{file}" logger.debug(f" - {url}") - tsv_path = self.get_data_dir() / f"{file}.tsv" headers = { "User-Agent": "b@palewi.re", } response = requests.get(url, headers=headers, timeout=30) + + # Make sure the response is legit try: assert response.ok except AssertionError: logger.error(f"Error downloading {url}") logger.error(f"Response: {response.text}") - raise AssertionError(f"Error downloading {url}") - with open(tsv_path, "w") as fp: - fp.write(response.text) - - # Convert it to csv - with open(tsv_path) as in_file: - reader = csv.reader(in_file, delimiter="\t") - csv_path = self.get_data_dir() / f"{file}.csv" - with open(csv_path, "w") as out_file: - writer = csv.writer(out_file) - for row in reader: - writer.writerow([cell.strip() for cell in row]) + raise AssertionError(f"Error downloading {url} - {response.text}") + + # Read in the contents as an io.StringIO object + df = pd.read_csv(io.StringIO(response.text), sep="\t") + + # .strip() every value in the dataframe + df_obj = df.select_dtypes("object") + df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip()) + + # .strip every column name + df.columns = [c.strip() for c in df.columns] + + # Clean file + df.drop([None], axis=1, inplace=True, errors="ignore") + + # Pass it back + return df if __name__ == "__main__": diff --git a/docs/Makefile b/docs/Makefile index 4ae5af2..8352782 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -20,4 +20,4 @@ help: @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) livehtml: - sphinx-autobuild -b html $(SOURCEDIR) $(BUILDDIR)/html \ No newline at end of file + sphinx-autobuild -b html $(SOURCEDIR) $(BUILDDIR)/html diff --git a/docs/requirements.txt b/docs/requirements.txt index 63ddc06..fa8de61 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,3 @@ sphinx myst-parser -sphinx-palewire-theme \ No newline at end of file +sphinx-palewire-theme diff --git a/notebooks/dataframes.ipynb b/notebooks/dataframes.ipynb index e58ebb0..aec3dbe 100644 --- a/notebooks/dataframes.ipynb +++ b/notebooks/dataframes.ipynb @@ -13981,23 +13981,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.9.16" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebooks/latest.json b/notebooks/latest.json index 7e3a7f6..e6bbd2a 100644 --- a/notebooks/latest.json +++ b/notebooks/latest.json @@ -1 +1 @@ -{"all": {"latest_month": "2024-07-01", "latest_change": 0.2, "previous_month": "2024-06-01", "previous_change": -0.1}, "food": {"latest_month": "2024-07-01", "latest_change": 0.2, "previous_month": "2024-06-01", "previous_change": 0.2}, "energy": {"latest_month": "2024-07-01", "latest_change": 0.0, "previous_month": "2024-06-01", "previous_change": -2.0}, "less_food_and_energy": {"latest_month": "2024-07-01", "latest_change": 0.2, "previous_month": "2024-06-01", "previous_change": 0.1}, "yoy_change": 2.9} \ No newline at end of file +{"all": {"latest_month": "2024-07-01", "latest_change": 0.2, "previous_month": "2024-06-01", "previous_change": -0.1}, "food": {"latest_month": "2024-07-01", "latest_change": 0.2, "previous_month": "2024-06-01", "previous_change": 0.2}, "energy": {"latest_month": "2024-07-01", "latest_change": 0.0, "previous_month": "2024-06-01", "previous_change": -2.0}, "less_food_and_energy": {"latest_month": "2024-07-01", "latest_change": 0.2, "previous_month": "2024-06-01", "previous_change": 0.1}, "yoy_change": 2.9} diff --git a/notebooks/speed-test.ipynb b/notebooks/speed-test.ipynb new file mode 100644 index 0000000..6a1b562 --- /dev/null +++ b/notebooks/speed-test.ipynb @@ -0,0 +1,115 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "id": "a5b27483-80ce-49cd-b363-f38e66b3b84f", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "this_dir = os.path.dirname(os.getcwd())\n", + "sys.path.insert(0, this_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "16a4f9b1-616e-4f27-ab4b-c91a5bc3459d", + "metadata": {}, + "outputs": [ + { + "ename": "OperationalError", + "evalue": "no such table: cu.area", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mOperationalError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[5], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mcpi\u001b[39;00m\n", + "File \u001b[0;32m~/Code/cpi/cpi/__init__.py:21\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# Parse data for use\u001b[39;00m\n\u001b[1;32m 20\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mParsing data files from the BLS\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 21\u001b[0m areas \u001b[38;5;241m=\u001b[39m \u001b[43mparsers\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mParseArea\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 22\u001b[0m items \u001b[38;5;241m=\u001b[39m parsers\u001b[38;5;241m.\u001b[39mParseItem()\u001b[38;5;241m.\u001b[39mparse()\n\u001b[1;32m 23\u001b[0m periods \u001b[38;5;241m=\u001b[39m parsers\u001b[38;5;241m.\u001b[39mParsePeriod()\u001b[38;5;241m.\u001b[39mparse()\n", + "File \u001b[0;32m~/Code/cpi/cpi/parsers.py:60\u001b[0m, in \u001b[0;36mParseArea.parse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 58\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mParsing area file\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 59\u001b[0m object_list \u001b[38;5;241m=\u001b[39m MappingList()\n\u001b[0;32m---> 60\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m row \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_file\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcu.area\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 61\u001b[0m obj \u001b[38;5;241m=\u001b[39m Area(row[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124marea_code\u001b[39m\u001b[38;5;124m\"\u001b[39m], row[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124marea_name\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 62\u001b[0m object_list\u001b[38;5;241m.\u001b[39mappend(obj)\n", + "File \u001b[0;32m~/Code/cpi/cpi/parsers.py:37\u001b[0m, in \u001b[0;36mBaseParser.get_file\u001b[0;34m(self, file)\u001b[0m\n\u001b[1;32m 34\u001b[0m cursor \u001b[38;5;241m=\u001b[39m conn\u001b[38;5;241m.\u001b[39mcursor()\n\u001b[1;32m 36\u001b[0m \u001b[38;5;66;03m# Query this file\u001b[39;00m\n\u001b[0;32m---> 37\u001b[0m query \u001b[38;5;241m=\u001b[39m \u001b[43mcursor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mSELECT * FROM \u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mfile\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 38\u001b[0m columns \u001b[38;5;241m=\u001b[39m [d[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m d \u001b[38;5;129;01min\u001b[39;00m query\u001b[38;5;241m.\u001b[39mdescription]\n\u001b[1;32m 39\u001b[0m result_list \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mdict\u001b[39m(\u001b[38;5;28mzip\u001b[39m(columns, r)) \u001b[38;5;28;01mfor\u001b[39;00m r \u001b[38;5;129;01min\u001b[39;00m query\u001b[38;5;241m.\u001b[39mfetchall()]\n", + "\u001b[0;31mOperationalError\u001b[0m: no such table: cu.area" + ] + } + ], + "source": [ + "import cpi" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "588fdeea-b7b5-4be0-a590-13f2ed8a1e35", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "33.3 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n" + ] + } + ], + "source": [ + "%timeit -r 1 cpi.update()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ff4d1a59-46db-4a76-af45-38b9771831bc", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'cpi' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mget_ipython\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_line_magic\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtimeit\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m-r 1 cpi.update()\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.local/share/virtualenvs/cpi-bt44Qfrb/lib/python3.9/site-packages/IPython/core/interactiveshell.py:2456\u001b[0m, in \u001b[0;36mInteractiveShell.run_line_magic\u001b[0;34m(self, magic_name, line, _stack_depth)\u001b[0m\n\u001b[1;32m 2454\u001b[0m kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlocal_ns\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_local_scope(stack_depth)\n\u001b[1;32m 2455\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuiltin_trap:\n\u001b[0;32m-> 2456\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2458\u001b[0m \u001b[38;5;66;03m# The code below prevents the output from being displayed\u001b[39;00m\n\u001b[1;32m 2459\u001b[0m \u001b[38;5;66;03m# when using magics with decorator @output_can_be_silenced\u001b[39;00m\n\u001b[1;32m 2460\u001b[0m \u001b[38;5;66;03m# when the last Python token in the expression is a ';'.\u001b[39;00m\n\u001b[1;32m 2461\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(fn, magic\u001b[38;5;241m.\u001b[39mMAGIC_OUTPUT_CAN_BE_SILENCED, \u001b[38;5;28;01mFalse\u001b[39;00m):\n", + "File \u001b[0;32m~/.local/share/virtualenvs/cpi-bt44Qfrb/lib/python3.9/site-packages/IPython/core/magics/execution.py:1185\u001b[0m, in \u001b[0;36mExecutionMagics.timeit\u001b[0;34m(self, line, cell, local_ns)\u001b[0m\n\u001b[1;32m 1183\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m index \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m10\u001b[39m):\n\u001b[1;32m 1184\u001b[0m number \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m10\u001b[39m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m index\n\u001b[0;32m-> 1185\u001b[0m time_number \u001b[38;5;241m=\u001b[39m \u001b[43mtimer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtimeit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnumber\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1186\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m time_number \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.2\u001b[39m:\n\u001b[1;32m 1187\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n", + "File \u001b[0;32m~/.local/share/virtualenvs/cpi-bt44Qfrb/lib/python3.9/site-packages/IPython/core/magics/execution.py:173\u001b[0m, in \u001b[0;36mTimer.timeit\u001b[0;34m(self, number)\u001b[0m\n\u001b[1;32m 171\u001b[0m gc\u001b[38;5;241m.\u001b[39mdisable()\n\u001b[1;32m 172\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 173\u001b[0m timing \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minner\u001b[49m\u001b[43m(\u001b[49m\u001b[43mit\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtimer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 174\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 175\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m gcold:\n", + "File \u001b[0;32m:1\u001b[0m, in \u001b[0;36minner\u001b[0;34m(_it, _timer)\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'cpi' is not defined" + ] + } + ], + "source": [ + "%timeit -r 1 cpi.update()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd70c615-81dd-4d66-8498-c3ae2b37b10d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}