diff --git a/Data Pipeline/00 - Download Dataset.ipynb b/Data Pipeline/00 - Download Dataset.ipynb
new file mode 100644
index 0000000..4d670c6
--- /dev/null
+++ b/Data Pipeline/00 - Download Dataset.ipynb
@@ -0,0 +1,139 @@
+{
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  },
+  "orig_nbformat": 4,
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3.8.8 64-bit ('base': conda)"
+  },
+  "interpreter": {
+   "hash": "e245b9d4d52625933425f13c940396e11f2ad0cf135519173d3aca2cac5d4603"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "cells": [
+  {
+   "source": [
+    "# Download Election Dataset\n",
+    " \n",
+    "Download our State Board of Elections dataset from http://nc-campaign-finance-storage.s3-website-us-east-1.amazonaws.com/"
+   ],
+   "cell_type": "markdown",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install requests python-dateutil"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datetime import datetime, timezone\n",
+    "from dateutil import parser\n",
+    "import os\n",
+    "from pathlib import Path\n",
+    "import requests\n",
+    "\n",
+    "DATA_URL = \"http://nc-campaign-finance-storage.s3.amazonaws.com/sboe-raw-files\"\n",
+    "DATA_DIR = \"./data\"\n",
+    "\n",
+    "raw_files = [\n",
+    "    \"raw_files/contributions/contributions_20100101-20101231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20110101-20111231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20120101-20121231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20130101-20131231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20140101-20141231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20150101-20151231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20160101-20161231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20170101-20171231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20180101-20181231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20190101-20191231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20200101_20200630.csv\",\n",
+    "    \"raw_files/contributions/contributions_20200701_20201231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20100101_20101231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20110101_20111231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20120101_20121231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20130101_20131231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20140101_20141231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20150101_20151231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20160101_20161231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20170101_20171231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20180101_20181231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20190101_20191231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20200101_20201231.csv\"\n",
+    "]\n",
+    "\n",
+    "def download_file(url, path):\n",
+    "    with requests.get(url, stream=True) as response:\n",
+    "        response.raise_for_status()\n",
+    "        remote_size = int(response.headers[\"Content-Length\"])\n",
+    "        remote_mtime = parser.parse(response.headers[\"Last-Modified\"])\n",
+    "        if path.exists():\n",
+    "            stats = path.stat()\n",
+    "            local_size = stats.st_size\n",
+    "            local_mtime = datetime.fromtimestamp(stats.st_mtime, timezone.utc)\n",
+    "\n",
+    "            if local_size == remote_size and local_mtime == remote_mtime:\n",
+    "                print(f'{path} skipped (already downloaded)')\n",
+    "                return True\n",
+    "        else:\n",
+    "            path.parent.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "        try:\n",
+    "            with open(path, 'wb') as f:\n",
+    "                for chunk in response.iter_content(chunk_size=8192):\n",
+    "                    f.write(chunk)\n",
+    "            remote_ts = remote_mtime.timestamp()\n",
+    "            os.utime(path, times=(remote_ts, remote_ts))\n",
+    "            print(f'{path} downloaded')\n",
+    "            return True\n",
+    "        except (Exception, KeyboardInterrupt):  # never keep a partial file\n",
+    "            path.unlink()\n",
+    "            print(f'{path} incomplete, deleted')\n",
+    "            return False\n",
+    "\n",
+    "\n",
+    "interrupted = False\n",
+    "\n",
+    "for file in raw_files:\n",
+    "    url = f\"{DATA_URL}/{file}\"\n",
+    "    path = Path(DATA_DIR, file)\n",
+    "    if not download_file(url, path):\n",
+    "        interrupted = True\n",
+    "        break\n",
+    "\n",
+    "if interrupted:\n",
+    "    print('Downloads were interrupted')\n",
+    "else:\n",
+    "    print(\"Downloads complete\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ]
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 760221b..3081884 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@ The Python scripts are Jupyter Notebooks, but should be easily converted to an i
 
 ## The scripts are meant to be run in order
 
+* 00 - Download Dataset - downloads the raw files
 * 01 - Preprocess - imports the raw files, sets up the Postgres tables and preps the data for dedupe
 * 02 - Dedupe - the actual deduplication pass, which goes over the entire universe of donors and payees and determines whether they are the same despite misspellings and missing information
 * 03 - Post Dedupe - creates the views, copies the canonical ids to the transactions, and parses out the various sources of committee information to determine party, candidate, and active years
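For reference, the freshness check at the core of `download_file` can be exercised on its own, e.g. to audit an existing `./data` directory without re-downloading anything. The sketch below is illustrative rather than part of the notebook: `is_up_to_date` is a hypothetical helper name, and it assumes the server returns `Content-Length` and `Last-Modified` headers, just as the notebook code already relies on.

# Standalone sketch of the size/mtime freshness check used by download_file.
# Hypothetical helper, not part of the notebook.
from datetime import datetime, timezone
from pathlib import Path

import requests
from dateutil import parser


def is_up_to_date(url, path):
    """Return True when the local copy matches the remote size and timestamp."""
    path = Path(path)
    if not path.exists():
        return False
    # A HEAD request retrieves the headers without transferring the body.
    response = requests.head(url)
    response.raise_for_status()
    remote_size = int(response.headers["Content-Length"])
    remote_mtime = parser.parse(response.headers["Last-Modified"])
    stats = path.stat()
    local_mtime = datetime.fromtimestamp(stats.st_mtime, timezone.utc)
    return stats.st_size == remote_size and local_mtime == remote_mtime


# Example: re-check one file downloaded by the notebook.
name = "raw_files/contributions/contributions_20100101-20101231.csv"
url = f"http://nc-campaign-finance-storage.s3.amazonaws.com/sboe-raw-files/{name}"
print(is_up_to_date(url, Path("./data", name)))

Because the notebook stamps each downloaded file with the remote modification time via `os.utime`, an exact size-and-mtime match is a reliable signal that a file was fully downloaded and is still current.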