Fixes ncopenpass#10

davidpeckham · Jul 24, 2021 · 9bdeef4 · 9bdeef4
1 parent 58d1e6b
commit 9bdeef4
Show file tree

Hide file tree

Showing 2 changed files with 140 additions and 0 deletions.
diff --git a/Data Pipeline/00 - Download Dataset.ipynb b/Data Pipeline/00 - Download Dataset.ipynb
@@ -0,0 +1,139 @@
+{
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  },
+  "orig_nbformat": 4,
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3.8.8 64-bit ('base': conda)"
+  },
+  "interpreter": {
+   "hash": "e245b9d4d52625933425f13c940396e11f2ad0cf135519173d3aca2cac5d4603"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "cells": [
+  {
+   "source": [
+    "# Download Election Dataset\n",
+    " \n",
+    "Download our State Board of Elections dataset from http://nc-campaign-finance-storage.s3-website-us-east-1.amazonaws.com/"
+   ],
+   "cell_type": "markdown",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pip install requests python-dateutil"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datetime import datetime, timezone\n",
+    "from dateutil import parser\n",
+    "from os import utime\n",
+    "from pathlib import Path\n",
+    "import requests\n",
+    "\n",
+    "DATA_URL = \"http://nc-campaign-finance-storage.s3.amazonaws.com/sboe-raw-files\"\n",
+    "DATA_DIR = \"./data\"\n",
+    "\n",
+    "raw_files = [\n",
+    "    \"raw_files/contributions/contributions_20100101-20101231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20110101-20111231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20120101-20121231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20130101-20131231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20140101-20141231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20150101-20151231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20160101-20161231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20170101-20171231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20180101-20181231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20190101-20191231.csv\",\n",
+    "    \"raw_files/contributions/contributions_20200101_20200630.csv\",\n",
+    "    \"raw_files/contributions/contributions_20200701_20201231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20100101_20101231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20110101_20111231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20120101_20121231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20130101_20131231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20140101_20141231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20150101_20151231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20160101_20161231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20170101_20171231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20180101_20181231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20190101_20191231.csv\",\n",
+    "    \"raw_files/expenses/expenses_20200101_20201231.csv\"\n",
+    "]\n",
+    "\n",
+    "def download_file(url, path):\n",
+    "    with requests.get(url, stream=True) as response:\n",
+    "        response.raise_for_status()\n",
+    "        remote_size = int(response.headers[\"Content-Length\"])\n",
+    "        remote_mtime = parser.parse(response.headers[\"Last-Modified\"])\n",
+    "        if path.exists():\n",
+    "            stats = path.stat()\n",
+    "            local_size = stats.st_size\n",
+    "            local_mtime = datetime.fromtimestamp(stats.st_mtime, timezone.utc)\n",
+    "\n",
+    "            if local_size == remote_size and local_mtime == remote_mtime:\n",
+    "                print(f'{path} skipped (already downloaded)')\n",
+    "                return True\n",
+    "        else:\n",
+    "            path.parent.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "        try:\n",
+    "            with open(path, 'wb') as f:\n",
+    "                for chunk in response.iter_content(chunk_size=8192):\n",
+    "                    f.write(chunk)\n",
+    "            remote_ts = remote_mtime.timestamp()\n",
+    "            os.utime(path, times=(remote_ts, remote_ts))\n",
+    "            print(f'{path} downloaded')\n",
+    "            return True\n",
+    "        except:\n",
+    "            path.unlink()\n",
+    "            print(f'{path} incomplete, deleted')\n",
+    "            return False\n",
+    "\n",
+    "\n",
+    "interrupted = False\n",
+    "\n",
+    "for file in raw_files:\n",
+    "    url = f\"{DATA_URL}/{file}\"\n",
+    "    path = Path(DATA_DIR, file)\n",
+    "    if not download_file(url, path):\n",
+    "        interrupted = True\n",
+    "        break\n",
+    "\n",
+    "if interrupted:\n",
+    "    print('Downloads were interrupted')\n",
+    "else:\n",
+    "    print(\"Downloads complete\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ]
+}
diff --git a/README.md b/README.md
@@ -18,6 +18,7 @@ The Python scripts are Jupyter Notebooks, but should be easily converted to an i
 
 ## The scripts are meant to be run in order
 
+    * 00 - Download Dataset - downloads the raw files
     * 01 - Preprocess - imports the raw files, sets up the Postgres tables and preps the data for dedupe
     * 02 - Dedupe - this is a actual part that goes over the entire universe of donors and payees and determines if they are the same despite speeling and missing information
     * 03 - Post Dedupe - this creates the views, copies the canonical ids to the transactions and parses out the various sources of committee information to determine party, candidate and active years