From a81512414aafaeb8fef580d7cb0343efed0a94b9 Mon Sep 17 00:00:00 2001 From: Yash Bhosale Date: Tue, 17 Sep 2024 03:25:49 -0500 Subject: [PATCH 1/8] adding updated scripts to fetch the data and convert it into json --- scripts/elections.json | 440 ++++++++++++++++++++++++++++++++++++ scripts/scrape_elections.py | 56 +++++ 2 files changed, 496 insertions(+) create mode 100644 scripts/elections.json create mode 100644 scripts/scrape_elections.py diff --git a/scripts/elections.json b/scripts/elections.json new file mode 100644 index 0000000..6a15c55 --- /dev/null +++ b/scripts/elections.json @@ -0,0 +1,440 @@ +{ + "242": { + "year": 2023, + "date": "4/3/23", + "label": "2023 Municipal Runoff - 4/3/23", + "races": {} + }, + "241": { + "year": 2023, + "date": "2/28/23", + "label": "2023 Municipal General - 2/28/23", + "races": {} + }, + "156": { + "year": 2022, + "date": "11/8/2022", + "label": "2022 General Election - 11/8/2022", + "races": {} + }, + "252": { + "year": 2022, + "date": "6/28/2022", + "label": "2022 Primary - Democratic - 6/28/2022", + "races": {} + }, + "253": { + "year": 2022, + "date": "6/28/2022", + "label": "2022 Primary - Republican - 6/28/2022", + "races": {} + }, + "254": { + "year": 2022, + "date": "6/28/2022", + "label": "2022 Primary - Libertarian - 6/28/2022", + "races": {} + }, + "255": { + "year": 2022, + "date": "6/28/2022", + "label": "2022 Primary - Non-Partisan - 6/28/2022", + "races": {} + }, + "251": { + "year": 2020, + "date": "11/3/2020", + "label": "2020 General Election - 11/3/2020", + "races": {} + }, + "250": { + "year": 2020, + "date": "3/17/2020", + "label": "2020 Primary - Non-Partisan - 3/17/2020", + "races": {} + }, + "240": { + "year": 2020, + "date": "3/17/2020", + "label": "2020 Primary - Republican - 3/17/2020", + "races": {} + }, + "230": { + "year": 2020, + "date": "3/17/2020", + "label": "2020 Primary - Democratic - 3/17/2020", + "races": {} + }, + "220": { + "year": 2019, + "date": "4/2/2019", + "label": "2019 Municipal Runoffs - 4/2/2019", + "races": {} + }, + "210": { + "year": 2019, + "date": "2/26/2019", + "label": "2019 Municipal General - 2/26/2019", + "races": {} + }, + "200": { + "year": 2018, + "date": "11/6/2018", + "label": "2018 General Election - 11/6/2018", + "races": {} + }, + "2": { + "year": 2018, + "date": "3/20/2018", + "label": "2018 Primary - Non-Partisan - 3/20/2018", + "races": {} + }, + "1": { + "year": 2018, + "date": "3/20/2018", + "label": "2018 Primary - Republican - 3/20/2018", + "races": {} + }, + "0": { + "year": 2018, + "date": "3/20/2018", + "label": "2018 Primary - Democratic - 3/20/2018", + "races": {} + }, + "3": { + "year": 2017, + "date": "2/28/2017", + "label": "2017 Municipal General - 2/28/2017", + "races": {} + }, + "4": { + "year": 2016, + "date": "11/8/2016", + "label": "2016 General Election - 11/8/2016", + "races": {} + }, + "8": { + "year": 2016, + "date": "3/15/2016", + "label": "2016 Primary - Non-Partisan - 3/15/2016", + "races": {} + }, + "7": { + "year": 2016, + "date": "3/15/2016", + "label": "2016 Primary - Green - 3/15/2016", + "races": {} + }, + "6": { + "year": 2016, + "date": "3/15/2016", + "label": "2016 Primary - Republican - 3/15/2016", + "races": {} + }, + "5": { + "year": 2016, + "date": "3/15/2016", + "label": "2016 Primary - Democratic - 3/15/2016", + "races": {} + }, + "9": { + "year": 2015, + "date": "4/7/2015", + "label": "2015 Municipal Runoffs - 4/7/2015", + "races": {} + }, + "10": { + "year": 2015, + "date": "2/24/2015", + "label": "2015 Municipal General - 2/24/2015", 
+ "races": {} + }, + "11": { + "year": 2014, + "date": "11/4/2014", + "label": "2014 General Election - 11/4/2014", + "races": {} + }, + "12": { + "year": 2014, + "date": "3/18/2014", + "label": "2014 Primary - Democratic - 3/18/2014", + "races": {} + }, + "13": { + "year": 2014, + "date": "3/18/2014", + "label": "2014 Primary - Republican - 3/18/2014", + "races": {} + }, + "14": { + "year": 2014, + "date": "3/18/2014", + "label": "2014 Primary - Green - 3/18/2014", + "races": {} + }, + "15": { + "year": 2014, + "date": "3/18/2014", + "label": "2014 Primary - Non-Partisan - 3/18/2014", + "races": {} + }, + "16": { + "year": 2013, + "date": "4/9/2013", + "label": "2013 Special Election - 2nd Congressional - 4/9/2013", + "races": {} + }, + "17": { + "year": 2013, + "date": "2/26/2013", + "label": "2013 Special Primary - 2nd Congressional - Democratic - 2/26/2013", + "races": {} + }, + "18": { + "year": 2013, + "date": "2/26/2013", + "label": "2013 Special Priamry - 2nd Congressional - Republican - 2/26/2013", + "races": {} + }, + "19": { + "year": 2012, + "date": "11/6/2012", + "label": "2012 General Election - 11/6/2012", + "races": {} + }, + "20": { + "year": 2012, + "date": "3/20/2012", + "label": "2012 Primary - Democratic - 3/20/2012", + "races": {} + }, + "21": { + "year": 2012, + "date": "3/20/2012", + "label": "2012 Primary - Republican - 3/20/2012", + "races": {} + }, + "22": { + "year": 2012, + "date": "3/20/2012", + "label": "2012 Primary - Green - 3/20/2012", + "races": {} + }, + "23": { + "year": 2012, + "date": "3/20/2012", + "label": "2012 Primary - Non-Partisan - 3/20/2012", + "races": {} + }, + "24": { + "year": 2011, + "date": "4/5/2011", + "label": "2011 Municipal Runoffs - 4/5/2011", + "races": {} + }, + "25": { + "year": 2011, + "date": "2/22/2011", + "label": "2011 Municipal General - 2/22/2011", + "races": {} + }, + "26": { + "year": 2010, + "date": "11/2/2010", + "label": "2010 General Election - 11/2/2010", + "races": {} + }, + "27": { + "year": 2010, + "date": "2/2/2010", + "label": "2010 Primary - Democratic - 2/2/2010", + "races": {} + }, + "29": { + "year": 2010, + "date": "2/2/2010", + "label": "2010 Primary - Republican - 2/2/2010", + "races": {} + }, + "31": { + "year": 2010, + "date": "2/2/2010", + "label": "2010 Primary - Green - 2/2/2010", + "races": {} + }, + "33": { + "year": 2009, + "date": "4/7/2009", + "label": "2009 Special Election - 5th Congressional - 4/7/2009", + "races": {} + }, + "34": { + "year": 2009, + "date": "3/3/2009", + "label": "2009 Special Primary - 5th Congressional - Democratic - 3/3/2009", + "races": {} + }, + "36": { + "year": 2009, + "date": "3/3/2009", + "label": "2009 Special Primary - 5th Congressional - Republican - 3/3/2009", + "races": {} + }, + "38": { + "year": 2009, + "date": "3/3/2009", + "label": "2009 Special Primary - 5th Congressional - Green - 3/3/2009", + "races": {} + }, + "40": { + "year": 2008, + "date": "11/4/2008", + "label": "2008 General Election - 11/4/2008", + "races": {} + }, + "45": { + "year": 2008, + "date": "2/4/2008", + "label": "2008 Primary - Democratic - 2/4/2008", + "races": {} + }, + "50": { + "year": 2008, + "date": "2/4/2008", + "label": "2008 Primary - Republican - 2/4/2008", + "races": {} + }, + "55": { + "year": 2008, + "date": "2/4/2008", + "label": "2008 Primary - Green - 2/4/2008", + "races": {} + }, + "60": { + "year": 2007, + "date": "4/17/2007", + "label": "2007 Municipal Runoffs - 4/17/2007", + "races": {} + }, + "65": { + "year": 2007, + "date": "2/27/2007", + "label": "2007 
Municipal General - 2/27/2007", + "races": {} + }, + "70": { + "year": 2006, + "date": "11/7/2006", + "label": "2006 General Election - 11/7/2006", + "races": {} + }, + "75": { + "year": 2006, + "date": "3/21/2006", + "label": "2006 Primary - Democratic - 3/21/2006", + "races": {} + }, + "80": { + "year": 2006, + "date": "3/21/2006", + "label": "2006 Primary - Republican - 3/21/2006", + "races": {} + }, + "85": { + "year": 2006, + "date": "3/21/2006", + "label": "2006 Primary - Other - 3/21/2006", + "races": {} + }, + "90": { + "year": 2004, + "date": "11/2/2004", + "label": "2004 General Election - 11/2/2004", + "races": {} + }, + "95": { + "year": 2004, + "date": "3/16/2004", + "label": "2004 Primary - Democratic - 3/16/2004", + "races": {} + }, + "100": { + "year": 2004, + "date": "3/16/2004", + "label": "2004 Primary - Republican - 3/16/2004", + "races": {} + }, + "101": { + "year": 2004, + "date": "3/16/2004", + "label": "2004 Primary - Other - 3/16/2004", + "races": {} + }, + "105": { + "year": 2003, + "date": "4/1/2003", + "label": "2003 Municipal Runoffs - 4/1/2003", + "races": {} + }, + "110": { + "year": 2003, + "date": "2/25/2003", + "label": "2003 Municipal General - 2/25/2003", + "races": {} + }, + "115": { + "year": 2002, + "date": "11/5/2002", + "label": "2002 General Election - 11/5/2002", + "races": {} + }, + "116": { + "year": 2002, + "date": "3/19/2002", + "label": "2002 Primary - Democratic - 3/19/2002", + "races": {} + }, + "117": { + "year": 2002, + "date": "3/19/2002", + "label": "2002 Primary - Republican - 3/19/2002", + "races": {} + }, + "118": { + "year": 2002, + "date": "3/19/2002", + "label": "2002 Primary - Other - 3/19/2002", + "races": {} + }, + "120": { + "year": 2000, + "date": "11/7/2000", + "label": "2000 General Election - 11/7/2000", + "races": {} + }, + "124": { + "year": 2000, + "date": "3/21/2000", + "label": "2000 Primary - Democratic - 3/21/2000", + "races": {} + }, + "125": { + "year": 2000, + "date": "3/21/2000", + "label": "2000 Primary - Republican - 3/21/2000", + "races": {} + } + "19830": { + "year": 1983, + "date": "2/22/1983", + "label": "1983 Primary - Democratic", + "races": {"0": "Mayor"} + }, + "19831": { + "year": 1983, + "date": "4/12/1983", + "label": "1983 General Election", + "races": {"0": "Mayor"} + } +} diff --git a/scripts/scrape_elections.py b/scripts/scrape_elections.py new file mode 100644 index 0000000..9d73a28 --- /dev/null +++ b/scripts/scrape_elections.py @@ -0,0 +1,56 @@ +from io import BytesIO +import xlrd +from pprint import pprint +import pandas as pd +from aiohttp import ClientSession +from json import load, dump +from asyncio import run + + + +def book_pandas(book: BytesIO): + workbook: xlrd.Book = xlrd.open_workbook(file_contents=book, ignore_workbook_corruption=True) + sheet = workbook.sheet_by_index(0) + rows = sheet.get_rows() + total = [] + subtables = {} + for i in range(3): + next(rows) + subtables['Total'] = total + cur_row = next(rows) + while cur_row: + + ward = cur_row[0].value + print(ward) + cur_row = next(rows) + sub_table = [] + try: + while not all([cell.ctype in (xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK) for cell in cur_row]): + sub_table.append([cell.value for cell in cur_row]) + cur_row = next(rows) + except StopIteration: + pass + cols = sub_table[0] + print(sub_table) + print(cols) + for i in range(len(cols)): + if cols[i] == "%": + cols[i] = f"{cols[i-1]} %" + subtables[ward] = pd.DataFrame(sub_table[1:], columns=sub_table[0]).set_index('Precinct').to_dict(orient="index") + cur_row = next(rows, 
None) + dump(subtables, open("subtable.json", 'w')) + + return subtables + +async def main(): + + results_metadata: dict = load(open("../output/results-metadata.json", "r")) + pairs = [(contest, race) for contest, c_info in results_metadata.items() for race in c_info["races"]] + #print(len(pairs)) + #async with ClientSession() as cs: + # async with cs.get("https://chicagoelections.gov/elections/results/156/download?contest=15&ward=&precinct=") as resp: + # book_pandas(await resp.content.read()) + book_pandas(open("/home/yash/Downloads/download.xls", "rb").read()) + +if __name__ == "__main__": + run(main()) From cc7de2a9764922f4b7c877ee7fe1668faf28b69b Mon Sep 17 00:00:00 2001 From: Yash Bhosale Date: Tue, 17 Sep 2024 19:23:43 -0500 Subject: [PATCH 2/8] some minor updates --- scripts/scrape_elections.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/scripts/scrape_elections.py b/scripts/scrape_elections.py index 9d73a28..a4aada9 100644 --- a/scripts/scrape_elections.py +++ b/scripts/scrape_elections.py @@ -8,7 +8,7 @@ -def book_pandas(book: BytesIO): +def book_pandas(book: BytesIO, race: int, contest: int): workbook: xlrd.Book = xlrd.open_workbook(file_contents=book, ignore_workbook_corruption=True) sheet = workbook.sheet_by_index(0) rows = sheet.get_rows() @@ -21,7 +21,6 @@ def book_pandas(book: BytesIO): while cur_row: ward = cur_row[0].value - print(ward) cur_row = next(rows) sub_table = [] try: @@ -31,14 +30,13 @@ def book_pandas(book: BytesIO): except StopIteration: pass cols = sub_table[0] - print(sub_table) - print(cols) + cols = [col if col != '%' else cols[i-1] + " %" for i, col in enumerate(cols)] for i in range(len(cols)): if cols[i] == "%": cols[i] = f"{cols[i-1]} %" subtables[ward] = pd.DataFrame(sub_table[1:], columns=sub_table[0]).set_index('Precinct').to_dict(orient="index") cur_row = next(rows, None) - dump(subtables, open("subtable.json", 'w')) + dump(subtables, open(f"{race}_{contest}_election.json", 'w'), indent = 2) return subtables @@ -46,11 +44,10 @@ async def main(): results_metadata: dict = load(open("../output/results-metadata.json", "r")) pairs = [(contest, race) for contest, c_info in results_metadata.items() for race in c_info["races"]] - #print(len(pairs)) #async with ClientSession() as cs: # async with cs.get("https://chicagoelections.gov/elections/results/156/download?contest=15&ward=&precinct=") as resp: # book_pandas(await resp.content.read()) - book_pandas(open("/home/yash/Downloads/download.xls", "rb").read()) + book_pandas(open("/home/yash/Downloads/download.xls", "rb").read(), 156, 15) if __name__ == "__main__": run(main()) From fc0e3acc3d665960ef8ed6be786ae64f056a9015 Mon Sep 17 00:00:00 2001 From: Yash Bhosale Date: Thu, 19 Sep 2024 19:54:24 -0500 Subject: [PATCH 3/8] cleanup, caching requests, new (unfinished) code to fetch the contest/race ids --- scripts/scrape_elections.py | 48 ++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/scripts/scrape_elections.py b/scripts/scrape_elections.py index a4aada9..c89935f 100644 --- a/scripts/scrape_elections.py +++ b/scripts/scrape_elections.py @@ -1,14 +1,17 @@ from io import BytesIO import xlrd -from pprint import pprint +from pprint import pprint import pandas as pd from aiohttp import ClientSession -from json import load, dump -from asyncio import run +from json import dump, load +from asyncio import gather, run +from aiohttp_client_cache import CachedSession, SQLiteBackend +from requests import get +from itertools import 
dropwhile +from bs4 import BeautifulSoup +DEBUG = True - - -def book_pandas(book: BytesIO, race: int, contest: int): +def book_pandas(book: BytesIO, race: int, contest: int, elec_data: dict): workbook: xlrd.Book = xlrd.open_workbook(file_contents=book, ignore_workbook_corruption=True) sheet = workbook.sheet_by_index(0) rows = sheet.get_rows() @@ -34,20 +37,37 @@ def book_pandas(book: BytesIO, race: int, contest: int): for i in range(len(cols)): if cols[i] == "%": cols[i] = f"{cols[i-1]} %" - subtables[ward] = pd.DataFrame(sub_table[1:], columns=sub_table[0]).set_index('Precinct').to_dict(orient="index") + subtables[ward] = pd.DataFrame(sub_table[1:], columns=cols).set_index('Precinct').to_dict(orient="index") cur_row = next(rows, None) - dump(subtables, open(f"{race}_{contest}_election.json", 'w'), indent = 2) - return subtables + elec_data.setdefault(race, {})[contest] = subtables + +async def fetch_contest_data(race: int, contest: int, cs: ClientSession, elec_data: dict): + print(f"race {race} contest {contest}") + resp = await cs.get(f"https://chicagoelections.gov/elections/results/{race}/download?contest={contest}&ward=&precinct=") + book_pandas(await resp.content.read(), race, contest, elec_data) + +async def fetch_races(): + resp = get("https://chicagoelections.gov/elections/results") + soup = BeautifulSoup(resp, "lxml") + races = [dropwhile(lambda c: not c.isnumeric(), link['href']) for link in soup if link['href'].startswith("/elections/results")] + return races + +async def fetch_contests(): + # Date: Tue, 24 Sep 2024 18:49:52 -0500 Subject: [PATCH 5/8] outputting csvs in the output folder --- scripts/scrape_elections.py | 48 ++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/scripts/scrape_elections.py b/scripts/scrape_elections.py index 81e052b..643c7fc 100644 --- a/scripts/scrape_elections.py +++ b/scripts/scrape_elections.py @@ -12,6 +12,9 @@ import warnings from multiprocessing import Pool from os import getenv +import locale +import csv +locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' ) DEBUG = getenv('DEBUG', 1) print(DEBUG) @@ -21,6 +24,7 @@ def book_pandas(d): contest, race, book = d['contest'], d['race'], d['data'] + print(contest, race) try: workbook: xlrd.Book = xlrd.open_workbook( file_contents=book, ignore_workbook_corruption=True @@ -30,16 +34,18 @@ def book_pandas(d): return sheet = workbook.sheet_by_index(0) rows = sheet.get_rows() - total = [] - subtables = {} + subtables = [] for i in range(3): next(rows) - subtables["Total"] = total cur_row = next(rows) + cols = [] while cur_row: ward = cur_row[0].value + + #TODO: unfortunately there's a bug where, for certain races that simply can't be generated e.g. 
+ cols = next(rows) + cols = [col.value if col.value != "%" else cols[i - 1].value + " %" for i, col in enumerate(cols)] cur_row = next(rows) - sub_table = [] try: while not all( [ @@ -47,33 +53,28 @@ def book_pandas(d): for cell in cur_row ] ): - sub_table.append([cell.value for cell in cur_row]) + print(type(cur_row[0].value)) + row = [int(ward.split(' ')[1]), + int(cur_row[0].value) if type(cur_row[0].value) is not str else cur_row[0].value , + int(cur_row[1].value) if type(cur_row[1].value) is not str else int(cur_row[1].value.replace(',','')), + int(cur_row[2].value) if type(cur_row[2].value) is not str else int(cur_row[2].value.replace(',','')), + float(cur_row[3].value[:-1])] + subtables.append(row) cur_row = next(rows) except StopIteration: pass - cols = sub_table[0] - cols = [col if col != "%" else cols[i - 1] + " %" for i, col in enumerate(cols)] - # Note for the future: moving to SQLite might be more performant than json. - # Certainly the file size would likely be smaller. - for i in range(len(cols)): - if cols[i] == "%": - cols[i] = f"{cols[i-1]} %" - #TODO: unfortunately there's a bug where, for certain races that simply can't be generated e.g. - subtables[ward] = ( - pd.DataFrame(sub_table[1:], columns=cols) - .set_index("Precinct") - .to_dict(orient="index") - ) cur_row = next(rows, None) - return {'contest': contest, - 'race': race, - 'data': subtables} + cols = ['Ward', *cols] + with open(f'../output/{race}_{contest}.csv', 'w') as ofp: + writer = csv.writer(ofp) + writer.writerow(cols) + writer.writerows(subtables) + return async def fetch_contest_data( race: int, contest: int, cs: ClientSession, elec_data: dict, sem: Semaphore ): - # print(f"race {race} contest {contest}") await sem.acquire() try: resp = await cs.get( @@ -81,7 +82,6 @@ async def fetch_contest_data( ) resp.raise_for_status() # This happens for some contests e.g. 
https://chicagoelections.gov/elections/results/7/download?contest=334&ward=&precinct= - # print(resp.content_type) if resp.content_type != "application/vnd.ms-excel": raise RuntimeError(f"race {race} contest {contest} did not return an Excel spreadsheet") return {'contest': contest, @@ -119,7 +119,7 @@ async def main(): for race in c_info["races"] ) if DEBUG == 1: - pairs = list(pairs)[:1000] + pairs = list(pairs)[:1] # pprint(pairs) contest_data = {} sem = Semaphore(10) From 752f3963b57566a8be506a8f528b7e19a108d318 Mon Sep 17 00:00:00 2001 From: Yash Bhosale Date: Wed, 25 Sep 2024 00:12:01 -0500 Subject: [PATCH 6/8] finished enough to merge --- scripts/scrape_elections.py | 89 +++++++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 34 deletions(-) diff --git a/scripts/scrape_elections.py b/scripts/scrape_elections.py index 643c7fc..73bd9af 100644 --- a/scripts/scrape_elections.py +++ b/scripts/scrape_elections.py @@ -1,9 +1,8 @@ from io import BytesIO import xlrd from pprint import pprint -import pandas as pd from aiohttp import ClientSession -from json import dump, load +from json import load from asyncio import Semaphore, gather, run from aiohttp_client_cache import CachedSession, SQLiteBackend from requests import get @@ -14,24 +13,35 @@ from os import getenv import locale import csv -locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' ) +from pathlib import Path -DEBUG = getenv('DEBUG', 1) -print(DEBUG) -SCRAPE_PROCESSES = getenv('SCRAPE_PROCESSES', 6) #my computer has 8 cores +locale.setlocale(locale.LC_ALL, "en_US.UTF-8") + +DEBUG = getenv("DEBUG", 1) +SCRAPE_PROCESSES = getenv("SCRAPE_PROCESSES", 6) # my computer has 8 cores warnings.filterwarnings("error") +def transform_type(v): + if v is None: + return None + if type(v) is float: + return int(v) if v.is_integer() else v + elif "," in v: + return int(v.replace(",", "")) + elif "%" in v: + return float(v[:-1]) + def book_pandas(d): - contest, race, book = d['contest'], d['race'], d['data'] - print(contest, race) + contest, race = d["contest"], d["race"] + book: BytesIO = d["data"] try: workbook: xlrd.Book = xlrd.open_workbook( file_contents=book, ignore_workbook_corruption=True ) except xlrd.XLRDError as e: print(e) - return + return sheet = workbook.sheet_by_index(0) rows = sheet.get_rows() subtables = [] @@ -42,9 +52,12 @@ def book_pandas(d): while cur_row: ward = cur_row[0].value - #TODO: unfortunately there's a bug where, for certain races that simply can't be generated e.g. + # TODO: unfortunately there's a bug where, for certain races that simply can't be generated e.g. 
cols = next(rows) - cols = [col.value if col.value != "%" else cols[i - 1].value + " %" for i, col in enumerate(cols)] + cols = [ + col.value.lower() if col.value != "%" else cols[i - 1].value + " percent" + for i, col in enumerate(cols) + ] cur_row = next(rows) try: while not all( @@ -53,24 +66,33 @@ def book_pandas(d): for cell in cur_row ] ): - print(type(cur_row[0].value)) - row = [int(ward.split(' ')[1]), - int(cur_row[0].value) if type(cur_row[0].value) is not str else cur_row[0].value , - int(cur_row[1].value) if type(cur_row[1].value) is not str else int(cur_row[1].value.replace(',','')), - int(cur_row[2].value) if type(cur_row[2].value) is not str else int(cur_row[2].value.replace(',','')), - float(cur_row[3].value[:-1])] + row = [ + int(ward.split(" ")[1]), + *( + cell.value + if cell.ctype not in (xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK) + else None + for cell in cur_row + ), + ] subtables.append(row) cur_row = next(rows) except StopIteration: pass + except ValueError as e: + print(race, contest) + pprint(cur_row) + print(e) + raise e cur_row = next(rows, None) - cols = ['Ward', *cols] - with open(f'../output/{race}_{contest}.csv', 'w') as ofp: + + cols = ["Ward", *cols] + Path(f"../output/{race}").mkdir(parents=True, exist_ok=True) + with open(f"../output/{race}/{contest}.csv", "w") as ofp: writer = csv.writer(ofp) writer.writerow(cols) writer.writerows(subtables) - return - + async def fetch_contest_data( race: int, contest: int, cs: ClientSession, elec_data: dict, sem: Semaphore @@ -83,10 +105,10 @@ async def fetch_contest_data( resp.raise_for_status() # This happens for some contests e.g. https://chicagoelections.gov/elections/results/7/download?contest=334&ward=&precinct= if resp.content_type != "application/vnd.ms-excel": - raise RuntimeError(f"race {race} contest {contest} did not return an Excel spreadsheet") - return {'contest': contest, - 'race': race, - 'data': await resp.content.read()} + raise RuntimeError( + f"race {race} contest {contest} did not return an Excel spreadsheet" + ) + return {"contest": contest, "race": race, "data": await resp.content.read()} except Exception as e: print(e, race, contest) return None @@ -113,32 +135,31 @@ async def fetch_contests(): async def main(): with open("../output/results-metadata.json", "r") as ifp: results_metadata: dict = load(ifp) + pairs = ( (contest, race) for contest, c_info in results_metadata.items() for race in c_info["races"] ) + if DEBUG == 1: - pairs = list(pairs)[:1] - # pprint(pairs) + pairs = list(pairs)[:1000] + contest_data = {} sem = Semaphore(10) - # maybe we can store this sqlite database for fast downloads? + # maybe we can store this sqlite database for fast downloads? async with CachedSession(cache=SQLiteBackend("test_cache")) as cs: contest_data = await gather( *(fetch_contest_data(*pair, cs, contest_data, sem) for pair in pairs) ) - # TODO: Need a more elegant solution for this. Occasionally there are tables where + # TODO: Need a more elegant solution for this. Occasionally there are tables where # parts are empty - there's multiple candidates listed as 'No Candidate' # e.g. 
https://chicagoelections.gov/elections/results/240/download?contest=390&ward=&precinct= - warnings.resetwarnings() + warnings.resetwarnings() contest_data = list(filter(None, contest_data)) with Pool(6) as p: - contest_data = p.map(book_pandas, contest_data) - with open("data.json", "w") as ofp: - dump(contest_data, ofp, indent=2) - + p.map(book_pandas, contest_data) if __name__ == "__main__": run(main()) From 0cd97455451472a7abbe2768a16c0841c3dd9d4a Mon Sep 17 00:00:00 2001 From: Yash Bhosale Date: Wed, 25 Sep 2024 00:24:33 -0500 Subject: [PATCH 7/8] this isnt on the website anymore --- output/results-metadata.json | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/output/results-metadata.json b/output/results-metadata.json index b87b5dc..8708fc3 100644 --- a/output/results-metadata.json +++ b/output/results-metadata.json @@ -6859,17 +6859,5 @@ "299": "COMMITTEEMAN - 49TH WARD", "300": "COMMITTEEMAN - 50TH WARD" } - }, - "19830": { - "year": 1983, - "date": "2/22/1983", - "label": "1983 Primary - Democratic", - "races": { "0": "Mayor" } - }, - "19831": { - "year": 1983, - "date": "4/12/1983", - "label": "1983 General Election", - "races": { "0": "Mayor" } } } From b6caed18339e366cc6619fc3a7378730299e189f Mon Sep 17 00:00:00 2001 From: Yash Bhosale Date: Tue, 1 Oct 2024 06:54:53 -0500 Subject: [PATCH 8/8] column formatting, adding id --- scripts/scrape_elections.py | 41 +++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/scripts/scrape_elections.py b/scripts/scrape_elections.py index 73bd9af..c8bfe25 100644 --- a/scripts/scrape_elections.py +++ b/scripts/scrape_elections.py @@ -27,10 +27,10 @@ def transform_type(v): return None if type(v) is float: return int(v) if v.is_integer() else v + elif "%" in v: + return float(v[:-1].replace(",", "")) elif "," in v: return int(v.replace(",", "")) - elif "%" in v: - return float(v[:-1]) def book_pandas(d): contest, race = d["contest"], d["race"] @@ -50,12 +50,13 @@ def book_pandas(d): cur_row = next(rows) cols = [] while cur_row: - ward = cur_row[0].value + ward = int(cur_row[0].value.split(" ")[1]) # TODO: unfortunately there's a bug where, for certain races that simply can't be generated e.g. cols = next(rows) + cols = [ - col.value.lower() if col.value != "%" else cols[i - 1].value + " percent" + col.value if col.value != "%" else cols[i - 1].value + " Percent" for i, col in enumerate(cols) ] cur_row = next(rows) @@ -66,16 +67,19 @@ def book_pandas(d): for cell in cur_row ] ): - row = [ - int(ward.split(" ")[1]), - *( - cell.value - if cell.ctype not in (xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK) - else None - for cell in cur_row - ), - ] - subtables.append(row) + if cur_row[0].value != 'Total': + precinct = transform_type(cur_row[0].value) + row = [ + f'{ward:02d}{precinct:02d}', + ward, + *( + transform_type(cell.value) + if cell.ctype not in (xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK) + else None + for cell in cur_row + ), + ] + subtables.append(row) cur_row = next(rows) except StopIteration: pass @@ -86,7 +90,14 @@ def book_pandas(d): raise e cur_row = next(rows, None) - cols = ["Ward", *cols] + conv = { + "Total Voters": "total", + "Precinct": "precinct", + "Registered Voters": "registered", + "Ballots Cast": "ballots", + "Turnout": "turnout" + } + cols = ["ward", *[conv.get(col, col) for col in cols]] Path(f"../output/{race}").mkdir(parents=True, exist_ok=True) with open(f"../output/{race}/{contest}.csv", "w") as ofp: writer = csv.writer(ofp)
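To make the intent of the `transform_type` helper (introduced in patch 6/8 and reordered in patch 8/8) concrete: the "%" branch has to run before the "," branch so values such as "1,234.5%" parse correctly. Below is a minimal standalone sketch of the intended conversions; the function body is copied from the final patch (with comments added), and the sample inputs are illustrative assumptions about what the exported spreadsheets contain, not values taken from a real download.

# transform_type copied from patch 8/8 so this sketch runs on its own;
# the sample values in the asserts are assumed, not from a real export.
def transform_type(v):
    if v is None:
        return None
    if type(v) is float:
        # xlrd reports numeric cells as floats; keep integral values as ints
        return int(v) if v.is_integer() else v
    elif "%" in v:
        # percentage strings -> float; checked before "," so that
        # thousands-separated percentages like "1,234.5%" still parse
        return float(v[:-1].replace(",", ""))
    elif "," in v:
        # thousands-separated counts -> int
        return int(v.replace(",", ""))
    # note: a plain string containing neither "," nor "%" falls through
    # and returns None

assert transform_type(1234.0) == 1234
assert transform_type(12.5) == 12.5
assert transform_type("1,234") == 1234
assert transform_type("45.67%") == 45.67
assert transform_type("1,234.5%") == 1234.5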