From a81512414aafaeb8fef580d7cb0343efed0a94b9 Mon Sep 17 00:00:00 2001 From: Yash Bhosale Date: Tue, 17 Sep 2024 03:25:49 -0500 Subject: [PATCH 1/8] adding updated scripts to fetch the data and convert it into json --- scripts/elections.json | 440 ++++++++++++++++++++++++++++++++++++ scripts/scrape_elections.py | 56 +++++ 2 files changed, 496 insertions(+) create mode 100644 scripts/elections.json create mode 100644 scripts/scrape_elections.py diff --git a/scripts/elections.json b/scripts/elections.json new file mode 100644 index 0000000..6a15c55 --- /dev/null +++ b/scripts/elections.json @@ -0,0 +1,440 @@ +{ + "242": { + "year": 2023, + "date": "4/3/23", + "label": "2023 Municipal Runoff - 4/3/23", + "races": {} + }, + "241": { + "year": 2023, + "date": "2/28/23", + "label": "2023 Municipal General - 2/28/23", + "races": {} + }, + "156": { + "year": 2022, + "date": "11/8/2022", + "label": "2022 General Election - 11/8/2022", + "races": {} + }, + "252": { + "year": 2022, + "date": "6/28/2022", + "label": "2022 Primary - Democratic - 6/28/2022", + "races": {} + }, + "253": { + "year": 2022, + "date": "6/28/2022", + "label": "2022 Primary - Republican - 6/28/2022", + "races": {} + }, + "254": { + "year": 2022, + "date": "6/28/2022", + "label": "2022 Primary - Libertarian - 6/28/2022", + "races": {} + }, + "255": { + "year": 2022, + "date": "6/28/2022", + "label": "2022 Primary - Non-Partisan - 6/28/2022", + "races": {} + }, + "251": { + "year": 2020, + "date": "11/3/2020", + "label": "2020 General Election - 11/3/2020", + "races": {} + }, + "250": { + "year": 2020, + "date": "3/17/2020", + "label": "2020 Primary - Non-Partisan - 3/17/2020", + "races": {} + }, + "240": { + "year": 2020, + "date": "3/17/2020", + "label": "2020 Primary - Republican - 3/17/2020", + "races": {} + }, + "230": { + "year": 2020, + "date": "3/17/2020", + "label": "2020 Primary - Democratic - 3/17/2020", + "races": {} + }, + "220": { + "year": 2019, + "date": "4/2/2019", + "label": "2019 Municipal Runoffs - 4/2/2019", + "races": {} + }, + "210": { + "year": 2019, + "date": "2/26/2019", + "label": "2019 Municipal General - 2/26/2019", + "races": {} + }, + "200": { + "year": 2018, + "date": "11/6/2018", + "label": "2018 General Election - 11/6/2018", + "races": {} + }, + "2": { + "year": 2018, + "date": "3/20/2018", + "label": "2018 Primary - Non-Partisan - 3/20/2018", + "races": {} + }, + "1": { + "year": 2018, + "date": "3/20/2018", + "label": "2018 Primary - Republican - 3/20/2018", + "races": {} + }, + "0": { + "year": 2018, + "date": "3/20/2018", + "label": "2018 Primary - Democratic - 3/20/2018", + "races": {} + }, + "3": { + "year": 2017, + "date": "2/28/2017", + "label": "2017 Municipal General - 2/28/2017", + "races": {} + }, + "4": { + "year": 2016, + "date": "11/8/2016", + "label": "2016 General Election - 11/8/2016", + "races": {} + }, + "8": { + "year": 2016, + "date": "3/15/2016", + "label": "2016 Primary - Non-Partisan - 3/15/2016", + "races": {} + }, + "7": { + "year": 2016, + "date": "3/15/2016", + "label": "2016 Primary - Green - 3/15/2016", + "races": {} + }, + "6": { + "year": 2016, + "date": "3/15/2016", + "label": "2016 Primary - Republican - 3/15/2016", + "races": {} + }, + "5": { + "year": 2016, + "date": "3/15/2016", + "label": "2016 Primary - Democratic - 3/15/2016", + "races": {} + }, + "9": { + "year": 2015, + "date": "4/7/2015", + "label": "2015 Municipal Runoffs - 4/7/2015", + "races": {} + }, + "10": { + "year": 2015, + "date": "2/24/2015", + "label": "2015 Municipal General - 2/24/2015", 
+ "races": {} + }, + "11": { + "year": 2014, + "date": "11/4/2014", + "label": "2014 General Election - 11/4/2014", + "races": {} + }, + "12": { + "year": 2014, + "date": "3/18/2014", + "label": "2014 Primary - Democratic - 3/18/2014", + "races": {} + }, + "13": { + "year": 2014, + "date": "3/18/2014", + "label": "2014 Primary - Republican - 3/18/2014", + "races": {} + }, + "14": { + "year": 2014, + "date": "3/18/2014", + "label": "2014 Primary - Green - 3/18/2014", + "races": {} + }, + "15": { + "year": 2014, + "date": "3/18/2014", + "label": "2014 Primary - Non-Partisan - 3/18/2014", + "races": {} + }, + "16": { + "year": 2013, + "date": "4/9/2013", + "label": "2013 Special Election - 2nd Congressional - 4/9/2013", + "races": {} + }, + "17": { + "year": 2013, + "date": "2/26/2013", + "label": "2013 Special Primary - 2nd Congressional - Democratic - 2/26/2013", + "races": {} + }, + "18": { + "year": 2013, + "date": "2/26/2013", + "label": "2013 Special Priamry - 2nd Congressional - Republican - 2/26/2013", + "races": {} + }, + "19": { + "year": 2012, + "date": "11/6/2012", + "label": "2012 General Election - 11/6/2012", + "races": {} + }, + "20": { + "year": 2012, + "date": "3/20/2012", + "label": "2012 Primary - Democratic - 3/20/2012", + "races": {} + }, + "21": { + "year": 2012, + "date": "3/20/2012", + "label": "2012 Primary - Republican - 3/20/2012", + "races": {} + }, + "22": { + "year": 2012, + "date": "3/20/2012", + "label": "2012 Primary - Green - 3/20/2012", + "races": {} + }, + "23": { + "year": 2012, + "date": "3/20/2012", + "label": "2012 Primary - Non-Partisan - 3/20/2012", + "races": {} + }, + "24": { + "year": 2011, + "date": "4/5/2011", + "label": "2011 Municipal Runoffs - 4/5/2011", + "races": {} + }, + "25": { + "year": 2011, + "date": "2/22/2011", + "label": "2011 Municipal General - 2/22/2011", + "races": {} + }, + "26": { + "year": 2010, + "date": "11/2/2010", + "label": "2010 General Election - 11/2/2010", + "races": {} + }, + "27": { + "year": 2010, + "date": "2/2/2010", + "label": "2010 Primary - Democratic - 2/2/2010", + "races": {} + }, + "29": { + "year": 2010, + "date": "2/2/2010", + "label": "2010 Primary - Republican - 2/2/2010", + "races": {} + }, + "31": { + "year": 2010, + "date": "2/2/2010", + "label": "2010 Primary - Green - 2/2/2010", + "races": {} + }, + "33": { + "year": 2009, + "date": "4/7/2009", + "label": "2009 Special Election - 5th Congressional - 4/7/2009", + "races": {} + }, + "34": { + "year": 2009, + "date": "3/3/2009", + "label": "2009 Special Primary - 5th Congressional - Democratic - 3/3/2009", + "races": {} + }, + "36": { + "year": 2009, + "date": "3/3/2009", + "label": "2009 Special Primary - 5th Congressional - Republican - 3/3/2009", + "races": {} + }, + "38": { + "year": 2009, + "date": "3/3/2009", + "label": "2009 Special Primary - 5th Congressional - Green - 3/3/2009", + "races": {} + }, + "40": { + "year": 2008, + "date": "11/4/2008", + "label": "2008 General Election - 11/4/2008", + "races": {} + }, + "45": { + "year": 2008, + "date": "2/4/2008", + "label": "2008 Primary - Democratic - 2/4/2008", + "races": {} + }, + "50": { + "year": 2008, + "date": "2/4/2008", + "label": "2008 Primary - Republican - 2/4/2008", + "races": {} + }, + "55": { + "year": 2008, + "date": "2/4/2008", + "label": "2008 Primary - Green - 2/4/2008", + "races": {} + }, + "60": { + "year": 2007, + "date": "4/17/2007", + "label": "2007 Municipal Runoffs - 4/17/2007", + "races": {} + }, + "65": { + "year": 2007, + "date": "2/27/2007", + "label": "2007 
Municipal General - 2/27/2007", + "races": {} + }, + "70": { + "year": 2006, + "date": "11/7/2006", + "label": "2006 General Election - 11/7/2006", + "races": {} + }, + "75": { + "year": 2006, + "date": "3/21/2006", + "label": "2006 Primary - Democratic - 3/21/2006", + "races": {} + }, + "80": { + "year": 2006, + "date": "3/21/2006", + "label": "2006 Primary - Republican - 3/21/2006", + "races": {} + }, + "85": { + "year": 2006, + "date": "3/21/2006", + "label": "2006 Primary - Other - 3/21/2006", + "races": {} + }, + "90": { + "year": 2004, + "date": "11/2/2004", + "label": "2004 General Election - 11/2/2004", + "races": {} + }, + "95": { + "year": 2004, + "date": "3/16/2004", + "label": "2004 Primary - Democratic - 3/16/2004", + "races": {} + }, + "100": { + "year": 2004, + "date": "3/16/2004", + "label": "2004 Primary - Republican - 3/16/2004", + "races": {} + }, + "101": { + "year": 2004, + "date": "3/16/2004", + "label": "2004 Primary - Other - 3/16/2004", + "races": {} + }, + "105": { + "year": 2003, + "date": "4/1/2003", + "label": "2003 Municipal Runoffs - 4/1/2003", + "races": {} + }, + "110": { + "year": 2003, + "date": "2/25/2003", + "label": "2003 Municipal General - 2/25/2003", + "races": {} + }, + "115": { + "year": 2002, + "date": "11/5/2002", + "label": "2002 General Election - 11/5/2002", + "races": {} + }, + "116": { + "year": 2002, + "date": "3/19/2002", + "label": "2002 Primary - Democratic - 3/19/2002", + "races": {} + }, + "117": { + "year": 2002, + "date": "3/19/2002", + "label": "2002 Primary - Republican - 3/19/2002", + "races": {} + }, + "118": { + "year": 2002, + "date": "3/19/2002", + "label": "2002 Primary - Other - 3/19/2002", + "races": {} + }, + "120": { + "year": 2000, + "date": "11/7/2000", + "label": "2000 General Election - 11/7/2000", + "races": {} + }, + "124": { + "year": 2000, + "date": "3/21/2000", + "label": "2000 Primary - Democratic - 3/21/2000", + "races": {} + }, + "125": { + "year": 2000, + "date": "3/21/2000", + "label": "2000 Primary - Republican - 3/21/2000", + "races": {} + } + "19830": { + "year": 1983, + "date": "2/22/1983", + "label": "1983 Primary - Democratic", + "races": {"0": "Mayor"} + }, + "19831": { + "year": 1983, + "date": "4/12/1983", + "label": "1983 General Election", + "races": {"0": "Mayor"} + } +} diff --git a/scripts/scrape_elections.py b/scripts/scrape_elections.py new file mode 100644 index 0000000..9d73a28 --- /dev/null +++ b/scripts/scrape_elections.py @@ -0,0 +1,56 @@ +from io import BytesIO +import xlrd +from pprint import pprint +import pandas as pd +from aiohttp import ClientSession +from json import load, dump +from asyncio import run + + + +def book_pandas(book: BytesIO): + workbook: xlrd.Book = xlrd.open_workbook(file_contents=book, ignore_workbook_corruption=True) + sheet = workbook.sheet_by_index(0) + rows = sheet.get_rows() + total = [] + subtables = {} + for i in range(3): + next(rows) + subtables['Total'] = total + cur_row = next(rows) + while cur_row: + + ward = cur_row[0].value + print(ward) + cur_row = next(rows) + sub_table = [] + try: + while not all([cell.ctype in (xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK) for cell in cur_row]): + sub_table.append([cell.value for cell in cur_row]) + cur_row = next(rows) + except StopIteration: + pass + cols = sub_table[0] + print(sub_table) + print(cols) + for i in range(len(cols)): + if cols[i] == "%": + cols[i] = f"{cols[i-1]} %" + subtables[ward] = pd.DataFrame(sub_table[1:], columns=sub_table[0]).set_index('Precinct').to_dict(orient="index") + cur_row = next(rows, 
None) + dump(subtables, open("subtable.json", 'w')) + + return subtables + +async def main(): + + results_metadata: dict = load(open("../output/results-metadata.json", "r")) + pairs = [(contest, race) for contest, c_info in results_metadata.items() for race in c_info["races"]] + #print(len(pairs)) + #async with ClientSession() as cs: + # async with cs.get("https://chicagoelections.gov/elections/results/156/download?contest=15&ward=&precinct=") as resp: + # book_pandas(await resp.content.read()) + book_pandas(open("/home/yash/Downloads/download.xls", "rb").read()) + +if __name__ == "__main__": + run(main()) From cc7de2a9764922f4b7c877ee7fe1668faf28b69b Mon Sep 17 00:00:00 2001 From: Yash Bhosale Date: Tue, 17 Sep 2024 19:23:43 -0500 Subject: [PATCH 2/8] some minor updates --- scripts/scrape_elections.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/scripts/scrape_elections.py b/scripts/scrape_elections.py index 9d73a28..a4aada9 100644 --- a/scripts/scrape_elections.py +++ b/scripts/scrape_elections.py @@ -8,7 +8,7 @@ -def book_pandas(book: BytesIO): +def book_pandas(book: BytesIO, race: int, contest: int): workbook: xlrd.Book = xlrd.open_workbook(file_contents=book, ignore_workbook_corruption=True) sheet = workbook.sheet_by_index(0) rows = sheet.get_rows() @@ -21,7 +21,6 @@ def book_pandas(book: BytesIO): while cur_row: ward = cur_row[0].value - print(ward) cur_row = next(rows) sub_table = [] try: @@ -31,14 +30,13 @@ def book_pandas(book: BytesIO): except StopIteration: pass cols = sub_table[0] - print(sub_table) - print(cols) + cols = [col if col != '%' else cols[i-1] + " %" for i, col in enumerate(cols)] for i in range(len(cols)): if cols[i] == "%": cols[i] = f"{cols[i-1]} %" subtables[ward] = pd.DataFrame(sub_table[1:], columns=sub_table[0]).set_index('Precinct').to_dict(orient="index") cur_row = next(rows, None) - dump(subtables, open("subtable.json", 'w')) + dump(subtables, open(f"{race}_{contest}_election.json", 'w'), indent = 2) return subtables @@ -46,11 +44,10 @@ async def main(): results_metadata: dict = load(open("../output/results-metadata.json", "r")) pairs = [(contest, race) for contest, c_info in results_metadata.items() for race in c_info["races"]] - #print(len(pairs)) #async with ClientSession() as cs: # async with cs.get("https://chicagoelections.gov/elections/results/156/download?contest=15&ward=&precinct=") as resp: # book_pandas(await resp.content.read()) - book_pandas(open("/home/yash/Downloads/download.xls", "rb").read()) + book_pandas(open("/home/yash/Downloads/download.xls", "rb").read(), 156, 15) if __name__ == "__main__": run(main()) From fc0e3acc3d665960ef8ed6be786ae64f056a9015 Mon Sep 17 00:00:00 2001 From: Yash Bhosale Date: Thu, 19 Sep 2024 19:54:24 -0500 Subject: [PATCH 3/8] cleanup, caching requests, new (unfinished) code to fetch the contest/race ids --- scripts/scrape_elections.py | 48 ++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/scripts/scrape_elections.py b/scripts/scrape_elections.py index a4aada9..c89935f 100644 --- a/scripts/scrape_elections.py +++ b/scripts/scrape_elections.py @@ -1,14 +1,17 @@ from io import BytesIO import xlrd -from pprint import pprint +from pprint import pprint import pandas as pd from aiohttp import ClientSession -from json import load, dump -from asyncio import run +from json import dump, load +from asyncio import gather, run +from aiohttp_client_cache import CachedSession, SQLiteBackend +from requests import get +from itertools import 
dropwhile +from bs4 import BeautifulSoup +DEBUG = True - - -def book_pandas(book: BytesIO, race: int, contest: int): +def book_pandas(book: BytesIO, race: int, contest: int, elec_data: dict): workbook: xlrd.Book = xlrd.open_workbook(file_contents=book, ignore_workbook_corruption=True) sheet = workbook.sheet_by_index(0) rows = sheet.get_rows() @@ -34,20 +37,37 @@ def book_pandas(book: BytesIO, race: int, contest: int): for i in range(len(cols)): if cols[i] == "%": cols[i] = f"{cols[i-1]} %" - subtables[ward] = pd.DataFrame(sub_table[1:], columns=sub_table[0]).set_index('Precinct').to_dict(orient="index") + subtables[ward] = pd.DataFrame(sub_table[1:], columns=cols).set_index('Precinct').to_dict(orient="index") cur_row = next(rows, None) - dump(subtables, open(f"{race}_{contest}_election.json", 'w'), indent = 2) - return subtables + elec_data.setdefault(race, {})[contest] = subtables + +async def fetch_contest_data(race: int, contest: int, cs: ClientSession, elec_data: dict): + print(f"race {race} contest {contest}") + resp = await cs.get(f"https://chicagoelections.gov/elections/results/{race}/download?contest={contest}&ward=&precinct=") + book_pandas(await resp.content.read(), race, contest, elec_data) + +async def fetch_races(): + resp = get("https://chicagoelections.gov/elections/results") + soup = BeautifulSoup(resp, "lxml") + races = [dropwhile(lambda c: not c.isnumeric(), link['href']) for link in soup if link['href'].startswith("/elections/results")] + return races + +async def fetch_contests(): + # Date: Tue, 24 Sep 2024 18:49:52 -0500 Subject: [PATCH 5/8] outputting csvs in the output folder --- scripts/scrape_elections.py | 48 ++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/scripts/scrape_elections.py b/scripts/scrape_elections.py index 81e052b..643c7fc 100644 --- a/scripts/scrape_elections.py +++ b/scripts/scrape_elections.py @@ -12,6 +12,9 @@ import warnings from multiprocessing import Pool from os import getenv +import locale +import csv +locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' ) DEBUG = getenv('DEBUG', 1) print(DEBUG) @@ -21,6 +24,7 @@ def book_pandas(d): contest, race, book = d['contest'], d['race'], d['data'] + print(contest, race) try: workbook: xlrd.Book = xlrd.open_workbook( file_contents=book, ignore_workbook_corruption=True @@ -30,16 +34,18 @@ def book_pandas(d): return sheet = workbook.sheet_by_index(0) rows = sheet.get_rows() - total = [] - subtables = {} + subtables = [] for i in range(3): next(rows) - subtables["Total"] = total cur_row = next(rows) + cols = [] while cur_row: ward = cur_row[0].value + + #TODO: unfortunately there's a bug where, for certain races that simply can't be generated e.g. 
+ cols = next(rows) + cols = [col.value if col.value != "%" else cols[i - 1].value + " %" for i, col in enumerate(cols)] cur_row = next(rows) - sub_table = [] try: while not all( [ @@ -47,33 +53,28 @@ def book_pandas(d): for cell in cur_row ] ): - sub_table.append([cell.value for cell in cur_row]) + print(type(cur_row[0].value)) + row = [int(ward.split(' ')[1]), + int(cur_row[0].value) if type(cur_row[0].value) is not str else cur_row[0].value , + int(cur_row[1].value) if type(cur_row[1].value) is not str else int(cur_row[1].value.replace(',','')), + int(cur_row[2].value) if type(cur_row[2].value) is not str else int(cur_row[2].value.replace(',','')), + float(cur_row[3].value[:-1])] + subtables.append(row) cur_row = next(rows) except StopIteration: pass - cols = sub_table[0] - cols = [col if col != "%" else cols[i - 1] + " %" for i, col in enumerate(cols)] - # Note for the future: moving to SQLite might be more performant than json. - # Certainly the file size would likely be smaller. - for i in range(len(cols)): - if cols[i] == "%": - cols[i] = f"{cols[i-1]} %" - #TODO: unfortunately there's a bug where, for certain races that simply can't be generated e.g. - subtables[ward] = ( - pd.DataFrame(sub_table[1:], columns=cols) - .set_index("Precinct") - .to_dict(orient="index") - ) cur_row = next(rows, None) - return {'contest': contest, - 'race': race, - 'data': subtables} + cols = ['Ward', *cols] + with open(f'../output/{race}_{contest}.csv', 'w') as ofp: + writer = csv.writer(ofp) + writer.writerow(cols) + writer.writerows(subtables) + return async def fetch_contest_data( race: int, contest: int, cs: ClientSession, elec_data: dict, sem: Semaphore ): - # print(f"race {race} contest {contest}") await sem.acquire() try: resp = await cs.get( @@ -81,7 +82,6 @@ async def fetch_contest_data( ) resp.raise_for_status() # This happens for some contests e.g. 
https://chicagoelections.gov/elections/results/7/download?contest=334&ward=&precinct= - # print(resp.content_type) if resp.content_type != "application/vnd.ms-excel": raise RuntimeError(f"race {race} contest {contest} did not return an Excel spreadsheet") return {'contest': contest, @@ -119,7 +119,7 @@ async def main(): for race in c_info["races"] ) if DEBUG == 1: - pairs = list(pairs)[:1000] + pairs = list(pairs)[:1] # pprint(pairs) contest_data = {} sem = Semaphore(10) From 752f3963b57566a8be506a8f528b7e19a108d318 Mon Sep 17 00:00:00 2001 From: Yash Bhosale Date: Wed, 25 Sep 2024 00:12:01 -0500 Subject: [PATCH 6/8] finished enough to merge --- scripts/scrape_elections.py | 89 +++++++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 34 deletions(-) diff --git a/scripts/scrape_elections.py b/scripts/scrape_elections.py index 643c7fc..73bd9af 100644 --- a/scripts/scrape_elections.py +++ b/scripts/scrape_elections.py @@ -1,9 +1,8 @@ from io import BytesIO import xlrd from pprint import pprint -import pandas as pd from aiohttp import ClientSession -from json import dump, load +from json import load from asyncio import Semaphore, gather, run from aiohttp_client_cache import CachedSession, SQLiteBackend from requests import get @@ -14,24 +13,35 @@ from os import getenv import locale import csv -locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' ) +from pathlib import Path -DEBUG = getenv('DEBUG', 1) -print(DEBUG) -SCRAPE_PROCESSES = getenv('SCRAPE_PROCESSES', 6) #my computer has 8 cores +locale.setlocale(locale.LC_ALL, "en_US.UTF-8") + +DEBUG = getenv("DEBUG", 1) +SCRAPE_PROCESSES = getenv("SCRAPE_PROCESSES", 6) # my computer has 8 cores warnings.filterwarnings("error") +def transform_type(v): + if v is None: + return None + if type(v) is float: + return int(v) if v.is_integer() else v + elif "," in v: + return int(v.replace(",", "")) + elif "%" in v: + return float(v[:-1]) + def book_pandas(d): - contest, race, book = d['contest'], d['race'], d['data'] - print(contest, race) + contest, race = d["contest"], d["race"] + book: BytesIO = d["data"] try: workbook: xlrd.Book = xlrd.open_workbook( file_contents=book, ignore_workbook_corruption=True ) except xlrd.XLRDError as e: print(e) - return + return sheet = workbook.sheet_by_index(0) rows = sheet.get_rows() subtables = [] @@ -42,9 +52,12 @@ def book_pandas(d): while cur_row: ward = cur_row[0].value - #TODO: unfortunately there's a bug where, for certain races that simply can't be generated e.g. + # TODO: unfortunately there's a bug where, for certain races that simply can't be generated e.g. 
cols = next(rows) - cols = [col.value if col.value != "%" else cols[i - 1].value + " %" for i, col in enumerate(cols)] + cols = [ + col.value.lower() if col.value != "%" else cols[i - 1].value + " percent" + for i, col in enumerate(cols) + ] cur_row = next(rows) try: while not all( @@ -53,24 +66,33 @@ def book_pandas(d): for cell in cur_row ] ): - print(type(cur_row[0].value)) - row = [int(ward.split(' ')[1]), - int(cur_row[0].value) if type(cur_row[0].value) is not str else cur_row[0].value , - int(cur_row[1].value) if type(cur_row[1].value) is not str else int(cur_row[1].value.replace(',','')), - int(cur_row[2].value) if type(cur_row[2].value) is not str else int(cur_row[2].value.replace(',','')), - float(cur_row[3].value[:-1])] + row = [ + int(ward.split(" ")[1]), + *( + cell.value + if cell.ctype not in (xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK) + else None + for cell in cur_row + ), + ] subtables.append(row) cur_row = next(rows) except StopIteration: pass + except ValueError as e: + print(race, contest) + pprint(cur_row) + print(e) + raise e cur_row = next(rows, None) - cols = ['Ward', *cols] - with open(f'../output/{race}_{contest}.csv', 'w') as ofp: + + cols = ["Ward", *cols] + Path(f"../output/{race}").mkdir(parents=True, exist_ok=True) + with open(f"../output/{race}/{contest}.csv", "w") as ofp: writer = csv.writer(ofp) writer.writerow(cols) writer.writerows(subtables) - return - + async def fetch_contest_data( race: int, contest: int, cs: ClientSession, elec_data: dict, sem: Semaphore @@ -83,10 +105,10 @@ async def fetch_contest_data( resp.raise_for_status() # This happens for some contests e.g. https://chicagoelections.gov/elections/results/7/download?contest=334&ward=&precinct= if resp.content_type != "application/vnd.ms-excel": - raise RuntimeError(f"race {race} contest {contest} did not return an Excel spreadsheet") - return {'contest': contest, - 'race': race, - 'data': await resp.content.read()} + raise RuntimeError( + f"race {race} contest {contest} did not return an Excel spreadsheet" + ) + return {"contest": contest, "race": race, "data": await resp.content.read()} except Exception as e: print(e, race, contest) return None @@ -113,32 +135,31 @@ async def fetch_contests(): async def main(): with open("../output/results-metadata.json", "r") as ifp: results_metadata: dict = load(ifp) + pairs = ( (contest, race) for contest, c_info in results_metadata.items() for race in c_info["races"] ) + if DEBUG == 1: - pairs = list(pairs)[:1] - # pprint(pairs) + pairs = list(pairs)[:1000] + contest_data = {} sem = Semaphore(10) - # maybe we can store this sqlite database for fast downloads? + # maybe we can store this sqlite database for fast downloads? async with CachedSession(cache=SQLiteBackend("test_cache")) as cs: contest_data = await gather( *(fetch_contest_data(*pair, cs, contest_data, sem) for pair in pairs) ) - # TODO: Need a more elegant solution for this. Occasionally there are tables where + # TODO: Need a more elegant solution for this. Occasionally there are tables where # parts are empty - there's multiple candidates listed as 'No Candidate' # e.g. 
https://chicagoelections.gov/elections/results/240/download?contest=390&ward=&precinct= - warnings.resetwarnings() + warnings.resetwarnings() contest_data = list(filter(None, contest_data)) with Pool(6) as p: - contest_data = p.map(book_pandas, contest_data) - with open("data.json", "w") as ofp: - dump(contest_data, ofp, indent=2) - + p.map(book_pandas, contest_data) if __name__ == "__main__": run(main()) From 0cd97455451472a7abbe2768a16c0841c3dd9d4a Mon Sep 17 00:00:00 2001 From: Yash Bhosale Date: Wed, 25 Sep 2024 00:24:33 -0500 Subject: [PATCH 7/8] this isnt on the website anymore --- output/results-metadata.json | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/output/results-metadata.json b/output/results-metadata.json index b87b5dc..8708fc3 100644 --- a/output/results-metadata.json +++ b/output/results-metadata.json @@ -6859,17 +6859,5 @@ "299": "COMMITTEEMAN - 49TH WARD", "300": "COMMITTEEMAN - 50TH WARD" } - }, - "19830": { - "year": 1983, - "date": "2/22/1983", - "label": "1983 Primary - Democratic", - "races": { "0": "Mayor" } - }, - "19831": { - "year": 1983, - "date": "4/12/1983", - "label": "1983 General Election", - "races": { "0": "Mayor" } } } From b6caed18339e366cc6619fc3a7378730299e189f Mon Sep 17 00:00:00 2001 From: Yash Bhosale Date: Tue, 1 Oct 2024 06:54:53 -0500 Subject: [PATCH 8/8] column formatting, adding id --- scripts/scrape_elections.py | 41 +++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/scripts/scrape_elections.py b/scripts/scrape_elections.py index 73bd9af..c8bfe25 100644 --- a/scripts/scrape_elections.py +++ b/scripts/scrape_elections.py @@ -27,10 +27,10 @@ def transform_type(v): return None if type(v) is float: return int(v) if v.is_integer() else v + elif "%" in v: + return float(v[:-1].replace(",", "")) elif "," in v: return int(v.replace(",", "")) - elif "%" in v: - return float(v[:-1]) def book_pandas(d): contest, race = d["contest"], d["race"] @@ -50,12 +50,13 @@ def book_pandas(d): cur_row = next(rows) cols = [] while cur_row: - ward = cur_row[0].value + ward = int(cur_row[0].value.split(" ")[1]) # TODO: unfortunately there's a bug where, for certain races that simply can't be generated e.g. cols = next(rows) + cols = [ - col.value.lower() if col.value != "%" else cols[i - 1].value + " percent" + col.value if col.value != "%" else cols[i - 1].value + " Percent" for i, col in enumerate(cols) ] cur_row = next(rows) @@ -66,16 +67,19 @@ def book_pandas(d): for cell in cur_row ] ): - row = [ - int(ward.split(" ")[1]), - *( - cell.value - if cell.ctype not in (xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK) - else None - for cell in cur_row - ), - ] - subtables.append(row) + if cur_row[0].value != 'Total': + precinct = transform_type(cur_row[0].value) + row = [ + f'{ward:02d}{precinct:02d}', + ward, + *( + transform_type(cell.value) + if cell.ctype not in (xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK) + else None + for cell in cur_row + ), + ] + subtables.append(row) cur_row = next(rows) except StopIteration: pass @@ -86,7 +90,14 @@ def book_pandas(d): raise e cur_row = next(rows, None) - cols = ["Ward", *cols] + conv = { + "Total Voters": "total", + "Precinct": "precinct", + "Registered Voters": "registered", + "Ballots Cast": "ballots", + "Turnout": "turnout" + } + cols = ["ward", *[conv.get(col, col) for col in cols]] Path(f"../output/{race}").mkdir(parents=True, exist_ok=True) with open(f"../output/{race}/{contest}.csv", "w") as ofp: writer = csv.writer(ofp)
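To make the intent of the `transform_type` helper (introduced in patch 6/8 and reordered in patch 8/8) concrete: the "%" branch has to run before the "," branch so values such as "1,234.5%" parse correctly. Below is a minimal standalone sketch of the intended conversions; the function body is copied from the final patch (with comments added), and the sample inputs are illustrative assumptions about what the exported spreadsheets contain, not values taken from a real download.

# transform_type copied from patch 8/8 so this sketch runs on its own;
# the sample values in the asserts are assumed, not from a real export.
def transform_type(v):
    if v is None:
        return None
    if type(v) is float:
        # xlrd reports numeric cells as floats; keep integral values as ints
        return int(v) if v.is_integer() else v
    elif "%" in v:
        # percentage strings -> float; checked before "," so that
        # thousands-separated percentages like "1,234.5%" still parse
        return float(v[:-1].replace(",", ""))
    elif "," in v:
        # thousands-separated counts -> int
        return int(v.replace(",", ""))
    # note: a plain string containing neither "," nor "%" falls through
    # and returns None

assert transform_type(1234.0) == 1234
assert transform_type(12.5) == 12.5
assert transform_type("1,234") == 1234
assert transform_type("45.67%") == 45.67
assert transform_type("1,234.5%") == 1234.5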