Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

All Chicago Scrapers #16

Merged
merged 12 commits into from
Feb 3, 2015
Merged
280 changes: 168 additions & 112 deletions chicago/bills.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,28 @@
from .legistar import LegistarScraper
import lxml
import lxml.etree
import datetime
import pytz

from pupa.scrape import Bill
from pupa.scrape import Bill, Vote


class ChicagoBillScraper(LegistarScraper):
base_url = 'https://chicago.legistar.com/'
legislation_url = 'https://chicago.legistar.com/Legislation.aspx'
timezone = "US/Central"

def session(self, action_date):
    """Return the legislative-session label active on *action_date*.

    Chicago City Council terms turn over on May 18 of the inauguration
    year, so the session boundaries are 2011-05-18 and 2015-05-18.

    :param action_date: timezone-aware ``datetime.datetime``.
    :returns: ``"2007"``, ``"2011"``, or ``"2015"``.
    """
    # BUG FIX: passing a pytz zone as the ``tzinfo=`` constructor
    # argument attaches the zone's historical LMT offset (-05:51 for
    # US/Central) rather than CST/CDT.  ``localize`` computes the
    # correct UTC offset for the given wall-clock time.
    tz = pytz.timezone(self.timezone)
    if action_date < tz.localize(datetime.datetime(2011, 5, 18)):
        return "2007"
    elif action_date < tz.localize(datetime.datetime(2015, 5, 18)):
        return "2011"
    else:
        return "2015"



def searchLegislation(self, search_text='', created_before=None,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah! Any additional params will be passed to scrape!

On Fri, Jan 16, 2015 at 10:06 AM, Forest Gregg [email protected]
wrote:

In chicago/bills.py
#16 (diff)
:

class ChicagoBillScraper(LegistarScraper):
base_url = 'https://chicago.legistar.com/'
legislation_url = 'https://chicago.legistar.com/Legislation.aspx'

  • timezone = "US/Central"

def searchLegislation(self, search_text='', created_before=None,

Does pupa have any facilities for passing through command line args. It
would be great to be able to specify search parameters.


Reply to this email directly or view it on GitHub
https://github.com/opencivicdata/scrapers-us-municipal/pull/16/files#r23084807
.

Paul Tagliamonte
Software Developer | Sunlight Foundation

created_after=None, num_pages = None):
Expand Down Expand Up @@ -64,136 +79,177 @@ def parseSearchResults(self, page) :
continue
legislation_url = legislation[id_key]['url'].split(self.base_url)[-1]
legislation[id_key] = legislation_id
legislation['URL'] = self.base_url + legislation_url.split('&Options')[0]
legislation['url'] = self.base_url + legislation_url.split('&Options')[0]

yield legislation

def expandLegislationSummary(self, summary):
    """Fetch and parse the detail page behind a search-result row.

    *summary* is a row dict produced by ``searchLegislation``; its URL
    is followed and the resulting page is handed to
    ``parseLegislationDetail``.
    """
    return self.expandSummaryRow(summary, self.parseLegislationDetail)

def expandHistorySummary(self, action):
    """Fetch and parse the detail page behind a history-table row.

    *action* is a row dict produced by ``parseLegislationDetail``; its
    URL is followed and the resulting page is handed to
    ``parseHistoryDetail``.
    """
    return self.expandSummaryRow(action, self.parseHistoryDetail)

def expandSummaryRow(self, row, parse_function):
    """Follow the URL in a data-table *row* and parse the fetched page.

    :param row: dict with a ``'URL'`` key pointing at a detail page.
    :param parse_function: callable applied to the fetched lxml page.
    :returns: whatever *parse_function* returns.
    """
    # Removed a bare ``print(row['URL'])`` left over from debugging so
    # scraper output stays clean.
    page = self.lxmlize(row['URL'])

    return parse_function(page)

def _get_general_details(self, detail_div):
    """Scrape label/value pairs from the top section of a detail page.

    Labels live in ``lbl…`` spans whose ids do not contain ``'2'``;
    each matching value is the ``lbl…``/``hyp…`` element whose id does
    contain ``'2'``.  Returns a dict of cleaned label text to value
    text.
    """
    label_xpath = ".//span[contains(@id, 'ctl00_ContentPlaceHolder1_lbl') "\
                  " and not(contains(@id, '2'))]"

    value_xpath = ".//*[(contains(@id, 'ctl00_ContentPlaceHolder1_lbl') "\
                  " or contains(@id, 'ctl00_ContentPlaceHolder1_hyp')) "\
                  " and contains(@id, '2')]"

    labels = (span.text_content().replace(':', '').strip()
              for span in detail_div.xpath(label_xpath))
    values = (element.text_content().strip()
              for element in detail_div.xpath(value_xpath))

    return dict(zip(labels, values))


def parseLegislationDetail(self, page):
    """Extract the fields shown on a legislation detail page.

    Returns a dict of the page's general details, plus:
    - ``'Attachments'``: list of ``{'url', 'label'}`` dicts (possibly
      empty);
    - ``'Related files'``, ``'Sponsors'``, ``'Topics'``: split on
      commas into lists, when present.

    Example URL: http://chicago.legistar.com/LegislationDetail.aspx?ID=1050678&GUID=14361244-D12A-467F-B93D-E244CB281466&Options=ID|Text|&Search=zoning
    """
    # Top matter of the page.
    container = page.xpath("//div[@id='ctl00_ContentPlaceHolder1_pageDetails']")[0]
    details = self._get_general_details(container)

    anchors = container.xpath(".//span[@id='ctl00_ContentPlaceHolder1_lblAttachments2']/a")
    details[u'Attachments'] = [{'url': anchor.attrib['href'],
                                'label': anchor.text_content()}
                               for anchor in anchors]

    # These fields arrive as comma-separated strings.
    for field in (u'Related files', u'Sponsors', u'Topics'):
        if field in details:
            details[field] = details[field].split(',')

    return details


def scrape(self):
    """Yield a pupa ``Bill`` for each piece of legislation found by
    ``searchLegislation``, followed by any ``Vote`` objects gathered
    from that bill's action history.

    Rows with an empty title, and non-legislative record types
    (orders, claims, communications, reports, oaths of office), are
    skipped.  The span this replaces was corrupted by interleaved
    pre-refactor diff lines; this is the coherent post-diff version.
    """
    for i, page in enumerate(self.searchLegislation()):
        for legislation_summary in self.parseSearchResults(page):
            title = legislation_summary['Title'].strip()
            if title == "":
                continue

            if legislation_summary['Type'].lower() in ('order',
                                                       'claim',
                                                       'communication',
                                                       'report',
                                                       'oath of office'):
                continue
            else:
                bill_type = legislation_summary['Type'].lower()

            # Session is derived from the introduction date.
            bill_session = self.session(legislation_summary['Intro\xa0Date'])

            bill = Bill(identifier=legislation_summary['Record #'],
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization=self.jurisdiction.name)

            bill.add_source(legislation_summary['url'])

            bill, votes = self.addDetails(bill, legislation_summary['url'])

            yield bill
            for vote in votes:
                yield vote
def extractVotes(self, action_detail_url):
    """Scrape the roll call from an action-detail page.

    :param action_detail_url: URL of a Legistar action-detail page.
    :returns: ``(result, vote_list)`` where *result* is the
        lower-cased 'Result' field and *vote_list* is a list of
        ``(option, voter_name)`` tuples with options normalized via
        ``VOTE_OPTIONS`` (unknown raw options pass through unchanged).

    The span this replaces was corrupted by interleaved pre-refactor
    diff lines; this is the coherent post-diff version.
    """
    action_detail_page = self.lxmlize(action_detail_url)
    vote_table = action_detail_page.xpath("//table[@id='ctl00_ContentPlaceHolder1_gridVote_ctl00']")[0]
    votes = list(self.parseDataTable(vote_table))
    vote_list = []
    for vote, _, _ in votes:
        raw_option = vote['Vote'].lower()
        vote_list.append((VOTE_OPTIONS.get(raw_option, raw_option),
                          vote['Person Name']['label']))

    action_detail_div = action_detail_page.xpath(".//div[@id='ctl00_ContentPlaceHolder1_pageTop1']")[0]
    action_details = self.parseDetails(action_detail_div)
    result = action_details['Result'].lower()

    return result, vote_list
def addBillHistory(self, bill, history_table) :
    """Walk a bill's Legistar history table, adding each action to
    *bill* and collecting a Vote for every action with a roll call.

    :param bill: pupa Bill being built (mutated in place).
    :param history_table: lxml element for the history grid.
    :returns: list of pupa Vote objects (may be empty).
    """
    all_votes = []

    history = self.parseDataTable(history_table)

    for action, _, _ in history :
        action_description = action['Action']
        try :
            action_date = action['Date'].date().isoformat()
        except AttributeError : # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
            # Rows with no parseable date are skipped entirely.
            continue

        if action_description :
            # NOTE(review): an action description missing from
            # ACTION_CLASSIFICATION raises KeyError here — confirm
            # fail-fast is intended rather than .get(...).
            bill.add_action(action_description,
                            action_date,
                            organization=action['Action\xa0By'],
                            classification=ACTION_CLASSIFICATION[action_description])
        # NOTE(review): indentation reconstructed from a flattened
        # diff — confirm this check is a sibling of the block above,
        # not nested inside it.
        if 'url' in action['Action\xa0Details'] :
            action_detail_url = action['Action\xa0Details']['url']
            result, votes = self.extractVotes(action_detail_url)

            if votes and result : # see https://github.com/datamade/municipal-scrapers-us/issues/15
                action_vote = Vote(legislative_session=bill.legislative_session,
                                   motion_text=action_description,
                                   classification=None,
                                   start_date=action_date,
                                   result=result,
                                   bill=bill.identifier)
                action_vote.add_source(action_detail_url)
                for option, voter in votes :
                    action_vote.vote(option, voter)

                all_votes.append(action_vote)

    return all_votes


def addDetails(self, bill, detail_url):
    """Augment *bill* with data from its Legistar detail page: related
    bills, sponsorships, subjects, and attachment version links.

    :param bill: pupa Bill being built (mutated in place).
    :param detail_url: URL of the legislation detail page.
    :returns: ``(bill, votes)`` where *votes* is the list of Vote
        objects gathered from the bill's action history.
    """
    detail_page = self.lxmlize(detail_url)
    detail_div = detail_page.xpath(".//div[@id='ctl00_ContentPlaceHolder1_pageDetails']")[0]

    legislation_details = self.parseDetails(detail_div)

    for related_bill in legislation_details.get('Related files', []):
        bill.add_related_bill(identifier=related_bill['label'],
                              legislative_session=bill.legislative_session,
                              relation_type='pending')

    # The first listed sponsor is treated as the primary sponsor.
    for i, sponsor in enumerate(legislation_details.get('Sponsors', [])):
        if i == 0:
            primary = True
            sponsorship_type = "Primary"
        else:
            primary = False
            sponsorship_type = "Regular"

        bill.add_sponsorship(sponsor['label'], sponsorship_type,
                             'person', primary)

    if u'Topics' in legislation_details:
        # BUG FIX: the loop variable was misspelled 'subjuct' while the
        # body referenced 'subject', raising NameError for any bill
        # that has Topics.
        for subject in legislation_details[u'Topics'].split(','):
            bill.add_subject(subject)

    for attachment in legislation_details.get(u'Attachments', []):
        if attachment['label']:
            bill.add_version_link(attachment['label'],
                                  attachment['url'],
                                  media_type="application/pdf")

    history_table = detail_page.xpath("//table[@id='ctl00_ContentPlaceHolder1_gridLegislation_ctl00']")[0]

    votes = self.addBillHistory(bill, history_table)

    return bill, votes




# Maps Legistar action descriptions to Open Civic Data action
# classification slugs.  Entries mapped to None are recorded as
# actions without a classification.
ACTION_CLASSIFICATION = {'Referred' : 'committee-referral',
                         'Re-Referred' : 'committee-referral',
                         'Recommended to Pass' : 'committee-passage-favorable',
                         'Passed as Substitute' : 'passage',
                         'Adopted' : 'passage',
                         'Approved' : 'passage',
                         'Passed' : 'passage',
                         'Substituted in Committee' : 'substitution',
                         'Failed to Pass' : 'failure',
                         'Recommended Do Not Pass' : 'committee-passage-unfavorable',
                         'Amended in Committee' : 'amendment-passage',
                         'Placed on File' : 'filing',
                         'Withdrawn' : 'withdrawal',
                         'Signed by Mayor' : 'executive-signature',
                         'Appointment' : 'appointment',
                         'Direct Introduction' : None,
                         'Remove Co-Sponsor(s)' : None,
                         'Add Co-Sponsor(s)' : None,
                         'Tabled' : None,
                         'Rules Suspended - Immediate Consideration' : None,
                         'Committee Discharged' : None,
                         'Held in Committee' : None,
                         'Recommended for Re-Referral' : None,
                         'Published in Special Pamphlet' : None,
                         'Adopted as Substitute' : None,
                         'Deferred and Published' : None,
                         }

# Normalizes raw Legistar roll-call strings (lower-cased) to pupa vote
# options; raw values absent from this map are passed through as-is in
# extractVotes.
VOTE_OPTIONS = {'yea' : 'yes',
                'rising vote' : 'yes',
                'nay' : 'no',
                'recused' : 'excused'}



Loading