Skip to content

Commit

Permalink
fix
Browse files (browse the repository at this point in the history)
  • Loading branch information
bzhangjma committed Oct 30, 2024
1 parent a370b4c commit 3f6de35
Showing 1 changed file with 55 additions and 28 deletions.
83 changes: 55 additions & 28 deletions ca_on_markham/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,8 @@
from utils import CanadianPerson as Person
from utils import CanadianScraper

COUNCIL_PAGE = (
"https://www.markham.ca/wps/portal/home/about/city-hall/regional-ward-councillors/02-regional-ward-councillors"
)
MAYOR_PAGE = "https://www.markham.ca/wps/portal/home/about/city-hall/mayor/00-mayors-office"
COUNCIL_PAGE = "https://www.markham.ca/about-city-markham/city-hall/regional-ward-councillors"
MAYOR_PAGE = "https://www.markham.ca/about-city-markham/city-hall/mayors-office"


class MarkhamPersonScraper(CanadianScraper):
Expand All @@ -17,25 +15,19 @@ def scrape(self):

yield self.scrape_mayor(MAYOR_PAGE)

councillors = page.xpath('//div[@class="col-sm-3 col-xs-6"]')
regional_councillors = page.xpath('//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]')[0]
ward_councillors = page.xpath('//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]')[1]
councillors = [regional_councillors, ward_councillors]
assert len(councillors), "No councillors found"
for councillor in councillors:
name, district = councillor.xpath(".//h4/text()")[0].split(", ")
if "Ward" in district:
district = district.replace("Councillor", "").strip()
role = "Councillor"
elif "Regional" in district:
role = "Regional Councillor"
district = f"Markham (seat {regional_councillor_seat_number})"
regional_councillor_seat_number += 1
else:
role = district
district = "Markham"
for i, councillor in enumerate(regional_councillors):
name = councillor.xpath(".//h3/text()")[0].strip()
district = councillor.xpath(".//p/text()")[0].strip()
role = "Regional Councillor"
district = f"Markham (seat {regional_councillor_seat_number})"
regional_councillor_seat_number += 1

image = councillor.xpath(".//img/@src")[0]
url = "https://www.markham.ca/wps/portal/home/about" + re.search(
r"(?<=about).*(?='\))", councillor.xpath(".//a/@href")[0]
).group(0)
url = councillor.xpath(".//a/@href")[0]

address, phone, email, links = self.get_contact(url)

Expand All @@ -52,14 +44,48 @@ def scrape(self):
p.add_link(link)

yield p

for i, councillor in enumerate(ward_councillors):
name = councillor.xpath(".//h3/text()")[0].strip()
district = councillor.xpath(".//p/text()")[0].strip()
district = district.replace("Councillor", "").strip()
role = "Councillor"

image = councillor.xpath(".//img/@src")[0]
url = councillor.xpath(".//a/@href")[0]

address, phone, email, links = self.get_contact(url)

p = Person(primary_org="legislature", name=name, district=district, role=role)
p.add_source(COUNCIL_PAGE)
p.add_source(url)

p.image = image
p.add_contact("address", address, "legislature")
p.add_contact("voice", phone, "legislature")
p.add_contact("email", email)

for link in links:
p.add_link(link)

yield p

def get_contact(self, url):
page = self.lxmlize(url)

contact_node = page.xpath('//div[@class="vcard col-sm-6"]')[0]
contact_node = page.xpath('//div[@class="pd-x-16 pd-y-32 bg-white committee-right-info-section layout__region layout__region--second"]')[0]
links = []

address = contact_node.xpath(".//p/text()")[:2]
if contact_node.xpath('.//span[@class="address-line1"]/text()'):
address = (contact_node.xpath('.//span[@class="address-line1"]/text()')[0]
+ " " + contact_node.xpath('.//span[@class="locality"]/text()')[0]
+ " " + contact_node.xpath('.//span[@class="administrative-area"]/text()')[0]
+ " " + contact_node.xpath('.//span[@class="postal-code"]/text()')[0]
+ " " + contact_node.xpath('.//span[@class="country"]/text()')[0])
else:
contact_node = page.xpath('//div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]')[0]
address = contact_node.xpath('.//p/text()')[0] + " " + contact_node.xpath('.//p/text()')[1]

links = get_links(contact_node)
phone = self.get_phone(contact_node)
email = self.get_email(contact_node)
Expand All @@ -68,12 +94,13 @@ def get_contact(self, url):

def scrape_mayor(self, url):
page = self.lxmlize(url)
name = page.xpath('//img/@alt[contains(., "Mayor")]')[0].split(", ", 1)[1]
email = self.get_email(page)
phone = self.get_phone(page)

name = page.xpath('.//div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]/h1/span/span/text()')[0]
contact_node = page.xpath('.//div[@class="dept-contact-info--block"]')[0]
email = self.get_email(contact_node)
phone = self.get_phone(contact_node)

p = Person(primary_org="legislature", name=name, district="Markham", role="Mayor")
p.image = page.xpath('//img[contains(./@alt, "Mayor")]/@src')[0]
p.image = page.xpath('.//div[@class="align-right media--image"]/div/img/@src')[0]
p.add_contact("email", email)
p.add_contact("voice", phone, "legislature")
p.add_source(url)
Expand All @@ -86,6 +113,6 @@ def get_links(elem):
links = elem.xpath(".//a")
for link in links:
link = link.attrib["href"]
if "http://www.markham.ca" not in link and "mail" not in link:
if "http://www.markham.ca" not in link and "mail" not in link and "tel" not in link:
links_r.append(link)
return links_r

0 comments on commit 3f6de35

Please sign in to comment.