-
Notifications
You must be signed in to change notification settings - Fork 33
/
scraper.py
executable file
·63 lines (50 loc) · 1.87 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env python3
import requests
import pprint
import time
from bs4 import BeautifulSoup
from classes.course import (Course, splitHeaderTable)
from classes.coursesPage import (Chunk, CoursesPage)
from classes.sessionData import SessionData
from classes.loadingBar import printProgressBar
from classes.toJSON import formatCourses
# Endpoint that every request in this script is sent to.
URL = "http://timetabling.anu.edu.au/sws2022/"
# Semester to scrape: only course-list entries whose first field ends with
# "S<SEMESTER>" are kept, and the value is also passed to getBody().
SEMESTER = 1
# Number of courses requested per POST. Valid range is 1-50; 50 is the
# largest request that is allowed.
CHUNK = 50
# Seconds to wait on the server before giving up. Without a timeout,
# requests waits indefinitely when the server stops responding.
REQUEST_TIMEOUT = 120

# Wall-clock start, used for the elapsed-time report at the end of the run.
start_time = time.time()

# Get landing page
print("Getting landing page...")
res = requests.get(URL, timeout=REQUEST_TIMEOUT)
# Fail loudly on an HTTP error instead of parsing an error page.
res.raise_for_status()
# NOTE(review): res.cookies holds only the cookies set by this one response;
# confirm whether the server relies on cookies from earlier responses (a
# requests.Session would persist all of them).
cookies = res.cookies
landingSoup = BeautifulSoup(res.content, 'html.parser')
# SessionData is a project class built from the parsed page; presumably it
# captures the form state that later POSTs must echo back - TODO confirm.
session = SessionData(landingSoup)

print("Got landing page! Getting list of courses...")
res = requests.post(
    URL,
    data=session.withTargetLinkType("LinkBtn_modules", "information"),
    cookies=cookies,
    timeout=REQUEST_TIMEOUT,
)
res.raise_for_status()
# Cookies and session data are refreshed from the latest response so the
# chunk requests that follow use the newest values.
cookies = res.cookies
session = SessionData(BeautifulSoup(res.content, 'html.parser'))

coursesPage = CoursesPage(res)
# Keep only the entries for the selected semester.
semester_suffix = f"S{SEMESTER}"
coursesPage.courseList = [
    entry for entry in coursesPage.courseList
    if entry[0].strip().endswith(semester_suffix)
]
courseCount = len(coursesPage.courseList)
print(f"Found {courseCount} courses.")

# getBody() returns a mapping; flatten it to (key, value) pairs so it can be
# concatenated with the other form fields when each request is built.
body = list(coursesPage.getBody(SEMESTER).items())
# Accumulates the results returned by splitHeaderTable() for every chunk.
courses = []
printProgressBar(0, courseCount)

# Seconds to wait for each chunk response before giving up, so a stalled
# server cannot hang the scrape forever.
CHUNK_TIMEOUT = 120

for courseCodes in Chunk(coursesPage, CHUNK):
    # Form payload: the session fields, the shared body fields, then one
    # 'dlObject' entry per course in this chunk (second field of each entry).
    reqBody = [*session.asModuleList(), *body]
    reqBody.extend(('dlObject', code[1]) for code in courseCodes)

    res = requests.post(URL, data=reqBody, cookies=cookies,
                        timeout=CHUNK_TIMEOUT)
    try:
        new = splitHeaderTable(res)
    except PermissionError:
        print("Request error!")
        # SystemExit is a builtin; the bare exit() helper is only injected by
        # the site module and is missing under `python -S` or frozen builds.
        raise SystemExit(1)

    # Extend in place rather than rebuilding the whole list on every chunk.
    courses.extend(new)
    printProgressBar(len(courses), courseCount)

formatCourses(courses)
print(
    f"Scraping complete, scraped {len(courses)} courses in total, time elapsed: { time.time() - start_time}s")