"""
Catpture raw permit html and write to file.
```bash
sudo docker run --name all-the-permits --rm -d -v /home/ec2-user/atx-permit-bot:/app -w /app atx-permit-bot python write.py
sudo docker run --name all-the-permits --rm -it -v /home/ec2-user/atx-permit-bot:/app -w /app atx-permit-bot python write.py
```
"""
from datetime import datetime
import logging
from logging.handlers import RotatingFileHandler
from multiprocessing import Pool
import os

import requests

from config.config import BASE_URL, DATESTRING_FORMAT

def success(html):
    """Return True if the HTML contains permit data."""
    return bool(html) and "No Rows Returned" not in html
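
# success() examples: success("") is False, a page containing the text
# "No Rows Returned" is False, and any other non-empty page is True.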

def async_get_permits(rsn):
    """Fetch one permit page and write it to the s3/ directory."""
    now = datetime.now().strftime(DATESTRING_FORMAT)
    print(now)
    print(rsn)
    url = f"{BASE_URL}{rsn}"
    html = get_permit(url)
    logger.info(f"RSN: {rsn}")
    if not success(html):
        # a *_NO_DATA.html file marks this RSN as scraped so that
        # get_scraped_rsns() skips it on subsequent runs
        fname = f"s3/{rsn}_NO_DATA.html"
    else:
        fname = f"s3/{rsn}.html"
    with open(fname, "w") as fout:
        print("write")
        fout.write(html)

def get_permit(url):
    """Request a permit page, returning its HTML or "" on any error."""
    try:
        print(f"trying: {url}")
        res = requests.get(url)
        print("got a response")
        res.raise_for_status()
    except Exception as e:
        # log the exception itself; `res` may not exist if the request
        # failed before a response arrived
        print("error")
        logger.error(e)
        return ""
    return res.text
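
import time  # used only by the illustrative sketch below

# The original loop shape suggests retrying until success. A minimal
# bounded-retry variant could look like this sketch; the name
# get_permit_with_retries and its parameters are illustrative, and it is
# not called anywhere in this script.
def get_permit_with_retries(url, attempts=3, backoff_seconds=2):
    """Sketch: retry a flaky request a few times before giving up."""
    for attempt in range(attempts):
        try:
            res = requests.get(url, timeout=30)
            res.raise_for_status()
            return res.text
        except Exception as e:
            logger.error(f"attempt {attempt + 1} failed for {url}: {e}")
            time.sleep(backoff_seconds * (attempt + 1))
    return ""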

def get_unscraped_rsns(max_rsn, scraped_rsns):
    """Return every RSN below max_rsn that has not been scraped yet."""
    print("getting unscraped rsns")
    if not scraped_rsns:
        scraped_rsns = [10000000]
    scraped = set(scraped_rsns)  # set membership keeps the scan O(1) per RSN
    return [rsn for rsn in range(min(scraped), max_rsn) if rsn not in scraped]
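
# For example, with scraped_rsns = [10000000, 10000002] and
# max_rsn = 10000004, the candidate range is 10000000..10000003 and the
# unscraped RSNs are [10000001, 10000003].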

def get_scraped_rsns(path):
    """Collect RSNs from the .html files already written to `path`."""
    print("getting scraped rsns")
    rsns = []
    for file in os.scandir(path):
        if file.name.endswith(".html"):
            # use file.name so the parse works for any `path`, not just s3/
            fname = file.name.replace(".html", "")
            rsn = int(fname.split("_")[0])
            rsns.append(rsn)
    return rsns
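
# For example, "12353000_NO_DATA.html" yields RSN 12353000, the same as
# "12353000.html" would, so empty-result pages still count as scraped.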

def main():
    max_rsn = 12353184  # the largest (most recent) RSN that we want to scrape
    print("starting")
    scraped_rsns = get_scraped_rsns("s3")
    unscraped_rsns = get_unscraped_rsns(max_rsn, scraped_rsns)
    # with the default fork start method on Linux, the worker processes
    # inherit the module-level `logger` configured in the __main__ guard
    with Pool(processes=4) as pool:
        pool.map(async_get_permits, unscraped_rsns)
    logger.info("done")
if __name__ == "__main__":
logger = logging.getLogger("my_logger")
logging.basicConfig(
format="%(asctime)s %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p"
)
handler = RotatingFileHandler("log/write.log", maxBytes=200000000)
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
main()