#!/usr/bin/env python3
import argparse
import os

import requests
import unicodedata
from bs4 import BeautifulSoup

from link_fetcher import LinkFetcher
from postutils import save_post_as_text, save_post_as_json, create_dir


class MediumCrawler:
    """
    Crawls every post for a given user from medium.com.
    """

    def __init__(self, username='nishparadox'):
        self.fetcher = LinkFetcher(driver_type='headless', username=username)
        self.nlinks = 0
        self.posts = []

    def crawl_lazily(self):
        """
        Crawl the user's posts one by one.

        Implemented as a generator, so each post can be consumed
        (and dumped) as soon as it has been fetched instead of
        waiting for the whole crawl to finish.
        """
        self.posts = []
        links = self.fetcher.get_links()
        self.nlinks = len(links)
        print("Total number of links :: {}".format(self.nlinks))
        for link in links:
            print("Getting :: {}".format(link))
            post = self.get_post(link)
            self.posts.append(post)
            yield post

    def get_post(self, link):
        """
        GET the post that the link points to and parse it into a
        dictionary of the form:
            {
                "title": "...",
                "timestamp": "...",
                "content": "...",
                "tags": [...]
            }
        """
        response = requests.get(link)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Medium renders the post body inside these section divs.
        divs = soup.find_all('div', {'class': 'section-inner sectionLayout--insetColumn'})
        title = soup.find('h1', {'class': 'graf graf--h3 graf--leading graf--title'}).get_text(strip=True)
        title = unicodedata.normalize("NFKD", title)
        content = ' '.join(div.get_text(strip=True) for div in divs)
        content = unicodedata.normalize("NFKD", content)
        time = soup.find('time')['datetime']

        # Tags live in an <ul>; iterate over its <li> children rather than
        # the <ul> itself, and tolerate posts that carry no tags at all.
        tag_list = soup.find('ul', {'class': 'tags tags--postTags tags--borderless'})
        tags = [li.get_text(strip=True) for li in tag_list.find_all('li')] if tag_list else []
        return {
            'title': title,
            'timestamp': time,
            'content': content,
            'tags': tags
        }
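
# A minimal usage sketch (illustrative only; assumes LinkFetcher can start
# its headless driver and that the user actually has published posts):
#
#     crawler = MediumCrawler(username='nishparadox')
#     for post in crawler.crawl_lazily():
#         print(post['title'], post['timestamp'])
#
# Because crawl_lazily() is a generator, each post is available as soon as
# it is fetched; the full list also accumulates in crawler.posts.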


def parse():
    parser = argparse.ArgumentParser(
        "mcrawler",
        description="Crawl posts from medium"
    )
    parser.add_argument(
        '-u',
        '--user',
        help='The username for medium',
        required=True
    )
    parser.add_argument(
        '-t',
        '--type',
        help='The format for dumping -> text, json',
        required=True
    )
    parser.add_argument(
        '-dd',
        '--dump-dir',
        help='The directory where the data is to be dumped',
        required=True
    )
    return parser.parse_args()
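
# Example invocation (the dump directory here is hypothetical, for
# illustration):
#
#     python3 mcrawler.py -u nishparadox -t json -dd ./dumps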


def run_crawler(username, dump_type, dump_dir):
    print("Crawling for user :: {}".format(username))
    crawler = MediumCrawler(username=username)

    # Any dump type other than 'text' falls back to the JSON dumper.
    dfunc = save_post_as_text if dump_type == 'text' else save_post_as_json
    create_dir(dump_dir, username)
    for post in crawler.crawl_lazily():
        dfunc(username, post, dump_dir)
    print("Dumped {} posts to {}".format(crawler.nlinks, os.path.join(dump_dir, username)))


def run(args):
    run_crawler(args.user, args.type, args.dump_dir)


def main():
    args = parse()
    run(args)


if __name__ == "__main__":
    main()