-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrap.py
executable file
·47 lines (39 loc) · 1.38 KB
/
scrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/env python
import json
import argparse
from pprint import pprint
from lxml import html
from urllib.request import urlopen, Request
def parse_patterns(specs):
    """Parse 'NAME:XPATH' specs into a {name: xpath} dict.

    Only the first ':' separates name from xpath, so the xpath itself may
    contain colons. Raises ValueError for a spec with no ':' at all (the
    original crashed with a bare IndexError here).
    """
    patterns = {}
    for spec in specs:
        name, sep, xpath = spec.partition(':')
        if not sep:
            raise ValueError("invalid pattern %r: expected NAME:XPATH" % spec)
        patterns[name] = xpath
    return patterns


def fetch_tree(url):
    """Download *url* and return it parsed as an lxml HTML tree."""
    request = Request(url)
    # NOTE(review): 'Accept-Encoding: utf-8' looks like it was meant to be
    # Accept-Charset -- Accept-Encoding negotiates compression (gzip, ...),
    # not character sets. Kept as-is to preserve the request on the wire.
    request.add_header('Accept-Encoding', 'utf-8')
    # Close the connection deterministically (the original leaked it).
    with urlopen(request) as page:
        raw = page.read()
    # Decode leniently so stray bytes don't kill the scrape.
    return html.fromstring(raw.decode('utf-8', 'ignore'))


def scrape(tree, patterns):
    """Evaluate each xpath against *tree*.

    Returns a dict mapping each pattern name to:
      '*'           when the xpath matched nothing (original sentinel),
      a single str  when it matched exactly one result,
      a list of str when it matched several.

    Assumes each xpath yields string results (text()/@attr selections) --
    element nodes have no .strip() and would raise AttributeError.
    """
    results = {}
    for name, xpath in patterns.items():
        matches = tree.xpath(xpath)
        if not matches:
            results[name] = '*'
        elif len(matches) == 1:
            results[name] = matches[0].strip()
        else:
            results[name] = [m.strip() for m in matches]
    return results


def main():
    """CLI entry point: parse args, scrape the URL, print compact JSON."""
    parser = argparse.ArgumentParser(description='Webscraper')
    # --url is now required: the original accepted its absence and then
    # crashed inside Request(None); argparse now gives a clean usage error.
    parser.add_argument('-u', '--url', metavar='URL', help='Site to scrap',
                        required=True)
    parser.add_argument('-s', '--scrap', metavar='NAME:XPATH',
                        help='Scraping pattern', required=True, nargs='+')
    parser.add_argument('-v', '--verbose', help='increase output verbosity',
                        action='count', default=0)
    args = parser.parse_args()
    if args.verbose:
        print("verbosity turned on to %s" % args.verbose)
    try:
        patterns = parse_patterns(args.scrap)
    except ValueError as exc:
        parser.error(str(exc))  # exits with a usage message
    if args.verbose:
        pprint(args, indent=3)
        print("URL: %s" % args.url)
        pprint(patterns, indent=3)
    tree = fetch_tree(args.url)
    response_json = scrape(tree, patterns)
    print(json.dumps(response_json, separators=(',', ':'), ensure_ascii=False))


if __name__ == '__main__':
    main()