-
Notifications
You must be signed in to change notification settings - Fork 0
/
ContentScraper.py
42 lines (35 loc) · 1.01 KB
/
ContentScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import urllib2
from urllib2 import urlopen
import re
import cookielib
from cookielib import CookieJar
import time
# Shared HTTP client for the whole script: `opener` routes every request
# through an HTTPCookieProcessor backed by `cj`, so cookies received on one
# response are kept in the jar and offered on later requests.
cj = CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# Replace the opener's default header list with a browser-like User-agent
# (presumably so the site serves the same markup it would to a browser).
opener.addheaders = [('User-agent','Mozilla/5.0')]
def _fetch(url):
    """Return the raw body of `url`, fetched through the module-level `opener`.

    The response object is always closed, even if reading fails part-way.
    """
    response = opener.open(url)
    try:
        return response.read()
    finally:
        response.close()


def _is_homepage(link):
    """Return True if `link` is the site's front page rather than an article.

    The URL scheme, a trailing slash and letter case are ignored, so
    'http://www.huffingtonpost.com/' and 'www.huffingtonpost.com' both match.
    """
    without_scheme = link.split('://', 1)[-1].rstrip('/')
    return without_scheme.lower() == 'www.huffingtonpost.com'


def main():
    """Print the paragraph text of every article linked from the RSS feed.

    Downloads the Huffington Post feed, extracts each ``<link>`` element,
    and for every link except the site's front page fetches the page and
    prints the contents of its ``<p>...</p>`` elements.

    Errors are reported on stdout rather than raised: a failure to fetch the
    feed prints 'main loop 2 <error>' and stops; a failure on a single
    article prints 'main loop 1 <error>' and moves on to the next link.
    """
    page = 'http://www.huffingtonpost.com/feeds/index.xml'
    try:
        sourceCode = _fetch(page)
        links = re.findall(r'<link>(.*?)</link>', sourceCode)
    except Exception as e:
        print('main loop 2 ' + str(e))
        return

    for link in links:
        link = link.strip()
        # Skip empty entries and the feed's own link back to the front page.
        if not link or _is_homepage(link):
            continue
        # Handle errors per link so one bad article does not abort the rest.
        try:
            print("let's open URL: " + link)
            linkSource = _fetch(link)
            # Best-effort extraction: only matches bare <p> tags whose
            # content sits on a single line ('.' does not match newlines).
            for paragraph in re.findall(r'<p>(.*?)</p>', linkSource):
                print(paragraph)
        except Exception as e:
            print('main loop 1 ' + str(e))
# Run the scrape. NOTE(review): this call is not behind an
# `if __name__ == '__main__':` guard, so it also runs when the module is imported.
main()