-
Notifications
You must be signed in to change notification settings - Fork 2
/
google_search.py
114 lines (99 loc) · 5.03 KB
/
google_search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import httplib
import urllib
from BeautifulSoup import BeautifulSoup
import re
import time
class GoogleScholarSearch:
    """Screen-scraping client for Google Scholar result pages.

    Result pages are fetched over plain HTTP with ``httplib`` and parsed with
    BeautifulSoup; every ``<div class="gs_r">`` element of a page is treated
    as one publication record.
    """

    # Step by which the ``start`` query parameter advances between pages.
    # NOTE(review): assumes Scholar serves 10 results per page - confirm.
    PAGE_SIZE = 10

    # Seconds to pause between two consecutive page requests.
    PAGE_DELAY = 3

    def __init__(self):
        """Set the host and path that search requests are sent to."""
        self.SEARCH_HOST = "scholar.google.com"
        self.SEARCH_BASE_URL = "/scholar"

    def search(self, terms, limit=10):
        """Search Google Scholar and return the scraped publications.

        Args:
            terms: Sequence of query strings; they are joined with ``+``
                and URL-encoded into the ``q`` parameter.
            limit: Maximum number of publications to return.  Pages are
                requested until ``limit`` result slots have been covered,
                so a value that is not a multiple of the page size still
                yields results.

        Returns:
            A list of dicts with the keys ``URL``, ``Title``, ``Authors``,
            ``JournalYear``, ``JournalURL``, ``Abstract``, ``NumCited`` and
            ``Terms``.  An empty list is returned when ``limit`` is not
            positive, and also as soon as any page request is answered with
            a status other than 200 (results from earlier pages are then
            discarded).
        """
        results = []
        start = 0
        while start < limit:
            html = self._fetch_page(terms, start)
            if html is None:
                return []
            # Screen-scrape the result to obtain the publication information
            records = BeautifulSoup(html)('div', {'class': 'gs_r'})
            if not records:
                # A page without records means there is nothing left to page
                # through, so do not keep issuing requests.
                break
            for record in records:
                publication = self._parse_record(record, terms)
                if publication is not None:
                    results.append(publication)
            start += self.PAGE_SIZE
            if start < limit:
                # Only pause when another request is about to follow.
                time.sleep(self.PAGE_DELAY)
        return results[:max(limit, 0)]

    def _fetch_page(self, terms, start):
        """Return the HTML of the result page at offset *start*, or None.

        None is returned (after printing the status and reason) when the
        server answers with anything other than 200.  The connection is
        closed in every case.
        """
        # NOTE(review): urlencode percent-encodes the '+' used to join the
        # terms, so the query carries a literal plus sign - confirm intended.
        # NOTE(review): 'as_yhi' is presumably an upper bound on the
        # publication year; it is hard-coded to 2008 - confirm.
        params = urllib.urlencode(
            {'q': "+".join(terms), 'as_yhi': 2008, 'start': start})
        headers = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        url = self.SEARCH_BASE_URL + "?" + params
        conn = httplib.HTTPConnection(self.SEARCH_HOST)
        try:
            # A GET carries no body: pass None rather than an empty dict,
            # which httplib would try to send as the request payload.
            conn.request("GET", url, None, headers)
            resp = conn.getresponse()
            if resp.status != 200:
                # The double space keeps the output identical to the former
                # two chained print statements.
                print("ERROR:  %s %s" % (resp.status, resp.reason))
                return None
            # Non-ASCII bytes are dropped so that later str() conversions of
            # parsed nodes cannot fail on encoding.
            return resp.read().decode('ascii', 'ignore')
        finally:
            conn.close()

    def _parse_record(self, record, terms):
        """Convert one ``gs_r`` element into a publication dict.

        Missing pieces become empty strings instead of raising.  None is
        returned when neither a title nor a URL can be recovered.
        """
        record_html = str(record)

        pubURL = ""
        pubTitle = ""
        heading = record.first('h3')
        link = None
        if heading is not None:
            link = heading.a
        if link is not None:
            # Clean up the URL, make sure it does not contain '\' but '/'
            # instead
            pubURL = link.get('href', '').replace('\\', '/')
            # Children without a single string (nested markup) are skipped
            # so that the text "None" never leaks into the title.
            pubTitle = "".join(
                [str(part.string) for part in link.contents
                 if part.string is not None])
        if pubTitle == "":
            # Unlinked "[CITATION]" entries: take the text that follows the
            # marker.  The match list belongs to this record alone, so its
            # first element is the one to use.
            match1 = re.findall(
                r'<b>\[CITATION\]<\/b><\/font>(.*)- <a', record_html)
            if match1:
                match2 = re.split(r'- <a', match1[0])
                pubTitle = re.sub(r'<\/?(\S)+>', "", match2[0])
        if pubTitle == "" and pubURL == "":
            return None

        authorPart = None
        author_span = record.first('span', {'class': 'gs_a'})
        if author_span is not None:
            authorPart = author_span.string
        if authorPart is None:
            # The span holds nested markup (or is absent): fall back to the
            # raw HTML between the span tags.
            raw = re.search(r'<span class="gs_a">(.*?)</span>', record_html)
            if raw is not None:
                authorPart = raw.group(1)
            else:
                authorPart = ""
        pubAuthors, pubJournalYear, pubJournalURL = \
            self._split_author_line(authorPart)

        pubCitation = ''
        match = re.search(r"Cited by ([^<]*)", record_html)
        if match is not None:
            pubCitation = match.group(1)

        return {
            "URL": pubURL,
            "Title": pubTitle,
            "Authors": pubAuthors,
            "JournalYear": pubJournalYear,
            "JournalURL": pubJournalURL,
            "Abstract": self._extract_abstract(record),
            "NumCited": pubCitation,
            "Terms": terms
        }

    def _split_author_line(self, authorPart):
        """Split an author line into ``(authors, journal_year, journal_url)``.

        The fields are assumed to be delimited by ' - ': the first entry is
        the list of authors, the last entry is the journal URL, and anything
        in between should contain the journal year.
        """
        idx_start = authorPart.find(' - ')
        if idx_start == -1:
            # No delimiter at all: the whole line is the author list.
            return authorPart, '', ''
        idx_end = authorPart.rfind(' - ')
        pubAuthors = authorPart[:idx_start]
        pubJournalYear = ''
        year_match = re.search(r'\d{4}', authorPart[idx_start + 3:idx_end])
        if year_match is not None:
            pubJournalYear = year_match.group(0)
        pubJournalURL = authorPart[idx_end + 3:]
        # If no year sits between the delimiters (e.g. only one ' - ' is
        # found) and the end bit contains four digits, then the last bit is
        # the journal year instead of the journal URL.
        if pubJournalYear == '' and \
                re.search(r'\d\d\d\d', pubJournalURL) is not None:
            pubJournalYear = pubJournalURL
            pubJournalURL = ''
        return pubAuthors, pubJournalYear, pubJournalURL

    def _extract_abstract(self, record):
        """Return the abstract snippet of *record* as an HTML fragment.

        The snippet runs from the first "..." text node inside the record up
        to the next ``<b>...</b>`` sibling and is closed with ``<b>...</b>``.
        An empty string is returned when the record contains no "..." (for
        example when the whole abstract fits in the space provided).
        """
        # Search inside this record only, so each publication gets its own
        # snippet rather than the first one on the page.
        marker = record.firstText("...")
        if marker is None or str(marker) == 'Null':
            return ""
        pubAbstract = ""
        delimiter = marker.parent
        # 'Null' is kept for parser versions that signal "no sibling" with a
        # Null object instead of None.
        while (delimiter is not None and str(delimiter) != 'Null' and
               (str(delimiter) != '<b>...</b>' or pubAbstract == "")):
            pubAbstract += str(delimiter)
            delimiter = delimiter.nextSibling
        pubAbstract += '<b>...</b>'
        return pubAbstract
def format_first_author(authors):
    """Return the first author of *authors* formatted as ``"Last, First"``.

    *authors* is a ', '-separated author string such as
    ``"J Smith, A Jones"``.  The first entry is split on single spaces; its
    second token is used as the surname and its first token as the given
    name or initials.  An entry with fewer than two tokens (including an
    empty string) is returned unchanged instead of raising IndexError.
    """
    first_entry = authors.split(', ')[0]
    tokens = first_entry.split(' ')
    if len(tokens) < 2:
        return first_entry
    return tokens[1] + ", " + tokens[0]


if __name__ == '__main__':
    import sys

    # Output path: first command-line argument, or a default file in the
    # working directory (an empty path cannot be opened).
    if len(sys.argv) > 1:
        out_path = sys.argv[1]
    else:
        out_path = 'google_scholar_results.tsv'

    search = GoogleScholarSearch()
    pubs = search.search(["breast cancer", "gene"], 20)
    fout = open(out_path, 'w+')
    try:
        for pub in pubs:
            firstauthor = format_first_author(pub['Authors'])
            print(pub['Title'])
            print(pub['Authors'])
            print(pub['JournalYear'])
            print(pub['Terms'])
            # One tab-separated line per publication: author, year, title.
            fout.write(firstauthor + '\t' + pub['JournalYear'] + '\t' +
                       pub['Title'] + '\n')
            print("======================================")
    finally:
        # Close the file even if a record cannot be written.
        fout.close()