-
Notifications
You must be signed in to change notification settings - Fork 0
/
webscrape.py
130 lines (104 loc) · 4.88 KB
/
webscrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import logging
import os
from urllib.parse import quote, urljoin

import requests
from bs4 import BeautifulSoup
# Names of the ASL dictionary sites this module knows about; the ``source``
# argument of get_terms()/get_media() is compared against these strings.
sources = ['SIGNINGSAVVY', 'LIFEPRINT']
# Base URL of signingsavvy.com; search pages live under "<base>/search/<word>"
# and sign pages under "<base>/<url_suffix>".
SIGNINGSAVVY = 'https://www.signingsavvy.com'
# Leading and trailing pieces used to build a lifeprint.com sign page URL.
LIFEPRINT_PREFIX = 'https://lifeprint.com/asl101/pages-signs'
LIFEPRINT_SUFFIX = '.htm'
# NOTE(review): presumably the href signingsavvy has used for the variation
# link that points at the page currently being viewed -- confirm against the
# live site.
CURRENT_PAGE_LINK_SUFFIX = 'javascript:;'
def _quoted_description(text):
    """ returns the text between the first pair of double quotes in text.
    If text contains no quotes, returns the stripped text itself, or None
    when that is empty.
    """
    # the site may deliver the quotes escaped or as typographic quotes;
    # normalise them all so a single split handles every form
    for variant in ("&quot;", "\u201c", "\u201d"):
        text = text.replace(variant, '"')
    parts = text.split('"')
    if len(parts) > 1:
        return parts[1]
    return text.strip() or None


def get_terms(word, source='SIGNINGSAVVY'):
    """ if no signs exist, returns None. Otherwise, returns list of 3-tuples
    (term, url_suffix, description) for the various terms
    arguments:
    word, the string to look up in the dictionary
    source, the asl website to search. only 'SIGNINGSAVVY' is implemented;
    any other value returns None
    notes:
    description is None when the site gives no disambiguating text.
    a word with exactly one sign yields [(word, 'search/' + word, None)].
    """
    if source != 'SIGNINGSAVVY':
        return None

    # URLs always use '/', so do not build them with os.path.join; quote the
    # word so spaces and reserved characters cannot alter the request path
    search_url = '/'.join((SIGNINGSAVVY, 'search', quote(word)))
    # a timeout keeps an unresponsive server from hanging the caller forever
    r_search = requests.get(search_url, timeout=10)
    page_soup = BeautifulSoup(r_search.content, "html.parser")

    search_results_div = page_soup.find("div", class_="search_results")
    if search_results_div is None:
        if page_soup.find("div", class_="sign_module"):
            # word is unique
            return [(word, 'search/' + word, None)]
        # word does not exist
        return None

    # multiple search results
    results = []
    results_ul = search_results_div.ul
    terms_li = results_ul.find_all('li') if results_ul is not None else []
    for term in terms_li:
        link = term.a
        if link is None or not link.has_attr('href'):
            # list item without a usable link: nothing to report for it
            continue
        if term.em is not None:
            description = _quoted_description(term.em.text)
        else:
            description = None
        results.append((link.text.title(), link['href'], description))
    return results
def _video_src(page_soup):
    """ returns the src of the sign video in a parsed page, or None if the
    page has no "videocontent" div with a <source> tag
    """
    vid_div = page_soup.find("div", class_="videocontent")
    if vid_div is None or vid_div.source is None:
        return None
    # signingsavvy changed their links from relative to absolute, so the src
    # is used as-is
    return vid_div.source.get('src')


def get_media(word, url_suffix=None, source='SIGNINGSAVVY'):
    """ returns list of .mp4 links and list of labels
    arguments:
    word, the string to look up in the dictionary
    url_suffix, string with the format "sign/{TERM}/{ID}{/OPTIONAL VARIATION INDEX}" (e.g. "sign/RUN/10423/1")
    source, the asl website to search. must be from the list sources
    returns:
    the tuple (mp4s, labels); the two lists are parallel, one entry per
    variation for which a video was found. both are empty when the page has
    no variation list, when source is 'LIFEPRINT' (parsing not implemented
    yet) or when source is not recognised.
    """
    mp4s = []
    labels = []
    if source == 'SIGNINGSAVVY':
        # URLs always use '/', so build them with urljoin/join rather than
        # os.path.join
        base_url = SIGNINGSAVVY + '/'
        if url_suffix:
            search_url = urljoin(base_url, url_suffix)
        else:
            search_url = '/'.join((SIGNINGSAVVY, 'search', quote(word)))
        # a timeout keeps an unresponsive server from hanging the caller
        r_search = requests.get(search_url, timeout=10)
        page_soup = BeautifulSoup(r_search.content, "html.parser")

        # get links for all variations of the sign
        # if there is a single result, the html will contain a div tag with
        # the class "signing_header";
        # if there are multiple results (or none), that div is missing and
        # there is nothing to collect from this page
        header_div = page_soup.find("div", class_="signing_header")
        variations_ul = header_div.ul if header_div is not None else None
        if variations_ul is None:
            return mp4s, labels

        for var in variations_ul.find_all('li'):
            link = var.a
            if link is None or not link.has_attr('href'):
                continue
            link_suffix = link['href']
            # the variation being viewed is marked either by the "current"
            # class or, in the site's older markup, by a javascript href
            is_current = ('current' in link.get('class', [])
                          or link_suffix == CURRENT_PAGE_LINK_SUFFIX)
            if is_current:
                # this is the page already downloaded above; reuse it
                var_soup = page_soup
            else:
                r = requests.get(urljoin(base_url, link_suffix), timeout=10)
                var_soup = BeautifulSoup(r.content, "html.parser")

            media_url = _video_src(var_soup)
            if media_url is None:
                # no video on this variation page; skip it so that mp4s and
                # labels stay aligned
                continue
            mp4s.append(media_url)
            # TODO: the button label (e.g. "ASL 1", "finger spell", etc)
            # doesn't seem to be in the html, so the link text stored here
            # is of limited use for now
            labels.append(link.text)
    elif source == 'LIFEPRINT':
        if not word:
            # word[0] below needs at least one character
            return mp4s, labels
        # pages are laid out as "<prefix>/<first letter>/<word>.htm"; the
        # suffix is a file extension, not a path component
        search_url = '/'.join((LIFEPRINT_PREFIX, quote(word[0]),
                               quote(word))) + LIFEPRINT_SUFFIX
        r_search = requests.get(search_url, timeout=10)
        page_soup = BeautifulSoup(r_search.content, "html.parser")
        # TODO: figure out way to parse natural language on the web page and
        # label media variations; until then page_soup is unused and the
        # lists stay empty
    return mp4s, labels
if __name__ == "__main__":
    # manual smoke test: hits the live sites and prints whatever comes back
    print('testing get_media()\n-------------------')
    for query in ('avocado', 'math', 'computer science'):
        print(get_media(query))
    # look up one specific variation page directly via its url suffix
    print(get_media('run', url_suffix='sign/RUN/10423/1'))

    print('\ntesting get_terms()\n------------------------')
    print(get_terms('run'))