# page_class.py
# Helpers that scrape ad listing pages and extract battery capacity (Ah), price, date and city.
import unittest
from unittest.mock import MagicMock
from unittest.mock import Mock
from lxml import etree
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from urllib.request import urlopen
import os
#from ad_class import ad_class
def _find_element_by_xpath(driver, xpath):
    """Return the element at ``xpath`` using whichever lookup API ``driver`` offers."""
    legacy_lookup = getattr(driver, "find_element_by_xpath", None)
    if legacy_lookup is not None:
        return legacy_lookup(xpath)
    # Selenium 4 removed the find_element_by_* helpers; "xpath" is the string
    # value of By.XPATH, so no extra import is needed here.
    return driver.find_element("xpath", xpath)


def get_pictures_lvl2(driver, url_ad):
    """Collect the ``src`` URLs of the pictures shown on a detailed ad page.

    The page is loaded in ``driver`` because the image links are produced by
    javascript and are not present in the raw HTML.

    Args:
        driver: a Selenium-like web driver exposing ``get`` and either
            ``find_element_by_xpath`` or ``find_element``.
        url_ad: URL of the detailed ad page.

    Returns:
        A list of image URLs: every gallery image when the ad has a gallery,
        otherwise the single image of the ad, otherwise an empty list.
    """
    gallery_xpath = "/html/body/section[1]/main/section/section/section/section/section[1]/div/ul/li[{}]/span/img"
    single_xpath = "/html/body/section[1]/main/section/section/section/section/div/span/img"

    driver.get(url_ad)
    urls_img = []

    # Ads with more than one image: walk the gallery items until one is missing.
    index = 1
    while True:
        try:
            elem = _find_element_by_xpath(driver, gallery_xpath.format(index))
            urls_img.append(elem.get_attribute("src"))
        except Exception:
            # Any lookup failure marks the end of the gallery; unlike a bare
            # ``except`` this lets KeyboardInterrupt/SystemExit propagate.
            break
        index += 1

    # Ads with exactly one image use a different layout.
    if not urls_img:
        try:
            elem = _find_element_by_xpath(driver, single_xpath)
            urls_img.append(elem.get_attribute("src"))
        except Exception:
            # The ad has no image at all.
            pass
    return urls_img
def get_infos_lvl2(driver, url_ad):
    """Read the description and picture URLs of a detailed ad page.

    Args:
        driver: web driver handed to ``get_pictures_lvl2`` to resolve the
            javascript-generated image links.
        url_ad: location of the ad page. Anything containing ``http`` is
            downloaded with ``urlopen``; anything else is passed straight to
            ``etree.parse``.

    Returns:
        A ``(desc, urls_img)`` tuple: the description text (each text
        fragment prefixed with one space) and the list of image URLs.

    Raises:
        IndexError: if neither known page layout contains a description.
    """
    htmlparser = etree.HTMLParser()
    if 'http' not in url_ad:
        tree_ad = etree.parse(url_ad, htmlparser)
    else:
        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(url_ad) as page_ad:
            tree_ad = etree.parse(page_ad, htmlparser)

    # The description lives in the second section when the ad has a gallery;
    # ads with a single image only have one section.
    desc_xpaths = (
        "/html/body/section[1]/main/section/section/section/section/section[2]/div[6]/p[2]",
        "/html/body/section[1]/main/section/section/section/section/section/div[6]/p[2]",
    )
    desc_elmt = []
    for xpath_desc in desc_xpaths:
        desc_elmt = tree_ad.xpath(xpath_desc)
        if desc_elmt:
            break
    if not desc_elmt:
        raise IndexError("no description found in " + str(url_ad))

    desc = "".join(" " + txt for txt in desc_elmt[0].itertext())

    # The images are protected: a javascript code creates the links, so there
    # is no direct access to the URLs and selenium has to be used.
    urls_img = get_pictures_lvl2(driver, url_ad)
    return desc, urls_img
def process_nb(ah):
    """Normalize a number written with ``.``/``,`` separators for ``float()``.

    Thousands separators are removed and the decimal separator becomes ``.``.

    Args:
        ah: the number as text, e.g. ``"4.400"``, ``"2,5"`` or ``"1.000,5"``.

    Returns:
        The normalized text. It is not guaranteed to be parseable: malformed
        input such as ``"12,7.2"`` is only passed through the legacy rules.
    """
    last_dot = ah.rfind(".")
    last_comma = ah.rfind(",")

    if last_dot != -1 and last_comma != -1:
        # Both separators are present: the right-most one is the decimal
        # separator, the other one groups thousands ("1.000,5" / "1,000.5").
        if last_dot > last_comma:
            decimal_sep, thousands_sep = ".", ","
        else:
            decimal_sep, thousands_sep = ",", "."
        integer_part, _, fraction = ah.rpartition(decimal_sep)
        groups = integer_part.split(thousands_sep)
        well_formed = groups[0].isdigit() and all(
            len(group) == 3 and group.isdigit() for group in groups[1:]
        )
        if well_formed:
            return "".join(groups) + "." + fraction
        # Otherwise fall through to the single-separator heuristic below.

    # A single kind of separator: it is treated as a thousands separator when
    # the last group has three characters and contains a zero.
    for separator in (".", ","):
        last_group = ah.split(separator)[-1]
        if '0' in last_group and len(last_group) == 3:
            ah = ah.replace(separator, "")

    # Whatever comma is left is the decimal separator.
    return ah.replace(",", ".")
# Capacity patterns in priority order, each with the divisor that converts the
# captured number to Ah: "<n>mah", "<n> mah", "<n>ah", "<n> ah".
_CAPACITY_PATTERNS = (
    (re.compile(r'([0-9.,]+)mah', re.IGNORECASE), 1000),
    (re.compile(r'([0-9.,]+) mah', re.IGNORECASE), 1000),
    (re.compile(r'([0-9.,]+)ah', re.IGNORECASE), 1),
    (re.compile(r'([0-9.,]+) ah', re.IGNORECASE), 1),
)


def return_ah_from_str(string):
    """Extract a battery capacity, in Ah, from free text.

    mAh values are converted to Ah. The patterns require at least one number
    character right before the unit, so words that merely contain "mah" or
    "ah" (e.g. "Yamaha") no longer hide a real capacity further in the text.

    Args:
        string: the text to scan (an ad title or description); may be
            ``None`` or empty.

    Returns:
        The capacity as a ``float`` in Ah, or ``None`` when no parseable
        capacity is found.
    """
    if not string:
        return None
    for pattern, divisor in _CAPACITY_PATTERNS:
        for match in pattern.finditer(string):
            try:
                return float(process_nb(match.group(1))) / divisor
            except ValueError:
                # Not a number after all (e.g. only separators): keep looking.
                continue
    return None
def return_ah_from_img(url_img):
    """Placeholder for reading a battery capacity from an ad image.

    Image analysis is not implemented: ``url_img`` is ignored and ``None`` is
    always returned.
    """
    return None
def get_infos_lvl1(tree_ads, i, lvl2):
    """Extract the information of the ``i``-th ad of a listing page.

    Args:
        tree_ads: parsed listing page (an lxml tree).
        i: 1-based position of the ad in the listing.
        lvl2: when true and the capacity cannot be found in the listing, the
            detailed ad page is fetched as well. This path reads the
            module-level ``driver``, which must have been set beforehand.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``title``, ``ah``,
        ``price``, ``url``, ``time``, ``city``, ``url_img`` and ``desc``.

    Raises:
        IndexError: if the listing has no ``i``-th ad.
    """
    ad_xpath = "/html/body/section[1]/main/section/section/section/section/ul/li[" + str(i) + "]/a"

    ad_glob = tree_ads.xpath(ad_xpath)
    ad_url = "http:" + ad_glob[0].values()[0]
    ad_title = ad_glob[0].values()[1]

    # First try to find the capacity of the battery in the title.
    ah = return_ah_from_str(ad_title)
    desc = None

    ad_img = tree_ads.xpath(ad_xpath + "/div[1]/span[1]/span[1]")
    try:
        url_img_ad = "http:" + ad_img[0].values()[2]
    except IndexError:
        # The ad has no thumbnail (or the element lacks the expected attribute).
        url_img_ad = None

    if ah is None and url_img_ad is not None:
        # Then try to find the capacity in the thumbnail.
        ah = return_ah_from_img(url_img_ad)

    if ah is None and lvl2:
        # Finally look in the detailed description of the ad.
        desc, _ = get_infos_lvl2(driver, ad_url)
        ah = return_ah_from_str(desc)
        # TODO: look for the capacity in the images of the detailed page.

    ad_city_elmt = tree_ads.xpath(ad_xpath + "/section/p[2]")
    ad_city = ad_city_elmt[0].getchildren()[0].values()[1]

    ad_price_elmt = tree_ads.xpath(ad_xpath + "/section/h3")
    ad_price = int(ad_price_elmt[0].values()[2])

    ad_time_elmt = tree_ads.xpath(ad_xpath + "/section/aside/p")
    ad_time = ad_time_elmt[0].values()[2]

    results = pd.DataFrame(
        [ad_title, ah, ad_price, ad_url, ad_time, ad_city, url_img_ad, desc],
        index=['title', 'ah', 'price', 'url', 'time', 'city', 'url_img', 'desc'],
    ).transpose()
    return results
class page_class:
    """A downloaded and parsed ad listing page."""

    def __init__(self, url_request):
        """Download ``url_request`` and parse it into ``self.tree_ads``."""
        htmlparser = etree.HTMLParser()
        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(url_request) as page_ads:
            self.tree_ads = etree.parse(page_ads, htmlparser)

    def get_infos_ad(self, driver_in, i):
        """Return the information of the ``i``-th ad, using detailed pages if needed.

        ``driver_in`` is published as the module-level ``driver`` because
        ``get_infos_lvl1`` reads the web driver from there.
        """
        global driver
        driver = driver_in
        return get_infos_lvl1(self.tree_ads, i, True)
class Test_infos_lvl1(unittest.TestCase):
    """Checks the listing-level extraction against the saved ``ads_page.htm``."""

    @classmethod
    def setUpClass(cls):
        """Parse the saved listing once and gather every ad into ``cls.df``."""
        cls.prices = [100, 10, 40, 10, 20, 10, 10, 20, 15, 20, 15, 40, 35, 7, 10, 10, 50, 40]
        cls.times = ['2017-04-22', '2017-04-22', '2017-04-21', '2017-04-16', '2017-04-16', '2017-04-15', '2017-04-05', '2017-04-03', '2017-04-03', '2017-03-21', '2017-03-19', '2017-03-13', '2017-03-13', '2017-03-08', '2017-03-06', '2017-03-06', '2017-02-28', '2017-02-22']

        htmlparser = etree.HTMLParser()
        tree_ads = etree.parse("ads_page.htm", htmlparser)

        batteries = []
        i = 1
        while True:
            try:
                # get_infos_lvl1 already returns a one-row DataFrame; wrapping
                # it in another DataFrame with a new index would reindex every
                # value to NaN.
                batteries.append(get_infos_lvl1(tree_ads, i, False))
            except IndexError:
                # There is no i-th ad: the end of the listing was reached.
                break
            i += 1
        cls.df = pd.concat(batteries)

    def test_prices(self):
        """The extracted prices match the saved listing."""
        prices_found = self.df['price'].tolist()
        self.assertEqual(prices_found, self.prices)

    def test_times(self):
        """The extracted publication dates match the saved listing."""
        times_found = self.df['time'].tolist()
        self.assertEqual(times_found, self.times)
class Test_infos_lvl2_page1(unittest.TestCase):
    """Checks the detail-page extraction on ``ad_detailed_page.htm`` (one image).

    Relies on the module-level ``driver`` created by the ``__main__`` guard.
    """

    @classmethod
    def setUpClass(cls):
        """Build the ``file:///`` URL of the fixture and extract its content."""
        # Split the directory of this file into components in a way that
        # works with both "\\" (Windows) and "/" (POSIX) separators.
        directory = os.path.dirname(os.path.realpath(__file__))
        path = directory.replace(os.sep, "/").strip("/").split("/")
        path.append("ad_detailed_page.htm")
        cls.path = path
        url_ad = "file:///" + "/".join(path)
        cls.desc, cls.urls_img = get_infos_lvl2(driver, url_ad)

    def test_ah(self):
        """The capacity found in the description is 6 Ah."""
        ah = return_ah_from_str(self.desc)
        self.assertEqual(6, ah)

    def test_img(self):
        """The single picture of the ad is found."""
        url = "file:///" + "/".join(self.path[:-1]) + "/ad_detailed_page_fichiers/6c4e2409c4722404ca97885b131afa1219ee3fbe.jpeg"
        self.assertEqual(self.urls_img, [url])
class Test_infos_lvl2_page2(unittest.TestCase):
    """Checks the detail-page extraction on ``ad_detailed_page2.htm`` (gallery).

    Relies on the module-level ``driver`` created by the ``__main__`` guard
    and shuts it down once the tests of this class have run.
    """

    @classmethod
    def setUpClass(cls):
        """Build the ``file:///`` URL of the fixture and extract its content."""
        # Split the directory of this file into components in a way that
        # works with both "\\" (Windows) and "/" (POSIX) separators.
        directory = os.path.dirname(os.path.realpath(__file__))
        path = directory.replace(os.sep, "/").strip("/").split("/")
        path.append("ad_detailed_page2.htm")
        cls.path = path
        url_ad = "file:///" + "/".join(path)
        cls.desc, cls.urls_img = get_infos_lvl2(driver, url_ad)

    def test_ah(self):
        """The capacity found in the description is 4.4 Ah."""
        ah = return_ah_from_str(self.desc)
        self.assertEqual(4.4, ah)

    def test_img(self):
        """The three gallery pictures of the ad are found, in order."""
        url = ['file:///ad-thumb/bea522a8c094fc7e70b3db2b37bc6c212024ccef.jpg', 'file:///ad-thumb/bcf9377259612bf7f7c11220d7c92248221315e5.jpg', 'file:///ad-thumb/10416909729efe9c983aeaf1e9b6b7704bbb0b83.jpg']
        self.assertEqual(self.urls_img, url)

    @classmethod
    def tearDownClass(cls):
        """Close the browser window and end the web driver session."""
        driver.close()
        driver.quit()
if __name__ == "__main__":
    # The test classes read the web driver from this module-level name; an
    # assignment at module scope already creates a global, so no ``global``
    # statement is needed.
    driver = webdriver.Firefox()
    unittest.main()