forked from marmelo/python-htmlparser
-
Notifications
You must be signed in to change notification settings - Fork 4
/
examples.py
214 lines (188 loc) · 6.87 KB
/
examples.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#!/usr/bin/env python
"""
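Usage examples for the HTMLement parser.

The parsed result is an ElementTree element. For more information on
that API and its XPath support, see: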
@see https://docs.python.org/3/library/xml.etree.elementtree.html#xml.etree.ElementTree.Element
@see https://docs.python.org/3/library/xml.etree.elementtree.html#xpath-support
"""
from __future__ import print_function, unicode_literals
from htmlement import HTMLement
def example_simple():
    """
    Parse a small html document, then print the page title followed by
    the link text and target url of every anchor found in it.

    >>> example_simple()
    Parsing: GitHub
    GitHub => https://github.com/willforde
    GitHub Project => https://github.com/willforde/python-htmlement
    """
    # Sample document used by this example
    markup = """
<html>
<head>
<title>GitHub</title>
</head>
<body>
<a href="https://github.com/willforde">GitHub</a>
<a href="https://github.com/willforde/python-htmlement">GitHub Project</a>
</body>
</html>
"""

    # Feed the whole document to a parser created without any filter
    # arguments, then close it to obtain the root element
    html_parser = HTMLement()
    html_parser.feed(markup)
    tree = html_parser.close()

    # The root is an xml.etree.Element, so the ElementTree API is available
    # on it (including its limited support for XPath expressions)

    # Page title
    title_element = tree.find("head/title")
    print("Parsing: {}".format(title_element.text))

    # Each anchor is reported as "<link text> => <href attribute>"
    for anchor in tree.iterfind(".//a"):
        print("{} => {}".format(anchor.text, anchor.get("href")))
def example_filter():
    """
    Parse a small html document with a tree filter and print every list
    item found inside the "menu" ul element.

    The tree filter tells the parser to only parse the elements within the
    requested section and to ignore everything else, which is useful for
    speeding up the parsing of html pages.

    >>> example_filter()
    Menu Items
    - Coffee
    - Tea
    - Milk
    """
    # Sample document used by this example
    markup = """
<html>
<head>
<title>Coffee shop</title>
</head>
<body>
<ul class="menu">
<li>Coffee</li>
<li>Tea</li>
<li>Milk</li>
</ul>
<ul class="extras">
<li>Sugar</li>
<li>Cream</li>
</ul>
</body>
</html>
"""

    # Restrict parsing to the ul element whose class attribute is "menu"
    menu_parser = HTMLement("ul", attrs={"class": "menu"})
    menu_parser.feed(markup)
    menu = menu_parser.close()

    # The root should now be the 'ul' xml.etree.Element with all its child
    # elements available. The title cannot be fetched here, since all
    # elements outside the filter were ignored.
    print("Menu Items")

    # Collect the text of each list item, then print one line per item
    labels = [entry.text for entry in menu.iterfind(".//li")]
    for label in labels:
        print("- {}".format(label))
def example_complex():
    """
    Parse a more complex html document listing python talks and print the
    image, url, title and date of each talk.

    A filter is used so that only the main "talks" div element is parsed.

    >>> example_complex()
    Image = /presentations/c7f1fbb5d03a409d9de8abb5238d6a68/thumb_slide_0.jpg
    Url = /pycon2016/alex-martelli-exception-and-error-handling-in-python-2-and-python-3
    Title = Alex Martelli - Exception and error handling in Python 2 and Python 3
    Date = Jun 1, 2016
    <BLANKLINE>
    Image = /presentations/eef8ffe5b6784f7cb84948cf866b2608/thumb_slide_0.jpg
    Url = /presentations/518cae54da12460e895163d809e25933/thumb_slide_0.jpg
    Title = Jake Vanderplas - Statistics for Hackers
    Date = May 29, 2016
    <BLANKLINE>
    Image = /presentations/8b3ee51b5fcc4a238c4cb4b7787979ac/thumb_slide_0.jpg
    Url = /pycon2016/brett-slatkin-refactoring-python-why-and-how-to-restructure-your-code
    Title = Brett Slatkin - Refactoring Python: Why and how to restructure your code
    Date = May 29, 2016
    <BLANKLINE>
    """
    # Sample document used by this example
    markup = """
<html>
<head>
<title>PyCon 2016</title>
</head>
<body>
<div class="main">
<h1>Talks by PyCon 2016</h1>
<div class="talks" id="d5esfbb5d03adfdfede8a342238d6a68">
<div class="talk" data-id="c7f1fbb5d03a409d9de8abb5238d6a68">
<a href="/pycon2016/kelsey-gilmore-innis-seriously-strong-security-on-a-shoestring">
<img src="/presentations/c7f1fbb5d03a409d9de8abb5238d6a68/thumb_slide_0.jpg">
</a>
<div class="talk-listing-meta">
<h3 class="title">
<a href="/pycon2016/alex-martelli-exception-and-error-handling-in-python-2-and-python-3">
Alex Martelli - Exception and error handling in Python 2 and Python 3
</a>
</h3>
<p class="date">Jun 1, 2016</p>
</div>
</div>
<div class="talk" data-id="518cae54da12460e895163d809e25933">
<a href="/pycon2016/manuel-ebert-putting-1-million-new-words-into-the-dictionary">
<img src="/presentations/eef8ffe5b6784f7cb84948cf866b2608/thumb_slide_0.jpg">
</a>
<div class="talk-listing-meta">
<h3 class="title">
<a href="/presentations/518cae54da12460e895163d809e25933/thumb_slide_0.jpg">
Jake Vanderplas - Statistics for Hackers
</a>
</h3>
<p class="date">May 29, 2016</p>
</div>
</div>
<div class="talk" data-id="8b3ee51b5fcc4a238c4cb4b7787979ac">
<a href="/pycon2016/brett-slatkin-refactoring-python-why-and-how-to-restructure-your-code">
<img src="/presentations/8b3ee51b5fcc4a238c4cb4b7787979ac/thumb_slide_0.jpg">
</a>
<div class="talk-listing-meta">
<h3 class="title">
<a href="/pycon2016/brett-slatkin-refactoring-python-why-and-how-to-restructure-your-code">
Brett Slatkin - Refactoring Python: Why and how to restructure your code
</a>
</h3>
<p class="date">May 29, 2016</p>
</div>
</div>
</div>
</div>
</body>
</html>
"""

    # Filter on the div with class "talks" that also carries an id attribute
    # NOTE(review): "id": True presumably accepts any id value - confirm
    # against the htmlement documentation
    talks_parser = HTMLement("div", attrs={"class": "talks", "id": True})
    talks_parser.feed(markup)
    talks = talks_parser.close()

    # Walk the direct child divs whose class is "talk"
    for talk in talks.iterfind("./div[@class='talk']"):
        # Thumbnail image source
        thumbnail = talk.find(".//img")
        print("Image = {}".format(thumbnail.get("src")))

        # The anchor inside the h3 heading holds both the url and the title
        heading_link = talk.find("./div/h3/a")
        print("Url = {}".format(heading_link.get("href")))
        print("Title = {}".format(heading_link.text))

        # Date paragraph, followed by a blank separator line
        date_element = talk.find("./div/p")
        print("Date = {}".format(date_element.text))
        print("")
if __name__ == "__main__":
    # Run every example in order, printing an empty line between
    # consecutive examples (but not before the first or after the last)
    for position, example in enumerate((example_simple, example_filter, example_complex)):
        if position:
            print("")
        example()