-
Notifications
You must be signed in to change notification settings - Fork 0
/
web_classes.py
151 lines (129 loc) · 5.89 KB
/
web_classes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import urllib.request as rq
import urllib.parse as parse
from html.parser import HTMLParser
from pathlib import Path
import os
from helpers import *
# TODO: perhaps later the class methods should not just return a value;
# they could also store every value they obtain as attributes on the instance.
# main.py would then be updated to reflect this
# (possibly for only some of the classes).
# image_formats = ('.jpg', '.jpeg', '.png', '.gif', '.svg', '.bmp, '.webp', '.tiff', '.ico', '.heif', '.heic', '.svgz', '.ani')
# Lower-case file suffixes (leading dot included) that count as images when
# deciding whether a URL found in CSS should be collected.
# A set is used so that membership tests do not scan the whole collection.
# NOTE(review): '.bat' and '.pdf' are in the list - confirm they are intended.
image_formats = {
    ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".svg", ".tiff",
    ".eps", ".pdf", ".exif", ".pbm", ".pgm", ".ppm", ".pam", ".pfm",
    ".hdr", ".fits", ".ico", ".heif", ".bat", ".bpg", ".cgm", ".drw",
    ".dxf", ".emf", ".gerber", ".itc", ".sgl", ".odg", ".raw", ".indd",
    ".ai", ".xps", ".oxps", ".pct", ".pict", ".plt", ".wmf", ".svgz",
    ".xar", ".sxd", ".v2d", ".vnd", ".wmz", ".emz", ".ani", ".cal",
    ".cin", ".fax", ".jbig", ".jng", ".mng", ".pcx", ".pnm", ".qti",
    ".qtif", ".ras", ".tga", ".wbmp", ".xpm", ".xwd",
}
class ImageFromHTML(HTMLParser):
# the main class. Uses most others to produce result
# that will be returned once to the caller of self.feed
def __init__(self, url=''):
super().__init__()
self.url = url
self.img_srcs = set()
self.css_srcs = set()
self.css_texts = set()
self.svg_texts = set()
self.svg_found = 0
self.css_found = 0
def feed(self, data, url):
# parse the html for all types of srcs
# change later to be only url (it will get the data itself)
super().feed(data)
# get images from gotten css_srcs
# update img_srcs as you get them...
get_img_from_css = CSSParser()
for css_src in self.css_srcs:
self.img_srcs.update(
get_img_from_css.start(url, css_src)
)
# get the image srcs from the css_texts
for css_text in self.css_texts:
self.img_srcs.update(
get_img_from_css.parse_css(url, css_text)
)
return (self.img_srcs, self.svg_texts)
def handle_starttag(self, tag, attrs):
match tag:
case "img":
#print("img")
attributes = dict(attrs)
if "src" in attributes:
self.img_srcs.add(attributes["src"])
case "link":
#print("link")
attributes = dict(attrs)
#check if rel value contains logo/icon/#addmore
if ("rel" in attributes):
if "href" in attributes:
if check_some(attributes["rel"], "logo", "icon"):
self.img_srcs.add(attributes["href"])
elif attributes["rel"]=="stylesheet":
self.css_srcs.add(attributes["href"])
#do the css parsing for backgrounds later
case "svg":
# svg found. Start filling in the svg
# resetting the past one in the process
self.svg_found = 1
attrs.append(("xmlns", "http://www.w3.org/2000/svg"))
self.svg_text = f"<svg{str_attr(attrs)}>"
case "style":
self.css_found = 1
if self.svg_found and tag != "svg":
# any other children of the svg, of course.
self.svg_text += f"<{tag}{str_attr(attrs)}>"
def handle_data(self, data):
if self.svg_found:
self.svg_text += data
if self.css_found:
self.css_texts.add(data)
self.css_found = 0
def handle_endtag(self, tag):
if self.svg_found:
if tag != "svg":
self.svg_text += f"</{tag}>\n"
else:
self.svg_text += "</svg>"
self.svg_texts.add(self.svg_text)
self.svg_found = 0
class ImageGetter:
    """Fetch a single image and store it in a local file."""

    def start(self, website, src, folder):
        """Resolve *src*, pick a local file name and download the image.

        Args:
            website: page address, passed to ``split_url`` together with src.
            src: image reference as it appeared in the page or stylesheet.
            folder: passed to ``find_right_name`` along with the file name
                taken from the path component of *src*.
        """
        self.src = split_url(website, src)
        self.folder = folder
        remote_path = parse.urlparse(src).path
        self.name = find_right_name(Path(remote_path).name, folder)
        self.get_img_data()

    def get_img_data(self):
        """Read everything from ``self.src`` and write it to ``self.name``."""
        with rq.urlopen(self.src) as response, \
                open(str(self.name), 'wb') as target:
            payload = response.read()
            target.write(payload)
class SVGMaker:
    """Write inline SVG markup to numbered ``svgN.svg`` files."""

    def __init__(self):
        # Counter used to build the next candidate file name (svg1.svg, ...).
        self.name_no = 0

    def start(self, svg_text, folder):
        """Save *svg_text* to a new file.

        Args:
            svg_text: the SVG markup to write.
            folder: passed to ``find_right_name`` together with the candidate
                name ``svg<N>.svg``; the value it returns is used as the
                path that gets opened.
        """
        self.name_no += 1
        self.name = find_right_name(f"svg{self.name_no}.svg", folder)
        self.svg_text = svg_text
        # Write UTF-8 explicitly: the markup carries no XML declaration, so
        # readers assume UTF-8, while the platform default encoding may be a
        # legacy code page that cannot even encode the text.
        with open(self.name, 'w', encoding='utf-8') as svg_file:
            svg_file.write(svg_text)
class CSSParser:
    """Find the image URLs referenced through ``url(...)`` in CSS text."""

    def start(self, website, src):
        """Fetch the stylesheet *src* and return the image URLs it references.

        Args:
            website: page address, passed to ``split_url`` together with src.
            src: stylesheet reference as found in the page.

        Returns:
            The set produced by ``parse_css`` for the downloaded text.
        """
        self.website = website
        self.src = split_url(website, src)
        self.css_text = get_web_text(self.src)
        return self.parse_css(self.website, self.css_text)

    def parse_css(self, website, css_text):
        """Return the set of image URLs used in ``url(...)`` within *css_text*.

        A reference is kept when the suffix of its path (query string and
        fragment ignored, compared case-insensitively) is in
        ``image_formats``. Kept references are passed through ``split_url``
        with *website* before being added to the result.

        Args:
            website: page address forwarded to ``split_url``.
            css_text: CSS source to scan.

        Returns:
            A set with one entry per distinct image reference.
        """
        urls = set()
        # Look for the full "url(" token: a bare "url" also occurs inside
        # ordinary identifiers such as ".url-box".
        opener = "url("
        start = css_text.find(opener)
        while start != -1:
            end = css_text.find(")", start)
            if end == -1:
                # Unterminated url( - nothing more can be extracted.
                break
            # Cut the opener off by position. Stripping the characters
            # "u", "r", "l" would also eat the start or end of the address
            # itself (e.g. "resources/a.png" -> "esources/a.png").
            target = css_text[start + len(opener):end]
            target = target.strip().strip("'\"").strip()
            # Only the path part decides whether this is an image.
            bare_path = target.split("?", 1)[0].split("#", 1)[0]
            suffix = Path(bare_path).suffix.lower()
            if suffix and suffix in image_formats:
                urls.add(split_url(website, target))
            start = css_text.find(opener, end)
        return urls