From b88c6819b29773440cedba8dddec254795ab5764 Mon Sep 17 00:00:00 2001
From: Kylie
Date: Thu, 23 May 2024 12:56:27 -0600
Subject: [PATCH 01/15] Create dataclass

---
 html_tag_collector/DataClassTags.py | 7 +++++++
 html_tag_collector/collector.py | 2 ++
 2 files changed, 9 insertions(+)
 create mode 100644 html_tag_collector/DataClassTags.py

diff --git a/html_tag_collector/DataClassTags.py b/html_tag_collector/DataClassTags.py
new file mode 100644
index 0000000..9d71ea6
--- /dev/null
+++ b/html_tag_collector/DataClassTags.py
@@ -0,0 +1,7 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class Tags:
+    html_title: str
+

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index 61f03bf..788a0e9 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -20,6 +20,8 @@
 from RootURLCache import RootURLCache
 from common import get_user_agent
 
+from DataClassTags import Tags
+
 # Define the list of header tags we want to extract
 header_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]

From 625147546254cc4b6d6a604da496903ce628ac44 Mon Sep 17 00:00:00 2001
From: Kylie
Date: Fri, 24 May 2024 23:48:17 -0600
Subject: [PATCH 02/15] Add dataclass parameters

---
 html_tag_collector/DataClassTags.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/html_tag_collector/DataClassTags.py b/html_tag_collector/DataClassTags.py
index 9d71ea6..d24cefc 100644
--- a/html_tag_collector/DataClassTags.py
+++ b/html_tag_collector/DataClassTags.py
@@ -3,5 +3,17 @@
 
 @dataclass
 class Tags:
-    html_title: str
-
+    index: int = None
+    url: str = ""
+    url_path: str = ""
+    html_title: str = ""
+    meta_description: str = ""
+    root_page_title: str = ""
+    http_response: int = -1
+    h1: str = ""
+    h2: str = ""
+    h3: str = ""
+    h4: str = ""
+    h5: str = ""
+    h6: str = ""
+    div_text: str = ""

From 3c5f6461c868d780d00d3dd790408a7da45ecd96 Mon Sep 17 00:00:00 2001
From: Kylie
Date: Fri, 24 May 2024 23:48:41 -0600
Subject: [PATCH 03/15] Create get_url function

---
 html_tag_collector/collector.py | 51 +++++++++++++++++++++++++++------
 1 file changed, 42 insertions(+), 9 deletions(-)

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index 788a0e9..c0fa5c6 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -233,41 +233,48 @@ def parse_response(url_response):
     """Parses relevant HTML tags from a Response object into a dictionary.
 
     Args:
-        url_response (list[dict]): List of dictionaries containing urls and theeir responses.
+        url_response (list[dict]): List of dictionaries containing urls and their responses.
 
     Returns:
-        list[dict]: List of dictionaries containing urls and relevant HTML tags.
+        Tags: DataClass containing the url and relevant HTML tags.
""" remove_excess_whitespace = lambda s: " ".join(s.split()).strip() tags = {} + tags_test = Tags() res = url_response["response"] - tags["index"] = url_response["index"] + #tags["index"] = url_response["index"] + tags_test.index = url_response["index"] # Drop hostname from urls to reduce training bias - url = url_response["url"][0] + '''url = url_response["url"][0] tags["url"] = url if not url.startswith("http"): url = "https://" + url - tags["url_path"] = urlparse(url).path[1:] + tags["url_path"] = urlparse(url).path[1:]''' + tags_test = get_url(tags_test, url_response) + print(tags_test) tags["html_title"] = "" tags["meta_description"] = "" - tags["root_page_title"] = remove_excess_whitespace(root_url_cache.get_title(tags["url"])) + #tags["root_page_title"] = remove_excess_whitespace(root_url_cache.get_title(tags["url"])) + # The response is None if there was an error during connection, meaning there is no content to read if res is None: tags["http_response"] = -1 return tags + # If the connection did not return a 300 code, we can assume there is no relevant content to read tags["http_response"] = res.status_code if not res.ok: return tags + # Attempt to read the content-type, set the parser accordingly to avoid warning messages try: content_type = res.headers["content-type"] except KeyError: return tags - + # If content type does not contain "html" or "xml" then we can assume that the content is unreadable if "html" in content_type: parser = "lxml" elif "xml" in content_type: @@ -296,6 +303,7 @@ def parse_response(url_response): # Retreives and drops headers containing links to reduce training bias header_content = [header.get_text(" ", strip=True) for header in headers if not header.a] tags[header_tag] = json.dumps(header_content, ensure_ascii=False) + #setattr(tags_test, header_tag, "Test") # Extract max 500 words of text from HTML
's div_text = "" @@ -309,9 +317,9 @@ def parse_response(url_response): else: break # Stop adding text if word limit is reached - # truncate to 5000 characters in case of run-on 'words' + # Truncate to 5000 characters in case of run-on 'words' tags["div_text"] = div_text[:MAX_WORDS * 10] - + # Prevents most bs4 memory leaks if soup.html: soup.html.decompose() @@ -319,6 +327,31 @@ def parse_response(url_response): return tags +def get_url(tags, url_response): + """Updates the Tags dataclass with the url and url_path + + Args: + tags (Tags): DataClass for relevant HTML tags. + url_response (list[dict]): List of dictionaries containing urls and their responses. + + Returns: + Tags: DataClass with updated url and url_path. + """ + url = url_response["url"][0] + tags.url = url + if not url.startswith("http"): + url = "https://" + url + + # Drop hostname from urls to reduce training bias + url_path = urlparse(url).path[1:] + # Remove trailing backslash + if url_path[-1] == "/": + url_path = url_path[:-1] + tags.url_path = url_path + + return tags + + def collector_main(df, render_javascript=False): context = multiprocessing.get_context("spawn") manager = context.Manager() From 7d1d5ebb1c798ddd0c7345d2b3b0051efa30d4c1 Mon Sep 17 00:00:00 2001 From: Kylie Date: Sat, 25 May 2024 00:17:35 -0600 Subject: [PATCH 04/15] Add get_html_title function --- html_tag_collector/collector.py | 45 +++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py index c0fa5c6..368bd47 100644 --- a/html_tag_collector/collector.py +++ b/html_tag_collector/collector.py @@ -238,7 +238,7 @@ def parse_response(url_response): Returns: Tags: DataClass containing the url and relevant HTML tags. """ - remove_excess_whitespace = lambda s: " ".join(s.split()).strip() + #remove_excess_whitespace = lambda s: " ".join(s.split()).strip() tags = {} tags_test = Tags() @@ -253,19 +253,19 @@ def parse_response(url_response): url = "https://" + url tags["url_path"] = urlparse(url).path[1:]''' tags_test = get_url(tags_test, url_response) - print(tags_test) - tags["html_title"] = "" - tags["meta_description"] = "" + #tags["html_title"] = "" + #tags["meta_description"] = "" #tags["root_page_title"] = remove_excess_whitespace(root_url_cache.get_title(tags["url"])) # The response is None if there was an error during connection, meaning there is no content to read if res is None: - tags["http_response"] = -1 + #tags["http_response"] = -1 return tags # If the connection did not return a 300 code, we can assume there is no relevant content to read - tags["http_response"] = res.status_code + #tags["http_response"] = res.status_code + tags_test.http_response = res.status_code if not res.ok: return tags @@ -287,10 +287,11 @@ def parse_response(url_response): except (bs4.builder.ParserRejectedMarkup, AssertionError, AttributeError): return tags - if soup.title is not None and soup.title.string is not None: + '''if soup.title is not None and soup.title.string is not None: tags["html_title"] = remove_excess_whitespace(soup.title.string) else: - tags["html_title"] = "" + tags["html_title"] = ""''' + tags_test = get_html_title(tags_test, soup) meta_tag = soup.find("meta", attrs={"name": "description"}) try: @@ -352,6 +353,34 @@ def get_url(tags, url_response): return tags +def get_html_title(tags, soup): + """Updates the Tags dataclass with the html_title + + Args: + tags (Tags): DataClass for relevant HTML tags. 
+ soup (BeautifulSoup): BeautifulSoup object to pull the HTML title from. + + Returns: + Tags: DataClass with updated html_title. + """ + if soup.title is not None and soup.title.string is not None: + tags.html_title = remove_excess_whitespace(soup.title.string) + + return tags + + +def remove_excess_whitespace(s): + """Removes leading, trailing, and excess adjacent whitespace. + + Args: + s (str): String to remove whitespace from. + + Returns: + str: Clean string with excess whitespace stripped. + """ + return " ".join(s.split()).strip() + + def collector_main(df, render_javascript=False): context = multiprocessing.get_context("spawn") manager = context.Manager() From f1bd4df6a2fcc2342ec179cf8b48b13d0c639c85 Mon Sep 17 00:00:00 2001 From: Kylie Date: Sat, 25 May 2024 00:24:48 -0600 Subject: [PATCH 05/15] Add get_meta_description function --- html_tag_collector/collector.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py index 368bd47..b52bf09 100644 --- a/html_tag_collector/collector.py +++ b/html_tag_collector/collector.py @@ -293,11 +293,13 @@ def parse_response(url_response): tags["html_title"] = ""''' tags_test = get_html_title(tags_test, soup) - meta_tag = soup.find("meta", attrs={"name": "description"}) + '''meta_tag = soup.find("meta", attrs={"name": "description"}) try: tags["meta_description"] = remove_excess_whitespace(meta_tag["content"]) if meta_tag is not None else "" except KeyError: - tags["meta_description"] = "" + tags["meta_description"] = ""''' + tags_test = get_meta_description(tags_test, soup) + print(tags_test) for header_tag in header_tags: headers = soup.find_all(header_tag) @@ -329,7 +331,7 @@ def parse_response(url_response): def get_url(tags, url_response): - """Updates the Tags dataclass with the url and url_path + """Updates the Tags dataclass with the url and url_path. Args: tags (Tags): DataClass for relevant HTML tags. @@ -354,7 +356,7 @@ def get_url(tags, url_response): def get_html_title(tags, soup): - """Updates the Tags dataclass with the html_title + """Updates the Tags dataclass with the html_title. Args: tags (Tags): DataClass for relevant HTML tags. @@ -369,6 +371,25 @@ def get_html_title(tags, soup): return tags +def get_meta_description(tags, soup): + """Updates the Tags dataclass with the meta_description. + + Args: + tags (Tags): DataClass for relevant HTML tags. + soup (BeautifulSoup): BeautifulSoup object to pull the meta description from. + + Returns: + Tags: DataClass with updated meta_description. + """ + meta_tag = soup.find("meta", attrs={"name": "description"}) + try: + tags.meta_description = remove_excess_whitespace(meta_tag["content"]) if meta_tag is not None else "" + except KeyError: + return + + return tags + + def remove_excess_whitespace(s): """Removes leading, trailing, and excess adjacent whitespace. 
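The helpers introduced up to this point all follow the same shape: take the Tags dataclass and a BeautifulSoup object, write one cleaned-up field, and return the dataclass. A minimal sketch of that pattern in isolation, not taken from the patches themselves (assuming bs4 and lxml are installed; the sample HTML is made up):

    from dataclasses import dataclass
    from bs4 import BeautifulSoup

    @dataclass
    class Tags:
        meta_description: str = ""

    def get_meta_description(tags, soup):
        meta_tag = soup.find("meta", attrs={"name": "description"})
        if meta_tag is not None and meta_tag.get("content"):
            # Collapse runs of whitespace, like remove_excess_whitespace does
            tags.meta_description = " ".join(meta_tag["content"].split()).strip()
        return tags

    soup = BeautifulSoup('<meta name="description" content=" City  police ">', "lxml")
    print(get_meta_description(Tags(), soup))  # Tags(meta_description='City police')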
From 3b74673a4199ed40466d3d6b57115a73428f5b11 Mon Sep 17 00:00:00 2001
From: Kylie
Date: Sat, 25 May 2024 15:38:24 -0600
Subject: [PATCH 06/15] Add get_header_tags function

---
 html_tag_collector/collector.py | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index b52bf09..ed53834 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -299,14 +299,13 @@ def parse_response(url_response):
         tags["meta_description"] = ""'''
     tags_test = get_meta_description(tags_test, soup)
-    print(tags_test)
 
-    for header_tag in header_tags:
+    '''for header_tag in header_tags:
         headers = soup.find_all(header_tag)
         # Retreives and drops headers containing links to reduce training bias
         header_content = [header.get_text(" ", strip=True) for header in headers if not header.a]
-        tags[header_tag] = json.dumps(header_content, ensure_ascii=False)
+        tags[header_tag] = json.dumps(header_content, ensure_ascii=False)'''
+    tags_test = get_header_tags(tags_test, soup)
 
     # Extract max 500 words of text from HTML <div>'s
     div_text = ""
@@ -331,7 +330,7 @@ def get_url(tags, url_response):
 
 def get_url(tags, url_response):
-    """Updates the Tags dataclass with the url and url_path.
+    """Updates the Tags DataClass with the url and url_path.
 
     Args:
         tags (Tags): DataClass for relevant HTML tags.
@@ -356,7 +355,7 @@ def get_html_title(tags, soup):
 
 def get_html_title(tags, soup):
-    """Updates the Tags dataclass with the html_title.
+    """Updates the Tags DataClass with the html_title.
 
     Args:
         tags (Tags): DataClass for relevant HTML tags.
@@ -371,7 +370,7 @@ def get_meta_description(tags, soup):
 
 def get_meta_description(tags, soup):
-    """Updates the Tags dataclass with the meta_description.
+    """Updates the Tags DataClass with the meta_description.
 
     Args:
         tags (Tags): DataClass for relevant HTML tags.
@@ -390,6 +389,26 @@ def get_meta_description(tags, soup):
     return tags
 
 
+def get_header_tags(tags, soup):
+    """Updates the Tags DataClass with the header tags.
+
+    Args:
+        tags (Tags): DataClass for relevant HTML tags.
+        soup (BeautifulSoup): BeautifulSoup object to pull the header tags from.
+
+    Returns:
+        Tags: DataClass with updated header tags.
+    """
+    for header_tag in header_tags:
+        headers = soup.find_all(header_tag)
+        # Retreives and drops headers containing links to reduce training bias
+        header_content = [header.get_text(" ", strip=True) for header in headers if not header.a]
+        tag_content = json.dumps(header_content, ensure_ascii=False)
+        setattr(tags, header_tag, tag_content)
+
+    return tags
+
+
 def remove_excess_whitespace(s):
     """Removes leading, trailing, and excess adjacent whitespace.

From bb21b18247a5f7f56386799af603a44c1e9f2ac4 Mon Sep 17 00:00:00 2001
From: Kylie
Date: Sat, 25 May 2024 15:52:21 -0600
Subject: [PATCH 07/15] Add get_div_text function

---
 html_tag_collector/collector.py | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index ed53834..8c84f90 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -308,7 +308,7 @@ def parse_response(url_response):
     tags_test = get_header_tags(tags_test, soup)
 
     # Extract max 500 words of text from HTML <div>'s
-    div_text = ""
+    '''div_text = ""
     MAX_WORDS = 500
     for div in soup.find_all("div"):
         text = div.get_text(" ", strip=True)
         if text:
@@ -320,8 +320,10 @@ def parse_response(url_response):
         else:
             break  # Stop adding text if word limit is reached
 
     # Truncate to 5000 characters in case of run-on 'words'
-    tags["div_text"] = div_text[:MAX_WORDS * 10]
-
+    tags["div_text"] = div_text[:MAX_WORDS * 10]'''
+    tags_test = get_div_text(tags_test, soup)
+    print(tags_test)
+
     # Prevents most bs4 memory leaks
     if soup.html:
         soup.html.decompose()
@@ -409,6 +411,34 @@ def get_header_tags(tags, soup):
     return tags
 
 
+def get_div_text(tags, soup):
+    """Updates the Tags DataClass with the div_text.
+
+    Args:
+        tags (Tags): DataClass for relevant HTML tags.
+        soup (BeautifulSoup): BeautifulSoup object to pull the div text from.
+
+    Returns:
+        Tags: DataClass with updated div_text.
+    """
+    # Extract max 500 words of text from HTML <div>'s
+    div_text = ""
+    MAX_WORDS = 500
+    for div in soup.find_all("div"):
+        text = div.get_text(" ", strip=True)
+        if text:
+            # Check if adding the current text exceeds the word limit
+            if len(div_text.split()) + len(text.split()) <= MAX_WORDS:
+                div_text += text + " "
+            else:
+                break  # Stop adding text if word limit is reached
+
+    # Truncate to 5000 characters in case of run-on 'words'
+    tags.div_text = div_text[:MAX_WORDS * 10]
+
+    return tags
+
+
 def remove_excess_whitespace(s):
     """Removes leading, trailing, and excess adjacent whitespace.

From 22c00e241e09daecf13d93661a4e5029f2dd3ef5 Mon Sep 17 00:00:00 2001
From: Kylie
Date: Sat, 25 May 2024 16:37:31 -0600
Subject: [PATCH 08/15] Add verify_response function

---
 html_tag_collector/collector.py | 64 ++++++++++++++++++++++-----------
 1 file changed, 44 insertions(+), 20 deletions(-)

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index 8c84f90..cc85003 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -1,3 +1,4 @@
+from dataclasses import asdict
 import json
 import ssl
 import urllib3
@@ -240,11 +241,11 @@ def parse_response(url_response):
     """
     #remove_excess_whitespace = lambda s: " ".join(s.split()).strip()
 
-    tags = {}
-    tags_test = Tags()
+    #tags = {}
+    tags = Tags()
     res = url_response["response"]
     #tags["index"] = url_response["index"]
-    tags_test.index = url_response["index"]
+    tags.index = url_response["index"]
 
     # Drop hostname from urls to reduce training bias
@@ -253,63 +254,61 @@ def parse_response(url_response):
         url = "https://" + url
     tags["url_path"] = urlparse(url).path[1:]'''
-    tags_test = get_url(tags_test, url_response)
+    tags = get_url(tags, url_response)
 
     #tags["html_title"] = ""
     #tags["meta_description"] = ""
     #tags["root_page_title"] = remove_excess_whitespace(root_url_cache.get_title(tags["url"]))
 
-    # The response is None if there was an error during connection, meaning there is no content to read
+    '''# The response is None if there was an error during connection, meaning there is no content to read
     if res is None:
-        #tags["http_response"] = -1
-        return tags
+        return asdict(tags)
 
     # If the connection did not return a 300 code, we can assume there is no relevant content to read
-    #tags["http_response"] = res.status_code
-    tags_test.http_response = res.status_code
+    tags.http_response = res.status_code
     if not res.ok:
-        return tags
+        return asdict(tags)'''
+    verified, tags = verify_response(tags, res)
+    if not verified:
+        return asdict(tags)
 
     # Attempt to read the content-type, set the parser accordingly to avoid warning messages
     try:
         content_type = res.headers["content-type"]
     except KeyError:
-        return tags
+        return asdict(tags)
     # If content type does not contain "html" or "xml" then we can assume that the content is unreadable
     if "html" in content_type:
         parser = "lxml"
     elif "xml" in content_type:
         parser = "lxml-xml"
     else:
-        return tags
+        return asdict(tags)
 
     try:
         soup = BeautifulSoup(res.html.html, parser)
     except (bs4.builder.ParserRejectedMarkup, AssertionError, AttributeError):
-        return tags
+        return asdict(tags)
 
     '''if soup.title is not None and soup.title.string is not None:
         tags["html_title"] = remove_excess_whitespace(soup.title.string)
     else:
         tags["html_title"] = ""'''
-    tags_test = get_html_title(tags_test, soup)
+    tags = get_html_title(tags, soup)
 
     '''meta_tag = soup.find("meta", attrs={"name": "description"})
     try:
         tags["meta_description"] = remove_excess_whitespace(meta_tag["content"]) if meta_tag is not None else ""
     except KeyError:
         tags["meta_description"] = ""'''
-    tags_test = get_meta_description(tags_test, soup)
+    tags = get_meta_description(tags, soup)
 
     '''for header_tag in header_tags:
         headers = soup.find_all(header_tag)
         # Retreives and drops headers containing links to reduce training bias
         header_content = [header.get_text(" ", strip=True) for header in headers if not header.a]
         tags[header_tag] = json.dumps(header_content, ensure_ascii=False)'''
-    tags_test = get_header_tags(tags_test, soup)
+    tags = get_header_tags(tags, soup)
 
     # Extract max 500 words of text from HTML <div>'s
     '''div_text = ""
     MAX_WORDS = 500
     for div in soup.find_all("div"):
         text = div.get_text(" ", strip=True)
         if text:
             # Check if adding the current text exceeds the word limit
             if len(div_text.split()) + len(text.split()) <= MAX_WORDS:
                 div_text += text + " "
             else:
                 break  # Stop adding text if word limit is reached
 
     # Truncate to 5000 characters in case of run-on 'words'
     tags["div_text"] = div_text[:MAX_WORDS * 10]'''
-    tags_test = get_div_text(tags_test, soup)
-    print(tags_test)
+    tags = get_div_text(tags, soup)
 
     # Prevents most bs4 memory leaks
     if soup.html:
         soup.html.decompose()
 
-    return tags
+    return asdict(tags)
@@ -349,12 +350,35 @@ def get_url(tags, url_response):
     # Drop hostname from urls to reduce training bias
     url_path = urlparse(url).path[1:]
     # Remove trailing backslash
-    if url_path[-1] == "/":
+    if url_path and url_path[-1] == "/":
         url_path = url_path[:-1]
     tags.url_path = url_path
 
     return tags
 
+
+def verify_response(tags, res):
+    """Verifies the webpage response is readable and ok.
+
+    Args:
+        tags (Tags): DataClass for relevant HTML tags.
+        res (HTMLResponse|Response): Response object to verify.
+
+    Returns:
+        bool: False if verification fails, True otherwise.
+        Tags: Dataclass for relevant HTML tags.
+    """
+    print(type(res))
+    # The response is None if there was an error during connection, meaning there is no content to read
+    if res is None:
+        return False, tags
+
+    # If the connection did not return a 300 code, we can assume there is no relevant content to read
+    tags.http_response = res.status_code
+    if not res.ok:
+        return False, tags
+
+    return True, tags
+
 def get_html_title(tags, soup):
     """Updates the Tags DataClass with the html_title.

From c4b3ce5f6dddc221fb698d0122ed04327a2791c9 Mon Sep 17 00:00:00 2001
From: Kylie
Date: Sat, 25 May 2024 16:52:35 -0600
Subject: [PATCH 09/15] Add get_parser function

---
 html_tag_collector/collector.py | 32 ++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index cc85003..679b14a 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -272,7 +272,7 @@ def parse_response(url_response):
         return asdict(tags)
 
     # Attempt to read the content-type, set the parser accordingly to avoid warning messages
-    try:
+    '''try:
         content_type = res.headers["content-type"]
     except KeyError:
         return asdict(tags)
@@ -282,6 +282,9 @@ def parse_response(url_response):
     elif "xml" in content_type:
         parser = "lxml-xml"
     else:
+        return asdict(tags)'''
+    parser = get_parser(res)
+    if not parser:
         return asdict(tags)
 
     try:
@@ -370,7 +373,6 @@ def verify_response(tags, res):
         bool: False if verification fails, True otherwise.
         Tags: Dataclass for relevant HTML tags.
     """
-    print(type(res))
     # The response is None if there was an error during connection, meaning there is no content to read
     if res is None:
         return False, tags
@@ -380,6 +382,32 @@ def verify_response(tags, res):
     return True, tags
 
 
+def get_parser(res):
+    """Retrieves the parser type to use with BeautifulSoup.
+
+    Args:
+        res (HTMLResponse|Response): Response object to read the content-type from.
+
+    Returns:
+        str|bool: A string of the parser to use, or False if not readable.
+    """
+    # Attempt to read the content-type, set the parser accordingly to avoid warning messages
+    try:
+        content_type = res.headers["content-type"]
+    except KeyError:
+        return False
+
+    # If content type does not contain "html" or "xml" then we can assume that the content is unreadable
+    if "html" in content_type:
+        parser = "lxml"
+    elif "xml" in content_type:
+        parser = "lxml-xml"
+    else:
+        return False
+
+    return parser
+
+
 def get_html_title(tags, soup):
     """Updates the Tags DataClass with the html_title.
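The content-type dispatch that get_parser factors out can be checked in isolation. A small sketch with hypothetical header values (get_parser_for is an illustrative stand-in name, not the patch's function; note the "html" test runs first, so a type like "application/xhtml+xml" still selects the plain lxml parser):

    def get_parser_for(content_type):
        # Mirror get_parser's dispatch: "html" -> lxml, "xml" -> lxml-xml, else unreadable
        if "html" in content_type:
            return "lxml"
        elif "xml" in content_type:
            return "lxml-xml"
        return False

    for value in ("text/html; charset=utf-8", "application/xml", "application/pdf"):
        print(value, "->", get_parser_for(value))
    # text/html; charset=utf-8 -> lxml
    # application/xml -> lxml-xml
    # application/pdf -> False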
From 927cbb7f040cd63abfecf89e07f1bce3c4abf75e Mon Sep 17 00:00:00 2001
From: Kylie
Date: Tue, 28 May 2024 12:28:21 -0600
Subject: [PATCH 10/15] Restructure the functions

---
 html_tag_collector/collector.py | 91 ++++++++++++++++-----------------
 1 file changed, 43 insertions(+), 48 deletions(-)

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index 679b14a..e103d4f 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -237,7 +237,7 @@ def parse_response(url_response):
         url_response (list[dict]): List of dictionaries containing urls and their responses.
 
     Returns:
-        Tags: DataClass containing the url and relevant HTML tags.
+        dict: Dictionary containing the url and relevant HTML tags.
     """
     #remove_excess_whitespace = lambda s: " ".join(s.split()).strip()
 
@@ -253,12 +253,13 @@ def parse_response(url_response):
         url = "https://" + url
     tags["url_path"] = urlparse(url).path[1:]'''
-    tags = get_url(tags, url_response)
+    tags.url, tags.url_path = get_url(url_response)
 
     #tags["html_title"] = ""
     #tags["meta_description"] = ""
     #tags["root_page_title"] = remove_excess_whitespace(root_url_cache.get_title(tags["url"]))
+    tags.root_page_title = remove_excess_whitespace(root_url_cache.get_title(tags.url))
     '''# The response is None if there was an error during connection, meaning there is no content to read
     if res is None:
         return asdict(tags)
@@ -267,8 +268,8 @@ def parse_response(url_response):
     tags.http_response = res.status_code
     if not res.ok:
         return asdict(tags)'''
-    verified, tags = verify_response(tags, res)
-    if not verified:
+    verified, tags.http_response = verify_response(res)
+    if verified is False:
         return asdict(tags)
 
     # Attempt to read the content-type, set the parser accordingly to avoid warning messages
@@ -284,7 +285,7 @@ def parse_response(url_response):
     else:
         return asdict(tags)'''
     parser = get_parser(res)
-    if not parser:
+    if parser is False:
         return asdict(tags)
 
     try:
@@ -296,14 +297,14 @@ def parse_response(url_response):
         tags["html_title"] = remove_excess_whitespace(soup.title.string)
     else:
         tags["html_title"] = ""'''
-    tags = get_html_title(tags, soup)
+    tags.html_title = get_html_title(soup)
 
     '''meta_tag = soup.find("meta", attrs={"name": "description"})
     try:
         tags["meta_description"] = remove_excess_whitespace(meta_tag["content"]) if meta_tag is not None else ""
     except KeyError:
         tags["meta_description"] = ""'''
-    tags = get_meta_description(tags, soup)
+    tags.meta_description = get_meta_description(soup)
 
     '''for header_tag in header_tags:
         headers = soup.find_all(header_tag)
@@ -326,7 +327,7 @@ def parse_response(url_response):
 
     # Truncate to 5000 characters in case of run-on 'words'
     tags["div_text"] = div_text[:MAX_WORDS * 10]'''
-    tags = get_div_text(tags, soup)
+    tags.div_text = get_div_text(soup)
 
     # Prevents most bs4 memory leaks
     if soup.html:
@@ -335,30 +336,28 @@ def parse_response(url_response):
     return asdict(tags)
 
 
-def get_url(tags, url_response):
-    """Updates the Tags DataClass with the url and url_path.
+def get_url(url_response):
+    """Returns the url and url_path.
 
     Args:
-        tags (Tags): DataClass for relevant HTML tags.
         url_response (list[dict]): List of dictionaries containing urls and their responses.
 
     Returns:
-        Tags: DataClass with updated url and url_path.
+        (str, str): Tuple with the url and url_path.
     """
     url = url_response["url"][0]
-    tags.url = url
-    if not url.startswith("http"):
-        url = "https://" + url
+    new_url = url
+    if not new_url.startswith("http"):
+        new_url = "https://" + new_url
 
     # Drop hostname from urls to reduce training bias
-    url_path = urlparse(url).path[1:]
+    url_path = urlparse(new_url).path[1:]
     # Remove trailing backslash
     if url_path and url_path[-1] == "/":
         url_path = url_path[:-1]
-    tags.url_path = url_path
 
-    return tags
+    return url, url_path
@@ -367,22 +366,20 @@ def verify_response(tags, res):
-def verify_response(tags, res):
+def verify_response(res):
     """Verifies the webpage response is readable and ok.
 
     Args:
-        tags (Tags): DataClass for relevant HTML tags.
         res (HTMLResponse|Response): Response object to verify.
 
     Returns:
-        bool: False if verification fails, True otherwise.
-        Tags: Dataclass for relevant HTML tags.
+        (bool, int): A tuple containing False if verification fails, True otherwise and the http response code
     """
     # The response is None if there was an error during connection, meaning there is no content to read
     if res is None:
-        return False, tags
+        return False, -1
 
     # If the connection did not return a 300 code, we can assume there is no relevant content to read
-    tags.http_response = res.status_code
+    http_response = res.status_code
     if not res.ok:
-        return False, tags
+        return False, http_response
 
-    return True, tags
+    return True, http_response
@@ -390,19 +388,20 @@ def get_html_title(tags, soup):
-def get_html_title(tags, soup):
-    """Updates the Tags DataClass with the html_title.
+def get_html_title(soup):
+    """Retrieves the HTML title from a BeautifulSoup object.
 
     Args:
-        tags (Tags): DataClass for relevant HTML tags.
         soup (BeautifulSoup): BeautifulSoup object to pull the HTML title from.
 
     Returns:
-        Tags: DataClass with updated html_title.
+        str: The HTML title.
     """
+    html_title = ""
+
     if soup.title is not None and soup.title.string is not None:
-        tags.html_title = remove_excess_whitespace(soup.title.string)
+        html_title = remove_excess_whitespace(soup.title.string)
 
-    return tags
+    return html_title
@@ -411,18 +410,18 @@ def get_meta_description(tags, soup):
-def get_meta_description(tags, soup):
-    """Updates the Tags DataClass with the meta_description.
+def get_meta_description(soup):
+    """Retrieves the meta description from a BeautifulSoup object.
 
     Args:
-        tags (Tags): DataClass for relevant HTML tags.
         soup (BeautifulSoup): BeautifulSoup object to pull the meta description from.
 
     Returns:
-        Tags: DataClass with updated meta_description.
+        str: The meta description.
     """
     meta_tag = soup.find("meta", attrs={"name": "description"})
     try:
-        tags.meta_description = remove_excess_whitespace(meta_tag["content"]) if meta_tag is not None else ""
+        meta_description = remove_excess_whitespace(meta_tag["content"]) if meta_tag is not None else ""
     except KeyError:
-        return
+        return ""
 
-    return tags
+    return meta_description
@@ -433,7 +432,7 @@ def get_header_tags(tags, soup):
     """
     for header_tag in header_tags:
         headers = soup.find_all(header_tag)
-        # Retreives and drops headers containing links to reduce training bias
+        # Retrieves and drops headers containing links to reduce training bias
         header_content = [header.get_text(" ", strip=True) for header in headers if not header.a]
         tag_content = json.dumps(header_content, ensure_ascii=False)
         setattr(tags, header_tag, tag_content)
@@ -446,28 +445,28 @@ def get_div_text(tags, soup):
     return tags
 
 
-def get_div_text(tags, soup):
-    """Updates the Tags DataClass with the div_text.
+def get_div_text(soup):
+    """Retrieves the div text from a BeautifulSoup object.
 
     Args:
-        tags (Tags): DataClass for relevant HTML tags.
         soup (BeautifulSoup): BeautifulSoup object to pull the div text from.
 
     Returns:
-        Tags: DataClass with updated div_text.
+        str: The div text.
     """
     # Extract max 500 words of text from HTML <div>'s
     div_text = ""
     MAX_WORDS = 500
     for div in soup.find_all("div"):
         text = div.get_text(" ", strip=True)
         if text:
             # Check if adding the current text exceeds the word limit
             if len(div_text.split()) + len(text.split()) <= MAX_WORDS:
                 div_text += text + " "
             else:
                 break  # Stop adding text if word limit is reached
 
     # Truncate to 5000 characters in case of run-on 'words'
-    tags.div_text = div_text[:MAX_WORDS * 10]
+    div_text = div_text[:MAX_WORDS * 10]
 
-    return tags
+    return div_text

From 0c693c3df276c0476599b6cc5347858853c96d33 Mon Sep 17 00:00:00 2001
From: Kylie
Date: Tue, 28 May 2024 15:22:08 -0600
Subject: [PATCH 11/15] Cleanup comments

---
 html_tag_collector/collector.py | 60 ---------------------------------
 1 file changed, 60 deletions(-)

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index e103d4f..f489e91 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -239,90 +239,30 @@ def parse_response(url_response):
     Returns:
         dict: Dictionary containing the url and relevant HTML tags.
     """
-    #remove_excess_whitespace = lambda s: " ".join(s.split()).strip()
-
-    #tags = {}
     tags = Tags()
     res = url_response["response"]
-    #tags["index"] = url_response["index"]
     tags.index = url_response["index"]
 
-    # Drop hostname from urls to reduce training bias
-    '''url = url_response["url"][0]
-    tags["url"] = url
-    if not url.startswith("http"):
-        url = "https://" + url
-    tags["url_path"] = urlparse(url).path[1:]'''
     tags.url, tags.url_path = get_url(url_response)
 
-    #tags["html_title"] = ""
-    #tags["meta_description"] = ""
-    #tags["root_page_title"] = remove_excess_whitespace(root_url_cache.get_title(tags["url"]))
     tags.root_page_title = remove_excess_whitespace(root_url_cache.get_title(tags.url))
 
-    '''# The response is None if there was an error during connection, meaning there is no content to read
-    if res is None:
-        return asdict(tags)
-
-    # If the connection did not return a 300 code, we can assume there is no relevant content to read
-    tags.http_response = res.status_code
-    if not res.ok:
-        return asdict(tags)'''
     verified, tags.http_response = verify_response(res)
     if verified is False:
         return asdict(tags)
 
-    # Attempt to read the content-type, set the parser accordingly to avoid warning messages
-    '''try:
-        content_type = res.headers["content-type"]
-    except KeyError:
-        return asdict(tags)
-    # If content type does not contain "html" or "xml" then we can assume that the content is unreadable
-    if "html" in content_type:
-        parser = "lxml"
-    elif "xml" in content_type:
-        parser = "lxml-xml"
-    else:
-        return asdict(tags)'''
     parser = get_parser(res)
     if parser is False:
         return asdict(tags)
 
     try:
         soup = BeautifulSoup(res.html.html, parser)
     except (bs4.builder.ParserRejectedMarkup, AssertionError, AttributeError):
         return asdict(tags)
 
-    '''if soup.title is not None and soup.title.string is not None:
-        tags["html_title"] = remove_excess_whitespace(soup.title.string)
-    else:
-        tags["html_title"] = ""'''
     tags.html_title = get_html_title(soup)
 
-    '''meta_tag = soup.find("meta", attrs={"name": "description"})
-    try:
-        tags["meta_description"] = remove_excess_whitespace(meta_tag["content"]) if meta_tag is not None else ""
-    except KeyError:
-        tags["meta_description"] = ""'''
     tags.meta_description = get_meta_description(soup)
 
-    '''for header_tag in header_tags:
-        headers = soup.find_all(header_tag)
-        # Retreives and drops headers containing links to reduce training bias
-        header_content = [header.get_text(" ", strip=True) for header in headers if not header.a]
-        tags[header_tag] = json.dumps(header_content, ensure_ascii=False)'''
     tags = get_header_tags(tags, soup)
 
-    # Extract max 500 words of text from HTML <div>'s
-    '''div_text = ""
-    MAX_WORDS = 500
-    for div in soup.find_all("div"):
-        text = div.get_text(" ", strip=True)
-        if text:
-            # Check if adding the current text exceeds the word limit
-            if len(div_text.split()) + len(text.split()) <= MAX_WORDS:
-                div_text += text + " "
-            else:
-                break  # Stop adding text if word limit is reached
-
-    # Truncate to 5000 characters in case of run-on 'words'
-    tags["div_text"] = div_text[:MAX_WORDS * 10]'''
     tags.div_text = get_div_text(soup)
 
     # Prevents most bs4 memory leaks
     if soup.html:
         soup.html.decompose()

From 12ad80a2e333f1f2f1ebac1bb14f67e17010362e Mon Sep 17 00:00:00 2001
From: Kylie
Date: Tue, 28 May 2024 15:23:18 -0600
Subject: [PATCH 12/15] Format with black

---
 html_tag_collector/collector.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index f489e91..18d4457 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -169,12 +169,15 @@ async def get_response(session, url, index):
     # or the response is an unreadable content type
     # or the response code from the website is not in the 200s
     if (
-        response is not None and len(response.content) > 10000000
-        or content_type is not None and any(
+        response is not None
+        and len(response.content) > 10000000
+        or content_type is not None
+        and any(
             filtered_type in content_type
             for filtered_type in ["pdf", "excel", "msword", "image", "rtf", "zip", "octet", "csv", "json"]
         )
-        or response is not None and not response.ok
+        or response is not None
+        and not response.ok
     ):
         # Discard the response content to prevent out of memory errors
         if DEBUG:
@@ -297,6 +300,7 @@ def get_url(url_response):
 
     return url, url_path
 
+
 def verify_response(res):
     """Verifies the webpage response is readable and ok.
@@ -326,7 +330,7 @@ def get_parser(res):
 
     Returns:
         str|bool: A string of the parser to use, or False if not readable.
-    """ 
+    """
     # Attempt to read the content-type, set the parser accordingly to avoid warning messages
     try:
         content_type = res.headers["content-type"]
@@ -340,7 +344,7 @@ def get_parser(res):
         parser = "lxml-xml"
     else:
         return False
-    
+
     return parser
@@ -357,7 +361,7 @@ def get_html_title(soup):
 
     if soup.title is not None and soup.title.string is not None:
         html_title = remove_excess_whitespace(soup.title.string)
-    
+
     return html_title
@@ -369,13 +373,13 @@ def get_meta_description(soup):
 
     Returns:
         str: The meta description.
-    """ 
+    """
     meta_tag = soup.find("meta", attrs={"name": "description"})
     try:
         meta_description = remove_excess_whitespace(meta_tag["content"]) if meta_tag is not None else ""
     except KeyError:
         return ""
-    
+
     return meta_description
@@ -388,7 +392,7 @@ def get_header_tags(tags, soup):
 
     Returns:
         Tags: DataClass with updated header tags.
-    """ 
+    """
     for header_tag in header_tags:
         headers = soup.find_all(header_tag)
         # Retrieves and drops headers containing links to reduce training bias
@@ -407,7 +411,7 @@ def get_div_text(soup):
 
     Returns:
         str: The div text.
-    """ 
+    """
     # Extract max 500 words of text from HTML <div>'s
     div_text = ""
     MAX_WORDS = 500
@@ -421,7 +425,7 @@ def get_div_text(soup):
             break  # Stop adding text if word limit is reached
 
     # Truncate to 5000 characters in case of run-on 'words'
-    div_text = div_text[:MAX_WORDS * 10]
+    div_text = div_text[: MAX_WORDS * 10]
 
     return div_text
@@ -434,7 +438,7 @@ def remove_excess_whitespace(s):
 
     Returns:
         str: Clean string with excess whitespace stripped.
-    """ 
+    """
     return " ".join(s.split()).strip()

From 9afb1ee4e60b9ad3e9ebfd1516a9a226f992f508 Mon Sep 17 00:00:00 2001
From: Kylie
Date: Wed, 29 May 2024 10:57:29 -0600
Subject: [PATCH 13/15] Add check_response

---
 html_tag_collector/collector.py | 58 +++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 21 deletions(-)

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index 18d4457..72b9c93 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -165,30 +165,46 @@ async def get_response(session, url, index):
     except (KeyError, AttributeError):
         pass
 
-    # If the response size is greater than 10 MB
-    # or the response is an unreadable content type
-    # or the response code from the website is not in the 200s
-    if (
-        response is not None
-        and len(response.content) > 10000000
-        or content_type is not None
-        and any(
-            filtered_type in content_type
-            for filtered_type in ["pdf", "excel", "msword", "image", "rtf", "zip", "octet", "csv", "json"]
-        )
-        or response is not None
-        and not response.ok
-    ):
-        # Discard the response content to prevent out of memory errors
-        if DEBUG:
-            print("Large or unreadable content discarded:", len(response.content), url)
-        new_response = requests.Response()
-        new_response.status_code = response.status_code
-        response = new_response
+    response = check_response(response, content_type, url)
 
     return {"index": index, "response": response}
 
 
+def check_response(response, content_type, url):
+    """Checks the response to see if content is too large, unreadable, or invalid response code. The response is discarded if it is invalid.
+
+    Args:
+        response (Response): Response object to check.
+        content_type (str): The content type returned by the website.
+        url (str): URL that was requested.
+
+    Returns:
+        Response: The response object is returned either unmodified or discarded.
+    """
+    # If the response size is greater than 10 MB
+    # or the response is an unreadable content type
+    # or the response code from the website is not in the 200s
+    if (
+        response is not None
+        and len(response.content) > 10000000
+        or content_type is not None
+        and any(
+            filtered_type in content_type
+            for filtered_type in ["pdf", "excel", "msword", "image", "rtf", "zip", "octet", "csv", "json"]
+        )
+        or response is not None
+        and not response.ok
+    ):
+        # Discard the response content to prevent out of memory errors
+        if DEBUG:
+            print("Large or unreadable content discarded:", len(response.content), url)
+        new_response = requests.Response()
+        new_response.status_code = response.status_code
+        response = new_response
+
+    return response
+
+
 async def render_js(urls_responses):
     """Renders JavaScript from a list of urls.
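check_response keeps only the status code when it discards a response. A rough sketch of that behavior with a stand-in object (FakeResponse is hypothetical, standing in for requests.Response, so the sketch runs without a network dependency):

    class FakeResponse:
        def __init__(self, status_code, content=b""):
            self.status_code = status_code
            self.content = content

    response = FakeResponse(200, b"x" * 10_000_001)
    if response is not None and len(response.content) > 10000000:
        # Mirror the discard: a fresh response carrying only the status code
        response = FakeResponse(response.status_code)
    print(response.status_code, len(response.content))  # 200 0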
@@ -314,7 +330,7 @@ def verify_response(res): if res is None: return False, -1 - # If the connection did not return a 300 code, we can assume there is no relevant content to read + # If the connection did not return a 200 code, we can assume there is no relevant content to read http_response = res.status_code if not res.ok: return False, http_response From 0137a6f3be0792851bd15c80b9055f9f383acc75 Mon Sep 17 00:00:00 2001 From: Kylie Date: Wed, 29 May 2024 11:33:32 -0600 Subject: [PATCH 14/15] Convert function return to namedtuple --- html_tag_collector/collector.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py index 72b9c93..08fa660 100644 --- a/html_tag_collector/collector.py +++ b/html_tag_collector/collector.py @@ -1,4 +1,5 @@ from dataclasses import asdict +from collections import namedtuple import json import ssl import urllib3 @@ -165,12 +166,12 @@ async def get_response(session, url, index): except (KeyError, AttributeError): pass - response = check_response(response, content_type, url) + response = response_valid(response, content_type, url) return {"index": index, "response": response} -def check_response(response, content_type, url): +def response_valid(response, content_type, url): """Checks the response to see if content is too large, unreadable, or invalid response code. The response is discarded if it is invalid. Args: @@ -324,18 +325,19 @@ def verify_response(res): res (HTMLResponse|Response): Response object to verify. Returns: - (bool, int): A tuple containing False if verification fails, True otherwise and the http response code + VerifiedResponse(bool, int): A named tuple containing False if verification fails, True otherwise and the http response code. """ + VerifiedResponse = namedtuple("VerifiedResponse", "verified http_response") # The response is None if there was an error during connection, meaning there is no content to read if res is None: - return False, -1 + return VerifiedResponse(False, -1) # If the connection did not return a 200 code, we can assume there is no relevant content to read http_response = res.status_code if not res.ok: - return False, http_response - - return True, http_response + return VerifiedResponse(False, http_response) + + return VerifiedResponse(True, http_response) def get_parser(res): From e40e47276173eb2ee14acf775ceaa9b04c120c55 Mon Sep 17 00:00:00 2001 From: Kylie Date: Wed, 29 May 2024 11:42:18 -0600 Subject: [PATCH 15/15] Add docstring --- html_tag_collector/collector.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py index 08fa660..26767dc 100644 --- a/html_tag_collector/collector.py +++ b/html_tag_collector/collector.py @@ -1,3 +1,15 @@ +""" The tag collector is used to collect HTML tags and other relevant data from websites that is useful for training prediction models. + Information being collected includes: + - The URL's path + - HTML title + - Meta description + - The root page's HTML title + - HTTP response code + - Contents of H1-H6 header tags + - Contents of div tags +""" + + from dataclasses import asdict from collections import namedtuple import json @@ -336,7 +348,7 @@ def verify_response(res): http_response = res.status_code if not res.ok: return VerifiedResponse(False, http_response) - + return VerifiedResponse(True, http_response)