From b88c6819b29773440cedba8dddec254795ab5764 Mon Sep 17 00:00:00 2001
From: Kylie
Date: Thu, 23 May 2024 12:56:27 -0600
Subject: [PATCH 01/15] Create dataclass

---
 html_tag_collector/DataClassTags.py | 7 +++++++
 html_tag_collector/collector.py | 2 ++
 2 files changed, 9 insertions(+)
 create mode 100644 html_tag_collector/DataClassTags.py

diff --git a/html_tag_collector/DataClassTags.py b/html_tag_collector/DataClassTags.py
new file mode 100644
index 0000000..9d71ea6
--- /dev/null
+++ b/html_tag_collector/DataClassTags.py
@@ -0,0 +1,7 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class Tags:
+    html_title: str
+

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index 61f03bf..788a0e9 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -20,6 +20,8 @@
 from RootURLCache import RootURLCache
 from common import get_user_agent
 
+from DataClassTags import Tags
+
 # Define the list of header tags we want to extract
 header_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]

From 625147546254cc4b6d6a604da496903ce628ac44 Mon Sep 17 00:00:00 2001
From: Kylie
Date: Fri, 24 May 2024 23:48:17 -0600
Subject: [PATCH 02/15] Add dataclass parameters

---
 html_tag_collector/DataClassTags.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/html_tag_collector/DataClassTags.py b/html_tag_collector/DataClassTags.py
index 9d71ea6..d24cefc 100644
--- a/html_tag_collector/DataClassTags.py
+++ b/html_tag_collector/DataClassTags.py
@@ -3,5 +3,17 @@
 
 @dataclass
 class Tags:
-    html_title: str
-
+    index: int = None
+    url: str = ""
+    url_path: str = ""
+    html_title: str = ""
+    meta_description: str = ""
+    root_page_title: str = ""
+    http_response: int = -1
+    h1: str = ""
+    h2: str = ""
+    h3: str = ""
+    h4: str = ""
+    h5: str = ""
+    h6: str = ""
+    div_text: str = ""

From 3c5f6461c868d780d00d3dd790408a7da45ecd96 Mon Sep 17 00:00:00 2001
From: Kylie
Date: Fri, 24 May 2024 23:48:41 -0600
Subject: [PATCH 03/15] Create get_url function

---
 html_tag_collector/collector.py | 51 +++++++++++++++++++++++++++------
 1 file changed, 42 insertions(+), 9 deletions(-)

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index 788a0e9..c0fa5c6 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -233,41 +233,48 @@ def parse_response(url_response):
     """Parses relevant HTML tags from a Response object into a dictionary.
 
     Args:
-        url_response (list[dict]): List of dictionaries containing urls and theeir responses.
+        url_response (list[dict]): List of dictionaries containing urls and their responses.
 
     Returns:
-        list[dict]: List of dictionaries containing urls and relevant HTML tags.
+        Tags: DataClass containing the url and relevant HTML tags.
""" remove_excess_whitespace = lambda s: " ".join(s.split()).strip() tags = {} + tags_test = Tags() res = url_response["response"] - tags["index"] = url_response["index"] + #tags["index"] = url_response["index"] + tags_test.index = url_response["index"] # Drop hostname from urls to reduce training bias - url = url_response["url"][0] + '''url = url_response["url"][0] tags["url"] = url if not url.startswith("http"): url = "https://" + url - tags["url_path"] = urlparse(url).path[1:] + tags["url_path"] = urlparse(url).path[1:]''' + tags_test = get_url(tags_test, url_response) + print(tags_test) tags["html_title"] = "" tags["meta_description"] = "" - tags["root_page_title"] = remove_excess_whitespace(root_url_cache.get_title(tags["url"])) + #tags["root_page_title"] = remove_excess_whitespace(root_url_cache.get_title(tags["url"])) + # The response is None if there was an error during connection, meaning there is no content to read if res is None: tags["http_response"] = -1 return tags + # If the connection did not return a 300 code, we can assume there is no relevant content to read tags["http_response"] = res.status_code if not res.ok: return tags + # Attempt to read the content-type, set the parser accordingly to avoid warning messages try: content_type = res.headers["content-type"] except KeyError: return tags - + # If content type does not contain "html" or "xml" then we can assume that the content is unreadable if "html" in content_type: parser = "lxml" elif "xml" in content_type: @@ -296,6 +303,7 @@ def parse_response(url_response): # Retreives and drops headers containing links to reduce training bias header_content = [header.get_text(" ", strip=True) for header in headers if not header.a] tags[header_tag] = json.dumps(header_content, ensure_ascii=False) + #setattr(tags_test, header_tag, "Test") # Extract max 500 words of text from HTML
's div_text = "" @@ -309,9 +317,9 @@ def parse_response(url_response): else: break # Stop adding text if word limit is reached - # truncate to 5000 characters in case of run-on 'words' + # Truncate to 5000 characters in case of run-on 'words' tags["div_text"] = div_text[:MAX_WORDS * 10] - + # Prevents most bs4 memory leaks if soup.html: soup.html.decompose() @@ -319,6 +327,31 @@ def parse_response(url_response): return tags +def get_url(tags, url_response): + """Updates the Tags dataclass with the url and url_path + + Args: + tags (Tags): DataClass for relevant HTML tags. + url_response (list[dict]): List of dictionaries containing urls and their responses. + + Returns: + Tags: DataClass with updated url and url_path. + """ + url = url_response["url"][0] + tags.url = url + if not url.startswith("http"): + url = "https://" + url + + # Drop hostname from urls to reduce training bias + url_path = urlparse(url).path[1:] + # Remove trailing backslash + if url_path[-1] == "/": + url_path = url_path[:-1] + tags.url_path = url_path + + return tags + + def collector_main(df, render_javascript=False): context = multiprocessing.get_context("spawn") manager = context.Manager() From 7d1d5ebb1c798ddd0c7345d2b3b0051efa30d4c1 Mon Sep 17 00:00:00 2001 From: Kylie Date: Sat, 25 May 2024 00:17:35 -0600 Subject: [PATCH 04/15] Add get_html_title function --- html_tag_collector/collector.py | 45 +++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py index c0fa5c6..368bd47 100644 --- a/html_tag_collector/collector.py +++ b/html_tag_collector/collector.py @@ -238,7 +238,7 @@ def parse_response(url_response): Returns: Tags: DataClass containing the url and relevant HTML tags. """ - remove_excess_whitespace = lambda s: " ".join(s.split()).strip() + #remove_excess_whitespace = lambda s: " ".join(s.split()).strip() tags = {} tags_test = Tags() @@ -253,19 +253,19 @@ def parse_response(url_response): url = "https://" + url tags["url_path"] = urlparse(url).path[1:]''' tags_test = get_url(tags_test, url_response) - print(tags_test) - tags["html_title"] = "" - tags["meta_description"] = "" + #tags["html_title"] = "" + #tags["meta_description"] = "" #tags["root_page_title"] = remove_excess_whitespace(root_url_cache.get_title(tags["url"])) # The response is None if there was an error during connection, meaning there is no content to read if res is None: - tags["http_response"] = -1 + #tags["http_response"] = -1 return tags # If the connection did not return a 300 code, we can assume there is no relevant content to read - tags["http_response"] = res.status_code + #tags["http_response"] = res.status_code + tags_test.http_response = res.status_code if not res.ok: return tags @@ -287,10 +287,11 @@ def parse_response(url_response): except (bs4.builder.ParserRejectedMarkup, AssertionError, AttributeError): return tags - if soup.title is not None and soup.title.string is not None: + '''if soup.title is not None and soup.title.string is not None: tags["html_title"] = remove_excess_whitespace(soup.title.string) else: - tags["html_title"] = "" + tags["html_title"] = ""''' + tags_test = get_html_title(tags_test, soup) meta_tag = soup.find("meta", attrs={"name": "description"}) try: @@ -352,6 +353,34 @@ def get_url(tags, url_response): return tags +def get_html_title(tags, soup): + """Updates the Tags dataclass with the html_title + + Args: + tags (Tags): DataClass for relevant HTML tags. 
+ soup (BeautifulSoup): BeautifulSoup object to pull the HTML title from. + + Returns: + Tags: DataClass with updated html_title. + """ + if soup.title is not None and soup.title.string is not None: + tags.html_title = remove_excess_whitespace(soup.title.string) + + return tags + + +def remove_excess_whitespace(s): + """Removes leading, trailing, and excess adjacent whitespace. + + Args: + s (str): String to remove whitespace from. + + Returns: + str: Clean string with excess whitespace stripped. + """ + return " ".join(s.split()).strip() + + def collector_main(df, render_javascript=False): context = multiprocessing.get_context("spawn") manager = context.Manager() From f1bd4df6a2fcc2342ec179cf8b48b13d0c639c85 Mon Sep 17 00:00:00 2001 From: Kylie Date: Sat, 25 May 2024 00:24:48 -0600 Subject: [PATCH 05/15] Add get_meta_description function --- html_tag_collector/collector.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py index 368bd47..b52bf09 100644 --- a/html_tag_collector/collector.py +++ b/html_tag_collector/collector.py @@ -293,11 +293,13 @@ def parse_response(url_response): tags["html_title"] = ""''' tags_test = get_html_title(tags_test, soup) - meta_tag = soup.find("meta", attrs={"name": "description"}) + '''meta_tag = soup.find("meta", attrs={"name": "description"}) try: tags["meta_description"] = remove_excess_whitespace(meta_tag["content"]) if meta_tag is not None else "" except KeyError: - tags["meta_description"] = "" + tags["meta_description"] = ""''' + tags_test = get_meta_description(tags_test, soup) + print(tags_test) for header_tag in header_tags: headers = soup.find_all(header_tag) @@ -329,7 +331,7 @@ def parse_response(url_response): def get_url(tags, url_response): - """Updates the Tags dataclass with the url and url_path + """Updates the Tags dataclass with the url and url_path. Args: tags (Tags): DataClass for relevant HTML tags. @@ -354,7 +356,7 @@ def get_url(tags, url_response): def get_html_title(tags, soup): - """Updates the Tags dataclass with the html_title + """Updates the Tags dataclass with the html_title. Args: tags (Tags): DataClass for relevant HTML tags. @@ -369,6 +371,25 @@ def get_html_title(tags, soup): return tags +def get_meta_description(tags, soup): + """Updates the Tags dataclass with the meta_description. + + Args: + tags (Tags): DataClass for relevant HTML tags. + soup (BeautifulSoup): BeautifulSoup object to pull the meta description from. + + Returns: + Tags: DataClass with updated meta_description. + """ + meta_tag = soup.find("meta", attrs={"name": "description"}) + try: + tags.meta_description = remove_excess_whitespace(meta_tag["content"]) if meta_tag is not None else "" + except KeyError: + return + + return tags + + def remove_excess_whitespace(s): """Removes leading, trailing, and excess adjacent whitespace. 
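The helpers introduced up to this point all follow the same shape: take the Tags dataclass and a BeautifulSoup object, write one cleaned-up field, and return the dataclass. A minimal sketch of that pattern in isolation, not taken from the patches themselves (assuming bs4 and lxml are installed; the sample HTML is made up):

    from dataclasses import dataclass
    from bs4 import BeautifulSoup

    @dataclass
    class Tags:
        meta_description: str = ""

    def get_meta_description(tags, soup):
        meta_tag = soup.find("meta", attrs={"name": "description"})
        if meta_tag is not None and meta_tag.get("content"):
            # Collapse runs of whitespace, like remove_excess_whitespace does
            tags.meta_description = " ".join(meta_tag["content"].split()).strip()
        return tags

    soup = BeautifulSoup('<meta name="description" content=" City  police ">', "lxml")
    print(get_meta_description(Tags(), soup))  # Tags(meta_description='City police')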
From 3b74673a4199ed40466d3d6b57115a73428f5b11 Mon Sep 17 00:00:00 2001
From: Kylie
Date: Sat, 25 May 2024 15:38:24 -0600
Subject: [PATCH 06/15] Add get_header_tags function

---
 html_tag_collector/collector.py | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index b52bf09..ed53834 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -299,14 +299,13 @@ def parse_response(url_response):
         tags["meta_description"] = ""'''
     tags_test = get_meta_description(tags_test, soup)
-    print(tags_test)
 
-    for header_tag in header_tags:
+    '''for header_tag in header_tags:
         headers = soup.find_all(header_tag)
         # Retreives and drops headers containing links to reduce training bias
         header_content = [header.get_text(" ", strip=True) for header in headers if not header.a]
-        tags[header_tag] = json.dumps(header_content, ensure_ascii=False)
+        tags[header_tag] = json.dumps(header_content, ensure_ascii=False)'''
+    tags_test = get_header_tags(tags_test, soup)
 
     # Extract max 500 words of text from HTML <div>'s
     div_text = ""
@@ -331,7 +330,7 @@ def get_url(tags, url_response):
 
 def get_url(tags, url_response):
-    """Updates the Tags dataclass with the url and url_path.
+    """Updates the Tags DataClass with the url and url_path.
 
     Args:
         tags (Tags): DataClass for relevant HTML tags.
@@ -356,7 +355,7 @@ def get_html_title(tags, soup):
 
 def get_html_title(tags, soup):
-    """Updates the Tags dataclass with the html_title.
+    """Updates the Tags DataClass with the html_title.
 
     Args:
         tags (Tags): DataClass for relevant HTML tags.
@@ -371,7 +370,7 @@ def get_meta_description(tags, soup):
 
 def get_meta_description(tags, soup):
-    """Updates the Tags dataclass with the meta_description.
+    """Updates the Tags DataClass with the meta_description.
 
     Args:
         tags (Tags): DataClass for relevant HTML tags.
@@ -390,6 +389,26 @@ def get_meta_description(tags, soup):
     return tags
 
 
+def get_header_tags(tags, soup):
+    """Updates the Tags DataClass with the header tags.
+
+    Args:
+        tags (Tags): DataClass for relevant HTML tags.
+        soup (BeautifulSoup): BeautifulSoup object to pull the header tags from.
+
+    Returns:
+        Tags: DataClass with updated header tags.
+    """
+    for header_tag in header_tags:
+        headers = soup.find_all(header_tag)
+        # Retreives and drops headers containing links to reduce training bias
+        header_content = [header.get_text(" ", strip=True) for header in headers if not header.a]
+        tag_content = json.dumps(header_content, ensure_ascii=False)
+        setattr(tags, header_tag, tag_content)
+
+    return tags
+
+
 def remove_excess_whitespace(s):
     """Removes leading, trailing, and excess adjacent whitespace.

From bb21b18247a5f7f56386799af603a44c1e9f2ac4 Mon Sep 17 00:00:00 2001
From: Kylie
Date: Sat, 25 May 2024 15:52:21 -0600
Subject: [PATCH 07/15] Add get_div_text function

---
 html_tag_collector/collector.py | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index ed53834..8c84f90 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -308,7 +308,7 @@ def parse_response(url_response):
     tags_test = get_header_tags(tags_test, soup)
 
     # Extract max 500 words of text from HTML <div>'s
-    div_text = ""
+    '''div_text = ""
     MAX_WORDS = 500
     for div in soup.find_all("div"):
         text = div.get_text(" ", strip=True)
         if text:
@@ -320,8 +320,10 @@ def parse_response(url_response):
         else:
             break  # Stop adding text if word limit is reached
 
     # Truncate to 5000 characters in case of run-on 'words'
-    tags["div_text"] = div_text[:MAX_WORDS * 10]
-
+    tags["div_text"] = div_text[:MAX_WORDS * 10]'''
+    tags_test = get_div_text(tags_test, soup)
+    print(tags_test)
+
     # Prevents most bs4 memory leaks
     if soup.html:
         soup.html.decompose()
@@ -409,6 +411,34 @@ def get_header_tags(tags, soup):
     return tags
 
 
+def get_div_text(tags, soup):
+    """Updates the Tags DataClass with the div_text.
+
+    Args:
+        tags (Tags): DataClass for relevant HTML tags.
+        soup (BeautifulSoup): BeautifulSoup object to pull the div text from.
+
+    Returns:
+        Tags: DataClass with updated div_text.
+    """
+    # Extract max 500 words of text from HTML <div>'s
+    div_text = ""
+    MAX_WORDS = 500
+    for div in soup.find_all("div"):
+        text = div.get_text(" ", strip=True)
+        if text:
+            # Check if adding the current text exceeds the word limit
+            if len(div_text.split()) + len(text.split()) <= MAX_WORDS:
+                div_text += text + " "
+            else:
+                break  # Stop adding text if word limit is reached
+
+    # Truncate to 5000 characters in case of run-on 'words'
+    tags.div_text = div_text[:MAX_WORDS * 10]
+
+    return tags
+
+
 def remove_excess_whitespace(s):
     """Removes leading, trailing, and excess adjacent whitespace.

From 22c00e241e09daecf13d93661a4e5029f2dd3ef5 Mon Sep 17 00:00:00 2001
From: Kylie
Date: Sat, 25 May 2024 16:37:31 -0600
Subject: [PATCH 08/15] Add verify_response function

---
 html_tag_collector/collector.py | 64 ++++++++++++++++++++++-----------
 1 file changed, 44 insertions(+), 20 deletions(-)

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index 8c84f90..cc85003 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -1,3 +1,4 @@
+from dataclasses import asdict
 import json
 import ssl
 import urllib3
@@ -240,11 +241,11 @@ def parse_response(url_response):
     """
     #remove_excess_whitespace = lambda s: " ".join(s.split()).strip()
 
-    tags = {}
-    tags_test = Tags()
+    #tags = {}
+    tags = Tags()
     res = url_response["response"]
     #tags["index"] = url_response["index"]
-    tags_test.index = url_response["index"]
+    tags.index = url_response["index"]
 
     # Drop hostname from urls to reduce training bias
@@ -253,63 +254,61 @@ def parse_response(url_response):
         url = "https://" + url
     tags["url_path"] = urlparse(url).path[1:]'''
-    tags_test = get_url(tags_test, url_response)
+    tags = get_url(tags, url_response)
 
     #tags["html_title"] = ""
     #tags["meta_description"] = ""
     #tags["root_page_title"] = remove_excess_whitespace(root_url_cache.get_title(tags["url"]))
 
-    # The response is None if there was an error during connection, meaning there is no content to read
+    '''# The response is None if there was an error during connection, meaning there is no content to read
     if res is None:
-        #tags["http_response"] = -1
-        return tags
+        return asdict(tags)
 
     # If the connection did not return a 300 code, we can assume there is no relevant content to read
-    #tags["http_response"] = res.status_code
-    tags_test.http_response = res.status_code
+    tags.http_response = res.status_code
     if not res.ok:
-        return tags
+        return asdict(tags)'''
+    verified, tags = verify_response(tags, res)
+    if not verified:
+        return asdict(tags)
 
     # Attempt to read the content-type, set the parser accordingly to avoid warning messages
     try:
         content_type = res.headers["content-type"]
     except KeyError:
-        return tags
+        return asdict(tags)
     # If content type does not contain "html" or "xml" then we can assume that the content is unreadable
     if "html" in content_type:
         parser = "lxml"
     elif "xml" in content_type:
         parser = "lxml-xml"
     else:
-        return tags
+        return asdict(tags)
 
     try:
         soup = BeautifulSoup(res.html.html, parser)
     except (bs4.builder.ParserRejectedMarkup, AssertionError, AttributeError):
-        return tags
+        return asdict(tags)
 
     '''if soup.title is not None and soup.title.string is not None:
         tags["html_title"] = remove_excess_whitespace(soup.title.string)
     else:
         tags["html_title"] = ""'''
-    tags_test = get_html_title(tags_test, soup)
+    tags = get_html_title(tags, soup)
 
     '''meta_tag = soup.find("meta", attrs={"name": "description"})
     try:
         tags["meta_description"] = remove_excess_whitespace(meta_tag["content"]) if meta_tag is not None else ""
     except KeyError:
         tags["meta_description"] = ""'''
-    tags_test = get_meta_description(tags_test, soup)
+    tags = get_meta_description(tags, soup)
 
     '''for header_tag in header_tags:
         headers = soup.find_all(header_tag)
         # Retreives and drops headers containing links to reduce training bias
         header_content = [header.get_text(" ", strip=True) for header in headers if not header.a]
         tags[header_tag] = json.dumps(header_content, ensure_ascii=False)'''
-    tags_test = get_header_tags(tags_test, soup)
+    tags = get_header_tags(tags, soup)
 
     # Extract max 500 words of text from HTML <div>'s
     '''div_text = ""
     MAX_WORDS = 500
     for div in soup.find_all("div"):
         text = div.get_text(" ", strip=True)
         if text:
             # Check if adding the current text exceeds the word limit
             if len(div_text.split()) + len(text.split()) <= MAX_WORDS:
                 div_text += text + " "
             else:
                 break  # Stop adding text if word limit is reached
 
     # Truncate to 5000 characters in case of run-on 'words'
     tags["div_text"] = div_text[:MAX_WORDS * 10]'''
-    tags_test = get_div_text(tags_test, soup)
-    print(tags_test)
+    tags = get_div_text(tags, soup)
 
     # Prevents most bs4 memory leaks
     if soup.html:
         soup.html.decompose()
 
-    return tags
+    return asdict(tags)
@@ -349,12 +350,35 @@ def get_url(tags, url_response):
     # Drop hostname from urls to reduce training bias
     url_path = urlparse(url).path[1:]
     # Remove trailing backslash
-    if url_path[-1] == "/":
+    if url_path and url_path[-1] == "/":
         url_path = url_path[:-1]
     tags.url_path = url_path
 
     return tags
 
+
+def verify_response(tags, res):
+    """Verifies the webpage response is readable and ok.
+
+    Args:
+        tags (Tags): DataClass for relevant HTML tags.
+        res (HTMLResponse|Response): Response object to verify.
+
+    Returns:
+        bool: False if verification fails, True otherwise.
+        Tags: Dataclass for relevant HTML tags.
+    """
+    print(type(res))
+    # The response is None if there was an error during connection, meaning there is no content to read
+    if res is None:
+        return False, tags
+
+    # If the connection did not return a 300 code, we can assume there is no relevant content to read
+    tags.http_response = res.status_code
+    if not res.ok:
+        return False, tags
+
+    return True, tags
+
 def get_html_title(tags, soup):
     """Updates the Tags DataClass with the html_title.

From c4b3ce5f6dddc221fb698d0122ed04327a2791c9 Mon Sep 17 00:00:00 2001
From: Kylie
Date: Sat, 25 May 2024 16:52:35 -0600
Subject: [PATCH 09/15] Add get_parser function

---
 html_tag_collector/collector.py | 32 ++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index cc85003..679b14a 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -272,7 +272,7 @@ def parse_response(url_response):
         return asdict(tags)
 
     # Attempt to read the content-type, set the parser accordingly to avoid warning messages
-    try:
+    '''try:
         content_type = res.headers["content-type"]
     except KeyError:
         return asdict(tags)
@@ -282,6 +282,9 @@ def parse_response(url_response):
     elif "xml" in content_type:
         parser = "lxml-xml"
     else:
+        return asdict(tags)'''
+    parser = get_parser(res)
+    if not parser:
         return asdict(tags)
 
     try:
@@ -370,7 +373,6 @@ def verify_response(tags, res):
         bool: False if verification fails, True otherwise.
         Tags: Dataclass for relevant HTML tags.
     """
-    print(type(res))
     # The response is None if there was an error during connection, meaning there is no content to read
     if res is None:
         return False, tags
@@ -380,6 +382,32 @@ def verify_response(tags, res):
     return True, tags
 
 
+def get_parser(res):
+    """Retrieves the parser type to use with BeautifulSoup.
+
+    Args:
+        res (HTMLResponse|Response): Response object to read the content-type from.
+
+    Returns:
+        str|bool: A string of the parser to use, or False if not readable.
+    """
+    # Attempt to read the content-type, set the parser accordingly to avoid warning messages
+    try:
+        content_type = res.headers["content-type"]
+    except KeyError:
+        return False
+
+    # If content type does not contain "html" or "xml" then we can assume that the content is unreadable
+    if "html" in content_type:
+        parser = "lxml"
+    elif "xml" in content_type:
+        parser = "lxml-xml"
+    else:
+        return False
+
+    return parser
+
+
 def get_html_title(tags, soup):
     """Updates the Tags DataClass with the html_title.
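The content-type dispatch that get_parser factors out can be checked in isolation. A small sketch with hypothetical header values (get_parser_for is an illustrative stand-in name, not the patch's function; note the "html" test runs first, so a type like "application/xhtml+xml" still selects the plain lxml parser):

    def get_parser_for(content_type):
        # Mirror get_parser's dispatch: "html" -> lxml, "xml" -> lxml-xml, else unreadable
        if "html" in content_type:
            return "lxml"
        elif "xml" in content_type:
            return "lxml-xml"
        return False

    for value in ("text/html; charset=utf-8", "application/xml", "application/pdf"):
        print(value, "->", get_parser_for(value))
    # text/html; charset=utf-8 -> lxml
    # application/xml -> lxml-xml
    # application/pdf -> False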
From 927cbb7f040cd63abfecf89e07f1bce3c4abf75e Mon Sep 17 00:00:00 2001
From: Kylie
Date: Tue, 28 May 2024 12:28:21 -0600
Subject: [PATCH 10/15] Restructure the functions

---
 html_tag_collector/collector.py | 91 ++++++++++++++++-----------------
 1 file changed, 43 insertions(+), 48 deletions(-)

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index 679b14a..e103d4f 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -237,7 +237,7 @@ def parse_response(url_response):
         url_response (list[dict]): List of dictionaries containing urls and their responses.
 
     Returns:
-        Tags: DataClass containing the url and relevant HTML tags.
+        dict: Dictionary containing the url and relevant HTML tags.
     """
     #remove_excess_whitespace = lambda s: " ".join(s.split()).strip()
 
@@ -253,12 +253,13 @@ def parse_response(url_response):
         url = "https://" + url
     tags["url_path"] = urlparse(url).path[1:]'''
-    tags = get_url(tags, url_response)
+    tags.url, tags.url_path = get_url(url_response)
 
     #tags["html_title"] = ""
     #tags["meta_description"] = ""
     #tags["root_page_title"] = remove_excess_whitespace(root_url_cache.get_title(tags["url"]))
+    tags.root_page_title = remove_excess_whitespace(root_url_cache.get_title(tags.url))
     '''# The response is None if there was an error during connection, meaning there is no content to read
     if res is None:
         return asdict(tags)
@@ -267,8 +268,8 @@ def parse_response(url_response):
     tags.http_response = res.status_code
     if not res.ok:
         return asdict(tags)'''
-    verified, tags = verify_response(tags, res)
-    if not verified:
+    verified, tags.http_response = verify_response(res)
+    if verified is False:
         return asdict(tags)
 
     # Attempt to read the content-type, set the parser accordingly to avoid warning messages
@@ -284,7 +285,7 @@ def parse_response(url_response):
     else:
         return asdict(tags)'''
     parser = get_parser(res)
-    if not parser:
+    if parser is False:
         return asdict(tags)
 
     try:
@@ -296,14 +297,14 @@ def parse_response(url_response):
         tags["html_title"] = remove_excess_whitespace(soup.title.string)
     else:
         tags["html_title"] = ""'''
-    tags = get_html_title(tags, soup)
+    tags.html_title = get_html_title(soup)
 
     '''meta_tag = soup.find("meta", attrs={"name": "description"})
     try:
         tags["meta_description"] = remove_excess_whitespace(meta_tag["content"]) if meta_tag is not None else ""
     except KeyError:
         tags["meta_description"] = ""'''
-    tags = get_meta_description(tags, soup)
+    tags.meta_description = get_meta_description(soup)
 
     '''for header_tag in header_tags:
         headers = soup.find_all(header_tag)
@@ -326,7 +327,7 @@ def parse_response(url_response):
 
     # Truncate to 5000 characters in case of run-on 'words'
     tags["div_text"] = div_text[:MAX_WORDS * 10]'''
-    tags = get_div_text(tags, soup)
+    tags.div_text = get_div_text(soup)
 
     # Prevents most bs4 memory leaks
     if soup.html:
@@ -335,30 +336,28 @@ def parse_response(url_response):
     return asdict(tags)
 
 
-def get_url(tags, url_response):
-    """Updates the Tags DataClass with the url and url_path.
+def get_url(url_response):
+    """Returns the url and url_path.
 
     Args:
-        tags (Tags): DataClass for relevant HTML tags.
         url_response (list[dict]): List of dictionaries containing urls and their responses.
 
     Returns:
-        Tags: DataClass with updated url and url_path.
+        (str, str): Tuple with the url and url_path.
     """
     url = url_response["url"][0]
-    tags.url = url
-    if not url.startswith("http"):
-        url = "https://" + url
+    new_url = url
+    if not new_url.startswith("http"):
+        new_url = "https://" + new_url
 
     # Drop hostname from urls to reduce training bias
-    url_path = urlparse(url).path[1:]
+    url_path = urlparse(new_url).path[1:]
     # Remove trailing backslash
     if url_path and url_path[-1] == "/":
         url_path = url_path[:-1]
-    tags.url_path = url_path
 
-    return tags
+    return url, url_path
@@ -367,22 +366,20 @@ def verify_response(tags, res):
-def verify_response(tags, res):
+def verify_response(res):
     """Verifies the webpage response is readable and ok.
 
     Args:
-        tags (Tags): DataClass for relevant HTML tags.
         res (HTMLResponse|Response): Response object to verify.
 
     Returns:
-        bool: False if verification fails, True otherwise.
-        Tags: Dataclass for relevant HTML tags.
+        (bool, int): A tuple containing False if verification fails, True otherwise and the http response code
     """
     # The response is None if there was an error during connection, meaning there is no content to read
     if res is None:
-        return False, tags
+        return False, -1
 
     # If the connection did not return a 300 code, we can assume there is no relevant content to read
-    tags.http_response = res.status_code
+    http_response = res.status_code
     if not res.ok:
-        return False, tags
+        return False, http_response
 
-    return True, tags
+    return True, http_response
@@ -390,19 +388,20 @@ def get_html_title(tags, soup):
-def get_html_title(tags, soup):
-    """Updates the Tags DataClass with the html_title.
+def get_html_title(soup):
+    """Retrieves the HTML title from a BeautifulSoup object.
 
     Args:
-        tags (Tags): DataClass for relevant HTML tags.
         soup (BeautifulSoup): BeautifulSoup object to pull the HTML title from.
 
     Returns:
-        Tags: DataClass with updated html_title.
+        str: The HTML title.
     """
+    html_title = ""
+
     if soup.title is not None and soup.title.string is not None:
-        tags.html_title = remove_excess_whitespace(soup.title.string)
+        html_title = remove_excess_whitespace(soup.title.string)
 
-    return tags
+    return html_title
@@ -411,18 +410,18 @@ def get_meta_description(tags, soup):
-def get_meta_description(tags, soup):
-    """Updates the Tags DataClass with the meta_description.
+def get_meta_description(soup):
+    """Retrieves the meta description from a BeautifulSoup object.
 
     Args:
-        tags (Tags): DataClass for relevant HTML tags.
         soup (BeautifulSoup): BeautifulSoup object to pull the meta description from.
 
     Returns:
-        Tags: DataClass with updated meta_description.
+        str: The meta description.
     """
     meta_tag = soup.find("meta", attrs={"name": "description"})
     try:
-        tags.meta_description = remove_excess_whitespace(meta_tag["content"]) if meta_tag is not None else ""
+        meta_description = remove_excess_whitespace(meta_tag["content"]) if meta_tag is not None else ""
     except KeyError:
-        return
+        return ""
 
-    return tags
+    return meta_description
@@ -433,7 +432,7 @@ def get_header_tags(tags, soup):
     """
     for header_tag in header_tags:
         headers = soup.find_all(header_tag)
-        # Retreives and drops headers containing links to reduce training bias
+        # Retrieves and drops headers containing links to reduce training bias
         header_content = [header.get_text(" ", strip=True) for header in headers if not header.a]
         tag_content = json.dumps(header_content, ensure_ascii=False)
         setattr(tags, header_tag, tag_content)
@@ -446,28 +445,28 @@ def get_div_text(tags, soup):
     return tags
 
 
-def get_div_text(tags, soup):
-    """Updates the Tags DataClass with the div_text.
+def get_div_text(soup):
+    """Retrieves the div text from a BeautifulSoup object.
 
     Args:
-        tags (Tags): DataClass for relevant HTML tags.
         soup (BeautifulSoup): BeautifulSoup object to pull the div text from.
 
     Returns:
-        Tags: DataClass with updated div_text.
+        str: The div text.
     """
     # Extract max 500 words of text from HTML <div>'s
     div_text = ""
     MAX_WORDS = 500
     for div in soup.find_all("div"):
         text = div.get_text(" ", strip=True)
         if text:
             # Check if adding the current text exceeds the word limit
             if len(div_text.split()) + len(text.split()) <= MAX_WORDS:
                 div_text += text + " "
             else:
                 break  # Stop adding text if word limit is reached
 
     # Truncate to 5000 characters in case of run-on 'words'
-    tags.div_text = div_text[:MAX_WORDS * 10]
+    div_text = div_text[:MAX_WORDS * 10]
 
-    return tags
+    return div_text

From 0c693c3df276c0476599b6cc5347858853c96d33 Mon Sep 17 00:00:00 2001
From: Kylie
Date: Tue, 28 May 2024 15:22:08 -0600
Subject: [PATCH 11/15] Cleanup comments

---
 html_tag_collector/collector.py | 60 ---------------------------------
 1 file changed, 60 deletions(-)

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index e103d4f..f489e91 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -239,90 +239,30 @@ def parse_response(url_response):
     Returns:
         dict: Dictionary containing the url and relevant HTML tags.
     """
-    #remove_excess_whitespace = lambda s: " ".join(s.split()).strip()
-
-    #tags = {}
     tags = Tags()
     res = url_response["response"]
-    #tags["index"] = url_response["index"]
     tags.index = url_response["index"]
 
-    # Drop hostname from urls to reduce training bias
-    '''url = url_response["url"][0]
-    tags["url"] = url
-    if not url.startswith("http"):
-        url = "https://" + url
-    tags["url_path"] = urlparse(url).path[1:]'''
     tags.url, tags.url_path = get_url(url_response)
 
-    #tags["html_title"] = ""
-    #tags["meta_description"] = ""
-    #tags["root_page_title"] = remove_excess_whitespace(root_url_cache.get_title(tags["url"]))
     tags.root_page_title = remove_excess_whitespace(root_url_cache.get_title(tags.url))
 
-    '''# The response is None if there was an error during connection, meaning there is no content to read
-    if res is None:
-        return asdict(tags)
-
-    # If the connection did not return a 300 code, we can assume there is no relevant content to read
-    tags.http_response = res.status_code
-    if not res.ok:
-        return asdict(tags)'''
     verified, tags.http_response = verify_response(res)
     if verified is False:
         return asdict(tags)
 
-    # Attempt to read the content-type, set the parser accordingly to avoid warning messages
-    '''try:
-        content_type = res.headers["content-type"]
-    except KeyError:
-        return asdict(tags)
-    # If content type does not contain "html" or "xml" then we can assume that the content is unreadable
-    if "html" in content_type:
-        parser = "lxml"
-    elif "xml" in content_type:
-        parser = "lxml-xml"
-    else:
-        return asdict(tags)'''
     parser = get_parser(res)
     if parser is False:
         return asdict(tags)
 
     try:
         soup = BeautifulSoup(res.html.html, parser)
     except (bs4.builder.ParserRejectedMarkup, AssertionError, AttributeError):
         return asdict(tags)
 
-    '''if soup.title is not None and soup.title.string is not None:
-        tags["html_title"] = remove_excess_whitespace(soup.title.string)
-    else:
-        tags["html_title"] = ""'''
     tags.html_title = get_html_title(soup)
 
-    '''meta_tag = soup.find("meta", attrs={"name": "description"})
-    try:
-        tags["meta_description"] = remove_excess_whitespace(meta_tag["content"]) if meta_tag is not None else ""
-    except KeyError:
-        tags["meta_description"] = ""'''
     tags.meta_description = get_meta_description(soup)
 
-    '''for header_tag in header_tags:
-        headers = soup.find_all(header_tag)
-        # Retreives and drops headers containing links to reduce training bias
-        header_content = [header.get_text(" ", strip=True) for header in headers if not header.a]
-        tags[header_tag] = json.dumps(header_content, ensure_ascii=False)'''
     tags = get_header_tags(tags, soup)
 
-    # Extract max 500 words of text from HTML <div>'s
-    '''div_text = ""
-    MAX_WORDS = 500
-    for div in soup.find_all("div"):
-        text = div.get_text(" ", strip=True)
-        if text:
-            # Check if adding the current text exceeds the word limit
-            if len(div_text.split()) + len(text.split()) <= MAX_WORDS:
-                div_text += text + " "
-            else:
-                break  # Stop adding text if word limit is reached
-
-    # Truncate to 5000 characters in case of run-on 'words'
-    tags["div_text"] = div_text[:MAX_WORDS * 10]'''
     tags.div_text = get_div_text(soup)
 
     # Prevents most bs4 memory leaks
     if soup.html:
         soup.html.decompose()

From 12ad80a2e333f1f2f1ebac1bb14f67e17010362e Mon Sep 17 00:00:00 2001
From: Kylie
Date: Tue, 28 May 2024 15:23:18 -0600
Subject: [PATCH 12/15] Format with black

---
 html_tag_collector/collector.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index f489e91..18d4457 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -169,12 +169,15 @@ async def get_response(session, url, index):
     # or the response is an unreadable content type
     # or the response code from the website is not in the 200s
     if (
-        response is not None and len(response.content) > 10000000
-        or content_type is not None and any(
+        response is not None
+        and len(response.content) > 10000000
+        or content_type is not None
+        and any(
             filtered_type in content_type
             for filtered_type in ["pdf", "excel", "msword", "image", "rtf", "zip", "octet", "csv", "json"]
         )
-        or response is not None and not response.ok
+        or response is not None
+        and not response.ok
     ):
         # Discard the response content to prevent out of memory errors
         if DEBUG:
@@ -297,6 +300,7 @@ def get_url(url_response):
 
     return url, url_path
 
+
 def verify_response(res):
     """Verifies the webpage response is readable and ok.
@@ -326,7 +330,7 @@ def get_parser(res):
 
     Returns:
         str|bool: A string of the parser to use, or False if not readable.
-    """ 
+    """
     # Attempt to read the content-type, set the parser accordingly to avoid warning messages
     try:
         content_type = res.headers["content-type"]
@@ -340,7 +344,7 @@ def get_parser(res):
         parser = "lxml-xml"
     else:
         return False
-    
+
     return parser
@@ -357,7 +361,7 @@ def get_html_title(soup):
 
     if soup.title is not None and soup.title.string is not None:
         html_title = remove_excess_whitespace(soup.title.string)
-    
+
     return html_title
@@ -369,13 +373,13 @@ def get_meta_description(soup):
 
     Returns:
         str: The meta description.
-    """ 
+    """
     meta_tag = soup.find("meta", attrs={"name": "description"})
     try:
         meta_description = remove_excess_whitespace(meta_tag["content"]) if meta_tag is not None else ""
     except KeyError:
         return ""
-    
+
     return meta_description
@@ -388,7 +392,7 @@ def get_header_tags(tags, soup):
 
     Returns:
         Tags: DataClass with updated header tags.
-    """ 
+    """
     for header_tag in header_tags:
         headers = soup.find_all(header_tag)
         # Retrieves and drops headers containing links to reduce training bias
@@ -407,7 +411,7 @@ def get_div_text(soup):
 
     Returns:
         str: The div text.
-    """ 
+    """
     # Extract max 500 words of text from HTML <div>'s
     div_text = ""
     MAX_WORDS = 500
@@ -421,7 +425,7 @@ def get_div_text(soup):
             break  # Stop adding text if word limit is reached
 
     # Truncate to 5000 characters in case of run-on 'words'
-    div_text = div_text[:MAX_WORDS * 10]
+    div_text = div_text[: MAX_WORDS * 10]
 
     return div_text
@@ -434,7 +438,7 @@ def remove_excess_whitespace(s):
 
     Returns:
         str: Clean string with excess whitespace stripped.
-    """ 
+    """
     return " ".join(s.split()).strip()

From 9afb1ee4e60b9ad3e9ebfd1516a9a226f992f508 Mon Sep 17 00:00:00 2001
From: Kylie
Date: Wed, 29 May 2024 10:57:29 -0600
Subject: [PATCH 13/15] Add check_response

---
 html_tag_collector/collector.py | 58 +++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 21 deletions(-)

diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py
index 18d4457..72b9c93 100644
--- a/html_tag_collector/collector.py
+++ b/html_tag_collector/collector.py
@@ -165,30 +165,46 @@ async def get_response(session, url, index):
     except (KeyError, AttributeError):
         pass
 
-    # If the response size is greater than 10 MB
-    # or the response is an unreadable content type
-    # or the response code from the website is not in the 200s
-    if (
-        response is not None
-        and len(response.content) > 10000000
-        or content_type is not None
-        and any(
-            filtered_type in content_type
-            for filtered_type in ["pdf", "excel", "msword", "image", "rtf", "zip", "octet", "csv", "json"]
-        )
-        or response is not None
-        and not response.ok
-    ):
-        # Discard the response content to prevent out of memory errors
-        if DEBUG:
-            print("Large or unreadable content discarded:", len(response.content), url)
-        new_response = requests.Response()
-        new_response.status_code = response.status_code
-        response = new_response
+    response = check_response(response, content_type, url)
 
     return {"index": index, "response": response}
 
 
+def check_response(response, content_type, url):
+    """Checks the response to see if content is too large, unreadable, or invalid response code. The response is discarded if it is invalid.
+
+    Args:
+        response (Response): Response object to check.
+        content_type (str): The content type returned by the website.
+        url (str): URL that was requested.
+
+    Returns:
+        Response: The response object is returned either unmodified or discarded.
+    """
+    # If the response size is greater than 10 MB
+    # or the response is an unreadable content type
+    # or the response code from the website is not in the 200s
+    if (
+        response is not None
+        and len(response.content) > 10000000
+        or content_type is not None
+        and any(
+            filtered_type in content_type
+            for filtered_type in ["pdf", "excel", "msword", "image", "rtf", "zip", "octet", "csv", "json"]
+        )
+        or response is not None
+        and not response.ok
+    ):
+        # Discard the response content to prevent out of memory errors
+        if DEBUG:
+            print("Large or unreadable content discarded:", len(response.content), url)
+        new_response = requests.Response()
+        new_response.status_code = response.status_code
+        response = new_response
+
+    return response
+
+
 async def render_js(urls_responses):
     """Renders JavaScript from a list of urls.
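check_response keeps only the status code when it discards a response. A rough sketch of that behavior with a stand-in object (FakeResponse is hypothetical, standing in for requests.Response, so the sketch runs without a network dependency):

    class FakeResponse:
        def __init__(self, status_code, content=b""):
            self.status_code = status_code
            self.content = content

    response = FakeResponse(200, b"x" * 10_000_001)
    if response is not None and len(response.content) > 10000000:
        # Mirror the discard: a fresh response carrying only the status code
        response = FakeResponse(response.status_code)
    print(response.status_code, len(response.content))  # 200 0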
@@ -314,7 +330,7 @@ def verify_response(res): if res is None: return False, -1 - # If the connection did not return a 300 code, we can assume there is no relevant content to read + # If the connection did not return a 200 code, we can assume there is no relevant content to read http_response = res.status_code if not res.ok: return False, http_response From 0137a6f3be0792851bd15c80b9055f9f383acc75 Mon Sep 17 00:00:00 2001 From: Kylie Date: Wed, 29 May 2024 11:33:32 -0600 Subject: [PATCH 14/15] Convert function return to namedtuple --- html_tag_collector/collector.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py index 72b9c93..08fa660 100644 --- a/html_tag_collector/collector.py +++ b/html_tag_collector/collector.py @@ -1,4 +1,5 @@ from dataclasses import asdict +from collections import namedtuple import json import ssl import urllib3 @@ -165,12 +166,12 @@ async def get_response(session, url, index): except (KeyError, AttributeError): pass - response = check_response(response, content_type, url) + response = response_valid(response, content_type, url) return {"index": index, "response": response} -def check_response(response, content_type, url): +def response_valid(response, content_type, url): """Checks the response to see if content is too large, unreadable, or invalid response code. The response is discarded if it is invalid. Args: @@ -324,18 +325,19 @@ def verify_response(res): res (HTMLResponse|Response): Response object to verify. Returns: - (bool, int): A tuple containing False if verification fails, True otherwise and the http response code + VerifiedResponse(bool, int): A named tuple containing False if verification fails, True otherwise and the http response code. """ + VerifiedResponse = namedtuple("VerifiedResponse", "verified http_response") # The response is None if there was an error during connection, meaning there is no content to read if res is None: - return False, -1 + return VerifiedResponse(False, -1) # If the connection did not return a 200 code, we can assume there is no relevant content to read http_response = res.status_code if not res.ok: - return False, http_response - - return True, http_response + return VerifiedResponse(False, http_response) + + return VerifiedResponse(True, http_response) def get_parser(res): From e40e47276173eb2ee14acf775ceaa9b04c120c55 Mon Sep 17 00:00:00 2001 From: Kylie Date: Wed, 29 May 2024 11:42:18 -0600 Subject: [PATCH 15/15] Add docstring --- html_tag_collector/collector.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/html_tag_collector/collector.py b/html_tag_collector/collector.py index 08fa660..26767dc 100644 --- a/html_tag_collector/collector.py +++ b/html_tag_collector/collector.py @@ -1,3 +1,15 @@ +""" The tag collector is used to collect HTML tags and other relevant data from websites that is useful for training prediction models. + Information being collected includes: + - The URL's path + - HTML title + - Meta description + - The root page's HTML title + - HTTP response code + - Contents of H1-H6 header tags + - Contents of div tags +""" + + from dataclasses import asdict from collections import namedtuple import json @@ -336,7 +348,7 @@ def verify_response(res): http_response = res.status_code if not res.ok: return VerifiedResponse(False, http_response) - + return VerifiedResponse(True, http_response)