Skip to content

Commit

Permalink
Format with black
Browse files (browse the repository at this point in the history)
  • Loading branch information
EvilDrPurple committed May 28, 2024
1 parent 0c693c3 commit 12ad80a
Showing 1 changed file with 15 additions and 11 deletions.
26 changes: 15 additions & 11 deletions html_tag_collector/collector.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -169,12 +169,15 @@ async def get_response(session, url, index):
# or the response is an unreadable content type
# or the response code from the website is not in the 200s
if (
-response is not None and len(response.content) > 10000000
-or content_type is not None and any(
+response is not None
+and len(response.content) > 10000000
+or content_type is not None
+and any(
 filtered_type in content_type
 for filtered_type in ["pdf", "excel", "msword", "image", "rtf", "zip", "octet", "csv", "json"]
 )
-or response is not None and not response.ok
+or response is not None
+and not response.ok
):
# Discard the response content to prevent out of memory errors
if DEBUG:
Expand Down Expand Up @@ -297,6 +300,7 @@ def get_url(url_response):

return url, url_path


def verify_response(res):
"""Verifies the webpage response is readable and ok.

Check warning on line 305 in html_tag_collector/collector.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] html_tag_collector/collector.py#L305 <D401>

First line should be in imperative mood
Raw output
./html_tag_collector/collector.py:305:1: D401 First line should be in imperative mood
Expand Down Expand Up @@ -326,7 +330,7 @@ def get_parser(res):
Returns:
str|bool: A string of the parser to use, or False if not readable.
"""
"""
# Attempt to read the content-type, set the parser accordingly to avoid warning messages

Check failure on line 334 in html_tag_collector/collector.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] html_tag_collector/collector.py#L334 <E501>

line too long (92 > 79 characters)
Raw output
./html_tag_collector/collector.py:334:80: E501 line too long (92 > 79 characters)
try:
content_type = res.headers["content-type"]
Expand All @@ -340,7 +344,7 @@ def get_parser(res):
parser = "lxml-xml"
else:
return False

return parser


Expand All @@ -357,7 +361,7 @@ def get_html_title(soup):

if soup.title is not None and soup.title.string is not None:
html_title = remove_excess_whitespace(soup.title.string)

return html_title


Expand All @@ -369,7 +373,7 @@ def get_meta_description(soup):
Returns:
str: The meta description.
"""
"""
meta_tag = soup.find("meta", attrs={"name": "description"})
try:
meta_description = remove_excess_whitespace(meta_tag["content"]) if meta_tag is not None else ""

Check failure on line 379 in html_tag_collector/collector.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] html_tag_collector/collector.py#L379 <E501>

line too long (104 > 79 characters)
Raw output
./html_tag_collector/collector.py:379:80: E501 line too long (104 > 79 characters)
Expand All @@ -388,7 +392,7 @@ def get_header_tags(tags, soup):
Returns:
Tags: DataClass with updated header tags.
"""
"""
for header_tag in header_tags:
headers = soup.find_all(header_tag)
# Retrieves and drops headers containing links to reduce training bias
Expand All @@ -407,7 +411,7 @@ def get_div_text(soup):
Returns:
str: The div text.
"""
"""
# Extract max 500 words of text from HTML <div>'s
div_text = ""
MAX_WORDS = 500
Expand All @@ -421,7 +425,7 @@ def get_div_text(soup):
break # Stop adding text if word limit is reached

# Truncate to 5000 characters in case of run-on 'words'
-div_text = div_text[:MAX_WORDS * 10]
+div_text = div_text[: MAX_WORDS * 10]

return div_text

Expand All @@ -434,7 +438,7 @@ def remove_excess_whitespace(s):
Returns:
str: Clean string with excess whitespace stripped.
"""
"""
return " ".join(s.split()).strip()


Expand Down

0 comments on commit 12ad80a

Please sign in to comment.