From bd49107b122d802c83cd6ad2825fca1530fbe632 Mon Sep 17 00:00:00 2001
From: Dotdesh <105870984+dotdesh71@users.noreply.github.com>
Date: Thu, 9 May 2024 23:53:28 +0600
Subject: [PATCH] Update website.py

---
 pyseoanalyzer/website.py | 115 +++++++++++++++++----------------------
 1 file changed, 51 insertions(+), 64 deletions(-)

diff --git a/pyseoanalyzer/website.py b/pyseoanalyzer/website.py
index 957c051..edf8b10 100644
--- a/pyseoanalyzer/website.py
+++ b/pyseoanalyzer/website.py
@@ -1,8 +1,6 @@
-from collections import Counter
-from collections import defaultdict
+from collections import Counter, defaultdict
 from urllib.parse import urlsplit
 from xml.dom import minidom
-
 import socket
 
 from .http import http
@@ -19,7 +17,7 @@ def __init__(
         self.analyze_extra_tags = analyze_extra_tags
         self.follow_links = follow_links
         self.crawled_pages = []
-        self.crawled_urls = set([])
+        self.crawled_urls = set()
         self.page_queue = []
         self.wordcount = Counter()
         self.bigrams = Counter()
@@ -29,71 +27,60 @@ def __init__(
     def check_dns(self, url_to_check):
         try:
             o = urlsplit(url_to_check)
-            socket.gethostbyname(o.hostname)
+            socket.gethostbyname_ex(o.hostname)
             return True
-        except:
-            pass
-
-        return False
+        except (socket.herror, socket.gaierror):
+            return False
 
     def get_text_from_xml(self, nodelist):
         """
         Stolen from the minidom documentation
         """
-        rc = []
-
-        for node in nodelist:
-            if node.nodeType == node.TEXT_NODE:
-                rc.append(node.data)
-
-        return "".join(rc)
+        return "".join(node.data for node in nodelist if node.nodeType == node.TEXT_NODE)
 
     def crawl(self):
-        if self.sitemap:
-            page = http.get(self.sitemap)
-            if self.sitemap.endswith("xml"):
-                xmldoc = minidom.parseString(page.data.decode("utf-8"))
-                sitemap_urls = xmldoc.getElementsByTagName("loc")
-                for url in sitemap_urls:
-                    self.page_queue.append(self.get_text_from_xml(url.childNodes))
-            elif self.sitemap.endswith("txt"):
-                sitemap_urls = page.data.decode("utf-8").split("\n")
-                for url in sitemap_urls:
-                    self.page_queue.append(url)
-
-        self.page_queue.append(self.base_url)
-
-        for url in self.page_queue:
-            if url in self.crawled_urls:
-                continue
-
-            page = Page(
-                url=url,
-                base_domain=self.base_url,
-                analyze_headings=self.analyze_headings,
-                analyze_extra_tags=self.analyze_extra_tags,
-            )
-
-            if page.parsed_url.netloc != page.base_domain.netloc:
-                continue
-
-            page.analyze()
-
-            self.content_hashes[page.content_hash].add(page.url)
-
-            for w in page.wordcount:
-                self.wordcount[w] += page.wordcount[w]
-
-            for b in page.bigrams:
-                self.bigrams[b] += page.bigrams[b]
-
-            for t in page.trigrams:
-                self.trigrams[t] += page.trigrams[t]
-
-            self.page_queue.extend(page.links)
-
-            self.crawled_pages.append(page)
-            self.crawled_urls.add(page.url)
-
-            if not self.follow_links:
-                break
+        try:
+            if self.sitemap:
+                page = http.get(self.sitemap)
+                if self.sitemap.endswith("xml"):
+                    xmldoc = minidom.parseString(page.data.decode("utf-8"))
+                    sitemap_urls = xmldoc.getElementsByTagName("loc")
+                    for url in sitemap_urls:
+                        self.page_queue.append(self.get_text_from_xml(url.childNodes))
+                elif self.sitemap.endswith("txt"):
+                    sitemap_urls = page.data.decode("utf-8").split("\n")
+                    for url in sitemap_urls:
+                        self.page_queue.append(url)
+
+            self.page_queue.append(self.base_url)
+
+            for url in self.page_queue:
+                if url in self.crawled_urls:
+                    continue
+
+                page = Page(
+                    url=url,
+                    base_domain=self.base_url,
+                    analyze_headings=self.analyze_headings,
+                    analyze_extra_tags=self.analyze_extra_tags,
+                )
+
+                if page.parsed_url.netloc != page.base_domain.netloc:
+                    continue
+
+                page.analyze()
+
+                self.content_hashes[page.content_hash].add(page.url)
+                self.wordcount.update(page.wordcount)
+                self.bigrams.update(page.bigrams)
+                self.trigrams.update(page.trigrams)
+
+                self.page_queue.extend(page.links)
+
+                self.crawled_pages.append(page)
+                self.crawled_urls.add(page.url)
+
+                if not self.follow_links:
+                    break
+        except Exception as e:
+            print(f"Error occurred during crawling: {e}")
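
Note on the Counter change in crawl(): Counter.update adds counts from another mapping rather than replacing them, which is why the three per-key merge loops collapse into single update() calls. A minimal standalone sketch of that behavior (the word counts below are made-up illustrative values, not taken from the project):

    from collections import Counter

    # Simulated running site totals and one page's counts (illustrative only).
    site_wordcount = Counter({"seo": 3, "python": 1})
    page_wordcount = Counter({"seo": 2, "crawler": 4})

    # Counter.update adds counts instead of overwriting, equivalent to:
    #   for w in page_wordcount: site_wordcount[w] += page_wordcount[w]
    site_wordcount.update(page_wordcount)

    print(site_wordcount["seo"])      # 5
    print(site_wordcount["crawler"])  # 4

The same applies to self.bigrams and self.trigrams, so the behavior of crawl() is unchanged by this part of the patch.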