From 9c88f425cc8c2409773559ee3994e0848e51e964 Mon Sep 17 00:00:00 2001
From: Dhrumil Mistry <56185972+dmdhrumilmistry@users.noreply.github.com>
Date: Mon, 13 Jun 2022 21:29:42 +0530
Subject: [PATCH] update spider module

---
 pyhtools/attackers/attackers.py  | 11 ++++-
 pyhtools/attackers/web/spider.py | 69 ++++++++++++++++----------------
 2 files changed, 43 insertions(+), 37 deletions(-)

diff --git a/pyhtools/attackers/attackers.py b/pyhtools/attackers/attackers.py
index a04c760..51a0581 100644
--- a/pyhtools/attackers/attackers.py
+++ b/pyhtools/attackers/attackers.py
@@ -3,10 +3,10 @@
 import pyhtools.attackers.network.nwscan as nwscan
 import pyhtools.attackers.network.machngr as machngr
 import pyhtools.attackers.web.login_guesser as web_login
-import pyhtools.attackers.web.spider as spider
 from pyhtools.UI.colors import *
 from pyhtools.attackers.web.vuln_scanner.scanner import Scanner
+from pyhtools.attackers.web.spider import Spider
 from pyhtools.attackers.web.webdiscover import Discoverer
 
 
@@ -114,7 +114,14 @@ def webspider():
     returns: None
     '''
     target_url = input('[+] TARGET URL : ')
-    spider.start_spider(target_url)
+    spider = Spider()
+
+    print(f'{BRIGHT_YELLOW}[*] Starting Spider... Press Ctrl+C to interrupt')
+    discovered_links = spider.start(
+        target_url=target_url,
+        print_links=True
+    )
+    print(f'[*] Total Links Found: {len(discovered_links)}')
 
 
 def webcrawldirs():
diff --git a/pyhtools/attackers/web/spider.py b/pyhtools/attackers/web/spider.py
index 0f63ed3..f23ef7a 100644
--- a/pyhtools/attackers/web/spider.py
+++ b/pyhtools/attackers/web/spider.py
@@ -1,22 +1,18 @@
-#!usr/bin/env python3
-from os import name
 import requests
 import re
-from urllib.parse import urljoin
 import argparse
-from pyhtools.UI.colors import *
+from urllib.parse import urljoin
+from pyhtools.UI.colors import *
 
-# list to save links on the whole webpage
-# to avoid repetition
-target_links = []
 
-def start_spider(target_url):
-    '''
-    description: starts spider
-    '''
+class Spider:
+    def __init__(self) -> None:
+        # list to save links on the whole webpage
+        # to avoid repetition
+        self.target_links = []
 
-    def get_links(url:str)->list:
+    def get_links(self, url: str) -> list:
         '''
         description: extracts links from the whole webpage.
         params: url(str) of the webpage
@@ -24,49 +20,52 @@
         '''
         response = requests.get(url)
         content = str(response.content)
-        return re.findall(r'(?:href=")(.*?)"',content)
-
+        return re.findall(r'(?:href=")(.*?)"', content)
 
-    def get_target_links(url:str):
+    def get_target_links(self, url: str, print_link: bool = True):
         '''
         description: extracts useful links and prints them which are only related to the target webpage.
         params: links(list) from the target webpage
         returns: useful links(list) related to target webpage
         '''
-        global target_links
-        links = get_links(url)
+        target_links = self.target_links
+        links = self.get_links(url)
+
         for link in links:
             link = urljoin(url, link)
 
             if '#' in link:
                 link = link.split('#')[0]
 
-            # print(BRIGHT_RED+ link)
-            if link not in target_links and target_url in link:
+            if link not in target_links and url in link:
                 target_links.append(link)
-                print(link)
-                get_target_links(link)
+                if print_link:
+                    print(link)
+                self.get_target_links(url=link, print_link=print_link)
+
+    def start(self, target_url:str, print_links: bool = True):
+        '''
+        description: starts spider
+        '''
+        # try:
+        self.get_target_links(target_url, print_links)
+
+        # except Exception as e:
+        #     print(f'{BRIGHT_RED}[!] Exception: {e}')
 
-    try:
-        print(BRIGHT_YELLOW + '[*] Starting SPIDER...')
-        get_target_links(target_url)
-        print(BRIGHT_YELLOW + f'[*] Mapped all links found on {target_url}')
-        print(BRIGHT_YELLOW + "[*] Total Links Found : ", len(target_links))
-    except KeyboardInterrupt:
-        print(BRIGHT_YELLOW + '\r[!] ctrl+c detected! Exiting Spider.')
-    except Exception as e:
-        print(BRIGHT_RED + '[-] Exception : ', e)
-    finally:
-        print(BRIGHT_YELLOW + "[*] Total Links Found Before Exception : ", len(target_links))
+        # finally:
+        return self.target_links
 
 
 if __name__ == '__main__':
     # Parse arguments
     parser = argparse.ArgumentParser()
-    parser.add_argument('-t', '--target', dest='target_url', help='url of the target eg: https://facebook.com, https://github.com, http://bing.com')
+    parser.add_argument('-t', '--target', dest='target_url', required=True,
+                        help='url of the target eg: https://facebook.com, https://github.com, http://bing.com')
    args = parser.parse_args()
-    del parser
 
     target_url = args.target_url
-    start_spider(target_url)
\ No newline at end of file
+    spider = Spider()
+    discovered_links = spider.start(target_url=target_url, print_links=True)
+    print(f'[*] Total Links Found: {len(discovered_links)}')
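
A minimal usage sketch (not part of the patch): with this commit applied, the refactored Spider class can be driven from other Python code as well as from the CLI. The import path, class name, and the start() signature below are taken from the diff itself; the target URL is a placeholder.

    from pyhtools.attackers.web.spider import Spider

    # placeholder target; only crawl hosts you are authorized to test
    target = 'https://example.com'

    spider = Spider()

    # start() recursively collects same-site links and returns the
    # accumulated list; print_links=False suppresses per-link output
    links = spider.start(target_url=target, print_links=False)
    print(f'[*] Total Links Found: {len(links)}')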