From 9c88f425cc8c2409773559ee3994e0848e51e964 Mon Sep 17 00:00:00 2001
From: Dhrumil Mistry <56185972+dmdhrumilmistry@users.noreply.github.com>
Date: Mon, 13 Jun 2022 21:29:42 +0530
Subject: [PATCH] update spider module

---
 pyhtools/attackers/attackers.py  | 11 ++++-
 pyhtools/attackers/web/spider.py | 69 ++++++++++++++++----------------
 2 files changed, 43 insertions(+), 37 deletions(-)

diff --git a/pyhtools/attackers/attackers.py b/pyhtools/attackers/attackers.py
index a04c760..51a0581 100644
--- a/pyhtools/attackers/attackers.py
+++ b/pyhtools/attackers/attackers.py
@@ -3,10 +3,10 @@
 import pyhtools.attackers.network.nwscan as nwscan
 import pyhtools.attackers.network.machngr as machngr
 import pyhtools.attackers.web.login_guesser as web_login
-import pyhtools.attackers.web.spider as spider
 from pyhtools.UI.colors import *
 from pyhtools.attackers.web.vuln_scanner.scanner import Scanner
+from pyhtools.attackers.web.spider import Spider
 from pyhtools.attackers.web.webdiscover import Discoverer
 
 
@@ -114,7 +114,14 @@ def webspider():
     returns: None
     '''
     target_url = input('[+] TARGET URL : ')
-    spider.start_spider(target_url)
+    spider = Spider()
+
+    print(f'{BRIGHT_YELLOW}[*] Starting Spider... Press Ctrl+C to interrupt')
+    discovered_links = spider.start(
+        target_url=target_url,
+        print_links=True
+    )
+    print(f'[*] Total Links Found: {len(discovered_links)}')
 
 
 def webcrawldirs():
diff --git a/pyhtools/attackers/web/spider.py b/pyhtools/attackers/web/spider.py
index 0f63ed3..f23ef7a 100644
--- a/pyhtools/attackers/web/spider.py
+++ b/pyhtools/attackers/web/spider.py
@@ -1,22 +1,18 @@
-#!usr/bin/env python3
-from os import name
 import requests
 import re
-from urllib.parse import urljoin
 import argparse
-from pyhtools.UI.colors import *
+from urllib.parse import urljoin
+from pyhtools.UI.colors import *
 
-# list to save links on the whole webpage
-# to avoid repetition
-target_links = []
 
-def start_spider(target_url):
-    '''
-    description: starts spider
-    '''
+class Spider:
+    def __init__(self) -> None:
+        # list to save links on the whole webpage
+        # to avoid repetition
+        self.target_links = []
 
-    def get_links(url:str)->list:
+    def get_links(self, url: str) -> list:
         '''
         description: extracts links from the whole webpage.
         params: url(str) of the webpage
@@ -24,49 +20,52 @@
         '''
         response = requests.get(url)
         content = str(response.content)
-        return re.findall(r'(?:href=")(.*?)"',content)
-
+        return re.findall(r'(?:href=")(.*?)"', content)
 
-    def get_target_links(url:str):
+    def get_target_links(self, url: str, print_link: bool = True):
         '''
         description: extracts useful links and prints them which are only related to the target webpage.
         params: links(list) from the target webpage
         returns: useful links(list) related to target webpage
         '''
-        global target_links
-        links = get_links(url)
+        target_links = self.target_links
+        links = self.get_links(url)
+
         for link in links:
             link = urljoin(url, link)
 
             if '#' in link:
                 link = link.split('#')[0]
 
-            # print(BRIGHT_RED+ link)
-            if link not in target_links and target_url in link:
+            if link not in target_links and url in link:
                 target_links.append(link)
-                print(link)
-                get_target_links(link)
+                if print_link:
+                    print(link)
+                self.get_target_links(url=link, print_link=print_link)
+
+    def start(self, target_url:str, print_links: bool = True):
+        '''
+        description: starts spider
+        '''
+        # try:
+        self.get_target_links(target_url, print_links)
+
+        # except Exception as e:
+        #     print(f'{BRIGHT_RED}[!] Exception: {e}')
 
-    try:
-        print(BRIGHT_YELLOW + '[*] Starting SPIDER...')
-        get_target_links(target_url)
-        print(BRIGHT_YELLOW + f'[*] Mapped all links found on {target_url}')
-        print(BRIGHT_YELLOW + "[*] Total Links Found : ", len(target_links))
-    except KeyboardInterrupt:
-        print(BRIGHT_YELLOW + '\r[!] ctrl+c detected! Exiting Spider.')
-    except Exception as e:
-        print(BRIGHT_RED + '[-] Exception : ', e)
-    finally:
-        print(BRIGHT_YELLOW + "[*] Total Links Found Before Exception : ", len(target_links))
+        # finally:
+        return self.target_links
 
 
 if __name__ == '__main__':
     # Parse arguments
     parser = argparse.ArgumentParser()
-    parser.add_argument('-t', '--target', dest='target_url', help='url of the target eg: https://facebook.com, https://github.com, http://bing.com')
+    parser.add_argument('-t', '--target', dest='target_url', required=True,
+                        help='url of the target eg: https://facebook.com, https://github.com, http://bing.com')
    args = parser.parse_args()
-    del parser
 
     target_url = args.target_url
-    start_spider(target_url)
\ No newline at end of file
+    spider = Spider()
+    discovered_links = spider.start(target_url=target_url, print_links=True)
+    print(f'[*] Total Links Found: {len(discovered_links)}')
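
A minimal usage sketch (not part of the patch): with this commit applied, the refactored Spider class can be driven from other Python code as well as from the CLI. The import path, class name, and the start() signature below are taken from the diff itself; the target URL is a placeholder.

    from pyhtools.attackers.web.spider import Spider

    # placeholder target; only crawl hosts you are authorized to test
    target = 'https://example.com'

    spider = Spider()

    # start() recursively collects same-site links and returns the
    # accumulated list; print_links=False suppresses per-link output
    links = spider.start(target_url=target, print_links=False)
    print(f'[*] Total Links Found: {len(links)}')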