Skip to content

Commit

Permalink
raise filesystem events from filedownload
Browse files (browse the repository at this point in the history)
  • Loading branch information
domwhewell-sage committed Jun 3, 2024
1 parent 85d5e92 commit a6e14c5
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 10 deletions.
14 changes: 10 additions & 4 deletions bbot/modules/filedownload.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class filedownload(BaseModule):
"""

watched_events = ["URL_UNVERIFIED", "HTTP_RESPONSE"]
produced_events = []
produced_events = ["FILESYSTEM"]
flags = ["active", "safe", "web-basic", "web-thorough"]
meta = {
"description": "Download common filetypes such as PDF, DOCX, PPTX, etc.",
Expand Down Expand Up @@ -122,22 +122,28 @@ async def handle_event(self, event):
extension_matches = any(url_lower.endswith(f".{e}") for e in self.extensions)
filedownload_requested = "filedownload" in event.tags
if extension_matches or filedownload_requested:
await self.download_file(event.data)
await self.download_file(event.data, source_event=event)
elif event.type == "HTTP_RESPONSE":
headers = event.data.get("header", {})
content_type = headers.get("content_type", "")
if content_type:
url = event.data["url"]
await self.download_file(url, content_type=content_type)
await self.download_file(url, content_type=content_type, source_event=event)

async def download_file(self, url, content_type=None):
async def download_file(self, url, content_type=None, source_event=None):
orig_filename, file_destination, base_url = self.make_filename(url, content_type=content_type)
if orig_filename is None:
return
result = await self.helpers.download(url, warn=False, filename=file_destination, max_size=self.max_filesize)
if result:
self.info(f'Found "{orig_filename}" at "{base_url}", downloaded to {file_destination}')
self.files_downloaded += 1
if source_event:
file_event = self.make_event(
{"path": str(file_destination)}, "FILESYSTEM", tags=["filedownload", "file"], source=source_event
)
file_event.scope_distance = source_event.scope_distance
await self.emit_event(file_event)
self.urls_downloaded.add(hash(url))

def make_filename(self, url, content_type=None):
Expand Down
14 changes: 8 additions & 6 deletions bbot/test/test_step_2/module_tests/test_module_filedownload.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from pathlib import Path
from .base import ModuleTestBase


Expand Down Expand Up @@ -39,19 +40,20 @@ async def setup_after_prep(self, module_test):
)

def check(self, module_test, events):
filesystem_events = [e for e in events if e.type == "FILESYSTEM"]
download_dir = module_test.scan.home / "filedownload"

# text file
text_files = list(download_dir.glob("*test-file.txt"))
assert len(text_files) == 1, f"No text file found at {download_dir}"
file = text_files[0]
text_file_event = [e for e in filesystem_events if "test-file.txt" in e.data["path"]]
assert 1 == len(text_file_event), f"No text file found at {download_dir}"
file = Path(text_file_event[0].data["path"])
assert file.is_file(), f"File not found at {file}"
assert open(file).read() == "juicy stuff", f"File at {file} does not contain the correct content"

# PDF file (no extension)
pdf_files = list(download_dir.glob("*test-pdf.pdf"))
assert len(pdf_files) == 1, f"No PDF file found at {download_dir}"
file = pdf_files[0]
pdf_file_event = [e for e in filesystem_events if "test-pdf.pdf" in e.data["path"]]
assert 1 == len(pdf_file_event), f"No PDF file found at {download_dir}"
file = Path(pdf_file_event[0].data["path"])
assert file.is_file(), f"File not found at {file}"
assert open(file).read() == self.pdf_data, f"File at {file} does not contain the correct content"

Expand Down

0 comments on commit a6e14c5

Please sign in to comment.