Commit

progress

truonghm committed Sep 12, 2023
1 parent 7f8ef2f commit db2d2c9
Showing 30 changed files with 6,295 additions and 442 deletions.
6 changes: 5 additions & 1 deletion .gitignore
@@ -1,3 +1,8 @@
# report
report/_extensions/*
report/QTDublinIrish.otf
report/*.tex

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -14,7 +19,6 @@ dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
34 changes: 34 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,34 @@
minimum_pre_commit_version: 3.3.3

repos:
  - repo: local
    hooks:
      - id: ruff-lint
        name: ruff-lint
        entry: ruff --fix --config ./pyproject.toml
        language: system
        types: [python]

      - id: black-format
        name: black-format
        entry: black --config ./pyproject.toml
        language: system
        types: [python]

      - id: mypy
        name: mypy
        entry: mypy --config-file ./pyproject.toml --enable-incomplete-feature=Unpack --install-types --non-interactive
        language: system
        types: [python]

      - id: ruff-check
        name: ruff-check
        entry: ruff check --config ./pyproject.toml
        language: system
        types: [python]

      - id: black-check
        name: black-check
        entry: black --check --config ./pyproject.toml
        language: system
        types: [python]
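
Note: every hook above uses language: system, so ruff, black, and mypy are expected to come from the active environment rather than from pre-commit-managed virtualenvs. Running pre-commit install once wires the hooks into git; the Makefile targets below also invoke them explicitly via pre-commit run.
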
98 changes: 90 additions & 8 deletions Makefile
@@ -3,31 +3,113 @@ SHELL = /bin/bash
CONDA_ENV_PATH=.conda/m1
CONDA_HOME_PATH=$(HOME)/miniconda3

## Download data for training
download:
	./scripts/download_set1.sh
	./scripts/download_set2.sh

count:
	./scripts/count_data.sh

## Generate tree of data folder
tree:
	./scripts/gen_tree.sh data

## Create conda env (python 3.10) using environment.yml
env:
	source $(CONDA_HOME_PATH)/bin/activate; conda create -p $(CONDA_ENV_PATH) --no-default-packages --no-deps python=3.10 -y; conda env update -p $(CONDA_ENV_PATH) --file environment.yml

## Remove old conda env and create a new one
env-reset:
	rm -rf $(CONDA_ENV_PATH)
	make env

PATH_TO_CHECK=./lib/* ./crawler/*
## Format files using black and ruff, via pre-commit hooks
format:
	black src --config pyproject.toml
	ruff src --fix --config pyproject.toml
	pre-commit run ruff-lint --files $(PATH_TO_CHECK)
	pre-commit run black-format --files $(PATH_TO_CHECK)

## Run checks (ruff + test)
check:
	ruff check src --config pyproject.toml
	black --check src --config pyproject.toml
## Run checks (ruff + black), using pre-commit hooks
check-format:
	pre-commit run ruff-check --files $(PATH_TO_CHECK)
	pre-commit run black-check --files $(PATH_TO_CHECK)

type:
	mypy src --config-file pyproject.toml
## Run mypy type checking using pre-commit hook
check-mypy:
	pre-commit run mypy --files $(PATH_TO_CHECK)

## Run all checks (ruff + black + mypy), using pre-commit hooks
check-all:
	pre-commit run --files $(PATH_TO_CHECK)

## Crawl URLs from the Kaggle dataset
crawl:
	export PYTHONPATH=$(shell pwd) && python crawler/crawl_kaggle_dataset.py --skip=108201 --limit=700000 --input=data/malicious_phish.csv --output=data/all/kaggle1 --super_label=goodjs

## Render report
render:
	quarto render ./report/index.qmd

## Preview report
preview:
	quarto preview ./report/index.qmd --no-watch-inputs --no-browser --port 7733

#################################################################################
# Self Documenting Commands #
#################################################################################

.DEFAULT_GOAL := help

# Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html>
# sed script explained:
# /^##/:
# * save line in hold space
# * purge line
# * Loop:
# * append newline + line to hold space
# * go to next line
# * if line starts with doc comment, strip comment character off and loop
# * remove target prerequisites
# * append hold space (+ newline) to line
# * replace newline plus comments by `---`
# * print line
# Separate expressions are necessary because labels cannot be delimited by
# semicolon; see <http://stackoverflow.com/a/11799865/1968>
.PHONY: help
help:
	@echo "$$(tput bold)Available commands:$$(tput sgr0)"
	@sed -n -e "/^## / { \
		h; \
		s/.*//; \
		:doc" \
		-e "H; \
		n; \
		s/^## //; \
		t doc" \
		-e "s/:.*//; \
		G; \
		s/\\n## /---/; \
		s/\\n/ /g; \
		p; \
	}" ${MAKEFILE_LIST} \
	| awk -F '---' \
		-v ncol=$$(tput cols) \
		-v indent=19 \
		-v col_on="$$(tput setaf 6)" \
		-v col_off="$$(tput sgr0)" \
	'{ \
		printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
		n = split($$2, words, " "); \
		line_length = ncol - indent; \
		for (i = 1; i <= n; i++) { \
			line_length -= length(words[i]) + 1; \
			if (line_length <= 0) { \
				line_length = ncol - indent - length(words[i]) - 1; \
				printf "\n%*s ", -indent, " "; \
			} \
			printf "%s ", words[i]; \
		} \
		printf "\n"; \
	}' \
	| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
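
For reference, only targets carrying a "## " doc comment appear in the output (count, for instance, is filtered out because only lines starting with "## " match the sed script), so make help prints roughly:

Available commands:
download            Download data for training
tree                Generate tree of data folder
env                 Create conda env (python 3.10) using environment.yml
env-reset           Remove old conda env and create a new one
check-all           Run all checks (ruff + black + mypy), using pre-commit hooks

Exact colors, alignment, and wrapping depend on the terminal.
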
120 changes: 120 additions & 0 deletions crawler/crawl_kaggle_dataset.py
@@ -0,0 +1,120 @@
import argparse
import concurrent.futures
import hashlib
import json
import os
import signal

import bs4
import pandas as pd
import pymongo
import requests
from lib.utils.logging import logger

TIMEOUT_SECS = 3
shutdown = False  # Flag to signal workers to stop working


def handle_shutdown(signum, frame):
    global shutdown
    shutdown = True


signal.signal(signal.SIGINT, handle_shutdown)


def extract_js(url):
    if shutdown:  # Check shutdown flag before making the request
        return None

    try:
        response = requests.get(url, timeout=TIMEOUT_SECS)
        if response.status_code != 200 or not response.text:
            raise requests.exceptions.RequestException("Invalid response")
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        scripts = soup.find_all("script")
        js_code = ""
        for script in scripts:
            if script.contents:
                js_code += script.contents[0]
        return js_code
    except requests.exceptions.RequestException as e:
        raise


def extract_js_and_save(idx, url, label, super_label):
    if shutdown:  # Check shutdown flag
        return None

    if not url.startswith("http") and not url.startswith("https"):
        new_url = "http://" + url
    else:
        new_url = url

    try:
        js_code = extract_js(new_url)
    except Exception as e:
        logger.error(f"{idx}, Failed,{super_label},{label},{new_url},{url}")
        return None

    if js_code == "":
        return None

    logger.info(f"{idx}, Success,{super_label},{label},{new_url},{url}")
    return {"url": new_url, "label": label, "super_label": super_label, "js_code": js_code}


def main():
    global shutdown
    parser = argparse.ArgumentParser()
    parser.add_argument("--skip", type=int, default=0)
    parser.add_argument("--limit", type=int, default=10)
    parser.add_argument("--input", type=str, required=True)
    parser.add_argument("--output", type=str, required=True)
    parser.add_argument("--super_label", type=str)
    args = parser.parse_args()
    skip = args.skip
    limit = args.limit
    input_file = args.input
    output_dir = args.output
    super_label = args.super_label

    url_df = pd.read_csv(
        input_file,
        usecols=[0, 1, 2, 3],
        header=0,
        names=["idx", "url", "label", "super_label"],
        skiprows=range(1, skip + 1),  # skip the first 'skip' rows (assuming there's a header row too)
        nrows=limit,
    )  # read 'limit' rows
    if super_label:
        url_df = url_df[url_df["super_label"] == super_label]

    # url_df_slice = url_df.iloc[skip : skip + limit]
    with concurrent.futures.ThreadPoolExecutor() as executor:
        args = [(idx, url, label, super_label) for idx, url, label, super_label in url_df.itertuples(index=False)]

        try:
            for res in executor.map(lambda x: extract_js_and_save(*x), args):
                if shutdown:  # Check shutdown flag
                    break

                if res:
                    file_name = hashlib.sha256(res["js_code"].encode("utf-8")).hexdigest()
                    file_path = os.path.join(output_dir, res["super_label"], file_name)
                    if not os.path.exists(os.path.dirname(file_path)):
                        os.makedirs(os.path.dirname(file_path))

                    if os.path.exists(file_path):
                        continue

                    with open(file_path, "w") as f:
                        f.write(res["js_code"])
        except KeyboardInterrupt:
            # This part is executed after Ctrl+C is pressed
            print("\nReceived shutdown signal, stopping workers...")
            shutdown = True


if __name__ == "__main__":
    main()
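
Output files are named with the SHA-256 hex digest of their JS content, so identical scripts fetched from different URLs collapse into a single file. A minimal sketch of reading the crawled corpus back for downstream use (the load_corpus helper and the path are illustrative, not part of the repo):

import os


def load_corpus(corpus_dir: str) -> dict[str, str]:
    """Map SHA-256 file name -> JS source for every crawled sample."""
    corpus = {}
    for name in os.listdir(corpus_dir):
        path = os.path.join(corpus_dir, name)
        if os.path.isfile(path):
            with open(path) as f:
                corpus[name] = f.read()
    return corpus


# e.g. the goodjs samples produced by `make crawl` (path is an assumption)
goodjs = load_corpus("data/all/kaggle1/goodjs")
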
22 changes: 22 additions & 0 deletions crawler/db_connector.py
@@ -0,0 +1,22 @@
from types import TracebackType
from typing import Optional, Type

import pymongo


class MongoContextManager:
    def __init__(self, uri: str, db_name: str, collection_name: str) -> None:
        self.uri = uri
        self.db_name = db_name
        self.collection_name = collection_name

    def __enter__(self) -> pymongo.collection.Collection:
        self.client: pymongo.MongoClient = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.collection = self.db[self.collection_name]
        return self.collection

    def __exit__(
        self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]
    ) -> None:
        self.client.close()
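
The context manager is not wired into the crawler yet (pymongo is imported there but unused). A minimal usage sketch, with the URI matching the docker-compose credentials below and the database/collection names assumed:

from crawler.db_connector import MongoContextManager

# URI mirrors docker-compose.yml; the database and collection names are hypothetical
URI = "mongodb://jscode:jscode@localhost:27017"

with MongoContextManager(URI, "js_corpus", "samples") as collection:
    collection.insert_one({"url": "http://example.com", "label": "goodjs", "js_code": "alert(1);"})
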
18 changes: 18 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,18 @@
version: '3.9'

services:

  mongodb:
    image: mongo:4.2.1
    container_name: js-code
    restart: unless-stopped
    ports:
      - 27017:27017
    volumes: [ mongodbM1Data:/data/db, mongodbM1Config:/data/configdb ]
    environment:
      MONGO_INITDB_ROOT_USERNAME: jscode
      MONGO_INITDB_ROOT_PASSWORD: jscode

volumes:
  mongodbM1Data: null
  mongodbM1Config: null
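
With the service up (docker compose up -d mongodb), connectivity can be sanity-checked from Python; the ping command is standard MongoDB, and the URI simply mirrors the credentials above:

import pymongo

client = pymongo.MongoClient(
    "mongodb://jscode:jscode@localhost:27017",
    serverSelectionTimeoutMS=3000,  # fail fast when mongod is unreachable
)
client.admin.command("ping")  # raises ServerSelectionTimeoutError if the container is down
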
4 changes: 2 additions & 2 deletions environment.yml
@@ -1,13 +1,13 @@
name: m1
channels:
  # - pytorch
  - pytorch
  - conda-forge
dependencies:
  - python=3.10.*
  - conda-lock
  - mamba
  - pip
  # - pytorch::pytorch=1.11.0
  - pytorch::pytorch=2.0.1

# Non-standard section listing target platforms for conda-lock:
platforms:
File renamed without changes.