Commit

progress

truonghm committed Sep 12, 2023
1 parent 7f8ef2f commit db2d2c9
Showing 30 changed files with 6,295 additions and 442 deletions.
6 changes: 5 additions & 1 deletion .gitignore
@@ -1,3 +1,8 @@
# report
report/_extensions/*
report/QTDublinIrish.otf
report/*.tex

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -14,7 +19,6 @@ dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
34 changes: 34 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,34 @@
minimum_pre_commit_version: 3.3.3

repos:
  - repo: local
    hooks:
      - id: ruff-lint
        name: ruff-lint
        entry: ruff --fix --config ./pyproject.toml
        language: system
        types: [python]

      - id: black-format
        name: black-format
        entry: black --config ./pyproject.toml
        language: system
        types: [python]

      - id: mypy
        name: mypy
        entry: mypy --config-file ./pyproject.toml --enable-incomplete-feature=Unpack --install-types --non-interactive
        language: system
        types: [python]

      - id: ruff-check
        name: ruff-check
        entry: ruff check --config ./pyproject.toml
        language: system
        types: [python]

      - id: black-check
        name: black-check
        entry: black --check --config ./pyproject.toml
        language: system
        types: [python]
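
Note: every hook above uses language: system, so ruff, black, and mypy are expected to come from the active environment rather than from pre-commit-managed virtualenvs. Running pre-commit install once wires the hooks into git; the Makefile targets below also invoke them explicitly via pre-commit run.
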
98 changes: 90 additions & 8 deletions Makefile
@@ -3,31 +3,113 @@ SHELL = /bin/bash
CONDA_ENV_PATH=.conda/m1
CONDA_HOME_PATH=$(HOME)/miniconda3

## Download data for training
download:
	./scripts/download_set1.sh
	./scripts/download_set2.sh

count:
	./scripts/count_data.sh

## Generate tree of data folder
tree:
	./scripts/gen_tree.sh data

## Create conda env (python 3.10) using environment.yml
env:
	source $(CONDA_HOME_PATH)/bin/activate; conda create -p $(CONDA_ENV_PATH) --no-default-packages --no-deps python=3.10 -y; conda env update -p $(CONDA_ENV_PATH) --file environment.yml

## Remove old conda env and create a new one
env-reset:
	rm -rf $(CONDA_ENV_PATH)
	make env

PATH_TO_CHECK=./lib/* ./crawler/*
## Format files using black and ruff, via pre-commit hooks
format:
	black src --config pyproject.toml
	ruff src --fix --config pyproject.toml
	pre-commit run ruff-lint --files $(PATH_TO_CHECK)
	pre-commit run black-format --files $(PATH_TO_CHECK)

## Run checks (ruff + test)
check:
	ruff check src --config pyproject.toml
	black --check src --config pyproject.toml
## Run checks (ruff + black), using pre-commit hooks
check-format:
	pre-commit run ruff-check --files $(PATH_TO_CHECK)
	pre-commit run black-check --files $(PATH_TO_CHECK)

type:
	mypy src --config-file pyproject.toml
## Run mypy type checking using pre-commit hook
check-mypy:
	pre-commit run mypy --files $(PATH_TO_CHECK)

## Run all checks (ruff + black + mypy), using pre-commit hooks
check-all:
	pre-commit run --files $(PATH_TO_CHECK)

## Crawl URLs from the Kaggle dataset
crawl:
	export PYTHONPATH=$(shell pwd) && python crawler/crawl_kaggle_dataset.py --skip=108201 --limit=700000 --input=data/malicious_phish.csv --output=data/all/kaggle1 --super_label=goodjs

## Render report
render:
	quarto render ./report/index.qmd

## Preview report
preview:
	quarto preview ./report/index.qmd --no-watch-inputs --no-browser --port 7733

#################################################################################
# Self Documenting Commands #
#################################################################################

.DEFAULT_GOAL := help

# Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html>
# sed script explained:
# /^##/:
# * save line in hold space
# * purge line
# * Loop:
# * append newline + line to hold space
# * go to next line
# * if line starts with doc comment, strip comment character off and loop
# * remove target prerequisites
# * append hold space (+ newline) to line
# * replace newline plus comments by `---`
# * print line
# Separate expressions are necessary because labels cannot be delimited by
# semicolon; see <http://stackoverflow.com/a/11799865/1968>
.PHONY: help
help:
	@echo "$$(tput bold)Available commands:$$(tput sgr0)"
	@sed -n -e "/^## / { \
		h; \
		s/.*//; \
		:doc" \
		-e "H; \
		n; \
		s/^## //; \
		t doc" \
		-e "s/:.*//; \
		G; \
		s/\\n## /---/; \
		s/\\n/ /g; \
		p; \
	}" ${MAKEFILE_LIST} \
	| awk -F '---' \
		-v ncol=$$(tput cols) \
		-v indent=19 \
		-v col_on="$$(tput setaf 6)" \
		-v col_off="$$(tput sgr0)" \
	'{ \
		printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
		n = split($$2, words, " "); \
		line_length = ncol - indent; \
		for (i = 1; i <= n; i++) { \
			line_length -= length(words[i]) + 1; \
			if (line_length <= 0) { \
				line_length = ncol - indent - length(words[i]) - 1; \
				printf "\n%*s ", -indent, " "; \
			} \
			printf "%s ", words[i]; \
		} \
		printf "\n"; \
	}' \
	| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
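
For reference, only targets carrying a "## " doc comment appear in the output (count, for instance, is filtered out because only lines starting with "## " match the sed script), so make help prints roughly:

Available commands:
download            Download data for training
tree                Generate tree of data folder
env                 Create conda env (python 3.10) using environment.yml
env-reset           Remove old conda env and create a new one
check-all           Run all checks (ruff + black + mypy), using pre-commit hooks

Exact colors, alignment, and wrapping depend on the terminal.
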
120 changes: 120 additions & 0 deletions crawler/crawl_kaggle_dataset.py
@@ -0,0 +1,120 @@
import argparse
import concurrent.futures
import hashlib
import json
import os
import signal

import bs4
import pandas as pd
import pymongo
import requests
from lib.utils.logging import logger

TIMEOUT_SECS = 3
shutdown = False  # Flag to signal workers to stop working


def handle_shutdown(signum, frame):
    global shutdown
    shutdown = True


signal.signal(signal.SIGINT, handle_shutdown)


def extract_js(url):
    if shutdown:  # Check shutdown flag before making the request
        return None

    try:
        response = requests.get(url, timeout=TIMEOUT_SECS)
        if response.status_code != 200 or not response.text:
            raise requests.exceptions.RequestException("Invalid response")
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        scripts = soup.find_all("script")
        js_code = ""
        for script in scripts:
            if script.contents:
                js_code += script.contents[0]
        return js_code
    except requests.exceptions.RequestException as e:
        raise


def extract_js_and_save(idx, url, label, super_label):
    if shutdown:  # Check shutdown flag
        return None

    if not url.startswith("http") and not url.startswith("https"):
        new_url = "http://" + url
    else:
        new_url = url

    try:
        js_code = extract_js(new_url)
    except Exception as e:
        logger.error(f"{idx}, Failed,{super_label},{label},{new_url},{url}")
        return None

    if js_code == "":
        return None

    logger.info(f"{idx}, Success,{super_label},{label},{new_url},{url}")
    return {"url": new_url, "label": label, "super_label": super_label, "js_code": js_code}


def main():
    global shutdown
    parser = argparse.ArgumentParser()
    parser.add_argument("--skip", type=int, default=0)
    parser.add_argument("--limit", type=int, default=10)
    parser.add_argument("--input", type=str, required=True)
    parser.add_argument("--output", type=str, required=True)
    parser.add_argument("--super_label", type=str)
    args = parser.parse_args()
    skip = args.skip
    limit = args.limit
    input_file = args.input
    output_dir = args.output
    super_label = args.super_label

    url_df = pd.read_csv(
        input_file,
        usecols=[0, 1, 2, 3],
        header=0,
        names=["idx", "url", "label", "super_label"],
        skiprows=range(1, skip + 1),  # skip the first 'skip' rows (assuming there's a header row too)
        nrows=limit,
    )  # read 'limit' rows
    if super_label:
        url_df = url_df[url_df["super_label"] == super_label]

    # url_df_slice = url_df.iloc[skip : skip + limit]
    with concurrent.futures.ThreadPoolExecutor() as executor:
        args = [(idx, url, label, super_label) for idx, url, label, super_label in url_df.itertuples(index=False)]

        try:
            for res in executor.map(lambda x: extract_js_and_save(*x), args):
                if shutdown:  # Check shutdown flag
                    break

                if res:
                    file_name = hashlib.sha256(res["js_code"].encode("utf-8")).hexdigest()
                    file_path = os.path.join(output_dir, res["super_label"], file_name)
                    if not os.path.exists(os.path.dirname(file_path)):
                        os.makedirs(os.path.dirname(file_path))

                    if os.path.exists(file_path):
                        continue

                    with open(file_path, "w") as f:
                        f.write(res["js_code"])
        except KeyboardInterrupt:
            # This part is executed after Ctrl+C is pressed
            print("\nReceived shutdown signal, stopping workers...")
            shutdown = True


if __name__ == "__main__":
    main()
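
Output files are named with the SHA-256 hex digest of their JS content, so identical scripts fetched from different URLs collapse into a single file. A minimal sketch of reading the crawled corpus back for downstream use (the load_corpus helper and the path are illustrative, not part of the repo):

import os


def load_corpus(corpus_dir: str) -> dict[str, str]:
    """Map SHA-256 file name -> JS source for every crawled sample."""
    corpus = {}
    for name in os.listdir(corpus_dir):
        path = os.path.join(corpus_dir, name)
        if os.path.isfile(path):
            with open(path) as f:
                corpus[name] = f.read()
    return corpus


# e.g. the goodjs samples produced by `make crawl` (path is an assumption)
goodjs = load_corpus("data/all/kaggle1/goodjs")
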
22 changes: 22 additions & 0 deletions crawler/db_connector.py
@@ -0,0 +1,22 @@
from types import TracebackType
from typing import Optional, Type

import pymongo


class MongoContextManager:
    def __init__(self, uri: str, db_name: str, collection_name: str) -> None:
        self.uri = uri
        self.db_name = db_name
        self.collection_name = collection_name

    def __enter__(self) -> pymongo.collection.Collection:
        self.client: pymongo.MongoClient = pymongo.MongoClient(self.uri)
        self.db = self.client[self.db_name]
        self.collection = self.db[self.collection_name]
        return self.collection

    def __exit__(
        self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]
    ) -> None:
        self.client.close()
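
The context manager is not wired into the crawler yet (pymongo is imported there but unused). A minimal usage sketch, with the URI matching the docker-compose credentials below and the database/collection names assumed:

from crawler.db_connector import MongoContextManager

# URI mirrors docker-compose.yml; the database and collection names are hypothetical
URI = "mongodb://jscode:jscode@localhost:27017"

with MongoContextManager(URI, "js_corpus", "samples") as collection:
    collection.insert_one({"url": "http://example.com", "label": "goodjs", "js_code": "alert(1);"})
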
18 changes: 18 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,18 @@
version: '3.9'

services:

  mongodb:
    image: mongo:4.2.1
    container_name: js-code
    restart: unless-stopped
    ports:
      - 27017:27017
    volumes: [ mongodbM1Data:/data/db, mongodbM1Config:/data/configdb ]
    environment:
      MONGO_INITDB_ROOT_USERNAME: jscode
      MONGO_INITDB_ROOT_PASSWORD: jscode

volumes:
  mongodbM1Data: null
  mongodbM1Config: null
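
With the service up (docker compose up -d mongodb), connectivity can be sanity-checked from Python; the ping command is standard MongoDB, and the URI simply mirrors the credentials above:

import pymongo

client = pymongo.MongoClient(
    "mongodb://jscode:jscode@localhost:27017",
    serverSelectionTimeoutMS=3000,  # fail fast when mongod is unreachable
)
client.admin.command("ping")  # raises ServerSelectionTimeoutError if the container is down
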
4 changes: 2 additions & 2 deletions environment.yml
@@ -1,13 +1,13 @@
name: m1
channels:
  # - pytorch
  - pytorch
  - conda-forge
dependencies:
  - python=3.10.*
  - conda-lock
  - mamba
  - pip
  # - pytorch::pytorch=1.11.0
  - pytorch::pytorch=2.0.1

# Non-standard section listing target platforms for conda-lock:
platforms:
File renamed without changes.