
Commit

reorganize
truonghm committed Sep 12, 2023
1 parent 9eec7c8 commit 3d41288
Showing 15 changed files with 176 additions and 74 deletions.
12 changes: 6 additions & 6 deletions Makefile
@@ -5,22 +5,22 @@ CONDA_HOME_PATH=$(HOME)/miniconda3

## Download data for training
download:
./scripts/download_set1.sh
./scripts/download_set2.sh
./scripts/utils/download_set1.sh
./scripts/utils/download_set2.sh

count:
./scripts/count_data.sh
./scripts/utils/count_data.sh

## Generate tree of data folder
tree:
./scripts/gen_tree.sh data
./scripts/utils/gen_tree.sh data

## Create conda env (python 3.10) using environment.yml
env:
source $(CONDA_HOME_PATH)/bin/activate; conda create -p $(CONDA_ENV_PATH) --no-default-packages --no-deps python=3.10 -y; conda env update -p $(CONDA_ENV_PATH) --file environment.yml

bootstrap:
./scripts/vastai_bootstrap.sh
./scripts/utils/vastai_bootstrap.sh
## Remove old conda env and create a new one
env-reset:
rm -rf $(CONDA_ENV_PATH)
@@ -47,7 +47,7 @@ check-all:

## crawl urls from the kaggle dataset
crawl:
export PYTHONPATH=$(shell pwd) && python crawler/crawl_kaggle_dataset.py --skip=108201 --limit=700000 --input=data/malicious_phish.csv --output=data/all/kaggle1 --super_label=goodjs
export PYTHONPATH=$(shell pwd) && python scripts/crawler/crawl_kaggle_dataset.py --skip=108201 --limit=700000 --input=data/malicious_phish.csv --output=data/all/kaggle1 --super_label=goodjs

## render report
render:
File renamed without changes.
File renamed without changes.
File renamed without changes.
127 changes: 81 additions & 46 deletions scripts/split_train_test.py
@@ -1,62 +1,97 @@
import argparse
import os

import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from dask_ml.model_selection import train_test_split


def read_parquet_and_split(parquet_dir_paths, sample_size):
ddfs = []
for path in parquet_dir_paths:
with ProgressBar():
ddf = dd.read_parquet(path)
ddfs.append(ddf)

# Combine all the Dask DataFrames
combined_ddf = dd.concat(ddfs)

# Apply stratified sampling if sample_size is less than 1
if sample_size < 1.0:
total_length = len(combined_ddf)
sample_length = int(total_length * sample_size)
# Assuming the label column is named 'label'
combined_ddf = combined_ddf.sample(frac=sample_size, random_state=42, replace=False).compute()
combined_ddf = dd.from_pandas(combined_ddf, npartitions=combined_ddf.npartitions)

# Split the data into training and test sets (Assuming the label column is named 'label')
X_train, X_test = train_test_split(
combined_ddf, test_size=0.2, shuffle=True, random_state=42, stratify=combined_ddf["label"]
)
import pandas as pd
from sklearn.model_selection import train_test_split

return X_train, X_test
TRAIN_SET = "train_set.csv"
TEST_SET = "test_set.csv"


def list_of_strings(arg):
return arg.split(",")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Split parquet data into train and test sets")
parser.add_argument("-i", "--inputs", help="Parquet directories to read", required=True, type=list_of_strings)
parser.add_argument("-o", "--output", help="Output directory", required=True)
parser.add_argument("-p", "--prefix", help="Prefix for the output files", required=True)
parser.add_argument("-ss", "--sample-size", help="Sample size", required=False, type=float, default=1.0)
def get_files_from_subdir(dir_path):
return [
os.path.join(dir_path, fname) for fname in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, fname))
]


def main():
parser = argparse.ArgumentParser(description="Split text data into train and test sets")
parser.add_argument("-i", "--inputs", help="Directories to read", required=True, type=list_of_strings)
parser.add_argument(
"-o", "--output", help="Output directory to contain the train and test set", required=True, type=str
)
parser.add_argument("-ss", "--sample-size", help="Size of the sample to use (0.0 to 1.0)", type=float, default=0.2)
parser.add_argument(
"-ts", "--train-size", help="Size of the training set from sample (0.0 to 1.0)", type=float, default=0.8
)

args = parser.parse_args()
parquet_dir_paths = args.inputs
output_dir = args.output
prefix = args.prefix
sample_size = args.sample_size

X_train, X_test = read_parquet_and_split(parquet_dir_paths, sample_size)
print(f"Inputs : {args.inputs}")
print(f"Sample size : {args.sample_size*100} %")
print(f"Train size : {args.train_size*100} %")
print(f"Test size : {round(1.0 - args.train_size, 1)*100} %")

with ProgressBar():
X_train.repartition(npartitions=1).to_parquet(
os.path.join(output_dir, f"{prefix}_train_data.parquet"),
)
X_test.repartition(npartitions=1).to_parquet(
os.path.join(output_dir, f"{prefix}_test_data.parquet"),
good_files = []
bad_files = []
labels = []

for input_dir in args.inputs:
goodjs_dir = os.path.join(input_dir, "goodjs")
badjs_dir = os.path.join(input_dir, "badjs")

if os.path.exists(goodjs_dir) and os.path.exists(badjs_dir):
goodjs_files = get_files_from_subdir(goodjs_dir)
badjs_files = get_files_from_subdir(badjs_dir)

good_files.extend(goodjs_files)
bad_files.extend(badjs_files)

labels.extend(["goodjs"] * len(goodjs_files))
labels.extend(["badjs"] * len(badjs_files))
else:
print(f"Skipping {input_dir} as it doesn't contain both 'goodjs' and 'badjs' directories.")

all_files = good_files + bad_files
total = len(all_files)
print(f"# before sampling : {total}")
print(f"# goodjs : {len(good_files)}")
print(f"# badjs : {len(bad_files)}")
# Sample from the data if necessary
if args.sample_size < 1.0:
sample_size = int(len(all_files) * args.sample_size)
all_files, _, labels, _ = train_test_split(
all_files, labels, train_size=sample_size, stratify=labels, random_state=42
)

print("Train and test data have been saved.")
good_files_after_sample = [all_files[i] for i in range(len(all_files)) if labels[i] == "goodjs"]
bad_files_after_sample = [all_files[i] for i in range(len(all_files)) if labels[i] == "badjs"]
print(f"# after sampling : {len(all_files)}")
print(f"# goodjs sampled : {len(good_files_after_sample)}")
print(f"# badjs sampled : {len(bad_files_after_sample)}")

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
all_files, labels, train_size=args.train_size, stratify=labels, random_state=42
)

output_dir = args.output

train = pd.DataFrame({"file": X_train, "label": y_train})
test = pd.DataFrame({"file": X_test, "label": y_test})
train_path = os.path.join(output_dir, TRAIN_SET)
test_path = os.path.join(output_dir, TEST_SET)

train.to_csv(train_path, index=False)
test.to_csv(test_path, index=False)
print(f"Train set size : {len(train)}")
print(f"Test set size : {len(test)}")
print(f"Output : [{train_path}, {test_path}]")


if __name__ == "__main__":
main()
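
As a side note, the two-stage stratified split introduced above (a stratified down-sample followed by a stratified train/test split) can be sketched on toy data. Everything below — the file names, the 80/20 class balance, the 0.2 sample fraction — is made up for illustration and is not project data.

# Minimal sketch of the two-stage stratified split, on made-up inputs.
from sklearn.model_selection import train_test_split

all_files = [f"goodjs/{i}.js" for i in range(80)] + [f"badjs/{i}.js" for i in range(20)]
labels = ["goodjs"] * 80 + ["badjs"] * 20

# Stage 1: stratified down-sampling to 20% of the corpus.
# Passing an int as train_size keeps exactly that many files.
sample_size = int(len(all_files) * 0.2)
all_files, _, labels, _ = train_test_split(
    all_files, labels, train_size=sample_size, stratify=labels, random_state=42
)

# Stage 2: stratified 80/20 split of the sample into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    all_files, labels, train_size=0.8, stratify=labels, random_state=42
)
print(len(X_train), len(X_test))  # 16 4, with the goodjs/badjs ratio preserved

Because stratify=labels is used in both calls, the goodjs/badjs proportions survive both the down-sampling and the final split.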
59 changes: 37 additions & 22 deletions scripts/tokenize_corpus.py
@@ -1,13 +1,16 @@
import argparse
import os
import warnings
from concurrent.futures import ThreadPoolExecutor
from typing import Any, List, Tuple

import dask.dataframe as dd
import esprima
from dask.diagnostics import ProgressBar
import pandas as pd
from lib.utils.logging import logger
from tqdm import tqdm

warnings.filterwarnings("ignore")


def tokenize(snippet: str, label) -> List[Tuple[Any, Any]]:
token_types = []
@@ -22,37 +25,49 @@ def tokenize(snippet: str, label) -> List[Tuple[Any, Any]]:
return tokens, token_types, label


def process_row(file, label):
with open(file, "r") as f:
snippet = f.read()
return tokenize(snippet, label)


def main():
argparser = argparse.ArgumentParser()
argparser.add_argument("-p", "--parquet-dir", type=str, required=True)
argparser.add_argument("-st", "--text-dir", type=str, required=True)
argparser.add_argument("-i", "--input", type=str, required=True)
argparser.add_argument("-o", "--output", type=str, required=True)

args = argparser.parse_args()
path = args.parquet_dir
text_dir = args.text_dir
tokens_file = os.path.join(text_dir, "tokens_corpus.txt")
token_types_file = os.path.join(text_dir, "token_types_corpus.txt")
labels_file = os.path.join(text_dir, "labels.txt")
path_file = args.input
text_dir = args.output
prefix = os.path.basename(path_file)
tokens_file = os.path.join(text_dir, f"{prefix}_tokens_corpus.txt")
token_types_file = os.path.join(text_dir, f"{prefix}_token_types_corpus.txt")
labels_file = os.path.join(text_dir, f"{prefix}_labels.txt")

logger.info(f"Load paths to actual data: {path_file}")
df = pd.read_csv(path_file)
total = len(df)

logger.info("Reading parquet files...")
with ProgressBar():
ddf = dd.read_parquet(path)
# logger.info("Tokenizing...")
results = []
with ThreadPoolExecutor() as executor:
results = list(
tqdm(
executor.map(process_row, df["file"].tolist(), df["label"].tolist()),
total=total,
bar_format="Tokenizing: {desc:<5.5}{percentage:3.0f}%|{bar:30}{r_bar}",
)
)

logger.info("Saving corpus to text files...")
total = len(ddf)
logger.info(f"Total snippets: {total}")

df = ddf.compute()
total = len(df)
# with tqdm(total=total) as pbar:
with open(tokens_file, "a") as tokens_f, open(token_types_file, "a") as token_types_f, open(
labels_file, "a") as labels_f:
for snippet, label in tqdm(zip(df["content"], df["label"]), total=total):
tokens, token_types, label = tokenize(snippet, label)
labels_file, "a"
) as labels_f:
for tokens, token_types, label in results:
tokens_f.write(" ".join(tokens) + "\n")
token_types_f.write(" ".join(token_types) + "\n")
labels_f.write(label + "\n")
# pbar.update(1)


if __name__ == "__main__":
main()
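
The body of tokenize() is largely collapsed in the hunk above, so only its signature and return statement are visible. As a rough, hedged sketch — assuming it wraps esprima.tokenize and ignores parse errors, neither of which is confirmed by the visible lines — it might look like this:

# Hedged sketch only; the real tokenize() body is not shown in the diff.
# esprima.tokenize() yields token objects exposing .type and .value.
import esprima


def tokenize_sketch(snippet, label):
    tokens, token_types = [], []
    try:
        for tok in esprima.tokenize(snippet):
            tokens.append(str(tok.value))
            token_types.append(str(tok.type))
    except Exception:
        # Assumed behaviour: unparseable snippets produce empty token lists.
        pass
    return tokens, token_types, label


print(tokenize_sketch("var x = 1;", "goodjs")[1])
# ['Keyword', 'Identifier', 'Punctuator', 'Numeric', 'Punctuator']

Whatever the exact body is, the ThreadPoolExecutor above feeds each (file, label) pair through process_row into this function and then writes the resulting tuples out to the corpus files.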
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
52 changes: 52 additions & 0 deletions scripts/utils/parquet_to_js.py
@@ -0,0 +1,52 @@
import argparse
import hashlib
import os

import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from tqdm import tqdm


def create_text_files(parquet_path, root_dir):
with ProgressBar():
# Read the parquet file
ddf = dd.read_parquet(parquet_path)

# Compute to bring into memory (use this cautiously)
df = ddf.compute()

# Create root directory named after the parquet file
root_path = os.path.join(root_dir, os.path.basename(parquet_path).replace(".parquet", ""))
os.makedirs(root_path, exist_ok=True)

# Create subdirectories
goodjs_path = os.path.join(root_path, "goodjs")
badjs_path = os.path.join(root_path, "badjs")
os.makedirs(goodjs_path, exist_ok=True)
os.makedirs(badjs_path, exist_ok=True)

# Create text files
for _, row in tqdm(df.iterrows(), total=len(df)):
label = row["label"]
content = row["content"]
hash_value = hashlib.sha256(content.encode()).hexdigest()

if label == "good":
file_path = os.path.join(goodjs_path, hash_value)
else:
file_path = os.path.join(badjs_path, hash_value)

with open(file_path, "w") as f:
f.write(content)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Create text files from a parquet file")
parser.add_argument("-i", "--input", help="Input parquet file path", required=True)
parser.add_argument("-o", "--output", help="Output root directory path", required=True)

args = parser.parse_args()
parquet_path = args.input
root_dir = args.output

create_text_files(parquet_path, root_dir)
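
The goodjs/badjs folder pair written here matches the layout that the new split_train_test.py expects in its --inputs directories. One quick sanity check of the export (the directory path below is hypothetical, not taken from the commit): every file name should equal the SHA-256 hex digest of its own content, so identical snippets collapse into a single file.

# Sanity-check sketch; the root path is an assumption for illustration.
import hashlib
import os

root = "data/all/set1/goodjs"  # hypothetical output of parquet_to_js.py
for fname in os.listdir(root):
    with open(os.path.join(root, fname), "r") as f:
        content = f.read()
    assert fname == hashlib.sha256(content.encode()).hexdigest()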
File renamed without changes.
File renamed without changes.
