
Commit

reorganize
truonghm committed Sep 12, 2023
1 parent 9eec7c8 commit 3d41288
Showing 15 changed files with 176 additions and 74 deletions.
12 changes: 6 additions & 6 deletions Makefile
@@ -5,22 +5,22 @@ CONDA_HOME_PATH=$(HOME)/miniconda3

## Download data for training
download:
./scripts/download_set1.sh
./scripts/download_set2.sh
./scripts/utils/download_set1.sh
./scripts/utils/download_set2.sh

count:
./scripts/count_data.sh
./scripts/utils/count_data.sh

## Generate tree of data folder
tree:
./scripts/gen_tree.sh data
./scripts/utils/gen_tree.sh data

## Create conda env (python 3.10) using environment.yml
env:
source $(CONDA_HOME_PATH)/bin/activate; conda create -p $(CONDA_ENV_PATH) --no-default-packages --no-deps python=3.10 -y; conda env update -p $(CONDA_ENV_PATH) --file environment.yml

bootstrap:
./scripts/vastai_bootstrap.sh
./scripts/utils/vastai_bootstrap.sh
## Remove old conda env and create a new one
env-reset:
rm -rf $(CONDA_ENV_PATH)
@@ -47,7 +47,7 @@ check-all:

## crawl urls from the kaggle dataset
crawl:
export PYTHONPATH=$(shell pwd) && python crawler/crawl_kaggle_dataset.py --skip=108201 --limit=700000 --input=data/malicious_phish.csv --output=data/all/kaggle1 --super_label=goodjs
export PYTHONPATH=$(shell pwd) && python scripts/crawler/crawl_kaggle_dataset.py --skip=108201 --limit=700000 --input=data/malicious_phish.csv --output=data/all/kaggle1 --super_label=goodjs

## render report
render:
File renamed without changes.
File renamed without changes.
File renamed without changes.
127 changes: 81 additions & 46 deletions scripts/split_train_test.py
@@ -1,62 +1,97 @@
import argparse
import os

import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from dask_ml.model_selection import train_test_split


def read_parquet_and_split(parquet_dir_paths, sample_size):
ddfs = []
for path in parquet_dir_paths:
with ProgressBar():
ddf = dd.read_parquet(path)
ddfs.append(ddf)

# Combine all the Dask DataFrames
combined_ddf = dd.concat(ddfs)

# Apply stratified sampling if sample_size is less than 1
if sample_size < 1.0:
total_length = len(combined_ddf)
sample_length = int(total_length * sample_size)
# Assuming the label column is named 'label'
combined_ddf = combined_ddf.sample(frac=sample_size, random_state=42, replace=False).compute()
combined_ddf = dd.from_pandas(combined_ddf, npartitions=combined_ddf.npartitions)

# Split the data into training and test sets (Assuming the label column is named 'label')
X_train, X_test = train_test_split(
combined_ddf, test_size=0.2, shuffle=True, random_state=42, stratify=combined_ddf["label"]
)
import pandas as pd
from sklearn.model_selection import train_test_split

return X_train, X_test
TRAIN_SET = "train_set.csv"
TEST_SET = "test_set.csv"


def list_of_strings(arg):
return arg.split(",")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Split parquet data into train and test sets")
parser.add_argument("-i", "--inputs", help="Parquet directories to read", required=True, type=list_of_strings)
parser.add_argument("-o", "--output", help="Output directory", required=True)
parser.add_argument("-p", "--prefix", help="Prefix for the output files", required=True)
parser.add_argument("-ss", "--sample-size", help="Sample size", required=False, type=float, default=1.0)
def get_files_from_subdir(dir_path):
return [
os.path.join(dir_path, fname) for fname in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, fname))
]


def main():
parser = argparse.ArgumentParser(description="Split text data into train and test sets")
parser.add_argument("-i", "--inputs", help="Directories to read", required=True, type=list_of_strings)
parser.add_argument(
"-o", "--output", help="Output directory to contain the train and test set", required=True, type=str
)
parser.add_argument("-ss", "--sample-size", help="Size of the sample to use (0.0 to 1.0)", type=float, default=0.2)
parser.add_argument(
"-ts", "--train-size", help="Size of the training set from sample (0.0 to 1.0)", type=float, default=0.8
)

args = parser.parse_args()
parquet_dir_paths = args.inputs
output_dir = args.output
prefix = args.prefix
sample_size = args.sample_size

X_train, X_test = read_parquet_and_split(parquet_dir_paths, sample_size)
print(f"Inputs : {args.inputs}")
print(f"Sample size : {args.sample_size*100} %")
print(f"Train size : {args.train_size*100} %")
print(f"Test size : {round(1.0 - args.train_size, 1)*100} %")

with ProgressBar():
X_train.repartition(npartitions=1).to_parquet(
os.path.join(output_dir, f"{prefix}_train_data.parquet"),
)
X_test.repartition(npartitions=1).to_parquet(
os.path.join(output_dir, f"{prefix}_test_data.parquet"),
good_files = []
bad_files = []
labels = []

for input_dir in args.inputs:
goodjs_dir = os.path.join(input_dir, "goodjs")
badjs_dir = os.path.join(input_dir, "badjs")

if os.path.exists(goodjs_dir) and os.path.exists(badjs_dir):
goodjs_files = get_files_from_subdir(goodjs_dir)
badjs_files = get_files_from_subdir(badjs_dir)

good_files.extend(goodjs_files)
bad_files.extend(badjs_files)

labels.extend(["goodjs"] * len(goodjs_files))
labels.extend(["badjs"] * len(badjs_files))
else:
print(f"Skipping {input_dir} as it doesn't contain both 'goodjs' and 'badjs' directories.")

all_files = good_files + bad_files
total = len(all_files)
print(f"# before sampling : {total}")
print(f"# goodjs : {len(good_files)}")
print(f"# badjs : {len(bad_files)}")
# Sample from the data if necessary
if args.sample_size < 1.0:
sample_size = int(len(all_files) * args.sample_size)
all_files, _, labels, _ = train_test_split(
all_files, labels, train_size=sample_size, stratify=labels, random_state=42
)

print("Train and test data have been saved.")
good_files_after_sample = [all_files[i] for i in range(len(all_files)) if labels[i] == "goodjs"]
bad_files_after_sample = [all_files[i] for i in range(len(all_files)) if labels[i] == "badjs"]
print(f"# after sampling : {len(all_files)}")
print(f"# goodjs sampled : {len(good_files_after_sample)}")
print(f"# badjs sampled : {len(bad_files_after_sample)}")

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
all_files, labels, train_size=args.train_size, stratify=labels, random_state=42
)

output_dir = args.output

train = pd.DataFrame({"file": X_train, "label": y_train})
test = pd.DataFrame({"file": X_test, "label": y_test})
train_path = os.path.join(output_dir, TRAIN_SET)
test_path = os.path.join(output_dir, TEST_SET)

train.to_csv(train_path, index=False)
test.to_csv(test_path, index=False)
print(f"Train set size : {len(train)}")
print(f"Test set size : {len(test)}")
print(f"Output : [{train_path}, {test_path}]")


if __name__ == "__main__":
main()
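
As a side note, the two-stage stratified split introduced above (a stratified down-sample followed by a stratified train/test split) can be sketched on toy data. Everything below — the file names, the 80/20 class balance, the 0.2 sample fraction — is made up for illustration and is not project data.

# Minimal sketch of the two-stage stratified split, on made-up inputs.
from sklearn.model_selection import train_test_split

all_files = [f"goodjs/{i}.js" for i in range(80)] + [f"badjs/{i}.js" for i in range(20)]
labels = ["goodjs"] * 80 + ["badjs"] * 20

# Stage 1: stratified down-sampling to 20% of the corpus.
# Passing an int as train_size keeps exactly that many files.
sample_size = int(len(all_files) * 0.2)
all_files, _, labels, _ = train_test_split(
    all_files, labels, train_size=sample_size, stratify=labels, random_state=42
)

# Stage 2: stratified 80/20 split of the sample into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    all_files, labels, train_size=0.8, stratify=labels, random_state=42
)
print(len(X_train), len(X_test))  # 16 4, with the goodjs/badjs ratio preserved

Because stratify=labels is used in both calls, the goodjs/badjs proportions survive both the down-sampling and the final split.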
59 changes: 37 additions & 22 deletions scripts/tokenize_corpus.py
@@ -1,13 +1,16 @@
import argparse
import os
import warnings
from concurrent.futures import ThreadPoolExecutor
from typing import Any, List, Tuple

import dask.dataframe as dd
import esprima
from dask.diagnostics import ProgressBar
import pandas as pd
from lib.utils.logging import logger
from tqdm import tqdm

warnings.filterwarnings("ignore")


def tokenize(snippet: str, label) -> List[Tuple[Any, Any]]:
token_types = []
@@ -22,37 +25,49 @@ def tokenize(snippet: str, label) -> List[Tuple[Any, Any]]:
return tokens, token_types, label


def process_row(file, label):
with open(file, "r") as f:
snippet = f.read()
return tokenize(snippet, label)


def main():
argparser = argparse.ArgumentParser()
argparser.add_argument("-p", "--parquet-dir", type=str, required=True)
argparser.add_argument("-st", "--text-dir", type=str, required=True)
argparser.add_argument("-i", "--input", type=str, required=True)
argparser.add_argument("-o", "--output", type=str, required=True)

args = argparser.parse_args()
path = args.parquet_dir
text_dir = args.text_dir
tokens_file = os.path.join(text_dir, "tokens_corpus.txt")
token_types_file = os.path.join(text_dir, "token_types_corpus.txt")
labels_file = os.path.join(text_dir, "labels.txt")
path_file = args.input
text_dir = args.output
prefix = os.path.basename(path_file)
tokens_file = os.path.join(text_dir, f"{prefix}_tokens_corpus.txt")
token_types_file = os.path.join(text_dir, f"{prefix}_token_types_corpus.txt")
labels_file = os.path.join(text_dir, f"{prefix}_labels.txt")

logger.info(f"Load paths to actual data: {path_file}")
df = pd.read_csv(path_file)
total = len(df)

logger.info("Reading parquet files...")
with ProgressBar():
ddf = dd.read_parquet(path)
# logger.info("Tokenizing...")
results = []
with ThreadPoolExecutor() as executor:
results = list(
tqdm(
executor.map(process_row, df["file"].tolist(), df["label"].tolist()),
total=total,
bar_format="Tokenizing: {desc:<5.5}{percentage:3.0f}%|{bar:30}{r_bar}",
)
)

logger.info("Saving corpus to text files...")
total = len(ddf)
logger.info(f"Total snippets: {total}")

df = ddf.compute()
total = len(df)
# with tqdm(total=total) as pbar:
with open(tokens_file, "a") as tokens_f, open(token_types_file, "a") as token_types_f, open(
labels_file, "a") as labels_f:
for snippet, label in tqdm(zip(df["content"], df["label"]), total=total):
tokens, token_types, label = tokenize(snippet, label)
labels_file, "a"
) as labels_f:
for tokens, token_types, label in results:
tokens_f.write(" ".join(tokens) + "\n")
token_types_f.write(" ".join(token_types) + "\n")
labels_f.write(label + "\n")
# pbar.update(1)


if __name__ == "__main__":
main()
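
The body of tokenize() is largely collapsed in the hunk above, so only its signature and return statement are visible. As a rough, hedged sketch — assuming it wraps esprima.tokenize and ignores parse errors, neither of which is confirmed by the visible lines — it might look like this:

# Hedged sketch only; the real tokenize() body is not shown in the diff.
# esprima.tokenize() yields token objects exposing .type and .value.
import esprima


def tokenize_sketch(snippet, label):
    tokens, token_types = [], []
    try:
        for tok in esprima.tokenize(snippet):
            tokens.append(str(tok.value))
            token_types.append(str(tok.type))
    except Exception:
        # Assumed behaviour: unparseable snippets produce empty token lists.
        pass
    return tokens, token_types, label


print(tokenize_sketch("var x = 1;", "goodjs")[1])
# ['Keyword', 'Identifier', 'Punctuator', 'Numeric', 'Punctuator']

Whatever the exact body is, the ThreadPoolExecutor above feeds each (file, label) pair through process_row into this function and then writes the resulting tuples out to the corpus files.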
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
52 changes: 52 additions & 0 deletions scripts/utils/parquet_to_js.py
@@ -0,0 +1,52 @@
import argparse
import hashlib
import os

import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from tqdm import tqdm


def create_text_files(parquet_path, root_dir):
with ProgressBar():
# Read the parquet file
ddf = dd.read_parquet(parquet_path)

# Compute to bring into memory (use this cautiously)
df = ddf.compute()

# Create root directory named after the parquet file
root_path = os.path.join(root_dir, os.path.basename(parquet_path).replace(".parquet", ""))
os.makedirs(root_path, exist_ok=True)

# Create subdirectories
goodjs_path = os.path.join(root_path, "goodjs")
badjs_path = os.path.join(root_path, "badjs")
os.makedirs(goodjs_path, exist_ok=True)
os.makedirs(badjs_path, exist_ok=True)

# Create text files
for _, row in tqdm(df.iterrows(), total=len(df)):
label = row["label"]
content = row["content"]
hash_value = hashlib.sha256(content.encode()).hexdigest()

if label == "good":
file_path = os.path.join(goodjs_path, hash_value)
else:
file_path = os.path.join(badjs_path, hash_value)

with open(file_path, "w") as f:
f.write(content)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Create text files from a parquet file")
parser.add_argument("-i", "--input", help="Input parquet file path", required=True)
parser.add_argument("-o", "--output", help="Output root directory path", required=True)

args = parser.parse_args()
parquet_path = args.input
root_dir = args.output

create_text_files(parquet_path, root_dir)
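
The goodjs/badjs folder pair written here matches the layout that the new split_train_test.py expects in its --inputs directories. One quick sanity check of the export (the directory path below is hypothetical, not taken from the commit): every file name should equal the SHA-256 hex digest of its own content, so identical snippets collapse into a single file.

# Sanity-check sketch; the root path is an assumption for illustration.
import hashlib
import os

root = "data/all/set1/goodjs"  # hypothetical output of parquet_to_js.py
for fname in os.listdir(root):
    with open(os.path.join(root, fname), "r") as f:
        content = f.read()
    assert fname == hashlib.sha256(content.encode()).hexdigest()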
File renamed without changes.
File renamed without changes.
