Commit

update
truonghm committed Sep 17, 2023
1 parent c704046 commit 09a546c
Showing 4 changed files with 173 additions and 184 deletions.
confs/input.yaml: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
input:
  kaggle1:
    path: data/all/kaggle1
    classes:
      goodjs:
        dist: 0.1
  kaggle2:
    path: data/all/kaggle2
    classes:
      badjs:
        dist: 0.1
  misc1:
    path: data/all/misc1
    classes:
      goodjs:
        dist: 0.7
      badjs:
        dist: 0.6
  misc2:
    path: data/all/misc2
    classes:
      goodjs:
        dist: 0.5
      badjs:
        dist: 0.6
  misc3:
    path: data/all/misc3
    classes:
      badjs:
        dist: 0.1
  packt:
    path: data/all/packt
    classes:
      goodjs:
        dist: 0.7
      badjs:
        dist: 0.6
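Each source entry above pairs a directory with per-class dist values; scripts/split_train_test_from_conf.py (added below) treats dist as the fraction of that class's files to sample from that source, e.g. 10% of the goodjs files under data/all/kaggle1. A minimal sketch of reading the config and printing those fractions, assuming PyYAML is installed (the script below already imports it):

import yaml

with open("confs/input.yaml") as f:
    config = yaml.safe_load(f)

for name, source in config["input"].items():
    for cls, settings in source.get("classes", {}).items():
        print(f"{name}: sample {float(settings['dist']):.0%} of {cls} from {source['path']}")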
notebooks/test3.ipynb: 0 additions & 184 deletions
@@ -732,102 +732,6 @@
"print(metrics.classification_report(y_test, y_pred, target_names=[\"non-obfuscted\", \"obfuscated\"]))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1, Loss: 0.3822276294231415\n",
"Epoch 2, Loss: 0.38222718238830566\n",
"Epoch 3, Loss: 0.41670992970466614\n",
"Epoch 4, Loss: 0.3822271525859833\n",
"Epoch 5, Loss: 0.4856754243373871\n",
"Epoch 6, Loss: 0.3477444052696228\n",
"Epoch 7, Loss: 0.3132616877555847\n",
"Epoch 8, Loss: 0.38222718238830566\n",
"Epoch 9, Loss: 0.38222718238830566\n",
"Epoch 10, Loss: 0.4511926770210266\n",
"Accuracy: 91.1685994647636%\n"
]
}
],
"source": [
"# 1. Import PyTorch Libraries\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"\n",
"# 2. Data Preprocessing\n",
"# Scaling the features\n",
"scaler = MinMaxScaler()\n",
"X_train_scaled = scaler.fit_transform(X_train)\n",
"X_test_scaled = scaler.transform(X_test)\n",
"\n",
"# 3. Define PyTorch Dataset and Dataloader\n",
"class JSDataset(Dataset):\n",
" def __init__(self, features, labels):\n",
" self.features = torch.tensor(features, dtype=torch.float32)\n",
" self.labels = torch.tensor(labels.values, dtype=torch.long)\n",
"\n",
" def __len__(self):\n",
" return len(self.labels)\n",
"\n",
" def __getitem__(self, idx):\n",
" return self.features[idx], self.labels[idx]\n",
"\n",
"train_dataset = JSDataset(X_train_scaled, y_train)\n",
"test_dataset = JSDataset(X_test_scaled, y_test)\n",
"\n",
"train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)\n",
"test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)\n",
"\n",
"# 4. Define Neural Network Architecture\n",
"class JSClassifier(nn.Module):\n",
" def __init__(self, input_dim):\n",
" super(JSClassifier, self).__init__()\n",
" self.layer1 = nn.Linear(input_dim, 64)\n",
" self.layer2 = nn.Linear(64, 32)\n",
" self.layer3 = nn.Linear(32, 2) # Binary classification\n",
"\n",
" def forward(self, x):\n",
" x = torch.relu(self.layer1(x))\n",
" x = torch.relu(self.layer2(x))\n",
" x = torch.softmax(self.layer3(x), dim=1)\n",
" return x\n",
"\n",
"model = JSClassifier(X_train_scaled.shape[1])\n",
"criterion = nn.CrossEntropyLoss()\n",
"optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
"\n",
"# 5. Training the Model\n",
"for epoch in range(10): # 10 epochs\n",
" for batch_features, batch_labels in train_loader:\n",
" optimizer.zero_grad()\n",
" output = model(batch_features)\n",
" loss = criterion(output, batch_labels)\n",
" loss.backward()\n",
" optimizer.step()\n",
" print(f\"Epoch {epoch + 1}, Loss: {loss.item()}\")\n",
"\n",
"# 6. Evaluation\n",
"correct = 0\n",
"total = 0\n",
"with torch.no_grad():\n",
" for features, labels in test_loader:\n",
" outputs = model(features)\n",
" _, predicted = torch.max(outputs.data, 1)\n",
" total += labels.size(0)\n",
" correct += (predicted == labels).sum().item()\n",
"\n",
"print(f\"Accuracy: {100 * correct / total}%\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
@@ -872,94 +776,6 @@
"print(accuracy_score(y_test, y_test_pred))\n",
"print(confusion_matrix(y_test, y_test_pred))"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score, confusion_matrix\n",
"from torch.utils.data import DataLoader\n",
"import torch\n",
"import torch.nn as nn\n",
"\n",
"# Preprocess the text data\n",
"vectorizer = HashingVectorizer(input=\"content\", ngram_range=(1, 3))\n",
"tfidf_transformer = TfidfTransformer(use_idf=True)\n",
"X = vectorizer.transform(df.js)\n",
"X_tfidf = tfidf_transformer.fit_transform(X)\n",
"\n",
"# Split the data\n",
"X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df.label, test_size=0.33, random_state=42)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "sparse array length is ambiguous; use getnnz() or shape[0]",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[42], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39m# Create PyTorch Dataset and DataLoader\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m train_dataset \u001b[39m=\u001b[39m JSDataset(X_train, y_train)\n\u001b[1;32m 3\u001b[0m test_dataset \u001b[39m=\u001b[39m JSDataset(X_test, y_test)\n\u001b[1;32m 4\u001b[0m train_loader \u001b[39m=\u001b[39m DataLoader(train_dataset, batch_size\u001b[39m=\u001b[39m\u001b[39m64\u001b[39m, shuffle\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n",
"Cell \u001b[0;32mIn[37], line 17\u001b[0m, in \u001b[0;36mJSDataset.__init__\u001b[0;34m(self, features, labels)\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__init__\u001b[39m(\u001b[39mself\u001b[39m, features, labels):\n\u001b[0;32m---> 17\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfeatures \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39;49mtensor(features, dtype\u001b[39m=\u001b[39;49mtorch\u001b[39m.\u001b[39;49mfloat32)\n\u001b[1;32m 18\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlabels \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mtensor(labels\u001b[39m.\u001b[39mvalues, dtype\u001b[39m=\u001b[39mtorch\u001b[39m.\u001b[39mlong)\n",
"File \u001b[0;32m~/learning/m1-internship/malicious-code-detection/.conda/m1/lib/python3.10/site-packages/scipy/sparse/_base.py:340\u001b[0m, in \u001b[0;36m_spbase.__len__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 339\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__len__\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[0;32m--> 340\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mTypeError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39msparse array length is ambiguous; use getnnz()\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 341\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m or shape[0]\u001b[39m\u001b[39m\"\u001b[39m)\n",
"\u001b[0;31mTypeError\u001b[0m: sparse array length is ambiguous; use getnnz() or shape[0]"
]
}
],
"source": [
"# Create PyTorch Dataset and DataLoader\n",
"train_dataset = JSDataset(X_train, y_train)\n",
"test_dataset = JSDataset(X_test, y_test)\n",
"train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)\n",
"test_loader = DataLoader(test_dataset, batch_size=64)\n",
"\n",
"# Define the PyTorch Model\n",
"class Classifier(nn.Module):\n",
" def __init__(self, input_dim):\n",
" super(Classifier, self).__init__()\n",
" self.fc = nn.Linear(input_dim, 2)\n",
"\n",
" def forward(self, x):\n",
" return self.fc(x)\n",
"\n",
"# Training Loop\n",
"model = Classifier(X_train.shape[1])\n",
"criterion = nn.CrossEntropyLoss()\n",
"optimizer = torch.optim.Adam(model.parameters(), lr=0.001)\n",
"\n",
"for epoch in range(10): # number of epochs\n",
" for inputs, labels in train_loader:\n",
" # Forward pass\n",
" outputs = model(inputs)\n",
" loss = criterion(outputs, labels)\n",
"\n",
" # Backward pass and optimization\n",
" optimizer.zero_grad()\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
"# Evaluate\n",
"correct = 0\n",
"total = 0\n",
"with torch.no_grad():\n",
" for inputs, labels in test_loader:\n",
" outputs = model(inputs)\n",
" _, predicted = torch.max(outputs.data, 1)\n",
" total += labels.size(0)\n",
" correct += (predicted == labels).sum().item()\n",
"\n",
"accuracy = 100 * correct / total\n",
"print('Accuracy: {}'.format(accuracy))"
]
}
],
"metadata": {
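The TypeError captured in the removed cell above comes from passing the SciPy sparse matrix produced by HashingVectorizer/TfidfTransformer straight to torch.tensor inside JSDataset. Below is a minimal sketch of one workaround, keeping the matrix sparse and densifying a single row per item; this is an assumption about how the dataset could be rebuilt, not the notebook's original fix. (Separately, nn.CrossEntropyLoss applies log-softmax internally, so the explicit torch.softmax in the removed JSClassifier.forward is redundant.)

import torch
from torch.utils.data import Dataset

class SparseJSDataset(Dataset):
    # Wraps a SciPy CSR matrix plus a pandas Series of numeric labels, as in the removed cells.
    def __init__(self, features, labels):
        self.features = features  # keep the sparse matrix as-is
        self.labels = torch.tensor(labels.values, dtype=torch.long)

    def __len__(self):
        return self.features.shape[0]  # shape[0] instead of len(), which is ambiguous for sparse arrays

    def __getitem__(self, idx):
        row = self.features[idx].toarray().squeeze(0)  # densify one row at a time
        return torch.tensor(row, dtype=torch.float32), self.labels[idx]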
scripts/split_train_test_from_conf.py: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
import argparse
import os

import pandas as pd
import yaml
from sklearn.model_selection import train_test_split

TRAIN_SET = "train_set.csv"
TEST_SET = "test_set.csv"
VAL_SET = "valid_set.csv"


def get_files_from_subdir(dir_path):
    return [
        os.path.join(dir_path, fname) for fname in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, fname))
    ]


def traverse_path(path):
    good_files = []
    bad_files = []

    goodjs_dir = os.path.join(path, "goodjs")
    badjs_dir = os.path.join(path, "badjs")

    if not os.path.exists(goodjs_dir) and not os.path.exists(badjs_dir):
        print("No goodjs or badjs directory found in the path")

    if os.path.exists(goodjs_dir):
        goodjs_files = get_files_from_subdir(goodjs_dir)
        good_files.extend(goodjs_files)

    if os.path.exists(badjs_dir):
        badjs_files = get_files_from_subdir(badjs_dir)
        bad_files.extend(badjs_files)

    return good_files, bad_files


def sample_files(good_files, bad_files, good_sample_size, bad_sample_size):
    import random

    good_sample = random.sample(good_files, good_sample_size)
    bad_sample = random.sample(bad_files, bad_sample_size)

    return good_sample, bad_sample


def main():
    parser = argparse.ArgumentParser(description="Split text data into train and test sets")
    parser.add_argument("-c", "--config", help="Path to config", required=True, type=str)
    parser.add_argument(
        "-o", "--output", help="Output directory to contain the train and test set", required=True, type=str
    )
    parser.add_argument(
        "-ts", "--train-size", help="Size of the training set from sample (0.0 to 1.0)", type=float, default=0.8
    )

    args = parser.parse_args()

    path = args.config

    with open(path) as f:
        config = yaml.safe_load(f)
        input_conf = config["input"]

    output_dir = args.output

    total_good = 0
    total_bad = 0
    all_good_samples = []
    all_bad_samples = []
    for dir, conf in input_conf.items():
        dist_zero = {"dist": 0.0}
        dir_path = conf["path"]
        # print(dir_path)
        goodjs_dist = float(conf["classes"].get("goodjs", dist_zero)["dist"])
        badjs_dist = float(conf["classes"].get("badjs", dist_zero)["dist"])

        good_files, bad_files = traverse_path(dir_path)
        total_good += len(good_files)
        total_bad += len(bad_files)
        good_samples, bad_samples = sample_files(
            good_files, bad_files, int(goodjs_dist * len(good_files)), int(badjs_dist * len(bad_files))
        )
        all_good_samples.extend(good_samples)
        all_bad_samples.extend(bad_samples)

    print("Total good files : ", total_good)
    print("Total bad files : ", total_bad)
    print("# of good samples: ", len(all_good_samples))
    print("# of bad samples : ", len(all_bad_samples))

    all_files = all_good_samples + all_bad_samples
    labels = ["goodjs"] * len(all_good_samples) + ["badjs"] * len(all_bad_samples)

    # Hold out (1 - train_size) of the sample, then split it evenly into test and validation below
    X_train, X_test, y_train, y_test = train_test_split(all_files, labels, test_size=1 - args.train_size, random_state=42)
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

    print("Train size : ", len(X_train))
    print("Test size : ", len(X_test))
    print("Val size : ", len(X_val))

    train = pd.DataFrame({"file": X_train, "label": y_train})
    test = pd.DataFrame({"file": X_test, "label": y_test})
    val = pd.DataFrame({"file": X_val, "label": y_val})

    train_path = os.path.join(output_dir, TRAIN_SET)
    test_path = os.path.join(output_dir, TEST_SET)
    val_path = os.path.join(output_dir, VAL_SET)

    train.to_csv(train_path, index=False)
    test.to_csv(test_path, index=False)
    val.to_csv(val_path, index=False)
    print(f"Train set size : {len(train)}")
    print(f"Test set size : {len(test)}")
    print(f"Val set size : {len(val)}")

    print(f"Output : [{train_path}, {test_path}, {val_path}]")


if __name__ == "__main__":
    main()
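The script writes train_set.csv, test_set.csv, and valid_set.csv into the -o directory, each with two columns, file and label, splitting the sampled files roughly 80/10/10 into train/test/validation at the default --train-size of 0.8. A usage sketch for loading an output split back for feature extraction; the data/split output directory and the badjs -> 1 encoding are illustrative assumptions, not part of the commit:

# e.g. after: python scripts/split_train_test_from_conf.py -c confs/input.yaml -o data/split
from pathlib import Path

import pandas as pd

train = pd.read_csv("data/split/train_set.csv")  # columns: file, label (written by the script above)
texts = [Path(p).read_text(encoding="utf-8", errors="ignore") for p in train["file"]]
labels = (train["label"] == "badjs").astype(int)  # illustrative encoding: badjs -> 1, goodjs -> 0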
scripts/utils/count_by_dir.sh: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
#!/bin/bash

# Define the directory you want to analyze
directory_to_analyze="data/all/"

# Use find to locate all subdirectories
subdirs=($(find "$directory_to_analyze" -type d))

# Iterate through subdirectories and count files
for subdir in "${subdirs[@]}"; do
    file_count=$(find "$subdir" -type f | wc -l)
    echo "$subdir: $file_count files"
done
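Run from the repository root (bash scripts/utils/count_by_dir.sh), this prints a recursive file count for every directory under data/all/: the data/all/ line is the grand total and each source directory's count includes its goodjs/ and badjs/ children, which makes it easy to sanity-check the dist fractions in confs/input.yaml against actual corpus sizes.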
