Commit

update
truonghm committed Sep 17, 2023
1 parent c704046 commit 09a546c
Showing 4 changed files with 173 additions and 184 deletions.
confs/input.yaml: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
input:
  kaggle1:
    path: data/all/kaggle1
    classes:
      goodjs:
        dist: 0.1
  kaggle2:
    path: data/all/kaggle2
    classes:
      badjs:
        dist: 0.1
  misc1:
    path: data/all/misc1
    classes:
      goodjs:
        dist: 0.7
      badjs:
        dist: 0.6
  misc2:
    path: data/all/misc2
    classes:
      goodjs:
        dist: 0.5
      badjs:
        dist: 0.6
  misc3:
    path: data/all/misc3
    classes:
      badjs:
        dist: 0.1
  packt:
    path: data/all/packt
    classes:
      goodjs:
        dist: 0.7
      badjs:
        dist: 0.6
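Each source entry above pairs a directory with per-class dist values; scripts/split_train_test_from_conf.py (added below) treats dist as the fraction of that class's files to sample from that source, e.g. 10% of the goodjs files under data/all/kaggle1. A minimal sketch of reading the config and printing those fractions, assuming PyYAML is installed (the script below already imports it):

import yaml

with open("confs/input.yaml") as f:
    config = yaml.safe_load(f)

for name, source in config["input"].items():
    for cls, settings in source.get("classes", {}).items():
        print(f"{name}: sample {float(settings['dist']):.0%} of {cls} from {source['path']}")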
notebooks/test3.ipynb: 0 additions & 184 deletions
@@ -732,102 +732,6 @@
"print(metrics.classification_report(y_test, y_pred, target_names=[\"non-obfuscted\", \"obfuscated\"]))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1, Loss: 0.3822276294231415\n",
"Epoch 2, Loss: 0.38222718238830566\n",
"Epoch 3, Loss: 0.41670992970466614\n",
"Epoch 4, Loss: 0.3822271525859833\n",
"Epoch 5, Loss: 0.4856754243373871\n",
"Epoch 6, Loss: 0.3477444052696228\n",
"Epoch 7, Loss: 0.3132616877555847\n",
"Epoch 8, Loss: 0.38222718238830566\n",
"Epoch 9, Loss: 0.38222718238830566\n",
"Epoch 10, Loss: 0.4511926770210266\n",
"Accuracy: 91.1685994647636%\n"
]
}
],
"source": [
"# 1. Import PyTorch Libraries\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"\n",
"# 2. Data Preprocessing\n",
"# Scaling the features\n",
"scaler = MinMaxScaler()\n",
"X_train_scaled = scaler.fit_transform(X_train)\n",
"X_test_scaled = scaler.transform(X_test)\n",
"\n",
"# 3. Define PyTorch Dataset and Dataloader\n",
"class JSDataset(Dataset):\n",
" def __init__(self, features, labels):\n",
" self.features = torch.tensor(features, dtype=torch.float32)\n",
" self.labels = torch.tensor(labels.values, dtype=torch.long)\n",
"\n",
" def __len__(self):\n",
" return len(self.labels)\n",
"\n",
" def __getitem__(self, idx):\n",
" return self.features[idx], self.labels[idx]\n",
"\n",
"train_dataset = JSDataset(X_train_scaled, y_train)\n",
"test_dataset = JSDataset(X_test_scaled, y_test)\n",
"\n",
"train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)\n",
"test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)\n",
"\n",
"# 4. Define Neural Network Architecture\n",
"class JSClassifier(nn.Module):\n",
" def __init__(self, input_dim):\n",
" super(JSClassifier, self).__init__()\n",
" self.layer1 = nn.Linear(input_dim, 64)\n",
" self.layer2 = nn.Linear(64, 32)\n",
" self.layer3 = nn.Linear(32, 2) # Binary classification\n",
"\n",
" def forward(self, x):\n",
" x = torch.relu(self.layer1(x))\n",
" x = torch.relu(self.layer2(x))\n",
" x = torch.softmax(self.layer3(x), dim=1)\n",
" return x\n",
"\n",
"model = JSClassifier(X_train_scaled.shape[1])\n",
"criterion = nn.CrossEntropyLoss()\n",
"optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
"\n",
"# 5. Training the Model\n",
"for epoch in range(10): # 10 epochs\n",
" for batch_features, batch_labels in train_loader:\n",
" optimizer.zero_grad()\n",
" output = model(batch_features)\n",
" loss = criterion(output, batch_labels)\n",
" loss.backward()\n",
" optimizer.step()\n",
" print(f\"Epoch {epoch + 1}, Loss: {loss.item()}\")\n",
"\n",
"# 6. Evaluation\n",
"correct = 0\n",
"total = 0\n",
"with torch.no_grad():\n",
" for features, labels in test_loader:\n",
" outputs = model(features)\n",
" _, predicted = torch.max(outputs.data, 1)\n",
" total += labels.size(0)\n",
" correct += (predicted == labels).sum().item()\n",
"\n",
"print(f\"Accuracy: {100 * correct / total}%\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
@@ -872,94 +776,6 @@
"print(accuracy_score(y_test, y_test_pred))\n",
"print(confusion_matrix(y_test, y_test_pred))"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score, confusion_matrix\n",
"from torch.utils.data import DataLoader\n",
"import torch\n",
"import torch.nn as nn\n",
"\n",
"# Preprocess the text data\n",
"vectorizer = HashingVectorizer(input=\"content\", ngram_range=(1, 3))\n",
"tfidf_transformer = TfidfTransformer(use_idf=True)\n",
"X = vectorizer.transform(df.js)\n",
"X_tfidf = tfidf_transformer.fit_transform(X)\n",
"\n",
"# Split the data\n",
"X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df.label, test_size=0.33, random_state=42)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "sparse array length is ambiguous; use getnnz() or shape[0]",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[42], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39m# Create PyTorch Dataset and DataLoader\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m train_dataset \u001b[39m=\u001b[39m JSDataset(X_train, y_train)\n\u001b[1;32m 3\u001b[0m test_dataset \u001b[39m=\u001b[39m JSDataset(X_test, y_test)\n\u001b[1;32m 4\u001b[0m train_loader \u001b[39m=\u001b[39m DataLoader(train_dataset, batch_size\u001b[39m=\u001b[39m\u001b[39m64\u001b[39m, shuffle\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n",
"Cell \u001b[0;32mIn[37], line 17\u001b[0m, in \u001b[0;36mJSDataset.__init__\u001b[0;34m(self, features, labels)\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__init__\u001b[39m(\u001b[39mself\u001b[39m, features, labels):\n\u001b[0;32m---> 17\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfeatures \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39;49mtensor(features, dtype\u001b[39m=\u001b[39;49mtorch\u001b[39m.\u001b[39;49mfloat32)\n\u001b[1;32m 18\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlabels \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mtensor(labels\u001b[39m.\u001b[39mvalues, dtype\u001b[39m=\u001b[39mtorch\u001b[39m.\u001b[39mlong)\n",
"File \u001b[0;32m~/learning/m1-internship/malicious-code-detection/.conda/m1/lib/python3.10/site-packages/scipy/sparse/_base.py:340\u001b[0m, in \u001b[0;36m_spbase.__len__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 339\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__len__\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[0;32m--> 340\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mTypeError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39msparse array length is ambiguous; use getnnz()\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 341\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m or shape[0]\u001b[39m\u001b[39m\"\u001b[39m)\n",
"\u001b[0;31mTypeError\u001b[0m: sparse array length is ambiguous; use getnnz() or shape[0]"
]
}
],
"source": [
"# Create PyTorch Dataset and DataLoader\n",
"train_dataset = JSDataset(X_train, y_train)\n",
"test_dataset = JSDataset(X_test, y_test)\n",
"train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)\n",
"test_loader = DataLoader(test_dataset, batch_size=64)\n",
"\n",
"# Define the PyTorch Model\n",
"class Classifier(nn.Module):\n",
" def __init__(self, input_dim):\n",
" super(Classifier, self).__init__()\n",
" self.fc = nn.Linear(input_dim, 2)\n",
"\n",
" def forward(self, x):\n",
" return self.fc(x)\n",
"\n",
"# Training Loop\n",
"model = Classifier(X_train.shape[1])\n",
"criterion = nn.CrossEntropyLoss()\n",
"optimizer = torch.optim.Adam(model.parameters(), lr=0.001)\n",
"\n",
"for epoch in range(10): # number of epochs\n",
" for inputs, labels in train_loader:\n",
" # Forward pass\n",
" outputs = model(inputs)\n",
" loss = criterion(outputs, labels)\n",
"\n",
" # Backward pass and optimization\n",
" optimizer.zero_grad()\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
"# Evaluate\n",
"correct = 0\n",
"total = 0\n",
"with torch.no_grad():\n",
" for inputs, labels in test_loader:\n",
" outputs = model(inputs)\n",
" _, predicted = torch.max(outputs.data, 1)\n",
" total += labels.size(0)\n",
" correct += (predicted == labels).sum().item()\n",
"\n",
"accuracy = 100 * correct / total\n",
"print('Accuracy: {}'.format(accuracy))"
]
}
],
"metadata": {
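The TypeError captured in the removed cell above comes from passing the SciPy sparse matrix produced by HashingVectorizer/TfidfTransformer straight to torch.tensor inside JSDataset. Below is a minimal sketch of one workaround, keeping the matrix sparse and densifying a single row per item; this is an assumption about how the dataset could be rebuilt, not the notebook's original fix. (Separately, nn.CrossEntropyLoss applies log-softmax internally, so the explicit torch.softmax in the removed JSClassifier.forward is redundant.)

import torch
from torch.utils.data import Dataset

class SparseJSDataset(Dataset):
    # Wraps a SciPy CSR matrix plus a pandas Series of numeric labels, as in the removed cells.
    def __init__(self, features, labels):
        self.features = features  # keep the sparse matrix as-is
        self.labels = torch.tensor(labels.values, dtype=torch.long)

    def __len__(self):
        return self.features.shape[0]  # shape[0] instead of len(), which is ambiguous for sparse arrays

    def __getitem__(self, idx):
        row = self.features[idx].toarray().squeeze(0)  # densify one row at a time
        return torch.tensor(row, dtype=torch.float32), self.labels[idx]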
scripts/split_train_test_from_conf.py: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
import argparse
import os

import pandas as pd
import yaml
from sklearn.model_selection import train_test_split

TRAIN_SET = "train_set.csv"
TEST_SET = "test_set.csv"
VAL_SET = "valid_set.csv"


def get_files_from_subdir(dir_path):
    return [
        os.path.join(dir_path, fname) for fname in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, fname))
    ]


def traverse_path(path):
    good_files = []
    bad_files = []

    goodjs_dir = os.path.join(path, "goodjs")
    badjs_dir = os.path.join(path, "badjs")

    if not os.path.exists(goodjs_dir) and not os.path.exists(badjs_dir):
        print("No goodjs or badjs directory found in the path")

    if os.path.exists(goodjs_dir):
        goodjs_files = get_files_from_subdir(goodjs_dir)
        good_files.extend(goodjs_files)

    if os.path.exists(badjs_dir):
        badjs_files = get_files_from_subdir(badjs_dir)
        bad_files.extend(badjs_files)

    return good_files, bad_files


def sample_files(good_files, bad_files, good_sample_size, bad_sample_size):
    import random

    good_sample = random.sample(good_files, good_sample_size)
    bad_sample = random.sample(bad_files, bad_sample_size)

    return good_sample, bad_sample


def main():
    parser = argparse.ArgumentParser(description="Split text data into train and test sets")
    parser.add_argument("-c", "--config", help="Path to config", required=True, type=str)
    parser.add_argument(
        "-o", "--output", help="Output directory to contain the train and test set", required=True, type=str
    )
    parser.add_argument(
        "-ts", "--train-size", help="Size of the training set from sample (0.0 to 1.0)", type=float, default=0.8
    )

    args = parser.parse_args()

    path = args.config

    with open(path) as f:
        config = yaml.safe_load(f)
        input_conf = config["input"]

    output_dir = args.output

    total_good = 0
    total_bad = 0
    all_good_samples = []
    all_bad_samples = []
    for dir, conf in input_conf.items():
        dist_zero = {"dist": 0.0}
        dir_path = conf["path"]
        # print(dir_path)
        goodjs_dist = float(conf["classes"].get("goodjs", dist_zero)["dist"])
        badjs_dist = float(conf["classes"].get("badjs", dist_zero)["dist"])

        good_files, bad_files = traverse_path(dir_path)
        total_good += len(good_files)
        total_bad += len(bad_files)
        good_samples, bad_samples = sample_files(
            good_files, bad_files, int(goodjs_dist * len(good_files)), int(badjs_dist * len(bad_files))
        )
        all_good_samples.extend(good_samples)
        all_bad_samples.extend(bad_samples)

    print("Total good files : ", total_good)
    print("Total bad files : ", total_bad)
    print("# of good samples: ", len(all_good_samples))
    print("# of bad samples : ", len(all_bad_samples))

    all_files = all_good_samples + all_bad_samples
    labels = ["goodjs"] * len(all_good_samples) + ["badjs"] * len(all_bad_samples)

    # Hold out (1 - train_size) of the sample, then split it evenly into test and validation below
    X_train, X_test, y_train, y_test = train_test_split(all_files, labels, test_size=1 - args.train_size, random_state=42)
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

    print("Train size : ", len(X_train))
    print("Test size : ", len(X_test))
    print("Val size : ", len(X_val))

    train = pd.DataFrame({"file": X_train, "label": y_train})
    test = pd.DataFrame({"file": X_test, "label": y_test})
    val = pd.DataFrame({"file": X_val, "label": y_val})

    train_path = os.path.join(output_dir, TRAIN_SET)
    test_path = os.path.join(output_dir, TEST_SET)
    val_path = os.path.join(output_dir, VAL_SET)

    train.to_csv(train_path, index=False)
    test.to_csv(test_path, index=False)
    val.to_csv(val_path, index=False)
    print(f"Train set size : {len(train)}")
    print(f"Test set size : {len(test)}")
    print(f"Val set size : {len(val)}")

    print(f"Output : [{train_path}, {test_path}, {val_path}]")


if __name__ == "__main__":
    main()
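The script writes train_set.csv, test_set.csv, and valid_set.csv into the -o directory, each with two columns, file and label, splitting the sampled files roughly 80/10/10 into train/test/validation at the default --train-size of 0.8. A usage sketch for loading an output split back for feature extraction; the data/split output directory and the badjs -> 1 encoding are illustrative assumptions, not part of the commit:

# e.g. after: python scripts/split_train_test_from_conf.py -c confs/input.yaml -o data/split
from pathlib import Path

import pandas as pd

train = pd.read_csv("data/split/train_set.csv")  # columns: file, label (written by the script above)
texts = [Path(p).read_text(encoding="utf-8", errors="ignore") for p in train["file"]]
labels = (train["label"] == "badjs").astype(int)  # illustrative encoding: badjs -> 1, goodjs -> 0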
scripts/utils/count_by_dir.sh: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
#!/bin/bash

# Define the directory you want to analyze
directory_to_analyze="data/all/"

# Use find to locate all subdirectories
subdirs=($(find "$directory_to_analyze" -type d))

# Iterate through subdirectories and count files
for subdir in "${subdirs[@]}"; do
    file_count=$(find "$subdir" -type f | wc -l)
    echo "$subdir: $file_count files"
done
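Run from the repository root (bash scripts/utils/count_by_dir.sh), this prints a recursive file count for every directory under data/all/: the data/all/ line is the grand total and each source directory's count includes its goodjs/ and badjs/ children, which makes it easy to sanity-check the dist fractions in confs/input.yaml against actual corpus sizes.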
