codebert result
truonghm committed Sep 17, 2023
1 parent 3d41288 commit c704046
Showing 28 changed files with 4,924 additions and 605 deletions.
17 changes: 15 additions & 2 deletions Makefile
@@ -18,14 +18,27 @@ tree:
## Create conda env (python 3.10) using environment.yml
env:
	source $(CONDA_HOME_PATH)/bin/activate; conda create -p $(CONDA_ENV_PATH) --no-default-packages --no-deps python=3.10 -y; conda env update -p $(CONDA_ENV_PATH) --file environment.yml
	touch .conda/.gitignore
	echo "*" > .conda/.gitignore

bootstrap:
	./scripts/utils/vastai_bootstrap.sh
	source ./scripts/utils/vastai_bootstrap.sh
## Remove old conda env and create a new one
env-reset:
	rm -rf $(CONDA_ENV_PATH)
	make env

split:
	python scripts/split_train_test.py --input=data/all/kaggle1,data/all/misc2,data/all/packt --output=data/exp --sample-size=0.2 --train-size=0.8
	PYTHONPATH=$(shell pwd) python scripts/create_train_input.py

tokenize:
	PYTHONPATH=$(shell pwd) python scripts/tokenize_corpus.py --input=data/exp/train_set.csv --output=data/exp
	PYTHONPATH=$(shell pwd) python scripts/tokenize_corpus.py --input=data/exp/test_set.csv --output=data/exp

fasttext:
	PYTHONPATH=$(shell pwd) python scripts/build_fasttext_model.py --input=data/exp/train_set_token_types_corpus.txt --model-dir=models/fasttext_embeddings.bin --no-hierarchical-softmax

PATH_TO_CHECK=./lib/* ./crawler/*
## Format files using black, using pre-commit hooks
format:
@@ -47,7 +60,7 @@ check-all:

## crawl urls from the kaggle dataset
crawl:
	export PYTHONPATH=$(shell pwd) && python scripts/crawler/crawl_kaggle_dataset.py --skip=108201 --limit=700000 --input=data/malicious_phish.csv --output=data/all/kaggle1 --super_label=goodjs
	export PYTHONPATH=$(shell pwd) && python scripts/utils/crawl_kaggle_dataset.py --skip=108201 --limit=700000 --input=data/malicious_phish.csv --output=data/all/kaggle1 --super_label=goodjs

## render report
render:
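The new split, tokenize, and fasttext targets above chain into a small preprocessing pipeline. A possible end-to-end invocation (a sketch, assuming the crawled corpora under data/all/ are already in place) would be:

make split      # sample the crawled corpora and write train/test CSVs under data/exp
make tokenize   # tokenize both splits into *_token_types_corpus.txt files under data/exp
make fasttext   # train fastText embeddings on the tokenized training corpus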
Empty file added README.md
Empty file.
98 changes: 68 additions & 30 deletions conda-linux-64.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions environment.yml
@@ -8,6 +8,8 @@ dependencies:
  - mamba
  - pip
  - pytorch::pytorch=2.0.1
  - pytorch::torchtext
  - cudatoolkit=11.8.0

# Non-standard section listing target platforms for conda-lock:
platforms:
16 changes: 16 additions & 0 deletions lib/codebert/inference.sh
@@ -0,0 +1,16 @@
python run.py \
--output_dir=./saved_models \
--tokenizer_name=microsoft/codebert-base \
--model_name_or_path=microsoft/codebert-base \
--do_eval \
--do_test \
--train_data_file=../dataset/train.jsonl \
--eval_data_file=../dataset/valid.jsonl \
--test_data_file=../dataset/test.jsonl \
--num_train_epochs 1 \
--block_size 64 \
--train_batch_size 8 \
--eval_batch_size 16 \
--learning_rate 2e-5 \
--max_grad_norm 1.0 \
--seed 123456 2>&1 | tee test.log
28 changes: 28 additions & 0 deletions lib/codebert/model.py
@@ -0,0 +1,28 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn import CrossEntropyLoss, MSELoss


class Model(nn.Module):
    """Thin wrapper around a sequence-classification encoder (e.g. CodeBERT/RoBERTa)."""

    def __init__(self, encoder, config, tokenizer, args):
        super(Model, self).__init__()
        self.encoder = encoder
        self.config = config
        self.tokenizer = tokenizer
        self.args = args

    def forward(self, input_ids=None, labels=None):
        # RoBERTa-style tokenizers use 1 as the pad token id, so input_ids.ne(1)
        # masks out padding positions.
        logits = self.encoder(input_ids, attention_mask=input_ids.ne(1))[0]
        prob = torch.softmax(logits, -1)
        if labels is not None:
            # Training/evaluation: return cross-entropy loss along with class probabilities.
            loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
            loss = loss_fct(logits, labels)
            return loss, prob
        else:
            # Inference: return class probabilities only.
            return prob
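A minimal sketch of how this wrapper might be instantiated and called. It assumes the encoder is a Hugging Face RobertaForSequenceClassification loaded from microsoft/codebert-base (as in the upstream CodeBERT examples); the num_labels=2 setting, the argparse.Namespace, the import path, and the example snippet are assumptions for illustration, not taken from this commit.

import argparse

import torch
from transformers import RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer

from model import Model  # assumes this is run from lib/codebert

# Hypothetical minimal args object; run.py would normally build this via argparse.
args = argparse.Namespace(block_size=64)

# num_labels=2 assumes a binary benign/malicious (goodjs/badjs) classification.
config = RobertaConfig.from_pretrained("microsoft/codebert-base", num_labels=2)
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
encoder = RobertaForSequenceClassification.from_pretrained("microsoft/codebert-base", config=config)
model = Model(encoder, config, tokenizer, args)

# Pad to block_size; RoBERTa's pad token id is 1, which is why forward()
# builds the attention mask as input_ids.ne(1).
ids = tokenizer.encode("document.write('hi')", max_length=args.block_size,
                       truncation=True, padding="max_length")
with torch.no_grad():
    prob = model(torch.tensor([ids]))  # shape (1, 2): class probabilities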