codebert bimodal training
truonghm committed Sep 21, 2023
1 parent 4ae1d62 commit fe4b294
Showing 8 changed files with 41,501 additions and 30 deletions.
File renamed without changes.
@@ -1,17 +1,10 @@
+import copy
+
 import torch
 import torch.nn as nn
-import torch
-from torch.autograd import Variable
-import copy
-from transformers.modeling_bert import BertLayerNorm
 import torch.nn.functional as F
+from torch.autograd import Variable
 from torch.nn import CrossEntropyLoss, MSELoss
-# from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
-#                           BertConfig, BertForMaskedLM, BertTokenizer,
-#                           GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
-#                           OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
-#                           RobertaConfig, RobertaModel, RobertaTokenizer,
-#                           DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
 from transformers.modeling_utils import PreTrainedModel


@@ -38,6 +31,7 @@ def forward(self, code_inputs, nl_inputs, labels, return_vec=False):
             return code_vec, nl_vec
 
         logits = self.mlp(torch.cat((nl_vec, code_vec, nl_vec-code_vec, nl_vec*code_vec), 1))
+        logits = logits.squeeze(-1)
         loss = self.loss_func(logits, labels.float())
         predictions = (logits > 0.5).int()  # (Batch, )
         return loss, predictions
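As a side note, a minimal shape sketch of what this head computes: the MLP sees the concatenation of four vectors and emits one logit per example, so without the squeeze the output is (batch, 1) while the float labels are (batch,). The hidden size of 768 and the BCE-with-logits loss below are illustrative assumptions, not taken from this commit.

import torch
import torch.nn as nn

# Sketch only: stand-ins for self.mlp and the concatenated pair features.
batch, hidden = 4, 768
mlp = nn.Linear(4 * hidden, 1)
features = torch.randn(batch, 4 * hidden)  # cat((nl, code, nl-code, nl*code), dim 1)
logits = mlp(features).squeeze(-1)         # (batch, 1) -> (batch,)
labels = torch.randint(0, 2, (batch,))
loss = nn.functional.binary_cross_entropy_with_logits(logits, labels.float())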
45 changes: 45 additions & 0 deletions lib/codebert-bimodal/preprocess.py
@@ -0,0 +1,45 @@
+import random
+
+import pandas as pd
+
+random.seed(123456)
+
+def create_doc(js):
+    malicious_doc = "javascript perform malicious actions to trick users, steal data from users, \
+        or otherwise cause harm."
+    benign_doc = "javascript perform normal, non-harmful actions"
+
+    label = js["label"]
+    # choose randomly between malicious and benign doc
+    if random.random() < 0.5:
+        doc = malicious_doc
+        new_label = 1 if label == 1 else 0
+    else:
+        doc = benign_doc
+        new_label = 1 if label == 0 else 0
+
+    # js["label"] = new_label
+    # js["doc"] = doc
+
+    # return js
+    return doc, new_label
+
+def modify_dataset(file_path, type:str):
+    with open(file_path, "r") as f:
+        # convert jsonl file to pandas
+        df = pd.read_json(f, lines=True)
+
+    df[["doc", "label"]] = df.apply(create_doc, axis=1, result_type="expand")
+    df["idx"] = type + "_" + df.index.astype(str)
+    new_path = file_path.replace(".jsonl", "_new.jsonl")
+    with open(new_path, "w") as f:
+        f.write(df.to_json(orient='records', lines=True, force_ascii=False))
+
+
+if __name__ == "__main__":
+    print("modifying test set")
+    modify_dataset("data/exp/test_set.jsonl", "test")
+    print("modifying valid set")
+    modify_dataset("data/exp/valid_set.jsonl", "valid")
+    print("modifying train set")
+    modify_dataset("data/exp/train_set.jsonl", "train")
@@ -22,25 +22,29 @@
 import os
 import random
 
 
 import numpy as np
 import torch
-from torch.utils.data import DataLoader, SequentialSampler, RandomSampler, TensorDataset
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 
 try:
     from torch.utils.tensorboard import SummaryWriter
 except:
     from tensorboardX import SummaryWriter
-from tqdm import tqdm, trange
-import multiprocessing
-
-from transformers import (WEIGHTS_NAME, get_linear_schedule_with_warmup, AdamW,
-                          RobertaConfig,
-                          RobertaModel,
-                          RobertaTokenizer)
-from model import Model
+from tqdm import tqdm, trange
+from transformers import (
+    WEIGHTS_NAME,
+    AdamW,
+    RobertaConfig,
+    RobertaModel,
+    RobertaTokenizer,
+    get_linear_schedule_with_warmup,
+)
+from utils import TextDataset, acc_and_f1
 
-from utils import acc_and_f1, TextDataset
-import multiprocessing
+from models import Model
+
 cpu_cont = multiprocessing.cpu_count()
 
 logger = logging.getLogger(__name__)
19 changes: 19 additions & 0 deletions lib/codebert-bimodal/train.sh
@@ -0,0 +1,19 @@
+python lib/codebert-bimodal/run_classifier.py \
+    --model_type roberta \
+    --do_train \
+    --do_eval \
+    --eval_all_checkpoints \
+    --train_file train_set_new.jsonl \
+    --dev_file valid_set_new.jsonl \
+    --max_seq_length 200 \
+    --per_gpu_train_batch_size 16 \
+    --per_gpu_eval_batch_size 16 \
+    --learning_rate 1e-5 \
+    --num_train_epochs 20 \
+    --gradient_accumulation_steps 1 \
+    --warmup_steps 1000 \
+    --evaluate_during_training \
+    --data_dir ./data/exp \
+    --output_dir ./models \
+    --encoder_name_or_path microsoft/codebert-base \
+    --seed 123456 2>&1 | tee train.log
19 changes: 11 additions & 8 deletions lib/codebert-nl-pl/code/utils.py → lib/codebert-bimodal/utils.py
@@ -77,14 +77,17 @@ def __init__(self, tokenizer, args, file_path=None, type=None):
         # json file: dict: idx, query, doc, code
         self.examples = []
         self.type = type
-        data=[]
-        with open(file_path, 'r') as f:
-            data = json.load(f)
-        if self.type == 'test':
-            for js in data:
-                js['label'] = 0
-        for js in data:
-            self.examples.append(convert_examples_to_features(js, tokenizer, args))
+        with open(file_path) as f:
+            for line in f:
+                js = json.loads(line.strip())
+                self.examples.append(convert_examples_to_features(js, tokenizer, args))
+        # with open(file_path, 'r') as f:
+        #     data = json.load(f)
+        #     if self.type == 'test':
+        #         for js in data:
+        #             js['label'] = 0
+        #     for js in data:
+        #         self.examples.append(convert_examples_to_features(js, tokenizer, args))
         if 'train' in file_path:
             for idx, example in enumerate(self.examples[:3]):
                 logger.info("*** Example ***")
4 changes: 2 additions & 2 deletions lib/codebert/train.sh
@@ -6,8 +6,8 @@ PYTHONPATH=$(shell pwd) python run.py \
     --train_data_file=../data/exp/train_set.jsonl \
     --eval_data_file=../data/exp/valid_set.jsonl \
     --test_data_file=../data/exp/test_set.jsonl \
-    --num_train_epochs 20 \
-    --block_size 256 \
+    --num_train_epochs 5 \
+    --block_size 512 \
     --train_batch_size 8 \
     --eval_batch_size 16 \
     --learning_rate 2e-5 \