codebert bimodal training
truonghm committed Sep 21, 2023
1 parent 4ae1d62 commit fe4b294
Showing 8 changed files with 41,501 additions and 30 deletions.
File renamed without changes.
@@ -1,17 +1,10 @@
+import copy
+
 import torch
 import torch.nn as nn
-import torch
-from torch.autograd import Variable
-import copy
-from transformers.modeling_bert import BertLayerNorm
 import torch.nn.functional as F
+from torch.autograd import Variable
 from torch.nn import CrossEntropyLoss, MSELoss
-# from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
-#                           BertConfig, BertForMaskedLM, BertTokenizer,
-#                           GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
-#                           OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
-#                           RobertaConfig, RobertaModel, RobertaTokenizer,
-#                           DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
 from transformers.modeling_utils import PreTrainedModel


@@ -38,6 +31,7 @@ def forward(self, code_inputs, nl_inputs, labels, return_vec=False):
             return code_vec, nl_vec
 
         logits = self.mlp(torch.cat((nl_vec, code_vec, nl_vec-code_vec, nl_vec*code_vec), 1))
+        logits = logits.squeeze(-1)
         loss = self.loss_func(logits, labels.float())
         predictions = (logits > 0.5).int()  # (Batch, )
         return loss, predictions
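As a side note, a minimal shape sketch of what this head computes: the MLP sees the concatenation of four vectors and emits one logit per example, so without the squeeze the output is (batch, 1) while the float labels are (batch,). The hidden size of 768 and the BCE-with-logits loss below are illustrative assumptions, not taken from this commit.

import torch
import torch.nn as nn

# Sketch only: stand-ins for self.mlp and the concatenated pair features.
batch, hidden = 4, 768
mlp = nn.Linear(4 * hidden, 1)
features = torch.randn(batch, 4 * hidden)  # cat((nl, code, nl-code, nl*code), dim 1)
logits = mlp(features).squeeze(-1)         # (batch, 1) -> (batch,)
labels = torch.randint(0, 2, (batch,))
loss = nn.functional.binary_cross_entropy_with_logits(logits, labels.float())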
45 changes: 45 additions & 0 deletions lib/codebert-bimodal/preprocess.py
@@ -0,0 +1,45 @@
+import random
+
+import pandas as pd
+
+random.seed(123456)
+
+def create_doc(js):
+    malicious_doc = "javascript perform malicious actions to trick users, steal data from users, \
+        or otherwise cause harm."
+    benign_doc = "javascript perform normal, non-harmful actions"
+
+    label = js["label"]
+    # choose randomly between malicious and benign doc
+    if random.random() < 0.5:
+        doc = malicious_doc
+        new_label = 1 if label == 1 else 0
+    else:
+        doc = benign_doc
+        new_label = 1 if label == 0 else 0
+
+    # js["label"] = new_label
+    # js["doc"] = doc
+
+    # return js
+    return doc, new_label
+
+def modify_dataset(file_path, type:str):
+    with open(file_path, "r") as f:
+        # convert jsonl file to pandas
+        df = pd.read_json(f, lines=True)
+
+    df[["doc", "label"]] = df.apply(create_doc, axis=1, result_type="expand")
+    df["idx"] = type + "_" + df.index.astype(str)
+    new_path = file_path.replace(".jsonl", "_new.jsonl")
+    with open(new_path, "w") as f:
+        f.write(df.to_json(orient='records', lines=True, force_ascii=False))
+
+
+if __name__ == "__main__":
+    print("modifying test set")
+    modify_dataset("data/exp/test_set.jsonl", "test")
+    print("modifying valid set")
+    modify_dataset("data/exp/valid_set.jsonl", "valid")
+    print("modifying train set")
+    modify_dataset("data/exp/train_set.jsonl", "train")
@@ -22,25 +22,29 @@
 import os
 import random
 
 
 import numpy as np
 import torch
-from torch.utils.data import DataLoader, SequentialSampler, RandomSampler, TensorDataset
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 
 try:
     from torch.utils.tensorboard import SummaryWriter
 except:
     from tensorboardX import SummaryWriter
-from tqdm import tqdm, trange
-import multiprocessing
-
-from transformers import (WEIGHTS_NAME, get_linear_schedule_with_warmup, AdamW,
-                          RobertaConfig,
-                          RobertaModel,
-                          RobertaTokenizer)
-from model import Model
+from tqdm import tqdm, trange
+from transformers import (
+    WEIGHTS_NAME,
+    AdamW,
+    RobertaConfig,
+    RobertaModel,
+    RobertaTokenizer,
+    get_linear_schedule_with_warmup,
+)
+from utils import TextDataset, acc_and_f1
 
-from utils import acc_and_f1, TextDataset
-import multiprocessing
+from models import Model
+
 cpu_cont = multiprocessing.cpu_count()
 
 logger = logging.getLogger(__name__)
19 changes: 19 additions & 0 deletions lib/codebert-bimodal/train.sh
@@ -0,0 +1,19 @@
+python lib/codebert-bimodal/run_classifier.py \
+    --model_type roberta \
+    --do_train \
+    --do_eval \
+    --eval_all_checkpoints \
+    --train_file train_set_new.jsonl \
+    --dev_file valid_set_new.jsonl \
+    --max_seq_length 200 \
+    --per_gpu_train_batch_size 16 \
+    --per_gpu_eval_batch_size 16 \
+    --learning_rate 1e-5 \
+    --num_train_epochs 20 \
+    --gradient_accumulation_steps 1 \
+    --warmup_steps 1000 \
+    --evaluate_during_training \
+    --data_dir ./data/exp \
+    --output_dir ./models \
+    --encoder_name_or_path microsoft/codebert-base \
+    --seed 123456 2>&1 | tee train.log
19 changes: 11 additions & 8 deletions lib/codebert-nl-pl/code/utils.py → lib/codebert-bimodal/utils.py
@@ -77,14 +77,17 @@ def __init__(self, tokenizer, args, file_path=None, type=None):
         # json file: dict: idx, query, doc, code
         self.examples = []
         self.type = type
-        data=[]
-        with open(file_path, 'r') as f:
-            data = json.load(f)
-        if self.type == 'test':
-            for js in data:
-                js['label'] = 0
-        for js in data:
-            self.examples.append(convert_examples_to_features(js, tokenizer, args))
+        with open(file_path) as f:
+            for line in f:
+                js = json.loads(line.strip())
+                self.examples.append(convert_examples_to_features(js, tokenizer, args))
+        # with open(file_path, 'r') as f:
+        #     data = json.load(f)
+        #     if self.type == 'test':
+        #         for js in data:
+        #             js['label'] = 0
+        #     for js in data:
+        #         self.examples.append(convert_examples_to_features(js, tokenizer, args))
         if 'train' in file_path:
             for idx, example in enumerate(self.examples[:3]):
                 logger.info("*** Example ***")
4 changes: 2 additions & 2 deletions lib/codebert/train.sh
@@ -6,8 +6,8 @@ PYTHONPATH=$(shell pwd) python run.py \
     --train_data_file=../data/exp/train_set.jsonl \
     --eval_data_file=../data/exp/valid_set.jsonl \
     --test_data_file=../data/exp/test_set.jsonl \
-    --num_train_epochs 20 \
-    --block_size 256 \
+    --num_train_epochs 5 \
+    --block_size 512 \
     --train_batch_size 8 \
     --eval_batch_size 16 \
     --learning_rate 2e-5 \