Commit

nl-pl
truonghm committed Sep 21, 2023
1 parent 14b7c8e commit 4ae1d62
Showing 9 changed files with 135,458 additions and 28 deletions.
3 changes: 3 additions & 0 deletions Makefile
@@ -11,6 +11,9 @@ download:
count:
	./scripts/utils/count_data.sh

count2:
	./scripts/utils/count_by_dir.sh

## Generate tree of data folder
tree:
	./scripts/utils/gen_tree.sh data
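Note: the new count2 target mirrors the existing count target; it is invoked as make count2 and simply delegates the per-directory counting to scripts/utils/count_by_dir.sh.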
134,401 changes: 134,401 additions & 0 deletions js-code-detection.log

Large diffs are not rendered by default.

59 changes: 59 additions & 0 deletions lib/codebert-nl-pl/code/evaluator.py
@@ -0,0 +1,59 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
import sys, json, os
import numpy as np
import argparse
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score


def read_answers(filename):
    # Each line is "<example id>\t<gold label>"; build an id -> label map.
    answers = {}
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            answers[line.split('\t')[0]] = int(line.split('\t')[1])
    return answers


def read_predictions(filename):
    # Same tab-separated format as the answer file: "<example id>\t<predicted label>".
    predictions = {}
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            predictions[line.split('\t')[0]] = int(line.split('\t')[1])
    return predictions


def calculate_scores(answers, predictions):
    # Align predictions with the gold labels and compute binary classification metrics.
    y_trues, y_preds = [], []
    for key in answers:
        if key not in predictions:
            logging.error("Missing prediction for index {}.".format(key))
            sys.exit()
        y_trues.append(answers[key])
        y_preds.append(predictions[key])
    scores = {}
    scores['Precision'] = precision_score(y_trues, y_preds)
    scores['Recall'] = recall_score(y_trues, y_preds)
    scores['F1'] = f1_score(y_trues, y_preds)
    scores['Accuracy'] = accuracy_score(y_trues, y_preds)
    return scores


def main():
    parser = argparse.ArgumentParser(description='Evaluate leaderboard predictions for the NL-code-search-WebQuery dataset.')
    parser.add_argument('--answers_webquery', '-aw', help="filename of the labels on the WebQuery test set, in txt format.")
    parser.add_argument('--predictions_webquery', '-pw', help="filename of the leaderboard predictions on the WebQuery test set, in txt format.")
    args = parser.parse_args()

    answers = read_answers(args.answers_webquery)
    predictions = read_predictions(args.predictions_webquery)
    scores_webquery = calculate_scores(answers, predictions)
    print('NL-code-search-WebQuery on WebQuery test set:')
    print(scores_webquery)


if __name__ == '__main__':
    main()
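As a side note, here is a minimal sketch (not part of the commit) of how this evaluator can be exercised; the file names answers.txt and predictions.txt and their labels are made-up examples, each line being a tab-separated example id and 0/1 label:

# demo_eval.py -- illustrative only; paths and labels are hypothetical.
from evaluator import read_answers, read_predictions, calculate_scores

# Write tiny gold-label and prediction files in the expected "<id>\t<label>" format.
with open('answers.txt', 'w', encoding='utf-8') as f:
    f.write('q1\t1\nq2\t0\nq3\t1\n')
with open('predictions.txt', 'w', encoding='utf-8') as f:
    f.write('q1\t1\nq2\t1\nq3\t1\n')

answers = read_answers('answers.txt')
predictions = read_predictions('predictions.txt')
print(calculate_scores(answers, predictions))
# e.g. {'Precision': 0.666..., 'Recall': 1.0, 'F1': 0.8, 'Accuracy': 0.666...}

The same result can be obtained from the command line via python evaluator.py -aw answers.txt -pw predictions.txt.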
44 changes: 44 additions & 0 deletions lib/codebert-nl-pl/code/models.py
@@ -0,0 +1,44 @@
import torch
import torch.nn as nn
from torch.autograd import Variable
import copy
from transformers.modeling_bert import BertLayerNorm
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss, MSELoss
# from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
#                           BertConfig, BertForMaskedLM, BertTokenizer,
#                           GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
#                           OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
#                           RobertaConfig, RobertaModel, RobertaTokenizer,
#                           DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
from transformers.modeling_utils import PreTrainedModel


class Model(PreTrainedModel):
    def __init__(self, encoder, config, tokenizer, args):
        super(Model, self).__init__(config)
        self.encoder = encoder
        self.config = config
        self.tokenizer = tokenizer
        # Binary classifier over the concatenated [nl, code, nl-code, nl*code] representations.
        self.mlp = nn.Sequential(nn.Linear(768 * 4, 768),
                                 nn.Tanh(),
                                 nn.Linear(768, 1),
                                 nn.Sigmoid())
        self.loss_func = nn.BCELoss()
        self.args = args

    def forward(self, code_inputs, nl_inputs, labels, return_vec=False):
        bs = code_inputs.shape[0]
        # Encode code and NL in a single batch; token id 1 is the padding token and is masked out.
        inputs = torch.cat((code_inputs, nl_inputs), 0)
        outputs = self.encoder(inputs, attention_mask=inputs.ne(1))[1]
        code_vec = outputs[:bs]
        nl_vec = outputs[bs:]
        if return_vec:
            return code_vec, nl_vec

        logits = self.mlp(torch.cat((nl_vec, code_vec, nl_vec - code_vec, nl_vec * code_vec), 1))
        loss = self.loss_func(logits, labels.float())
        predictions = (logits > 0.5).int()  # (batch_size, 1)
        return loss, predictions
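For orientation, a minimal sketch (not part of the commit) of how Model could be instantiated and called, assuming microsoft/codebert-base as the shared encoder and an older transformers release compatible with the modeling_bert import above; the batch below is dummy data:

import torch
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer

from models import Model

config = RobertaConfig.from_pretrained('microsoft/codebert-base')
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')
encoder = RobertaModel.from_pretrained('microsoft/codebert-base')
model = Model(encoder, config, tokenizer, args=None)  # args is stored but unused in forward()

# Dummy batch: 2 examples, 16 token ids each (real ids come from the tokenizer;
# id 1 is RoBERTa's padding token, which forward() masks out).
code_inputs = torch.randint(2, tokenizer.vocab_size, (2, 16))
nl_inputs = torch.randint(2, tokenizer.vocab_size, (2, 16))
labels = torch.tensor([[1.0], [0.0]])  # one column, matching the (batch_size, 1) logits

loss, predictions = model(code_inputs, nl_inputs, labels)
print(loss.item(), predictions.squeeze(1).tolist())

In the real training code the inputs would come from tokenizing actual query/code pairs; this only checks that the shapes line up.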
