Commit

nl-pl
truonghm committed Sep 21, 2023
1 parent 14b7c8e commit 4ae1d62
Showing 9 changed files with 135,458 additions and 28 deletions.
3 changes: 3 additions & 0 deletions Makefile
@@ -11,6 +11,9 @@ download:
count:
	./scripts/utils/count_data.sh

count2:
	./scripts/utils/count_by_dir.sh

## Generate tree of data folder
tree:
	./scripts/utils/gen_tree.sh data
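Note: the new count2 target mirrors the existing count target; it is invoked as make count2 and simply delegates the per-directory counting to scripts/utils/count_by_dir.sh.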
134,401 changes: 134,401 additions & 0 deletions js-code-detection.log

Large diffs are not rendered by default.

59 changes: 59 additions & 0 deletions lib/codebert-nl-pl/code/evaluator.py
@@ -0,0 +1,59 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
import sys, json, os
import numpy as np
import argparse
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score


def read_answers(filename):
    # Each line is "<example id>\t<gold label>"; build an id -> label map.
    answers = {}
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            answers[line.split('\t')[0]] = int(line.split('\t')[1])
    return answers


def read_predictions(filename):
    # Same tab-separated format as the answer file: "<example id>\t<predicted label>".
    predictions = {}
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            predictions[line.split('\t')[0]] = int(line.split('\t')[1])
    return predictions


def calculate_scores(answers, predictions):
    # Align predictions with the gold labels and compute binary classification metrics.
    y_trues, y_preds = [], []
    for key in answers:
        if key not in predictions:
            logging.error("Missing prediction for index {}.".format(key))
            sys.exit()
        y_trues.append(answers[key])
        y_preds.append(predictions[key])
    scores = {}
    scores['Precision'] = precision_score(y_trues, y_preds)
    scores['Recall'] = recall_score(y_trues, y_preds)
    scores['F1'] = f1_score(y_trues, y_preds)
    scores['Accuracy'] = accuracy_score(y_trues, y_preds)
    return scores


def main():
    parser = argparse.ArgumentParser(description='Evaluate leaderboard predictions for the NL-code-search-WebQuery dataset.')
    parser.add_argument('--answers_webquery', '-aw', help="filename of the labels on the WebQuery test set, in txt format.")
    parser.add_argument('--predictions_webquery', '-pw', help="filename of the leaderboard predictions on the WebQuery test set, in txt format.")
    args = parser.parse_args()

    answers = read_answers(args.answers_webquery)
    predictions = read_predictions(args.predictions_webquery)
    scores_webquery = calculate_scores(answers, predictions)
    print('NL-code-search-WebQuery on WebQuery test set:')
    print(scores_webquery)


if __name__ == '__main__':
    main()
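As a side note, here is a minimal sketch (not part of the commit) of how this evaluator can be exercised; the file names answers.txt and predictions.txt and their labels are made-up examples, each line being a tab-separated example id and 0/1 label:

# demo_eval.py -- illustrative only; paths and labels are hypothetical.
from evaluator import read_answers, read_predictions, calculate_scores

# Write tiny gold-label and prediction files in the expected "<id>\t<label>" format.
with open('answers.txt', 'w', encoding='utf-8') as f:
    f.write('q1\t1\nq2\t0\nq3\t1\n')
with open('predictions.txt', 'w', encoding='utf-8') as f:
    f.write('q1\t1\nq2\t1\nq3\t1\n')

answers = read_answers('answers.txt')
predictions = read_predictions('predictions.txt')
print(calculate_scores(answers, predictions))
# e.g. {'Precision': 0.666..., 'Recall': 1.0, 'F1': 0.8, 'Accuracy': 0.666...}

The same result can be obtained from the command line via python evaluator.py -aw answers.txt -pw predictions.txt.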
44 changes: 44 additions & 0 deletions lib/codebert-nl-pl/code/models.py
@@ -0,0 +1,44 @@
import torch
import torch.nn as nn
from torch.autograd import Variable
import copy
from transformers.modeling_bert import BertLayerNorm
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss, MSELoss
# from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
#                           BertConfig, BertForMaskedLM, BertTokenizer,
#                           GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
#                           OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
#                           RobertaConfig, RobertaModel, RobertaTokenizer,
#                           DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
from transformers.modeling_utils import PreTrainedModel


class Model(PreTrainedModel):
    def __init__(self, encoder, config, tokenizer, args):
        super(Model, self).__init__(config)
        self.encoder = encoder
        self.config = config
        self.tokenizer = tokenizer
        # Binary classifier over the concatenated [nl, code, nl-code, nl*code] representations.
        self.mlp = nn.Sequential(nn.Linear(768 * 4, 768),
                                 nn.Tanh(),
                                 nn.Linear(768, 1),
                                 nn.Sigmoid())
        self.loss_func = nn.BCELoss()
        self.args = args

    def forward(self, code_inputs, nl_inputs, labels, return_vec=False):
        bs = code_inputs.shape[0]
        # Encode code and NL in a single batch; token id 1 is the padding token and is masked out.
        inputs = torch.cat((code_inputs, nl_inputs), 0)
        outputs = self.encoder(inputs, attention_mask=inputs.ne(1))[1]
        code_vec = outputs[:bs]
        nl_vec = outputs[bs:]
        if return_vec:
            return code_vec, nl_vec

        logits = self.mlp(torch.cat((nl_vec, code_vec, nl_vec - code_vec, nl_vec * code_vec), 1))
        loss = self.loss_func(logits, labels.float())
        predictions = (logits > 0.5).int()  # (batch_size, 1)
        return loss, predictions
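For orientation, a minimal sketch (not part of the commit) of how Model could be instantiated and called, assuming microsoft/codebert-base as the shared encoder and an older transformers release compatible with the modeling_bert import above; the batch below is dummy data:

import torch
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer

from models import Model

config = RobertaConfig.from_pretrained('microsoft/codebert-base')
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')
encoder = RobertaModel.from_pretrained('microsoft/codebert-base')
model = Model(encoder, config, tokenizer, args=None)  # args is stored but unused in forward()

# Dummy batch: 2 examples, 16 token ids each (real ids come from the tokenizer;
# id 1 is RoBERTa's padding token, which forward() masks out).
code_inputs = torch.randint(2, tokenizer.vocab_size, (2, 16))
nl_inputs = torch.randint(2, tokenizer.vocab_size, (2, 16))
labels = torch.tensor([[1.0], [0.0]])  # one column, matching the (batch_size, 1) logits

loss, predictions = model(code_inputs, nl_inputs, labels)
print(loss.item(), predictions.squeeze(1).tolist())

In the real training code the inputs would come from tokenizing actual query/code pairs; this only checks that the shapes line up.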
