Skip to content

Commit

Permalink
reorganize
Browse files: browse the repository at this point in the history
  • Loading branch information
truonghm committed Sep 22, 2023
1 parent 78d0e1b commit fe89b78
Show file tree
Hide file tree
Showing 17 changed files with 25 additions and 2,068 deletions.
2,047 changes: 0 additions & 2,047 deletions logs/predictions_codebert_bimodal.txt

This file was deleted.

2 changes: 1 addition & 1 deletion scripts/build_fasttext_model.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import argparse

import fasttext
from lib.utils.logging import logger
from src.utils.logging import logger

if __name__ == "__main__":
argparser = argparse.ArgumentParser()
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
python run.py \
--output_dir=./saved_models \
python src/codebert-unimodal/run.py \
--output_dir=./models \
--tokenizer_name=microsoft/codebert-base \
--model_name_or_path=microsoft/codebert-base \
--do_eval \
--do_test \
--train_data_file=../dataset/train.jsonl \
--eval_data_file=../dataset/valid.jsonl \
--test_data_file=../dataset/test.jsonl \
--train_data_file=data/exp/train_set.jsonl \
--eval_data_file=data/exp/valid_set.jsonl \
--test_data_file=data/exp/test_set.jsonl \
--num_train_epochs 1 \
--block_size 64 \
--train_batch_size 8 \
--eval_batch_size 16 \
--learning_rate 2e-5 \
--max_grad_norm 1.0 \
--seed 123456 2>&1 | tee test.log
--seed 123456 2>&1 | tee test.log
2 changes: 1 addition & 1 deletion scripts/tokenize_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import esprima
import numpy as np
import pandas as pd
from lib.utils.logging import logger
from src.utils.logging import logger
from tqdm import tqdm

warnings.filterwarnings("ignore")
Expand Down
File renamed without changes.
12 changes: 6 additions & 6 deletions lib/codebert/train.sh → scripts/train_codebert_unimodal.sh
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
PYTHONPATH=$(shell pwd) python run.py \
python src/codebert-unimodal/run.py \
--output_dir=./models \
--tokenizer_name=microsoft/codebert-base \
--model_name_or_path=microsoft/codebert-base \
--do_train \
--train_data_file=../data/exp/train_set.jsonl \
--eval_data_file=../data/exp/valid_set.jsonl \
--test_data_file=../data/exp/test_set.jsonl \
--num_train_epochs 5 \
--train_data_file=data/exp/train_set.jsonl \
--eval_data_file=data/exp/valid_set.jsonl \
--test_data_file=data/exp/test_set.jsonl \
--num_train_epochs 20 \
--block_size 512 \
--train_batch_size 8 \
--train_batch_size 16 \
--eval_batch_size 16 \
--learning_rate 2e-5 \
--max_grad_norm 1.0 \
Expand Down
2 changes: 1 addition & 1 deletion scripts/utils/crawl_kaggle_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import pandas as pd
import pymongo
import requests
from lib.utils.logging import logger
from src.utils.logging import logger

TIMEOUT_SECS = 3
shutdown = False # Flag to signal workers to stop working
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
15 changes: 10 additions & 5 deletions lib/codebert/run.py → src/codebert-unimodal/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@

import numpy as np
import torch
from lib.codebert.model import Model
from model import Model
from torch.utils.data import DataLoader, Dataset, IterableDataset, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
Expand All @@ -58,7 +58,9 @@ def __init__(
input_tokens,
input_ids,
label,
idx=None,
):
self.idx = idx
self.input_tokens = input_tokens
self.input_ids = input_ids
self.label = label
Expand All @@ -72,7 +74,7 @@ def convert_examples_to_features(js, tokenizer, args):
source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
padding_length = args.block_size - len(source_ids)
source_ids += [tokenizer.pad_token_id] * padding_length
return InputFeatures(source_tokens, source_ids, js["label"])
return InputFeatures(source_tokens, source_ids, js["label"], js.get("idx", None))


class TextDataset(Dataset):
Expand Down Expand Up @@ -270,7 +272,10 @@ def test(args, model, tokenizer):
preds = logits.argmax(-1)
with open(os.path.join(args.output_dir, "predictions.txt"), "w") as f:
for example, pred in zip(eval_dataset.examples, preds):
f.write(str(pred) + "\n")
if example.idx is not None:
f.write(example.idx + "\t" + str(pred) + "\n")
else:
f.write(str(pred) + "\n")


def main():
Expand Down Expand Up @@ -361,7 +366,7 @@ def main():
# Evaluation
results = {}
if args.do_eval:
checkpoint_prefix = "checkpoint-best-acc/model.bin"
checkpoint_prefix = "checkpoint-best-acc-codebert/model.bin"
output_dir = os.path.join(args.output_dir, "{}".format(checkpoint_prefix))
model.load_state_dict(torch.load(output_dir))
model.to(args.device)
Expand All @@ -371,7 +376,7 @@ def main():
logger.info(" %s = %s", key, str(round(result[key], 4)))

if args.do_test:
checkpoint_prefix = "checkpoint-best-acc/model.bin"
checkpoint_prefix = "checkpoint-best-acc-codebert/model.bin"
output_dir = os.path.join(args.output_dir, "{}".format(checkpoint_prefix))
model.load_state_dict(torch.load(output_dir))
model.to(args.device)
Expand Down
File renamed without changes.
File renamed without changes.

0 comments on commit fe89b78

Please sign in to comment.