reorganize

truonghm · Sep 22, 2023 · fe89b78 · fe89b78
1 parent 78d0e1b
commit fe89b78
Show file tree

Hide file tree

Showing 17 changed files with 25 additions and 2,068 deletions.
diff --git a/logs/predictions_codebert_bimodal.txt b/logs/predictions_codebert_bimodal.txt
diff --git a/scripts/build_fasttext_model.py b/scripts/build_fasttext_model.py
@@ -1,7 +1,7 @@
 import argparse
 
 import fasttext
-from lib.utils.logging import logger
+from src.utils.logging import logger
 
 if __name__ == "__main__":
     argparser = argparse.ArgumentParser()

diff --git a/lib/codebert-bimodal/inference.sh → scripts/inference_codebert_bimodal.sh b/lib/codebert-bimodal/inference.sh → scripts/inference_codebert_bimodal.sh
diff --git a/lib/codebert/inference.sh → scripts/inference_codebert_unimodal.sh b/lib/codebert/inference.sh → scripts/inference_codebert_unimodal.sh
@@ -1,16 +1,15 @@
-python run.py \
-    --output_dir=./saved_models \
+python src/codebert-unimodal/run.py \
+    --output_dir=./models \
     --tokenizer_name=microsoft/codebert-base \
     --model_name_or_path=microsoft/codebert-base \
-    --do_eval \
     --do_test \
-    --train_data_file=../dataset/train.jsonl \
-    --eval_data_file=../dataset/valid.jsonl \
-    --test_data_file=../dataset/test.jsonl \
+    --train_data_file=data/exp/train_set.jsonl \
+    --eval_data_file=data/exp/valid_set.jsonl \
+    --test_data_file=data/exp/test_set.jsonl \
     --num_train_epochs 1 \
     --block_size 64 \
     --train_batch_size 8 \
     --eval_batch_size 16 \
     --learning_rate 2e-5 \
     --max_grad_norm 1.0 \
-    --seed 123456  2>&1 | tee test.log
+    --seed 123456  2>&1 | tee test.log
diff --git a/scripts/tokenize_corpus.py b/scripts/tokenize_corpus.py
@@ -7,7 +7,7 @@
 import esprima
 import numpy as np
 import pandas as pd
-from lib.utils.logging import logger
+from src.utils.logging import logger
 from tqdm import tqdm
 
 warnings.filterwarnings("ignore")

diff --git a/lib/codebert-bimodal/train.sh → scripts/train_codebert_bimodal.sh b/lib/codebert-bimodal/train.sh → scripts/train_codebert_bimodal.sh
diff --git a/lib/codebert/train.sh → scripts/train_codebert_unimodal.sh b/lib/codebert/train.sh → scripts/train_codebert_unimodal.sh
@@ -1,14 +1,14 @@
-PYTHONPATH=$(shell pwd) python run.py \
+python src/codebert-unimodal/run.py \
     --output_dir=./models \
     --tokenizer_name=microsoft/codebert-base \
     --model_name_or_path=microsoft/codebert-base \
     --do_train \
-    --train_data_file=../data/exp/train_set.jsonl \
-    --eval_data_file=../data/exp/valid_set.jsonl \
-    --test_data_file=../data/exp/test_set.jsonl \
-    --num_train_epochs 5 \
+    --train_data_file=data/exp/train_set.jsonl \
+    --eval_data_file=data/exp/valid_set.jsonl \
+    --test_data_file=data/exp/test_set.jsonl \
+    --num_train_epochs 20 \
     --block_size 512 \
-    --train_batch_size 8 \
+    --train_batch_size 16 \
     --eval_batch_size 16 \
     --learning_rate 2e-5 \
     --max_grad_norm 1.0 \

diff --git a/scripts/utils/crawl_kaggle_dataset.py b/scripts/utils/crawl_kaggle_dataset.py
@@ -9,7 +9,7 @@
 import pandas as pd
 import pymongo
 import requests
-from lib.utils.logging import logger
+from src.utils.logging import logger
 
 TIMEOUT_SECS = 3
 shutdown = False  # Flag to signal workers to stop working

diff --git a/lib/__init__.py → src/__init__.py b/lib/__init__.py → src/__init__.py
diff --git a/lib/codebert-bimodal/model.py → src/codebert-bimodal/model.py b/lib/codebert-bimodal/model.py → src/codebert-bimodal/model.py
diff --git a/lib/codebert-bimodal/preprocess.py → src/codebert-bimodal/preprocess.py b/lib/codebert-bimodal/preprocess.py → src/codebert-bimodal/preprocess.py
diff --git a/lib/codebert-bimodal/run_classifier.py → src/codebert-bimodal/run_classifier.py b/lib/codebert-bimodal/run_classifier.py → src/codebert-bimodal/run_classifier.py
diff --git a/lib/codebert-bimodal/utils.py → src/codebert-bimodal/utils.py b/lib/codebert-bimodal/utils.py → src/codebert-bimodal/utils.py
diff --git a/lib/codebert/model.py → src/codebert-unimodal/model.py b/lib/codebert/model.py → src/codebert-unimodal/model.py
diff --git a/lib/codebert/run.py → src/codebert-unimodal/run.py b/lib/codebert/run.py → src/codebert-unimodal/run.py
@@ -34,7 +34,7 @@
 
 import numpy as np
 import torch
-from lib.codebert.model import Model
+from model import Model
 from torch.utils.data import DataLoader, Dataset, IterableDataset, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
@@ -58,7 +58,9 @@ def __init__(
         input_tokens,
         input_ids,
         label,
+        idx=None,
     ):
+        self.idx = idx
         self.input_tokens = input_tokens
         self.input_ids = input_ids
         self.label = label
@@ -72,7 +74,7 @@ def convert_examples_to_features(js, tokenizer, args):
     source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
     padding_length = args.block_size - len(source_ids)
     source_ids += [tokenizer.pad_token_id] * padding_length
-    return InputFeatures(source_tokens, source_ids, js["label"])
+    return InputFeatures(source_tokens, source_ids, js["label"], js.get("idx", None))
 
 
 class TextDataset(Dataset):
@@ -270,7 +272,10 @@ def test(args, model, tokenizer):
     preds = logits.argmax(-1)
     with open(os.path.join(args.output_dir, "predictions.txt"), "w") as f:
         for example, pred in zip(eval_dataset.examples, preds):
-            f.write(str(pred) + "\n")
+            if example.idx is not None:
+                f.write(str(example.idx) + "\t" + str(pred) + "\n")  # idx comes straight from the JSON record and may be an int; str() avoids a TypeError on concatenation
+            else:
+                f.write(str(pred) + "\n")
 
 
 def main():
@@ -361,7 +366,7 @@ def main():
     # Evaluation
     results = {}
     if args.do_eval:
-        checkpoint_prefix = "checkpoint-best-acc/model.bin"
+        checkpoint_prefix = "checkpoint-best-acc-codebert/model.bin"
         output_dir = os.path.join(args.output_dir, "{}".format(checkpoint_prefix))
         model.load_state_dict(torch.load(output_dir))
         model.to(args.device)
@@ -371,7 +376,7 @@ def main():
             logger.info("  %s = %s", key, str(round(result[key], 4)))
 
     if args.do_test:
-        checkpoint_prefix = "checkpoint-best-acc/model.bin"
+        checkpoint_prefix = "checkpoint-best-acc-codebert/model.bin"
         output_dir = os.path.join(args.output_dir, "{}".format(checkpoint_prefix))
         model.load_state_dict(torch.load(output_dir))
         model.to(args.device)

diff --git a/lib/dataset.py → src/dataset.py b/lib/dataset.py → src/dataset.py
diff --git a/lib/utils/logging.py → src/utils/logging.py b/lib/utils/logging.py → src/utils/logging.py