Commit: switch to jsonl format

truonghm committed Sep 17, 2023
1 parent 077fa35 commit 14b7c8e
Showing 1 changed file with 14 additions and 24 deletions.
38 changes: 14 additions & 24 deletions scripts/tokenize_corpus.py
@@ -5,6 +5,7 @@
 from typing import Any, List, Tuple

 import esprima
+import numpy as np
 import pandas as pd
 from lib.utils.logging import logger
 from tqdm import tqdm
@@ -25,10 +26,10 @@ def tokenize(snippet: str) -> List[Tuple[Any, Any]]:
     return token_types


-def process_row(file, label):
-    with open(file, "r") as f:
-        snippet = f.read()
-    return tokenize(snippet), label
+def process_row(code, pbar):
+    tokens = tokenize(code)
+    pbar.update(1)
+    return tokens


 def main():
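A minimal runnable sketch of the new per-row pattern above, with a simplified stand-in for tokenize (its real body is elided from this diff) and a made-up two-row DataFrame. One caveat worth noting: unless otypes is given, np.vectorize infers the output dtype from an extra trial call, which both mangles list-valued results into strings and ticks the progress bar one extra time, so the sketch passes otypes=[object].

import esprima
import numpy as np
import pandas as pd
from tqdm import tqdm

def tokenize(snippet):
    # Simplified stand-in: keep only the lexical type of each token.
    return [token.type for token in esprima.tokenize(snippet)]

def process_row(code, pbar):
    tokens = tokenize(code)
    pbar.update(1)  # tick the shared progress bar once per row
    return tokens

# Hypothetical sample; the real script builds df from the input JSONL.
df = pd.DataFrame({"code": ["var x = 1;", "function f(a) { return a; }"]})

with tqdm(total=len(df)) as pbar:
    # otypes=[object] stores each row's token list as-is instead of
    # letting np.vectorize coerce it to an inferred fixed-width dtype.
    df["ast"] = np.vectorize(process_row, otypes=[object])(df["code"], pbar)

print(df["ast"].iloc[0])  # ['Keyword', 'Identifier', 'Punctuator', 'Numeric', 'Punctuator']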
@@ -39,34 +40,23 @@ def main():
     args = argparser.parse_args()
     path_file = args.input
     text_dir = args.output
-    prefix = os.path.basename(path_file).replace(".csv", "")
-    token_types_file = os.path.join(text_dir, f"{prefix}_token_types_corpus.txt")
-    labels_file = os.path.join(text_dir, f"{prefix}_labels.txt")
+    prefix = os.path.basename(path_file).replace(".jsonl", "")
+    token_types_file = os.path.join(text_dir, f"{prefix}_token_types_corpus.parquet")

     logger.info(f"Load paths to actual data: {path_file}")
-    df = pd.read_csv(path_file)
+    with open(path_file, "r") as f:
+        df = pd.read_json(f, lines=True)
     total = len(df)
     logger.info(f"Total number of files: {total}")

-    # logger.info("Tokenizing...")
-    results = []
-    with ThreadPoolExecutor() as executor:
-        results = list(
-            tqdm(
-                executor.map(process_row, df["file"].tolist(), df["label"].tolist()),
-                total=total,
-                bar_format="Tokenizing: {desc:<5.5}{percentage:3.0f}%|{bar:30}{r_bar}",
-            )
-        )
+    bar_format = "Tokenizing: {desc:<5.5}{percentage:3.0f}%|{bar:30}{r_bar}"
+    with tqdm(total=total, bar_format=bar_format) as pbar:
+        df["ast"] = np.vectorize(process_row)(df["code"], pbar)

     logger.info(f"Number of observation tokenized: {len(results)}")
     logger.info("Saving corpus to text files...")
-    with open(token_types_file, "a") as token_types_f, open(
-        labels_file, "a"
-    ) as labels_f:
-        for token_types, label in results:
-            token_types_f.write(" ".join(token_types) + "\n")
-            labels_f.write(label + "\n")
+
+    df[["ast", "label"]].to_parquet(token_types_file, index=False)


 if __name__ == "__main__":
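For reference, the JSON Lines input this commit switches to holds one JSON object per line, and the output moves from flat text files to a single Parquet file with the token lists and labels as columns. Below is a minimal round-trip sketch, assuming records carry the "code" and "label" fields used above (the values and file names here are made up; to_parquet needs the pyarrow or fastparquet engine installed). One loose end: if the two logger.info calls above are unchanged context lines, the first still references the removed results variable and would raise a NameError at runtime; len(df) looks like the intended quantity.

import pandas as pd

# Two hypothetical records in JSON Lines form: one JSON object per line.
records = (
    '{"code": "var x = 1;", "label": "benign"}\n'
    '{"code": "eval(unescape(p));", "label": "malicious"}\n'
)
with open("sample.jsonl", "w") as f:
    f.write(records)

# read_json accepts an open file handle; lines=True parses one record per line.
with open("sample.jsonl", "r") as f:
    df = pd.read_json(f, lines=True)

print(df.columns.tolist())  # ['code', 'label']

# Columnar output, mirroring the to_parquet call in the new code path.
df.to_parquet("sample_token_types_corpus.parquet", index=False)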
