Commit: switch to jsonl format

truonghm committed Sep 17, 2023
1 parent 077fa35 commit 14b7c8e
Showing 1 changed file with 14 additions and 24 deletions.
38 changes: 14 additions & 24 deletions scripts/tokenize_corpus.py
@@ -5,6 +5,7 @@
 from typing import Any, List, Tuple

 import esprima
+import numpy as np
 import pandas as pd
 from lib.utils.logging import logger
 from tqdm import tqdm
@@ -25,10 +26,10 @@ def tokenize(snippet: str) -> List[Tuple[Any, Any]]:
     return token_types


-def process_row(file, label):
-    with open(file, "r") as f:
-        snippet = f.read()
-    return tokenize(snippet), label
+def process_row(code, pbar):
+    tokens = tokenize(code)
+    pbar.update(1)
+    return tokens


 def main():
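A minimal runnable sketch of the new per-row pattern above, with a simplified stand-in for tokenize (its real body is elided from this diff) and a made-up two-row DataFrame. One caveat worth noting: unless otypes is given, np.vectorize infers the output dtype from an extra trial call, which both mangles list-valued results into strings and ticks the progress bar one extra time, so the sketch passes otypes=[object].

import esprima
import numpy as np
import pandas as pd
from tqdm import tqdm

def tokenize(snippet):
    # Simplified stand-in: keep only the lexical type of each token.
    return [token.type for token in esprima.tokenize(snippet)]

def process_row(code, pbar):
    tokens = tokenize(code)
    pbar.update(1)  # tick the shared progress bar once per row
    return tokens

# Hypothetical sample; the real script builds df from the input JSONL.
df = pd.DataFrame({"code": ["var x = 1;", "function f(a) { return a; }"]})

with tqdm(total=len(df)) as pbar:
    # otypes=[object] stores each row's token list as-is instead of
    # letting np.vectorize coerce it to an inferred fixed-width dtype.
    df["ast"] = np.vectorize(process_row, otypes=[object])(df["code"], pbar)

print(df["ast"].iloc[0])  # ['Keyword', 'Identifier', 'Punctuator', 'Numeric', 'Punctuator']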
@@ -39,34 +40,23 @@ def main():
     args = argparser.parse_args()
     path_file = args.input
     text_dir = args.output
-    prefix = os.path.basename(path_file).replace(".csv", "")
-    token_types_file = os.path.join(text_dir, f"{prefix}_token_types_corpus.txt")
-    labels_file = os.path.join(text_dir, f"{prefix}_labels.txt")
+    prefix = os.path.basename(path_file).replace(".jsonl", "")
+    token_types_file = os.path.join(text_dir, f"{prefix}_token_types_corpus.parquet")

     logger.info(f"Load paths to actual data: {path_file}")
-    df = pd.read_csv(path_file)
+    with open(path_file, "r") as f:
+        df = pd.read_json(f, lines=True)
     total = len(df)
     logger.info(f"Total number of files: {total}")

-    # logger.info("Tokenizing...")
-    results = []
-    with ThreadPoolExecutor() as executor:
-        results = list(
-            tqdm(
-                executor.map(process_row, df["file"].tolist(), df["label"].tolist()),
-                total=total,
-                bar_format="Tokenizing: {desc:<5.5}{percentage:3.0f}%|{bar:30}{r_bar}",
-            )
-        )
+    bar_format = "Tokenizing: {desc:<5.5}{percentage:3.0f}%|{bar:30}{r_bar}"
+    with tqdm(total=total, bar_format=bar_format) as pbar:
+        df["ast"] = np.vectorize(process_row)(df["code"], pbar)

     logger.info(f"Number of observation tokenized: {len(results)}")
     logger.info("Saving corpus to text files...")
-    with open(token_types_file, "a") as token_types_f, open(
-        labels_file, "a"
-    ) as labels_f:
-        for token_types, label in results:
-            token_types_f.write(" ".join(token_types) + "\n")
-            labels_f.write(label + "\n")
+
+    df[["ast", "label"]].to_parquet(token_types_file, index=False)


 if __name__ == "__main__":
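For reference, the JSON Lines input this commit switches to holds one JSON object per line, and the output moves from flat text files to a single Parquet file with the token lists and labels as columns. Below is a minimal round-trip sketch, assuming records carry the "code" and "label" fields used above (the values and file names here are made up; to_parquet needs the pyarrow or fastparquet engine installed). One loose end: if the two logger.info calls above are unchanged context lines, the first still references the removed results variable and would raise a NameError at runtime; len(df) looks like the intended quantity.

import pandas as pd

# Two hypothetical records in JSON Lines form: one JSON object per line.
records = (
    '{"code": "var x = 1;", "label": "benign"}\n'
    '{"code": "eval(unescape(p));", "label": "malicious"}\n'
)
with open("sample.jsonl", "w") as f:
    f.write(records)

# read_json accepts an open file handle; lines=True parses one record per line.
with open("sample.jsonl", "r") as f:
    df = pd.read_json(f, lines=True)

print(df.columns.tolist())  # ['code', 'label']

# Columnar output, mirroring the to_parquet call in the new code path.
df.to_parquet("sample_token_types_corpus.parquet", index=False)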
