-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepare_data_for_finetune.py
81 lines (75 loc) · 2.34 KB
/
prepare_data_for_finetune.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from src.checker.checker import Checker
import json
import os
from tqdm import tqdm
import math
def get_python_files_list(dir_path, max_files=20000):
    """Collect paths of ``.py`` files found by walking *dir_path*.

    Args:
        dir_path: Root directory to walk recursively with ``os.walk``.
        max_files: Upper bound on the number of paths returned. The walk
            stops as soon as at least this many files have been collected.
            Pass ``None`` to collect every file.

    Returns:
        A list of at most ``max_files`` paths, each built by joining the
        walked directory with the file name. The list is empty when
        *dir_path* contains no ``.py`` files or does not exist.
    """
    python_files = []
    for dirpath, _dirnames, filenames in os.walk(dir_path):
        python_files.extend(
            os.path.join(dirpath, name)
            for name in filenames
            if name.endswith(".py")
        )
        # Files are added one directory at a time, so the running count can
        # jump past the limit; compare with >= rather than == so the walk
        # reliably stops.
        if max_files is not None and len(python_files) >= max_files:
            break
    # Trim any overshoot from the last directory (a None bound keeps all).
    return python_files[:max_files]
# Gather the Python sources to process from <cwd>/data/repos.
repos_root = os.path.join(os.getcwd(), "data", "repos")
all_files = get_python_files_list(repos_root)
print("number of python files:", len(all_files))

# Build one Checker per file, using at most the first 20000 files.
# NOTE(review): an earlier commented-out variant capped this at 10000
# samples ("only 10000 samples are enough") -- confirm the intended limit.
checkers = [Checker(source_path) for source_path in all_files[:20000]]
# Accumulate one flat record per infill candidate across all checkers.
all_candidates = []
all_paths = []
for checker in tqdm(checkers, desc="Processing checkers", total=len(checkers)):
    # Ask the checker for infill inputs at every level before flattening.
    level_bar = tqdm(
        [
            "function_names",
            "variable_names",
            "class_names",
            "comments",
            "docstrings",
            "strings",
        ],
        desc="Processing levels",
        leave=False,
    )
    per_level = []
    for level in level_bar:
        per_level.append(checker.prepare_inputs_for_infill(level=level))

    # Flatten the per-level results, skipping falsy entries, and tag each
    # record with the path of the file it came from.
    for group in per_level:
        for entry in group:
            if not entry:
                continue
            all_candidates.append(
                {
                    "infill": str(entry["infill"]),
                    "prefix": str(entry["prefix"]),
                    "suffix": str(entry["suffix"]),
                    "path": checker.input_path,
                }
            )
# Split the collected candidates into JSON shards of at most
# `num_records_per_json` records. Each shard gets a companion text file
# listing the distinct source paths that appear in it.
num_records_per_json = 2000
num_jsons = math.ceil(len(all_candidates) / num_records_per_json)
output_dir = os.path.join(os.getcwd(), "data", "finetune_ds")
# Create the output directory up front so opening the shard files for
# writing does not fail when it is missing.
os.makedirs(output_dir, exist_ok=True)
for i in range(num_jsons):
    start_index = i * num_records_per_json
    # The final shard may hold fewer than `num_records_per_json` records.
    end_index = min((i + 1) * num_records_per_json, len(all_candidates))
    json_data = all_candidates[start_index:end_index]
    # Sort the distinct paths so the paths file is reproducible across runs
    # (set iteration order for strings is not).
    json_paths = sorted({record["path"] for record in json_data})
    json_filename = os.path.join(
        output_dir,
        f"finetune_candidates_{i+1}.json",
    )
    json_paths_file_name = os.path.join(
        output_dir,
        f"finetune_candidates_{i+1}_paths.txt",
    )
    with open(json_filename, "w", encoding="utf-8") as f:
        json.dump(json_data, f)
    # Explicit UTF-8 keeps non-ASCII paths writable regardless of locale.
    with open(json_paths_file_name, "w", encoding="utf-8") as f:
        # Use a dedicated loop name so the shard index `i` is not shadowed.
        f.writelines(source_path + "\n" for source_path in json_paths)