forked from Nuclear-Squid/ergol
-
Notifications
You must be signed in to change notification settings - Fork 1
/
merge.py
executable file
·43 lines (35 loc) · 1.29 KB
/
merge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/bin/env python3
""" Merge two corpus dictionaries. """
import json
from sys import argv
def merge(filenames, filecount=None):
    """Merge several corpus dictionaries into a single averaged one.

    Each input file is a JSON document holding a "symbols" and a "digrams"
    table, both mapping a key to a numeric count. Every count is divided by
    `filecount` and the quotients are summed per key, so when `filecount`
    equals the number of files the result is the mean across files (a key
    missing from a file contributes nothing for that file).

    Args:
        filenames: paths of the JSON corpus files to merge.
        filecount: divisor applied to every count. Defaults to the number
            of entries in `filenames`.

    Returns:
        A dict with three keys: "corpus" (always an empty string),
        "symbols" and "digrams". Both tables are ordered by decreasing
        frequency, rounded to two decimals, and stripped of entries whose
        rounded frequency is not strictly positive.

    Raises:
        OSError: a file cannot be opened.
        json.JSONDecodeError: a file is not valid JSON.
        KeyError: a file lacks the "symbols" or "digrams" section.
    """
    if filecount is None:
        # Materialize first so that any iterable of paths can be counted.
        filenames = list(filenames)
        filecount = len(filenames)

    merged = {
        "symbols": {},
        "digrams": {},
    }

    # Accumulate the per-file share of every key, section by section.
    for filename in filenames:
        # Explicit UTF-8: the tables are keyed by arbitrary (often
        # non-ASCII) characters, and the locale's default encoding is not
        # UTF-8 on every platform.
        with open(filename, "r", encoding="utf-8") as corpus:
            data = json.load(corpus)
        for section, table in merged.items():
            for key, count in data[section].items():
                table[key] = table.get(key, 0.0) + count / filecount

    def sort_by_frequency(table, precision=2):
        """Return `table` ordered by decreasing count, rounded and pruned.

        The ordering relies on dicts preserving insertion order
        (CPython 3.6+). Entries are ranked on their unrounded count; the
        sort is stable, so ties keep their first-seen order.
        """
        sorted_dict = {}
        ranked = sorted(table.items(), key=lambda item: item[1], reverse=True)
        for key, count in ranked:
            freq = round(count, precision)
            if freq > 0:
                sorted_dict[key] = freq
        return sorted_dict

    return {
        "corpus": "",
        "symbols": sort_by_frequency(merged["symbols"]),
        "digrams": sort_by_frequency(merged["digrams"]),
    }
if __name__ == "__main__":
    # Every command-line argument after the script name is a corpus file.
    corpus_files = argv[1:]
    file_count = len(corpus_files)
    # Merging needs at least two inputs; with fewer, nothing is printed.
    if file_count >= 2:
        merged_corpus = merge(corpus_files, file_count)
        print(json.dumps(merged_corpus, indent=4, ensure_ascii=False))