-
Notifications
You must be signed in to change notification settings - Fork 0
/
process.py
45 lines (44 loc) · 1.58 KB
/
process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import logging
import os
import pickle

import zstd

from utils.data_process import Data
from utils.inverted_index import Inverted_Index
from utils.tf_idf import TF_IDF
# Directory that receives every compressed artifact written by this script.
save_dir = "output"

logging.basicConfig(level=logging.DEBUG)

logging.info("Loading data from files")
data = Data()
# NOTE(review): the meaning of the arguments ("data", 1000) and (6) is defined
# by utils.data_process.Data, which is not visible here -- confirm there.
data.load("data", 1000)
data.process(6)

# Filled in by the inverted-index step and reused by the TF-IDF step.
wordcount = None

# open(..., 'wb') does not create missing parent directories, so make sure the
# output directory exists before any artifact is written.
os.makedirs(save_dir, exist_ok=True)
logging.info("Writing data to output")
# Build and serialize the inverted index BEFORE opening the output file, so a
# failure while building it does not leave an empty/truncated artifact behind.
# Include words in title.
ii = Inverted_Index(data.data, data.headerdata, data.dict)
# NOTE(review): "procecss" looks like a misspelling of "process"; it is kept
# as-is because the method is defined in utils.inverted_index -- confirm there.
ii.procecss()
wordcount = ii.word_count  # reused by the TF-IDF step

logging.info("Compressing and saving inverted index")
ii_data = pickle.dumps(ii.inverted_index)
ii_compressed = zstd.compress(ii_data)
# The with-statement closes the file on exit; no explicit close() is needed.
with open(f'{save_dir}/inverted_index.zstd', 'wb') as f:
    f.write(ii_compressed)
# Compute and serialize the TF-IDF data BEFORE opening the output file, so a
# failure during computation does not leave an empty/truncated artifact behind.
tf_idf = TF_IDF(data.data, data.headerdata, data.dict, wordcount)
tf_idf.process()

logging.info("Compressing and saving tf-idf matrix")
tf_idf_data = pickle.dumps(tf_idf.tf_idf)
tf_idf_compressed = zstd.compress(tf_idf_data)
# The with-statement closes the file on exit; no explicit close() is needed.
with open(f'{save_dir}/tf_idf_matrix.zstd', 'wb') as f:
    f.write(tf_idf_compressed)
# Persist the header TF-IDF data held by the tf_idf object. The payload is
# pickled and compressed before the file is opened so that a serialization
# error cannot leave a truncated file on disk.
logging.info("Compressing and saving header tf-idf matrix")
header_tf_idf_data = pickle.dumps(tf_idf.header_tf_idf)
header_tf_idf_compressed = zstd.compress(header_tf_idf_data)
# The with-statement closes the file on exit; no explicit close() is needed.
with open(f'{save_dir}/header_tf_idf_matrix.zstd', 'wb') as f:
    f.write(header_tf_idf_compressed)
# Persist data.dict. The payload is pickled and compressed before the file is
# opened so that a serialization error cannot leave a truncated file on disk.
logging.info("Compressing and saving dictionary")
dict_data = pickle.dumps(data.dict)
dict_compressed = zstd.compress(dict_data)
# The with-statement closes the file on exit; no explicit close() is needed.
with open(f'{save_dir}/dictionary.zstd', 'wb') as f:
    f.write(dict_compressed)
# Persist data.metadata. The payload is pickled and compressed before the file
# is opened so that a serialization error cannot leave a truncated file on disk.
logging.info("Compressing and saving metadata")
meta_data = pickle.dumps(data.metadata)
meta_compressed = zstd.compress(meta_data)
# The with-statement closes the file on exit; no explicit close() is needed.
with open(f'{save_dir}/metadata.zstd', 'wb') as f:
    f.write(meta_compressed)