create_caches2.py
#!/usr/bin/env python3
'''Takes a bunch of types training files. The --col argument gives the column index where the
receptor starts (ligands are assumed to come right after). Reads in the gninatypes files specified
in these types files and writes out two monolithic receptor and ligand cache files in version 2
format.
Version 2 is optimized for memory mapped storage of caches. The header records the offset of the
key table (file names), which is written after the dense storage of values (coordinates and types).
'''
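
# Binary layout produced by create_cache2 below (native struct sizes on the
# machine writing the cache):
#   int ('i')            version marker, always -1
#   unsigned long ('L')  offset of the key table, patched in at the end
#   per molecule:        int ('i') natoms, then natoms*16 bytes of gninatypes data
#   key table, one entry per cached molecule:
#       unsigned char ('B') name length, the UTF-8 name bytes,
#       unsigned long ('L') offset of that molecule's data record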
import os, sys
import struct, argparse, traceback
import multiprocessing
mols_to_read = multiprocessing.Queue()
mols_to_write = multiprocessing.Queue()
N = multiprocessing.cpu_count()*2
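
# Producer/consumer pipeline: a fill_queue process feeds file names into
# mols_to_read, N pool workers read the gninatypes files and push (name, data)
# pairs onto mols_to_write, and the main process drains that queue and writes
# the cache sequentially.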
def read_data(data_root):
    '''read gninatypes files named in mols_to_read and put their contents in mols_to_write'''
    while True:
        sys.stdout.flush()
        mol = mols_to_read.get()
        if mol is None:
            break
        fname = mol
        if len(data_root):
            fname = data_root+'/'+mol
        try:
            with open(fname,'rb') as gninatype:
                data = gninatype.read()
                assert len(data) % 16 == 0  #gninatypes data is 16 bytes per atom
                if len(data) == 0:
                    print(fname,"EMPTY")
                else:
                    mols_to_write.put((mol,data))
        except Exception as e:
            print(fname)
            print(e)
    mols_to_write.put(None)  #sentinel: this reader is done
def fill_queue(molfiles):
    'process that fills mols_to_read, then adds one None sentinel per reader'
    for mol in molfiles:
        mols_to_read.put(mol)
    for _ in range(N):
        mols_to_read.put(None)
def create_cache2(molfiles, data_root, outfile):
    '''Create an outfile molcache2 file from the list molfiles stored at data_root.'''
    out = open(outfile,'wb')
    #first int is the version marker
    out.write(struct.pack('i',-1))
    out.write(struct.pack('L',0)) #placeholder for offset to keys

    filler = multiprocessing.Process(target=fill_queue,args=(molfiles,))
    filler.start()
    readers = multiprocessing.Pool(N)
    for _ in range(N):
        readers.apply_async(read_data,(data_root,))

    offsets = dict() #indexed by mol, location of data

    #start writing molecular data
    endcnt = 0
    while True:
        moldata = mols_to_write.get()
        if moldata is None:
            endcnt += 1  #one sentinel per reader; stop once all N have finished
            if endcnt == N:
                break
            else:
                continue
        (mol,data) = moldata
        offsets[mol] = out.tell()
        natoms = len(data)//16
        out.write(struct.pack('i',natoms))
        out.write(data)

    start = out.tell() #where the names start
    for mol in molfiles:
        if len(mol) > 255:
            print("Skipping",mol,"since filename is too long")
            continue
        if mol not in offsets:
            print("SKIPPING",mol,"since failed to read it in")
            continue
        s = bytes(mol, encoding='UTF-8')
        out.write(struct.pack('B',len(s)))
        out.write(s)
        out.write(struct.pack('L',offsets[mol]))

    #now go back and record where the key table starts
    out.seek(4)
    out.write(struct.pack('L',start))
    out.seek(0,os.SEEK_END)
    out.close()
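
# Illustrative sketch, not used by this script: one way to read the key table
# back out of a cache written by create_cache2, e.g. to sanity-check a generated
# file. It only assumes the layout written above and that the cache is read on
# the same platform it was written on (native struct sizes).
def _read_cache2_keys(cachefile):
    '''Return a dict mapping molecule names to the offsets of their data records.'''
    keys = dict()
    with open(cachefile,'rb') as f:
        version = struct.unpack('i', f.read(struct.calcsize('i')))[0]
        assert version == -1  #version marker written by create_cache2
        start = struct.unpack('L', f.read(struct.calcsize('L')))[0]
        f.seek(start)
        while True:
            lenbyte = f.read(1)
            if not lenbyte:  #end of file; the key table is the last thing written
                break
            namelen = struct.unpack('B', lenbyte)[0]
            name = f.read(namelen).decode('UTF-8')
            keys[name] = struct.unpack('L', f.read(struct.calcsize('L')))[0]
    return keys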
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--col', required=True,type=int,help='Column receptor starts on')
parser.add_argument('--recmolcache', default='rec.molcache2',type=str,help='Filename of receptor cache')
parser.add_argument('--ligmolcache', default='lig.molcache2',type=str,help='Filename of ligand cache')
parser.add_argument('-d','--data_root',type=str,required=False,help="Root folder for relative paths in train/test files",default='')
parser.add_argument('fnames',nargs='+',type=str,help='types files to process')
args = parser.parse_args()
#load all file names into memory
seenlig = set()
seenrec = set()
for fname in args.fnames:
    for line in open(fname):
        vals = line.split()
        rec = vals[args.col]
        ligs = vals[args.col+1:]
        if rec not in seenrec:
            seenrec.add(rec)
        for lig in ligs:
            if lig.startswith('#'): #a '#' starts a comment; ignore the rest of the line
                break
            if lig not in seenlig:
                seenlig.add(lig)
create_cache2(sorted(list(seenrec)), args.data_root, args.recmolcache)
create_cache2(sorted(list(seenlig)), args.data_root, args.ligmolcache)
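
# Example usage (hypothetical column index and file names):
#   ./create_caches2.py -c 3 -d /path/to/structures \
#       --recmolcache rec.molcache2 --ligmolcache lig.molcache2 \
#       train0.types test0.types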