-
Notifications
You must be signed in to change notification settings - Fork 22
/
test_polyfun.py
321 lines (261 loc) · 12.9 KB
/
test_polyfun.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
import numpy as np; np.set_printoptions(precision=3, linewidth=200)
import pandas as pd; pd.set_option('display.width', 200)
import sys
import tempfile
import subprocess
import os
import glob
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
def compare_dfs(dir1, dir2, filename, sort_column=None):
file1 = os.path.join(dir1, filename)
file2 = os.path.join(dir2, filename)
if not os.path.exists(file1):
raise IOError('%s not found'%(file1))
if not os.path.exists(file2):
raise IOError('%s not found'%(file2))
if file1.endswith('.parquet'): df1 = pd.read_parquet(file1)
else: df1 = pd.read_table(file1, sep='\s+')
if file2.endswith('.parquet'): df2 = pd.read_parquet(file2)
else: df2 = pd.read_table(file2, sep='\s+')
assert np.all(df1.shape == df2.shape), 'found dimension mismatch between %s and %s'%(file1, file2)
assert np.all(df1.columns == df2.columns), 'found mismatch between %s and %s'%(file1, file2)
if sort_column is not None:
df1 = df1.sort_values(sort_column).reset_index(drop=True)
df2 = df2.sort_values(sort_column).reset_index(drop=True)
for c in df1.columns:
if c=='CREDIBLE_SET':
continue
if is_numeric_dtype(df1[c]):
if c=='Mp_STDERR': atol=1e-1
elif c=='BETA_MEAN': atol=1e-3
elif c=='BETA_SD': atol=1e-3
elif c=='PIP': atol=1e-2
else: atol=1e-4
if c=='PIP':
assert np.corrcoef(df1[c], df2[c])[0,1]>0.97, 'found major mismatch between %s and %s in column %s'%(file1, file2, c)
pip_cutoff = 0.8
is_large_pip = df2[c] > pip_cutoff
assert np.allclose(df1.loc[is_large_pip,c], df2.loc[is_large_pip,c], atol=atol), 'found mismatch between %s and %s in column %s'%(file1, file2, c)
else:
assert np.allclose(df1[c], df2[c], atol=atol), 'found mismatch between %s and %s in column %s'%(file1, file2, c)
else:
assert np.all(df1[c] == df2[c]), 'found mismatch between %s and %s in column %s'%(file1, file2, c)
def is_susie_installed():
try:
import rpy2
import rpy2.robjects.numpy2ri as numpy2ri
import rpy2.robjects as ro
ro.conversion.py2ri = numpy2ri
numpy2ri.activate()
from rpy2.robjects.packages import importr
susieR = importr('susieR')
except:
return False
return True
def is_Ckmeans_installed():
try:
import rpy2
import rpy2.robjects.numpy2ri as numpy2ri
try:
from importlib import reload
reload(rpy2.robjects.numpy2ri)
except:
pass
import rpy2.robjects as ro
ro.conversion.py2ri = numpy2ri
numpy2ri.activate()
from rpy2.robjects.packages import importr
importr('Ckmeans.1d.dp')
median_seg_func = ro.r('Ckmedian.1d.dp')
mean_seg_func = ro.r('Ckmeans.1d.dp')
except:
return False
return True
def is_finemap_installed(finemap_exe):
if finemap_exe is None: return False
assert os.path.exists(finemap_exe), 'the FINEMAP path provided is wrong (%s)'%(finemap_exe)
assert os.access(finemap_exe, os.X_OK), 'the FINEMAP executable doesn\'t have execution permissions'
return True
def test_munge_sumstats(tmpdir, python3_exe):
script_path = os.path.dirname(os.path.realpath(__file__))
example_dir = os.path.join(script_path, 'example_data')
gold_dir = os.path.join(script_path, 'gold')
script_exe = os.path.join(script_path, 'munge_polyfun_sumstats.py')
input_file = os.path.join(example_dir, 'boltlmm_sumstats.gz')
outfile = 'sumstats_munged.parquet'
output_file = os.path.join(tmpdir, outfile)
gold_file = os.path.join(gold_dir, outfile)
cmd = '%s %s --sumstats %s --out %s --n 327209 --min-info 0.6 --min-maf 0.001'%(python3_exe, script_exe, input_file, output_file)
retval = os.system(cmd)
if retval != 0:
raise ValueError('munge_sumstats command failed when running the following command:\n%s'%(cmd))
compare_dfs(tmpdir, gold_dir, outfile)
def test_polyfun(tmpdir, python3_exe):
if not is_Ckmeans_installed():
print('Skipping polyfun tests becuase CkMeans is not installed')
return
script_path = os.path.dirname(os.path.realpath(__file__))
example_dir = os.path.join(script_path, 'example_data')
gold_dir = os.path.join(script_path, 'gold')
script_exe = os.path.join(script_path, 'polyfun.py')
sumstats_file = os.path.join(example_dir, 'sumstats.parquet')
output_prefix = os.path.join(tmpdir, 'testrun')
ref_ld_prefix = os.path.join(example_dir, 'annotations.')
w_ld_prefix = os.path.join(example_dir, 'weights.')
plink_prefix = os.path.join(example_dir, 'reference.')
#polyfun stage 2
cmd = '%s %s --compute-h2-L2 --output-prefix %s --sumstats %s --ref-ld-chr %s --w-ld-chr %s --nnls-exact' % \
(python3_exe, script_exe, output_prefix, sumstats_file, ref_ld_prefix, w_ld_prefix)
retval = os.system(cmd)
if retval != 0:
raise ValueError('PolyFun command failed when running the following command:\n%s'%(cmd))
for outfile in ['testrun.22.bins.parquet', 'testrun.22.snpvar_ridge_constrained.gz', 'testrun.22.snpvar_ridge.gz', 'testrun.22.l2.M']:
compare_dfs(tmpdir, gold_dir, outfile)
#polyfun stage 3
cmd = '%s %s --compute-ldscores --output-prefix %s --bfile-chr %s --nnls-exact'% \
(python3_exe, script_exe, output_prefix, plink_prefix)
retval = os.system(cmd)
if retval != 0:
raise ValueError('PolyFun command failed when running the following command:\n%s'%(cmd))
compare_dfs(tmpdir, gold_dir, 'testrun.22.l2.ldscore.parquet')
#polyfun stage 4
cmd = '%s %s --compute-h2-bins --output-prefix %s --sumstats %s --w-ld-chr %s --nnls-exact'% \
(python3_exe, script_exe, output_prefix, sumstats_file, w_ld_prefix)
retval = os.system(cmd)
if retval != 0:
raise ValueError('PolyFun command failed when running the following command:\n%s'%(cmd))
compare_dfs(tmpdir, gold_dir, 'testrun.22.snpvar_constrained.gz')
def test_polyloc(tmpdir, python3_exe):
if not is_Ckmeans_installed():
print('Skipping polyloc tests becuase CkMeans is not installed')
return
script_path = os.path.dirname(os.path.realpath(__file__))
example_dir = os.path.join(script_path, 'example_data')
gold_dir = os.path.join(script_path, 'gold')
script_exe = os.path.join(script_path, 'polyloc.py')
sumstats_file = os.path.join(example_dir, 'sumstats2.parquet')
output_prefix = os.path.join(tmpdir, 'polyloc_test')
w_ld_prefix = os.path.join(example_dir, 'weights.')
plink_prefix = os.path.join(example_dir, 'reference.')
posterior_file = os.path.join(example_dir, 'posterior_betas.gz')
#polyloc stage 1
polyloc_cmd = ('%s %s --compute-partitions --output-prefix %s --posterior %s --bfile-chr %s --nnls-exact'% \
(python3_exe, script_exe, output_prefix, posterior_file, plink_prefix))
retval = os.system(polyloc_cmd)
if retval != 0:
raise ValueError('PolyLoc command failed when running the following command:\n%s'%(polyloc_cmd))
for outfile in ['polyloc_test.22.bins.parquet', 'polyloc_test.22.l2.M']:
compare_dfs(tmpdir, gold_dir, outfile)
#polyloc stage 2
polyloc_cmd = ('%s %s --compute-ldscores --output-prefix %s --bfile-chr %s --nnls-exact'% \
(python3_exe, script_exe, output_prefix, plink_prefix))
retval = os.system(polyloc_cmd)
if retval != 0:
raise ValueError('PolyLoc command failed when running the following command:\n%s'%(polyloc_cmd))
compare_dfs(tmpdir, gold_dir, 'polyloc_test.22.l2.ldscore.parquet')
#polyloc stage 3
polyloc_cmd = ('%s %s --compute-polyloc --output-prefix %s --w-ld-chr %s --sumstats %s --nnls-exact'% \
(python3_exe, script_exe, output_prefix, w_ld_prefix, sumstats_file))
retval = os.system(polyloc_cmd)
if retval != 0:
raise ValueError('PolyLoc command failed when running the following command:\n%s'%(polyloc_cmd))
for outfile in ['polyloc_test.bin_h2', 'polyloc_test.Mp']:
compare_dfs(tmpdir, gold_dir, outfile)
def test_extract_snpvar(tmpdir, python3_exe):
script_path = os.path.dirname(os.path.realpath(__file__))
example_dir = os.path.join(script_path, 'example_data')
gold_dir = os.path.join(script_path, 'gold')
script_exe = os.path.join(script_path, 'extract_snpvar.py')
input_file = os.path.join(example_dir, 'snps_to_finemap.txt.gz')
outfile = 'snps_with_var.gz'
output_file = os.path.join(tmpdir, outfile)
gold_file = os.path.join(gold_dir, outfile)
cmd = '%s %s --sumstats %s --out %s'%(python3_exe, script_exe, input_file, output_file)
retval = os.system(cmd)
if retval != 0:
raise ValueError('extract_snpvar command failed when running the following command:\n%s'%(cmd))
compare_dfs(tmpdir, gold_dir, outfile)
def test_finemapper_susie(tmpdir, python3_exe):
if not is_susie_installed():
print('Skipping fine-mapping test because either rpy2 or SuSiE are not properly installed. Please try reinstalling rpy2 and/or SuSiE')
return
script_path = os.path.dirname(os.path.realpath(__file__))
example_dir = os.path.join(script_path, 'example_data')
gold_dir = os.path.join(script_path, 'gold')
script_exe = os.path.join(script_path, 'finemapper.py')
sumstats_file = os.path.join(example_dir, 'chr1.finemap_sumstats.txt.gz')
plink_file = os.path.join(example_dir, 'chr1')
outfile = 'finemap.1.46000001.49000001.gz'
output_file = os.path.join(tmpdir, outfile)
finemapper_cmd = '%s %s \
--geno %s \
--sumstats %s \
--n 383290 \
--chr 1 \
--start 46000001 \
--end 49000001 \
--method susie \
--max-num-causal 5 \
--out %s \
--threads 1 \
--verbose \
' \
%(python3_exe, script_exe, plink_file, sumstats_file, output_file)
#print(finemapper_cmd)
retval = os.system(finemapper_cmd)
if retval != 0:
raise ValueError('finemapper command failed when running the following command:\n%s'%(cmd))
compare_dfs(tmpdir, gold_dir, outfile, sort_column='SNP')
def test_finemapper_finemap(tmpdir, python3_exe, finemap_exe):
if not is_finemap_installed(finemap_exe):
print('Skipping FINEMAP test because the executable is not available. Re-run with --finemap-exe to test FINEMAP')
return
script_path = os.path.dirname(os.path.realpath(__file__))
example_dir = os.path.join(script_path, 'example_data')
gold_dir = os.path.join(script_path, 'gold')
script_exe = os.path.join(script_path, 'finemapper.py')
sumstats_file = os.path.join(example_dir, 'chr1.finemap_sumstats.txt.gz')
plink_file = os.path.join(example_dir, 'chr1')
outfile = 'finemap.FINEMAP.1.46000001.49000001.gz'
output_file = os.path.join(tmpdir, outfile)
finemapper_cmd = '%s %s \
--geno %s \
--sumstats %s \
--n 383290 \
--chr 1 \
--start 46000001 \
--end 49000001 \
--method finemap \
--finemap-exe %s \
--max-num-causal 5 \
--out %s \
--threads 1 \
--verbose \
' \
%(python3_exe, script_exe, plink_file, sumstats_file, finemap_exe, output_file)
#print(finemapper_cmd)
retval = os.system(finemapper_cmd)
if retval != 0:
raise ValueError('finemapper command failed when running the following command:\n%s'%(cmd))
compare_dfs(tmpdir, gold_dir, outfile, sort_column='SNP')
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--python3', default='python', help='python 3 executable')
parser.add_argument('--finemap-exe', default=None, help='Path to FINEMAP executable file')
args = parser.parse_args()
temp_dir = tempfile.mkdtemp()
test_finemapper_susie(temp_dir, args.python3)
test_finemapper_finemap(temp_dir, args.python3, args.finemap_exe)
test_polyloc(temp_dir, args.python3)
test_polyfun(temp_dir, args.python3)
test_extract_snpvar(temp_dir, args.python3)
test_munge_sumstats(temp_dir, args.python3)
print()
print()
print()
print('---------------------------------------')
print('All tests completed successfully, hooray!!!')
print('---------------------------------------')
print()