From 512a80aa13de404ead7940b0fe95e035241e8dc7 Mon Sep 17 00:00:00 2001 From: xyg123 <33658607+xyg123@users.noreply.github.com> Date: Thu, 21 Mar 2024 11:23:52 +0000 Subject: [PATCH] test(method): improved performance in coloc tests (#536) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: pytest for coloc unit tests * fix: removed unused coloc tests * test(coloc): add coloc semantic test (#538) * test(coloc): define fixtures and parametrise coloc tests * test(coloc): compare dfs with assert_frame_equal * fix: remove unused threshold variable --------- Co-authored-by: Irene López <45119610+ireneisdoomed@users.noreply.github.com> Co-authored-by: David Ochoa --- .../coloc_test_data.snappy.parquet | Bin 11927 -> 0 bytes .../method/test_colocalisation_method.py | 233 ++++++------------ 2 files changed, 81 insertions(+), 152 deletions(-) delete mode 100644 tests/gentropy/data_samples/coloc_test_data.snappy.parquet diff --git a/tests/gentropy/data_samples/coloc_test_data.snappy.parquet b/tests/gentropy/data_samples/coloc_test_data.snappy.parquet deleted file mode 100644 index 71b3913ebfc68e1bc5e0425b3915d59e66406684..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11927 zcmbVy1z43!)b0j0DGi&J?(Pzt1uBA+M-gm9N)9Pti^m{TPz1$5L2LyTF|dH4B%~Xp zyOEHTxF0G<|5NwBcRzbQYvP@?X4X41-=5judZP_$6f%XK!c7_NywSpHK%u6PNhAg# zD$@54g+!sFFi}3Zz+8@vB09I}5cVXbpNc*AWB7U{-*ECt3pluOJD7luu)_%98#i44(PF5#)TLcn{MdZI+6TB_r z2*twt@3atu}jP7@C{sXyWg{-m2jV8ejv4atYe+XGqHD7)y zv#5yz#+OS4mc-uj6Z_XRS@SzL1jO?GUn>{$WKDH5_8*CFfzl+nu^j&gs3A$ya)^+d zgsdloimXYHwAf)aCp8L34#xqC>^JT-HJ22zsr|icWXgtwVySN+(a9Te-{?r@3o99gk6jO78b;4As0nF z>MZ_E4D@sspP_zwy5E%^@hmQ0NvT~)=@^b2Wcl@R`#|K&eYPJDT&fp$_3wY3L09fG zY?8S)j~Mcl@|B*5fkK+k;4Zcd{OX$4y7ml4&^Php+>@>g5so{|I6Jt2b~ZJt&&~i$ z#fRvw-mSn&$B;n7<669~D=jZ4TNSpfc^+ssp#}{KMO!D6>0prO(7A2u?C`0T&8Rj) z5PX;qe@=O-0lMq_tKSMM0|OooxYmi0+2|(LbG3@FA$mGbE=?Wg*CxMAV4{O`r{0$J z(K0~l(&9!r0^sj=TS4M;AIdRFG4H&p3LBzAPKLP2!=Y6Z0!kMI;qV_F4|qkW!K-Y- z`Am;GjBQ$LESsbVYhu#mf0LI1I+x-#_l?T2!-lHkiSHNCR@-|Qo~`7ECe9YT&X)rD zjGYB(4*XCOUTnd=wh~Vq7|2ebmV|KM+Na0KN}$|#__t5ib$G`MyM4ap?BLk*Bx(~b z$4Pk|n`*s8!B}wF{GJ2_sClbYpcO0&-iGVPg8M{4gYLS#aCJWB=@h)oZz2w--r1xC zFG#_;LDe19v?sZB?p2r63(~-pfedM%s~{|Vsb24@E)A(SG>d0L8Ns&lgVK%9W2kOorSXw>g}5dy zrythaIN=)|K)gbh!+@`yBNR>1-V+vnT-q(Df7vEN0$hc z_XngbJZ?t`n>?xVR?EQVfrzKwJSDigR6ERLa}lEY5R#Z|%>Y+VrQK-QC=BizD++4o z*?@lCmU{-O9q3eTsK@g+?MNwylIW8`f*QJOTg!P{(XEjF<3m225ZdBsxAkiey8qB# zWn88OrH{?6*Ip$FLQTh$;+OG3&W`$&cdUXia=WSC=JzxB~a<2!nIk#&h1f7x7l(P%DRA5!lt$6_+F- z30no$`3ULo!={rl2I1#l~dOoA96b6 z_Wl}_Ao6OZ6t>^^JTS6io6*@C)=@=-I_+qyV7|LDW>a{TIlaU zSZ2KhaLw6_^3n~WW8<Q9i4V zGA?*&7(Cp-$`0;-n2imcA4a<;-5)bPDMd5}2aQ}llq1{IZ$f1y=1>vOrvB@ly(nYn z*ZC_unc?9ZWZl3uhDuWTkCd1VqvM^FOT%PtP$-RR9amujrJnrEhU|G{v^&nE*PRQ7 zR3F~-Sl|XZrHfaSt;P|aTR*_Kfezjquv$y6Dn`?}9;yqFh0lvqHJdsOqOB%Q{Wrvh z5f$GCw!Tn$NP0c1AjmBZ)St5#LWi3WRVDokTBd50eZqFC`!PG*{`B59rJ)0jat(FV zxU<2;dk!y2sVn$FX<$(Nj$-^!ve-DNpAH;!Erydh^Kk0Mkz)r0USnDHSP|zPbI6u| zCE4R;2~Kr>b>-0WSyV8)6ry^C(A%yEA^1=eG*h<6`HwTt;f-ivsPWn z9Yhlg_1IKI{CH{CKRf7H#6I4=uYl+=N3mm7D7M7XzpfDrR z!I;4S}c$CEs2~&1*A+lH}sD z^C~itwn5af(8HDZ@$DBMOLjG4D*NCUtPX>y->%8sVNE?|i)QeZc>WBfg|~TfP>TWY z-I{{BgY~#Iy<1B6W;1e_cGNlgx*A8H3A$`}v>rEfu`4VDR-iT8X^%{-eub`_r~9h+ zAs*RA^h*u<6ywQb;VXoUDzQYiR-yT-clfwF7j?#&X2jcLSgItKi{gXsxmHX(MP?iK zY;3!qiqBZSxM`x(h9!o^@5~rxW7po*+IMtv@#m!6EF0B0I_`c}NTbWxgc>CW6HS7KP_6BB zde(|E>?t%frY!IZJ)1-IF3&3Po@UjTUn-igQGH`Cw?!3}zIsh;{Zs}zFrOVm*%?F3%khA=;>CfWB5ZQVM3U7y1VC}N{D@>+(F%DD+;F{(!ep+hwH1>HsKr5YjZB>HsI}UwS0%X3GwL#Da9wMaKR$_$lXJH*@-CeCJfaD3@k0%4&UC zVfvyG-`VbAnUP$NA6Y5bXkFHf6JHE35r@df$RXbUN1NsBY zY|XpT!p>xwiG95&=Z<(5O>{r9@;WWExgZloMZ94#nkvLE^j~cbb*;pQDx)@!`-;Nl z2$jk{y)GO;dwh4L@Lm)Y@=(z#x)pEHDSQsytyqm=e@~)s3l5$ez8zLuhJ9wN4MYW- zvB9Cv?qe4^Ft6pOoUZ;xY}MEF#GJYnPxzG|Ici3Nb0N>Zgmr#GhD;Be*v7jsb?{kd z+7%7BHmIIWTfG&>o!nAgwW3A;G1uDO+2fmdzOY~C$7ghke!`od?M zh#guj1k*}7aL(tAb;_%&aZu|9(!$1Wd}-95o4z&&SBh=pIFG7u;Am+&$F3@TmwmN} zoO3(&I#bY_Pt$;Dvty!v>l(mZ&p)}_phC>Y(;JpK^$ZDm3v@3$E5vTOU(#|Ly5 zufI!Y`B;GuK0(j9NIf{^Wz@us=pdF)2$N9sdWT)!kMC=Z>ciK>4?1e!9l#Sv-aB<` zHa@yWddJf*@(`wbcjq662C-txtX_6gKc=yHhyBz^?v6dK3|u!rIxxE-}Bn5oA9g+N9P`M$R6v$<=XQ0>2r#ZcxrP`wR1m) ztLL8^E$hbA(&~@SQMz%$P2;@1@@3e7`v&bk`5|mf$=KUIGKeP|yKlqbAhs=7b~FA= zHIAn^R$u4q#KX@&oV_+WfQ{D;R29Vwfg6>(p%;BAX1eR_WX;fvCvU&7h!Y`rRyNmf z{M3erlj?^LcsAi}XJohZJsiL-f2`V>xw8w;*S=a_a;^oRm(7!G@gBq-#}1oq= zSXF@)WFpKy7EGY%h(P}mZh2U@J-FCpP#A7M%_-68k%QH~D#w~mh=Ccaf?MbwRgm8w zvu(rPF(gQOx6tpU2DgOXCl0-kfP;Q@-g2~(5G4>8xqj6I3R+ps-*cD;<|6*E-Lg^w z))j7jZDYm>Tjl1nT-!+S=PACkf4uo$Pw{#tU%wtSsK93n-!e|!=K!}bu0gdRAz;5- zaH#xUH5PoePy90)N6Otco5XhM!2W>VC!I#?u9stW zK9x5M57i;?qD}I?cx`C0jUmfO&Z5fF+Gtmu1@yRy(o*2c0L4YYbr)DQVO!FzQ19QA z!Io{?33tXU{H5`hyHkt=NUZ4Y37gh}GkQkV-n64=LPfGEe5(w!E4@Db(p?Y+N4Z)r z&NSl{Ji?O9@6|y&HII$bEeA|@`R<35$w0K+j*Q{ZUUXPY(QqfV9Ar7&ATx}s1C_@1 z#Syz}kzz0|yxjNEzB=5g$KDV*#tlg&qc>I%Sd-&zG~Rf0`-#?GP{Yg*mouM~iuqT4$NunuGy># zfhyC+Eo+saI#%YjLAVOMj!^!TB+CcOPIi9m8y`b122?9K9J%2_M}VNCDW*L?qAZ~( z1sRW@fU`LttcWVYs=LHN(YM&o-!2ucYIY2XCr*8a^1HU*mJx$C_shM8>7qbI3fz5O zG80Qpm<;dr5eLr-zbwmmX6V;4p|2lpz(LHi{%JcnL7qOFt-zTJ%1ifO>MoE0Nh;Pt zpH@kDu5^*A@gy^x+pFLGdx|LZlpeq7Gs+EZslBhWy~N<|`CasFzIoVXce3G|vtq<5 z_t@9UG>x>1w#JPTr~lQ~pPnzQE5mCxkyX=(TcFEBca*C>Q$VLr$ldS}HK>Ve1%{sC zf(3m4wB8*a*y&vt$IwO%HRoOw1sBu8lhpmw^-5I0v@eW$aC9Dtk6Nu?F~tBHVV?Gh zG5oM$|9)4{-~;ZdmwVQq+Ygf6`Vr5J*~I-%+z_!wvNo254&LzZ$PMhJg^YU) zA0KQIfg@V>_R0(Nur|2d`ovdyIDa-{`p{((_}jWw=ywW$-jpBJJeMd;ik7^@E?l6W zncj9lwgSsl*kArcCk{NhHW^QazoLBxG7XpHmw__pqcGFx86?#Bs77T*1o97m9jbMZ zfY-&<=pNML0J-Ju!HyMJn$yutvq=;b?>=_qV6MP%i7|KBvq-SlygR%vnF7jRI41|f zxnVnB@?fw&52)RwdKi6+1E%?Yzp&j}1X6gjchMZ-hQpC|W|2eOP%CxVVtu#}R4diC zS;nx!Y@31ShnP$h`b_tWTTmw|xx*2nMBJ`uA2!yS8y-g?W2rIA_lbaKa>D*+c~r2) z_suxvHZv$q@?Fho7J;}m*J+%)dBNKGxUIRw6e>HBy?N`1C`k3XbH-SULEm!6bhA5D zaI*gB5ldQ8@Y}i9_|OL8*Cy-L^xp-Z;)P2(tig%=5O|4p^CseH`bP+xR?$`BwkhFZ zmg|0LDAG`y@=>D!>jQO+S3}#-z&5Fo@mx``c4EkRL*fLJ`XUA2W6NP}K!WAuVasmCoh>R3uc{N_t+(ksQmqiA4igXpm^6xmof(RlL$t&PdvPx1W^c@<5Ve+^B6Y8`uibFl9fYhm-d=M=;tvLPn*l zc#0)Cz|QE++O_f&aA(@JZT0~>Y#tPJBP&lKQ~HzRGa+(N_j^{+oeoYoBdZzY)>Vqy zUTH71`*A{DxPs03cH&mWUz(Omiy3@B=G~B!C4ub6{&3qgdNAaF=RV)nfc#ok3BDa; zhm?FY!u`G)1wM}pwSV4tUXEMJxrC}`cwwiSF@l%17S3O(Upa!s_vx+6J7}y^K$O z${*op8C#>B$Ix47?MmAh4 zUUW~B;0Ed9-YrxkXnN=ALu<>a!R&F}j>xUKcteGpBHMv^v_X(Hp)X(##fXGQ*b={^ zJUX2ipHhE8r$ypk3tXH+MHW}J=N~R0*SV%uPW5CEzqMe+1k@nbU+tJa#0D>~Y;Bgy zXNB$y(Jy4Kvx6I@d+fbnIle0(ZEt!uBQ$iO^3X%luJupKZ;U z=e@vwbq{)|zbm|H58DVj-5yZvAe>G-_s=?Ijx-@_ZEc_9(oJ|wN49#NK&Lx2=>kdRUjziX9hgMBiY%k)eA01ekd%nviyrj5Qas(U6HS==T)Y3N)h+nVy3U~4 z17DEYld0H?-9!>xQtpaQRNo}Vs~>%ekS5_y-|C()4z zt5NEU#HrLG&ZM776^!@HmZzMpMRD_&c;{aC>`|%= zB-czkODdg4yZn#qp}`w?|IzEkVIfoKz}fMxE59`$weveJIlF&CX;s_s33o*ZYFuYN zQ8M#rHN^gq*lxrFZ`sxs{;*_ev=HyoJOYIy60{OH{+&>Ml0nFJt*5& zm~qFtEEM1#ViD3=gQ^bryp=rNg;#7!AMHIN2epocO{yWY$lX@A{jPlzmNjmpw#jP4 zBga`{GWDwQ^2nLv&zNS=r(?b@q3frSLZy*I4`l{1Y!g?QS>A{@QC=$zo44Yzy53Wb zZnTiYD9I!$!3o~=ZQ;|9j?D5~yu%NDMU~s_H+Oaw;P|zBsFfKCu@9}r#$_B0xM4!0 z`!lgmgq+}b_(`h_TTVXMTNc}l{Yp<39Oj%v^BrBStXrG0`(O(;Sw#&i&DoDel8Mu5 zeVp=50vCU{xXe6K5PHskR7w%(MN~8+qA!&d;dIFFJba!iU>_QwU$~H3C@gF*s3!-P3u7=9&iYiC=$_tivwf{*(>E(c5LhA{jw5}(x>WV zR;B@`nm=B9*wKT=IB#tZkfA`LdtoBodIgY)V^J&fslpw}N4@ATszc1>`G00gB}#4sk=WVoD-O-r^NQM5&s_Qtx8#ywSe9S?pC8$>_Ge+ z)ckx5OK;`j2s~6vWTl^5!OUYlVsnq!SsMLNbp!Oq5X#Di_eko&Ex^^OL zEd4|OYxUCnC#si@{y(aPc#J9fL{~`sLOd_2{!HGc{xjJ?O=(*G1MyCj5{7=HbL@Yl z2}@z7%zi!&zV7yZj(eOv9VkpB3R#N8$L`_m;=j>_@^w9GQ@q0)c3sK1YQVeANzxR(qO@Mejc>JY} zr5S{NGWRpZLz4ZI&!43vensLHOEQI;MBLjfDiY77TT~Qop7K~K%1V;-OX9W1A3py+ z`0q}C=kXqPn0X^uVNbZN=(F)Xd4MW=r+{ncsU?Ztsse{;&y zmr6y|=K;m2XW7a5n`ilOTW}<4!YU;}BqO!a9xp_E=>gYIYs5S<`` yYCAh>sXJ;ox;SV%YH04(ady$rP*>AXS9jFYl;igxw&lNkNmO(s(t6^R^#239QVC4} diff --git a/tests/gentropy/method/test_colocalisation_method.py b/tests/gentropy/method/test_colocalisation_method.py index f90d54b3f..37613354c 100644 --- a/tests/gentropy/method/test_colocalisation_method.py +++ b/tests/gentropy/method/test_colocalisation_method.py @@ -2,11 +2,14 @@ from __future__ import annotations +from typing import Any + +import pytest from gentropy.dataset.colocalisation import Colocalisation from gentropy.dataset.study_locus_overlap import StudyLocusOverlap from gentropy.method.colocalisation import Coloc, ECaviar +from pandas.testing import assert_frame_equal from pyspark.sql import SparkSession -from pyspark.sql import functions as f def test_coloc(mock_study_locus_overlap: StudyLocusOverlap) -> None: @@ -14,165 +17,91 @@ def test_coloc(mock_study_locus_overlap: StudyLocusOverlap) -> None: assert isinstance(Coloc.colocalise(mock_study_locus_overlap), Colocalisation) -def test_coloc_colocalise( - spark: SparkSession, - threshold: float = 1e-5, -) -> None: - """Compare COLOC results with R implementation, using provided sample dataset from R package (StudyLocusOverlap).""" - test_overlap_df = spark.read.parquet( - "tests/gentropy/data_samples/coloc_test_data.snappy.parquet", header=True - ) - test_overlap = StudyLocusOverlap(test_overlap_df, StudyLocusOverlap.get_schema()) - test_result = Coloc.colocalise(test_overlap) - - expected = spark.createDataFrame( - [ - { - "h0": 1.3769995397857477e-18, - "h1": 2.937336451601565e-10, - "h2": 8.593226431647826e-12, - "h3": 8.338916748775843e-4, - "h4": 0.9991661080227981, - } - ] - ) - difference = test_result.df.select("h0", "h1", "h2", "h3", "h4").subtract(expected) - for col in difference.columns: - assert difference.filter(f.abs(f.col(col)) > threshold).count() == 0 - - -def test_single_snp_coloc( - spark: SparkSession, - threshold: float = 1e-5, -) -> None: - """Test edge case of coloc where only one causal SNP is present in the StudyLocusOverlap.""" - test_overlap_df = spark.createDataFrame( - [ - { - "leftStudyLocusId": 1, - "rightStudyLocusId": 2, - "chromosome": "1", - "tagVariantId": "snp", - "left_logBF": 10.3, - "right_logBF": 10.5, - } - ] - ) - test_overlap = StudyLocusOverlap( - test_overlap_df.select( - "leftStudyLocusId", - "rightStudyLocusId", - "chromosome", - "tagVariantId", - f.struct(f.col("left_logBF"), f.col("right_logBF")).alias("statistics"), +@pytest.mark.parametrize( + "observed_data, expected_data", + [ + # associations with a single overlapping SNP + ( + # observed overlap + [ + { + "leftStudyLocusId": 1, + "rightStudyLocusId": 2, + "chromosome": "1", + "tagVariantId": "snp", + "statistics": {"left_logBF": 10.3, "right_logBF": 10.5}, + }, + ], + # expected coloc + [ + { + "h0": 9.254841951638903e-5, + "h1": 2.7517068829182966e-4, + "h2": 3.3609423764447284e-4, + "h3": 9.254841952564387e-13, + "h4": 0.9992961866536217, + }, + ], ), - StudyLocusOverlap.get_schema(), - ) - test_result = Coloc.colocalise(test_overlap) - - expected = spark.createDataFrame( - [ - { - "h0": 9.254841951638903e-5, - "h1": 2.7517068829182966e-4, - "h2": 3.3609423764447284e-4, - "h3": 9.254841952564387e-13, - "h4": 0.9992961866536217, - } - ] - ) - difference = test_result.df.select("h0", "h1", "h2", "h3", "h4").subtract(expected) - for col in difference.columns: - assert difference.filter(f.abs(f.col(col)) > threshold).count() == 0 - - -def test_single_snp_coloc_one_negative( + # associations with multiple overlapping SNPs + ( + # observed overlap + [ + { + "leftStudyLocusId": 1, + "rightStudyLocusId": 2, + "chromosome": "1", + "tagVariantId": "snp1", + "statistics": {"left_logBF": 10.3, "right_logBF": 10.5}, + }, + { + "leftStudyLocusId": 1, + "rightStudyLocusId": 2, + "chromosome": "1", + "tagVariantId": "snp2", + "statistics": {"left_logBF": 10.3, "right_logBF": 10.5}, + }, + ], + # expected coloc + [ + { + "h0": 4.6230151407950416e-5, + "h1": 2.749086942648107e-4, + "h2": 3.357742374172504e-4, + "h3": 9.983447421747411e-4, + "h4": 0.9983447421747356, + }, + ], + ), + ], +) +def test_coloc_semantic( spark: SparkSession, - threshold: float = 1e-5, + observed_data: list[Any], + expected_data: list[Any], ) -> None: - """Test edge case of coloc where only one causal SNP is present (On one side!) in the StudyLocusOverlap.""" - test_overlap_df = spark.createDataFrame( - [ - { - "leftStudyLocusId": 1, - "rightStudyLocusId": 2, - "chromosome": "1", - "tagVariantId": "snp", - "left_logBF": 18.3, - "right_logBF": 0.01, - } - ] + """Test our COLOC with the implementation in R.""" + observed_overlap = StudyLocusOverlap( + _df=spark.createDataFrame(observed_data, schema=StudyLocusOverlap.get_schema()), + _schema=StudyLocusOverlap.get_schema(), ) - test_overlap = StudyLocusOverlap( - test_overlap_df.select( - "leftStudyLocusId", - "rightStudyLocusId", - "chromosome", - "tagVariantId", - f.struct(f.col("left_logBF"), f.col("right_logBF")).alias("statistics"), - ), - StudyLocusOverlap.get_schema(), + observed_coloc_pdf = ( + Coloc.colocalise(observed_overlap) + .df.select("h0", "h1", "h2", "h3", "h4") + .toPandas() ) - test_result = Coloc.colocalise(test_overlap) - test_result.df.show(1, False) - expected = spark.createDataFrame( - [ - { - "h0": 1.0246538505087709e-4, - "h1": 0.9081680002273896, - "h2": 1.0349517929098209e-8, - "h3": 1.0246538506112363e-12, - "h4": 0.09172952403701702, - } - ] + expected_coloc_pdf = ( + spark.createDataFrame(expected_data) + .select("h0", "h1", "h2", "h3", "h4") + .toPandas() ) - difference = test_result.df.select("h0", "h1", "h2", "h3", "h4").subtract(expected) - for col in difference.columns: - assert difference.filter(f.abs(f.col(col)) > threshold).count() == 0 - -def test_single_snp_coloc_both_negative( - spark: SparkSession, - threshold: float = 1e-5, -) -> None: - """Test edge case of coloc where only one non-causal SNP overlaps in the StudyLocusOverlap.""" - test_overlap_df = spark.createDataFrame( - [ - { - "leftStudyLocusId": 1, - "rightStudyLocusId": 2, - "chromosome": "1", - "tagVariantId": "snp", - "left_logBF": 0.03, - "right_logBF": 0.01, - } - ] - ) - test_overlap = StudyLocusOverlap( - test_overlap_df.select( - "leftStudyLocusId", - "rightStudyLocusId", - "chromosome", - "tagVariantId", - f.struct(f.col("left_logBF"), f.col("right_logBF")).alias("statistics"), - ), - StudyLocusOverlap.get_schema(), - ) - test_result = Coloc.colocalise(test_overlap) - expected = spark.createDataFrame( - [ - { - "h0": 0.9997855774090624, - "h1": 1.0302335812225042e-4, - "h2": 1.0098335895103664e-4, - "h3": 9.9978557750904e-9, - "h4": 1.0405876008495098e-5, - } - ] + assert_frame_equal( + observed_coloc_pdf, + expected_coloc_pdf, + check_exact=False, + check_dtype=True, ) - difference = test_result.df.select("h0", "h1", "h2", "h3", "h4").subtract(expected) - for col in difference.columns: - assert difference.filter(f.abs(f.col(col)) > threshold).count() == 0 def test_ecaviar(mock_study_locus_overlap: StudyLocusOverlap) -> None: