-
Notifications
You must be signed in to change notification settings - Fork 4
/
Main.hs
617 lines (511 loc) · 30.4 KB
/
Main.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
{-# LANGUAGE RecordWildCards, TupleSections, NamedFieldPuns #-}
{-# OPTIONS_GHC -fwarn-unused-imports -fwarn-incomplete-patterns #-}
-- | The MAIN module, of course. This is the script that deals with
-- command line options and calling into the heart of the beast.
module Main where
import Data.List (sort, intersperse, foldl')
import Data.List.Split (splitOneOf)
import qualified Data.ByteString.Char8 as B
import qualified Data.Map as M
import qualified Data.Set as S
import Control.Monad
import Control.Concurrent (readChan)
import System.Environment (getArgs, withArgs)
import System.Console.GetOpt (OptDescr(Option), ArgDescr(..), ArgOrder(..), usageInfo, getOpt)
import System.Exit (exitSuccess)
import System.IO (stdout)
import Control.Exception (catch, SomeException)
import qualified Data.Vector as V
import Test.HUnit as HU
import Bio.Phylogeny.PhyBin.CoreTypes
import Bio.Phylogeny.PhyBin (driver, normalize, annotateWLabLists,
unitTests, acquireTreeFiles, deAnnotate,
retrieveHighlights, matchAnyHighlight)
import Bio.Phylogeny.PhyBin.Parser (parseNewick, parseNewickFiles, unitTests)
import Bio.Phylogeny.PhyBin.Visualize (viewNewickTree) -- dotNewickTree_debug
import Bio.Phylogeny.PhyBin.RFDistance
import Bio.Phylogeny.PhyBin.PreProcessor
import qualified Data.Clustering.Hierarchical as C
import Data.Version (showVersion)
import Paths_phybin (version)
----------------------------------------------------------------------------------------------------
-- MAIN script: Read command line options and call the program.
----------------------------------------------------------------------------------------------------
-- Note: ORDER is important here, we process options in this order:
data Flag
= Verbose
| Version
| Output String
| SetNumTaxa Int
| BranchThresh Double
| BootStrapThresh Int
| PruneTaxa [String]
| NullOpt
| Graph | Draw
| Force
| View
| TabDelimited Int Int
| Highlight FilePath
| ShowTreesInDendro | ShowInterior
| RFMode WhichRFMode
| SelfTest
| RFMatrix | LineSetDiffMode
| PrintNorms | PrintReg | PrintConsensus | PrintMatching
| Cluster C.Linkage
| BinningMode
| EditDistThresh Int
| DendogramOnly
| NameCutoff String
| NamePrefix Int
| NameTable String -- Must come after Cutoff/Prefix
deriving (Show, Eq, Ord) -- ORD is really used.
parseTabDelim :: String -> Flag
parseTabDelim _str =
TabDelimited 8 9
options :: [OptDescr Flag]
options =
[ Option ['v'] ["verbose"] (NoArg Verbose) "print WARNINGS and other information (recommended at first)"
, Option ['V'] ["version"] (NoArg Version) "show version number"
, Option ['o'] ["output"] (ReqArg Output "DIR") "set directory to contain all output files (default \"./phybin_out/\")"
, Option [] ["selftest"] (NoArg SelfTest) "run internal unit tests"
{- -- TODO: FIXME: IMPLEMENT THIS:
, Option [] [] (NoArg NullOpt) ""
, Option ['t'] ["tabbed"] (ReqArg parseTabDelim "NUM1:NUM2")$ "assume the input is a ab-delimited file with gene names \n"++
"in column NUM1 and Newick trees in NUM2"
-}
, Option [] [] (NoArg NullOpt) ""
, Option [] [] (NoArg$ error "internal problem") "----------------------------- Clustering Options ------------------------------"
, Option [] ["bin"] (NoArg BinningMode)$ "Use simple binning, the cheapest form of 'clustering'"
, Option [] ["single"] (NoArg$ Cluster C.SingleLinkage) $ "Use single-linkage clustering (nearest neighbor)"
, Option [] ["complete"] (NoArg$ Cluster C.CompleteLinkage)$ "Use complete-linkage clustering (furthest neighbor)"
, Option [] ["UPGMA"] (NoArg$ Cluster C.UPGMA) $ "Use Unweighted Pair Group Method (average linkage) - DEFAULT mode"
, Option [] ["editdist"] (ReqArg (EditDistThresh . read) "DIST")$
"Combine all clusters separated by DIST or less. Report a flat list of clusters.\n"++
"Irrespective of whether this is activated, a hierarchical clustering (dendogram.pdf) is produced."
-- , Option [] ["dendogram"] (NoArg DendogramOnly)$ "Report a hierarchical clustering (default)"
, Option [] [] (NoArg NullOpt) ""
, Option [] [] (NoArg$ error "internal problem") "--- Select Robinson-Foulds (symmetric difference) distance algorithm: ---"
, Option [] ["hashrf"] (NoArg$ RFMode HashRF)
"(default) use a variant of the HashRF algorithm for the distance matrix"
, Option [] ["tolerant"] (NoArg$ RFMode TolerantNaive)
"use a slower, modified RF metric that tolerates missing taxa"
, Option [] [] (NoArg NullOpt) ""
, Option [] [] (NoArg$ error "internal problem") "----------------------------- Visualization --------------------------------"
, Option ['g'] ["graphbins"] (NoArg Graph) "use graphviz to produce .dot and .pdf output files"
-- TODO: Produce the consensus tree as well as the individual trees.
, Option ['d'] ["drawbins"] (NoArg Draw) "like -g, but open GUI windows to show each bin's tree"
, Option ['w'] ["view"] (NoArg View)$ "for convenience, \"view mode\" simply displays input Newick files without binning"
, Option [] ["showtrees"] (NoArg ShowTreesInDendro) "Print (textual) tree topology inside the nodes of the dendrogram"
, Option [] ["highlight"] (ReqArg Highlight "FILE") $
"Highlight nodes in the tree-of-trees (dendrogram) consistent with the.\n"++
"given tree file. Multiple highlights are permitted and use different colors."
-- "Highlights may be SUBTREES that use any subset of the taxa."
, Option [] ["interior"] (NoArg ShowInterior)
"Show the consensus trees for interior nodes in the dendogram, rather than just points."
, Option [] [] (NoArg NullOpt) ""
, Option [] [] (NoArg$ error "internal problem") "---------------------------- Tree pre-processing -----------------------------"
, Option [] ["prune"] (ReqArg (PruneTaxa . parseTaxa) "TAXA")
("Prune trees to only TAXA before doing anything else.\n"++
"Space and comma separated lists of taxa are allowed. Use quotes.")
, Option ['b'] ["minbranchlen"] (ReqArg (BranchThresh . read) "LEN") "collapse branches less than LEN"
, Option [] ["minbootstrap"] (ReqArg (BootStrapThresh . read) "INT")
"collapse branches with bootstrap values less than INT"
-- , Option ['n'] ["numtaxa"] (ReqArg (SetNumTaxa . read) "NUM")
-- "expect NUM taxa for this dataset, demand trees have all of them"
, Option [] [] (NoArg NullOpt) ""
, Option [] [] (NoArg$ error "internal problem") "--------------------------- Extracting taxa names ----------------------------"
-- , Option ['n'] ["numtaxa"] (ReqArg (SetNumTaxa . read) "NUM") "expect NUM taxa for this dataset (otherwise it will guess)"
-- TODO, FIXME: The "guessing" part doesn't actually work yet -- implement it!!
-- What's a good algorithm? Insist they all have the same number? Take the mode?
{- -- TODO: FIXME: IMPLEMENT THIS:
, Option ['f'] ["force"] (NoArg Force) "force phybin to consume and bin trees with different numbers of taxa"
-}
, Option [] [] (NoArg NullOpt) ""
, Option ['p'] ["nameprefix"] (ReqArg (NamePrefix . read) "NUM") $
"Leaf names in the input Newick trees can be gene names, not taxa.\n"++
"Then it is typical to extract taxa names from genes. This option extracts\n"++
"a prefix of NUM characters to serve as the taxa name."
, Option [] [] (NoArg NullOpt) ""
, Option ['s'] ["namesep"] (ReqArg NameCutoff "STR") $ --"\n"++
"An alternative to --nameprefix, STR provides a set of delimeter characters,\n"++
"for example '-' or '0123456789'. The taxa name is then a variable-length\n"++
"prefix of each gene name up to but not including any character in STR."
, Option [] [] (NoArg NullOpt) ""
, Option ['m'] ["namemap"] (ReqArg NameTable "FILE") $
"Even once prefixes are extracted it may be necessary to use a lookup table\n"++
"to compute taxa names, e.g. if multiple genes/plasmids map onto one taxa.\n"++
"This option specifies a text file with find/replace entries of the form\n"++
"\"<string> <taxaname>\", which are applied AFTER -s and -p."
, Option [] [] (NoArg NullOpt) ""
, Option [] [] (NoArg$ error "internal problem") "--------------------------- Utility Modes ----------------------------"
, Option [] ["rfdist"] (NoArg RFMatrix) "print a Robinson Foulds distance matrix for the input trees"
, Option [] ["setdiff"] (NoArg LineSetDiffMode) "for convenience, print the set difference between cluster*.txt files"
, Option [] ["print"] (NoArg PrintReg) "simply print out a concise form of each input tree"
, Option [] ["printnorms"] (NoArg PrintNorms) "simply print out a concise and NORMALIZED form of each input tree"
, Option [] ["consensus"] (NoArg PrintConsensus) "print a strict consensus tree for the inputs, then exit"
, Option [] ["matching"] (NoArg PrintMatching) "print a list of tree names that match any --highlight argument"
]
-- | Parse the list of taxa provided on the command line.
parseTaxa :: String -> [String]
parseTaxa str = filter (not . null) $
splitOneOf ",; " str
usage :: String
usage = "\nUsage: phybin [OPTION...] files or directories...\n\n"++
-- "MODE must be one of 'bin' or 'cluster'.\n\n" ++
"PhyBin takes Newick tree files as input. Paths of Newick files can\n"++
"be passed directly on the command line. Or, if directories are provided,\n"++
"all files in those directories will be read. Taxa are named based on the files\n"++
"containing them. If a file contains multiple trees, all are read by phybin, and\n"++
"the taxa name then includes a suffix indicating the position in the file:\n"++
" e.g. FILENAME_0, FILENAME_1, etc.\n"++
"\n"++
-- "In binning mode, Phybin output contains, at minimum, files of the form binXX_YY.tr,\n"++
-- "each containing a list of input file paths that fall into that bin. XX signifies\n"++
-- "the rank of the bin (biggest first), and YY is how many trees it contains.\n"++
-- "\n"++
"When clustering trees, Phybin computes a complete all-to-all Robinson-Foulds distance matrix.\n"++
"If a threshold distance (tree edit distance) is given, then a flat set of clusters\n"++
"will be produced in files clusterXX_YY.tr. Otherwise it produces a full dendogram.\n"++
"\n"++
"Binning mode provides an especially quick-and-dirty form of clustering.\n"++
"When running with the --bin option, only exactly equal trees are put in the same cluster.\n"++
"Tree pre-processing still applies, however: for example collapsing short branches.\n"++
"\n"++
"USAGE NOTES: \n"++
" * Currently phybin ignores input trees with the wrong number of taxa.\n"++
" * If given a directory as input phybin will assume all contained files are Newick trees.\n\n"++
"\nOptions include:\n"
defaultErr :: [String] -> t
defaultErr errs = error $ "ERROR!\n" ++ (concat errs ++ usageInfo usage options)
--------------------------------------------------------------------------------
main :: IO ()
main =
do argv <- getArgs
(opts,files) <-
case getOpt Permute options argv of
(o,n,[] ) -> return (o,n)
(_,_,errs) -> defaultErr errs
all_inputs <- acquireTreeFiles files
let process_opt cfg opt = case opt of
NullOpt -> return cfg
Verbose -> return cfg { verbose= True }
Version -> do putStrLn$ "phybin version "++ showVersion version; exitSuccess
SelfTest -> do _ <- runTestTT allUnitTests; exitSuccess
RFMatrix -> return cfg { print_rfmatrix= True }
ShowTreesInDendro -> return cfg { show_trees_in_dendro = True }
ShowInterior -> return cfg { show_interior_consensus = True }
Highlight path -> return cfg { highlights = path : highlights cfg }
LineSetDiffMode -> do
bss <- mapM B.readFile files
case map (S.fromList . B.lines) bss of
[set1,set2] -> do let [f1,f2] = files
let diff = S.difference set1 set2
putStrLn$" Found "++show(S.size diff)++" lines occuring in "++f1++" but not "++f2
mapM_ B.putStrLn $ S.toList diff
oth -> error $"Line set difference mode expects two files as input, got "++show(length oth)
exitSuccess
PrintNorms -> return cfg
PrintReg -> return cfg
PrintConsensus -> return cfg
PrintMatching -> return cfg
Cluster lnk -> return cfg { clust_mode = ClusterThem lnk }
RFMode TolerantNaive
| Expected _ <- num_taxa cfg -> error "should not set --numtaxa AND --tolerant"
| otherwise -> return cfg { rfmode = TolerantNaive }
RFMode HashRF -> return cfg { rfmode = HashRF }
BinningMode -> return cfg { clust_mode = BinThem }
EditDistThresh n -> return cfg { dist_thresh = Just n }
DendogramOnly -> return cfg { dist_thresh = Nothing }
Output s -> return cfg { output_dir= s }
SetNumTaxa n
| rfmode cfg == TolerantNaive -> error "should not set --tolerant AND --numtaxa"
| otherwise -> return cfg { num_taxa= Expected n }
BranchThresh n -> return cfg { branch_collapse_thresh = Just n }
BootStrapThresh n -> return cfg { bootstrap_collapse_thresh = Just n }
PruneTaxa ls -> return cfg { preprune_labels = Just ls }
Graph -> return cfg { do_graph= True }
Draw -> return cfg { do_draw = True }
View -> return cfg -- Handled below
TabDelimited _ _ -> error "tabbed option not handled yet"
Force -> error "force option not handled yet"
NameCutoff str -> let chopper = takeWhile (not . flip S.member (S.fromList str))
in return cfg { name_hack = chopper . name_hack cfg }
NamePrefix n -> return cfg { name_hack = (take n) . name_hack cfg }
-- This should always be after cutoff/prefix:
NameTable file -> do reader <- name_table_reader file
return cfg { name_hack = reader . name_hack cfg }
config <- foldM process_opt default_phybin_config{ inputs= all_inputs }
(sort opts) -- NOTE: options processed in sorted order.
when (null files) $ do
defaultErr ["No file arguments!\n"]
--------------------------------------------------------------------
----- Handle SPECIAL modes before going into normal processing -----
--------------------------------------------------------------------
-- This mode kicks in AFTER config options are processed.
when (elem PrintReg opts) $ do
(_,fts) <- parseNewickFiles (name_hack config) all_inputs
forM_ fts $ \ ft@(FullTree name _ _) -> do
putStrLn $ "Tree "++show name
putStrLn$ show$ displayDefaultTree ft
exitSuccess
------------------------------------------------------------
when (elem PrintNorms opts) $ do
(_,fts) <- parseNewickFiles (name_hack config) all_inputs
forM_ fts $ \ ft@(FullTree name _ _) -> do
putStrLn $ "Tree , NORMALIZED:"++show name
putStrLn$ show$ displayDefaultTree$ deAnnotate $
liftFT (normalize . annotateWLabLists) ft
exitSuccess
------------------------------------------------------------
when (elem PrintConsensus opts) $
case (num_taxa config) of
Expected numtax -> do
(_,fts) <- parseNewickFiles (name_hack config) all_inputs
putStrLn $ "Strict Consensus Tree of "++show (length fts)++" trees:"
when (null fts) $ error "No trees provided!"
let ctree = consensusTree numtax (map nwtree fts)
FullTree{labelTable} = head fts
print$ displayStrippedTree$ FullTree "" labelTable ctree
exitSuccess
_ -> error "--consensus: cannot compute consensus when number of taxa is unknown or variable"
------------------------------------------------------------
when (elem PrintMatching opts) $ do
(labs,fts) <- parseNewickFiles (name_hack config) all_inputs
when (null$ highlights config) $ error "No --highlight given, so --matching makes no sense. Matching what?"
htrs <- retrieveHighlights (name_hack config) labs (highlights config)
let isMatch = matchAnyHighlight htrs
forM_ fts $ \ FullTree{treename,nwtree} ->
when (isMatch$ fmap (const()) nwtree) $
putStrLn treename
exitSuccess
------------------------------------------------------------
when (View `elem` opts) $ do
view_graphs config
exitSuccess
------------------------------------------------------------
-- Otherwise do the main, normal thing:
driver config
-- | Completely optional visualization step. If this fails, we don't
-- sweat it.
view_graphs :: PhyBinConfig -> IO ()
view_graphs PBC{..} = catch act hndl
where
hndl :: SomeException -> IO ()
hndl exn = putStrLn$ "Warning: Viewing graph failed, ignoring. Error was: "++show exn
act = do chans <- forM inputs $ \ file -> do
putStrLn$ "Drawing "++ file ++"...\n"
str <- B.readFile file
putStrLn$ "Parsed: " ++ (B.unpack str)
let (tbl,tr) = parseNewick M.empty name_hack file str
(chan, _tr) <- viewNewickTree file (FullTree "" tbl (annotateWLabLists (head tr)))
return chan
forM_ chans readChan
return ()
--------------------------------------------------------------------------------
-- Every dataset it seems needs a new hack on the names!
name_table_reader :: String -> IO (String -> String)
name_table_reader file =
do contents <- readFile file
let mp = M.fromList $
map (\ls -> case ls of
[a,b] -> (a,b)
_ -> error$ "Each line of "++file++" must contain two whitespace free strings: "++ unwords ls) $
filter (not . null) $
map tokenize $
lines contents
return$
\ name_to_hack ->
case M.lookup name_to_hack mp of -- Could use a trie
Just x -> x
Nothing -> name_to_hack
where
tokenize :: String -> [String]
tokenize line =
case words line of
[] -> []
[one] -> error$"Name table contained bad line: "++ show one
[one,two] -> [one,two]
(one:rest) -> [one, unwords rest]
temp :: IO ()
temp = driver default_phybin_config{ num_taxa= Expected 7, inputs=["../datasets/test.tr"] }
--------------------------------------------------------------------------------
-- Aggregated Unit Tests
--------------------------------------------------------------------------------
allUnitTests :: Test
-- allUnitTests = unitTests ++
allUnitTests = test
[ Bio.Phylogeny.PhyBin.unitTests
, Bio.Phylogeny.PhyBin.Parser.unitTests
, "norm/Bip1" ~: (testNorm prob1)
, "bipTreeConversion" ~: testBipConversion
, "t3" ~: t3_consensusTest,
"t4" ~: t4_consensusTest,
"t5" ~: t5_consensusTest
]
-- [2013.07.23]
-- This was INCORRECTLY normalizing to:
-- ((1_, 2_), (7_, (18, 6_)), ((14, 3_), (19, (13, 5_))))
prob1 :: String
prob1 = "(5_, (19, ((3_, 14), ((2_, 1_), (7_, (6_, 18))))), 13);"
-- | Make sure that the normalized version of a tree yields the same bipartitions as
-- the unnormalized one.
testNorm :: String -> IO ()
testNorm str = do
let (labs,parseds) = parseNewick M.empty id "test" (B.pack str)
parsed = head parseds
normed = normalize $ annotateWLabLists parsed
bips1 = allBips parsed
bips2 = allBips normed
added = S.difference bips2 bips1
removed = S.difference bips1 bips2
-- dispBips bip = show$
-- map (map (labs M.!)) $
-- map IS.toList$ S.toList bip
unless (bips1 == bips2) $ do
putStrLn$ "Normalized this: "++show (displayDefaultTree $ FullTree "" labs parsed)
putStrLn$ "To this : "++show (displayDefaultTree $ deAnnotate $ FullTree "" labs normed)
error$ "Normalization added and removed these bipartitions, respectively:\n "
++concat (intersperse " " (map (dispBip labs) (S.toList added))) ++"\n "
++concat (intersperse " " (map (dispBip labs) (S.toList removed)))
-- 112 and 13
rftest :: IO ()
rftest = do
(_mp,[t1,t2]) <- parseNewickFiles (take 2) ["tests/13.tr", "tests/112.tr"]
putStrLn$ "Tree 13 : " ++ show (displayDefaultTree t1)
putStrLn$ "Tree 112 : "++ show (displayDefaultTree t2)
putStrLn$ "Tree 13 normed : "++ show (disp t1)
putStrLn$ "Tree 112 normed : "++ show (disp t2)
putStrLn$ "13 collapsed 0.02: " ++show (disp$ liftFT (collapseBranchLenThresh 0.02) t1)
putStrLn$ "112 collapsed 0.02: " ++show (disp$ liftFT (collapseBranchLenThresh 0.02) t2)
putStrLn$ "13 collapsed 0.03: " ++show (disp$ liftFT (collapseBranchLenThresh 0.03) t1)
putStrLn$ "112 collapsed 0.03: " ++show (disp$ liftFT (collapseBranchLenThresh 0.03) t2)
let (mat,_) = naiveDistMatrix [nwtree t1, nwtree t2]
printDistMat stdout mat
return ()
where
disp (FullTree nm labs tr) =
let collapsed :: AnnotatedTree
collapsed = normalize$ annotateWLabLists tr
in displayDefaultTree$ deAnnotate $ FullTree nm labs collapsed
-- | This test was done with --editdist 4 --complete
t3_consensusTest :: IO ()
t3_consensusTest = consensusTest "./tests/t3_consensus/cluster1_284_alltrees.tr"
"./tests/t3_consensus/cluster1_284_consensus.tr"
-- | This test was done with --editdist 0 --complete
t4_consensusTest :: IO ()
t4_consensusTest = consensusTest "./tests/t4_consensus/cluster1_16_alltrees.tr"
"./tests/t4_consensus/cluster1_16_consensus.tr"
-- | This test was done with --editdist 1 --complete
t5_consensusTest :: IO ()
t5_consensusTest = consensusTest "./tests/t5_consensus/cluster1_35_alltrees.tr"
"./tests/t5_consensus/cluster1_35_consensus.tr"
consensusTest :: String -> String -> IO ()
consensusTest alltrees consensus = do
(_,ctree:ftrees) <- parseNewickFiles id [consensus,alltrees]
let num_taxa = numLeaves (nwtree ctree)
plainTrs = map nwtree ftrees
eachbips = map allBips plainTrs
totalBips = foldl' S.union S.empty eachbips
intersectBips = foldl' S.intersection S.empty eachbips
FullTree _ labs _ = ctree
linesPrnt x = unlines (map ((" "++) . dispBip labs) $ S.toList x)
putStrLn$ "Bips in each: "++ show (map S.size eachbips)
putStrLn$ "Total bips in all: "++show (S.size totalBips)
putStrLn$ "Bips in common: "++ show (S.size intersectBips)
putStrLn$ "Bips of first member:\n" ++ linesPrnt (head eachbips)
putStrLn$ "Some bips in the union that are NOT in the intersection:\n" ++
linesPrnt (S.fromList$ take 20$ S.toList$ S.difference totalBips intersectBips)
let cbips = allBips $ nwtree ctree
putStrLn$ "ConsensusBips ("++show (S.size cbips)++"):\n"++linesPrnt cbips
putStrLn$"Things in the consensus that should NOT be:\n"++linesPrnt (S.difference cbips intersectBips)
putStrLn$"Things not in the consensus that SHOULD be:\n"++linesPrnt (S.difference intersectBips cbips)
putStrLn$ "Now recomputing consensus tree for "++show num_taxa++" taxa"
let ctree2 = consensusTree num_taxa plainTrs
cbips2 = allBips ctree2
putStrLn$ "Freshly recomputed consensusBips ("++show (S.size cbips2)++"):\n"++linesPrnt cbips2
HU.assertEqual "Consensus tree on disk should match computed one:"
cbips cbips2 -- (allBips$ fmap (const ()) $ nwtree ctree)
putStrLn " Partial distance matrix WITHIN this cluster:"
let (mat,_) = naiveDistMatrix (map nwtree ftrees)
printDistMat stdout (V.take 30 mat)
HU.assertBool "Consensus should only include bicbips2ps in the members" (S.isSubsetOf cbips totalBips)
HU.assertEqual "Consensus tree matches intersected bips" cbips intersectBips
return ()
testBipConversion :: IO ()
testBipConversion =
do (_,trs) <- allTestTrees
mapM_ testOne trs
putStrLn "All trees passed bip conversion."
where
testOne (FullTree{nwtree}) = do
let sz = numLeaves nwtree
bips = allBips nwtree
tr2 = bipsToTree sz bips
bips2 = allBips tr2
assertEqual "round trip bips->tree->bips" bips bips2
-- | Read in all test trees which we happen to have put in the repository for testing
-- purposes.
allTestTrees :: IO (LabelTable, [FullTree DefDecor])
allTestTrees =
parseNewickFiles id $
[ "./tests/t3_consensus/cluster1_284_alltrees.tr"
, "./tests/t3_consensus/cluster1_284_consensus.tr"
, "./tests/t4_consensus/cluster1_16_alltrees.tr"
, "./tests/t4_consensus/cluster1_16_consensus.tr"
, "./tests/t5_consensus/cluster1_35_alltrees.tr"
, "./tests/t5_consensus/cluster1_35_consensus.tr"
]
----------------------------------------------------------------------------------------------------
-- TODO: expose a command line argument for testing.
-- The below test exposed my normalization bug relating to "deriving Ord".
-- I need to transform it into one or more proper unit tests.
main_test :: IO ()
main_test =
withArgs ["-w","~/newton_and_newton_local/datasets/yersinia/yersinia_trees/111.dnd","-m","../datasets/yersinia/name_table_hack_yersinia.txt"]
main
-- [2013.07.22] Disabling for the new Label representation:
{-
pa :: NewickTree DefDecor
pa = set_dec (Nothing,1) $
NTInterior () [NTInterior () [NTLeaf () "RE",NTInterior () [NTLeaf () "SD",NTLeaf () "SM"]],NTInterior () [NTLeaf () "BB",NTLeaf () "BJ"],NTInterior () [NTLeaf () "MB",NTLeaf () "ML"]]
pb :: NewickTree DefDecor
pb = set_dec (Nothing,1) $
NTInterior () [NTInterior () [NTLeaf () "BB",NTLeaf () "BJ"],NTInterior () [NTLeaf () "MB",NTLeaf () "ML"],NTInterior () [NTLeaf () "RE",NTInterior () [NTLeaf () "SD",NTLeaf () "SM"]]]
ls1 :: [(String, NewickTree DefDecor)]
ls1 = [("a",pa),("b",pb)]
-- This is one:
num_binned :: Int
num_binned = M.size $ binthem ls1
a_ :: (String, NewickTree DefDecor)
a_ = ("980.dnd",
fmap (\x -> (Nothing,x)) $
NTInterior 0.0 [NTInterior 5.697e-2 [NTLeaf 3.95e-2 "SM",NTLeaf 5.977e-2 "SD"],NTLeaf 0.13143 "RE",NTInterior 0.13899 [NTInterior 9.019e-2 [NTLeaf 0.11856 "BB",NTLeaf 0.13592 "BJ"],NTInterior 0.13194 [NTLeaf 0.19456 "MB",NTLeaf 0.16603 "ML"]]])
b_ :: (String, NewickTree DefDecor)
b_ = ("999.dnd",
fmap (\x -> (Nothing,x)) $
NTInterior 0.0 [NTInterior 6.527e-2 [NTInterior 0.13734 [NTLeaf 2.975e-2 "SM",NTLeaf 3.002e-2 "SD"],NTLeaf 0.18443 "RE"],NTInterior 6.621e-2 [NTLeaf 0.16184 "MB",NTLeaf 0.15233 "ML"],NTInterior 0.23143 [NTLeaf 9.192e-2 "BB",NTLeaf 0.10125 "BJ"]])
-- But THIS is two: ack!
num2 :: Int
num2 = M.size $ binthem [a_,b_]
-- Here's the test that breaks things:
a_norm :: NewickTree StandardDecor
a_norm = normalize (annotateWLabLists$ snd a_)
b_norm_ :: NewickTree (Double, Int, [Label])
b_norm_ = NTInterior (0.0,7,["BB","BJ","MB","ML","RE","SD","SM"])
[NTInterior (0.23143,2,["BB","BJ"]) [NTLeaf (9.192e-2,1,["BB"]) "BB",NTLeaf (0.10125,1,["BJ"]) "BJ"],NTInterior (6.621e-2,2,["MB","ML"]) [NTLeaf (0.16184,1,["MB"]) "MB",NTLeaf (0.15233,1,["ML"]) "ML"],NTInterior (6.527e-2,3,["RE","SD","SM"]) [NTLeaf (0.18443,1,["RE"]) "RE",NTInterior (0.13734,2,["SD","SM"]) [NTLeaf (3.002e-2,1,["SD"]) "SD",NTLeaf (2.975e-2,1,["SM"]) "SM"]]]
b_norm :: NewickTree StandardDecor
b_norm = fmap (\ (bl,w,ls) -> StandardDecor bl Nothing w ls) b_norm_
d1 :: IO (Chan (), NewickTree StandardDecor)
d1 = viewNewickTree "" a_norm
d2 :: IO (Chan (), NewickTree StandardDecor)
d2 = viewNewickTree "" b_norm
d1_ :: IO ThreadId
d1_ = forkIO $ do runGraphvizCanvas Dot (dotNewickTree_debug "" a_norm) Xlib; return ()
d2_ :: IO ThreadId
d2_ = forkIO $ do runGraphvizCanvas Dot (dotNewickTree_debug "" b_norm) Xlib; return ()
-- | A of a tree with _____ weights attached to it:
withBootstrap :: String
withBootstrap = "((((A8F330_:0.01131438136322714984,(G0GWK2_:0.00568050636963043226,(Q92FV4_:0.00284163304504484121,((B0BVQ5_:0.00319487112504297311,A8GU65_:0.00000122123005994819)74:0.00279881991324161267,(C3PM27_:0.00560787769333294297,C4K2Z0_:0.00559642713265556899)15:0.00000122123005994819)4:0.00000122123005994819)56:0.00276851661606284868)60:0.00283144414216590342)76:0.00886304965525876697,(A8GQC0_:0.05449879836105625541,(A8F0B2_:0.04736199885985507840,Q4UJN9_:0.02648399728559588939)64:0.00905997055810744446)28:0.00323255855543533657)29:0.02237505187863457132,(Q1RGK5_:0.00000122123005994819,A8GYD7_:0.00000122123005994819)100:0.28299884298270094884)100:0.05776841634437222123,(Q9ZC84_:0.00000122123005994819,D5AYH5_:0.00000122123005994819)99:0.00951976341375833368,Q68VM9_:0.04408933524904214141);"
withBootstrap2 :: String
withBootstrap2 = "((((A8F330_:0.01131438136322714984,(G0GWK2_:0.00568050636963043226,(Q92FV4_:0.00284163304504484121,((B0BVQ5_:0.00319487112504297311,A8GU65_:0.00000122123005994819):0.00279881991324161267[74],(C3PM27_:0.00560787769333294297,C4K2Z0_:0.00559642713265556899):0.00000122123005994819[15]):0.00000122123005994819[4]):0.00276851661606284868[56]):0.00283144414216590342[60]):0.00886304965525876697[76],(A8GQC0_:0.05449879836105625541,(A8F0B2_:0.04736199885985507840,Q4UJN9_:0.02648399728559588939):0.00905997055810744446[64]):0.00323255855543533657[28]):0.02237505187863457132[29],(Q1RGK5_:0.00000122123005994819,A8GYD7_:0.00000122123005994819):0.28299884298270094884[100]):0.05776841634437222123[100],(Q9ZC84_:0.00000122123005994819,D5AYH5_:0.00000122123005994819):0.00951976341375833368[99],Q68VM9_:0.04408933524904214141);"
-}