module Bio.Phylogeny.PhyBin.Parser
(newick_parser, parseNewick, parseNewicks, parseNewickFiles, unitTests)
where
import Control.Exception (evaluate, handle, SomeException)
import qualified Data.ByteString.Char8 as B
import Data.Char (isSpace)
import Data.Map as M
import Data.List as L
import Text.Parsec
import Text.Parsec.ByteString
import Test.HUnit ((~:),(~=?),Test,test,assertFailure)
import System.FilePath (takeBaseName)
import Bio.Phylogeny.PhyBin.CoreTypes
import Prelude as P
type NameHack = (String->String)
parseNewick :: LabelTable -> NameHack -> String -> B.ByteString -> (LabelTable, [NewickTree DefDecor])
parseNewick tbl0 name_hack file input = (lbls,trs)
where
(lbls,trs) =
extractLabelTable tbl0 $
runB file (many1$ newick_parser name_hack) $
B.filter (not . isSpace) input
parseNewickFiles :: NameHack -> [String] -> IO (LabelTable, [FullTree DefDecor])
parseNewickFiles fn nms = do
bss <- mapM B.readFile nms
return $ parseNewicks fn (zip nms bss)
parseNewicks :: NameHack -> [(String,B.ByteString)] -> (LabelTable, [FullTree DefDecor])
parseNewicks name_hack pairs = (labtbl, fullTrs)
where
fullTrs = [ FullTree (tweakName file ind) labtbl tr
| (file,tr) <- trs
| ind <- [(0::Int)..] ]
tweakName file ind =
if addSuffix
then (takeBaseName file)++"_"++show ind
else (takeBaseName file)
addSuffix = case trs of
[] -> False
[_] -> False
_ -> True
(labtbl, trs) = P.foldr fn (M.empty,[]) pairs
fn (file,bstr) (!acc,!ls) =
let (acc',trs) = parseNewick acc name_hack file bstr
names = if length trs > 1
then zipWith (\x y -> x++"_"++show y) (repeat file) [0..]
else [file]
in (acc', (zip names trs) ++ ls)
runB :: Show a => String -> Parser a -> B.ByteString -> a
runB file p input = case (parse p "" input) of
Left err -> error ("parse error in file "++ show file ++" at "++ show err)
Right x -> x
extractLabelTable :: LabelTable -> [TempTree] -> (LabelTable, [NewickTree DefDecor])
extractLabelTable tbl0 trs = (finMap,finTree)
where
flipped = M.fromList $ L.map (\(x,y)->(y,x)) $ M.toList tbl0
(_,finMap,finTree) = loop1 flipped tbl0 trs
loop1 seen acc [] = (seen, acc, [])
loop1 seen acc (hd:tl) =
let (seen2,acc2,hd') = loop2 seen acc hd
(seen3,acc3,tl') = loop1 seen2 acc2 tl in
(seen3,acc3, hd':tl')
loop2 seen acc (NTLeaf (d,Just nm) _)
| M.member nm seen = (seen, acc, NTLeaf d (seen M.! nm))
| otherwise = let nxt = M.size acc in
(M.insert nm nxt seen,
M.insert nxt nm acc, NTLeaf d nxt)
loop2 seen1 acc1 (NTInterior (d,Nothing) chlds) =
let (seen',acc',ls') =
P.foldr (\ x (seen2,acc2,ls) ->
let (seen3,acc3,x') = loop2 seen2 acc2 x in
(seen3, acc3, x':ls))
(seen1,acc1,[])
chlds
in (seen',acc', NTInterior d ls')
type TempTree = NewickTree (DefDecor,Maybe String)
tag :: DefDecor -> TempTree -> TempTree
tag l s =
case s of
NTLeaf (_,m) n -> NTLeaf (l,m) n
NTInterior (_,Nothing) ls -> NTInterior (l,Nothing) ls
newick_parser :: NameHack -> Parser TempTree
newick_parser name_hack =
do x <- subtree name_hack
l <- branchMetadat name_hack
_ <- char ';'
return$ tag l x
subtree :: NameHack -> Parser TempTree
subtree name_hack = internal name_hack <|> leaf name_hack
defaultMeta :: (Maybe Int, Double)
defaultMeta = (Nothing,0.0)
leaf :: NameHack -> Parser TempTree
leaf name_hack =
do nm <- name
let nm' = name_hack nm
return$ NTLeaf (defaultMeta,Just nm') 0
internal :: NameHack -> Parser TempTree
internal name_hack =
do _ <- char '('
bs <- branchset name_hack
_ <- char ')'
_nm <- name
return$ NTInterior (defaultMeta,Nothing) bs
branchset :: NameHack -> Parser [TempTree]
branchset name_hack =
do b <- branch name_hack <?> "at least one branch"
rest <- option [] $ try$ do char ','; branchset name_hack
return (b:rest)
branch :: NameHack -> Parser TempTree
branch name_hack =
do s <- subtree name_hack
l <- branchMetadat name_hack
return$ tag l s
branchMetadat :: NameHack -> Parser DefDecor
branchMetadat name_hack = option defaultMeta $ do
char ':'
n <- (try sciNotation <|> number)
bootstrap <- option Nothing $ do
char '['
s <- many1 digit
char ']'
return (Just (read s))
return (bootstrap,n)
number :: Parser Double
number =
do sign <- option "" $ string "-"
first <- many1 digit
second <- option "0" $ try$ do char '.'; many1 digit
return (read (sign ++ first++"."++second) :: Double)
sciNotation :: Parser Double
sciNotation =
do coeff <- do first <- many1 digit
second <- option "0" $ try$ do char '.'; many1 digit
return $ first++"."++second
char 'e'
sign <- option "" $ string "-"
expon <- many1 digit
return (read (coeff++"e"++sign++expon))
name :: Parser String
name = option "" $ many1 (noneOf "()[]:;, \t\n\r\f\v")
tre1 :: NewickTree DefDecor
tre1 = head $ snd $ parseNewick M.empty id "" $ B.pack "(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);"
unitTests :: Test
unitTests = test
[ "test name" ~: "foo" ~=? run name "foo"
, "test number" ~: 3.3 ~=? run number "3.3"
, "test number" ~: 3.0 ~=? run number "3"
, "test number" ~: 3.0 ~=? run number "-3"
, "leaf" ~: ntl "A" ~=? run (leaf id) "A"
, "subtree" ~: ntl "A" ~=? run (subtree id) "A"
, "null branchset" ~: errortest$ run (branchset id) ""
, "internal" ~: NTInterior nada [ntl "A"] ~=? run (internal id) "(A);"
, "example: no nodes are named" ~: NTInterior nada
[ntl "", ntl "", NTInterior nada [ntl "", ntl ""]]
~=? run (newick_parser id) "(,,(,));"
, "example: leaf nodes are named" ~: 6 ~=? treeSize (run (newick_parser id) "(A,B,(C,D));")
, "example: all nodes are named" ~: 6 ~=? treeSize (run (newick_parser id) "(A,B,(C,D)E)F;")
, "example: all but root node have a distance to parent" ~: 6 ~=? treeSize (run (newick_parser id) "(:0.1,:0.2,(:0.3,:0.4):0.5);")
, "example: all have a distance to parent" ~: 6 ~=? treeSize (run (newick_parser id) "(:0.1,:0.2,(:0.3,:0.4):0.5):0.6;")
, "example: distances and leaf names (popular)" ~: 6 ~=? treeSize tre1
, "example: distances and all names" ~: 6 ~=? treeSize (run (newick_parser id) "(A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F;")
, "example: a tree rooted on a leaf node (rare)" ~: 6 ~=? treeSize (run (newick_parser id) "((B:0.2,(C:0.3,D:0.4)E:0.5)F:0.1)A;")
]
where
ntl s = NTLeaf ((Nothing,0.0),Just s) 0
nada = ((Nothing,0),Nothing)
run :: Show a => Parser a -> String -> a
run p input = runB "<unknown>" p (B.pack input)
errortest :: t -> IO ()
errortest x =
handle (\ (e::SomeException) -> return ()) $
do evaluate x
assertFailure "test was expected to throw an error"