module Bio.Phylogeny.PhyBin.RFDistance
(
DenseLabelSet, DistanceMatrix,
allBips, foldBips, dispBip,
consensusTree, bipsToTree, filterCompatible, compatibleWith,
mkSingleDense, mkEmptyDense, bipSize,
denseUnions, denseDiff, invertDense, markLabel,
naiveDistMatrix, hashRF,
printDistMat)
where
import Control.Monad
import Control.Monad.ST
import Data.Function (on)
import Data.Word
import qualified Data.Vector as V
import qualified Data.Vector.Mutable as MV
import qualified Data.Vector.Unboxed.Mutable as MU
import qualified Data.Vector.Unboxed as U
import Text.PrettyPrint.HughesPJClass hiding (char, Style)
import System.IO (hPutStrLn, hPutStr, Handle)
import System.IO.Unsafe
import Bio.Phylogeny.PhyBin.CoreTypes
import Bio.Phylogeny.PhyBin.PreProcessor (pruneTreeLeaves)
import qualified Data.Set as S
import qualified Data.List as L
import qualified Data.IntSet as SI
import qualified Data.Map.Strict as M
import qualified Data.Foldable as F
import qualified Data.Traversable as T
import Data.Monoid
import Prelude as P
import Debug.Trace
#ifdef BITVEC_BIPS
import qualified Data.Vector.Unboxed.Bit as UB
import qualified Data.Bit as B
#endif
#define NORMALIZATION
#ifdef BITVEC_BIPS
# if 1
type DenseLabelSet = UB.Vector B.Bit
markLabel lab = UB.modify (\vec -> MU.write vec lab (B.fromBool True))
mkEmptyDense size = UB.replicate size (B.fromBool False)
mkSingleDense size ind = markLabel ind (mkEmptyDense size)
denseUnions = UB.unions
bipSize = UB.countBits
denseDiff = UB.difference
invertDense size bip = UB.invert bip
dispBip labs bip = show$ map (\(ix,_) -> (labs M.! ix)) $
filter (\(_,bit) -> B.toBool bit) $
zip [0..] (UB.toList bip)
denseIsSubset a b = UB.or (UB.difference b a)
traverseDense_ fn bip =
U.ifoldr' (\ ix bit acc ->
(if B.toBool bit
then fn ix
else return ()) >> acc)
(return ()) bip
# else
data DenseLabelSet = DLS !Int (UB.Vector B.Bit)
markLabel lab (DLS _ vec)= DLS (UB.modify (\vec -> return (MU.write vec lab (B.fromBool True))) ) vec
# endif
#else
type DenseLabelSet = SI.IntSet
markLabel lab set = SI.insert lab set
mkEmptyDense _size = SI.empty
mkSingleDense _size = SI.singleton
denseUnions _size = SI.unions
bipSize = SI.size
denseDiff = SI.difference
denseIsSubset = SI.isSubsetOf
dispBip labs bip = "[" ++ unwords strs ++ "]"
where strs = map (labs M.!) $ SI.toList bip
invertDense size bip = loop SI.empty (size1)
where
loop !acc ix | ix < 0 = acc
| SI.member ix bip = loop acc (ix1)
| otherwise = loop (SI.insert ix acc) (ix1)
traverseDense_ fn bip =
SI.foldr' (\ix acc -> fn ix >> acc) (return ()) bip
#endif
markLabel :: Label -> DenseLabelSet -> DenseLabelSet
mkEmptyDense :: Int -> DenseLabelSet
mkSingleDense :: Int -> Label -> DenseLabelSet
denseUnions :: Int -> [DenseLabelSet] -> DenseLabelSet
bipSize :: DenseLabelSet -> Int
dispBip :: LabelTable -> DenseLabelSet -> String
invertDense :: Int -> DenseLabelSet -> DenseLabelSet
traverseDense_ :: Monad m => (Int -> m ()) -> DenseLabelSet -> m ()
type DistanceMatrix = V.Vector (U.Vector Int)
naiveDistMatrix :: [NewickTree DefDecor] -> (DistanceMatrix, V.Vector (S.Set DenseLabelSet))
naiveDistMatrix lst =
let sz = P.length lst
treeVect = V.fromList lst
labelSets = V.map treeLabels treeVect
eachbips = V.map allBips treeVect
mat = V.generate sz $ \ i ->
U.generate i $ \ j ->
let
inI = (labelSets V.! i)
inJ = (labelSets V.! j)
inBoth = S.intersection inI inJ
Just prI = pruneTreeLeaves inBoth (treeVect V.! i)
Just prJ = pruneTreeLeaves inBoth (treeVect V.! j)
bipsI = if S.size inBoth == S.size inI
then (eachbips V.! i)
else allBips prI
bipsJ = if S.size inBoth == S.size inJ
then (eachbips V.! j)
else allBips prJ
diff1 = S.size (S.difference bipsI bipsJ)
diff2 = S.size (S.difference bipsJ bipsI)
in if S.size inBoth == 0
then 0
else diff1 + diff2
in (mat, eachbips)
where
treeLabels :: NewickTree a -> S.Set Label
treeLabels (NTLeaf _ lab) = S.singleton lab
treeLabels (NTInterior _ ls) = S.unions (map treeLabels ls)
labelBips :: NewickTree a -> NewickTree (a, [DenseLabelSet])
labelBips tr =
#ifdef NORMALIZATION
fmap (\(a,ls) -> (a,map (normBip size) ls)) $
#endif
loop tr
where
size = numLeaves tr
zero = mkEmptyDense size
loop (NTLeaf dec lab) = NTLeaf (dec, [markLabel lab zero]) lab
loop (NTInterior dec chlds) =
let chlds' = map loop chlds
sets = map (denseUnions size . snd . get_dec) chlds' in
NTInterior (dec, sets) chlds'
allLeaves = leafSet tr
leafSet (NTLeaf _ lab) = mkSingleDense size lab
leafSet (NTInterior _ ls) = denseUnions size $ map leafSet ls
normBip :: Int -> DenseLabelSet -> DenseLabelSet
normBip totsize bip =
let
halfSize = totsize `quot` 2
flipped = invertDense totsize bip
in
case compare (bipSize bip) halfSize of
LT -> bip
GT -> flipped
EQ -> min bip flipped
foldBips :: Monoid m => (DenseLabelSet -> m) -> NewickTree a -> m
foldBips f tr = F.foldMap f' (labelBips tr)
where f' (_,bips) = F.foldMap f bips
allBips :: NewickTree a -> S.Set DenseLabelSet
allBips tr = S.filter ((> 1) . bipSize) $ foldBips S.insert tr S.empty
#if 0
type BipTable s = IMap DenseLabelSet s (SparseTreeSet s)
type SparseTreeSet s = IS.ISet s TreeID
type TreeID = AnnotatedTree
#endif
hashRF :: Int -> [NewickTree a] -> DistanceMatrix
hashRF num_taxa trees = build M.empty (zip [0..] trees)
where
num_trees = length trees
build acc [] = ingest acc
build acc ((ix,hd):tl) =
let bips = allBips hd
acc' = S.foldl' fn acc bips
fn acc bip = M.alter fn2 bip acc
fn2 (Just membs) = Just (markLabel ix membs)
fn2 Nothing = Just (mkSingleDense num_taxa ix)
in
build acc' tl
ingest :: M.Map DenseLabelSet DenseLabelSet -> DistanceMatrix
ingest bipTable = runST theST
where
theST :: forall s0 . ST s0 DistanceMatrix
theST = do
matr <- MV.new num_trees
for_ (0,num_trees) $ \ ix -> do
row <- MU.replicate ix (0::Int)
MV.write matr ix row
return ()
unsafeIOToST$ putStrLn$" Built matrix for dim "++show num_trees
let bumpMatr i j | j < i = incr i j
| otherwise = incr j i
incr :: Int -> Int -> ST s0 ()
incr i j = do
row <- MV.read matr i
elm <- MU.read row j
MU.write row j (elm+1)
return ()
fn bipMembs =
let haveIt = bipMembs
dontHave = invertDense num_trees bipMembs
fn1 trId = traverseDense_ (fn2 trId) dontHave
fn2 trId1 trId2 = bumpMatr trId1 trId2
in
traverseDense_ fn1 haveIt
F.traverse_ fn bipTable
v1 <- V.unsafeFreeze matr
T.traverse (U.unsafeFreeze) v1
instance Pretty a => Pretty (S.Set a) where
pPrint s = pPrint (S.toList s)
printDistMat :: Handle -> V.Vector (U.Vector Int) -> IO ()
printDistMat h mat = do
hPutStrLn h "Robinson-Foulds distance (matrix format):"
hPutStrLn h "-----------------------------------------"
V.forM_ mat $ \row -> do
U.forM_ row $ \elem -> do
hPutStr h (show elem)
hPutStr h " "
hPutStr h "0\n"
hPutStrLn h "-----------------------------------------"
for_ :: Monad m => (Int, Int) -> (Int -> m ()) -> m ()
for_ (start, end) _fn | start > end = error "for_: start is greater than end"
for_ (start, end) fn = loop start
where
loop !i | i == end = return ()
| otherwise = do fn i; loop (i+1)
filterCompatible :: NewickTree a -> [NewickTree b] -> [NewickTree b]
filterCompatible consensus trees =
let cbips = allBips consensus in
[ tr | tr <- trees
, cbips `S.isSubsetOf` allBips tr ]
compatibleWith :: NewickTree a -> NewickTree b -> Bool
compatibleWith consensus =
let consBips = allBips consensus in
\ newTr -> S.isSubsetOf consBips (allBips newTr)
consensusTreeFull (FullTree n1 l1 t1) (FullTree n2 l2 t2) =
error "FINISHME - consensusTreeFull"
consensusTree :: Int -> [NewickTree a] -> NewickTree ()
consensusTree _ [] = error "Cannot take the consensusTree of the empty list"
consensusTree num_taxa (hd:tl) = bipsToTree num_taxa intersection
where
intersection = L.foldl' S.intersection (allBips hd) (map allBips tl)
bipsToTree :: Int -> S.Set DenseLabelSet -> NewickTree ()
bipsToTree num_taxa origbip =
loop lvl0 sorted
where
sorted = L.sortBy (compare `on` bipSize) (S.toList origbip)
lvl0 = [ (mkSingleDense num_taxa ix, NTLeaf () ix)
| ix <- [0..num_taxa1] ]
isMatch bip x = denseIsSubset x bip
loop !subtrees [] =
case subtrees of
[] -> error "bipsToTree: internal error"
[(_,one)] -> one
lst -> NTInterior () (map snd lst)
loop !subtrees (bip:tl) =
let (in_,out) = L.partition (isMatch bip. fst) subtrees in
case in_ of
[] -> error $"bipsToTree: Internal error! No match for bip: "++show bip
++" out is\n "++show out++"\n and remaining bips "++show (length tl)
++"\n when processing orig bip set:\n "++show origbip
_ ->
loop ((denseUnions num_taxa (map fst in_),
NTInterior () (map snd in_)) : out) tl