{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DuplicateRecordFields #-}
module Collapse
( printToInfo
, gather
, gatherWiggle
, getLabelMap
, collapse
, addCloneIDs
) where
import Data.Function (on)
import Data.Int
import Data.List
import Data.Maybe
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.Map.Strict as Map
import qualified Data.Clustering.Hierarchical as Cluster
import Control.Lens
import qualified Text.Show.ByteString as B
import Types
printToInfo :: PrintITD -> ITDInfo
printToInfo x = ITDInfo
{ label = label (x :: PrintITD)
, fHeader = fHeader (x :: PrintITD)
, fSequence = fSequence (x :: PrintITD)
, dSubstring = dSubstring (x :: PrintITD)
, dLocations = dLocations (x :: PrintITD)
, dMutations = dMutations (x :: PrintITD)
, sSubstring = sSubstring (x :: PrintITD)
, sLocation = sLocation (x :: PrintITD)
, sOtherLocations = sOtherLocations (x :: PrintITD)
, classification = classification (x :: PrintITD)
, dLocationNum = getLastDLocation x
, sLocationNum = fmap fst . B.readInt $ sLocation (x :: PrintITD)
, dSubstringLen = fromIntegral . B.length $ dSubstring (x :: PrintITD)
, sSubstringLen = fromIntegral . B.length $ sSubstring (x :: PrintITD)
}
gather :: [ITDInfo] -> [[[ITDInfo]]]
gather = fmap allGather . subjectGather
where
allGather = groupBy ((==) `on` compareFactors)
. sortBy (compare `on` compareFactors)
subjectGather = groupBy ((==) `on` (head. B.split '_' . (label :: ITDInfo -> B.ByteString)))
. sortBy (compare `on` (head . B.split '_' . (label :: ITDInfo -> B.ByteString)))
gatherWiggle :: Method -> Wiggle -> [ITDInfo] -> [[[ITDInfo]]]
gatherWiggle method wiggle = fmap (allGather method) . subjectGather . allSort
where
allGather Hierarchical = wiggleCluster wiggle
allGather CompareAll = wiggleCompare wiggle
allSort = sortBy (compare `on` compareFactors)
subjectGather = groupBy ((==) `on` (head . B.split '_' . (label :: ITDInfo -> B.ByteString)))
getLabelMap :: [ITDInfo] -> Map.Map Label Int
getLabelMap xs =
Map.fromList
. zip (fmap (Label . (\x -> label (x :: ITDInfo)) . head) labelGathered)
. fmap length
$ labelGathered
where
labelGathered = groupBy ((==) `on` (label :: ITDInfo -> B.ByteString))
. sortBy (compare `on` (label :: ITDInfo -> B.ByteString))
$ xs
wiggleCompare :: Wiggle -> [ITDInfo] -> [[ITDInfo]]
wiggleCompare wiggle (x:xs) = go (getInitialMinMax x) [] [x] xs
where
getInitialMinMax !a = CurrentMinMax
{ minMaxDLocationNum = (dLocationNum a, dLocationNum a)
, minMaxSLocationNum = (sLocationNum a, sLocationNum a)
, minMaxDSubstringLen = (dSubstringLen a, dSubstringLen a)
, minMaxSSubstringLen = (sSubstringLen a, sSubstringLen a)
}
go :: CurrentMinMax -> [[ITDInfo]] -> [ITDInfo] -> [ITDInfo] -> [[ITDInfo]]
go !current !accGlobal !accLocal [] = accLocal:accGlobal
go !current !accGlobal !accLocal (b:bs) =
case wiggleTestMinMax wiggle current b of
(True, newCurrent) -> go newCurrent accGlobal (b:accLocal) bs
(False, _) ->
go (getInitialMinMax b) (accLocal:accGlobal) [b] bs
wiggleCluster :: Wiggle -> [ITDInfo] -> [[ITDInfo]]
wiggleCluster (Wiggle wiggle) =
fmap Cluster.elements
. flip Cluster.cutAt wiggle
. flip (Cluster.dendrogram Cluster.CLINK) getDistance
getDistance :: ITDInfo -> ITDInfo -> Double
getDistance x y =
maximum [ locDistance (dLocationNum x) (dLocationNum y)
, locDistance (sLocationNum x) (sLocationNum y)
, fromIntegral $ abs (dSubstringLen x - dSubstringLen y)
, fromIntegral $ abs (sSubstringLen x - sSubstringLen y)
]
where
locDistance :: Maybe Int -> Maybe Int -> Double
locDistance Nothing Nothing = 0
locDistance _ Nothing = (1 / 0)
locDistance Nothing _ = (1 / 0)
locDistance (Just a) (Just b) = fromIntegral $ abs (a - b)
wiggleTestMinMax :: Wiggle -> CurrentMinMax -> ITDInfo -> (Bool, CurrentMinMax)
wiggleTestMinMax (Wiggle wiggle) current x = (test, updateCurrentMinMax)
where
updateCurrentMinMax = CurrentMinMax
{ minMaxDLocationNum = updateMinMaxMaybe minMaxDLocationNum dLocationNum current x
, minMaxSLocationNum = updateMinMaxMaybe minMaxSLocationNum sLocationNum current x
, minMaxDSubstringLen = updateMinMax minMaxDSubstringLen dSubstringLen current x
, minMaxSSubstringLen = updateMinMax minMaxSSubstringLen sSubstringLen current x
}
test = (andOf both . over both ((<= wiggle) . fromMaybe (wiggle + 1)) $ diffDLoc)
&& (andOf both . over both ((<= wiggle) . fromMaybe (wiggle + 1)) $ diffSLoc)
&& (andOf both . over both (<= wiggle) $ diffDLen)
&& (andOf both . over both (<= wiggle) $ diffSLen)
diffDLoc = over _2 (locTest (dLocationNum x))
. over _1 (\a -> locTest a (dLocationNum x))
. minMaxDLocationNum
$ current
diffSLoc = over _2 (locTest (sLocationNum x))
. over _1 (\a -> locTest a (sLocationNum x))
. minMaxSLocationNum
$ current
diffDLen = over _2 (fromIntegral . (\a -> dSubstringLen x - a))
. over _1 (fromIntegral . (\a -> a - dSubstringLen x))
. minMaxDSubstringLen
$ current
diffSLen = over _2 (fromIntegral . (\a -> sSubstringLen x - a))
. over _1 (fromIntegral . (\a -> a - sSubstringLen x))
. minMaxSSubstringLen
$ current
updateMinMax
:: (CurrentMinMax -> (Int, Int))
-> (ITDInfo -> Int)
-> CurrentMinMax
-> ITDInfo
-> (Int, Int)
updateMinMax currentF infoF current info =
case (over both (signum . (\x -> x - infoF info)) . currentF $ current) of
(1, _) -> (infoF info, snd . currentF $ current)
(_, -1) -> (fst . currentF $ current, infoF info)
(_, _) -> currentF current
updateMinMaxMaybe
:: (CurrentMinMax -> (Maybe Int, Maybe Int))
-> (ITDInfo -> Maybe Int)
-> CurrentMinMax
-> ITDInfo
-> (Maybe Int, Maybe Int)
updateMinMaxMaybe currentF infoF current info =
case ( over both (\x -> fmap signum $ (-) <$> x <*> infoF info)
. currentF
$ current
) of
(Just 1, _) -> (infoF info, snd . currentF $ current)
(_, Just (-1)) -> (fst . currentF $ current, infoF info)
(_, _) -> currentF current
locTest :: Maybe Int -> Maybe Int -> Maybe Double
locTest Nothing Nothing = Just 0
locTest _ Nothing = Nothing
locTest Nothing _ = Nothing
locTest (Just a) (Just b) = Just . fromIntegral $ (a - b)
wiggleTest :: Wiggle -> ITDInfo -> ITDInfo -> Bool
wiggleTest (Wiggle wiggle) x y =
locTest (dLocationNum x) (dLocationNum y)
&& locTest (sLocationNum x) (sLocationNum y)
&& (fromIntegral $ abs (dSubstringLen x - dSubstringLen y)) <= wiggle
&& (fromIntegral $ abs (sSubstringLen x - sSubstringLen y)) <= wiggle
where
locTest :: Maybe Int -> Maybe Int -> Bool
locTest Nothing Nothing = True
locTest _ Nothing = False
locTest Nothing _ = False
locTest (Just a) (Just b) = (fromIntegral $ abs (a - b)) <= wiggle
compareFactors :: ITDInfo -> (B.ByteString, Maybe Int, Maybe Int, Int, Int)
compareFactors x =
( head . B.split '_' . (label :: ITDInfo -> B.ByteString) $ x
, dLocationNum x
, sLocationNum x
, dSubstringLen x
, sSubstringLen x
)
getLastDLocation :: PrintITD -> Maybe Int
getLastDLocation x =
if B.elem '/' $ dLocations (x :: PrintITD)
then
Just
. fst
. fromMaybe (error "Cannot read dLocation")
. B.readInt
. last
. B.split '/'
$ dLocations (x :: PrintITD)
else
Nothing
getFrequent :: (Eq a, Ord a, Show a) => [a] -> a
getFrequent = fst
. maximumBy (compare `on` snd)
. Map.toAscList
. Map.fromListWith (+)
. flip zip [1,1..]
collapse :: ID -> Int -> [ITDInfo] -> PrintWithCloneID
collapse (ID cloneID) labelLen xs = result . getFrequent . fmap unit $ xs
where
result (!dSub, !dLoc, !dMut, !sSub, !sLoc, !sOther, !cla) = PrintWithCloneID
{ label = (\x -> label (x :: ITDInfo)) . head $ xs
, fHeader = (\x -> fHeader (x :: ITDInfo)) . head $ xs
, fSequence = (\x -> fSequence (x :: ITDInfo)) . head $ xs
, dSubstring = getFrequent . fmap (\x -> dSubstring (x :: ITDInfo)) $ xs
, dLocations = getFrequent . fmap (\x -> dLocations (x :: ITDInfo)) $ xs
, dMutations = getFrequent . fmap (\x -> dMutations (x :: ITDInfo)) $ xs
, sSubstring = getFrequent . fmap (\x -> sSubstring (x :: ITDInfo)) $ xs
, sLocation = getFrequent . fmap (\x -> sLocation (x :: ITDInfo)) $ xs
, sOtherLocations = getFrequent . fmap (\x -> sOtherLocations (x :: ITDInfo)) $ xs
, classification = getFrequent . fmap (\x -> classification (x :: ITDInfo)) $ xs
, frequency = genericLength xs / fromIntegral labelLen
, cloneID = B.show cloneID
}
unit x = ( dSubstring (x :: ITDInfo)
, dLocations (x :: ITDInfo)
, dMutations (x :: ITDInfo)
, sSubstring (x :: ITDInfo)
, sLocation (x :: ITDInfo)
, sOtherLocations (x :: ITDInfo)
, classification (x :: ITDInfo)
)
addCloneIDs :: Entity -> ID -> Int -> [ITDInfo] -> [PrintWithCloneID]
addCloneIDs entity cloneID labelLen xs = go entity xs
where
go Clone = (:[]) . collapse cloneID labelLen
go Read = addF
addF = fmap (addCloneID cloneID cloneSize labelLen)
cloneSize = genericLength xs
addCloneID :: ID -> Int -> Int -> ITDInfo -> PrintWithCloneID
addCloneID (ID cloneID) cloneSize labelLen rep = PrintWithCloneID
{ label = label (rep :: ITDInfo)
, fHeader = fHeader (rep :: ITDInfo)
, fSequence = fSequence (rep :: ITDInfo)
, dSubstring = dSubstring (rep :: ITDInfo)
, dLocations = dLocations (rep :: ITDInfo)
, dMutations = dMutations (rep :: ITDInfo)
, sSubstring = sSubstring (rep :: ITDInfo)
, sLocation = sLocation (rep :: ITDInfo)
, sOtherLocations = sOtherLocations (rep :: ITDInfo)
, classification = classification (rep :: ITDInfo)
, frequency = fromIntegral cloneSize
/ fromIntegral labelLen
, cloneID = B.show cloneID
}