-- | A structure to allow fast lookup of objects whose sequence -- location lines up with a give position. module Bio.Location.LocMap (LocMap, mkLocMap, defaultZonesize, fromList , lookupWithin, lookupOverlaps , delete, deleteBy, insert , checkInvariants ) where import qualified Data.IntMap as IM import qualified Data.IntSet as IS import Data.List hiding (insert, delete, deleteBy) import Data.Maybe import Data.Monoid import qualified Bio.Location.Location as Loc import qualified Bio.Location.Position as Pos import Bio.Sequence.SeqData (Offset) defaultZonesize :: Offset defaultZonesize = 65536 -- | Collection mapping a collection of 'Loc' locations, possibly -- overlapping, binned for efficient lookup by position. data LocMap a = LocMap !Offset !(IM.IntMap (Loc.Loc, a)) !(IM.IntMap IS.IntSet) instance Monoid (LocMap a) where mempty = mkLocMap defaultZonesize mappend lm0 (LocMap _ keyToLoc1 _) = foldl' (\lm (l,x) -> insert l x lm) lm0 $ IM.elems keyToLoc1 -- | Create an empty 'LocMap' with a specified position bin size mkLocMap :: Offset -> LocMap a mkLocMap zonesize = LocMap zonesize IM.empty IM.empty -- | Create a 'LocMap' from an associated list. fromList :: Offset -> [(Loc.Loc, a)] -> LocMap a fromList zonesize = foldl' (\lm0 (l,x) -> insert l x lm0) (mkLocMap zonesize) -- | Add an object with an associated 'Loc' sequence region insert :: Loc.Loc -> a -> LocMap a -> LocMap a insert l x (LocMap zonesize keyToLoc zoneKeys) = let !key = case IM.maxViewWithKey keyToLoc of Just ((k, _), _) -> k + 1 Nothing -> 1 !newKeyToLoc = IM.insertWithKey duplicateError key (l, x) keyToLoc !newZoneKeys = foldl' (insertIntoZone key) zoneKeys $ locZones zonesize l in LocMap zonesize newKeyToLoc newZoneKeys where insertIntoZone key currZoneKeys z = case IM.lookup z currZoneKeys of Nothing -> IM.insert z (IS.singleton key) currZoneKeys Just currZoneKeySet -> IM.insert z (IS.insert key currZoneKeySet) currZoneKeys duplicateError k (l1, _) (l2, _) = error $ "LocMap: key " ++ show k ++ " duplicated: " ++ show (l1, l2) -- | Find the (possibly empty) list of sequence regions and associated -- objects that contain a 'Pos' position, in the sense of 'withinLoc' lookupWithin :: Pos.Pos -> LocMap a -> [(Loc.Loc, a)] lookupWithin pos (LocMap zonesize keyToLoc zoneKeys) = let !zoneKeySet = IM.findWithDefault IS.empty (posZone zonesize pos) zoneKeys in filter ((Loc.isWithin pos) . fst) $ keySetLocs keyToLoc zoneKeySet -- | Find the (possibly empty) list of sequence regions and associated -- objects that overlap a 'Loc' region, in the sense of 'overlapsLoc' lookupOverlaps :: Loc.Loc -> LocMap a -> [(Loc.Loc, a)] lookupOverlaps loc (LocMap zonesize keyToLoc zoneKeys) = let !zonesKeySet = IS.unions $ map (\z -> IM.findWithDefault IS.empty z zoneKeys) $ locZones zonesize loc in filter ((Loc.overlaps loc) . fst) $ keySetLocs keyToLoc zonesKeySet -- | Remove a region / object association from the map, if it is -- present. If it is present multiple times, only the first -- occurrence will be deleted. delete :: (Eq a) => (Loc.Loc, a) -> LocMap a -> LocMap a delete target = deleteBy (== target) -- | Remove the first region / object association satisfying a -- predicate function. deleteBy :: ((Loc.Loc, a) -> Bool) -> LocMap a -> LocMap a deleteBy isTarget lm0@(LocMap zonesize keyToLoc zoneKeys) = case filter (isTarget . snd) $ IM.toList keyToLoc of [] -> lm0 ((key,(l,_)):_) -> let !newKeyToLoc = IM.delete key keyToLoc !newZoneKeys = foldl' (deleteFromZone key) zoneKeys $ locZones zonesize l in LocMap zonesize newKeyToLoc newZoneKeys where deleteFromZone key currZoneKeys z = let !currZoneKeySet = IM.findWithDefault missingZone z currZoneKeys in IM.insert z (IS.delete key currZoneKeySet) currZoneKeys where missingZone = error $ "LocMap deleteBy: empty keyset for zone " ++ show z posZone :: Offset -> Pos.Pos -> Int posZone zonesize pos = fromIntegral $ (Pos.offset pos) `div` zonesize locZones :: Offset -> Loc.Loc -> [Int] locZones zonesize loc = let !(pmin, pmax) = Loc.bounds loc !zmin = fromIntegral $ pmin `div` zonesize !zmax = fromIntegral $ pmax `div` zonesize in [zmin..zmax] keySetLocs :: (IM.IntMap (Loc.Loc, a)) -> IS.IntSet -> [(Loc.Loc, a)] keySetLocs keyToLoc = map keyLoc . IS.toList where keyLoc key = IM.findWithDefault unknownKey key keyToLoc where unknownKey = error $ "LocMap: unknown key " ++ show key checkInvariants :: LocMap a -> [String] checkInvariants (LocMap zonesize keyToLoc zoneKeys) = concat $ checkAllKeys : map checkKeyLoc (IM.toList keyToLoc) where checkAllKeys = let !allZoneKeys = IS.unions $ IM.elems zoneKeys !allLocKeys = IM.keysSet keyToLoc !missingKeys = IS.toAscList $ allZoneKeys `IS.difference` allLocKeys !extraKeys = IS.toAscList $ allLocKeys `IS.difference` allZoneKeys in concat [ map (\key -> "Missing key " ++ show key) missingKeys , map (\key -> "Extra key " ++ show key) extraKeys] checkKeyLoc (key, (loc, _)) = let !keyZoneSet = IM.keysSet $ IM.filter (\keyset -> key `IS.member` keyset) zoneKeys !locZoneSet = IS.fromList $ locZones zonesize loc !missingZones = IS.toAscList $ locZoneSet `IS.difference` keyZoneSet !extraZones = IS.toAscList $ keyZoneSet `IS.difference` locZoneSet in concat [ map (\zone -> "Missing zone " ++ show zone ++ " for " ++ show (key, loc)) missingZones , map (\zone -> "Extra zone " ++ show zone ++ " for " ++ show (key, loc)) extraZones]