{-# LANGUAGE GeneralizedNewtypeDeriving, TypeSynonymInstances, FlexibleInstances #-} {-| Efficient lookup based on potential location overlap Collection of objects with locations, potentially on named sequences, that can be queried to recover all objects whose location could potentially overlap a query location. |-} module Bio.SeqLoc.LocMap ( -- * Mapping objects to locations LocMap , emptyLM, insertLoc , queryLoc -- * Mapping objects to locations on named sequneces , SeqLocMap , emptySLM, insertSeqLoc , querySeqLoc -- * Mapping transcripts to their locations , transcriptSeqLocMap -- * Generalization of objects with locations , Locatable(..) , WithLocation(..) , locatableSeqLocMap , queryLocatable, queryLocCompatible ) where import qualified Data.HashMap.Strict as HM import Data.List -- import Data.Maybe import qualified Bio.SeqLoc.Location as Loc import Bio.SeqLoc.OnSeq import qualified Bio.SeqLoc.Position as Pos import qualified Bio.SeqLoc.SpliceLocation as SpLoc import Bio.SeqLoc.Strand import Bio.SeqLoc.Transcript import qualified Bio.SeqLoc.ShiftedVector as ShV -- | Mapping objects to sequence locations in the sense of `Loc.Location`. data LocMap a = LocMap { binSize :: !Pos.Offset, bins :: !(ShV.ShiftedVector [a]) } deriving (Show) -- | Mapping objects to locations on named sequences data SeqLocMap a = SeqLocMap { slmBinSize :: !Pos.Offset, locmaps :: !(HM.HashMap SeqLabel (LocMap a)) } deriving (Show) -- | Object with a genomic location expressed by `SpliceSeqLoc`. class Locatable o where locate :: o -> SpliceSeqLoc instance Locatable ContigSeqLoc where locate (OnSeq ref loc) = OnSeq ref (SpLoc.singleton loc) instance Locatable SpliceSeqLoc where locate = id instance Locatable Transcript where locate = location -- | Simple representation of a `Locatable` object as an arbitrary -- object adjoined with a location. data WithLocation a = WithLocation { withoutLocation :: !a, withLocate :: !SpliceSeqLoc } deriving (Eq) instance Locatable (WithLocation a) where locate = withLocate -- | Create an empty object / location map. -- -- Specify a characteristic size for efficient queries, which depends -- on the underlying genome. Smaller sizes yield fewer false -- candidates but greater memory usage and potentially slower queries. emptyLM :: Pos.Offset -> LocMap a emptyLM bsz = LocMap { binSize = bsz, bins = ShV.empty } -- | Insert a new object / location pair and reutrn the updated map insertLoc :: (Loc.Location l) => l -> a -> LocMap a -> LocMap a insertLoc l x lm0 = lm0 { bins = ShV.modifySome (bins lm0) (locBins l lm0) (x :) } -- | Retrieve a list of objects that could potentially overlap the -- query location. -- -- Some objects may not actually overlap the query location. Some -- objects may appear more than once in the list. However, no object -- whose location truly overlaps the query will be missing from the -- list. -- -- Overlap is defined by one or more nucleotides in common between the -- bounds of the two locations, regardless of the relative strands of -- the query and the object location. queryLoc :: (Loc.Location l) => l -> LocMap a -> [a] queryLoc l lm = concat [ (bins lm) ShV.!? b | b <- locBins l lm ] locBins :: (Loc.Location l) => l -> LocMap a -> [Int] locBins l lm = let (start, end) = Loc.bounds l binlow = fromIntegral $ start `div` binSize lm binhigh = fromIntegral $ end `div` binSize lm in [binlow..binhigh] -- | Create an empty object / location map -- -- Specify a characteristic size as per `emptyLM`. emptySLM :: Pos.Offset -> SeqLocMap a emptySLM bsz = SeqLocMap { slmBinSize = bsz, locmaps = HM.empty } -- | Insert a new object / location pair and reutrn the updated map insertSeqLoc :: (Loc.Location l) => OnSeq l -> a -> SeqLocMap a -> SeqLocMap a insertSeqLoc sl x slm0 = let lm0 = HM.lookupDefault (emptyLM $ slmBinSize slm0) (onSeqLabel sl) (locmaps slm0) lm' = insertLoc (unOnSeq sl) x lm0 in slm0 { locmaps = HM.insert (onSeqLabel sl) lm' (locmaps slm0) } -- | Retrieve a list of objects that could potentially overlap the -- query location. -- -- Some objects may not actually overlap the query location. Some -- objects may appear more than once in the list. However, no object -- whose location truly overlaps the query will be missing from the -- list. -- -- Overlap is defined by one or more nucleotides in common between the -- bounds of the two locations, regardless of the relative strands of -- the query and the object location, as well as the same name for the -- underlying reference sequence. -- -- To retrieve objects and test for actual overlap see the `Locatable` -- interface and `queryLocatable` or `queryLocCompatible`. querySeqLoc :: (Loc.Location l) => OnSeq l -> SeqLocMap a -> [a] querySeqLoc sl slm = maybe [] (queryLoc (unOnSeq sl)) $ HM.lookup (onSeqLabel sl) (locmaps slm) -- | Construct a mapping from transcripts to their genomic locations. transcriptSeqLocMap :: Pos.Offset -> [Transcript] -> SeqLocMap Transcript transcriptSeqLocMap bsz = foldl' insertTrx (emptySLM bsz) where insertTrx slm0 t = insertSeqLoc (location t) t slm0 -- | Construct a mapping from a general `Locatable` object to its -- genomic location locatableSeqLocMap :: (Locatable l) => Pos.Offset -> [l] -> SeqLocMap l locatableSeqLocMap bsz = foldl' insertLocable (emptySLM bsz) where insertLocable slm0 t = insertSeqLoc (locate t) t slm0 -- | Retrieve a list of `Locatable` objects whose overall, contiguous -- genomic coordinates intersect at any position the genomic interval -- spanned by the specified `Loc.Location`. This does not require that -- the spliced structure of the query is a subset of the spliced -- structure of the `Locatable` nor that the query location lie -- entirely within the hit location (contrast with -- `queryLocCompatible`). -- -- When a strand argument is given, restrict matches to those lying on -- the same strand as the query location, for `Just Plus`, or the -- opposite strand, for `Just Minus`. queryLocatable :: (Locatable o, Loc.Location l) => Maybe Strand -> OnSeq l -> SeqLocMap o -> [o] queryLocatable mstr qyloc = filter realHit . querySeqLoc qyloc where realHit sb = let sbloc = locate sb (qylb, qyub) = Loc.bounds . unOnSeq $ qyloc (sblb, sbub) = Loc.bounds . unOnSeq $ sbloc qystr = Loc.strand . unOnSeq $ qyloc sbstr = Loc.strand . unOnSeq $ sbloc in (qylb <= sbub) && (qyub >= sblb) && (qystr `strandCompat` sbstr) strandCompat = case mstr of Nothing -> \_ _ -> True Just Plus -> (==) Just Minus -> (/=) -- | Retrieve a list of `Locatable` objects whose spliced structure -- contains the query location specifically. queryLocCompatible :: (Locatable o) => Maybe Strand -> SpliceSeqLoc -> SeqLocMap o -> [o] queryLocCompatible mstr qyloc = filter compatHit . querySeqLoc qyloc where compatHit sb = maybe False strandCompat $ unOnSeq qyloc `SpLoc.locInto` (unOnSeq . locate $ sb) strandCompat = case mstr of Nothing -> const True Just s -> (== s) . Loc.strand