-- |Data types for working with locations in a sequence. Zero-based -- 'Int64' indices are used throughout, to facilitate direct use of -- indexing functions on 'SeqData'. module Bio.Location.ContigLocation ( ContigLoc(..), fromStartEnd, fromPosLen , bounds, startPos, endPos , slide, extend, posInto, posOutof , seqData, seqDataPadded, isWithin, overlaps , display ) where import Prelude hiding (length) import Control.Monad.Error import qualified Data.ByteString.Lazy.Char8 as LBS import Bio.Sequence.SeqData import qualified Bio.Location.Position as Pos import Bio.Location.Strand -- | Contiguous set of positions in a sequence data ContigLoc = ContigLoc { offset5 :: !Offset -- ^ 5' end of region on target sequence, 0-based index , length :: !Offset -- ^ length of region on target sequence , strand :: !Strand -- ^ strand of region } deriving (Eq, Ord, Show) instance Stranded ContigLoc where revCompl (ContigLoc seq5 len str) = ContigLoc seq5 len $ revCompl str -- | Create a 'ContigLoc' from 0-based starting and ending positions. -- When 'start' is less than 'end' the position will be on the 'Fwd' -- 'Strand', otherwise it will be on the 'RevCompl' strand. fromStartEnd :: Offset -> Offset -> ContigLoc fromStartEnd start end | start < end = ContigLoc start (1 + end - start) Fwd | otherwise = ContigLoc end (1 + start - end) RevCompl -- | Create a 'ContigLoc' from a Pos.Pos defining the start -- ('ContigLoc' 5 prime end) position on the sequence and the length. fromPosLen :: Pos.Pos -> Offset -> ContigLoc fromPosLen (Pos.Pos off5 Fwd) len = ContigLoc off5 len Fwd fromPosLen (Pos.Pos off3 RevCompl) len = ContigLoc (off3 - (len - 1)) len RevCompl -- | The bounds of a 'ContigLoc', a pair of the lowest and highest -- sequence indices covered by the region, which ignores the strand -- of the 'ContigLoc'. The first element of the pair will always be -- lower than the second. bounds :: ContigLoc -> (Offset, Offset) bounds (ContigLoc seq5 len _) = (seq5, seq5 + len - 1) -- | 0-based starting (5' in the region orientation) position startPos :: ContigLoc -> Pos.Pos startPos (ContigLoc seq5 len str) = case str of Fwd -> Pos.Pos seq5 str RevCompl -> Pos.Pos (seq5 + len - 1) str -- | 0-based ending (3' in the region orientation) position endPos :: ContigLoc -> Pos.Pos endPos (ContigLoc seq5 len str) = case str of Fwd -> Pos.Pos (seq5 + len - 1) str RevCompl -> Pos.Pos seq5 str -- | Move a 'ContigLoc' region by a specified offset slide :: Offset -> ContigLoc -> ContigLoc slide dpos (ContigLoc seq5 len str) = ContigLoc (seq5 + dpos) len str -- | Subsequence 'SeqData' for a 'ContigLoc', provided that the region -- is entirely within the sequence. seqData :: (Error e, MonadError e m) => SeqData -> ContigLoc -> m SeqData seqData sequ (ContigLoc seq5 len str) | seq5 < 0 = outOfBounds | otherwise = case LBS.take len $ LBS.drop seq5 sequ of fwdseq | LBS.length fwdseq == len -> return $ stranded str fwdseq | otherwise -> outOfBounds where outOfBounds = throwError $ strMsg $ "contig seq loc " ++ show (seq5, seq5 + len - 1) ++ " out of SeqData bounds" -- | Subsequence 'SeqData' for a 'ContigLoc', padded as needed with Ns seqDataPadded :: SeqData -> ContigLoc -> SeqData seqDataPadded sequ (ContigLoc seq5 len str) = stranded str fwdseq where fwdseq | seq5 + len <= 0 = LBS.replicate len 'N' | seq5 >= LBS.length sequ = LBS.replicate len 'N' | seq5 < 0 = LBS.replicate (negate seq5) 'N' `LBS.append` takePadded (len + seq5) sequ | otherwise = takePadded len $ LBS.drop seq5 sequ takePadded sublen subsequ | sublen <= LBS.length subsequ = LBS.take sublen subsequ | otherwise = subsequ `LBS.append` LBS.replicate (sublen - LBS.length subsequ) 'N' -- | For a 'Pos' and a 'ContigLoc' on the same sequence, find the -- corresponding 'Pos' relative to the 'ContigLoc', provided it is -- within the 'ContigLoc'. posInto :: Pos.Pos -> ContigLoc -> Maybe Pos.Pos posInto (Pos.Pos pos pStrand) (ContigLoc seq5 len cStrand) | pos < seq5 || pos >= seq5 + len = Nothing | otherwise = Just $ case cStrand of Fwd -> Pos.Pos (pos - seq5) pStrand RevCompl -> Pos.Pos (seq5 + len - (pos + 1)) (revCompl pStrand) -- | For a 'Pos' specified relative to a 'ContigLoc', find the -- corresponding 'Pos' relative to the outer sequence, provided that -- the 'Pos' is within the bounds of the 'ContigLoc'. posOutof :: Pos.Pos -> ContigLoc -> Maybe Pos.Pos posOutof (Pos.Pos pos pStrand) (ContigLoc seq5 len cStrand) | pos < 0 || pos >= len = Nothing | otherwise = Just $ case cStrand of Fwd -> Pos.Pos (pos + seq5) pStrand RevCompl -> Pos.Pos (seq5 + len - (pos + 1)) (revCompl pStrand) -- | 'ContigLoc' extended on the 5' and 3' ends. extend :: (Offset, Offset) -> ContigLoc -> ContigLoc extend (ext5, ext3) (ContigLoc seq5 len str) = case str of Fwd -> ContigLoc (seq5 - ext5) (len + ext5 + ext3) str RevCompl -> ContigLoc (seq5 - ext3) (len + ext5 + ext3) str -- | For a 'Pos' and a 'ContigLoc' on the same sequence, is the 'Pos' -- within the 'ContigLoc'. isWithin :: Pos.Pos -> ContigLoc -> Bool isWithin (Pos.Pos pos pStrand) (ContigLoc seq5 len cStrand) = (pos >= seq5) && (pos < seq5 + len) && (cStrand == pStrand) -- | For a pair of 'ContigLoc' regions on the same sequence, indicates -- if they overlap at all. overlaps :: ContigLoc -> ContigLoc -> Bool overlaps contig1 contig2 = case (bounds contig1, bounds contig2) of ((low1, high1),(low2, high2)) -> (strand contig1 == strand contig2) && (low1 <= high2) && (low2 <= high1) display :: ContigLoc -> String display cloc = show (Pos.offset $ startPos cloc) ++ "to" ++ show (Pos.offset $ endPos cloc)