-- |Data types for working with locations in a sequence. Zero-based -- 'Int64' indices are used throughout, to facilitate direct use of -- indexing functions on 'SeqData'. module Bio.Location.Location ( Loc(..), bounds, length, startPos, endPos , extend, posInto, posOutof, isWithin, overlaps , seqData, seqDataPadded , display ) where import Prelude hiding (length) import Control.Arrow ((***)) import Control.Monad.Error import qualified Data.ByteString.Lazy.Char8 as LBS import Data.List (intercalate) --import Data.Maybe import qualified Bio.Location.ContigLocation as CLoc import qualified Bio.Location.Position as Pos import Bio.Location.Strand import Bio.Sequence.SeqData -- | General (disjoint) sequence region consisting of a concatenated -- set of contiguous regions newtype Loc = Loc [CLoc.ContigLoc] deriving (Eq, Ord, Show) instance Stranded Loc where revCompl (Loc contigs) = Loc $ reverse $ map revCompl contigs -- | Length of the region length :: Loc -> Offset length (Loc contigs) = sum $ map CLoc.length contigs -- | The bounds of a 'Loc', consisting of the lowest & highest -- sequence indices lying within the region. The first element of -- the pair will always be lower than the second. bounds :: Loc -> (Offset, Offset) bounds (Loc []) = error "locBounds on zero-contig Loc" bounds (Loc contigs) = (minimum *** maximum) $ unzip $ map CLoc.bounds contigs -- | 0-based starting (5' in the region orientation) offset of the -- region on its sequence. startPos :: Loc -> Pos.Pos startPos (Loc []) = error "startPos: zero-contig Loc" startPos (Loc contigs) = CLoc.startPos $ head contigs -- | 0-based ending (3' in the region orientation) offset of the -- region on its sequence. endPos :: Loc -> Pos.Pos endPos (Loc []) = error "endPos: zero-contig Loc" endPos (Loc contigs) = CLoc.endPos $ last contigs -- | Subsequence 'SeqData' for a 'Loc', provided that the region is -- entirely within the sequence. seqData :: (Error e, MonadError e m) => SeqData -> Loc -> m SeqData seqData sequ (Loc contigs) = liftM LBS.concat $ mapM (CLoc.seqData sequ) contigs seqDataPadded :: SeqData -> Loc -> SeqData seqDataPadded sequ (Loc contigs) = LBS.concat $ map (CLoc.seqDataPadded sequ) contigs -- | For a 'Pos' and a 'Loc' region on the same sequence, find the -- corresponding 'Pos' relative to the region, if the 'Pos' is -- within the region. If the 'Loc' region has redundant positions -- for a given sequence position, the first is returned. posInto :: Pos.Pos -> Loc -> Maybe Pos.Pos posInto seqpos (Loc contigs) = posIntoContigs seqpos contigs posIntoContigs :: Pos.Pos -> [CLoc.ContigLoc] -> Maybe Pos.Pos posIntoContigs _ [] = Nothing posIntoContigs seqpos (contig@(CLoc.ContigLoc _ len _):rest) = case CLoc.posInto seqpos contig of just@(Just _) -> just Nothing -> liftM (flip Pos.slide len) $ posIntoContigs seqpos rest -- | For a 'Loc' region on a sequence and a 'Pos' relative to the -- region, find the corresponding 'Pos' on the sequence, provided -- that the position is within the bounds of the region. posOutof :: Pos.Pos -> Loc -> Maybe Pos.Pos posOutof pos (Loc contigs) = posOutofContigs pos contigs posOutofContigs :: Pos.Pos -> [CLoc.ContigLoc] -> Maybe Pos.Pos posOutofContigs _ [] = Nothing posOutofContigs seqpos (contig@(CLoc.ContigLoc _ len _):rest) = case CLoc.posOutof seqpos contig of just@(Just _) -> just Nothing -> posOutofContigs (Pos.slide seqpos $ negate len) rest -- | Extend a 'Loc' region by incorporating contigous nucleotide -- regions of the specified lengths on the 5' and 3' ends extend :: (Offset, Offset) -- ^ (5' extension, 3' extension) -> Loc -> Loc extend _ (Loc []) = error "extendLoc on zero-contig Loc" extend (ext5, ext3) (Loc contigs) = Loc $ case extendContigs3 contigs of [] -> error "Empty contigs after extendContigs3" (cfirst:crest) -> (CLoc.extend (ext5, 0) cfirst):crest where extendContigs3 [] = error "Empty contigs in extendContigs3" extendContigs3 [clast] = [CLoc.extend (0, ext3) clast] extendContigs3 (contig:crest) = contig : extendContigs3 crest -- | For a 'Pos' and a 'Loc' on the same sequence, does the position -- fall within the 'Loc' region? isWithin :: Pos.Pos -> Loc -> Bool isWithin seqpos (Loc contigs) = or $ map (CLoc.isWithin seqpos) contigs overlappingContigs :: Loc -> Loc -> [(CLoc.ContigLoc, CLoc.ContigLoc)] overlappingContigs (Loc contigs1) (Loc contigs2) = filter (uncurry CLoc.overlaps) [(c1, c2) | c1 <- contigs1, c2 <- contigs2 ] -- | For a pair of 'Loc' regions on the same sequence, do they overlap -- at all? overlaps :: Loc -> Loc -> Bool overlaps l1 l2 = not $ null $ overlappingContigs l1 l2 display :: Loc -> String display (Loc contigs) = intercalate ";" $ map CLoc.display contigs