{- | Data structures for manipulating (biological) sequences. Generally supports both nucleotide and protein sequences, some functions, like @revcompl@, only makes sense for nucleotides. -} module Bio.Sequence.SeqData ( -- * Data structure -- | A sequence is a header, sequence data itself, and optional quality data. -- All items are lazy bytestrings. The Offset type can be used for indexing. Sequence(..), Offset, SeqData, -- | Quality data is normally associated with nucleotide sequences Qual, QualData -- * Accessor functions , (!), seqlength,seqlabel,seqheader,seqdata , (?), hasqual, seqqual -- * Converting to and from [Char] , fromStr, toStr -- * Nucleotide functionality -- | Nucleotide sequences contain the alphabet [A,C,G,T]. -- IUPAC specifies an extended nucleotide alphabet with wildcards, but -- it is not supported at this point. , compl, revcompl -- * Protein functionality -- | Proteins are chains of amino acids, represented by the IUPAC alphabet. , Amino(..), translate, fromIUPAC, toIUPAC -- amino acids ) where import Data.List (unfoldr) import Data.Char (toUpper) import Data.Int import Data.Word import Data.Maybe (fromJust, isJust) import qualified Data.ByteString.Lazy.Char8 as B import qualified Data.ByteString.Lazy as BB -- | An offset, index, or length of a 'SeqData' type Offset = Int64 -- | The basic data type used in 'Sequence's type SeqData = B.ByteString -- | Basic type for quality data. Range 0..255. Typical Phred output is in -- the range 6..50, with 20 as the line in the sand separating good from bad. type Qual = Word8 -- | Quality data is a 'Qual' vector, currently implemented as a 'ByteString'. type QualData = BB.ByteString -- mild abuse -- | A sequence consists of a header, the sequence data itself, and optional quality data. data Sequence = Seq !SeqData !SeqData !(Maybe QualData) -- ^ header and actual sequence deriving (Show,Eq) -- | Convert a String to 'SeqData' fromStr :: String -> SeqData fromStr = B.pack -- | Convert a 'SeqData' to a String toStr :: SeqData -> String toStr = B.unpack -- | Read the character at the specified position in the sequence. {-# INLINE (!) #-} (!) :: Sequence -> Offset -> Char (!) (Seq _ bs _) = B.index bs {-# INLINE (?) #-} (?) :: Sequence -> Offset -> Qual (?) (Seq _ _ (Just qs)) = BB.index qs -- | Return sequence length. seqlength :: Sequence -> Offset seqlength (Seq _ bs _) = B.length bs -- | Return sequence label (first word of header) seqlabel :: Sequence -> SeqData seqlabel (Seq l _ _) = head (B.words l) -- | Return full header. seqheader :: Sequence -> SeqData seqheader (Seq l _ _) = l -- | Return the sequence data. seqdata :: Sequence -> SeqData seqdata (Seq _ d _) = d -- | Return the quality data, or error if none exist. Use hasqual if in doubt. seqqual :: Sequence -> QualData seqqual (Seq _ _ (Just q)) = q -- | Check whether the sequence has associated quality data. hasqual :: Sequence -> Bool hasqual (Seq _ _ q) = isJust q ------------------------------------------------------------ -- Nucleotide (DNA, RNA) specific stuff ------------------------------------------------------------ -- | Calculate the reverse complement. -- This is only relevant for the nucleotide alphabet, -- and it leaves other characters unmodified. revcompl :: Sequence -> Sequence revcompl (Seq l bs qs) = Seq l (B.reverse $ B.map compl bs) (maybe Nothing (Just . BB.reverse) qs) -- | Complement a single character. I.e. identify the nucleotide it -- can hybridize with. Note that for multiple nucleotides, you usually -- want the reverse complement (see 'revcompl' for that). compl :: Char -> Char compl 'A' = 'T' compl 'T' = 'A' compl 'C' = 'G' compl 'G' = 'C' compl 'a' = 't' compl 't' = 'a' compl 'c' = 'g' compl 'g' = 'c' compl x = x ------------------------------------------------------------ -- Amino acid (protein) stuff ------------------------------------------------------------ -- | Translate a nucleotide sequence into the corresponding protein -- sequence. This works rather blindly, with no attempt to identify ORFs -- or otherwise QA the result. translate :: Sequence -> Offset -> [Amino] translate s' o' = unfoldr triples (s',o') where triples (s,o) = if o > seqlength s - 3 then Nothing else Just (trans1 (map (s!) [o,o+1,o+2]),(s,o+3)) -- prop_trans s = tail (translate s 0) == translate s 3 trans1 :: String -> Amino trans1 = maybe Xaa id . flip lookup trans_tbl . map (repUT . toUpper) where repUT x = if x == 'U' then 'T' else x -- RNA uses U for T data Amino = Ala | Arg | Asn | Asp | Cys | Gln | Glu | Gly | His | Ile | Leu | Lys | Met | Phe | Pro | Ser | Thr | Tyr | Trp | Val | STP | Asx | Glx | Xle | Xaa -- unknowns deriving (Show,Eq) trans_tbl :: [(String,Amino)] trans_tbl = [("GCT",Ala),("GCC",Ala),("GCA",Ala),("GCG",Ala), ("CGT",Arg),("CGC",Arg),("CGA",Arg),("CGG",Arg),("AGA",Arg),("AGG",Arg), ("AAT",Asn),("AAC",Asn), ("GAT",Asp),("GAC",Asp), -- ("RAT",Asx),("RAC",Asx), -- IUPAC: R is purine (A or G) ("TGT",Cys),("TGC",Cys), ("CAA",Gln),("CAG",Gln), ("GAA",Glu),("GAG",Glu), -- ("SAA",Glx),("SAG",Glx), -- IUPAC: S is C or G ("GGT",Gly),("GGC",Gly),("GGA",Gly),("GGG",Gly), ("CAT",His),("CAC",His), ("ATT",Ile),("ATC",Ile),("ATA",Ile), ("TTA",Leu),("TTG",Leu),("CTT",Leu),("CTC",Leu),("CTA",Leu),("CTG",Leu), ("AAA",Lys),("AAG",Lys), ("ATG",Met), ("TTT",Phe),("TTC",Phe), ("CCT",Pro),("CCC",Pro),("CCA",Pro),("CCG",Pro), ("TCT",Ser),("TCC",Ser),("TCA",Ser),("TCG",Ser),("AGT",Ser),("AGC",Ser), ("ACT",Thr),("ACC",Thr),("ACA",Thr),("ACG",Thr), ("TAT",Tyr),("TAC",Tyr), ("TGG",Trp), ("GTT",Val),("GTC",Val),("GTA",Val),("GTG",Val), ("TAG",STP),("TGA",STP),("TAA",STP) ] -- todo: extend with more IUPAC nucleotide wildcards? -- | Convert a list of amino acids to a sequence in IUPAC format. toIUPAC :: [Amino] -> SeqData toIUPAC = B.pack . map (fromJust . flip lookup iupac) -- | Convert a sequence in IUPAC format to a list of amino acids. fromIUPAC :: SeqData -> [Amino] fromIUPAC = map (maybe Xaa id . flip lookup iupac') . B.unpack iupac :: [(Amino,Char)] iupac = [(Ala,'A') ,(Arg,'R') ,(Asn,'N') ,(Asp,'D') ,(Cys,'C') ,(Gln,'Q') ,(Glu,'E') ,(Gly,'G') ,(His,'H') ,(Ile,'I') ,(Leu,'L') ,(Lys,'K') ,(Met,'M') ,(Phe,'F') ,(Pro,'P') ,(Ser,'S') ,(Thr,'T') ,(Tyr,'Y') ,(Trp,'W') ,(Val,'V') ,(Asx,'B') -- Asn or Asp ,(Glx,'Z') -- Gln or Glu ,(Xle,'J') -- Ile or Leu ,(STP,'X') ,(Xaa,'X') ] iupac' :: [(Char,Amino)] iupac' = map (\(a,b)->(b,a)) iupac