{- | Module: Bio.Sequence.Fasta This module incorporates functionality for reading and writing sequence data in the Fasta format. Each sequence consists of a header (with a '>' prefix) and a set of lines containing the sequence data. -} module Bio.Sequence.Fasta ( -- * Reading and writing plain FASTA files readFasta, writeFasta, hReadFasta, hWriteFasta -- * Reading and writing quality files , readQual, writeQual, hWriteQual -- * Combining FASTA and quality files , readFastaQual, hWriteFastaQual, writeFastaQual -- * Counting sequences in a FASTA file , countSeqs -- * Helper function for reading your own sequences , mkSeqs ) where -- import Data.Char (isSpace) import Data.List (groupBy,intersperse) import Data.Int import Data.Maybe import System.IO import System.IO.Unsafe import Control.Monad import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString.Lazy.Char8 as B import qualified Data.ByteString.Lazy as BB import Data.ByteString.Lazy.Char8 (ByteString) import Bio.Sequence.SeqData splitsAt :: Offset -> ByteString -> [ByteString] splitsAt n s = let (s1,s2) = B.splitAt n s in if B.null s2 then [s1] else s1 : splitsAt n s2 {- -- ugly? class SeqType sd where toSeq :: sd -> sd -> Sequence fromSeq :: Sequence -> (sd,sd) instance SeqType B.ByteString where toSeq = Seq fromSeq (Seq x y) = (x,y) instance SeqType BS.ByteString where toSeq h s = Seq (B.fromChunks [h]) (B.fromChunks [s]) fromSeq (Seq x y) = (c x, c y) where c = BS.concat . B.toChunks -} -- | Lazily read sequences from a FASTA-formatted file readFasta :: FilePath -> IO [Sequence] readFasta f = do ls <- getLines f return (mkSeqs ls) -- | Write sequences to a FASTA-formatted file. -- Line length is 60. writeFasta :: FilePath -> [Sequence] -> IO () writeFasta f ss = do h <- openFile f WriteMode hWriteFasta h ss hClose h -- | Read quality data for sequences to a file. readQual :: FilePath -> IO [Sequence] readQual f = do ls <- getLines f return (mkQual ls) -- | Write quality data for sequences to a file. writeQual :: FilePath -> [Sequence] -> IO () writeQual f ss = do h <- openFile f WriteMode hWriteQual h ss hClose h -- | Read sequence and associated quality. Will error if -- the sequences and qualites do not match one-to-one in sequence. readFastaQual :: FilePath -> FilePath -> IO [Sequence] readFastaQual s q = do ss <- readFasta s qs <- readQual q return ss -- warning: assumes correct qual file! let mkseq s1@(Seq x y _) s2@(Seq _ _ (Just z)) | seqlabel s1 /= seqlabel s2 = error ("mismatching sequence and quality: " ++show (seqlabel s1,seqlabel s2)) | B.length y /= B.length z = error ("mismatching sequence and quality lengths:" ++ show (seqlabel s1,B.length y,B.length z)) | otherwise = Seq x y (Just z) mkseq _ _ = error "readFastaQual: could not combine Fasta and Qual information" return $ zipWith mkseq ss qs -- | Write sequence and quality data simulatnously -- This may be more laziness-friendly. writeFastaQual :: FilePath -> FilePath -> [Sequence] -> IO () writeFastaQual f q ss = do hf <- openFile f WriteMode hq <- openFile q WriteMode hWriteFastaQual hf hq ss hClose hq hClose hf hWriteFastaQual :: Handle -> Handle -> [Sequence] -> IO () hWriteFastaQual hf hq = mapM_ wFQ where wFQ s = wFasta hf s >> wQual hq s -- | Lazily read sequence from handle hReadFasta :: Handle -> IO [Sequence] hReadFasta h = do ls <- hGetLines h return (mkSeqs ls) -- | Write sequences in FASTA format to a handle. hWriteFasta :: Handle -> [Sequence] -> IO () hWriteFasta h = mapM_ (wFasta h) wHead :: Handle -> SeqData -> IO () wHead h l = do B.hPut h $ B.pack ">" B.hPut h l B.hPut h $ B.pack "\n" wFasta :: Handle -> Sequence -> IO () wFasta h (Seq l d _) = do wHead h l let ls = splitsAt 60 d mapM_ (B.hPut h) $ intersperse (B.pack "\n") ls B.hPut h $ B.pack "\n" hWriteQual :: Handle -> [Sequence] -> IO () hWriteQual h = mapM_ (wQual h) wQual :: Handle -> Sequence -> IO () wQual h (Seq l _ (Just q)) = do wHead h l let qls = splitsAt 20 q qs = B.pack . unwords . map show . BB.unpack mapM_ ((\l' -> B.hPut h l' >> B.hPut h (B.pack "\n")) . qs) qls wQual _ (Seq _ _ Nothing) = return () -- ByteString operations -- These aren't (or possible weren't) provided by the FPS library. -- Implement line-based IO (in retrospect it'd be simpler -- and better to just use 'lines'.) -- lazily read lines from file getLines :: FilePath -> IO [ByteString] getLines f = do h <- openFile f ReadMode hGetLines' (hClose h) h -- lazily read lines from handle hGetLines :: Handle -> IO [ByteString] hGetLines = hGetLines' (return ()) -- add an optional handle-closing parameter hGetLines' :: IO () -> Handle -> IO [ByteString] hGetLines' c h = do e <- hIsEOF h if e then c >> return [] else do l' <- BS.hGetLine h let l = B.fromChunks $ if BS.null l' then [] else [l'] ls <- unsafeInterleaveIO $ hGetLines' c h return (l:ls) -- | Convert a list of FASTA-formatted lines into a list of sequences. -- Blank lines are ignored. -- Comment lines start with "#" are allowed between sequences (and ignored). -- Lines starting with ">" initiate a new sequence. mkSeqs :: [ByteString] -> [Sequence] mkSeqs = map mkSeq . blocks mkSeq :: [ByteString] -> Sequence mkSeq (l:ls) = Seq (B.drop 1 l) (B.concat $ takeWhile isSeq ls) Nothing where isSeq s = (not . B.null) s && ((flip elem) (['A'..'Z']++['a'..'z']) . B.head) s mkSeq [] = error "empty input to mkSeq" mkQual :: [ByteString] -> [Sequence] mkQual = map f . blocks where f (l:ls) = Seq (B.drop 1 l) B.empty (Just $ BB.pack (map int (B.words $ B.unlines ls))) f [] = error "mkQual: empty quality data" int = fromIntegral . fst . fromJust' . B.readInt fromJust' = maybe (error "Error in qual format") id -- | Split lines into blocks starting with '>' characters -- Filter out # comments (but not semicolons?) blocks :: [ByteString] -> [[ByteString]] blocks = groupBy (const (('>' /=) . B.head)) . filter ((/='#') . B.head) . filter (not . B.null) countSeqs :: FilePath -> IO Int countSeqs f = do ss <- B.readFile f let hdrs = filter (('>'==).B.head) $ filter (not . B.null) $ B.lines ss return (length hdrs)