module Bio.Sequence.Phd( Phd(..), readPhd, hReadPhd, readPhdTags) where import Bio.Core.Sequence import Bio.Sequence.PhdData import Bio.Sequence.PhdTag import Text.ParserCombinators.Parsec hiding (label) import qualified Data.ByteString as BBB import qualified Data.ByteString.Lazy as BB import qualified Data.ByteString.Lazy.Char8 as B import Data.Ix import Data.Int (Int64) import Data.Char (isSpace) import Data.List import Data.Maybe import Data.Text (Text) import Data.Time import Data.Time.Format import Data.Binary (encode) import System.IO import System.Locale -- | Parse a .phd file, extracting the contents as a PHD readPhd :: FilePath -> IO Phd readPhd f = return . mkPhd =<< readFile f readPhdTags :: FilePath -> IO (Maybe [PhdTag]) readPhdTags f = return . mkPhdTags . lines =<< readFile f -- | Parse .phd contents from a handle hReadPhd :: Handle -> IO Phd hReadPhd h = return . mkPhd =<< hGetContents h -- | The actual phd parser. mkPhd :: String -> Phd mkPhd inp = let (hd:fs) = filter (not . null) . lines $ inp (magic,label) = splitAt 15 hd (comment'',seqAndTags) = break (== "BEGIN_DNA") fs comment' = init $ tail comment'' comment = map ((tail . snd) . break (== ' ')) comment' (seq', tags') = break (== "END_DNA") seqAndTags seq = drop 1 seq' tags = init $ tail tags' fields = words . unlines $ comment sdata = filter ((==3).length) . map words $ seq err = error "failed to parse quality value" dna = concatMap (!! 0) sdata qual = map ((\x -> read x :: Int) . (!! 1)) sdata traceInd = map ((\x -> read x :: Int) . (!! 2)) sdata in if magic == "BEGIN_SEQUENCE " then Phd (mkComment comment) (mkDNABlock label dna qual traceInd) (mkPhdTags tags) else error "Incorrectly formatted PHD file - missing BEGIN_SEQUENCE" -- Todo: also check that we have a BEGIN_DNA/END_DNA region there. mkComment :: [String] -> Comment mkComment com = Comment { chromatFile = head com , abiThumbprint = com!!1 , phredVersion = com!!2 , callMethod = com!!3 , qualityLevels = read (com!!4) :: Int , time = readTime defaultTimeLocale "%a %b %-e %T %Y" $ com!!5 , traceArrayMinIndex = read (com!!6) :: Int , traceArrayMaxIndex = read (com!!7) :: Int , trim = if length com > 10 then Just $ com!!8 else Nothing , chem = (!!) com (if length com > 10 then 9 else 8) , dye = (!!) com (if length com > 10 then 10 else 9) } mkDNABlock :: String -> String -> [Int] -> [Int] -> DNABlock mkDNABlock l d q t = DNABlock { label = SeqLabel { unSL = B.pack l} , bases = SeqData { unSD = B.pack d } , qualities = QualData { unQD = encode q } , traceIndices = t } mkPhdTags :: [String] -> Maybe [PhdTag] mkPhdTags phdLines = case groupByTags phdLines of [] -> Nothing _ -> Just (map (fromJust . mkOnePhdTag) (groupByTags phdLines)) groupByTags :: [String] -> [[String]] groupByTags [] = [] groupByTags xs = let begin_indices = elemIndices "BEGIN_TAG" xs end_indices = elemIndices "END_TAG" xs tag_spans = zip begin_indices end_indices grouping x = drop (fst (tag_spans !! x)) (take (snd (tag_spans !! x) + 1) xs) in map grouping (Data.Ix.range (0, length tag_spans - 1)) parseReadPosition :: String -> (Offset, Offset) parseReadPosition line = (head positions, (head . tail) positions) where positions = map (\x -> Offset {unOff = read x :: Int64}) $ dropLabel line dropLabel :: String -> [String] dropLabel = tail . words mkOnePhdTag :: [String] -> Maybe PhdTag mkOnePhdTag td = case length td of 0 -> Nothing 9 -> Just PhdTag { tagType = head $ dropLabel (td!!1) , source = head $ dropLabel (td!!2) , unpaddedReadPosition = parseReadPosition (td!!3) , date = readTime defaultTimeLocale "%y/%d/%m %T" $ intercalate " " $ dropLabel (td!!4) , Bio.Sequence.PhdTag.comment = if td!!6 == "BEGIN_TAG" then Nothing else Just (td!!6) } _ -> Just PhdTag { tagType = head $ dropLabel (td!!1) , source = head $ dropLabel (td!!2) , unpaddedReadPosition = parseReadPosition (td!!3) , date = readTime defaultTimeLocale "%y/%d/%m %T" $ intercalate " " $ dropLabel (td!!4) , Bio.Sequence.PhdTag.comment = Nothing }