module Bio.Sequence.Phd(
Phd(..),
readPhd, hReadPhd,
readPhdTags) where
import Bio.Core.Sequence
import Bio.Sequence.PhdData
import Bio.Sequence.PhdTag
import Text.ParserCombinators.Parsec hiding (label)
import qualified Data.ByteString as BBB
import qualified Data.ByteString.Lazy as BB
import qualified Data.ByteString.Lazy.Char8 as B
import Data.Ix
import Data.Int (Int64)
import Data.Char (isSpace)
import Data.List
import Data.Maybe
import Data.Text (Text)
import Data.Time
import Data.Time.Format
import Data.Binary (encode)
import System.IO
import System.Locale
readPhd :: FilePath -> IO Phd
readPhd f = return . mkPhd =<< readFile f
readPhdTags :: FilePath -> IO (Maybe [PhdTag])
readPhdTags f = return . mkPhdTags . lines =<< readFile f
hReadPhd :: Handle -> IO Phd
hReadPhd h = return . mkPhd =<< hGetContents h
mkPhd :: String -> Phd
mkPhd inp =
let (hd:fs) = filter (not . null) . lines $ inp
(magic,label) = splitAt 15 hd
(comment'',seqAndTags) = break (== "BEGIN_DNA") fs
comment' = init $ tail comment''
comment = map ((tail . snd) . break (== ' ')) comment'
(seq', tags') = break (== "END_DNA") seqAndTags
seq = drop 1 seq'
tags = init $ tail tags'
fields = words . unlines $ comment
sdata = filter ((==3).length) . map words $ seq
err = error "failed to parse quality value"
dna = concatMap (!! 0) sdata
qual = map ((\x -> read x :: Int) . (!! 1)) sdata
traceInd = map ((\x -> read x :: Int) . (!! 2)) sdata
in if magic == "BEGIN_SEQUENCE " then
Phd (mkComment comment)
(mkDNABlock label dna qual traceInd)
(mkPhdTags tags)
else
error "Incorrectly formatted PHD file - missing BEGIN_SEQUENCE"
mkComment :: [String] -> Comment
mkComment com = Comment { chromatFile = head com
, abiThumbprint = com!!1
, phredVersion = com!!2
, callMethod = com!!3
, qualityLevels = read (com!!4) :: Int
, time = readTime defaultTimeLocale "%a %b %-e %T %Y" $ com!!5
, traceArrayMinIndex = read (com!!6) :: Int
, traceArrayMaxIndex = read (com!!7) :: Int
, trim = if length com > 10 then
Just $ com!!8
else
Nothing
, chem = (!!) com (if length com > 10 then 9 else 8)
, dye = (!!) com (if length com > 10 then 10 else 9)
}
mkDNABlock :: String -> String -> [Int] -> [Int] -> DNABlock
mkDNABlock l d q t = DNABlock { label = SeqLabel { unSL = B.pack l}
, bases = SeqData { unSD = B.pack d }
, qualities = QualData { unQD = encode q }
, traceIndices = t }
mkPhdTags :: [String] -> Maybe [PhdTag]
mkPhdTags phdLines = case groupByTags phdLines of
[] -> Nothing
_ -> Just (map (fromJust . mkOnePhdTag)
(groupByTags phdLines))
groupByTags :: [String] -> [[String]]
groupByTags [] = []
groupByTags xs =
let begin_indices = elemIndices "BEGIN_TAG" xs
end_indices = elemIndices "END_TAG" xs
tag_spans = zip begin_indices end_indices
grouping x = drop (fst (tag_spans !! x)) (take (snd (tag_spans !! x) + 1) xs)
in map grouping (Data.Ix.range (0, length tag_spans 1))
parseReadPosition :: String -> (Offset, Offset)
parseReadPosition line = (head positions, (head . tail) positions)
where positions = map (\x -> Offset {unOff = read x :: Int64}) $ dropLabel line
dropLabel :: String -> [String]
dropLabel = tail . words
mkOnePhdTag :: [String] -> Maybe PhdTag
mkOnePhdTag td = case length td of
0 -> Nothing
9 -> Just PhdTag { tagType = head $ dropLabel (td!!1)
, source = head $ dropLabel (td!!2)
, unpaddedReadPosition = parseReadPosition (td!!3)
, date = readTime defaultTimeLocale "%y/%d/%m %T" $ intercalate " " $ dropLabel (td!!4)
, Bio.Sequence.PhdTag.comment = if td!!6 == "BEGIN_TAG" then Nothing
else Just (td!!6)
}
_ -> Just PhdTag { tagType = head $ dropLabel (td!!1)
, source = head $ dropLabel (td!!2)
, unpaddedReadPosition = parseReadPosition (td!!3)
, date = readTime defaultTimeLocale "%y/%d/%m %T" $ intercalate " " $ dropLabel (td!!4)
, Bio.Sequence.PhdTag.comment = Nothing
}