{-# OPTIONS_GHC -fno-warn-orphans #-}
module Bio.ABI.Decode () where
import Bio.Sequence (SequenceDecodable (..),
weightedSequence)
import Bio.Sequence.Basecalled (BasecalledSequence)
import Data.ByteString as BS (ByteString)
import Data.ByteString.Lazy as BSL (ByteString, fromStrict)
import Data.ByteString.Lazy.Char8 as BSL8 (unpack)
import Data.Char (ord)
import Data.List (elem, find)
import Data.Maybe (maybe)
import Data.Text (Text)
import Hyrax.Abif (Abif (..), Directory (..))
import Hyrax.Abif.Read (getAbif)
-- | Decode a lazy 'BSL.ByteString' into a 'BasecalledSequence'.
--
-- The bytes are parsed with 'getAbif'; the sequence and quality data are then
-- pulled out of the parsed 'Abif' and combined by 'weightedSequence'.
-- The first step that yields a 'Left' aborts the decoding with that message.
instance SequenceDecodable BSL.ByteString BasecalledSequence where
    sequenceDecode :: BSL.ByteString -> Either Text BasecalledSequence
    sequenceDecode bs = getAbif bs >>= fromAbif
      where
        -- Build the result from an already parsed ABIF structure.
        fromAbif :: Abif -> Either Text BasecalledSequence
        fromAbif parsed = do
            bases     <- extractSequence parsed
            qualities <- extractQuality parsed
            weightedSequence bases qualities
-- | Decode a strict 'BS.ByteString' into a 'BasecalledSequence'.
--
-- The input is converted to a lazy byte string with 'BSL.fromStrict' and
-- handed to the 'SequenceDecodable' instance for lazy byte strings.
instance SequenceDecodable BS.ByteString BasecalledSequence where
    sequenceDecode :: BS.ByteString -> Either Text BasecalledSequence
    sequenceDecode strictBytes = sequenceDecode (BSL.fromStrict strictBytes)
-- | Read the data of the @PBAS@ directory and validate it with 'checkACGT'.
--
-- Returns 'Left' when the directory lookup fails or when validation fails.
extractSequence :: Abif -> Either Text String
extractSequence abif = do
    rawBases <- findDataByDirectory "PBAS" abif
    checkACGT rawBases
-- | Read the data of the @PCON@ directory as a list of 'Double's.
--
-- Every character of the directory data is mapped to its code point ('ord')
-- and converted to 'Double'. Returns 'Left' when the directory lookup fails.
extractQuality :: Abif -> Either Text [Double]
extractQuality abif = fmap (map charToQuality) (findDataByDirectory "PCON" abif)
  where
    -- Numeric value of a single character of the directory data.
    charToQuality :: Char -> Double
    charToQuality = fromIntegral . ord
-- | Accept a string only if every character is one of @A@, @C@, @G@ or @T@.
--
-- Returns the input unchanged in 'Right' when all characters pass (this
-- includes the empty string), otherwise a 'Left' with a fixed error message.
checkACGT :: String -> Either Text String
checkACGT str =
    if all isNucleotide str
        then Right str
        else Left "Bio.ABI.Decode: could not parse sequence"
  where
    -- Only the four upper-case letters are accepted.
    isNucleotide :: Char -> Bool
    isNucleotide ch = ch == 'A' || ch == 'C' || ch == 'G' || ch == 'T'
-- | Look up the first directory of an 'Abif' whose tag name equals the given
-- name and return its data unpacked into a 'String'.
--
-- Returns 'Left' with a message naming the requested directory when no
-- directory with that tag name is present.
findDataByDirectory :: Text -> Abif -> Either Text String
findDataByDirectory dirName abif =
    case find ((== dirName) . dTagName) (aDirs abif) of
        Just dir -> Right (BSL8.unpack (dData dir))
        Nothing  -> Left ("Bio.ABI.Decode: could not find directory " <> dirName)