{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE LambdaCase #-}
module Biobase.BLAST.Import (blastCmdJSON2FromFile,
parseJSONBlast,
blastFromFile,
parseTabularBlasts,
parseTabularHTTPBlasts,
blastHTTPFromFile
) where
import Prelude hiding (takeWhile)
import Data.Attoparsec.ByteString.Char8 hiding (isSpace)
import qualified Data.Attoparsec.ByteString.Lazy as L
import qualified Data.ByteString.Char8 as C
import qualified Data.ByteString.Builder as S
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.Vector as V
import System.Directory
import Data.Char
import Control.Monad
import Debug.Trace
import Text.Printf
import Biobase.BLAST.Types
import qualified Data.Aeson as A
blastCmdJSON2FromFile :: String -> IO (Either String BlastCmdJSON2)
blastCmdJSON2FromFile filePath = do
printf "# reading blast JSON2 input from file %s\n" filePath
blastFileExists <- doesFileExist filePath
if blastFileExists
then do
bs <- B.readFile filePath
let json = parseJSONBlastCmd bs
return json
else fail "# JSON2 blast file \"%s\" does not exist\n" filePath
parseJSONBlastCmd :: B.ByteString -> Either String BlastCmdJSON2
parseJSONBlastCmd bs = A.eitherDecode bs :: Either String BlastCmdJSON2
parseJSONBlast :: B.ByteString -> Either String BlastJSON2
parseJSONBlast bs = A.eitherDecode bs :: Either String BlastJSON2
blastFromFile :: String -> IO [BlastTabularResult]
blastFromFile filePath = do
printf "# reading tabular blast input from file %s\n" filePath
blastFileExists <- doesFileExist filePath
if blastFileExists
then parseTabularBlasts <$> B.readFile filePath
else fail "# tabular blast file \"%s\" does not exist\n" filePath
blastHTTPFromFile :: String -> IO [BlastTabularResult]
blastHTTPFromFile filePath = do
printf "# reading tabular blast input from file %s\n" filePath
blastFileExists <- doesFileExist filePath
if blastFileExists
then parseTabularHTTPBlasts <$> B.readFile filePath
else fail "# tabular blast file \"%s\" does not exist\n" filePath
parseTabularBlasts :: B.ByteString -> [BlastTabularResult]
parseTabularBlasts = go
where go xs = case L.parse genParseTabularBlast xs of
L.Fail remainingInput ctxts err -> error $ "parseTabularBlasts failed! " ++ err ++ " ctxt: " ++ show ctxts ++ " head of remaining input: " ++ B.unpack (B.take 1000 remainingInput)
L.Done remainingInput btr
| B.null remainingInput -> [btr]
| otherwise -> btr : go remainingInput
parseTabularHTTPBlasts :: B.ByteString -> [BlastTabularResult]
parseTabularHTTPBlasts = go
where go xs = case L.parse genParseTabularHTTPBlast xs of
L.Fail remainingInput ctxts err -> error $ "parseTabularHTTPBlasts failed! " ++ err ++ " ctxt: " ++ show ctxts ++ " head of remaining input: " ++ B.unpack (B.take 1000 remainingInput)
L.Done remainingInput btr
| B.null remainingInput -> [btr]
| otherwise -> btr : go remainingInput
genParseBlastProgram :: Parser BlastProgram
genParseBlastProgram = do
choice [string "# BLAST",string "# blast"]
(toLower <$> anyChar) >>= return . \case
'x' -> BlastX
'p' -> BlastP
'n' -> BlastN
genParseTabularBlast :: Parser BlastTabularResult
genParseTabularBlast = do
_blastProgram <- genParseBlastProgram <?> "Program"
many1 (notChar '\n')
endOfLine
string "# Query: " <?> "Query"
_blastQueryId <- takeWhile (not . isSpace) <* manyTill anyChar endOfLine <?> "QueryId"
string "# Database: " <?> "Database"
_blastDatabase <- many1 (notChar '\n') <?> "Db"
string "\n# " <?> "header linebreak"
skipMany (try genParseFieldLine) <?> "Fields"
_blastHitNumber <- decimal <?> "Hit number"
string " hits found\n" <?> "hits found"
_tabularHit <- count _blastHitNumber (try genParseBlastTabularHit) <?> "Tabular hit"
skipMany endOfLine
return $ BlastTabularResult _blastProgram (toLB _blastQueryId) (B.pack _blastDatabase) _blastHitNumber (V.fromList _tabularHit)
genParseTabularHTTPBlast :: Parser BlastTabularResult
genParseTabularHTTPBlast = do
_blastProgram <- genParseBlastProgram <?> "Program"
endOfLine
string "# Iteration: " <?> "Iteration"
_ <- takeWhile (not . isSpace) <* manyTill anyChar endOfLine <?> "IterationNumber"
string "# Query: " <?> "Query"
_blastQueryId <- takeWhile (not . isSpace) <* manyTill anyChar endOfLine <?> "QueryId"
string "# RID: " <?> "RID"
_ <- takeWhile (not . isSpace) <* manyTill anyChar endOfLine <?> "RID"
string "# Database: " <?> "Database"
_blastDatabase <- many1 (notChar '\n') <?> "Db"
string "\n# " <?> "header linebreak"
skipMany (try genParseFieldLine) <?> "Fields"
_blastHitNumber <- decimal <?> "Hit number"
string " hits found\n" <?> "hits found"
_tabularHit <- count _blastHitNumber (try genParseBlastHTTPTabularHit) <?> "Tabular hit"
skipMany endOfLine
return $ BlastTabularResult _blastProgram (toLB _blastQueryId) (B.pack _blastDatabase) _blastHitNumber (V.fromList _tabularHit)
genParseFieldLine :: Parser ()
genParseFieldLine = do
string "Fields:"
skipMany (notChar '\n')
string "\n# "
return ()
genParseBlastTabularHit :: Parser BlastTabularHit
genParseBlastTabularHit = do
_queryId <- takeWhile1 ((/=9) . ord) <?> "hit qid"
char '\t'
_subjectId <- takeWhile1 ((/=9) . ord) <?> "hit sid"
char '\t'
_seqIdentity <- double <?> "hit seqid"
char '\t'
_alignmentLength <- decimal <?> "hit sid"
char '\t'
_misMatches <- decimal <?> "hit mmatch"
char '\t'
_gapOpenScore <- decimal <?> "hit gopen"
char '\t'
_queryStart <- decimal <?> "hit qstart"
char '\t'
_queryEnd <- decimal <?> "hit qend"
char '\t'
_hitSeqStart <- decimal <?> "hit sstart"
char '\t'
_hitSeqEnd <- decimal <?> "hit send"
char '\t'
_eValue <- double <?> "hit eval"
char '\t'
_bitScore <- double <?> "hit bs"
char '\t'
_subjectFrame <- decimal <?> "hit sF"
char '\t'
_querySeq <- takeWhile1 ((/=9) . ord) <?> "hit qseq"
char '\t'
_subjectSeq <- takeWhile1 ((/=10) . ord) <?> "hit subSeq"
char '\n'
return $ BlastTabularHit (B.fromStrict _queryId) (B.fromStrict _subjectId) _seqIdentity _alignmentLength _misMatches _gapOpenScore _queryStart _queryEnd _hitSeqStart _hitSeqEnd _eValue _bitScore _subjectFrame (B.fromStrict _querySeq) (B.fromStrict _subjectSeq)
genParseBlastHTTPTabularHit :: Parser BlastTabularHit
genParseBlastHTTPTabularHit = do
_queryId <- takeWhile1 ((/=9) . ord) <?> "hit qid"
char '\t'
_subjectId <- takeWhile1 ((/=9) . ord) <?> "hit sid"
char '\t'
_ <- takeWhile1 ((/=9) . ord) <?> "redundant id1"
char '\t'
_ <- takeWhile1 ((/=9) . ord) <?> "redundant id2"
char '\t'
_seqIdentity <- double <?> "hit seqid"
char '\t'
_alignmentLength <- decimal <?> "hit sid"
char '\t'
_misMatches <- decimal <?> "hit mmatch"
char '\t'
_gapOpenScore <- decimal <?> "hit gopen"
char '\t'
_queryStart <- decimal <?> "hit qstart"
char '\t'
_queryEnd <- decimal <?> "hit qend"
char '\t'
_hitSeqStart <- decimal <?> "hit sstart"
char '\t'
_hitSeqEnd <- decimal <?> "hit send"
char '\t'
_eValue <- double <?> "hit eval"
char '\t'
_bitScore <- double <?> "hit bs"
char '\n'
return $ BlastTabularHit (B.fromStrict _queryId) (B.fromStrict _subjectId) _seqIdentity _alignmentLength _misMatches _gapOpenScore _queryStart _queryEnd _hitSeqStart _hitSeqEnd _eValue _bitScore 0 B.empty B.empty
readEvalue :: C.ByteString -> Double
readEvalue eValBs
| (head stringEval) == '.' = read ('0':(stringEval)) :: Double
| otherwise = read stringEval :: Double
where stringEval = C.unpack eValBs
aminoacidLetters = inClass "ARNDCQEGHILMFPSTWYVBZX-"
nucleotideLetters = inClass "AGTCURYSWKMBDHVN-."
bioLetters = inClass "ABCDEFGHIJKLMNOPQRSTUVWXYZ.-"
toLB :: C.ByteString -> B.ByteString
toLB = S.toLazyByteString . S.byteString