{-# LANGUAGE RankNTypes #-}
module Biobase.RNAlien.Library (
module Biobase.RNAlien.Types,
createSessionID,
logMessage,
logEither,
modelConstructer,
constructTaxonomyRecordsCSVTable,
resultSummary,
setVerbose,
logToolVersions,
checkTools,
systemCMsearch,
readCMSearch,
readCMSearches,
compareCM,
parseCMSearch,
cmSearchsubString,
setInitialTaxId,
evaluateConstructionResult,
readCMstat,
parseCMstat,
checkNCBIConnection,
preprocessClustalForRNAz,
preprocessClustalForRNAzExternal,
preprocessClustalForRNAcodeExternal,
rnaZEvalOutput,
reformatFasta,
checkTaxonomyRestriction,
evaluePartitionTrimCMsearchHits,
readFastaFile,
writeFastaFile
)
where
import System.Process
import qualified System.FilePath as FP
import Text.ParserCombinators.Parsec
import Data.List
import Data.Char
import Biobase.Fasta.Strict
import qualified Biobase.BLAST.Types as J
import Bio.ClustalParser
import Data.Int (Int16)
import Biobase.RNAlien.Types
import qualified Data.ByteString.Lazy.Char8 as L
import qualified Data.ByteString.Char8 as B
import Bio.Taxonomy
import Data.Either.Unwrap
import Data.Maybe
import Biobase.Entrez.HTTP
import System.Exit
import Data.Either (lefts,rights,Either)
import qualified Text.EditDistance as ED
import qualified Data.Vector as V
import Control.Concurrent
import System.Random
import Data.Csv
import Data.Matrix
import Biobase.BLAST.HTTP
import Data.Clustering.Hierarchical
import System.Directory
import System.Console.CmdArgs
import qualified Control.Exception.Base as CE
import Bio.RNAfoldParser
import Bio.RNAalifoldParser
import Bio.RNAzParser
import qualified Network.HTTP.Conduit as N
import Network.HTTP.Types.Status
import qualified Bio.RNAcodeParser as RC
import qualified Biobase.RNAlien.RNAcentralHTTP as RCH
import Biobase.RNAlien.InfernalParser
import qualified Data.Text as T
import qualified Data.Text.IO as TI
import qualified Data.Text.Encoding as E
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.IO as TIO
import Text.Printf
import qualified Data.Text.Metrics as TM
import Control.Monad
import qualified Data.Sequence as DS
import Data.Foldable
import Biobase.Types.BioSequence
import qualified Biobase.BLAST.Import as BBI
modelConstructer :: StaticOptions -> ModelConstruction -> IO ModelConstruction
modelConstructer staticOptions modelConstruction = do
logMessage ("Iteration: " ++ show (iterationNumber modelConstruction) ++ "\n") (tempDirPath staticOptions)
iterationSummary modelConstruction staticOptions
let currentIterationNumber = iterationNumber modelConstruction
let foundSequenceNumber = length (concatMap sequenceRecords (taxRecords modelConstruction))
let queries = extractQueries foundSequenceNumber modelConstruction
logVerboseMessage (verbositySwitch staticOptions) ("Queries:" ++ show queries ++ "\n") (tempDirPath staticOptions)
let iterationDirectory = tempDirPath staticOptions ++ show currentIterationNumber ++ "/"
let maybeLastTaxId = extractLastTaxId (taxonomicContext modelConstruction)
Control.Monad.when (isNothing maybeLastTaxId) $ logMessage ("Lineage: Could not extract last tax id \n") (tempDirPath staticOptions)
if maybe True (\uppertaxlimit -> maybe True (\lastTaxId -> uppertaxlimit /= lastTaxId) maybeLastTaxId) (upperTaxonomyLimit modelConstruction)
then do
createDirectory iterationDirectory
let (upperTaxLimit,lowerTaxLimit) = setTaxonomicContextEntrez currentIterationNumber (taxonomicContext modelConstruction) (upperTaxonomyLimit modelConstruction)
logVerboseMessage (verbositySwitch staticOptions) ("Upper taxonomy limit: " ++ show upperTaxLimit ++ "\n " ++ "Lower taxonomy limit: "++ show lowerTaxLimit ++ "\n") (tempDirPath staticOptions)
let expectThreshold = setBlastExpectThreshold modelConstruction
searchResults <- catchAll (searchCandidates staticOptions Nothing currentIterationNumber upperTaxLimit lowerTaxLimit expectThreshold queries)
(\e -> do logWarning ("Warning: Search results iteration" ++ show (iterationNumber modelConstruction) ++ " - exception: " ++ show e) (tempDirPath staticOptions)
return (SearchResult [] Nothing))
currentTaxonomicContext <- getTaxonomicContextEntrez upperTaxLimit (taxonomicContext modelConstruction)
if null (candidates searchResults)
then
alignmentConstructionWithoutCandidates currentTaxonomicContext upperTaxLimit staticOptions modelConstruction
else
alignmentConstructionWithCandidates currentTaxonomicContext upperTaxLimit searchResults staticOptions modelConstruction
else do
logMessage "Message: Modelconstruction complete: Out of queries or taxonomic tree exhausted\n" (tempDirPath staticOptions)
modelConstructionResult staticOptions modelConstruction
catchAll :: IO a -> (CE.SomeException -> IO a) -> IO a
catchAll = CE.catch
setInitialTaxId :: Bool -> Int -> Maybe String -> String -> Maybe Int -> Fasta () ()-> IO (Maybe Int)
setInitialTaxId offlineMode threads inputBlastDatabase tempdir inputTaxId inputSequence =
if (isNothing inputTaxId)
then do
initialTaxId <- findTaxonomyStart offlineMode threads inputBlastDatabase tempdir inputSequence
return (Just initialTaxId)
else do
return inputTaxId
extractLastTaxId :: Maybe Taxon -> Maybe Int
extractLastTaxId taxon
| isNothing taxon = Nothing
| V.null lineageExVector = Nothing
| otherwise = Just (lineageTaxId (V.head lineageExVector))
where lineageExVector = V.fromList (lineageEx (fromJust taxon))
modelConstructionResult :: StaticOptions -> ModelConstruction -> IO ModelConstruction
modelConstructionResult staticOptions modelConstruction = do
let currentIterationNumber = iterationNumber modelConstruction
let outputDirectory = tempDirPath staticOptions
logMessage ("Global search iteration: " ++ show currentIterationNumber ++ "\n") outputDirectory
iterationSummary modelConstruction staticOptions
let foundSequenceNumber = length (concatMap sequenceRecords (taxRecords modelConstruction))
let queries = extractQueries foundSequenceNumber modelConstruction
logVerboseMessage (verbositySwitch staticOptions) ("Queries:" ++ show queries ++ "\n") outputDirectory
let iterationDirectory = outputDirectory ++ show currentIterationNumber ++ "/"
createDirectory iterationDirectory
let logFileDirectoryPath = iterationDirectory ++ "log"
createDirectoryIfMissing False logFileDirectoryPath
let expectThreshold = setBlastExpectThreshold modelConstruction
(alignmentResults,currentPotentialMembers) <- if isJust (taxRestriction staticOptions)
then do
let (upperTaxLimit,lowerTaxLimit) = setRestrictedTaxonomyLimits (fromJust (taxRestriction staticOptions))
restrictedCandidates <- catchAll (searchCandidates staticOptions (taxRestriction staticOptions) currentIterationNumber upperTaxLimit lowerTaxLimit expectThreshold queries)
(\e -> do logWarning ("Warning: Search results iteration" ++ show currentIterationNumber ++ " - exception: " ++ show e) outputDirectory
return (SearchResult [] Nothing))
let uniqueCandidates = filterDuplicates modelConstruction restrictedCandidates
(restrictedAlignmentResults,restrictedPotentialMembers) <- catchAll (alignCandidates staticOptions modelConstruction (fromJust (taxRestriction staticOptions)) uniqueCandidates)
(\e -> do logWarning ("Warning: Alignment results iteration" ++ show currentIterationNumber ++ " - exception: " ++ show e) outputDirectory
return ([],[]))
let currentPotentialMembers = [SearchResult restrictedPotentialMembers (blastDatabaseSize restrictedCandidates)]
return (restrictedAlignmentResults,currentPotentialMembers)
else do
let (upperTaxLimit1,lowerTaxLimit1) = (Just (2157 :: Int), Nothing)
candidates1 <- catchAll (searchCandidates staticOptions (Just "archea") currentIterationNumber upperTaxLimit1 lowerTaxLimit1 expectThreshold queries)
(\e -> do logWarning ("Warning: Search results iteration" ++ show currentIterationNumber ++ " - exception: " ++ show e) outputDirectory
return (SearchResult [] Nothing))
let uniqueCandidates1 = filterDuplicates modelConstruction candidates1
(alignmentResults1,potentialMembers1) <- catchAll (alignCandidates staticOptions modelConstruction "archea" uniqueCandidates1)
(\e -> do logWarning ("Warning: Alignment results iteration" ++ show currentIterationNumber ++ " - exception: " ++ show e) outputDirectory
return ([],[]))
let (upperTaxLimit2,lowerTaxLimit2) = (Just (2 :: Int), Nothing)
candidates2 <- catchAll (searchCandidates staticOptions (Just "bacteria") currentIterationNumber upperTaxLimit2 lowerTaxLimit2 expectThreshold queries)
(\e -> do logWarning ("Warning: Search results iteration" ++ show currentIterationNumber ++ " - exception: " ++ show e) outputDirectory
return (SearchResult [] Nothing))
let uniqueCandidates2 = filterDuplicates modelConstruction candidates2
(alignmentResults2,potentialMembers2) <- catchAll (alignCandidates staticOptions modelConstruction "bacteria" uniqueCandidates2)
(\e -> do logWarning ("Warning: Alignment results iteration" ++ show currentIterationNumber ++ " - exception: " ++ show e) outputDirectory
return ([],[]))
let (upperTaxLimit3,lowerTaxLimit3) = (Just (2759 :: Int), Nothing)
candidates3 <- catchAll (searchCandidates staticOptions (Just "eukaryia") currentIterationNumber upperTaxLimit3 lowerTaxLimit3 expectThreshold queries)
(\e -> do logWarning ("Warning: Search results iteration" ++ show currentIterationNumber ++ " - exception: " ++ show e) outputDirectory
return (SearchResult [] Nothing))
let uniqueCandidates3 = filterDuplicates modelConstruction candidates3
(alignmentResults3,potentialMembers3) <- catchAll (alignCandidates staticOptions modelConstruction "eukaryia" uniqueCandidates3)
(\e -> do logWarning ("Warning: Alignment results iteration" ++ show currentIterationNumber ++ " - exception: " ++ show e) outputDirectory
return ([],[]))
let alignmentResults = alignmentResults1 ++ alignmentResults2 ++ alignmentResults3
let currentPotentialMembers = [SearchResult potentialMembers1 (blastDatabaseSize candidates1), SearchResult potentialMembers2 (blastDatabaseSize candidates2), SearchResult potentialMembers3 (blastDatabaseSize candidates3)]
return (alignmentResults,currentPotentialMembers)
let preliminaryFastaPath = iterationDirectory ++ "model.fa"
let preliminaryCMPath = iterationDirectory ++ "model.cm"
let preliminaryAlignmentPath = iterationDirectory ++ "model.stockholm"
let preliminaryCMLogPath = iterationDirectory ++ "model.cm.log"
let nextModelConstructionInput = constructNext currentIterationNumber modelConstruction alignmentResults Nothing Nothing [] currentPotentialMembers (alignmentModeInfernal modelConstruction)
if (null alignmentResults) && not (alignmentModeInfernal modelConstruction)
then do
logVerboseMessage (verbositySwitch staticOptions) "Alignment result initial mode\n" outputDirectory
logMessage "Message: No sequences found that statisfy filters. Try to reconstruct model with less strict cutoff parameters." outputDirectory
let alignedSequences = extractAlignedSequences (iterationNumber modelConstruction) modelConstruction
let alignmentSequences = map snd (V.toList (V.concat [alignedSequences]))
writeFastaFile preliminaryFastaPath alignmentSequences
let cmBuildFilepath = iterationDirectory ++ "model" ++ ".cmbuild"
let refinedAlignmentFilepath = iterationDirectory ++ "modelrefined" ++ ".stockholm"
let cmBuildOptions ="--refine " ++ refinedAlignmentFilepath
let foldFilepath = iterationDirectory ++ "model" ++ ".fold"
_ <- systemRNAfold preliminaryFastaPath foldFilepath
foldoutput <- readRNAfold foldFilepath
let seqStructure = foldSecondaryStructure (fromRight foldoutput)
let stockholAlignment = convertFastaFoldStockholm (head alignmentSequences) seqStructure
writeFile preliminaryAlignmentPath stockholAlignment
_ <- systemCMbuild cmBuildOptions preliminaryAlignmentPath preliminaryCMPath cmBuildFilepath
_ <- systemCMcalibrate "fast" (cpuThreads staticOptions) preliminaryCMPath preliminaryCMLogPath
reevaluatePotentialMembers staticOptions nextModelConstructionInput
else
if (alignmentModeInfernal modelConstruction)
then do
logVerboseMessage (verbositySwitch staticOptions) ("Alignment construction with candidates - infernal mode\n") outputDirectory
constructModel nextModelConstructionInput staticOptions
writeFile (iterationDirectory ++ "done") ""
logMessage (iterationSummaryLog nextModelConstructionInput) outputDirectory
logVerboseMessage (verbositySwitch staticOptions) (show nextModelConstructionInput) outputDirectory
resultModelConstruction <- reevaluatePotentialMembers staticOptions nextModelConstructionInput
return resultModelConstruction
else do
logVerboseMessage (verbositySwitch staticOptions) ("Alignment construction with candidates - initial mode\n") outputDirectory
constructModel nextModelConstructionInput staticOptions
let nextModelConstructionInputInfernalMode = nextModelConstructionInput {alignmentModeInfernal = True}
logMessage (iterationSummaryLog nextModelConstructionInputInfernalMode) outputDirectory
logVerboseMessage (verbositySwitch staticOptions) (show nextModelConstructionInputInfernalMode) outputDirectory
writeFile (iterationDirectory ++ "done") ""
resultModelConstruction <- reevaluatePotentialMembers staticOptions nextModelConstructionInputInfernalMode
return resultModelConstruction
writeFastaFile :: String -> [Fasta () ()] -> IO ()
writeFastaFile fastaFilePath alignmentSequences = do
let sequenceOutput = B.concat (map (fastaToByteString 80) alignmentSequences)
B.writeFile fastaFilePath sequenceOutput
reevaluatePotentialMembers :: StaticOptions -> ModelConstruction -> IO ModelConstruction
reevaluatePotentialMembers staticOptions modelConstruction = do
let currentIterationNumber = iterationNumber modelConstruction
let outputDirectory = tempDirPath staticOptions
iterationSummary modelConstruction staticOptions
logMessage ("Reevaluation of potential members iteration: " ++ show currentIterationNumber ++ "\n") outputDirectory
let iterationDirectory = outputDirectory ++ show currentIterationNumber ++ "/"
createDirectory iterationDirectory
let indexedPotentialMembers = V.indexed (V.fromList (potentialMembers modelConstruction))
potentialMembersAlignmentResultVector <- V.mapM (\(i,searchresult) -> (alignCandidates staticOptions modelConstruction (show i ++ "_") searchresult)) indexedPotentialMembers
let potentialMembersAlignmentResults = V.toList potentialMembersAlignmentResultVector
let alignmentResults = concatMap fst potentialMembersAlignmentResults
let discardedMembers = concatMap snd potentialMembersAlignmentResults
writeFile (outputDirectory ++ "log/discarded") (concatMap show discardedMembers)
let resultFastaPath = outputDirectory ++ "result.fa"
let resultCMPath = outputDirectory ++ "result.cm"
let resultAlignmentPath = outputDirectory ++ "result.stockholm"
let resultClustalFilepath = outputDirectory ++ "result.clustal"
let resultCMLogPath = outputDirectory ++ "log/result.cm.log"
if null alignmentResults
then do
let lastIterationFastaPath = outputDirectory ++ show (currentIterationNumber - 1)++ "/model.fa"
let lastIterationAlignmentPath = outputDirectory ++ show (currentIterationNumber - 1) ++ "/model.stockholm"
let lastIterationCMPath = outputDirectory ++ show (currentIterationNumber - 1)++ "/model.cm"
copyFile lastIterationCMPath resultCMPath
copyFile lastIterationFastaPath resultFastaPath
copyFile lastIterationAlignmentPath resultAlignmentPath
_ <- systemCMcalibrate "standard" (cpuThreads staticOptions) resultCMPath resultCMLogPath
systemCMalign ("--outformat=Clustal --cpu " ++ show (cpuThreads staticOptions)) resultCMPath resultFastaPath resultClustalFilepath
writeFile (iterationDirectory ++ "done") ""
return modelConstruction
else do
let lastIterationFastaPath = outputDirectory ++ show currentIterationNumber ++ "/model.fa"
let lastIterationAlignmentPath = outputDirectory ++ show currentIterationNumber ++ "/model.stockholm"
let lastIterationCMPath = outputDirectory ++ show currentIterationNumber ++ "/model.cm"
logVerboseMessage (verbositySwitch staticOptions) "Alignment construction with candidates - infernal mode\n" outputDirectory
let nextModelConstructionInput = constructNext currentIterationNumber modelConstruction alignmentResults Nothing Nothing [] [] (alignmentModeInfernal modelConstruction)
constructModel nextModelConstructionInput staticOptions
copyFile lastIterationCMPath resultCMPath
copyFile lastIterationFastaPath resultFastaPath
copyFile lastIterationAlignmentPath resultAlignmentPath
logMessage (iterationSummaryLog nextModelConstructionInput) outputDirectory
logVerboseMessage (verbositySwitch staticOptions) (show nextModelConstructionInput) outputDirectory
_ <- systemCMcalibrate "standard" (cpuThreads staticOptions) resultCMPath resultCMLogPath
systemCMalign ("--outformat=Clustal --cpu " ++ show (cpuThreads staticOptions)) resultCMPath resultFastaPath resultClustalFilepath
writeFile (iterationDirectory ++ "done") ""
return nextModelConstructionInput
alignmentConstructionWithCandidates :: Maybe Taxon -> Maybe Int -> SearchResult -> StaticOptions -> ModelConstruction -> IO ModelConstruction
alignmentConstructionWithCandidates currentTaxonomicContext currentUpperTaxonomyLimit searchResults staticOptions modelConstruction = do
let currentIterationNumber = iterationNumber modelConstruction
let iterationDirectory = tempDirPath staticOptions ++ show currentIterationNumber ++ "/"
(alignmentResults,potentialMemberEntries) <- catchAll (alignCandidates staticOptions modelConstruction "" searchResults)
(\e -> do logWarning ("Warning: Alignment results iteration" ++ show (iterationNumber modelConstruction) ++ " - exception: " ++ show e) (tempDirPath staticOptions)
return ([],[]))
let currentPotentialMembers = [SearchResult potentialMemberEntries (blastDatabaseSize searchResults)]
if (null alignmentResults) && not (alignmentModeInfernal modelConstruction)
then do
logVerboseMessage (verbositySwitch staticOptions) ("Alignment construction with candidates - length 1 - inital mode" ++ "\n") (tempDirPath staticOptions)
let newTaxEntries = taxRecords modelConstruction ++ buildTaxRecords alignmentResults currentIterationNumber
let nextModelConstructionInputWithThreshold = modelConstruction {iterationNumber = currentIterationNumber + 1,upperTaxonomyLimit = currentUpperTaxonomyLimit, taxRecords = newTaxEntries,taxonomicContext = currentTaxonomicContext}
logMessage (iterationSummaryLog nextModelConstructionInputWithThreshold) (tempDirPath staticOptions)
logVerboseMessage (verbositySwitch staticOptions) (show nextModelConstructionInputWithThreshold) (tempDirPath staticOptions)
writeFile (iterationDirectory ++ "done") ""
modelConstructer staticOptions nextModelConstructionInputWithThreshold
else
if (alignmentModeInfernal modelConstruction)
then do
logVerboseMessage (verbositySwitch staticOptions) ("Alignment construction with candidates - infernal mode\n") (tempDirPath staticOptions)
let nextModelConstructionInput = constructNext currentIterationNumber modelConstruction alignmentResults currentUpperTaxonomyLimit currentTaxonomicContext [] currentPotentialMembers True
constructModel nextModelConstructionInput staticOptions
writeFile (iterationDirectory ++ "done") ""
logMessage (iterationSummaryLog nextModelConstructionInput) (tempDirPath staticOptions)
logVerboseMessage (verbositySwitch staticOptions) (show nextModelConstructionInput) (tempDirPath staticOptions)
currentSelectedQueries <- selectQueries staticOptions modelConstruction alignmentResults
let nextModelConstructionInputWithQueries = nextModelConstructionInput {selectedQueries = currentSelectedQueries}
nextModelConstruction <- modelConstructer staticOptions nextModelConstructionInputWithQueries
return nextModelConstruction
else do
logVerboseMessage (verbositySwitch staticOptions) ("Alignment construction with candidates - initial mode\n") (tempDirPath staticOptions)
let nextModelConstructionInput = constructNext currentIterationNumber modelConstruction alignmentResults currentUpperTaxonomyLimit currentTaxonomicContext [] currentPotentialMembers False
constructModel nextModelConstructionInput staticOptions
currentSelectedQueries <- selectQueries staticOptions modelConstruction alignmentResults
let nextModelConstructionInputWithInfernalMode = nextModelConstructionInput {alignmentModeInfernal = True, selectedQueries = currentSelectedQueries}
logMessage (iterationSummaryLog nextModelConstructionInputWithInfernalMode) (tempDirPath staticOptions)
logVerboseMessage (verbositySwitch staticOptions) (show nextModelConstructionInputWithInfernalMode) (tempDirPath staticOptions)
writeFile (iterationDirectory ++ "done") ""
nextModelConstruction <- modelConstructer staticOptions nextModelConstructionInputWithInfernalMode
return nextModelConstruction
alignmentConstructionWithoutCandidates :: Maybe Taxon -> Maybe Int -> StaticOptions -> ModelConstruction -> IO ModelConstruction
alignmentConstructionWithoutCandidates currentTaxonomicContext upperTaxLimit staticOptions modelConstruction = do
let currentIterationNumber = iterationNumber modelConstruction
let iterationDirectory = tempDirPath staticOptions ++ show currentIterationNumber ++ "/"
let nextModelConstructionInputWithThreshold = modelConstruction {iterationNumber = currentIterationNumber + 1,upperTaxonomyLimit = upperTaxLimit,taxonomicContext = currentTaxonomicContext}
let previousIterationCMPath = tempDirPath staticOptions ++ show (currentIterationNumber - 1) ++ "/model.cm"
previousIterationCMexists <- doesFileExist previousIterationCMPath
if previousIterationCMexists
then do
logVerboseMessage (verbositySwitch staticOptions) "Alignment construction no candidates - previous cm\n" (tempDirPath staticOptions)
let previousIterationFastaPath = tempDirPath staticOptions ++ show (currentIterationNumber - 1) ++ "/model.fa"
let previousIterationAlignmentPath = tempDirPath staticOptions ++ show (currentIterationNumber - 1) ++ "/model.stockholm"
let thisIterationFastaPath = tempDirPath staticOptions ++ show (currentIterationNumber) ++ "/model.fa"
let thisIterationAlignmentPath = tempDirPath staticOptions ++ show (currentIterationNumber) ++ "/model.stockholm"
let thisIterationCMPath = tempDirPath staticOptions ++ show (currentIterationNumber) ++ "/model.cm"
copyFile previousIterationFastaPath thisIterationFastaPath
copyFile previousIterationAlignmentPath thisIterationAlignmentPath
copyFile previousIterationCMPath thisIterationCMPath
logMessage (iterationSummaryLog nextModelConstructionInputWithThreshold) (tempDirPath staticOptions)
logVerboseMessage (verbositySwitch staticOptions) (show nextModelConstructionInputWithThreshold) (tempDirPath staticOptions)
writeFile (iterationDirectory ++ "done") ""
modelConstructer staticOptions nextModelConstructionInputWithThreshold
else do
logVerboseMessage (verbositySwitch staticOptions) "Alignment construction no candidates - no previous iteration cm\n" (tempDirPath staticOptions)
logMessage (iterationSummaryLog nextModelConstructionInputWithThreshold) (tempDirPath staticOptions)
logVerboseMessage (verbositySwitch staticOptions) (show nextModelConstructionInputWithThreshold) (tempDirPath staticOptions)
writeFile (iterationDirectory ++ "done") ""
modelConstructer staticOptions nextModelConstructionInputWithThreshold
findTaxonomyStart :: Bool -> Int -> Maybe String -> String -> Fasta () () -> IO Int
findTaxonomyStart offlineMode threads inputBlastDatabase temporaryDirectory querySequence = do
let queryIndexString = "1"
let hitNumberQuery = buildHitNumberQuery "&HITLIST_SIZE=10"
let registrationInfo = buildRegistration "RNAlien" "florian.eggenhofer@univie.ac.at"
let blastQuery = BlastHTTPQuery (Just "ncbi") (Just "blastn") inputBlastDatabase [querySequence] (Just (hitNumberQuery ++ registrationInfo)) (Just (5400000000 :: Int))
logMessage "No tax id provided - Sending find taxonomy start blast query \n" temporaryDirectory
let logFileDirectoryPath = temporaryDirectory ++ "taxonomystart" ++ "/"
createDirectory logFileDirectoryPath
blastOutput <-if offlineMode
then CE.catch (blast logFileDirectoryPath threads Nothing Nothing (Just (10 :: Double)) False blastQuery)
(\e -> do let err = show (e :: CE.IOException)
logWarning ("Warning: Blast attempt failed:" ++ " " ++ err) logFileDirectoryPath
return (Left ""))
else CE.catch (blastHTTP blastQuery)
(\e -> do let err = show (e :: CE.IOException)
logWarning ("Warning: Blast attempt failed:" ++ " " ++ err) logFileDirectoryPath
return (Left ""))
writeFile (logFileDirectoryPath ++ "/" ++ queryIndexString ++ "_1blastOutput") (show blastOutput)
logEither blastOutput temporaryDirectory
let blastHitsArePresent = either (const False) blastMatchesPresent blastOutput
if blastHitsArePresent
then do
let rightBlast = fromRight blastOutput
let bestHit = getBestHit rightBlast
bestBlastHitTaxIdOutput <- retrieveBlastHitTaxIdEntrez [bestHit]
let taxIdFromEntrySummaries = extractTaxIdFromEntrySummaries (snd bestBlastHitTaxIdOutput)
Control.Monad.when (null taxIdFromEntrySummaries) (error "findTaxonomyStart: - head: empty list of taxonomy entry summary for best hit")
let rightBestTaxIdResult = head taxIdFromEntrySummaries
logMessage ("Initial TaxId: " ++ show rightBestTaxIdResult ++ "\n") temporaryDirectory
CE.evaluate rightBestTaxIdResult
else error "Find taxonomy start: Could not find blast hits to use as a taxonomic starting point"
searchCandidates :: StaticOptions -> Maybe String -> Int -> Maybe Int -> Maybe Int -> Double -> [Fasta () ()] -> IO SearchResult
searchCandidates staticOptions finaliterationprefix iterationnumber upperTaxLimit lowerTaxLimit expectThreshold inputQuerySequences = do
Control.Monad.when (null inputQuerySequences) $ error "searchCandidates: - head: empty list of query sequences"
let queryLength = fromIntegral (B.length (_bioSequence (_fasta (head inputQuerySequences))))
let queryIndexString = "1"
let entrezTaxFilter = buildTaxFilterQuery upperTaxLimit lowerTaxLimit
logVerboseMessage (verbositySwitch staticOptions) ("entrezTaxFilter" ++ show entrezTaxFilter ++ "\n") (tempDirPath staticOptions)
let hitNumberQuery = buildHitNumberQuery "&HITLIST_SIZE=5000&EXPECT=" ++ show expectThreshold
let registrationInfo = buildRegistration "RNAlien" "florian.eggenhofer@univie.ac.at"
let softmaskFilter = if blastSoftmaskingToggle staticOptions then "&FILTER=True&FILTER=m" else ""
let blastQuery = BlastHTTPQuery (Just "ncbi") (Just "blastn") (blastDatabase staticOptions) inputQuerySequences (Just (hitNumberQuery ++ entrezTaxFilter ++ softmaskFilter ++ registrationInfo)) (Just (5400000000 :: Int))
logVerboseMessage (verbositySwitch staticOptions) ("Sending blast query " ++ show iterationnumber ++ "\n") (tempDirPath staticOptions)
let logFileDirectoryPath = tempDirPath staticOptions ++ show iterationnumber ++ "/" ++ fromMaybe "" finaliterationprefix ++ "log"
logDirectoryPresent <- doesDirectoryExist logFileDirectoryPath
Control.Monad.when (not logDirectoryPresent) $ createDirectory (logFileDirectoryPath)
blastOutput <- if (offline staticOptions)
then CE.catch (blast logFileDirectoryPath (cpuThreads staticOptions) upperTaxLimit lowerTaxLimit (Just expectThreshold) (blastSoftmaskingToggle staticOptions) blastQuery)
(\e -> do let err = show (e :: CE.IOException)
logWarning ("Warning: Blast attempt failed:" ++ " " ++ err) (tempDirPath staticOptions)
return (Left ""))
else CE.catch (blastHTTP blastQuery)
(\e -> do let err = show (e :: CE.IOException)
logWarning ("Warning: Blast attempt failed:" ++ " " ++ err) (tempDirPath staticOptions)
return (Left ""))
writeFile (logFileDirectoryPath ++ "/" ++ queryIndexString ++ "_1blastOutput") (show blastOutput)
logEither blastOutput (tempDirPath staticOptions)
let blastHitsArePresent = either (const False) blastMatchesPresent blastOutput
if blastHitsArePresent
then do
let rightBlast = fromRight blastOutput
let blastHits = J._hits (J._search . J._results . J._report . J._blastoutput2 $ rightBlast)
writeFile (logFileDirectoryPath ++ "/" ++ queryIndexString ++ "_2blastHits") (showlines blastHits)
let blastHitsFilteredByLength = filterByHitLength blastHits queryLength (lengthFilterToggle staticOptions)
writeFile (logFileDirectoryPath ++ "/" ++ queryIndexString ++ "_3blastHitsFilteredByLength") (showlines blastHitsFilteredByLength)
let blastHitsFilteredByCoverage = filterByCoverage blastHitsFilteredByLength queryLength (coverageFilterToggle staticOptions)
writeFile (logFileDirectoryPath ++ "/" ++ queryIndexString ++ "_3ablastHitsFilteredByLength") (showlines blastHitsFilteredByCoverage)
let blastHitsWithTaxId = extractBlastHitsTaxId blastHitsFilteredByCoverage
blastHitsWithParentTaxIdOutput <- retrieveParentTaxIdsEntrez blastHitsWithTaxId
let blastHitsFilteredByParentTaxIdWithParentTaxId = filterByParentTaxId blastHitsWithParentTaxIdOutput True
writeFile (logFileDirectoryPath ++ "/" ++ queryIndexString ++ "_4blastHitsFilteredByParentTaxId") (showlines blastHitsFilteredByParentTaxIdWithParentTaxId)
let nonEmptyfilteredBlastResults = filter (\(blasthit,_) -> not (null (J._hsps blasthit))) blastHitsFilteredByParentTaxIdWithParentTaxId
let requestedSequenceElements = map (getRequestedSequenceElement queryLength) nonEmptyfilteredBlastResults
writeFile (logFileDirectoryPath ++ "/" ++ queryIndexString ++ "_6requestedSequenceElements") (showlines requestedSequenceElements)
fullSequencesWithSimilars <- retrieveFullSequences staticOptions requestedSequenceElements
if null fullSequencesWithSimilars
then do
writeFile (logFileDirectoryPath ++ "/" ++ queryIndexString ++ "_10afullSequencesWithSimilars") "No sequences retrieved"
CE.evaluate (SearchResult [] Nothing)
else do
writeFile (logFileDirectoryPath ++ "/" ++ queryIndexString ++ "_10afullSequencesWithSimilars") (showlines fullSequencesWithSimilars)
let fullSequences = filterIdenticalSequences fullSequencesWithSimilars 100
writeFile (logFileDirectoryPath ++ "/" ++ queryIndexString ++ "_10fullSequences") (showlines fullSequences)
let maybeFractionEvalueMatch = getHitWithFractionEvalue rightBlast
if isNothing maybeFractionEvalueMatch
then CE.evaluate (SearchResult [] Nothing)
else do
let fractionEvalueMatch = fromJust maybeFractionEvalueMatch
let dbSize = computeDataBaseSize (J._evalue fractionEvalueMatch) (J._bit_score fractionEvalueMatch) (fromIntegral queryLength ::Double)
CE.evaluate (SearchResult fullSequences (Just dbSize))
else CE.evaluate (SearchResult [] Nothing)
computeDataBaseSize :: Double -> Double -> Double -> Double
computeDataBaseSize evalue bitscore querylength = ((evalue * 2 ** bitscore) / querylength)/10^(6 :: Integer)
alignCandidates :: StaticOptions -> ModelConstruction -> String -> SearchResult -> IO ([(Fasta () (),Int,B.ByteString)],[(Fasta () (),Int,B.ByteString)])
alignCandidates staticOptions modelConstruction multipleSearchResultPrefix searchResults = do
let iterationDirectory = tempDirPath staticOptions ++ show (iterationNumber modelConstruction) ++ "/" ++ multipleSearchResultPrefix
createDirectoryIfMissing False (iterationDirectory ++ "log")
if null (candidates searchResults)
then do
writeFile (iterationDirectory ++ "log" ++ "/11candidates") "No candidates to align"
return ([],[])
else do
writeFile (iterationDirectory ++ "log" ++ "/11candidates") (showlines (candidates searchResults))
let alignedSequences = map snd (V.toList (extractAlignedSequences (iterationNumber modelConstruction) modelConstruction))
let filteredCandidates = filterWithCollectedSequences (candidates searchResults) alignedSequences 99
writeFile (iterationDirectory ++ "log" ++ "/12candidatesFilteredByCollected") (showlines filteredCandidates)
if alignmentModeInfernal modelConstruction
then alignCandidatesInfernalMode staticOptions modelConstruction multipleSearchResultPrefix (blastDatabaseSize searchResults) filteredCandidates
else alignCandidatesInitialMode staticOptions modelConstruction multipleSearchResultPrefix filteredCandidates
alignCandidatesInfernalMode :: StaticOptions -> ModelConstruction -> String -> Maybe Double -> [(Fasta () (),Int,B.ByteString)] -> IO ([(Fasta () (),Int,B.ByteString)],[(Fasta () (),Int,B.ByteString)])
alignCandidatesInfernalMode staticOptions modelConstruction multipleSearchResultPrefix blastDbSize filteredCandidates = do
let iterationDirectory = tempDirPath staticOptions ++ show (iterationNumber modelConstruction) ++ "/" ++ multipleSearchResultPrefix
let candidateSequences = extractCandidateSequences filteredCandidates
logVerboseMessage (verbositySwitch staticOptions) "Alignment Mode Infernal\n" (tempDirPath staticOptions)
let indexedCandidateSequenceList = V.toList candidateSequences
let cmSearchFastaFilePaths = map (constructFastaFilePaths iterationDirectory) indexedCandidateSequenceList
let cmSearchFilePaths = map (constructCMsearchFilePaths iterationDirectory) indexedCandidateSequenceList
let covarianceModelPath = tempDirPath staticOptions ++ show (iterationNumber modelConstruction - 1) ++ "/" ++ "model.cm"
mapM_ (\(number,_nucleotideSequence) -> writeFastaFile (iterationDirectory ++ show number ++ ".fa") [_nucleotideSequence]) indexedCandidateSequenceList
let zippedFastaCMSearchResultPaths = zip cmSearchFastaFilePaths cmSearchFilePaths
mapM_ (uncurry (systemCMsearch (cpuThreads staticOptions) ("-Z " ++ show (fromJust blastDbSize)) covarianceModelPath)) zippedFastaCMSearchResultPaths
cmSearchResults <- mapM readCMSearch cmSearchFilePaths
writeFile (iterationDirectory ++ "cm_error") (concatMap show (lefts cmSearchResults))
let rightCMSearchResults = rights cmSearchResults
let cmSearchCandidatesWithSequences = zip rightCMSearchResults filteredCandidates
let (trimmedSelectedCandidates,potentialCandidates,rejectedCandidates) = evaluePartitionTrimCMsearchHits (evalueThreshold modelConstruction) cmSearchCandidatesWithSequences
createDirectoryIfMissing False (iterationDirectory ++ "log")
writeFile (iterationDirectory ++ "log" ++ "/13selectedCandidates'") (showlines trimmedSelectedCandidates)
writeFile (iterationDirectory ++ "log" ++ "/14rejectedCandidates'") (showlines rejectedCandidates)
writeFile (iterationDirectory ++ "log" ++ "/15potentialCandidates'") (showlines potentialCandidates)
return (map snd trimmedSelectedCandidates,map snd potentialCandidates)
alignCandidatesInitialMode :: StaticOptions -> ModelConstruction -> String -> [(Fasta () (),Int,B.ByteString)] -> IO ([(Fasta () (),Int,B.ByteString)],[(Fasta () (),Int,B.ByteString)])
alignCandidatesInitialMode staticOptions modelConstruction multipleSearchResultPrefix filteredCandidates = do
let iterationDirectory = tempDirPath staticOptions ++ show (iterationNumber modelConstruction) ++ "/" ++ multipleSearchResultPrefix
createDirectoryIfMissing False (iterationDirectory ++ "log")
let candidateSequences = extractCandidateSequences filteredCandidates
logVerboseMessage (verbositySwitch staticOptions) "Alignment Mode Initial\n" (tempDirPath staticOptions)
let inputFastaFilepath = iterationDirectory ++ "input.fa"
let inputFoldFilepath = iterationDirectory ++ "input.fold"
writeFastaFile (iterationDirectory ++ "input.fa") [(head (inputFasta modelConstruction))]
logMessage (showlines (V.toList candidateSequences)) (tempDirPath staticOptions)
V.mapM_ (\(number,nucleotideSequence') -> writeFastaFile (iterationDirectory ++ show number ++ ".fa") [nucleotideSequence']) candidateSequences
let candidateFastaFilepath = V.toList (V.map (\(number,_) -> iterationDirectory ++ show number ++ ".fa") candidateSequences)
let candidateFoldFilepath = V.toList (V.map (\(number,_) -> iterationDirectory ++ show number ++ ".fold") candidateSequences)
let locarnainClustalw2FormatFilepath = V.toList (V.map (\(number,_) -> iterationDirectory ++ show number ++ "." ++ "clustalmlocarna") candidateSequences)
let candidateAliFoldFilepath = V.toList (V.map (\(number,_) -> iterationDirectory ++ show number ++ ".alifold") candidateSequences)
let locarnaFilepath = V.toList (V.map (\(number,_) -> iterationDirectory ++ show number ++ "." ++ "mlocarna") candidateSequences)
alignSequences "locarna" " --write-structure --free-endgaps=++-- " (replicate (V.length candidateSequences) inputFastaFilepath) candidateFastaFilepath locarnainClustalw2FormatFilepath locarnaFilepath
let sequenceIdentities = V.map (\(_,s) -> sequenceIdentity (head (inputFasta modelConstruction)) s/(100 :: Double)) candidateSequences
systemRNAfold inputFastaFilepath inputFoldFilepath
inputfoldResult <- readRNAfold inputFoldFilepath
let inputFoldMFE = foldingEnergy (fromRight inputfoldResult)
mapM_ (uncurry systemRNAfold) (zip candidateFastaFilepath candidateFoldFilepath)
foldResults <- mapM readRNAfold candidateFoldFilepath
let candidateMFEs = map (foldingEnergy . fromRight) foldResults
let averageMFEs = map (\candidateMFE -> (candidateMFE + inputFoldMFE)/2) candidateMFEs
mapM_ (uncurry (systemRNAalifold "")) (zip locarnainClustalw2FormatFilepath candidateAliFoldFilepath)
alifoldResults <- mapM readRNAalifold candidateAliFoldFilepath
let consensusMFE = map (alignmentConsensusMinimumFreeEnergy . fromRight) alifoldResults
let sciidfraction = map (\(consMFE,averMFE,seqId) -> (consMFE/averMFE)/seqId) (zip3 consensusMFE averageMFEs (V.toList sequenceIdentities))
let idlog = concatMap (\(sciidfraction',consMFE,averMFE,seqId) -> show sciidfraction' ++ "," ++ show consMFE ++ "," ++ show averMFE ++ "," ++ show seqId ++ "\n")(zip4 sciidfraction consensusMFE averageMFEs (V.toList sequenceIdentities))
writeFile (iterationDirectory ++ "log" ++ "/idlog") idlog
let alignedCandidates = zip sciidfraction filteredCandidates
writeFile (iterationDirectory ++ "log" ++ "/zscores") (showlines alignedCandidates)
let (selectedCandidates,rejectedCandidates) = partition (\(sciidfraction',_) -> sciidfraction' > nSCICutoff staticOptions) alignedCandidates
mapM_ print (zip3 consensusMFE averageMFEs (V.toList sequenceIdentities))
writeFile (iterationDirectory ++ "log" ++ "/13selectedCandidates") (showlines selectedCandidates)
writeFile (iterationDirectory ++ "log" ++ "/14rejectedCandidates") (showlines rejectedCandidates)
return (map snd selectedCandidates,[])
setClusterNumber :: Int -> Int
setClusterNumber x
| x <= 5 = x
| otherwise = 5
findCutoffforClusterNumber :: Dendrogram a -> Int -> Distance -> Distance
findCutoffforClusterNumber clustaloDendrogram numberOfClusters currentCutoff
| currentClusterNumber >= numberOfClusters = currentCutoff
| otherwise = findCutoffforClusterNumber clustaloDendrogram numberOfClusters (currentCutoff-0.01)
where currentClusterNumber = length (cutAt clustaloDendrogram currentCutoff)
selectQueries :: StaticOptions -> ModelConstruction -> [(Fasta () (),Int,B.ByteString)] -> IO [Fasta () ()]
selectQueries staticOptions modelConstruction selectedCandidates = do
logVerboseMessage (verbositySwitch staticOptions) "SelectQueries\n" (tempDirPath staticOptions)
let alignedSequences = extractAlignedSequences (iterationNumber modelConstruction) modelConstruction
let candidateSequences = extractQueryCandidates selectedCandidates
let iterationDirectory = tempDirPath staticOptions ++ show (iterationNumber modelConstruction) ++ "/"
let stockholmFilepath = iterationDirectory ++ "model" ++ ".stockholm"
let alignmentSequences = map snd (V.toList (V.concat [candidateSequences,alignedSequences]))
if length alignmentSequences > 3
then
if (querySelectionMethod staticOptions) == "clustering"
then do
writeFastaFile (iterationDirectory ++ "query" ++ ".fa") alignmentSequences
let fastaFilepath = iterationDirectory ++ "query" ++ ".fa"
let clustaloFilepath = iterationDirectory ++ "query" ++ ".clustalo"
let clustaloDistMatrixPath = iterationDirectory ++ "query" ++ ".matrix"
alignSequences "clustalo" ("--full --distmat-out=" ++ clustaloDistMatrixPath ++ " ") [fastaFilepath] [] [clustaloFilepath] []
idsDistancematrix <- readClustaloDistMatrix clustaloDistMatrixPath
logEither idsDistancematrix (tempDirPath staticOptions)
let (clustaloIds,clustaloDistMatrix) = fromRight idsDistancematrix
logVerboseMessage (verbositySwitch staticOptions) ("Clustalid: " ++ intercalate "," clustaloIds ++ "\n") (tempDirPath staticOptions)
logVerboseMessage (verbositySwitch staticOptions) ("Distmatrix: " ++ show clustaloDistMatrix ++ "\n") (tempDirPath staticOptions)
let clustaloDendrogram = dendrogram UPGMA clustaloIds (getDistanceMatrixElements clustaloIds clustaloDistMatrix)
logVerboseMessage (verbositySwitch staticOptions) ("ClustaloDendrogram: " ++ show clustaloDendrogram ++ "\n") (tempDirPath staticOptions)
logVerboseMessage (verbositySwitch staticOptions) ("ClustaloDendrogram: " ++ show clustaloDistMatrix ++ "\n") (tempDirPath staticOptions)
let numberOfClusters = setClusterNumber (length alignmentSequences)
logVerboseMessage (verbositySwitch staticOptions) ("numberOfClusters: " ++ show numberOfClusters ++ "\n") (tempDirPath staticOptions)
let dendrogramStartCutDistance = 1 :: Double
let dendrogramCutDistance' = findCutoffforClusterNumber clustaloDendrogram numberOfClusters dendrogramStartCutDistance
logVerboseMessage (verbositySwitch staticOptions) ("dendrogramCutDistance': " ++ show dendrogramCutDistance' ++ "\n") (tempDirPath staticOptions)
let cutDendrogram = cutAt clustaloDendrogram dendrogramCutDistance'
let currentSelectedSequenceIds = map B.pack (take (queryNumber staticOptions) (concatMap (take 1 . elements) cutDendrogram))
let fastaSelectedSequences = concatMap (filterSequenceById alignmentSequences) currentSelectedSequenceIds
stockholmSelectedSequences <- extractAlignmentSequencesByIds stockholmFilepath currentSelectedSequenceIds
let currentSelectedSequences = if (blastSoftmaskingToggle staticOptions) then stockholmSelectedSequences else fastaSelectedSequences
logVerboseMessage (verbositySwitch staticOptions) ("SelectedQueries: " ++ show currentSelectedSequences ++ "\n") (tempDirPath staticOptions)
writeFile (tempDirPath staticOptions ++ show (iterationNumber modelConstruction) ++ "/log" ++ "/13selectedQueries") (showlines currentSelectedSequences)
CE.evaluate currentSelectedSequences
else do
let fastaSelectedSequences = filterIdenticalSequences' alignmentSequences (95 :: Double)
let currentSelectedSequenceIds = map fastaHeader (take (queryNumber staticOptions) fastaSelectedSequences)
stockholmSelectedSequences <- extractAlignmentSequencesByIds stockholmFilepath currentSelectedSequenceIds
let currentSelectedSequences = if (blastSoftmaskingToggle staticOptions) then stockholmSelectedSequences else fastaSelectedSequences
writeFile (tempDirPath staticOptions ++ show (iterationNumber modelConstruction) ++ "/log" ++ "/13selectedQueries") (showlines currentSelectedSequences)
CE.evaluate currentSelectedSequences
else return []
fastaHeader :: Fasta () () -> B.ByteString
fastaHeader currentFasta = _sequenceIdentifier . _header $ currentFasta
filterSequenceById :: [Fasta () ()] -> B.ByteString-> [Fasta () ()]
filterSequenceById alignmentSequences querySequenceId = filter (seqenceHasId querySequenceId) alignmentSequences
seqenceHasId :: B.ByteString -> Fasta () () -> Bool
seqenceHasId querySequenceId alignmentSequence = fastaHeader alignmentSequence == querySequenceId
constructModel :: ModelConstruction -> StaticOptions -> IO String
constructModel modelConstruction staticOptions = do
let alignedSequences = extractAlignedSequences (iterationNumber modelConstruction) modelConstruction
let outputDirectory = tempDirPath staticOptions ++ show (iterationNumber modelConstruction - 1) ++ "/"
let alignmentSequences = map snd (V.toList (V.concat [alignedSequences]))
writeFastaFile (outputDirectory ++ "model" ++ ".fa") alignmentSequences
let fastaFilepath = outputDirectory ++ "model" ++ ".fa"
let locarnaFilepath = outputDirectory ++ "model" ++ ".mlocarna"
let stockholmFilepath = outputDirectory ++ "model" ++ ".stockholm"
let updatedStructureStockholmFilepath = outputDirectory ++ "newstructuremodel" ++ ".stockholm"
let cmalignCMFilepath = tempDirPath staticOptions ++ show (iterationNumber modelConstruction - 2) ++ "/" ++ "model" ++ ".cm"
let cmFilepath = outputDirectory ++ "model" ++ ".cm"
let cmCalibrateFilepath = outputDirectory ++ "model" ++ ".cmcalibrate"
let cmBuildFilepath = outputDirectory ++ "model" ++ ".cmbuild"
let alifoldFilepath = outputDirectory ++ "model" ++ ".alifold"
let refinedAlignmentFilepath = outputDirectory ++ "modelrefined" ++ ".stockholm"
let cmBuildOptions ="--refine " ++ refinedAlignmentFilepath
if alignmentModeInfernal modelConstruction
then do
logVerboseMessage (verbositySwitch staticOptions) "Construct Model - infernal mode\n" (tempDirPath staticOptions)
systemCMalign ("--cpu " ++ show (cpuThreads staticOptions)) cmalignCMFilepath fastaFilepath stockholmFilepath
systemRNAalifold "-r --cfactor 0.6 --nfactor 0.5" stockholmFilepath alifoldFilepath
replaceStatus <- replaceStockholmStructure stockholmFilepath alifoldFilepath updatedStructureStockholmFilepath
if null replaceStatus
then do
systemCMbuild cmBuildOptions updatedStructureStockholmFilepath cmFilepath cmBuildFilepath
systemCMcalibrate "fast" (cpuThreads staticOptions) cmFilepath cmCalibrateFilepath
return cmFilepath
else do
logWarning ("Warning: A problem occured updating the secondary structure of iteration " ++ show (iterationNumber modelConstruction) ++ " stockholm alignment: " ++ replaceStatus) (tempDirPath staticOptions)
systemCMbuild cmBuildOptions stockholmFilepath cmFilepath cmBuildFilepath
systemCMcalibrate "fast" (cpuThreads staticOptions) cmFilepath cmCalibrateFilepath
return cmFilepath
else do
logVerboseMessage (verbositySwitch staticOptions) "Construct Model - initial mode\n" (tempDirPath staticOptions)
alignSequences "mlocarna" ("--threads=" ++ show (cpuThreads staticOptions) ++ " ") [fastaFilepath] [] [locarnaFilepath] []
mlocarnaAlignment <- readStructuralClustalAlignment locarnaFilepath
logEither mlocarnaAlignment (tempDirPath staticOptions)
let stockholAlignment = convertClustaltoStockholm (fromRight mlocarnaAlignment)
TI.writeFile stockholmFilepath stockholAlignment
_ <- systemCMbuild cmBuildOptions stockholmFilepath cmFilepath cmBuildFilepath
_ <- systemCMcalibrate "fast" (cpuThreads staticOptions) cmFilepath cmCalibrateFilepath
return cmFilepath
replaceStockholmStructure :: String -> String -> String -> IO String
replaceStockholmStructure stockholmFilepath alifoldFilepath updatedStructureStockholmFilepath = do
inputAln <- readFile stockholmFilepath
inputRNAalifold <- readRNAalifold alifoldFilepath
if isLeft inputRNAalifold
then
return (show (fromLeft inputRNAalifold))
else do
let alifoldstructure = alignmentConsensusDotBracket (fromRight inputRNAalifold)
let seedLinesVector = V.fromList (lines inputAln)
let structureIndices = V.toList (V.findIndices isStructureLine seedLinesVector)
let updatedStructureElements = updateStructureElements seedLinesVector alifoldstructure structureIndices
let newVector = seedLinesVector V.// updatedStructureElements
let newVectorString = unlines (V.toList newVector)
writeFile updatedStructureStockholmFilepath newVectorString
return []
updateStructureElements :: V.Vector String -> String -> [Int] -> [(Int,String)]
updateStructureElements inputVector structureString indices
| null indices = []
| otherwise = newElement ++ updateStructureElements inputVector (drop structureLength structureString) (tail indices)
where currentIndex = head indices
currentElement = inputVector V.! currentIndex
elementLength = length currentElement
structureStartIndex = maximum (elemIndices ' ' currentElement) + 1
structureLength = elementLength - structureStartIndex
newElementHeader = take structureStartIndex currentElement
newElementStructure = take structureLength structureString
newElement = [(currentIndex,newElementHeader ++ newElementStructure)]
isStructureLine :: String -> Bool
isStructureLine input = "#=GC SS_cons" `isInfixOf` input
iterationSummaryLog :: ModelConstruction -> String
iterationSummaryLog mC = output
where upperTaxonomyLimitOutput = maybe "not set" show (upperTaxonomyLimit mC)
output = "Upper taxonomy id limit: " ++ upperTaxonomyLimitOutput ++ ", Collected members: " ++ show (length (concatMap sequenceRecords (taxRecords mC))) ++ "\n"
iterationSummary :: ModelConstruction -> StaticOptions -> IO()
iterationSummary mC sO = do
let upperTaxonomyLimitOutput = maybe "not set" show (upperTaxonomyLimit mC)
let output = show (iterationNumber mC) ++ "," ++ upperTaxonomyLimitOutput ++ "," ++ show (length (concatMap sequenceRecords (taxRecords mC)))
writeFile (tempDirPath sO ++ "/log/" ++ show (iterationNumber mC) ++ ".log") output
resultSummary :: ModelConstruction -> StaticOptions -> IO()
resultSummary mC sO = do
let upperTaxonomyLimitOutput = maybe "not set" show (upperTaxonomyLimit mC)
let output = show (iterationNumber mC) ++ "," ++ upperTaxonomyLimitOutput ++ "," ++ show (length (concatMap sequenceRecords (taxRecords mC)))
writeFile (tempDirPath sO ++ "/log/result" ++ ".log") output
readClustaloDistMatrix :: String -> IO (Either ParseError ([String],Matrix Double))
readClustaloDistMatrix = parseFromFile genParserClustaloDistMatrix
genParserClustaloDistMatrix :: GenParser Char st ([String],Matrix Double)
genParserClustaloDistMatrix = do
_ <- many1 digit
newline
clustaloDistRow <- many1 (try genParserClustaloDistRow)
eof
return (map fst clustaloDistRow,fromLists (map snd clustaloDistRow))
genParserClustaloDistRow :: GenParser Char st (String,[Double])
genParserClustaloDistRow = do
entryId <- many1 (noneOf " ")
many1 space
distances <- many1 (try genParserClustaloDistance)
newline
return (entryId,distances)
genParserClustaloDistance :: GenParser Char st Double
genParserClustaloDistance = do
distance <- many1 (oneOf "1234567890.")
optional (try (char ' ' ))
return (readDouble distance)
getDistanceMatrixElements :: [String] -> Matrix Double -> String -> String -> Double
getDistanceMatrixElements ids distMatrix id1 id2 = distance
where indexid1 = fromJust (elemIndex id1 ids) + 1
indexid2 = fromJust (elemIndex id2 ids) + 1
distance = getElem indexid1 indexid2 distMatrix
filterDuplicates :: ModelConstruction -> SearchResult -> SearchResult
filterDuplicates modelConstruction inputSearchResult = uniqueSearchResult
where alignedSequences = map snd (V.toList (extractAlignedSequences (iterationNumber modelConstruction) modelConstruction))
collectedIdentifiers = map fastaHeader alignedSequences
uniques = filter (\(s,_,_) -> notElem (fastaHeader s) collectedIdentifiers) (candidates inputSearchResult)
uniqueSearchResult = SearchResult uniques (blastDatabaseSize inputSearchResult)
filterIdenticalSequences :: [(Fasta () (),Int,B.ByteString)] -> Double -> [(Fasta () (),Int,B.ByteString)]
filterIdenticalSequences (headSequence:rest) identitycutoff = result
where filteredSequences = filter (\x -> sequenceIdentity (firstOfTriple headSequence) (firstOfTriple x) < identitycutoff) rest
result = headSequence:filterIdenticalSequences filteredSequences identitycutoff
filterIdenticalSequences [] _ = []
filterWithCollectedSequences :: [(Fasta () (),Int,B.ByteString)] -> [Fasta () ()] -> Double -> [(Fasta () (),Int,B.ByteString)]
filterWithCollectedSequences inputCandidates collectedSequences identitycutoff = filter (isUnSimilarSequence collectedSequences identitycutoff . firstOfTriple) inputCandidates
filterIdenticalSequences' :: [Fasta () ()] -> Double -> [Fasta () ()]
filterIdenticalSequences' (headEntry:rest) identitycutoff = result
where filteredEntries = filter (\ x -> sequenceIdentity headEntry x < identitycutoff) rest
result = headEntry:filterIdenticalSequences' filteredEntries identitycutoff
filterIdenticalSequences' [] _ = []
isUnSimilarSequence :: [Fasta () ()] -> Double -> Fasta () () -> Bool
isUnSimilarSequence collectedSequences identitycutoff checkSequence = any (\ x -> sequenceIdentity checkSequence x < identitycutoff) collectedSequences
firstOfTriple :: (t, t1, t2) -> t
firstOfTriple (a,_,_) = a
blastMatchesPresent :: J.BlastJSON2 -> Bool
blastMatchesPresent blastJS2
| null resultList = False
| otherwise = True
where resultList = concatMap J._hsps ((Data.Foldable.toList . J._hits . J._search . J._results . J._report . J._blastoutput2 $ blastJS2))
textIdentity :: T.Text -> T.Text -> Double
textIdentity text1 text2 = identityPercent
where distance = TM.hamming text1 text2
maximumDistance = maximum [T.length text1, T.length text2]
distanceDouble = toInteger ( fromJust distance )
identityPercent = 1 - (fromIntegral distanceDouble/fromIntegral maximumDistance)
sequenceIdentity :: Fasta () () -> Fasta () () -> Double
sequenceIdentity sequence1 sequence2 = identityPercent
where distance = ED.levenshteinDistance ED.defaultEditCosts sequence1string sequence2string
sequence1string = B.unpack . _bioSequence . _fasta $ sequence1
sequence2string = B.unpack . _bioSequence . _fasta $ sequence2
maximumDistance = maximum [length sequence1string,length sequence2string]
identityPercent = 100 - ((fromIntegral distance/fromIntegral maximumDistance) * (read "100" ::Double))
getTaxonomicContextEntrez :: Maybe Int -> Maybe Taxon -> IO (Maybe Taxon)
getTaxonomicContextEntrez upperTaxLimit currentTaxonomicContext =
if isJust upperTaxLimit
then if isJust currentTaxonomicContext
then return currentTaxonomicContext
else retrieveTaxonomicContextEntrez (fromJust upperTaxLimit)
else return Nothing
setTaxonomicContextEntrez :: Int -> Maybe Taxon -> Maybe Int -> (Maybe Int, Maybe Int)
setTaxonomicContextEntrez currentIterationNumber currentTaxonomicContext subTreeTaxId
| currentIterationNumber == 0 = (subTreeTaxId, Nothing)
| otherwise = setUpperLowerTaxLimitEntrez (fromJust subTreeTaxId) (fromJust currentTaxonomicContext)
setUpperLowerTaxLimitEntrez :: Int -> Taxon -> (Maybe Int, Maybe Int)
setUpperLowerTaxLimitEntrez subTreeTaxId currentTaxonomicContext = (upperLimit,lowerLimit)
where upperLimit = raiseTaxIdLimitEntrez subTreeTaxId currentTaxonomicContext
lowerLimit = Just subTreeTaxId
raiseTaxIdLimitEntrez :: Int -> Taxon -> Maybe Int
raiseTaxIdLimitEntrez subTreeTaxId taxon = parentNodeTaxId
where lastUpperBoundNodeIndex = fromJust (V.findIndex (\node -> lineageTaxId node == subTreeTaxId) lineageExVector)
linageNodeTaxId = Just (lineageTaxId (lineageExVector V.! (lastUpperBoundNodeIndex -1)))
lineageExVector = V.fromList (lineageEx taxon)
parentNodeTaxId = if subTreeTaxId == taxonTaxId taxon then Just (taxonParentTaxId taxon) else linageNodeTaxId
constructNext :: Int -> ModelConstruction -> [(Fasta () (),Int,B.ByteString)] -> Maybe Int -> Maybe Taxon -> [Fasta () ()] -> [SearchResult] -> Bool -> ModelConstruction
constructNext currentIterationNumber modelconstruction alignmentResults upperTaxLimit inputTaxonomicContext inputSelectedQueries inputPotentialMembers toggleInfernalAlignmentModeTrue = nextModelConstruction
where newIterationNumber = currentIterationNumber + 1
taxEntries = taxRecords modelconstruction ++ buildTaxRecords alignmentResults currentIterationNumber
potMembers = potentialMembers modelconstruction ++ inputPotentialMembers
currentAlignmentMode = toggleInfernalAlignmentModeTrue || alignmentModeInfernal modelconstruction
nextModelConstruction = ModelConstruction newIterationNumber (inputFasta modelconstruction) taxEntries upperTaxLimit inputTaxonomicContext (evalueThreshold modelconstruction) currentAlignmentMode inputSelectedQueries potMembers
buildTaxRecords :: [(Fasta () (),Int,B.ByteString)] -> Int -> [TaxonomyRecord]
buildTaxRecords alignmentResults currentIterationNumber = taxonomyRecords
where taxIdGroups = groupBy sameTaxIdAlignmentResult alignmentResults
taxonomyRecords = map (buildTaxRecord currentIterationNumber) taxIdGroups
sameTaxIdAlignmentResult :: (Fasta () (),Int,B.ByteString) -> (Fasta () (),Int,B.ByteString) -> Bool
sameTaxIdAlignmentResult (_,taxId1,_) (_,taxId2,_) = taxId1 == taxId2
buildTaxRecord :: Int -> [(Fasta () (),Int,B.ByteString)] -> TaxonomyRecord
buildTaxRecord currentIterationNumber entries = taxRecord
where recordTaxId = (\(_,currentTaxonomyId,_) -> currentTaxonomyId) (head entries)
seqRecords = map (buildSeqRecord currentIterationNumber) entries
taxRecord = TaxonomyRecord recordTaxId seqRecords
buildSeqRecord :: Int -> (Fasta () (),Int,B.ByteString) -> SequenceRecord
buildSeqRecord currentIterationNumber (parsedFasta,_,seqSubject) = SequenceRecord parsedFasta currentIterationNumber seqSubject
evaluePartitionTrimCMsearchHits :: Double -> [(CMsearch,(Fasta () (), Int, B.ByteString))] -> ([(CMsearch,(Fasta () (), Int, B.ByteString))],[(CMsearch,(Fasta () (), Int, B.ByteString))],[(CMsearch,(Fasta () (), Int, B.ByteString))])
evaluePartitionTrimCMsearchHits eValueThreshold cmSearchCandidatesWithSequences = (trimmedSelectedCandidates,potentialCandidates,rejectedCandidates)
where potentialMemberseValueThreshold = eValueThreshold * 1000
(prefilteredCandidates,rejectedCandidates) = partition (\(cmSearchResult,_) -> any (\hitScore' -> potentialMemberseValueThreshold >= hitEvalue hitScore') (cmsearchHits cmSearchResult)) cmSearchCandidatesWithSequences
(selectedCandidates,potentialCandidates) = partition (\(cmSearchResult,_) -> any (\hitScore' -> eValueThreshold >= hitEvalue hitScore') (cmsearchHits cmSearchResult)) prefilteredCandidates
trimmedSelectedCandidates = map (\(cmSearchResult,inputSequence) -> (cmSearchResult,trimCMsearchHit cmSearchResult inputSequence)) selectedCandidates
trimCMsearchHit :: CMsearch -> (Fasta () (), Int, B.ByteString) -> (Fasta () (), Int, B.ByteString)
trimCMsearchHit cmSearchResult (inputSequence,b,c) = (subSequence,b,c)
where hitScoreEntry = head (cmsearchHits cmSearchResult)
sequenceString = B.unpack . _bioSequence . _fasta $ inputSequence
sequenceSubstring = cmSearchsubString (hitStart hitScoreEntry) (hitEnd hitScoreEntry) sequenceString
newSequenceHeader = SequenceIdentifier (B.pack (B.unpack (fastaHeader inputSequence) ++ "cmS_" ++ show (hitStart hitScoreEntry) ++ "_" ++ show (hitEnd hitScoreEntry) ++ "_" ++ show (hitStrand hitScoreEntry)))
subSequence = Fasta newSequenceHeader (BioSequence (B.pack sequenceSubstring))
cmSearchsubString :: Int -> Int -> String -> String
cmSearchsubString startSubString endSubString inputString
| startSubString < endSubString = take (endSubString - (startSubString -1))(drop (startSubString - 1) inputString)
| startSubString > endSubString = take (reverseEnd - (reverseStart - 1))(drop (reverseStart - 1 ) (reverse inputString))
| otherwise = take (endSubString - (startSubString -1))(drop (startSubString - 1) inputString)
where stringLength = length inputString
reverseStart = stringLength - (startSubString + 1)
reverseEnd = stringLength - (endSubString - 1)
extractQueries :: Int -> ModelConstruction -> [Fasta () ()]
extractQueries foundSequenceNumber modelconstruction
| foundSequenceNumber < 3 = fastaSeqData
| otherwise = querySequences'
where fastaSeqData = inputFasta modelconstruction
querySequences' = selectedQueries modelconstruction
extractQueryCandidates :: [(Fasta () (),Int,B.ByteString)] -> V.Vector (Int,Fasta () ())
extractQueryCandidates querycandidates = indexedSeqences
where sequences = map (\(candidateSequence,_,_) -> candidateSequence) querycandidates
indexedSeqences = V.map (\(number,candidateSequence) -> (number + 1,candidateSequence))(V.indexed (V.fromList sequences))
buildTaxFilterQuery :: Maybe Int -> Maybe Int -> String
buildTaxFilterQuery upperTaxLimit lowerTaxLimit
| isNothing upperTaxLimit = ""
| isNothing lowerTaxLimit = "&ENTREZ_QUERY=" ++ encodedTaxIDQuery (fromJust upperTaxLimit)
| otherwise = "&ENTREZ_QUERY=" ++ "%28txid" ++ show (fromJust upperTaxLimit) ++ "%5BORGN%5D%29" ++ "NOT" ++ "%28txid" ++ show (fromJust lowerTaxLimit) ++ "%5BORGN%5D&EQ_OP%29"
buildHitNumberQuery :: String -> String
buildHitNumberQuery hitNumber
| hitNumber == "" = ""
| otherwise = "&ALIGNMENTS=" ++ hitNumber
encodedTaxIDQuery :: Int -> String
encodedTaxIDQuery taxID = "txid" ++ show taxID ++ "%20%5BORGN%5D&EQ_OP"
randomid :: Int16 -> String
randomid number = "cm" ++ show number
createSessionID :: Maybe String -> IO String
createSessionID sessionIdentificator =
if isJust sessionIdentificator
then return (fromJust sessionIdentificator)
else do
randomNumber <- randomIO :: IO Int16
let sessionId = randomid (abs (randomNumber))
return sessionId
systemlocarna :: String -> (String,String,String,String) -> IO ExitCode
systemlocarna options (inputFilePath1, inputFilePath2, clustalformatoutputFilePath, outputFilePath) = system ("locarna " ++ options ++ " --clustal=" ++ clustalformatoutputFilePath ++ " " ++ inputFilePath1 ++ " " ++ inputFilePath2 ++ " > " ++ outputFilePath)
systemMlocarna :: String -> (String,String) -> IO ExitCode
systemMlocarna options (inputFilePath, outputFilePath) = system ("mlocarna " ++ options ++ " " ++ inputFilePath ++ " > " ++ outputFilePath)
systemMlocarnaWithTimeout :: String -> String -> (String,String) -> IO ExitCode
systemMlocarnaWithTimeout timeout options (inputFilePath, outputFilePath) = system ("timeout " ++ timeout ++"s "++ "mlocarna " ++ options ++ " " ++ inputFilePath ++ " > " ++ outputFilePath)
systemClustalw2 :: String -> (String,String,String) -> IO ExitCode
systemClustalw2 options (inputFilePath, outputFilePath, summaryFilePath) = system ("clustalw2 " ++ options ++ "-INFILE=" ++ inputFilePath ++ " -OUTFILE=" ++ outputFilePath ++ ">" ++ summaryFilePath)
systemClustalo :: String -> (String,String) -> IO ExitCode
systemClustalo options (inputFilePath, outputFilePath) = system ("clustalo " ++ options ++ "--infile=" ++ inputFilePath ++ " >" ++ outputFilePath)
systemCMbuild :: String -> String -> String -> String -> IO ExitCode
systemCMbuild options alignmentFilepath modelFilepath outputFilePath = system ("cmbuild " ++ options ++ " " ++ modelFilepath ++ " " ++ alignmentFilepath ++ " > " ++ outputFilePath)
systemCMcompare :: String -> String -> String -> IO ExitCode
systemCMcompare model1path model2path outputFilePath = system ("CMCompare -q " ++ model1path ++ " " ++ model2path ++ " >" ++ outputFilePath)
systemCMsearch :: Int -> String -> String -> String -> String -> IO ExitCode
systemCMsearch cpus options covarianceModelPath sequenceFilePath outputPath = system ("cmsearch --notrunc --cpu " ++ show cpus ++ " " ++ options ++ " -g " ++ covarianceModelPath ++ " " ++ sequenceFilePath ++ "> " ++ outputPath)
systemCMstat :: String -> String -> IO ExitCode
systemCMstat covarianceModelPath outputPath = system ("cmstat " ++ covarianceModelPath ++ " > " ++ outputPath)
systemCMcalibrate :: String -> Int -> String -> String -> IO ExitCode
systemCMcalibrate mode cpus covarianceModelPath outputPath
| mode == "fast" = system ("cmcalibrate --beta 1E-4 --cpu " ++ show cpus ++ " " ++ covarianceModelPath ++ "> " ++ outputPath)
| otherwise = system ("cmcalibrate --cpu " ++ show cpus ++ " " ++ covarianceModelPath ++ "> " ++ outputPath)
systemCMalign :: String -> String -> String -> String -> IO ExitCode
systemCMalign options filePathCovarianceModel filePathSequence filePathAlignment = system ("cmalign " ++ options ++ " " ++ filePathCovarianceModel ++ " " ++ filePathSequence ++ "> " ++ filePathAlignment)
compareCM :: String -> String -> String -> IO (Either String Double)
compareCM rfamCMPath resultCMpath outputDirectory = do
let myOptions = defaultDecodeOptions {
decDelimiter = fromIntegral (ord ' ')
}
let rfamCMFileName = FP.takeBaseName rfamCMPath
let resultCMFileName = FP.takeBaseName resultCMpath
let cmcompareResultPath = outputDirectory ++ rfamCMFileName ++ resultCMFileName ++ ".cmcompare"
_ <- systemCMcompare rfamCMPath resultCMpath cmcompareResultPath
inputCMcompare <- readFile cmcompareResultPath
let singlespaceCMcompare = unwords(words inputCMcompare)
let decodedCmCompareOutput = head (V.toList (fromRight (decodeWith myOptions NoHeader (L.pack singlespaceCMcompare) :: Either String (V.Vector [String]))))
let bitscore1 = read (decodedCmCompareOutput !! 2) :: Double
let bitscore2 = read (decodedCmCompareOutput !! 3) :: Double
let minmax = minimum [bitscore1,bitscore2]
return (Right minmax)
readInt :: String -> Int
readInt = read
readDouble :: String -> Double
readDouble = read
extractCandidateSequences :: [(Fasta () (),Int,B.ByteString)] -> V.Vector (Int,Fasta () ())
extractCandidateSequences candidates' = indexedSeqences
where sequences = map (\(inputSequence,_,_) -> inputSequence) candidates'
indexedSeqences = V.map (\(number,inputSequence) -> (number + 1,inputSequence))(V.indexed (V.fromList sequences))
extractAlignedSequences :: Int -> ModelConstruction -> V.Vector (Int,Fasta () ())
extractAlignedSequences iterationnumber modelconstruction
| iterationnumber == 0 = V.map (\(number,seq') -> (number + 1,seq')) (V.indexed (V.fromList inputSequence))
| otherwise = indexedSeqRecords
where inputSequence = inputFasta modelconstruction
seqRecordsperTaxrecord = map sequenceRecords (taxRecords modelconstruction)
seqRecords = concat seqRecordsperTaxrecord
indexedSeqRecords = V.map (\(number,seq') -> (number + 1,seq')) (V.indexed (V.fromList (inputSequence ++ map nucleotideSequence seqRecords)))
filterByParentTaxId :: [(J.Hit,Int)] -> Bool -> [(J.Hit,Int)]
filterByParentTaxId blastHitsWithParentTaxId singleHitPerParentTaxId
| singleHitPerParentTaxId = singleBlastHitperParentTaxId
| otherwise = blastHitsWithParentTaxId
where blastHitsWithParentTaxIdSortedByParentTaxId = sortBy compareTaxId blastHitsWithParentTaxId
blastHitsWithParentTaxIdGroupedByParentTaxId = groupBy sameTaxId blastHitsWithParentTaxIdSortedByParentTaxId
singleBlastHitperParentTaxId = map (maximumBy compareHitEValue) blastHitsWithParentTaxIdGroupedByParentTaxId
filterByHitLength :: DS.Seq J.Hit -> Int -> Bool -> DS.Seq J.Hit
filterByHitLength blastHits queryLength filterOn
| filterOn = filteredBlastHits
| otherwise = blastHits
where filteredBlastHits = DS.filter (hitLengthCheck queryLength) blastHits
hitLengthCheck :: Int -> J.Hit -> Bool
hitLengthCheck queryLength blastHit = lengthStatus
where hsps = J._hsps blastHit
minHfrom = minimum (map J._hit_from hsps)
minHfromHSP = fromJust (find (\hsp -> minHfrom == J._hit_from hsp) hsps)
maxHto = maximum (map J._hit_to hsps)
maxHtoHSP = fromJust (find (\hsp -> maxHto == J._hit_to hsp) hsps)
minHonQuery = J._query_from minHfromHSP
maxHonQuery = J._query_to maxHtoHSP
startCoordinate = minHfrom - minHonQuery
endCoordinate = maxHto + (queryLength - maxHonQuery)
fullSeqLength = endCoordinate - startCoordinate
lengthStatus = fullSeqLength < (queryLength * 3)
filterByCoverage :: DS.Seq J.Hit -> Int -> Bool -> DS.Seq J.Hit
filterByCoverage blastHits queryLength filterOn
| filterOn = filteredBlastHits
| otherwise = blastHits
where filteredBlastHits = DS.filter (coverageCheck queryLength) blastHits
coverageCheck :: Int -> J.Hit -> Bool
coverageCheck queryLength hit = coverageStatus
where hsps = J._hsps hit
maxIdentity = fromIntegral (maximum (map J._identity hsps))
coverageStatus = (maxIdentity/fromIntegral queryLength)* (100 :: Double) >= (80 :: Double)
retrieveFullSequences :: StaticOptions -> [(String,Int,Int,String,T.Text,Int,B.ByteString)] -> IO [(Fasta () (),Int,B.ByteString)]
retrieveFullSequences staticOptions requestedSequences = do
if offline staticOptions
then do
fullSequences <- mapM (retrieveFullSequenceBlastDb (fromJust (blastDatabase staticOptions)) (tempDirPath staticOptions)) requestedSequences
if any (isNothing . firstOfTriple) fullSequences
then do
let fullSequencesWithRequestedSequences = zip fullSequences requestedSequences
let (failedRetrievals, successfulRetrievals) = partition (isNothing . firstOfTriple . fst) fullSequencesWithRequestedSequences
missingSequences <- mapM (retrieveFullSequence (tempDirPath staticOptions) .snd) failedRetrievals
let (stillMissingSequences,reRetrievedSequences) = partition (isNothing . firstOfTriple) missingSequences
logWarning ("Sequence retrieval failed: \n" ++ concatMap show stillMissingSequences ++ "\n") (tempDirPath staticOptions)
let unwrappedRetrievals = map (\(x,y,z) -> (fromJust x,y,z)) (map fst successfulRetrievals ++ reRetrievedSequences)
CE.evaluate unwrappedRetrievals
else CE.evaluate (map (\(x,y,z) -> (fromJust x,y,z)) fullSequences)
else do
fullSequences <- mapM (retrieveFullSequence (tempDirPath staticOptions)) requestedSequences
if any (isNothing . firstOfTriple) fullSequences
then do
let fullSequencesWithRequestedSequences = zip fullSequences requestedSequences
let (failedRetrievals, successfulRetrievals) = partition (isNothing . firstOfTriple . fst) fullSequencesWithRequestedSequences
missingSequences <- mapM (retrieveFullSequence (tempDirPath staticOptions) .snd) failedRetrievals
let (stillMissingSequences,reRetrievedSequences) = partition (isNothing . firstOfTriple) missingSequences
logWarning ("Sequence retrieval failed: \n" ++ concatMap show stillMissingSequences ++ "\n") (tempDirPath staticOptions)
let unwrappedRetrievals = map (\(x,y,z) -> (fromJust x,y,z)) (map fst successfulRetrievals ++ reRetrievedSequences)
CE.evaluate unwrappedRetrievals
else CE.evaluate (map (\(x,y,z) -> (fromJust x,y,z)) fullSequences)
retrieveFullSequenceBlastDb :: String -> String -> (String,Int,Int,String,T.Text,Int,B.ByteString) -> IO (Maybe (Fasta () ()),Int,B.ByteString)
retrieveFullSequenceBlastDb blastDb temporaryDirectoryPath (nucleotideId,seqStart,seqStop,strand,_,taxid,subject') = do
let sequencePath = temporaryDirectoryPath ++ "/" ++ nucleotideId ++ ".fa"
let cmd = "blastdbcmd -db " ++ blastDb ++ " -range " ++ (show seqStart) ++ "-" ++ (show seqStop) ++ " -strand " ++ (setBlastDbStrand strand) ++ " -entry " ++ nucleotideId ++ " -outfmt %f -target_only -out " ++ sequencePath
print cmd
system(cmd)
retrievedSequence <- readFastaFile sequencePath
if null retrievedSequence
then return(Nothing,taxid,subject')
else do
let justSequence = Just . head $ retrievedSequence
return(justSequence,taxid,subject')
setBlastDbStrand :: String -> String
setBlastDbStrand strand
| strand == "2" = "minus"
| strand == "1" = "plus"
| otherwise = "plus"
retrieveFullSequence :: String -> (String,Int,Int,String,T.Text,Int,B.ByteString) -> IO (Maybe (Fasta () ()),Int,B.ByteString)
retrieveFullSequence temporaryDirectoryPath (nucleotideId,seqStart,seqStop,strand,_,taxid,subject') = do
let program' = Just "efetch"
let database' = Just "nucleotide"
let registrationInfo = buildRegistration "RNAlien" "florian.eggenhofer@univie.ac.at"
let queryString = "id=" ++ nucleotideId ++ "&seq_start=" ++ show seqStart ++ "&seq_stop=" ++ show seqStop ++ "&rettype=fasta" ++ "&strand=" ++ strand ++ registrationInfo
let entrezQuery = EntrezHTTPQuery program' database' queryString
result <- CE.catch (entrezHTTP entrezQuery)
(\e -> do let err = show (e :: CE.IOException)
logWarning ("Warning: Full sequence retrieval failed:" ++ " " ++ err) temporaryDirectoryPath
return [])
if null result
then return (Nothing,taxid,subject')
else do
let parsedFastas = byteStringToMultiFasta (L.pack result)
if (null parsedFastas)
then return (Nothing,taxid,subject')
else do
let parsedFasta = head parsedFastas
if null (B.unpack . _bioSequence . _fasta $ parsedFasta)
then return (Nothing,taxid,subject')
else CE.evaluate (Just parsedFasta,taxid,subject')
getRequestedSequenceElement :: Int -> (J.Hit,Int) -> (String,Int,Int,String,T.Text,Int,B.ByteString)
getRequestedSequenceElement queryLength (blastHit,taxid)
| blastHitIsReverseComplement (blastHit,taxid) = getReverseRequestedSequenceElement queryLength (blastHit,taxid)
| otherwise = getForwardRequestedSequenceElement queryLength (blastHit,taxid)
blastHitIsReverseComplement :: (J.Hit,Int) -> Bool
blastHitIsReverseComplement (blastHit,_) = isReverse
where blastMatch = head (J._hsps blastHit)
firstHSPfrom = J._hit_from blastMatch
firstHSPto = J._hit_to blastMatch
isReverse = firstHSPfrom > firstHSPto
getForwardRequestedSequenceElement :: Int -> (J.Hit,Int) -> (String,Int,Int,String,T.Text,Int,B.ByteString)
getForwardRequestedSequenceElement queryLength (blastHit,taxid) = (geneIdentifier',startcoordinate,endcoordinate,strand,accession',taxid,subjectBlast)
where accession' = J._accession . head . J._description $ blastHit
subjectBlast = E.encodeUtf8 . J._title . head . J._description $ blastHit
geneIdentifier' = extractGeneId blastHit
blastMatch = head (J._hsps blastHit)
blastHitOriginSequenceLength = J._len blastHit
minHfrom = J._hit_from blastMatch
maxHto = J._hit_to blastMatch
minHonQuery = J._query_from blastMatch
maxHonQuery = J._query_to blastMatch
unsafestartcoordinate = minHfrom - minHonQuery
unsafeendcoordinate = maxHto + (queryLength - maxHonQuery)
startcoordinate = lowerBoundryCoordinateSetter 0 unsafestartcoordinate
endcoordinate = upperBoundryCoordinateSetter blastHitOriginSequenceLength unsafeendcoordinate
strand = "1"
lowerBoundryCoordinateSetter :: Int -> Int -> Int
lowerBoundryCoordinateSetter lowerBoundry currentValue
| currentValue < lowerBoundry = lowerBoundry
| otherwise = currentValue
upperBoundryCoordinateSetter :: Int -> Int -> Int
upperBoundryCoordinateSetter upperBoundry currentValue
| currentValue > upperBoundry = upperBoundry
| otherwise = currentValue
getReverseRequestedSequenceElement :: Int -> (J.Hit,Int) -> (String,Int,Int,String,T.Text,Int,B.ByteString)
getReverseRequestedSequenceElement queryLength (blastHit,taxid) = (geneIdentifier',startcoordinate,endcoordinate,strand,accession',taxid,subjectBlast)
where accession' = J._accession . head . J._description $ blastHit
subjectBlast = E.encodeUtf8 . J._title . head . J._description $ blastHit
geneIdentifier' = extractGeneId blastHit
blastMatch = head (J._hsps blastHit)
blastHitOriginSequenceLength = J._len blastHit
maxHfrom = J._hit_from blastMatch
minHto = J._hit_to blastMatch
minHonQuery = J._query_from blastMatch
maxHonQuery = J._query_to blastMatch
unsafestartcoordinate = maxHfrom + minHonQuery
unsafeendcoordinate = minHto - (queryLength - maxHonQuery)
startcoordinate = lowerBoundryCoordinateSetter 0 unsafeendcoordinate
endcoordinate = upperBoundryCoordinateSetter blastHitOriginSequenceLength unsafestartcoordinate
strand = "2"
alignSequences :: String -> String -> [String] -> [String] -> [String] -> [String] -> IO ()
alignSequences program' options fastaFilepaths fastaFilepaths2 alignmentFilepaths summaryFilepaths = do
let zipped4Filepaths = zip4 fastaFilepaths fastaFilepaths2 alignmentFilepaths summaryFilepaths
let zipped3Filepaths = zip3 fastaFilepaths alignmentFilepaths summaryFilepaths
let zippedFilepaths = zip fastaFilepaths alignmentFilepaths
let timeout = "3600"
case program' of
"locarna" -> mapM_ (systemlocarna options) zipped4Filepaths
"mlocarna" -> mapM_ (systemMlocarna options) zippedFilepaths
"mlocarnatimeout" -> mapM_ (systemMlocarnaWithTimeout timeout options) zippedFilepaths
"clustalo" -> mapM_ (systemClustalo options) zippedFilepaths
_ -> mapM_ (systemClustalw2 options ) zipped3Filepaths
constructFastaFilePaths :: String -> (Int, Fasta () ()) -> String
constructFastaFilePaths currentDirectory (fastaIdentifier, _) = currentDirectory ++ show fastaIdentifier ++".fa"
constructCMsearchFilePaths :: String -> (Int, Fasta () ()) -> String
constructCMsearchFilePaths currentDirectory (fastaIdentifier, _) = currentDirectory ++ show fastaIdentifier ++".cmsearch"
compareHitEValue :: (J.Hit,Int) -> (J.Hit,Int) -> Ordering
compareHitEValue (hit1,_) (hit2,_)
| hitEValue hit1 > hitEValue hit2 = LT
| hitEValue hit1 < hitEValue hit2 = GT
| hitEValue hit1 == hitEValue hit2 = GT
compareHitEValue (_,_) (_,_) = EQ
compareTaxId :: (J.Hit,Int) -> (J.Hit,Int) -> Ordering
compareTaxId (_,taxId1) (_,taxId2)
| taxId1 > taxId2 = LT
| taxId1 < taxId2 = GT
| taxId1 == taxId2 = EQ
compareTaxId (_,_) (_,_) = EQ
sameTaxId :: (J.Hit,Int) -> (J.Hit,Int) -> Bool
sameTaxId (_,taxId1) (_,taxId2) = taxId1 == taxId2
hitEValue :: J.Hit -> Double
hitEValue currentHit = minimum (map J._evalue (J._hsps currentHit))
convertFastaFoldStockholm :: Fasta () () -> String -> String
convertFastaFoldStockholm fastasequence foldedStructure = stockholmOutput
where alnHeader = "# STOCKHOLM 1.0\n\n"
seqIdentifier = B.unpack . _sequenceIdentifier . _header $fastasequence
seqSequence = B.unpack . _bioSequence . _fasta $ fastasequence
identifierLength = length seqIdentifier
spacerLength' = maximum [14,identifierLength + 2]
spacer = replicate (spacerLength' - identifierLength) ' '
entrystring = seqIdentifier ++ spacer ++ seqSequence ++ "\n"
structureString = "#=GC SS_cons" ++ replicate (spacerLength' - 12) ' ' ++ foldedStructure ++ "\n"
bottom = "//"
stockholmOutput = alnHeader ++ entrystring ++ structureString ++ bottom
convertClustaltoStockholm :: StructuralClustalAlignment -> T.Text
convertClustaltoStockholm parsedMlocarnaAlignment = stockholmOutput
where alnHeader = T.pack "# STOCKHOLM 1.0\n\n"
clustalAlignment = structuralAlignmentEntries parsedMlocarnaAlignment
uniqueIds = nub (map entrySequenceIdentifier clustalAlignment)
mergedEntries = map (mergeEntry clustalAlignment) uniqueIds
maxIdentifierLenght = maximum (map (T.length . entrySequenceIdentifier) clustalAlignment)
spacerLength' = maxIdentifierLenght + 2
stockholmEntries = T.concat (map (buildStockholmAlignmentEntries spacerLength') mergedEntries)
structureString = T.pack "#=GC SS_cons" `T.append` T.replicate (spacerLength' - 12) (T.pack " ") `T.append` secondaryStructureTrack parsedMlocarnaAlignment `T.append` T.pack "\n"
bottom = T.pack "//"
stockholmOutput = alnHeader `T.append` stockholmEntries `T.append` structureString `T.append` bottom
mergeEntry :: [ClustalAlignmentEntry] -> T.Text -> ClustalAlignmentEntry
mergeEntry clustalAlignment uniqueId = mergedEntry
where idEntries = filter (\entry -> entrySequenceIdentifier entry==uniqueId) clustalAlignment
mergedSeq = foldr (T.append . entryAlignedSequence) (T.pack "") idEntries
mergedEntry = ClustalAlignmentEntry uniqueId mergedSeq
buildStockholmAlignmentEntries :: Int -> ClustalAlignmentEntry -> T.Text
buildStockholmAlignmentEntries inputSpacerLength entry = entrystring
where idLength = T.length (T.filter (/= '\n') (entrySequenceIdentifier entry))
spacer = T.replicate (inputSpacerLength - idLength) (T.pack " ")
entrystring = entrySequenceIdentifier entry `T.append` spacer `T.append` entryAlignedSequence entry `T.append` T.pack "\n"
retrieveTaxonomicContextEntrez :: Int -> IO (Maybe Taxon)
retrieveTaxonomicContextEntrez inputTaxId = do
let program' = Just "efetch"
let database' = Just "taxonomy"
let taxIdString = show inputTaxId
let registrationInfo = buildRegistration "RNAlien" "florian.eggenhofer@univie.ac.at"
let queryString = "id=" ++ taxIdString ++ registrationInfo
let entrezQuery = EntrezHTTPQuery program' database' queryString
result <- entrezHTTP entrezQuery
if null result
then do
error "Could not retrieve taxonomic context from NCBI Entrez, cannot proceed."
return Nothing
else do
let taxon = head (readEntrezTaxonSet result)
if null (lineageEx taxon)
then error "Retrieved taxonomic context taxon from NCBI Entrez with empty lineage, cannot proceed."
else return (Just taxon)
retrieveParentTaxIdEntrez :: [(J.Hit,Int)] -> IO [(J.Hit,Int)]
retrieveParentTaxIdEntrez blastHitsWithHitTaxids =
if not (null blastHitsWithHitTaxids)
then do
let program' = Just "efetch"
let database' = Just "taxonomy"
let extractedBlastHits = map fst blastHitsWithHitTaxids
let taxIds = map snd blastHitsWithHitTaxids
let taxIdStrings = map show taxIds
let taxIdQuery = intercalate "," taxIdStrings
let registrationInfo = buildRegistration "RNAlien" "florian.eggenhofer@univie.ac.at"
let queryString = "id=" ++ taxIdQuery ++ registrationInfo
let entrezQuery = EntrezHTTPQuery program' database' queryString
result <- entrezHTTP entrezQuery
let parentTaxIds = readEntrezParentIds result
if null parentTaxIds
then return []
else CE.evaluate (zip extractedBlastHits parentTaxIds)
else return []
retrieveParentTaxIdsEntrez :: [(J.Hit,Int)] -> IO [(J.Hit,Int)]
retrieveParentTaxIdsEntrez taxIdwithBlastHits = do
let splits = portionListElements taxIdwithBlastHits 20
taxIdsOutput <- mapM retrieveParentTaxIdEntrez splits
return (concat taxIdsOutput)
extractBlastHitsTaxId :: DS.Seq J.Hit -> [(J.Hit,Int)]
extractBlastHitsTaxId blastHits = do
map (\a -> (a,J._taxid . head . J._description $ a)) (Data.Foldable.toList blastHits)
retrieveBlastHitTaxIdEntrez :: [J.Hit] -> IO ([J.Hit],String)
retrieveBlastHitTaxIdEntrez blastHits =
if not (null blastHits)
then do
let geneIds = map extractGeneId blastHits
let idList = intercalate "," geneIds
let registrationInfo = buildRegistration "RNAlien" "florian.eggenhofer@univie.ac.at"
let query' = "id=" ++ idList ++ registrationInfo
let entrezQuery = EntrezHTTPQuery (Just "esummary") (Just "nucleotide") query'
threadDelay 10000000
result <- entrezHTTP entrezQuery
CE.evaluate (blastHits,result)
else return (blastHits,"")
extractTaxIdFromEntrySummaries :: String -> [Int]
extractTaxIdFromEntrySummaries input
| null input = []
| null parsedResultList = []
| otherwise = hitTaxIds
where parsedResultList = readEntrezSummaries input
parsedResult = head parsedResultList
blastHitSummaries = documentSummaries parsedResult
hitTaxIdStrings = map extractTaxIdfromDocumentSummary blastHitSummaries
hitTaxIds = map readInt hitTaxIdStrings
extractGeneId :: J.Hit -> String
extractGeneId currentBlastHit = nucleotideId
where truncatedId = drop 3 (T.unpack (J._id (head (J._description currentBlastHit))))
pipeSymbolIndex = fromJust (elemIndex '|' truncatedId)
nucleotideId = take pipeSymbolIndex truncatedId
extractTaxIdfromDocumentSummary :: EntrezDocSum -> String
extractTaxIdfromDocumentSummary documentSummary = itemContent (fromJust (find (\item -> "TaxId" == itemName item) (summaryItems documentSummary)))
getBestHit :: J.BlastJSON2 -> J.Hit
getBestHit blastJS2
| null (J._hits (J._search . J._results . J._report . J._blastoutput2 $ blastJS2)) = error "getBestHit - head: empty list"
| otherwise = DS.index (J._hits (J._search . J._results . J._report . J._blastoutput2 $ blastJS2)) 1
getHitWithFractionEvalue :: J.BlastJSON2 -> Maybe J.Hsp
getHitWithFractionEvalue blastJS2
| null currentHits = Nothing
| otherwise = find (\hsp -> J._evalue hsp /= (0 ::Double)) (concatMap J._hsps currentHits)
where currentHits = J._hits . J._search . J._results . J._report . J._blastoutput2 $ blastJS2
showlines :: (Show a, Foldable t) => t a -> String
showlines = concatMap (\x -> show x ++ "\n")
logMessage :: String -> String -> IO ()
logMessage logoutput temporaryDirectoryPath = appendFile (temporaryDirectoryPath ++ "Log") logoutput
logWarning :: String -> String -> IO ()
logWarning logoutput temporaryDirectoryPath = appendFile (temporaryDirectoryPath ++ "log/warnings") logoutput
logVerboseMessage :: Bool -> String -> String -> IO ()
logVerboseMessage verboseTrue logoutput temporaryDirectoryPath
| verboseTrue = appendFile (temporaryDirectoryPath ++ "Log") (show logoutput)
| otherwise = return ()
logEither :: (Show a) => Either a b -> String -> IO ()
logEither (Left logoutput) temporaryDirectoryPath = appendFile (temporaryDirectoryPath ++ "Log") (show logoutput)
logEither _ _ = return ()
checkTools :: [String] -> String -> String -> IO (Either String String)
checkTools tools temporaryDirectoryPath selectedQuerySelectionMethod = do
let additionaltools = if selectedQuerySelectionMethod == "clustering" then tools ++ ["clustalo"] else tools
checks <- mapM checkTool additionaltools
if not (null (lefts checks))
then return (Left (concat (lefts checks)))
else do
logMessage ("Tools : " ++ intercalate "," tools ++ "\n") temporaryDirectoryPath
return (Right "Tools ok")
logToolVersions :: String -> String -> IO ()
logToolVersions inputQuerySelectionMethod temporaryDirectoryPath = do
let clustaloversionpath = temporaryDirectoryPath ++ "log/clustalo.version"
let mlocarnaversionpath = temporaryDirectoryPath ++ "log/mlocarna.version"
let rnafoldversionpath = temporaryDirectoryPath ++ "log/RNAfold.version"
let infernalversionpath = temporaryDirectoryPath ++ "log/Infernal.version"
_ <- system ("mlocarna --version >" ++ mlocarnaversionpath)
_ <- system ("RNAfold --version >" ++ rnafoldversionpath)
_ <- system ("cmcalibrate -h >" ++ infernalversionpath)
mlocarnaversion <- readFile mlocarnaversionpath
rnafoldversion <- readFile rnafoldversionpath
infernalversionOutput <- readFile infernalversionpath
let infernalversion = lines infernalversionOutput !! 1
if inputQuerySelectionMethod == "clustering"
then do
_ <- system ("clustalo --version >" ++ clustaloversionpath)
clustaloversion <- readFile clustaloversionpath
let messageString = "Clustalo version: " ++ clustaloversion ++ "mlocarna version: " ++ mlocarnaversion ++ "RNAfold version: " ++ rnafoldversion ++ "infernalversion: " ++ infernalversion ++ "\n"
logMessage messageString temporaryDirectoryPath
else do
let messageString = "mlocarna version: " ++ mlocarnaversion ++ "RNAfold version: " ++ rnafoldversion ++ "infernalversion: " ++ infernalversion ++ "\n"
logMessage messageString temporaryDirectoryPath
checkTool :: String -> IO (Either String String)
checkTool tool = do
toolcheck <- findExecutable tool
if isJust toolcheck
then return (Right (fromJust toolcheck))
else return (Left ("RNAlien could not find "++ tool ++ " in your $PATH and has to abort.\n"))
constructTaxonomyRecordsCSVTable :: ModelConstruction -> String
constructTaxonomyRecordsCSVTable modelconstruction = csvtable
where tableheader = "Taxonomy Id;Added in Iteration Step;Entry Header\n"
tablebody = concatMap constructTaxonomyRecordCSVEntries (taxRecords modelconstruction)
csvtable = tableheader ++ tablebody
constructTaxonomyRecordCSVEntries :: TaxonomyRecord -> String
constructTaxonomyRecordCSVEntries taxRecord = concatMap (constructTaxonomyRecordCSVEntry taxIdString) (sequenceRecords taxRecord)
where taxIdString = show (recordTaxonomyId taxRecord)
constructTaxonomyRecordCSVEntry :: String -> SequenceRecord -> String
constructTaxonomyRecordCSVEntry taxIdString seqrec = taxIdString ++ ";" ++ show (aligned seqrec) ++ ";" ++ filter checkTaxonomyRecordCSVChar (B.unpack (fastaHeader (nucleotideSequence seqrec))) ++ "\n"
checkTaxonomyRecordCSVChar :: Char -> Bool
checkTaxonomyRecordCSVChar c
| c == '"' = False
| c == ';' = False
| otherwise = True
setVerbose :: Verbosity -> Bool
setVerbose verbosityLevel
| verbosityLevel == Loud = True
| otherwise = False
evaluateConstructionResult :: StaticOptions -> ModelConstruction -> IO String
evaluateConstructionResult staticOptions mCResult = do
let evaluationDirectoryFilepath = tempDirPath staticOptions ++ "evaluation/"
createDirectoryIfMissing False evaluationDirectoryFilepath
let reformatedClustalPath = evaluationDirectoryFilepath ++ "result.clustal.reformated"
let cmFilepath = tempDirPath staticOptions ++ "result.cm"
let resultSequences = inputFasta mCResult ++ map nucleotideSequence (concatMap sequenceRecords (taxRecords mCResult))
let resultNumber = length resultSequences
let rnaCentralQueries = map RCH.buildSequenceViaMD5Query resultSequences
rnaCentralEntries <- RCH.getRNACentralEntries rnaCentralQueries
let rnaCentralEvaluationResult = RCH.showRNAcentralAlienEvaluation rnaCentralEntries
writeFile (tempDirPath staticOptions ++ "result.rnacentral") rnaCentralEvaluationResult
let resultModelStatistics = tempDirPath staticOptions ++ "result.cmstat"
systemCMstat cmFilepath resultModelStatistics
inputcmStat <- readCMstat resultModelStatistics
let cmstatString = cmstatEvalOutput inputcmStat
if resultNumber > 1
then do
let resultRNAz = tempDirPath staticOptions ++ "result.rnaz"
let resultRNAcode = tempDirPath staticOptions ++ "result.rnacode"
let resultClustalFilepath = tempDirPath staticOptions ++ "result.clustal"
let seqNumber = 6 :: Int
let optimalIdentity = 80 :: Double
let maximalIdentity = 99 :: Double
let referenceSequence = True
preprocessingOutput <- preprocessClustalForRNAz resultClustalFilepath reformatedClustalPath seqNumber optimalIdentity maximalIdentity referenceSequence
if isRight preprocessingOutput
then do
let rightPreprocessingOutput = fromRight preprocessingOutput
let rnazClustalpath = snd rightPreprocessingOutput
systemRNAz "-l" rnazClustalpath resultRNAz
inputRNAz <- readRNAz resultRNAz
let rnaZString = rnaZEvalOutput inputRNAz
RC.systemRNAcode " -t " rnazClustalpath resultRNAcode
inputRNAcode <- RC.readRNAcodeTabular resultRNAcode
let rnaCodeString = rnaCodeEvalOutput inputRNAcode
return ("\nEvaluation of RNAlien result :\nCMstat statistics for result.cm\n" ++ cmstatString ++ "\nRNAz statistics for result alignment: " ++ rnaZString ++ "\nRNAcode output for result alignment:\n" ++ rnaCodeString ++ "\nSequences found by RNAlien with RNAcentral entry:\n" ++ rnaCentralEvaluationResult)
else do
logWarning ("Running RNAz for result evalution encountered a problem:" ++ fromLeft preprocessingOutput) (tempDirPath staticOptions)
return ("\nEvaluation of RNAlien result :\nCMstat statistics for result.cm\n" ++ cmstatString ++ "\nRNAz statistics for result alignment: Running RNAz for result evalution encountered a problem\n" ++ fromLeft preprocessingOutput ++ "\n" ++ "Sequences found by RNAlien with RNAcentral entry:\n" ++ rnaCentralEvaluationResult)
else do
logWarning "Message: RNAlien could not find additional covariant sequences\n Could not run RNAz statistics. Could not run RNAz statistics with a single sequence.\n" (tempDirPath staticOptions)
return ("\nEvaluation of RNAlien result :\nCMstat statistics for result.cm\n" ++ cmstatString ++ "\nRNAlien could not find additional covariant sequences. Could not run RNAz statistics with a single sequence.\n\nSequences found by RNAlien with RNAcentral entry:\n" ++ rnaCentralEvaluationResult)
cmstatEvalOutput :: Either ParseError CMstat -> String
cmstatEvalOutput inputcmstat
| isRight inputcmstat = cmstatString
| otherwise = show (fromLeft inputcmstat)
where cmStat = fromRight inputcmstat
cmstatString = " Sequence Number: " ++ show (statSequenceNumber cmStat)++ "\n" ++ " Effective Sequences: " ++ show (statEffectiveSequences cmStat)++ "\n" ++ " Consensus length: " ++ show (statConsensusLength cmStat) ++ "\n" ++ " Expected maximum hit-length: " ++ show (statW cmStat) ++ "\n" ++ " Basepairs: " ++ show (statBasepairs cmStat)++ "\n" ++ " Bifurcations: " ++ show (statBifurcations cmStat) ++ "\n" ++ " Modeltype: " ++ show (statModel cmStat) ++ "\n" ++ " Relative Entropy CM: " ++ show (relativeEntropyCM cmStat) ++ "\n" ++ " Relative Entropy HMM: " ++ show (relativeEntropyHMM cmStat) ++ "\n"
rnaZEvalOutput :: Either ParseError RNAz -> String
rnaZEvalOutput inputRNAz
| isRight inputRNAz = rnazString
| otherwise = show (fromLeft inputRNAz)
where rnaZ = fromRight inputRNAz
rnazString = " Mean pairwise identity: " ++ show (meanPairwiseIdentity rnaZ) ++ "\n Shannon entropy: " ++ show (shannonEntropy rnaZ) ++ "\n GC content: " ++ show (gcContent rnaZ) ++ "\n Mean single sequence minimum free energy: " ++ show (meanSingleSequenceMinimumFreeEnergy rnaZ) ++ "\n Consensus minimum free energy: " ++ show (consensusMinimumFreeEnergy rnaZ) ++ "\n Energy contribution: " ++ show (energyContribution rnaZ) ++ "\n Covariance contribution: " ++ show (covarianceContribution rnaZ) ++ "\n Combinations pair: " ++ show (combinationsPair rnaZ) ++ "\n Mean z-score: " ++ show (meanZScore rnaZ) ++ "\n Structure conservation index: " ++ show (structureConservationIndex rnaZ) ++ "\n Background model: " ++ backgroundModel rnaZ ++ "\n Decision model: " ++ decisionModel rnaZ ++ "\n SVM decision value: " ++ show (svmDecisionValue rnaZ) ++ "\n SVM class propability: " ++ show (svmRNAClassProbability rnaZ) ++ "\n Prediction: " ++ prediction rnaZ
rnaCodeEvalOutput :: Either ParseError RC.RNAcode -> String
rnaCodeEvalOutput inputRNAcode
| isRight inputRNAcode = rnaCodeString
| otherwise = show (fromLeft inputRNAcode)
where rnaCode = fromRight inputRNAcode
rnaCodeString = "HSS\tFrame\tLength\tFrom\tTo\tName\tStart\tEnd\tScore\tP\n" ++ rnaCodeEntries
rnaCodeEntries = concatMap showRNACodeHits (RC.rnacodeHits rnaCode)
showRNACodeHits :: RC.RNAcodeHit -> String
showRNACodeHits rnacodeHit = show (RC.hss rnacodeHit) ++ "\t" ++ show (RC.frame rnacodeHit) ++ "\t" ++ show (RC.hitLength rnacodeHit) ++ "\t"++ show (RC.from rnacodeHit) ++ "\t" ++ show (RC.to rnacodeHit) ++ "\t" ++ RC.name rnacodeHit ++ "\t" ++ show (RC.start rnacodeHit) ++ "\t" ++ show (RC.end rnacodeHit) ++ "\t" ++ show (RC.score rnacodeHit) ++ show (RC.pvalue rnacodeHit) ++ "\n"
preprocessClustalForRNAzExternal :: String -> String -> Int -> Int -> Int -> Bool -> IO (Either String (String,String))
preprocessClustalForRNAzExternal clustalFilepath reformatedClustalPath seqenceNumber optimalIdentity maximalIdenity referenceSequence = do
clustalText <- TI.readFile clustalFilepath
let reformatedClustalText = T.map reformatAln clustalText
TI.writeFile reformatedClustalPath reformatedClustalText
let selectedClustalpath = clustalFilepath ++ ".selected"
let sequenceNumberOption = " -n " ++ show seqenceNumber ++ " "
let optimalIdentityOption = " -i " ++ show optimalIdentity ++ " "
let maximalIdentityOption = " --max-id=" ++ show maximalIdenity ++ " "
let referenceSequenceOption = if referenceSequence then " " else " -x "
let syscall = "rnazSelectSeqs.pl " ++ reformatedClustalPath ++ " " ++ sequenceNumberOption ++ optimalIdentityOption ++ maximalIdentityOption ++ referenceSequenceOption ++ " >" ++ selectedClustalpath
system syscall
selectedClustalText <- readFile selectedClustalpath
return (Right ([],selectedClustalText))
preprocessClustalForRNAcodeExternal :: String -> String -> Int -> Int -> Int -> Bool -> IO (Either String (String,String))
preprocessClustalForRNAcodeExternal clustalFilepath reformatedClustalPath seqenceNumber optimalIdentity maximalIdenity referenceSequence = do
clustalText <- TI.readFile clustalFilepath
let clustalTextLines = T.lines clustalText
let headerClustalTextLines = T.unlines (take 2 clustalTextLines)
let headerlessClustalTextLines = T.unlines (drop 2 clustalTextLines)
let reformatedClustalText = T.map reformatRNACodeAln headerlessClustalTextLines
TI.writeFile reformatedClustalPath (headerClustalTextLines `T.append` T.singleton '\n' `T.append` reformatedClustalText)
let selectedClustalpath = clustalFilepath ++ ".selected"
let sequenceNumberOption = " -n " ++ show seqenceNumber ++ " "
let optimalIdentityOption = " -i " ++ show optimalIdentity ++ " "
let maximalIdentityOption = " --max-id=" ++ show maximalIdenity ++ " "
let referenceSequenceOption = if referenceSequence then " " else " -x "
let syscall = "rnazSelectSeqs.pl " ++ reformatedClustalPath ++ " " ++ sequenceNumberOption ++ optimalIdentityOption ++ maximalIdentityOption ++ referenceSequenceOption ++ " >" ++ selectedClustalpath
system syscall
selectedClustalText <- readFile selectedClustalpath
return (Right ([],selectedClustalText))
preprocessClustalForRNAz :: String -> String -> Int -> Double -> Double -> Bool -> IO (Either String (String,String))
preprocessClustalForRNAz clustalFilepath _ seqenceNumber optimalIdentity maximalIdenity referenceSequence = do
clustalText <- TI.readFile clustalFilepath
let clustalTextLines = T.lines clustalText
parsedClustalInput <- readClustalAlignment clustalFilepath
let selectedClustalpath = clustalFilepath ++ ".selected"
if length clustalTextLines > 5
then
if isRight parsedClustalInput
then do
let (idMatrix,filteredClustalInput) = rnaCodeSelectSeqs2 (fromRight parsedClustalInput) seqenceNumber optimalIdentity maximalIdenity referenceSequence
writeFile selectedClustalpath (show filteredClustalInput)
let formatedIdMatrix = show (fmap formatIdMatrix idMatrix)
return (Right (formatedIdMatrix,selectedClustalpath))
else return (Left (show (fromLeft parsedClustalInput)))
else do
let clustalLines = T.lines clustalText
let headerClustalTextLines = T.unlines (take 2 clustalLines)
let headerlessClustalTextLines = T.unlines (drop 2 clustalLines)
let reformatedClustalText = T.map reformatRNACodeAln headerlessClustalTextLines
TI.writeFile selectedClustalpath (headerClustalTextLines `T.append` T.singleton '\n' `T.append` reformatedClustalText)
return (Right ([],clustalFilepath))
formatIdMatrix :: Maybe (Int,Int,Double) -> String
formatIdMatrix (Just (_,_,c)) = printf "%.2f" c
formatIdMatrix _ = "-"
rnaCodeSelectSeqs2 :: ClustalAlignment -> Int -> Double -> Double -> Bool -> (Matrix (Maybe (Int,Int,Double)),ClustalAlignment)
rnaCodeSelectSeqs2 currentClustalAlignment targetSeqNumber optimalIdentity maximalIdentity referenceSequence = (identityMatrix,newClustalAlignment)
where entryVector = V.fromList (alignmentEntries currentClustalAlignment)
entrySequences = V.map entryAlignedSequence entryVector
entryReformatedSequences = V.map (T.map reformatRNACodeAln) entrySequences
totalSeqNumber = V.length entryVector
identityMatrix = computeSequenceIdentityMatrix entryReformatedSequences
entryIdentityVector = V.map fromJust (V.filter isJust (getMatrixAsVector identityMatrix))
entryIdentities = V.toList entryIdentityVector
entriesToDiscard = preFilterIdentityMatrix maximalIdentity targetSeqNumber totalSeqNumber [] entryIdentities
allEntries = [1..totalSeqNumber]
prefilteredEntries = allEntries \\ entriesToDiscard
costList = map (computeEntryCost optimalIdentity entryIdentityVector) prefilteredEntries
sortedCostList = sortBy compareEntryCost2 costList
sortedIndices = map fst sortedCostList
selectedEntryIndices = selectEntryIndices referenceSequence targetSeqNumber sortedIndices
selectedEntries = map (\ind -> entryVector V.! (ind-1)) selectedEntryIndices
selectedEntryHeader = map entrySequenceIdentifier selectedEntries
reformatedSelectedEntryHeader = map (T.map reformatRNACodeId) selectedEntryHeader
selectedEntrySequences = map (\ind -> entryReformatedSequences V.! (ind-1)) selectedEntryIndices
gapfreeEntrySequences = T.transpose (filter (not . T.all isGap) (T.transpose selectedEntrySequences))
gapfreeEntries = map (uncurry ClustalAlignmentEntry)(zip reformatedSelectedEntryHeader gapfreeEntrySequences)
emptyConservationTrack = setEmptyConservationTrack gapfreeEntries (conservationTrack currentClustalAlignment)
newClustalAlignment = currentClustalAlignment {alignmentEntries = gapfreeEntries, conservationTrack = emptyConservationTrack}
selectEntryIndices :: Bool -> Int -> [Int] -> [Int]
selectEntryIndices referenceSequence targetSeqNumber sortedIndices
| referenceSequence = if (1 :: Int) `elem` firstX then firstX else 1:firstXm1
| otherwise = firstX
where firstXm1 = take (targetSeqNumber - 1) sortedIndices
firstX = take targetSeqNumber sortedIndices
setEmptyConservationTrack :: [ClustalAlignmentEntry] -> T.Text -> T.Text
setEmptyConservationTrack alnentries currentConservationTrack
| null alnentries = currentConservationTrack
| otherwise = newConservationTrack
where trackLength = T.length (entryAlignedSequence (head alnentries))
newConservationTrack = T.replicate (trackLength + 0) (T.pack " ")
isGap :: Char -> Bool
isGap a
| a == '-' = True
| otherwise = False
computeEntryCost :: Double -> V.Vector (Int,Int,Double) -> Int -> (Int,Double)
computeEntryCost optimalIdentity allIdentities currentIndex = (currentIndex,entryCost)
where entryCost = V.sum (V.map (computeCost optimalIdentity) entryIdentities)
entryIdentities = getEntryIdentities currentIndex allIdentities
getEntryIdentities :: Int -> V.Vector (Int,Int,Double) -> V.Vector (Int,Int,Double)
getEntryIdentities currentIndex allIdentities = V.filter (isIIdx currentIndex) allIdentities V.++ V.filter (isJIdx currentIndex) allIdentities
isIIdx :: Int -> (Int,Int,Double) -> Bool
isIIdx currentIdx (i,_,_) = currentIdx == i
isJIdx :: Int -> (Int,Int,Double) -> Bool
isJIdx currentIdx (_,j,_) = currentIdx == j
computeCost :: Double -> (Int,Int,Double) -> Double
computeCost optimalIdentity (_,_,c) = (c - optimalIdentity) * (c - optimalIdentity)
compareEntryCost2 :: (Int, Double) -> (Int, Double) -> Ordering
compareEntryCost2 (_,costA) (_,costB) = compare costA costB
preFilterIdentityMatrix :: Double -> Int -> Int-> [Int] -> [(Int,Int,Double)] -> [Int]
preFilterIdentityMatrix identityCutoff minSeqNumber totalSeqNumber filteredIds entryIdentities
| (totalSeqNumber - length filteredIds) <= minSeqNumber = []
| identityCutoff == (100 :: Double) = []
| Prelude.null entryIdentities = []
| otherwise = entryresult ++ preFilterIdentityMatrix identityCutoff minSeqNumber totalSeqNumber (filteredIds ++ entryresult) (tail entryIdentities)
where currentEntry = head entryIdentities
entryresult = checkIdentityEntry identityCutoff filteredIds currentEntry
checkIdentityEntry :: Double -> [Int] -> (Int,Int,Double) -> [Int]
checkIdentityEntry identityCutoff filteredIds (i,j,ident)
| i `elem` filteredIds = []
| j `elem` filteredIds = []
| ident > identityCutoff = [j]
| otherwise = []
computeSequenceIdentityMatrix :: V.Vector T.Text -> Matrix (Maybe (Int,Int,Double))
computeSequenceIdentityMatrix entryVector = matrix (V.length entryVector) (V.length entryVector) (computeSequenceIdentityEntry entryVector)
computeSequenceIdentityEntry :: V.Vector T.Text -> (Int,Int) -> Maybe (Int,Int,Double)
computeSequenceIdentityEntry entryVector (row,col)
| i < j = Just (row,col,ident)
| otherwise = Nothing
where i=row-1
j=col-1
ientry = entryVector V.! i
jentry = entryVector V.! j
(gfi,gfj) = unzip (filter notDoubleGap (T.zip ientry jentry))
gfitext = T.pack gfi
gfjtext = T.pack gfj
ident=textIdentity gfitext gfjtext
notDoubleGap :: (Char,Char) -> Bool
notDoubleGap (a,b)
| a == '-' && b == '-' = False
| otherwise = True
reformatRNACodeId :: Char -> Char
reformatRNACodeId c
| c == ':' = '-'
| c == '|' = '-'
| c == '.' = '-'
| c == '~' = '-'
| c == '_' = '-'
| c == '/' = '-'
| otherwise = c
reformatRNACodeAln :: Char -> Char
reformatRNACodeAln c
| c == ':' = '-'
| c == '|' = '-'
| c == '.' = '-'
| c == '~' = '-'
| c == '_' = '-'
| c == 'u' = 'U'
| c == 't' = 'T'
| c == 'g' = 'G'
| c == 'c' = 'C'
| c == 'a' = 'A'
| otherwise = c
reformatAln :: Char -> Char
reformatAln c
| c == '.' = '-'
| c == '~' = '-'
| c == '_' = '-'
| c == 'u' = 'U'
| c == 't' = 'T'
| c == 'g' = 'G'
| c == 'c' = 'C'
| c == 'a' = 'A'
| otherwise = c
checkNCBIConnection :: IO (Either String String)
checkNCBIConnection = do
req <- N.parseRequest "https://www.ncbi.nlm.nih.gov"
manager <- N.newManager N.tlsManagerSettings
response <- N.httpLbs req manager
let sta = N.responseStatus response
if statusIsSuccessful sta
then return (Right "Network connection with NCBI server was successful")
else return (Left ("Could not connect to NCBI server \"https://www.ncbi.nlm.nih.gov\". Response Code: " ++ show (statusCode sta) ++ " \n" ++ B.unpack (statusMessage sta)))
setBlastExpectThreshold :: ModelConstruction -> Double
setBlastExpectThreshold modelConstruction
| alignmentModeInfernal modelConstruction = 1 :: Double
| otherwise = 0.1 :: Double
reformatFasta :: Fasta () () -> Fasta () ()
reformatFasta input = Fasta (_header input) updatedSequence
where updatedSequence = BioSequence (B.pack (map reformatFastaSequence (B.unpack . _bioSequence . _fasta $ input)))
reformatFastaSequence :: Char -> Char
reformatFastaSequence c
| c == '.' = '-'
| c == '~' = '-'
| c == '_' = '-'
| c == 'u' = 'T'
| c == 't' = 'T'
| c == 'g' = 'G'
| c == 'c' = 'C'
| c == 'a' = 'A'
| c == 'U' = 'T'
| otherwise = c
setRestrictedTaxonomyLimits :: String -> (Maybe Int,Maybe Int)
setRestrictedTaxonomyLimits trestriction
| trestriction == "bacteria" = (Just (2 :: Int), Nothing)
| trestriction == "archea" = (Just (2157 :: Int), Nothing)
| trestriction == "eukaryia" = (Just (2759 :: Int), Nothing)
| otherwise = (Nothing, Nothing)
checkTaxonomyRestriction :: Maybe String -> Maybe String
checkTaxonomyRestriction taxonomyRestriction
| isJust taxonomyRestriction = checkTaxonomyRestrictionString (fromJust taxonomyRestriction)
| otherwise = Nothing
checkTaxonomyRestrictionString :: String -> Maybe String
checkTaxonomyRestrictionString restrictionString
| restrictionString == "archea" = Just "archea"
| restrictionString == "bacteria" = Just "bacteria"
| restrictionString == "eukaryia" = Just "eukaryia"
| otherwise = Nothing
extractAlignmentSequencesByIds :: String -> [B.ByteString] -> IO [Fasta () ()]
extractAlignmentSequencesByIds stockholmFilePath sequenceIds = do
inputSeedAln <- TIO.readFile stockholmFilePath
let alnEntries = extractAlignmentSequences inputSeedAln
let filteredEntries = concatMap (filterSequencesById alnEntries) sequenceIds
return filteredEntries
extractAlignmentSequences :: TL.Text -> [Fasta () ()]
extractAlignmentSequences seedFamilyAln = rfamIDAndseedFamilySequences
where seedFamilyAlnLines = TL.lines seedFamilyAln
seedFamilyNonEmpty = filter (\alnline -> TL.empty /= alnline) seedFamilyAlnLines
seedFamilyIdSeqLines = filter (\alnline -> not ((TL.head alnline) == '#') && not ((TL.head alnline) == ' ') && not ((TL.head alnline) == '/')) seedFamilyNonEmpty
seedFamilyIdandSeqLines = map TL.words seedFamilyIdSeqLines
seedFamilyIdandSeqLineTuples = map (\alnline -> (head alnline,filterAlnChars (last alnline))) seedFamilyIdandSeqLines
seedFamilyIdandSeqTupleSorted = sortBy (\tuple1 tuple2 -> compare (fst tuple1) (fst tuple2)) seedFamilyIdandSeqLineTuples
seedFamilyIdandSeqTupleGroups = groupBy (\tuple1 tuple2 -> fst tuple1 == fst tuple2) seedFamilyIdandSeqTupleSorted
seedFamilySequences = map mergeIdSeqTuplestoSequence seedFamilyIdandSeqTupleGroups
rfamIDAndseedFamilySequences = seedFamilySequences
filterSequencesById :: [Fasta () ()] -> B.ByteString -> [Fasta () ()]
filterSequencesById alignmentSequences sequenceId = filter (sequenceHasId sequenceId) alignmentSequences
sequenceHasId :: B.ByteString -> Fasta () () -> Bool
sequenceHasId sequenceId currentSequence = sequenceId == fastaHeader currentSequence
filterAlnChars :: TL.Text -> TL.Text
filterAlnChars cs = TL.filter (\c -> not (c == '-') && not (c == '.')) cs
mergeIdSeqTuplestoSequence :: [(TL.Text,TL.Text)] -> Fasta () ()
mergeIdSeqTuplestoSequence tuplelist = currentSequence
where seqId = TL.toStrict (fst (head tuplelist))
seqData = TL.toStrict (TL.concat (map snd tuplelist))
currentSequence = Fasta (SequenceIdentifier (E.encodeUtf8 seqId)) (BioSequence (E.encodeUtf8 seqData))
readFastaFile :: String -> IO [Fasta () ()]
readFastaFile fastaFilePath = do
inputFastaFile <- L.readFile fastaFilePath
let inputFastas = byteStringToMultiFasta inputFastaFile
return inputFastas
blast :: String -> Int -> Maybe Int -> Maybe Int -> Maybe Double -> Bool -> BlastHTTPQuery -> IO (Either String J.BlastJSON2)
blast _tempDirPath threads upperTaxIdLimit lowerTaxIdLimit expectThreshold _blastSoftmaskingToggle blastHTTPQuery = do
let upperTaxIdLimitPath = if isJust upperTaxIdLimit then _tempDirPath ++ "/upper.txids" else ""
let lowerTaxIdLimitPath = if isJust lowerTaxIdLimit then _tempDirPath ++ "/lower.txids" else ""
when (isJust upperTaxIdLimit) $ systemGetSpeciesTaxId (fromJust upperTaxIdLimit) upperTaxIdLimitPath
when (isJust lowerTaxIdLimit) $ systemGetSpeciesTaxId (fromJust lowerTaxIdLimit) lowerTaxIdLimitPath
let positiveSetTaxIdLimitPath = _tempDirPath ++ "/postitiveset.txids"
if isJust lowerTaxIdLimit && isJust upperTaxIdLimit
then do
upperTaxIdsFile <- readFile upperTaxIdLimitPath
let upperTaxIds = lines upperTaxIdsFile
lowerTaxIdsFile <- readFile lowerTaxIdLimitPath
let lowerTaxIds = lines lowerTaxIdsFile
let positiveSetTaxIds = upperTaxIds \\ lowerTaxIds
let positiveSetTaxIdsFile = unlines positiveSetTaxIds
writeFile positiveSetTaxIdLimitPath positiveSetTaxIdsFile
else return ()
let fastaFilePath = _tempDirPath ++ "/blastQuery.fa"
let blastResultFilePath = _tempDirPath ++ "/blastResult.json2"
let selectedBlastDatabase = fromMaybe "" (Biobase.BLAST.HTTP.database blastHTTPQuery)
writeFastaFile fastaFilePath (querySequences blastHTTPQuery)
systemBlast threads selectedBlastDatabase upperTaxIdLimitPath lowerTaxIdLimitPath positiveSetTaxIdLimitPath expectThreshold _blastSoftmaskingToggle fastaFilePath blastResultFilePath
blastCmdResult <- BBI.blastCmdJSON2FromFile blastResultFilePath
if isRight blastCmdResult
then do
let blastCmdOutput = J._blastcmdoutput2 (fromRight blastCmdResult)
when ((length blastCmdOutput) > 1) $ print "Blast output list with multiple elements"
if (not (null blastCmdOutput))
then (return (Right (J.BlastJSON2 (head blastCmdOutput)):: Either String J.BlastJSON2))
else (return (Left "Empty BlastOutput List" :: Either String J.BlastJSON2))
else (return (Left (fromLeft blastCmdResult) :: Either String J.BlastJSON2))
systemBlast :: Int -> String -> String -> String -> String -> Maybe Double -> Bool -> String -> String -> IO ExitCode
systemBlast threads _blastDatabase upperTaxLimitPath lowerTaxLimitPath positiveSetTaxIdLimitPath _evalueThreshold _blastSoftmaskingToggle queryFilepath outputFilePath = do
let cmd = ("blastn " ++ threadedOption ++ expectThresholdOption ++ taxonomyOption ++ " " ++ softmaskOption ++ dbOption ++ " -query " ++ queryFilepath ++ " -outfmt 15 -out " ++ outputFilePath)
putStrLn cmd
system cmd
where threadedOption = " -num_threads " ++ show threads
expectThresholdOption = if isJust _evalueThreshold then " -evalue " ++ show (fromJust _evalueThreshold) else ""
dbOption = if null _blastDatabase then "" else " -db " ++ _blastDatabase ++ " "
softmaskOption = if _blastSoftmaskingToggle then " -soft_masking " else ""
taxonomyOption = setBlastCallTaxonomyOptions upperTaxLimitPath lowerTaxLimitPath positiveSetTaxIdLimitPath
setBlastCallTaxonomyOptions :: String -> String -> String -> String
setBlastCallTaxonomyOptions upperTaxLimitPath lowerTaxLimitPath positiveSetTaxIdLimitPath
| and [(not (null upperTaxLimitPath)),(not (null lowerTaxLimitPath))] = " -taxidlist " ++ positiveSetTaxIdLimitPath ++ " "
| not (null upperTaxLimitPath) = " -taxidlist " ++ upperTaxLimitPath ++ " "
| not (null lowerTaxLimitPath) = " -negative_taxidlist " ++ lowerTaxLimitPath ++ " "
| otherwise = ""
systemGetSpeciesTaxId :: Int -> String -> IO ()
systemGetSpeciesTaxId requestedTaxId outputFilePath = do
system ("get_species_taxids.sh " ++ " -t " ++ show requestedTaxId ++ " > " ++ outputFilePath)
return ()