module Bio.ClustalParser (
parseClustalAlignment,
readClustalAlignment,
parseStructuralClustalAlignment,
readStructuralClustalAlignment,
parseClustalSummary,
readClustalSummary,
module Bio.ClustalData
) where
import Bio.ClustalData
import Text.ParserCombinators.Parsec
import Control.Monad
import Data.List
readDouble :: String -> Double
readDouble = read
readInt :: String -> Int
readInt = read
genParserClustalSummary :: GenParser Char st ClustalSummary
genParserClustalSummary = do
newline
newline
newline
space
string "CLUSTAL "
version <- many1 (noneOf " ")
many1 (noneOf "\n")
newline
newline
newline
string "Sequence format is "
sequenceFormat' <- many1 (noneOf "\n")
newline
sequenceParametersList <- many1 (try genParserSequenceParameters)
string "Start of Pairwise alignments"
newline
string "Aligning..."
newline
newline
pairwiseAlignmentSummaryList <- many1 genParserPairwiseAlignmentSummary
string "Guide tree file created: ["
guideTreeFileName' <- many1 (noneOf "]")
char ']'
newline
newline
string "There are "
numberOfGroups <- many1 digit
string " groups"
newline
string "Start of Multiple Alignment"
newline
newline
string "Aligning..."
newline
groupSummaryList <- many1 genParserGroupSummary
string "Alignment Score "
alignmentScore' <- many1 digit
newline
newline
string "CLUSTAL-Alignment file created ["
alignmentFileName' <- many1 (noneOf "]")
char ']'
newline
newline
eof
return $ ClustalSummary version sequenceFormat' sequenceParametersList pairwiseAlignmentSummaryList guideTreeFileName' (readInt numberOfGroups) groupSummaryList (readInt alignmentScore') alignmentFileName'
genParserGroupSummary :: GenParser Char st GroupSummary
genParserGroupSummary = do
string "Group "
groupIndex <- many1 digit
string ":"
optional space
optional (string "Sequences:")
many1 space
sequenceNumber <- optionMaybe (many1 digit)
optional (many1 space)
string "Score:" <|> string "Delayed"
groupScore' <- optionMaybe (many1 digit)
newline
return $ GroupSummary (readInt groupIndex) (liftM readInt sequenceNumber) (liftM readInt groupScore')
genParserPairwiseAlignmentSummary :: GenParser Char st PairwiseAlignmentSummary
genParserPairwiseAlignmentSummary = do
string "Sequences ("
firstSeqIndex <- many1 digit
string ":"
secondSeqIndex <- many1 digit
string ") Aligned. Score:"
many1 space
pairwiseScore <- many1 digit
newline
return $ PairwiseAlignmentSummary (readInt firstSeqIndex) (readInt secondSeqIndex) (readInt pairwiseScore)
genParserSequenceParameters :: GenParser Char st SequenceParameters
genParserSequenceParameters = do
string "Sequence "
sequenceIndexParam <- many1 digit
string ": "
sequenceIdentifierParam <- many1 (noneOf " ")
spaces
sequenceLengthParam <- many1 digit
space
string "bp"
newline
return $ SequenceParameters (readInt sequenceIndexParam) sequenceIdentifierParam (readInt sequenceLengthParam)
genParserClustalAlignment :: GenParser Char st ClustalAlignment
genParserClustalAlignment = do
many1 (noneOf "\n")
newline
newline
newline
alignmentSlices <- many1 genParserClustalAlignmentSlice
eof
return (mergealignmentSlices alignmentSlices)
mergealignmentSlices :: [ClustalAlignmentSlice] -> ClustalAlignment
mergealignmentSlices slices = alignment
where entrySlicesList = map entrySlices slices
sequenceIdentifiers = map entrySequenceSliceIdentifier (head entrySlicesList)
alignmentEntriesListBySlice = map (map entryAlignedSliceSequence) entrySlicesList
transposedAlignmentEntriesListbySlice = transpose alignmentEntriesListBySlice
mergedAlignmentSequenceEntries = map concat transposedAlignmentEntriesListbySlice
mergedAlignmentEntries = map constructAlignmentEntries (zip sequenceIdentifiers mergedAlignmentSequenceEntries)
conservationTrackSlices = map conservationTrackSlice slices
mergedConservationTrack = concat conservationTrackSlices
alignment = ClustalAlignment mergedAlignmentEntries mergedConservationTrack
constructAlignmentEntries :: (String, String) -> ClustalAlignmentEntry
constructAlignmentEntries (entryIdentifier,entrySequence) = ClustalAlignmentEntry entryIdentifier entrySequence
genParserClustalAlignmentSlice :: GenParser Char st ClustalAlignmentSlice
genParserClustalAlignmentSlice = do
entrySlices' <- many1 genParserClustalEntrySlice
let offsetLenght = length (entrySequenceSliceIdentifier (head entrySlices')) + spacerLength (head entrySlices')
spacerAndConservationTrackSlice <- many1 (noneOf "\n")
let conservationTrackSlice' = drop offsetLenght spacerAndConservationTrackSlice
newline
optional newline
return $ ClustalAlignmentSlice entrySlices' conservationTrackSlice'
genParserClustalEntrySlice :: GenParser Char st ClustalAlignmentEntrySlice
genParserClustalEntrySlice = do
sliceIdentifier <- many1 (noneOf " ")
spacer <- many1 (char ' ')
sliceSequence <- many1 (oneOf "NYRUAGCT-")
newline
return $ ClustalAlignmentEntrySlice sliceIdentifier sliceSequence (length spacer)
genParserStructuralClustalAlignment :: GenParser Char st StructuralClustalAlignment
genParserStructuralClustalAlignment = do
genParseMlocarnaHeader
alignmentSlices <- many1 (try genParserStructuralClustalAlignmentSlice)
secondaryStructure <- genSecondaryStructure
energy' <- genParseEnergy
eof
return (mergeStructuralAlignmentSlices alignmentSlices secondaryStructure energy')
genSecondaryStructure :: GenParser Char st String
genSecondaryStructure = do
string "alifold"
many1 space
secondaryStructure <- many1 (oneOf ".()")
space
return secondaryStructure
genParseEnergy :: GenParser Char st Double
genParseEnergy = do
string "("
many space
energy' <- many1 (noneOf " ")
optional space
char ('=')
many1 (noneOf "\n")
newline
return (readDouble energy')
genParseMlocarnaHeader :: GenParser Char st String
genParseMlocarnaHeader = do
string "mLo"
many1 (noneOf "\n")
newline
string "Copyright"
many1 (noneOf "\n")
newline
newline
many1 genParseAlignmentProcessStep
newline
newline
return ""
genParseAlignmentProcessStep :: GenParser Char st String
genParseAlignmentProcessStep = do
many1 (noneOf ".\n")
choice [try (string ("... ")), try (string ("..."))]
newline
return ""
mergeStructuralAlignmentSlices :: [StructuralClustalAlignmentSlice] -> String -> Double -> StructuralClustalAlignment
mergeStructuralAlignmentSlices slices secondaryStructure energy' = alignment
where entrySlicesList = map structuralEntrySlices slices
sequenceIdentifiers = map structuralEntrySequenceSliceIdentifier (head entrySlicesList)
alignmentEntriesListBySlice = map (map structuralEntryAlignedSliceSequence) entrySlicesList
transposedAlignmentEntriesListbySlice = transpose alignmentEntriesListBySlice
mergedAlignmentSequenceEntries = map concat transposedAlignmentEntriesListbySlice
mergedAlignmentEntries = map constructStructuralAlignmentEntries (zip sequenceIdentifiers mergedAlignmentSequenceEntries)
alignment = StructuralClustalAlignment mergedAlignmentEntries secondaryStructure energy'
constructStructuralAlignmentEntries :: (String, String) -> ClustalAlignmentEntry
constructStructuralAlignmentEntries (entryIdentifier,entrySequence) = ClustalAlignmentEntry entryIdentifier entrySequence
genParserStructuralClustalAlignmentSlice :: GenParser Char st StructuralClustalAlignmentSlice
genParserStructuralClustalAlignmentSlice = do
entrySlices' <- many1 (try genParserStructuralClustalEntrySlice)
optional newline
return $ StructuralClustalAlignmentSlice entrySlices'
genParserStructuralClustalEntrySlice :: GenParser Char st StructuralClustalAlignmentEntrySlice
genParserStructuralClustalEntrySlice = do
sliceIdentifier <- many1 (noneOf " ")
many1 (char ' ')
sliceSequence <- many1 (oneOf "NYRUAGCT-")
newline
return $ StructuralClustalAlignmentEntrySlice (filter (/='\n') sliceIdentifier) sliceSequence
parseClustalAlignment :: String -> Either ParseError ClustalAlignment
parseClustalAlignment = parse genParserClustalAlignment "genParserClustalAlignment"
readClustalAlignment :: String -> IO (Either ParseError ClustalAlignment)
readClustalAlignment = parseFromFile genParserClustalAlignment
parseStructuralClustalAlignment :: String -> Either ParseError StructuralClustalAlignment
parseStructuralClustalAlignment = parse genParserStructuralClustalAlignment "genParserClustalAlignment"
readStructuralClustalAlignment :: String -> IO (Either ParseError StructuralClustalAlignment)
readStructuralClustalAlignment = parseFromFile genParserStructuralClustalAlignment
parseClustalSummary :: String -> Either ParseError ClustalSummary
parseClustalSummary = parse genParserClustalSummary "genParserClustalSummary"
readClustalSummary :: String -> IO (Either ParseError ClustalSummary)
readClustalSummary = parseFromFile genParserClustalSummary