-- | Parse Clustal output
--   For more information on Clustal tools consult: <http://www.clustal.org/>
module Bio.ClustalParser (
                       parseClustalAlignment,
                       readClustalAlignment,
                       parseStructuralClustalAlignment,
                       readStructuralClustalAlignment,
                       parseClustalSummary,
                       readClustalSummary,
                       module Bio.ClustalData
                      ) where

import Bio.ClustalData
import Text.ParserCombinators.Parsec    
import Control.Monad
import Data.List

readDouble :: String -> Double
readDouble = read              

readInt :: String -> Int
readInt = read

-- | Parse the input as ClustalSummary datatype
genParserClustalSummary :: GenParser Char st ClustalSummary
genParserClustalSummary = do
  newline
  newline
  newline
  space
  string "CLUSTAL "
  version <- many1 (noneOf " ")
  many1 (noneOf "\n")
  newline
  newline
  newline
  string "Sequence format is "
  sequenceFormat' <- many1 (noneOf "\n")
  newline
  sequenceParametersList <- many1 (try genParserSequenceParameters)
  string "Start of Pairwise alignments" 
  newline
  string "Aligning..."
  newline
  newline
  pairwiseAlignmentSummaryList <- many1 genParserPairwiseAlignmentSummary
  string "Guide tree file created:   ["
  guideTreeFileName' <- many1 (noneOf "]")
  char ']'
  newline
  newline
  string "There are "
  numberOfGroups <- many1 digit
  string " groups"
  newline
  string "Start of Multiple Alignment"
  newline
  newline
  string "Aligning..."
  newline
  groupSummaryList <- many1 genParserGroupSummary
  string "Alignment Score "
  alignmentScore' <- many1 digit
  newline
  newline
  string "CLUSTAL-Alignment file created  ["
  alignmentFileName' <- many1 (noneOf "]")
  char ']'
  newline
  newline
  eof  
  return $ ClustalSummary version sequenceFormat' sequenceParametersList pairwiseAlignmentSummaryList guideTreeFileName' (readInt numberOfGroups) groupSummaryList (readInt alignmentScore') alignmentFileName'

genParserGroupSummary :: GenParser Char st GroupSummary
genParserGroupSummary = do
  string "Group " 
  groupIndex <- many1 digit
  string ":"
  optional space
  optional (string "Sequences:")
  many1 space
  sequenceNumber <- optionMaybe (many1 digit)
  optional (many1 space)
  string "Score:" <|>  string "Delayed"
  groupScore' <- optionMaybe (many1 digit) 
  newline
  return $ GroupSummary (readInt groupIndex) (liftM readInt sequenceNumber) (liftM readInt groupScore')

genParserPairwiseAlignmentSummary :: GenParser Char st PairwiseAlignmentSummary
genParserPairwiseAlignmentSummary = do
  string "Sequences (" 
  firstSeqIndex <- many1 digit
  string ":"
  secondSeqIndex <- many1 digit
  string ") Aligned. Score:"
  many1 space
  pairwiseScore <- many1 digit
  newline
  return $ PairwiseAlignmentSummary (readInt firstSeqIndex) (readInt secondSeqIndex) (readInt pairwiseScore)

genParserSequenceParameters :: GenParser Char st SequenceParameters
genParserSequenceParameters = do
  string "Sequence " 
  sequenceIndexParam <- many1 digit
  string ": "
  sequenceIdentifierParam <- many1 (noneOf " ")
  spaces
  sequenceLengthParam <- many1 digit
  space
  string "bp"
  newline
  return $ SequenceParameters (readInt sequenceIndexParam) sequenceIdentifierParam (readInt sequenceLengthParam)

-- | Parse the input as ClustalAlignment datatype
genParserClustalAlignment :: GenParser Char st ClustalAlignment
genParserClustalAlignment = do
  many1 (noneOf "\n")
  newline
  newline
  newline
  alignmentSlices <- many1 genParserClustalAlignmentSlice
  eof  
  return (mergealignmentSlices alignmentSlices)

mergealignmentSlices :: [ClustalAlignmentSlice] -> ClustalAlignment
mergealignmentSlices slices = alignment
  where entrySlicesList = map entrySlices slices -- list of lists of entry slices
        sequenceIdentifiers = map entrySequenceSliceIdentifier (head entrySlicesList)
        alignmentEntriesListBySlice =  map (map entryAlignedSliceSequence) entrySlicesList  
        transposedAlignmentEntriesListbySlice = transpose alignmentEntriesListBySlice
        mergedAlignmentSequenceEntries = map concat transposedAlignmentEntriesListbySlice
        mergedAlignmentEntries = map constructAlignmentEntries (zip sequenceIdentifiers mergedAlignmentSequenceEntries)
        conservationTrackSlices = map conservationTrackSlice slices
        mergedConservationTrack = concat conservationTrackSlices
        alignment = ClustalAlignment mergedAlignmentEntries mergedConservationTrack

constructAlignmentEntries :: (String, String) -> ClustalAlignmentEntry
constructAlignmentEntries (entryIdentifier,entrySequence) = ClustalAlignmentEntry entryIdentifier entrySequence

genParserClustalAlignmentSlice :: GenParser Char st ClustalAlignmentSlice
genParserClustalAlignmentSlice = do
  entrySlices' <- many1 genParserClustalEntrySlice
  --extract length of identifier and spacer to determine offset of conservation track
  let offsetLenght = length (entrySequenceSliceIdentifier (head entrySlices')) + spacerLength (head entrySlices')
  spacerAndConservationTrackSlice <- many1 (noneOf "\n")
  let conservationTrackSlice' = drop offsetLenght spacerAndConservationTrackSlice
  newline
  optional newline
  return $ ClustalAlignmentSlice entrySlices' conservationTrackSlice'

genParserClustalEntrySlice :: GenParser Char st ClustalAlignmentEntrySlice
genParserClustalEntrySlice = do
  sliceIdentifier <- many1 (noneOf " ")
  spacer <- many1 (char ' ')
  sliceSequence <- many1 (oneOf "NYRUAGCT-")
  newline
  return $ ClustalAlignmentEntrySlice sliceIdentifier sliceSequence (length spacer)

--Structural Clustal Parser functions

-- | Parse the input as ClustalAlignment datatype as used in mlocarna
genParserStructuralClustalAlignment :: GenParser Char st StructuralClustalAlignment
genParserStructuralClustalAlignment = do
  genParseMlocarnaHeader
  alignmentSlices <- many1 (try genParserStructuralClustalAlignmentSlice)
  secondaryStructure <- genSecondaryStructure  
  energy' <- genParseEnergy
  eof  
  return (mergeStructuralAlignmentSlices alignmentSlices secondaryStructure energy')

genSecondaryStructure :: GenParser Char st String
genSecondaryStructure = do
  string "alifold"
  many1 space
  secondaryStructure <- many1 (oneOf ".()")
  space
  return secondaryStructure

genParseEnergy :: GenParser Char st Double
genParseEnergy = do
  string "("
  many space 
  energy' <- many1 (noneOf " ")
  optional space
  char ('=')
  many1 (noneOf "\n")
  newline  
  return (readDouble energy')

genParseMlocarnaHeader :: GenParser Char st String
genParseMlocarnaHeader = do
  string "mLo"
  many1 (noneOf "\n")
  newline
  string "Copyright"
  many1 (noneOf "\n")
  newline
  newline  
  many1 genParseAlignmentProcessStep
  newline
  newline
  return ""

genParseAlignmentProcessStep :: GenParser Char st String
genParseAlignmentProcessStep = do
  many1 (noneOf ".\n")
  choice [try (string ("... ")), try (string ("..."))]
  newline
  return ""

mergeStructuralAlignmentSlices :: [StructuralClustalAlignmentSlice] -> String -> Double -> StructuralClustalAlignment
mergeStructuralAlignmentSlices slices secondaryStructure energy' = alignment
  where entrySlicesList = map structuralEntrySlices slices -- list of lists of entry slices
        sequenceIdentifiers = map structuralEntrySequenceSliceIdentifier (head entrySlicesList)
        alignmentEntriesListBySlice =  map (map structuralEntryAlignedSliceSequence) entrySlicesList  
        transposedAlignmentEntriesListbySlice = transpose alignmentEntriesListBySlice
        mergedAlignmentSequenceEntries = map concat transposedAlignmentEntriesListbySlice
        mergedAlignmentEntries = map constructStructuralAlignmentEntries (zip sequenceIdentifiers mergedAlignmentSequenceEntries)
        alignment = StructuralClustalAlignment mergedAlignmentEntries secondaryStructure energy' 

constructStructuralAlignmentEntries :: (String, String) -> ClustalAlignmentEntry
constructStructuralAlignmentEntries (entryIdentifier,entrySequence) = ClustalAlignmentEntry entryIdentifier entrySequence

genParserStructuralClustalAlignmentSlice :: GenParser Char st StructuralClustalAlignmentSlice
genParserStructuralClustalAlignmentSlice = do
  entrySlices' <- many1 (try genParserStructuralClustalEntrySlice)
  optional newline
  return $ StructuralClustalAlignmentSlice entrySlices'

genParserStructuralClustalEntrySlice :: GenParser Char st StructuralClustalAlignmentEntrySlice
genParserStructuralClustalEntrySlice = do
  sliceIdentifier <- many1 (noneOf " ")
  many1 (char ' ')
  sliceSequence <- many1 (oneOf "NYRUAGCT-")
  newline
  return $ StructuralClustalAlignmentEntrySlice (filter (/='\n') sliceIdentifier) sliceSequence

-- exported functions

-- | Parse Clustal alignment (.aln) from String
parseClustalAlignment :: String -> Either ParseError ClustalAlignment 
parseClustalAlignment = parse genParserClustalAlignment "genParserClustalAlignment"

-- | Parse Clustal alignment (.aln) from filehandle                  
readClustalAlignment :: String -> IO (Either ParseError ClustalAlignment)   
readClustalAlignment = parseFromFile genParserClustalAlignment

-- | Parse Clustal alignment (.aln) with secondary structure in dot-bracket notation from String (as produced by mlocarna)
parseStructuralClustalAlignment :: String -> Either ParseError StructuralClustalAlignment 
parseStructuralClustalAlignment = parse genParserStructuralClustalAlignment "genParserClustalAlignment"

-- | Parse Clustal alignment (.aln) with secondary structure in dot-bracket notation from filehandle (as produced by mlocarna)                  
readStructuralClustalAlignment :: String -> IO (Either ParseError StructuralClustalAlignment)   
readStructuralClustalAlignment = parseFromFile genParserStructuralClustalAlignment

-- |  Parse Clustal summary (printed to STDOUT) from String
parseClustalSummary :: String -> Either ParseError ClustalSummary
parseClustalSummary = parse genParserClustalSummary "genParserClustalSummary"

-- | Parse Clustal summary (printed to STDOUT) from file
readClustalSummary :: String -> IO (Either ParseError ClustalSummary)       
readClustalSummary = parseFromFile genParserClustalSummary