-- | Functions for parsing genebank format
module Bio.GenbankParser (
                       parseGenbank,
                       readGenbank,
                       module Bio.GenbankData
                      ) where

import Bio.GenbankData
import Text.ParserCombinators.Parsec
import Text.ParserCombinators.Parsec.Token
import Text.ParserCombinators.Parsec.Language (emptyDef)    
import Control.Monad
import Data.List
import Data.List.Split (splitOn)
import Data.Maybe
import Bio.Core.Sequence
import qualified Data.ByteString.Lazy.Char8 as L

--------------------------------------------------
--Parsing functions:

-- | Parse the input as Genbank datatype
genParserGenbank :: GenParser Char st Genbank
genParserGenbank = do
  string "LOCUS"
  many1 space
  locus <- many1 (noneOf " ")
  many1 space
  length <- many1 (noneOf " ")
  string " bp"
  many1 space
  moleculeType <- many1 (noneOf " ")
  many1 space
  circular <- optionMaybe (try (choice [string "linear",string "circular",string "LINEAR",string "CIRCULAR"]))
  many space
  division <-  many1 (noneOf " ")
  many1 space
  creationDate <- many1 (noneOf "\n")
  newline
  definition <- genParserField "DEFINITION" "ACCESSION"
  accession <- genParserField "ACCESSION" "VERSION"
  string "VERSION"
  many1 space
  version <- many1 (noneOf " ")
  many1 space
  geneIdentifier <- many1 (noneOf "\n")
  newline
  dblink <- optionMaybe (try (genParserField "DBLINK" "KEYWORDS"))
  keywords <- genParserField "KEYWORDS" "SOURCE"
  source <- genParserField "SOURCE" "ORGANISM"
  organism <- genParserField "ORGANISM" "REFERENCE"
  references <- many1 genParserReference
  comment <- optionMaybe (try (genParserField "COMMENT" "FEATURES"))
  string "FEATURES"
  many1 space
  string "Location/Qualifiers"
  newline
  features <- many genParserFeature
  contig <- optionMaybe (try (genParserField "CONTIG" "ORIGIN"))
  string "ORIGIN"
  many (string " ")
  newline
  origin <- many1 genParserOriginSequence
  string "//"
  newline
  return $ Genbank (L.pack locus) (readInt length) (L.pack moleculeType) (liftM L.pack circular) (L.pack division) (L.pack creationDate) (L.pack definition) (L.pack accession) (L.pack version) (L.pack geneIdentifier) (liftM L.pack dblink) (L.pack keywords) (L.pack source)  (L.pack organism) references (liftM L.pack comment) features contig (origintoSeqData origin) 

-- | Parse a feature
genParserFeature :: GenParser Char st Feature
genParserFeature = do
  string "     "
  featureType <- choice [try (string "gene") , try (string "repeat_region"), try (string "source")]
  many1 space
  genericFeatureCoordinates <- choice [genParserCoordinatesSet "join", genParserCoordinatesSet "order"]
  attibutes <- many (try genParserAttributes)
  subFeatures <- many (try genParserSubFeature) 
  choice [try geneAhead, try repeatAhead, try (lookAhead (string "CONTIG")), try (lookAhead (string "ORIGIN"))]
  return $ Feature (L.pack featureType) genericFeatureCoordinates attibutes subFeatures

-- | Parse a attribute, a GO attribute or a Flag
genParserAttributes :: GenParser Char st Attribute
genParserAttributes = choice [try genParserAttribute, try genParseGOattribute, try genParserFlagAttribute]

-- | Parse a attribute, consisting of attribute designation and value
genParserAttribute :: GenParser Char st Attribute
genParserAttribute = do
  many1 space
  string "/"
  notFollowedBy (string "translation")
  fieldName <- many1 (noneOf "=")
  string "=\""
  stringField <- many1 (noneOf "\"")
  string "\""
  newline
  return $ Field (L.pack fieldName) (L.pack stringField)

-- | Parse a Subfeature
genParserSubFeature :: GenParser Char st SubFeature
genParserSubFeature = do
  string "     "
  notFollowedBy (choice [string "gene", string "repeat_region", string "source"])
  subFeatureType <- many1 (noneOf " ")
  many1 space
  subFeatureCoordinates <- choice [genParserCoordinatesSet "join", genParserCoordinatesSet "order"]
  attibutes <- many (try genParserAttributes)
  subFeatureTranslation <- optionMaybe (try (parseStringField "translation"))
  return $ SubFeature (L.pack subFeatureType) subFeatureCoordinates attibutes (translationtoSeqData subFeatureTranslation)

-- | Parse GO attribute 
genParseGOattribute :: GenParser Char st Attribute
genParseGOattribute = do
  many1 space
  string "/GO_"
  goType <- many1 (noneOf "=")
  string "=\""
  goId <- many1 (noneOf "-")
  string "-"
  goName <- many1 (noneOf "\"")
  string "\""
  newline
  return $ GOattribute (L.pack goType) (L.pack goId) (L.pack goName)

-- | Parse flag attribute
genParserFlagAttribute :: GenParser Char st Attribute
genParserFlagAttribute = do
  many1 space
  string "/"
  notFollowedBy (string "translation")
  flagType <- many1 (noneOf "\n")
  newline
  return $ Flag (L.pack flagType)

-- | Parse the input as Genbank datatype
parseGenbank :: String -> Either ParseError Genbank
parseGenbank = parse genParserGenbank "genParserGenbank" 

-- | Read the file as Genbank datatype                     
readGenbank :: String -> IO (Either ParseError Genbank)          
readGenbank  = parseFromFile genParserGenbank 

-- | Parse a Field 
genParserField :: String -> String -> GenParser Char st String
genParserField fieldStart fieldEnd = do 
  string fieldStart
  many1 space
  manyTill anyChar (try (lookAhead (string fieldEnd)))
                 
-- | Parse the input as OriginSlice datatype
genParserOriginSequence :: GenParser Char st String
genParserOriginSequence = do
  many1 space
  many1 (noneOf " ")
  space
  originSequence <- many1 (noneOf "\n")
  newline
  return originSequence
 
-- | Parse the input as OriginSlice datatype
genParserOriginSlice :: GenParser Char st OriginSlice
genParserOriginSlice = do
  many1 space
  originIndex <- many1 (noneOf " ")
  space
  originSequence <- many1 (noneOf "\n")
  newline
  return $ OriginSlice (readInt originIndex) originSequence

-- | Parse the input as Reference datatype
genParserReference :: GenParser Char st Reference
genParserReference = do
  string "REFERENCE"
  many1 space
  index <- many1 digit
  many (string " ")
  optional (try (string "(bases"))
  many (string " ")
  baseFrom <- optionMaybe (try (many1 digit))
  many (string " ")
  optional (try (string "to"))
  many (string " ")
  baseTo  <- optionMaybe (try (many1 digit))
  optional (try (string ")"))
  newline
  many1 space
  authors <- choice [genParserField "AUTHORS" "TITLE", genParserField "CONSRTM" "TITLE"]
  title <- genParserField "TITLE" "JOURNAL"
  journal <- choice [try (genParserField "JOURNAL" "REFERENCE"), try (genParserField "JOURNAL" "COMMENT"), try (genParserField "JOURNAL" "FEATURES")]
  return $ Reference (readInt index) (liftM readInt baseFrom) (liftM readInt baseTo) authors title journal Nothing Nothing --pubmedId remark 

parseFlag :: String -> GenParser Char st Char
parseFlag flagString = do
  many1 space
  flag <- string ('/' : flagString)
  newline

geneAhead = lookAhead (string "     gene")

repeatAhead = lookAhead (string "     repeat")

origintoSeqData :: [String] -> SeqData
origintoSeqData originInput = SeqData (L.pack (filter (\nuc -> nuc /= '\n' && (nuc /= ' ')) (concat originInput)))

translationtoSeqData :: Maybe String -> Maybe SeqData
translationtoSeqData translationInput 
  | isJust translationInput = Just (SeqData (L.pack (filter (\aminoacid -> (aminoacid /=  '\n') && (aminoacid /=  ' ') ) (fromJust translationInput))))
  | otherwise = Nothing 

genParserCoordinates :: GenParser Char st Coordinates
genParserCoordinates = do
  coordinates <- choice [try genParserForwardCoordinates, try genParserComplementCoordinates]
  return coordinates

genParserCoordinatesSet :: String -> GenParser Char st CoordinateSet
genParserCoordinatesSet prefix = do
  coordinates <- choice [try (many1 genParserForwardCoordinates), try (many1 genParserComplementCoordinates), try (genParserForwardPrefix prefix), try (genParserComplementPrefix prefix)]
  return $ CoordinateSet coordinates (Just prefix)

-- | Parsing of coordinate lists with prefix e.g. order, join
genParserForwardPrefix :: String -> GenParser Char st [Coordinates]
genParserForwardPrefix prefix = do
  string (prefix ++ "(")
  coordinates <- many1 genParserForwardPrefixCoordinates
  string ")"
  return coordinates

genParserForwardPrefixCoordinates :: GenParser Char st Coordinates
genParserForwardPrefixCoordinates = do
  coordinateFromEqualitySymbol <- optionMaybe (try (oneOf "><"))  
  coordinateFrom <- many1 digit
  optional (oneOf "><")
  string "."
  string "."
  coordinateToEqualitySymbol <- optionMaybe (try (oneOf "><"))
  coordinateTo <- many1 digit
  optional (choice [try (string ",\n"),try (string ",")])
  optional (many1 (string " "))
  return $ Coordinates (readInt coordinateFrom) coordinateFromEqualitySymbol (readInt coordinateTo) coordinateToEqualitySymbol True

-- | Parseing of coordinate complement coordinate lists with prefix
genParserComplementPrefix :: String -> GenParser Char st [Coordinates]
genParserComplementPrefix prefix = do
  string "complement("
  string (prefix ++ "(")
  coordinates <- many1 genParserForwardPrefixCoordinates
  string ")"
  string ")"
  newline
  return (setComplement False coordinates)

genParserForwardCoordinates :: GenParser Char st Coordinates
genParserForwardCoordinates = do
  coordinateFromEqualitySymbol <- optionMaybe (try (oneOf "><"))  
  coordinateFrom <- many1 digit
  optional (oneOf "><")
  string "."
  string "."
  coordinateToEqualitySymbol <- optionMaybe (try (oneOf "><"))
  coordinateTo <- many1 digit
  newline
  return $ Coordinates (readInt coordinateFrom) coordinateFromEqualitySymbol (readInt coordinateTo) coordinateToEqualitySymbol False

genParserComplementCoordinates :: GenParser Char st Coordinates
genParserComplementCoordinates = do
  string "complement("
  coordinateFromEqualitySymbol <- optionMaybe (try (oneOf "><")) 
  coordinateFrom <- many1 digit
  optional (oneOf "><")
  string "."
  string "."
  coordinateToEqualitySymbol <- optionMaybe (try (oneOf "><"))
  coordinateTo <- many1 digit
  string ")"
  newline
  return $ Coordinates (readInt coordinateFrom) coordinateFromEqualitySymbol (readInt coordinateTo) coordinateToEqualitySymbol True

setComplement :: Bool -> [Coordinates] -> [Coordinates]
setComplement complementBool coordinates = coordinatesWithComplement
  where updateCoordinate complementBool coordinate = coordinate { complement = complementBool }
        coordinatesWithComplement = map (updateCoordinate complementBool) coordinates

genParseGOterm :: GenParser Char st GOterm
genParseGOterm = do
  many1 space
  string "/GO_"
  goType <- many1 (noneOf "=")
  string "=\""
  goId <- many1 (noneOf "-")
  string "-"
  goName <- many1 (noneOf "\"")
  string "\""
  newline
  return $ GOterm goType goId goName

genParseDbXRef :: GenParser Char st DbXRef
genParseDbXRef = do
  many1 space
  string "/db_xref=\""
  db <- many1 (noneOf ":")
  string ":"
  ref <- many1 (noneOf "\"")
  string "\""
  newline
  return $ DbXRef (L.pack db) (L.pack ref)
  
---------------------------
-- Auxiliary functions

readDouble :: String -> Double
readDouble = read              

readInt :: String -> Int
readInt = read

readChar :: String -> Char
readChar = read

parseStringBracketField :: String -> GenParser Char st String
parseStringBracketField fieldname = do
  many1 space
  string ("/" ++ fieldname ++ "=(")
  stringBracketField <- manyTill anyChar (try (string ")\n"))
  return stringBracketField
  
-- | Parse a field containing a String         
parseStringField :: String -> GenParser Char st String
parseStringField fieldname = do
  many1 space
  string ("/" ++ fieldname ++ "=\"")
  stringField <- many1( noneOf "\"")
  string "\""
  newline
  return stringField

-- | Parse a field containing a Int          
parseIntField :: String -> GenParser Char st Int
parseIntField fieldname = do
  many1 space
  string ("/" ++ fieldname ++ "=")
  int <- many1 (noneOf "\n")
  newline
  return (readInt int)

isComplement :: Maybe String -> Bool
isComplement string
  | isJust string = True
  | otherwise = False