module NLP.Scoring.SimpleUnigram.Import where

import           Control.Applicative
--import           Data.ByteString.Char8 (ByteString)
import           Data.HashTable.IO (BasicHashTable)
import           Data.Stringable
--import qualified Data.Attoparsec.ByteString as AB
--import qualified Data.Attoparsec.ByteString.Char8 as AB hiding (takeWhile1,skipWhile)
--import qualified Data.ByteString.Char8 as B
import qualified Data.HashTable.IO as H
import           System.IO.Unsafe (unsafePerformIO)

import qualified Data.Attoparsec.Text as AT
import           Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.IO as T

import           NLP.Alphabet.IMMC
import           NLP.Scoring.SimpleUnigram



-- | Each parsed line gives a set of characters, or tells us a score.
--
-- TODO add @LPimport@ which starts a recursive import (note: start by storing
-- the hash or whatever of the file to be imported so that we can comment on
-- circular imports)

data ParsedLine
  = PLset Text [IMMC]
  | PLeq Text Double
  | PLeqset Text [IMMC]
  | PLinset Text Text Double
  | PLgap Double
  | PLgapopen Double
  | PLgapextend Double
  | PLdefmatch Double
  | PLdefmismatch Double
  | PLcomment Text
  deriving (Show,Eq,Ord)

-- | Here we simple parse individual lines.

parseLine :: Text -> ParsedLine
parseLine l = case AT.parseOnly (go <* AT.endOfInput) l of
                Left  err -> error $ err ++ " " ++ show l
                Right p   -> p
  where go =   PLset         <$ "Set"       <*> wd <*> mc `AT.sepBy1` AT.skipSpace -- AB.skipWhile AB.isHorizontalSpace
           <|> PLeq          <$ "Eq"        <*> wd <*> nm
           <|> PLinset       <$ "InSet"     <*> wd <*> wd <*> nm
           <|> PLgap         <$ "Gap"       <*> nm
           <|> PLgapopen     <$ "GapOpen"   <*> nm
           <|> PLgapextend   <$ "GapExtend" <*> nm
           <|> PLdefmatch    <$ "Match"     <*> nm
           <|> PLdefmismatch <$ "Mismatch"  <*> nm
           <|> PLeqset       <$ "EqSet"     <*> wd <*> mc `AT.sepBy1` AT.skipSpace
           <|> PLcomment     <$ "--"        <*> AT.takeText
        wd = AT.skipSpace *> AT.takeWhile1 (not . AT.isHorizontalSpace)
        mc = fromText <$> wd
        nm = AT.skipSpace *> AT.double

-- | Parses a bytestring to create a simple scoring. We don't do much error
-- checking, many of the bindings below will easily fail.
--
-- TODO obviously: implement error-checking

genSimpleScoring :: Text -> SimpleScoring
genSimpleScoring l = SimpleScoring t g go ge dm di
  where
    t    = unsafePerformIO $ H.fromListWithSizeHint (Prelude.length ys) ys
    ls   = T.lines l
    xs   = map parseLine ls
    ys   = concatMap genPairs $ iss ++ eqs
    sets = [s  | s@(PLset _ _)     <- xs]
    eqss = [e  | e@(PLeqset _ _)   <- xs]
    eqs  = [e  | e@(PLeq _ _)      <- xs]
    iss  = [i  | i@(PLinset _ _ _) <- xs]
    [dm] = [dm | PLdefmatch dm     <- xs]
    [di] = [di | PLdefmismatch di  <- xs]
    [g]  = [g  | PLgap g           <- xs]
    [go] = [go | PLgapopen go      <- xs]
    [ge] = [ge | PLgapextend ge    <- xs]
    -- this generates all "equality pairs", i.e. that 'a' == 'a'
    -- the second list generates all equivalence classes, say that 'a' == 'ã'
    genPairs (PLeq    x   d) = let ss = lookupSet x
                                   tt = lookupEqSet x
                               in  [ ((s,s),d) | s <- ss ] ++
                                   [ ((s,t),d) | ts <- tt, s<-ts,t<-ts ]
    -- this creates all variants of, say, vowel against vowel (but unequal)
    genPairs (PLinset x y d) = let ss = lookupSet x
                                   tt = lookupSet y in [ ((s,t),d) | s <- ss, t <- tt ]
    -- find every character from an equivalence set
    lookupEqSet k = let go [] = []
                        go (PLeqset n xs:ss) = if k==n then xs : go ss else go ss
                    in  go eqss
    -- find every character from a certain class
    lookupSet k = let go [] = []
                      go (PLset n xs:ss) = if k==n then xs : go ss else go ss
                      go (PLeqset n xs:ss) = if k==n then xs : go ss else go ss
                  in  case go $ sets ++ eqss of
                        xs -> concat xs

-- | parse a simple scoring file.

simpleScoreFromFile f = T.readFile f >>= return . genSimpleScoring