module NLP.Scoring.SimpleUnigram.Import where
import Control.Applicative
import Data.HashTable.IO (BasicHashTable)
import Data.Stringable
import qualified Data.HashTable.IO as H
import System.IO.Unsafe (unsafePerformIO)
import qualified Data.Attoparsec.Text as AT
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.IO as T
import NLP.Alphabet.IMMC
import NLP.Scoring.SimpleUnigram
data ParsedLine
= PLset Text [IMMC]
| PLeq Text Double
| PLeqset Text [IMMC]
| PLinset Text Text Double
| PLgap Double
| PLgapopen Double
| PLgapextend Double
| PLdefmatch Double
| PLdefmismatch Double
| PLcomment Text
deriving (Show,Eq,Ord)
parseLine :: Text -> ParsedLine
parseLine l = case AT.parseOnly (go <* AT.endOfInput) l of
Left err -> error $ err ++ " " ++ show l
Right p -> p
where go = PLset <$ "Set" <*> wd <*> mc `AT.sepBy1` AT.skipSpace
<|> PLeq <$ "Eq" <*> wd <*> nm
<|> PLinset <$ "InSet" <*> wd <*> wd <*> nm
<|> PLgap <$ "Gap" <*> nm
<|> PLgapopen <$ "GapOpen" <*> nm
<|> PLgapextend <$ "GapExtend" <*> nm
<|> PLdefmatch <$ "Match" <*> nm
<|> PLdefmismatch <$ "Mismatch" <*> nm
<|> PLeqset <$ "EqSet" <*> wd <*> mc `AT.sepBy1` AT.skipSpace
<|> PLcomment <$ "--" <*> AT.takeText
wd = AT.skipSpace *> AT.takeWhile1 (not . AT.isHorizontalSpace)
mc = fromText <$> wd
nm = AT.skipSpace *> AT.double
genSimpleScoring :: Text -> SimpleScoring
genSimpleScoring l = SimpleScoring t g go ge dm di
where
t = unsafePerformIO $ H.fromListWithSizeHint (Prelude.length ys) ys
ls = T.lines l
xs = map parseLine ls
ys = concatMap genPairs $ iss ++ eqs
sets = [s | s@(PLset _ _) <- xs]
eqss = [e | e@(PLeqset _ _) <- xs]
eqs = [e | e@(PLeq _ _) <- xs]
iss = [i | i@(PLinset _ _ _) <- xs]
[dm] = [dm | PLdefmatch dm <- xs]
[di] = [di | PLdefmismatch di <- xs]
[g] = [g | PLgap g <- xs]
[go] = [go | PLgapopen go <- xs]
[ge] = [ge | PLgapextend ge <- xs]
genPairs (PLeq x d) = let ss = lookupSet x
tt = lookupEqSet x
in [ ((s,s),d) | s <- ss ] ++
[ ((s,t),d) | ts <- tt, s<-ts,t<-ts ]
genPairs (PLinset x y d) = let ss = lookupSet x
tt = lookupSet y in [ ((s,t),d) | s <- ss, t <- tt ]
lookupEqSet k = let go [] = []
go (PLeqset n xs:ss) = if k==n then xs : go ss else go ss
in go eqss
lookupSet k = let go [] = []
go (PLset n xs:ss) = if k==n then xs : go ss else go ss
go (PLeqset n xs:ss) = if k==n then xs : go ss else go ss
in case go $ sets ++ eqss of
xs -> concat xs
simpleScoreFromFile f = T.readFile f >>= return . genSimpleScoring