module NLP.Extraction.Parsec
where
import Data.Text (Text)
import qualified Data.Text as T
import Text.Parsec.String ()
import Text.Parsec.Prim (lookAhead, token, Parsec, try, Stream(..))
import qualified Text.Parsec.Combinator as PC
import Text.Parsec.Pos (newPos)
import NLP.Types (TaggedSentence(..), Tag(..), CaseSensitive(..),
POS(..), Token(..))
instance (Monad m, Tag t) => Stream (TaggedSentence t) m (POS t) where
uncons (TaggedSent ts) = do
mRes <- uncons ts
case mRes of
Nothing -> return $ Nothing
Just (mTok, rest) -> return $ Just (mTok, TaggedSent rest)
type Extractor t = Parsec (TaggedSentence t) ()
posTok :: Tag t => t -> Extractor t (POS t)
posTok tag = token showTok posFromTok testTok
where
showTok = show
posFromTok _ = newPos "unknown" 0 0
testTok tok@(POS t _) = if tag == t then Just tok else Nothing
posPrefix :: Tag t => Text -> Extractor t (POS t)
posPrefix str = token showTok posFromTok testTok
where
showTok = show
posFromTok _ = newPos "unknown" 0 0
testTok tok@(POS t _) = if str `T.isPrefixOf` (tagTerm t) then Just tok else Nothing
matches :: CaseSensitive -> Token -> Token -> Bool
matches Sensitive x y = x == y
matches Insensitive (Token x) (Token y) = (T.toLower x) == (T.toLower y)
txtTok :: Tag t => CaseSensitive -> Token -> Extractor t (POS t)
txtTok sensitive txt = token showTok posFromTok testTok
where
showTok = show
posFromTok _ = newPos "unknown" 0 0
testTok tok@(POS _ t) | matches sensitive txt t = Just tok
| otherwise = Nothing
anyToken :: Tag t => Extractor t (POS t)
anyToken = token showTok posFromTok testTok
where
showTok = show
posFromTok _ = newPos "unknown" 0 0
testTok tok@(POS _ txt) | txt == "" = Nothing
| otherwise = Just tok
oneOf :: Tag t => CaseSensitive -> [Token] -> Extractor t (POS t)
oneOf sensitive terms = PC.choice (map (\t -> try (txtTok sensitive t)) terms)
followedBy :: Tag t => Extractor t b -> Extractor t a -> Extractor t a
followedBy fill end = do
_ <- PC.manyTill fill (lookAhead end)
end