module NLP.Corpora.Parsing where
import qualified Data.Text as T
import Data.Text (Text)
import NLP.Types (Tag(..), parseTag, tagUNK, TaggedSentence(..)
, POS(..), Token(..))
-- | Read a tagged sentence from text, converting each tag string with
-- 'parseTag'.
--
-- This is 'readPOSWith' specialised to the 'parseTag' parser.
readPOS :: Tag t => Text -> TaggedSentence t
readPOS = readPOSWith parseTag
-- | Read a tagged sentence from text, converting each tag string with the
-- supplied parser.
--
-- The input is split into whitespace-separated entries. An entry containing
-- a slash is divided at its last slash: the text after it is handed to the
-- parser, and the text before it becomes the token. An entry with no slash
-- becomes a token tagged with 'tagUNK'.
readPOSWith :: Tag t => (Text -> t) -> Text -> TaggedSentence t
readPOSWith parser str = TaggedSent (map tagEntry (T.words str))
  where
    tagEntry entry
      | "/" `T.isInfixOf` entry = POS (parser rawTag) (Token (safeInit prefix))
      | otherwise = POS tagUNK (Token entry)
      where
        -- 'T.breakOnEnd' keeps the separating slash at the end of the
        -- prefix, which is why the prefix goes through 'safeInit' above.
        (prefix, rawTag) = T.breakOnEnd "/" (T.strip entry)
-- | Drop the last character of a 'Text', returning an empty input unchanged.
--
-- Emptiness is checked with 'T.null', which is constant time, rather than by
-- comparing 'T.length' against zero, which has to walk the whole text.
safeInit :: Text -> Text
safeInit str
  | T.null str = str
  | otherwise = T.init str