module NLP.Tokenize
( EitherList(..)
, Tokenizer
, tokenize
, run
, defaultTokenizer
, whitespace
, uris
, punctuation
, finalPunctuation
, initialPunctuation
, contractions
, negatives
)
where
import qualified Data.Char as Char
import Data.Maybe
import Control.Monad.Instances ()
import Control.Monad
import Data.Text (Text)
import qualified Data.Text as T
type Tokenizer = Text -> EitherList Text Text
newtype EitherList a b = E { unE :: [Either a b] }
tokenize :: Text -> [Text]
tokenize = run defaultTokenizer
run :: Tokenizer -> (Text -> [Text])
run f = \txt -> map T.copy $ (map unwrap . unE . f) txt
defaultTokenizer :: Tokenizer
defaultTokenizer = whitespace
>=> uris
>=> punctuation
>=> contractions
>=> negatives
uris :: Tokenizer
uris x | isUri x = E [Left x]
| True = E [Right x]
where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:"]
punctuation :: Tokenizer
punctuation = finalPunctuation >=> initialPunctuation
hyphens :: Tokenizer
hyphens xs = E [Right w | w <- T.split (=='-') xs ]
finalPunctuation :: Tokenizer
finalPunctuation x = E $ filter (not . T.null . unwrap) res
where
res :: [Either Text Text]
res = case T.span Char.isPunctuation (T.reverse x) of
(ps, w) | T.null ps -> [ Right $ T.reverse w ]
| otherwise -> [ Right $ T.reverse w
, Right $ T.reverse ps]
initialPunctuation :: Tokenizer
initialPunctuation x = E $ filter (not . T.null . unwrap) $
case T.span Char.isPunctuation x of
(ps,w) | T.null ps -> [ Right w ]
| otherwise -> [ Right ps
, Right w ]
negatives :: Tokenizer
negatives x | "n't" `T.isSuffixOf` x = E [ Right . T.reverse . T.drop 3 . T.reverse $ x
, Left "n't" ]
| True = E [ Right x ]
contractions :: Tokenizer
contractions x = case catMaybes . map (splitSuffix x) $ cts of
[] -> return x
((w,s):_) -> E [ Right w,Left s]
where cts = ["'m","'s","'d","'ve","'ll"]
splitSuffix w sfx =
let w' = T.reverse w
len = T.length sfx
in if sfx `T.isSuffixOf` w
then Just (T.take (T.length w len) w, T.reverse . T.take len $ w')
else Nothing
whitespace :: Tokenizer
whitespace xs = E [Right w | w <- T.words xs ]
instance Monad (EitherList a) where
return x = E [Right x]
E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs
unwrap :: Either a a -> a
unwrap (Left x) = x
unwrap (Right x) = x
examples :: [Text]
examples =
["This shouldn't happen."
,"Some 'quoted' stuff"
,"This is a URL: http://example.org."
,"How about an email@example.com"
,"ReferenceError #1065 broke my debugger!"
,"I would've gone."
,"They've been there."
]