Safe Haskell | Safe-Inferred |
---|---|
Language | Haskell2010 |
Synopsis
- newtype EitherList a b = E {}
- type Tokenizer = String -> EitherList String String
- tokenize :: String -> [String]
- run :: Tokenizer -> String -> [String]
- defaultTokenizer :: Tokenizer
- whitespace :: Tokenizer
- uris :: Tokenizer
- punctuation :: Tokenizer
- finalPunctuation :: Tokenizer
- initialPunctuation :: Tokenizer
- allPunctuation :: Tokenizer
- contractions :: Tokenizer
- negatives :: Tokenizer
Documentation
newtype EitherList a b Source #
The EitherList is a newtype-wrapped list of Eithers.
Instances
Applicative (EitherList a) Source #
  Defined in NLP.Tokenize.String
    pure :: a0 -> EitherList a a0
    (<*>) :: EitherList a (a0 -> b) -> EitherList a a0 -> EitherList a b
    liftA2 :: (a0 -> b -> c) -> EitherList a a0 -> EitherList a b -> EitherList a c
    (*>) :: EitherList a a0 -> EitherList a b -> EitherList a b
    (<*) :: EitherList a a0 -> EitherList a b -> EitherList a a0
Functor (EitherList a) Source #
  Defined in NLP.Tokenize.String
    fmap :: (a0 -> b) -> EitherList a a0 -> EitherList a b
    (<$) :: a0 -> EitherList a b -> EitherList a a0
Monad (EitherList a) Source #
  Defined in NLP.Tokenize.String
    (>>=) :: EitherList a a0 -> (a0 -> EitherList a b) -> EitherList a b
    (>>) :: EitherList a a0 -> EitherList a b -> EitherList a b
    return :: a0 -> EitherList a a0
type Tokenizer = String -> EitherList String String Source #
A Tokenizer is a function which takes a String and returns a list of Eithers (wrapped in a newtype). Right Strings will be passed on for processing to tokenizers further down the pipeline. Left Strings will be passed through the pipeline unchanged. Use a Left String in a tokenizer to protect certain tokens from further processing (e.g. see the uris tokenizer).
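As an illustration, here is a minimal sketch of such a protecting tokenizer (the hashtags name and its behaviour are hypothetical, not part of the library): it emits Left for tokens starting with '#' so that later stages leave them untouched, and Right for everything else.

    hashtags :: Tokenizer
    hashtags t@('#':_) = E [Left t]   -- frozen: later tokenizers pass it through unchanged
    hashtags t         = E [Right t]  -- open: handed on for further splitting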
You can define your own custom tokenizer pipelines by chaining tokenizers together:
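For example, a sketch along these lines would work (myTokenizer is an illustrative name, not part of the library; >=> is Kleisli composition from Control.Monad, available here because EitherList String is a Monad):

    myTokenizer :: Tokenizer
    myTokenizer = whitespace >=> allPunctuation

run myTokenizer then applies the whole pipeline to an input String and unwraps the result back to a plain [String].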
whitespace :: Tokenizer Source #
Split the string on whitespace. This is just a wrapper for Data.List.words.
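A usage sketch (the example sentence is arbitrary; the output is what Data.List.words produces, unwrapped by run):

    >>> run whitespace "Don't panic, said the sign."
    ["Don't","panic,","said","the","sign."]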
punctuation :: Tokenizer Source #
Split off initial and final punctuation.
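A usage sketch, chained after whitespace (the output shown is the expected result of splitting leading and trailing punctuation into separate tokens):

    >>> run (whitespace >=> punctuation) "Hey, you!"
    ["Hey",",","you","!"]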
finalPunctuation :: Tokenizer Source #
Split off word-final punctuation.
initialPunctuation :: Tokenizer Source #
Split off word-initial punctuation.
allPunctuation :: Tokenizer Source #
Split tokens on transitions between punctuation and non-punctuation characters. This tokenizer is not included in the defaultTokenizer pipeline because dealing with word-internal punctuation is quite application-specific.
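A usage sketch (expected output when every punctuation/non-punctuation boundary becomes a token boundary, including word-internal ones):

    >>> run (whitespace >=> allPunctuation) "pre-processing, e.g. tokenization"
    ["pre","-","processing",",","e",".","g",".","tokenization"]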
contractions :: Tokenizer Source #
Split common contractions off and freeze them. Currently deals with: 'm, 's, 'd, 've, 'll.
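A usage sketch (expected output; the split-off suffixes are frozen as Left inside the pipeline, but run unwraps them back into ordinary strings):

    >>> run (whitespace >=> contractions) "They'll have what I'm having"
    ["They","'ll","have","what","I","'m","having"]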