Safe Haskell	Safe
Language	Haskell2010

Control.Monad.Tokenizer

Contents

Monad
Tests
Movement
Transactions
Embedding
Conversion
Text types

Description

A monad for writing pure tokenizers in an imperative-looking way.

Main idea: You walk through the input string like a turtle, and everytime you find a token boundary, you call emit. If some specific kinds of tokens should be suppressed, you can discard them instead (or filter afterwards).

This module supports strict text, lazy text, and strings, though the package also provides support for ASCII bytestrings in separate modules.

Example for a simple tokenizer, that splits words by whitespace and discards stop symbols:

tokenizeWords :: T.Text -> [T.Text]
tokenizeWords = runTokenizer $ untilEOT $ do
  c <- pop
  if isStopSym c
    then discard
    else if c `elem` ("  \t\r\n" :: [Char])
         then discard
         else do
           walkWhile (\c -> (c=='_') || not (isSpace c || isPunctuation' c))
           emit

Synopsis

data Tokenizer t a
runTokenizer :: Tokenizable t => Tokenizer t () -> t -> [t]
runTokenizerCS :: Tokenizable t => Tokenizer t () -> t -> [t]
untilEOT :: MonadTokenizer m => m () -> m ()
peek :: MonadTokenizer m => m Char
isEOT :: MonadTokenizer m => m Bool
lookAhead :: MonadTokenizer m => [Char] -> m Bool
walk :: MonadTokenizer m => m ()
walkBack :: MonadTokenizer m => m ()
pop :: MonadTokenizer m => m Char
walkWhile :: MonadTokenizer m => (Char -> Bool) -> m ()
walkFold :: MonadTokenizer m => a -> (Char -> a -> Maybe a) -> m ()
emit :: MonadTokenizer m => m ()
discard :: MonadTokenizer m => m ()
restore :: MonadTokenizer m => m ()
embed :: Tokenizable t => ((t, t) -> (a, [t], t)) -> Tokenizer t a
embed_ :: Tokenizable t => ((t, t) -> ([t], t)) -> Tokenizer t ()
discardAndEmbed :: Tokenizable t => (t -> (a, [t], t)) -> Tokenizer t a
convert :: (Tokenizable t, IsString t, Tokenizable s, IsString s) => Tokenizer s a -> Tokenizer t a
convertWith :: (s -> t) -> (t -> s) -> Tokenizer s a -> Tokenizer t a
class Tokenizable t where
- tnull :: t -> Bool
- thead :: t -> Char
- ttail :: t -> t
- ttake :: Int -> t -> t
- tdrop :: Int -> t -> t
- tlower :: t -> t

Monad

data Tokenizer t a Source #

Tokenizer monad. Use runTokenizer or runTokenizerCS to run it

Instances

Monad (Tokenizer t) Source #
Instance details Defined in Control.Monad.Tokenizer Methods (>>=) :: Tokenizer t a -> (a -> Tokenizer t b) -> Tokenizer t b # (>>) :: Tokenizer t a -> Tokenizer t b -> Tokenizer t b # return :: a -> Tokenizer t a # fail :: String -> Tokenizer t a #
Functor (Tokenizer t) Source #
Instance details Defined in Control.Monad.Tokenizer Methods fmap :: (a -> b) -> Tokenizer t a -> Tokenizer t b # (<$) :: a -> Tokenizer t b -> Tokenizer t a #
Applicative (Tokenizer t) Source #
Instance details Defined in Control.Monad.Tokenizer Methods pure :: a -> Tokenizer t a # (<>) :: Tokenizer t (a -> b) -> Tokenizer t a -> Tokenizer t b # liftA2 :: (a -> b -> c) -> Tokenizer t a -> Tokenizer t b -> Tokenizer t c # (>) :: Tokenizer t a -> Tokenizer t b -> Tokenizer t b # (<*) :: Tokenizer t a -> Tokenizer t b -> Tokenizer t a #
Tokenizable t => MonadTokenizer (Tokenizer t) Source #
Instance details Defined in Control.Monad.Tokenizer Methods walk :: Tokenizer t () Source # walkBack :: Tokenizer t () Source # peek :: Tokenizer t Char Source # restore :: Tokenizer t () Source # emit :: Tokenizer t () Source # discard :: Tokenizer t () Source # isEOT :: Tokenizer t Bool Source # lookAhead :: [Char] -> Tokenizer t Bool Source #

runTokenizer :: Tokenizable t => Tokenizer t () -> t -> [t] Source #

Split a string into tokens using the given tokenizer

runTokenizerCS :: Tokenizable t => Tokenizer t () -> t -> [t] Source #

Split a string into tokens using the given tokenizer, case sensitive version

untilEOT :: MonadTokenizer m => m () -> m () Source #

Repeat a given tokenizer as long as the end of text is not reached

Tests

peek :: MonadTokenizer m => m Char Source #

Peek the current character

isEOT :: MonadTokenizer m => m Bool Source #

Have I reached the end of the input text?

lookAhead :: MonadTokenizer m => [Char] -> m Bool Source #

Check if the next input chars agree with the given string

Movement

walk :: MonadTokenizer m => m () Source #

Proceed to the next character

walkBack :: MonadTokenizer m => m () Source #

Walk back to the previous character, unless it was discarded/emitted.

pop :: MonadTokenizer m => m Char Source #

Peek the current character and proceed

walkWhile :: MonadTokenizer m => (Char -> Bool) -> m () Source #

Proceed as long as a given function succeeds

walkFold :: MonadTokenizer m => a -> (Char -> a -> Maybe a) -> m () Source #

Proceed as long as a given fold returns Just (generalization of walkWhile)

Transactions

emit :: MonadTokenizer m => m () Source #

Break at the current position and emit the scanned token

discard :: MonadTokenizer m => m () Source #

Break at the current position and discard the scanned token

restore :: MonadTokenizer m => m () Source #

Restore the state after the last emit/discard.

Embedding

embed :: Tokenizable t => ((t, t) -> (a, [t], t)) -> Tokenizer t a Source #

Embed a pure tokenizer into the monad. The arguments to the function are (visited string, remaining string), and the return value is expected to be (result, emitted tokens, remaining string).

embed_ :: Tokenizable t => ((t, t) -> ([t], t)) -> Tokenizer t () Source #

Embed a pure tokenizer into the monad without a result.

discardAndEmbed :: Tokenizable t => (t -> (a, [t], t)) -> Tokenizer t a Source #

Embed a pure tokenizer into the monad. The visited string is discarded, and the given function is run on the unvisited part. The return value is expected to be (result, emitted tokens, remaining string).

Conversion

convert :: (Tokenizable t, IsString t, Tokenizable s, IsString s) => Tokenizer s a -> Tokenizer t a Source #

Natural transformation to convert between tokenizers of different text types. Note that this operation does not perform encoding/decoding (i.e. converting from ByteString to Text does not decode Unicode characters). To do so, use convertWith a provide the correct encoding/decoding functions.

convertWith :: (s -> t) -> (t -> s) -> Tokenizer s a -> Tokenizer t a Source #

Natural transformation to convert between tokenizers of different text types, using the given conversion functions.

Text types

class Tokenizable t where Source #

Text types that can be split by the Tokenizer monad. In this module, instances are provided for String, strict Text, and lazy Text. If you are dealing with ASCII ByteStrings, you can find instances in the modules Control.Monad.Tokenizer.Char8.Strict and Control.Monad.Tokenizer.Char8.Lazy

Methods

tnull :: t -> Bool Source #

thead :: t -> Char Source #

ttail :: t -> t Source #

ttake :: Int -> t -> t Source #

tdrop :: Int -> t -> t Source #

tlower :: t -> t Source #

Instances

Tokenizable ByteString Source #	Assuming ASCII encoding
Instance details Defined in Control.Monad.Tokenizer.Char8.Lazy Methods tnull :: ByteString -> Bool Source # thead :: ByteString -> Char Source # ttail :: ByteString -> ByteString Source # ttake :: Int -> ByteString -> ByteString Source # tdrop :: Int -> ByteString -> ByteString Source # tlower :: ByteString -> ByteString Source #
Tokenizable ByteString Source #	Assuming ASCII encoding
Instance details Defined in Control.Monad.Tokenizer.Char8.Strict Methods tnull :: ByteString -> Bool Source # thead :: ByteString -> Char Source # ttail :: ByteString -> ByteString Source # ttake :: Int -> ByteString -> ByteString Source # tdrop :: Int -> ByteString -> ByteString Source # tlower :: ByteString -> ByteString Source #
Tokenizable Text Source #
Instance details Defined in Control.Monad.Tokenizer.Class Methods tnull :: Text -> Bool Source # thead :: Text -> Char Source # ttail :: Text -> Text Source # ttake :: Int -> Text -> Text Source # tdrop :: Int -> Text -> Text Source # tlower :: Text -> Text Source #
Tokenizable Text Source #
Instance details Defined in Control.Monad.Tokenizer.Class Methods tnull :: Text -> Bool Source # thead :: Text -> Char Source # ttail :: Text -> Text Source # ttake :: Int -> Text -> Text Source # tdrop :: Int -> Text -> Text Source # tlower :: Text -> Text Source #
Tokenizable [Char] Source #
Instance details Defined in Control.Monad.Tokenizer.Class Methods tnull :: [Char] -> Bool Source # thead :: [Char] -> Char Source # ttail :: [Char] -> [Char] Source # ttake :: Int -> [Char] -> [Char] Source # tdrop :: Int -> [Char] -> [Char] Source # tlower :: [Char] -> [Char] Source #