Safe Haskell | None |
---|---|
Language | Haskell2010 |
Functions for running TokenizerT
on Unicode bytestring streams.
For more information on how to work with TokenizerT, have a look at the
module Control.Monad.Tokenizer.Streaming. For more information on writing
tokenizers, have a look at the module Control.Monad.Tokenizer from the
package tokenizer-monad.
Example of a simple tokenizer that splits words by whitespace and discards stop symbols:
    tokenizeWords :: Monad m => Q.ByteString m () -> Stream (Of T.Text) m ()
    tokenizeWords = runUtf8TokenizerT $ untilEOT $ do
      c <- pop
      if isStopSym c
        then discard
        else if c `elem` (" \t\r\n" :: [Char])
          then discard
          else do
            walkWhile (\c -> (c=='_') || not (isSpace c || isPunctuation' c))
            emit
Synopsis
- runUtf8TokenizerT :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a
- runUtf8TokenizerCST :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a
- runUtf16LETokenizerT :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a
- runUtf16LETokenizerCST :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a
- runUtf16BETokenizerT :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a
- runUtf16BETokenizerCST :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a
- runUtf32LETokenizerT :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a
- runUtf32LETokenizerCST :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a
- runUtf32BETokenizerT :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a
- runUtf32BETokenizerCST :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a
- module Control.Monad.Tokenizer.Streaming
- module Control.Monad.Tokenizer.Streaming
- runDecodingTokenizerT :: Monad m => (ByteString -> DecodeResult) -> TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a
- runDecodingTokenizerCST :: Monad m => (ByteString -> DecodeResult) -> TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a
- decodeStream :: Monad m => (ByteString -> DecodeResult) -> ByteString m () -> Stream (Of Text) m ()
UTF-8
runUtf8TokenizerT :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a Source #
runUtf8TokenizerCST :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a Source #
UTF-16
runUtf16LETokenizerT :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a Source #
runUtf16LETokenizerCST :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a Source #
runUtf16BETokenizerT :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a Source #
runUtf16BETokenizerCST :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a Source #
UTF-32
runUtf32LETokenizerT :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a Source #
runUtf32LETokenizerCST :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a Source #
runUtf32BETokenizerT :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a Source #
runUtf32BETokenizerCST :: Monad m => TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a Source #
Helpers
runDecodingTokenizerT :: Monad m => (ByteString -> DecodeResult) -> TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a Source #
runDecodingTokenizerCST :: Monad m => (ByteString -> DecodeResult) -> TokenizerT Text m a -> ByteString m () -> Stream (Of Text) m a Source #
decodeStream :: Monad m => (ByteString -> DecodeResult) -> ByteString m () -> Stream (Of Text) m () Source #
Decode a Unicode bytestring stream into a stream of Text chunks.