{-# LANGUAGE FlexibleInstances #-}
module Control.Monad.Tokenizer.Streaming (
TokenizerT,
runTokenizerT,
runTokenizerCST,
C.untilEOT,
C.peek,
C.isEOT,
C.lookAhead,
C.walk,
C.walkBack,
C.pop,
C.walkWhile,
C.walkFold,
C.emit,
C.discard,
C.restore,
Tokenizable(..)
) where
import Control.Monad
import Control.Monad.Trans
import Data.Monoid
import qualified Streaming.Prelude as S
import Streaming
import qualified Control.Monad.Tokenizer.Class as C
import Control.Monad.Tokenizer.Char8.Lazy ()
import Control.Monad.Tokenizer.Char8.Strict ()
import qualified Data.Text as T
import qualified Data.Text.Lazy as LT
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString.Lazy.Char8 as LBS
newtype TokenizerT t m a = TokenizerT {
runTokenizerT' :: (t,Stream (Of t) m ()) -> Stream (Of t) m (a,t,Stream (Of t) m ())
}
runTokenizerCST :: (Tokenizable t,Monad m) => TokenizerT t m a -> Stream (Of t) m () -> Stream (Of t) m a
runTokenizerCST tok ins = do
(a,_,_) <- runTokenizerT' tok (mempty,ins)
return a
runTokenizerT :: (Tokenizable t,Monad m) => TokenizerT t m a -> Stream (Of t) m () -> Stream (Of t) m a
runTokenizerT tok = runTokenizerCST tok . S.map C.tlower
class (C.Tokenizable t, Monoid t) => Tokenizable t where
tsingleton :: Char -> t
tinit :: t -> t
tlast :: t -> Char
instance Tokenizable [Char] where
tsingleton = pure
tinit = init
tlast = last
instance Tokenizable T.Text where
tsingleton = T.singleton
tinit = T.init
tlast = T.last
instance Tokenizable LT.Text where
tsingleton = LT.singleton
tinit = LT.init
tlast = LT.last
instance Tokenizable BS.ByteString where
tsingleton = BS.singleton
tinit = BS.init
tlast = BS.last
instance Tokenizable LBS.ByteString where
tsingleton = LBS.singleton
tinit = LBS.init
tlast = LBS.last
instance Monad m => Functor (TokenizerT t m) where
fmap = liftM
instance Monad m => Applicative (TokenizerT t m) where
(<*>) = ap
pure = return
instance Monad m => Monad (TokenizerT t m) where
return a = TokenizerT $ \(v,rem) -> return (a,v,rem)
m >>= f = TokenizerT $ \(v,rem) -> do
(a1,v1,rem1) <- runTokenizerT' m (v,rem)
(a2,v2,rem2) <- runTokenizerT' (f a1) (v1,rem1)
return (a2,v2,rem2)
instance MonadTrans (TokenizerT t) where
lift m = TokenizerT $ \(v,rem) -> do
a <- lift m
return (a,v,rem)
uncons1 :: (Monad m,Tokenizable t) =>
Stream (Of t) m a -> m (Maybe (Char,Stream (Of t) m a))
uncons1 stream = do
muc <- S.uncons stream
case muc of
Nothing -> return Nothing
Just (one,more) | C.tnull one -> uncons1 more
Just (one,more) ->
let (th, tt) = (C.thead one, C.ttail one)
more' | C.tnull tt = more
| otherwise = S.cons tt more
in return $ Just (th, more')
unconsn :: (Monad m,Tokenizable t) => Int ->
Stream (Of t) m a -> m ([Char],Stream (Of t) m a)
unconsn 0 stream = return ([],stream)
unconsn n stream = do
muc <- uncons1 stream
case muc of
Nothing -> return ([],stream)
Just (h,stream') -> do
(t,stream'') <- unconsn (n-1) stream'
return (h:t,stream'')
instance (Monad m, Tokenizable t) => C.MonadTokenizer (TokenizerT t m) where
walk = void C.pop
pop = TokenizerT $ \(v,rem) -> do
muc <- lift $ uncons1 rem
case muc of
Nothing -> return ('\0',v,rem)
Just (h,rem') -> return (h,v<>tsingleton h,rem')
peek = TokenizerT $ \(v,rem) -> do
muc <- lift $ uncons1 rem
case muc of
Nothing -> return ('\0',v,rem)
Just (h,_) -> return (h,v,rem)
restore = TokenizerT $ \(v,rem) -> return ((),mempty,S.cons v rem)
emit = TokenizerT $ \(v,rem) -> S.yield v >> return ((),mempty,rem)
discard = TokenizerT $ \(v,rem) -> return ((),mempty,rem)
isEOT = TokenizerT $ \(v,rem) -> do
muc <- lift $ S.uncons rem
case muc of
Nothing -> return (True,v,rem)
Just _ -> return (False,v,rem)
lookAhead cs = TokenizerT $ \(v,rem) -> do
(cs',_) <- lift $ unconsn (length cs) rem
return (cs == cs', v, rem)
walkBack = TokenizerT $ \(v,rem) ->
if C.tnull v
then return ((),v,rem)
else return ((),tinit v,S.cons (tsingleton $ tlast v) rem)
words' :: C.MonadTokenizer m => m ()
words' = C.untilEOT $ do
c <- C.pop
if c `elem` " \t\n\r"
then C.discard
else do
C.walkWhile (\c -> not (c `elem` " \t\n\r"))
C.emit