{-# LANGUAGE FlexibleInstances #-}

-- | A monad transformer for tokenizing streams of text.
--
-- Main idea: You 'walk' through the input string like a turtle, and everytime
-- you find a token boundary, you call 'emit'. If some specific kinds of tokens
-- should be suppressed, you can 'discard' them instead (or filter afterwards).
--
-- The package tokenizer-monad provides a monad (and class) for tokenizing
-- pure text/strings in memory. This package supplements it with a transformer
-- to work on impure Streams of text/strings. Your existing tokenizers can
-- be ported without code changes.
--
-- This module supports strict text, lazy text, strings, lazy ASCII bytestrings and strict bytestrings. For working with Unicode encodings, have a look at "Control.Monad.Tokenizer.Streaming.Decode".
--
-- For examples on how to write tokenizers, have a look at the package
-- tokenizer-monad. Here's an example on how to use it with streams:
--
-- Example for a simple tokenizer, that splits words by whitespace and discards stop symbols:
--
-- > tokenizeWords :: Monad m => Stream (Of T.Text) m () -> Stream (Of T.Text) m ()
-- > tokenizeWords = runTokenizerT $ untilEOT $ do
-- >   c <- pop
-- >   if isStopSym c
-- >     then discard
-- >     else if c `elem` ("  \t\r\n" :: [Char])
-- >          then discard
-- >          else do
-- >            walkWhile (\c -> (c=='_') || not (isSpace c || isPunctuation' c))
-- >            emit

module Control.Monad.Tokenizer.Streaming (
  -- * Monad transformer
  TokenizerT,
  runTokenizerT,
  runTokenizerCST,
  C.untilEOT,
  -- * Tests
  C.peek,
  C.isEOT,
  C.lookAhead,
  -- * Movement
  C.walk,
  C.walkBack,
  C.pop,
  C.walkWhile,
  C.walkFold,
  -- * Transactions
  C.emit,
  C.discard,
  C.restore,
  -- * Text types
  Tokenizable(..)
  ) where

import Control.Monad
import Control.Monad.Trans
import Data.Monoid
import qualified Streaming.Prelude as S
import Streaming
import qualified Control.Monad.Tokenizer.Class as C
import Control.Monad.Tokenizer.Char8.Lazy ()
import Control.Monad.Tokenizer.Char8.Strict ()
import qualified Data.Text as T
import qualified Data.Text.Lazy as LT
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString.Lazy.Char8 as LBS

-- | Tokenizer transformer. Use 'runTokenizerT' or 'runTokenizerCST' to run it
newtype TokenizerT t m a = TokenizerT {
  runTokenizerT' :: (t,Stream (Of t) m ()) -> Stream (Of t) m (a,t,Stream (Of t) m ())
  }
-- (visited, remaining) -> Stream (of emissions) m (result, visited, remaining)

-- | Split a text stream into tokens using the given tokenizer, case  sensitive version
runTokenizerCST :: (Tokenizable t,Monad m) => TokenizerT t m a -> Stream (Of t) m () -> Stream (Of t) m a
runTokenizerCST tok ins = do
  (a,_,_) <- runTokenizerT' tok (mempty,ins)
  return a

-- | Split a text stream into tokens using the given tokenizer
runTokenizerT :: (Tokenizable t,Monad m) => TokenizerT t m a -> Stream (Of t) m () -> Stream (Of t) m a
runTokenizerT tok = runTokenizerCST tok . S.map C.tlower

-- | Text types that can be split by the TokenizerT transformer. In this module,
-- instances are provided for String, strict Text, and lazy Text.
-- There are also instances for strict and lazy ByteStrings, but keep in mind
-- that they assume ASCII encoding. If you want to apply reasonable decoding,
-- try Control.Monad.Tokenizer.Streaming.Decode.
class (C.Tokenizable t, Monoid t) => Tokenizable t where
  tsingleton :: Char -> t
  tinit :: t -> t
  tlast :: t -> Char

instance Tokenizable [Char] where
  tsingleton = pure
  tinit = init
  tlast = last

instance Tokenizable T.Text where
  tsingleton = T.singleton
  tinit = T.init
  tlast = T.last

instance Tokenizable LT.Text where
  tsingleton = LT.singleton
  tinit = LT.init
  tlast = LT.last

instance Tokenizable BS.ByteString where
  tsingleton = BS.singleton
  tinit = BS.init
  tlast = BS.last

instance Tokenizable LBS.ByteString where
  tsingleton = LBS.singleton
  tinit = LBS.init
  tlast = LBS.last

instance Monad m => Functor (TokenizerT t m) where
  fmap = liftM

instance Monad m => Applicative (TokenizerT t m) where
  (<*>) = ap
  pure = return

instance Monad m => Monad (TokenizerT t m) where
  return a = TokenizerT $ \(v,rem) -> return (a,v,rem)
  m >>= f = TokenizerT $ \(v,rem) -> do
    (a1,v1,rem1) <- runTokenizerT' m (v,rem)
    (a2,v2,rem2) <- runTokenizerT' (f a1) (v1,rem1)
    return (a2,v2,rem2)

instance MonadTrans (TokenizerT t) where
  lift m = TokenizerT $ \(v,rem) -> do
    a <- lift m
    return (a,v,rem)

uncons1 :: (Monad m,Tokenizable t) =>
           Stream (Of t) m a -> m (Maybe (Char,Stream (Of t) m a))
uncons1 stream = do
  muc <- S.uncons stream
  case muc of
    Nothing -> return Nothing
    Just (one,more) | C.tnull one -> uncons1 more
    Just (one,more) ->
      let (th, tt) = (C.thead one, C.ttail one)
          more' | C.tnull tt = more
                | otherwise = S.cons tt more
      in return $ Just (th, more')

unconsn :: (Monad m,Tokenizable t) => Int ->
           Stream (Of t) m a -> m ([Char],Stream (Of t) m a)
unconsn 0 stream = return ([],stream)
unconsn n stream = do
  muc <- uncons1 stream
  case muc of
    Nothing -> return ([],stream)
    Just (h,stream') -> do
      (t,stream'') <- unconsn (n-1) stream'
      return (h:t,stream'')

instance (Monad m, Tokenizable t) => C.MonadTokenizer (TokenizerT t m) where
  walk = void C.pop
  pop = TokenizerT $ \(v,rem) -> do
    muc <- lift $ uncons1 rem
    case muc of
      Nothing -> return ('\0',v,rem)
      Just (h,rem') -> return (h,v<>tsingleton h,rem')
  peek = TokenizerT $ \(v,rem) -> do
    muc <- lift $ uncons1 rem
    case muc of
      Nothing -> return ('\0',v,rem)
      Just (h,_) -> return (h,v,rem)
  restore = TokenizerT $ \(v,rem) -> return ((),mempty,S.cons v rem)
  emit = TokenizerT $ \(v,rem) -> S.yield v >> return ((),mempty,rem)
  discard = TokenizerT $ \(v,rem) -> return ((),mempty,rem)
  isEOT = TokenizerT $ \(v,rem) -> do
    muc <- lift $ S.uncons rem
    case muc of
      Nothing -> return (True,v,rem)
      Just _ -> return (False,v,rem)
  lookAhead cs = TokenizerT $ \(v,rem) -> do
    (cs',_) <- lift $ unconsn (length cs) rem
    return (cs == cs', v, rem)
  walkBack = TokenizerT $ \(v,rem) ->
    if C.tnull v
    then return ((),v,rem)
    else return ((),tinit v,S.cons (tsingleton $ tlast v) rem)

words' :: C.MonadTokenizer m => m ()
words' = C.untilEOT $ do
   c <- C.pop
   if c `elem` " \t\n\r"
     then C.discard
     else do
       C.walkWhile (\c -> not (c `elem` " \t\n\r"))
       C.emit