-- | Functions for running 'M.TokenizerT' on Unicode bytestring streams.
--
-- For more information on how to work with 'M.TokenizerT', have a look at the
-- module "Control.Monad.Tokenizer.Streaming". For more information on writing
-- tokenizers, have a look at the module "Control.Monad.Tokenizer" from the
-- package tokenizer-monad.
--
-- Example of a simple tokenizer that splits words at whitespace and discards stop symbols:
--
-- > tokenizeWords :: Monad m => Q.ByteString m () -> Stream (Of T.Text) m ()
-- > tokenizeWords = runUtf8TokenizerT $ untilEOT $ do
-- >   c <- pop
-- >   if isStopSym c
-- >     then discard
-- >     else if c `elem` ("  \t\r\n" :: [Char])
-- >          then discard
-- >          else do
-- >            walkWhile (\c -> (c=='_') || not (isSpace c || isPunctuation' c))
-- >            emit

module Control.Monad.Tokenizer.Streaming.Decode (
  -- * UTF-8
  runUtf8TokenizerT,
  runUtf8TokenizerCST,
  -- * UTF-16
  runUtf16LETokenizerT,
  runUtf16LETokenizerCST,
  runUtf16BETokenizerT,
  runUtf16BETokenizerCST,
  -- * UTF-32
  runUtf32LETokenizerT,
  runUtf32LETokenizerCST,
  runUtf32BETokenizerT,
  runUtf32BETokenizerCST,
  -- * Helpers
  module Control.Monad.Tokenizer.Streaming,
  -- NOTE(review): Control.Monad.Tokenizer.Streaming is only imported
  -- qualified in this file. A "module M" export item re-exports only names
  -- that are in scope both unqualified and as M.name, so the item above
  -- looks like it exports nothing (GHC reports this under -Wdodgy-exports).
  -- TODO confirm; an additional unqualified import would be needed for the
  -- re-export to take effect.
  runDecodingTokenizerT,
  runDecodingTokenizerCST,
  decodeStream
  ) where

import qualified Control.Monad.Tokenizer.Streaming as M
import qualified Control.Monad.Tokenizer.Streaming
import Streaming
import qualified Streaming.Prelude as S
import qualified Data.Text as T
import qualified Data.ByteString.Streaming as Q
import qualified Data.ByteString as BS
import Data.Streaming.Text

-- | Decode a Unicode bytestring stream into a stream of 'T.Text' chunks.
--
-- The first argument is an incremental decoder from "Data.Streaming.Text"
-- (for example 'decodeUtf8'). The input is processed chunk by chunk:
--
-- * Every non-empty chunk is fed to the /current/ decoder state. On
--   'DecodeResultSuccess' the decoded text is yielded and the continuation
--   carried by the result becomes the decoder for the next chunk, so bytes
--   of a code point that straddles a chunk boundary are carried over rather
--   than being decoded from a fresh state.
--
-- * Empty chunks are skipped; an empty chunk is only fed to the decoder at
--   the very end, to flush it.
--
-- * When the input is exhausted, the current decoder state is flushed with
--   an empty chunk. If that yields 'DecodeResultFailure', the text decoded
--   so far is yielded and the stream aborts with
--   @fail "Decoding ended ungracefully"@.
--
-- * If a chunk yields 'DecodeResultFailure', the successfully decoded text
--   is yielded and the stream aborts with @fail "Decoding error"@.
--
-- Yielded 'T.Text' chunks may be empty.
decodeStream :: Monad m => (BS.ByteString -> DecodeResult) -> Q.ByteString m () -> Stream (Of T.Text) m ()
decodeStream decoder = go decoder . Q.toChunks
  where
    -- 'decode' is the decoder state to use for the next chunk. It starts out
    -- as the caller's decoder and is replaced by the continuation of every
    -- successful decoding step.
    go decode chunks = do
      next <- lift $ S.uncons chunks
      case next of
        Nothing ->
          -- End of input: flush the current state so that an incomplete
          -- trailing code point is reported instead of silently dropped.
          case decode BS.empty of
            DecodeResultFailure decoded _ -> do
              S.yield decoded
              fail "Decoding ended ungracefully"
            DecodeResultSuccess decoded _ ->
              S.yield decoded
        Just (chunk, rest)
          -- Never feed an empty chunk mid-stream; that is reserved for the
          -- final flush above.
          | BS.null chunk -> go decode rest
          | otherwise ->
              case decode chunk of
                DecodeResultFailure decoded _ -> do
                  S.yield decoded
                  fail "Decoding error"
                DecodeResultSuccess decoded continue -> do
                  S.yield decoded
                  go continue rest

-- | Run a tokenizer on a bytestring stream that is decoded on the fly.
--
-- The bytes are turned into 'T.Text' chunks by 'decodeStream' using the
-- given decoder, and the resulting text stream is handed to
-- 'M.runTokenizerCST' together with the tokenizer. See 'M.runTokenizerCST'
-- for how this runner differs from 'M.runTokenizerT'.
runDecodingTokenizerCST :: Monad m => (BS.ByteString -> DecodeResult) -> M.TokenizerT T.Text m a -> Q.ByteString m () -> Stream (Of T.Text) m a
runDecodingTokenizerCST decoder tok =
  M.runTokenizerCST tok . decodeStream decoder

-- | Run a tokenizer on a bytestring stream that is decoded on the fly.
--
-- The bytes are turned into 'T.Text' chunks by 'decodeStream' using the
-- given decoder, and the resulting text stream is handed to
-- 'M.runTokenizerT' together with the tokenizer.
runDecodingTokenizerT :: Monad m => (BS.ByteString -> DecodeResult) -> M.TokenizerT T.Text m a -> Q.ByteString m () -> Stream (Of T.Text) m a
runDecodingTokenizerT decoder tok =
  M.runTokenizerT tok . decodeStream decoder

-- | Tokenize a UTF-8 byte stream; 'runDecodingTokenizerCST' with 'decodeUtf8'.
runUtf8TokenizerCST
  :: Monad m
  => M.TokenizerT T.Text m a
  -> Q.ByteString m ()
  -> Stream (Of T.Text) m a
runUtf8TokenizerCST = runDecodingTokenizerCST decodeUtf8

-- | Tokenize a UTF-8 byte stream; 'runDecodingTokenizerT' with 'decodeUtf8'.
runUtf8TokenizerT
  :: Monad m
  => M.TokenizerT T.Text m a
  -> Q.ByteString m ()
  -> Stream (Of T.Text) m a
runUtf8TokenizerT = runDecodingTokenizerT decodeUtf8

-- | Tokenize a little-endian UTF-16 byte stream; 'runDecodingTokenizerCST'
-- with 'decodeUtf16LE'.
runUtf16LETokenizerCST
  :: Monad m
  => M.TokenizerT T.Text m a
  -> Q.ByteString m ()
  -> Stream (Of T.Text) m a
runUtf16LETokenizerCST = runDecodingTokenizerCST decodeUtf16LE

-- | Tokenize a little-endian UTF-16 byte stream; 'runDecodingTokenizerT'
-- with 'decodeUtf16LE'.
runUtf16LETokenizerT
  :: Monad m
  => M.TokenizerT T.Text m a
  -> Q.ByteString m ()
  -> Stream (Of T.Text) m a
runUtf16LETokenizerT = runDecodingTokenizerT decodeUtf16LE

-- | Tokenize a big-endian UTF-16 byte stream; 'runDecodingTokenizerCST'
-- with 'decodeUtf16BE'.
runUtf16BETokenizerCST
  :: Monad m
  => M.TokenizerT T.Text m a
  -> Q.ByteString m ()
  -> Stream (Of T.Text) m a
runUtf16BETokenizerCST = runDecodingTokenizerCST decodeUtf16BE

-- | Tokenize a big-endian UTF-16 byte stream; 'runDecodingTokenizerT'
-- with 'decodeUtf16BE'.
runUtf16BETokenizerT
  :: Monad m
  => M.TokenizerT T.Text m a
  -> Q.ByteString m ()
  -> Stream (Of T.Text) m a
runUtf16BETokenizerT = runDecodingTokenizerT decodeUtf16BE

-- | Tokenize a little-endian UTF-32 byte stream; 'runDecodingTokenizerCST'
-- with 'decodeUtf32LE'.
runUtf32LETokenizerCST
  :: Monad m
  => M.TokenizerT T.Text m a
  -> Q.ByteString m ()
  -> Stream (Of T.Text) m a
runUtf32LETokenizerCST = runDecodingTokenizerCST decodeUtf32LE

-- | Tokenize a little-endian UTF-32 byte stream; 'runDecodingTokenizerT'
-- with 'decodeUtf32LE'.
runUtf32LETokenizerT
  :: Monad m
  => M.TokenizerT T.Text m a
  -> Q.ByteString m ()
  -> Stream (Of T.Text) m a
runUtf32LETokenizerT = runDecodingTokenizerT decodeUtf32LE

-- | Tokenize a big-endian UTF-32 byte stream; 'runDecodingTokenizerCST'
-- with 'decodeUtf32BE'.
runUtf32BETokenizerCST
  :: Monad m
  => M.TokenizerT T.Text m a
  -> Q.ByteString m ()
  -> Stream (Of T.Text) m a
runUtf32BETokenizerCST = runDecodingTokenizerCST decodeUtf32BE

-- | Tokenize a big-endian UTF-32 byte stream; 'runDecodingTokenizerT'
-- with 'decodeUtf32BE'.
runUtf32BETokenizerT
  :: Monad m
  => M.TokenizerT T.Text m a
  -> Q.ByteString m ()
  -> Stream (Of T.Text) m a
runUtf32BETokenizerT = runDecodingTokenizerT decodeUtf32BE