-- | Split incombing bytestrings based on bytestrings. module Pipes.Split.ByteString where import Control.Monad (join,unless) import Control.Monad.Trans.Class (lift) import Data.ByteString (ByteString) import Data.ByteString.Search (indices) import Data.Monoid ((<>)) import Debug.Trace import Pipes (Producer,next,yield) import qualified Data.ByteString as BS type Lens' a b = forall f . Functor f => (b -> f b) -> (a -> f a) -- | Splits bytestrings after each pattern @pat@. Tries to minimize the -- number of intermediate bytestring constructors. -- -- The following function @ske@ expects a string @str@ and a pattern @pat@ -- and then returns a tuple with the splitted bytestrings in @fst@ and the -- return value in @snd@. -- -- The inner parser @parse@ uses @zoom@ to draw the full inner producer, -- which should contain just one bytestring, namely one of the split off -- ones. @parse@ doesn't do anything with the inner producer, except -- returning the contained bytestring. -- -- @parse@ returns @Right $ concat xs@ on a correct parse, and @Left []@ -- once the input has been exhausted. -- -- @ -- ske :: ByteString -> ByteString -> ([ByteString],[ByteString],[ByteString]) -- ske pat str | BS.null pat || BS.null str = ([],[],[]) -- ske pat str = -- let parse = do -- xs <- zoom (splitKeepEnd pat) PP.drawAll -- case xs of -- [] -> return $ Left [] -- xs -> return $ Right $ BS.concat xs -- (a,(b,p)) = runIdentity . P.toListM' $ PP.parsed parse $ PP.yield str -- in (a,b, fst . runIdentity . P.toListM' $ p) -- @ splitKeepEnd :: Monad m => ByteString -> Lens' (Producer ByteString m x) (Producer ByteString m (Producer ByteString m x)) splitKeepEnd pat k p0 = fmap join (k (go BS.empty p0)) where go pre p = do x <- lift (next p) case x of Left r -> return $ return r Right (bs, p') -> do case fnd (pre <> bs) of -- no hit yet, send the bs down, keep some suffix [] -> do unless (BS.null bs) (yield bs) let pfx = BS.drop (BS.length bs - l + 1) bs go pfx p' -- at least one hit, split off the correct part, remainder goes -- back. (k:_) -> do let (y,suf) = BS.splitAt (k - BS.length pre + l) bs yield y return (yield suf >> p') l = BS.length pat fnd = indices pat {-# Inlineable splitKeepEnd #-} -- | Split a string into substrings, where each substring starts with @pat@ -- and continues until just before the next @pat@ (or until there is no -- more input). -- -- Any prefix that does not start with the substring is /kept/! -- -- Since each substring is supposed to start with @pat@, there is a small -- problem. What about a header that prefixes the string we are interested -- in? splitKeepStart :: Monad m => ByteString -> Lens' (Producer ByteString m x) (Producer ByteString m (Producer ByteString m x)) splitKeepStart = splitGeneric (\bs k p l -> BS.splitAt (k - p) bs) {-# Inlineable splitKeepStart #-} -- | Generic splitting function. Takes a bytestring @[a,b,c]@ (where -- @a,b,c@ are substrings of the bytestring!) and performs the split. -- splitGeneric :: Monad m => (ByteString -> Int -> Int -> Int -> (ByteString,ByteString)) -- ^ splitter function -> ByteString -- ^ pattern to split on -> Lens' (Producer ByteString m x) (Producer ByteString m (Producer ByteString m x)) -- ^ lens into the individual split off bytestrings splitGeneric splt pat k p0 = fmap join (k (go BS.empty p0)) where go pre p = do x <- lift (next p) case x of Left r -> do -- yield final split off string unless (BS.null pre) (yield pre) return $ return r Right (bs, p') -> do -- will not search in the part of the prefix that *can not contain* -- the @pat@tern. case fnd ((BS.drop (BS.length pre - l) pre) <> bs) of -- no hit yet, send the prefix down completely, make bs new -- prefix if possible. If either @pre@ or @bs@ are too short, we -- keep @pre <> bs@ for the next round. This should not happen if -- the pattern is reasonably short compared to the size of the -- bytestring chunks. [] -> do if (BS.length bs >= l) then yield pre >> go bs p' else go (pre <> bs) p' -- at least one hit, split off the correct part, remainder goes -- back. (k:_) -> do let (y,suf) = splt bs k (BS.length pre) l yield y return (yield suf >> p') l = BS.length pat fnd = indices pat {-# Inline splitGeneric #-} -- manual splitting, for @splitKeepEnd@ referenceByteStringTokenizer pat str | BS.null pat || BS.null str = [] referenceByteStringTokenizer pat str = (h `BS.append` BS.take (BS.length pat) t) : if BS.null t then [] else referenceByteStringTokenizer pat (BS.drop (BS.length pat) t) where (h,t) = BS.breakSubstring pat str