{-# LANGUAGE RankNTypes #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE GADTs #-} module Data.Warc ( Record(..) , Warc(..) -- * Parsing , parseWarc , iterRecords , produceRecords -- * Encoding , encodeRecord -- * Headers , module Data.Warc.Header ) where import Data.Char (ord) import Pipes hiding (each) import qualified Pipes.ByteString as PBS import Control.Lens import qualified Pipes.Attoparsec as PA import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy.Builder as BB import Data.ByteString (ByteString) import Control.Monad (join) import Control.Monad.Trans.Free import Control.Monad.Trans.State.Strict import Data.Warc.Header -- | A WARC record data Record m r = Record { recHeader :: RecordHeader , recContent :: Producer BS.ByteString m r } instance Monad m => Functor (Record m) where fmap f (Record hdr r) = Record hdr (fmap f r) -- | A WARC archive type Warc m a = FreeT (Record m) m (Producer BS.ByteString m a) -- | Parse a WARC archive. parseWarc :: (Functor m, Monad m) => Producer ByteString m a -> Warc m a parseWarc = loop where loop upstream = FreeT $ do (hdr, rest) <- runStateT (PA.parse header) upstream go hdr rest go mhdr rest | Nothing <- mhdr = return $ Pure rest | Just (Left err) <- mhdr = error $ show err | Just (Right hdr) <- mhdr , Just len <- hdr ^? recHeaders . each . _ContentLength = do let produceBody = fmap consumeWhitespace . view (PBS.splitAt len) consumeWhitespace = PBS.dropWhile isEOL isEOL c = c == ord8 '\r' || c == ord8 '\n' ord8 = fromIntegral . ord return $ Free $ Record hdr $ fmap loop $ produceBody rest -- | Iterate over the 'Record's in a WARC archive iterRecords :: forall m a. Monad m => (forall b. Record m b -> m b) -> Warc m a -> m (Producer BS.ByteString m a) iterRecords f warc = iterT iter warc where iter :: Record m (m (Producer BS.ByteString m a)) -> m (Producer BS.ByteString m a) iter r = join $ f r produceRecords :: forall m o a. Monad m => (forall b. RecordHeader -> Producer BS.ByteString m b -> Producer o m b) -- ^ consume the record producing some output -> Warc m a -- ^ a WARC archive (see 'parseWarc') -> Producer o m (Producer BS.ByteString m a) -- ^ returns any leftovers produceRecords f warc = iterTM iter warc where iter :: Record m (Producer o m (Producer BS.ByteString m a)) -> Producer o m (Producer BS.ByteString m a) iter (Record hdr body) = join $ f hdr body encodeRecord :: Monad m => Record m a -> Producer BS.ByteString m a encodeRecord (Record hdr content) = do PBS.fromLazy $ BB.toLazyByteString $ encodeHeader hdr content