{-| Module : Data.CSV.Sip Description : works with CSV files Copyright : (C) Jonathan Lamothe License : GPL-3.0-or-later Maintainer : jonathan@jlamothe.net Stability : experimental Portability : POSIX This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . -} {-# LANGUAGE LambdaCase, OverloadedStrings #-} module Data.CSV.Sip ( decodeRows, decodeRawRows, decodeUTF8, toBytes, ) where import Conduit (ConduitT, await, mapC, yield, (.|)) import Control.Monad (unless) import Control.Monad.Trans.Class (lift) import Control.Monad.Trans.State (StateT, evalStateT, get, gets, modify) import qualified Data.ByteString as BS import Data.Maybe (fromMaybe) import qualified Data.Text as T import Data.Text.Encoding (decodeUtf8') import Data.Word (Word8) -- | decode the rows from a stream of ByteStrings decodeRows :: Monad m => ConduitT BS.ByteString [T.Text] m () decodeRows = decodeRawRows .| mapC (map $ fromMaybe "" . decodeUTF8) -- | decode the rows returning raw ByteStrings instead of text decodeRawRows :: Monad m => ConduitT BS.ByteString [BS.ByteString] m () decodeRawRows = toBytes .| evalStateT decodeLoop newDecodeState -- | decode a raw ByteString into Text (if possible) decodeUTF8 :: BS.ByteString -> Maybe T.Text decodeUTF8 bs = case decodeUtf8' bs of Left _ -> Nothing Right txt -> Just txt -- | convert a stream to ByteStrings to a string of bytes toBytes :: Monad m => ConduitT BS.ByteString Word8 m () toBytes = await >>= \case Just bs -> do let bytes = BS.unpack bs mapM_ yield bytes toBytes Nothing -> return () -- Internal data DecodeState = DecodeState { isQuoted :: Bool , fields :: [BS.ByteString] , collected :: BS.ByteString } deriving (Eq, Show) type Decoder m = StateT DecodeState (ConduitT Word8 [BS.ByteString] m) () type Modifier = DecodeState -> DecodeState newDecodeState :: DecodeState newDecodeState = DecodeState { isQuoted = False , fields = [] , collected = "" } -- Decoders decodeLoop :: Monad m => Decoder m decodeLoop = lift await >>= \case Just byte -> case byte of 0x22 -> processQuote 0x2c -> processComma 0x0d -> processCR 0x0a -> processLF _ -> performAction $ addByte byte Nothing -> cleanup processQuote :: Monad m => Decoder m processQuote = gets isQuoted >>= \case True -> processQuotedQuote False -> processUnquotedQuote processComma :: Monad m => Decoder m processComma = gets isQuoted >>= performAction . \case True -> addByte 0x2c False -> commitField processCR :: Monad m => Decoder m processCR = gets isQuoted >>= \case True -> performAction $ addByte 0xd False -> endRow processLF :: Monad m => Decoder m processLF = gets isQuoted >>= \case True -> performAction $ addByte 0xa False -> endRow processQuotedQuote :: Monad m => Decoder m processQuotedQuote = lift await >>= \case Just byte -> case byte of 0x22 -> performAction $ addByte 0x22 -- quote 0x2c -> performAction commitField -- comma 0x0d -> commitRow -- carriage return 0x0a -> commitRow -- line feed _ -> corruptedField Nothing -> cleanup processUnquotedQuote :: Monad m => Decoder m processUnquotedQuote = gets (BS.null . collected) >>= \case True -> performAction setQuoted False -> corruptedField endRow :: Monad m => Decoder m endRow = do s <- get if null (fields s) && BS.null (collected s) then decodeLoop else commitRow commitRow :: Monad m => Decoder m commitRow = do modify commitField gets fields >>= lift . yield performAction dropFields corruptedField :: Monad m => Decoder m corruptedField = do modify dropField ignoreField ignoreField :: Monad m => Decoder m ignoreField = lift await >>= \case Just byte -> case byte of 0x2c -> performAction commitField -- comma 0x0d -> commitRow _ -> ignoreField Nothing -> cleanup cleanup :: Monad m => Decoder m cleanup = do gets isQuoted >>= \case True -> modify $ commitField . dropField False -> gets (BS.null . collected) >>= \case True -> return () False -> modify commitField fs <- gets fields unless (null fs) $ lift $ yield fs performAction :: Monad m => Modifier -> Decoder m performAction f = do modify f decodeLoop -- Modifiers addByte :: Word8 -> Modifier addByte b s = let collected' = BS.snoc (collected s) b in s { collected = collected' } commitField :: Modifier commitField s = let isQuoted' = False fields' = fields s ++ [collected s] collected' = "" in s { isQuoted = isQuoted' , fields = fields' , collected = collected' } dropFields :: Modifier dropFields s = s { fields = [] } dropField :: Modifier dropField s = s { isQuoted = False , collected = "" } setQuoted :: Modifier setQuoted s = s { isQuoted = True } --jl