{-| Module : Data.CSV.Sip Description : works with CSV files Copyright : (C) Jonathan Lamothe License : GPL-3.0-or-later Maintainer : jonathan@jlamothe.net Stability : experimental Portability : POSIX This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . -} {-# LANGUAGE LambdaCase, OverloadedStrings #-} module Data.CSV.Sip ( -- * Working with Files -- ** Read an entire CSV file slurpCSV, slurpRawCSV, slurpLabelledCSV, slurpRawLabelledCSV, -- ** Write an entire CSV file writeCSV, writeRawCSV, -- * Conduits -- ** Producers readFromCSV, readFromCSVRaw, encodeCSV, encodeRawCSV, -- ** Consumers writeToCSV, writeToCSVRaw, -- ** Transformers -- *** Encoding encodeRows, encodeRawRows, -- *** Decoding labelFields, decodeRows, decodeRawRows, decodeUTF8, toBytes, ) where import Conduit ( ConduitT , MonadResource , await , mapC , runConduit , sinkFile , sourceFile , yield , (.|) ) import Control.Monad (unless) import Control.Monad.Trans.Class (lift) import Control.Monad.Trans.State (StateT, evalStateT, get, gets, modify) import qualified Data.ByteString as BS import Data.Conduit.List (consume, sourceList) import qualified Data.Map as M import Data.Maybe (fromMaybe) import qualified Data.Text as T import Data.Text.Encoding (decodeUtf8', encodeUtf8) import Data.Word (Word8) -- | read an entire CSV file slurpCSV :: MonadResource m => FilePath -- ^ the path to the file to read -> m [[T.Text]] slurpCSV file = runConduit $ sourceFile file .| decodeRows .| consume -- | read an entire CSV file in raw mode slurpRawCSV :: MonadResource m => FilePath -- ^ the path to the file to read -> m [[BS.ByteString]] slurpRawCSV file = runConduit $ sourceFile file .| decodeRawRows .| consume -- | read an entire CSV file with a header slurpLabelledCSV :: MonadResource m => FilePath -- ^ the path to the file to read -> m [M.Map T.Text T.Text] slurpLabelledCSV file = runConduit $ sourceFile file .| decodeRows .| labelFields .|consume -- | read an entire CSV file with a header slurpRawLabelledCSV :: MonadResource m => FilePath -- ^ the path to the file to read -> m [M.Map BS.ByteString BS.ByteString] slurpRawLabelledCSV file = runConduit $ sourceFile file .| decodeRawRows .| labelFields .|consume -- | write a CSV file from Text-based rows writeCSV :: MonadResource m => FilePath -- ^ the path to the file to write to -> [[T.Text]] -- ^ the fields/rows being written -> m () writeCSV file csv = runConduit $ encodeCSV csv .| sinkFile file -- | write a CSV file from raw ByteString-based rows writeRawCSV :: MonadResource m => FilePath -- ^ the path to the file to write to -> [[BS.ByteString]] -- ^ the fields/rows being written -> m () writeRawCSV file csv = runConduit $ encodeRawCSV csv .| sinkFile file -- | reads a stream of Text-based rows from a CSV file readFromCSV :: MonadResource m => FilePath -- ^ the path to the CSV file to read from -> ConduitT i [T.Text] m () readFromCSV file = sourceFile file .| decodeRows -- | reads a stream of ByteString-based rows from a CSV file readFromCSVRaw :: MonadResource m => FilePath -- ^ the path to the CSV file to read from -> ConduitT i [BS.ByteString] m () readFromCSVRaw file = sourceFile file .| decodeRawRows -- | encode an entire CSV file encodeCSV :: Monad m => [[T.Text]] -- ^ the data being encoded, organized into rows and fields -> ConduitT () BS.ByteString m () encodeCSV csv = sourceList csv .| encodeRows -- | encode an entire CSV file encodeRawCSV :: Monad m => [[BS.ByteString]] -- ^ the data being encoded, organized into rows and fields -> ConduitT () BS.ByteString m () encodeRawCSV csv = sourceList csv .| encodeRawRows -- | Writes a stream of Text-based rows to a CSV file writeToCSV :: MonadResource m => FilePath -- ^ the path to the CSV file to write to -> ConduitT [T.Text] o m () writeToCSV file = encodeRows .| sinkFile file -- | Writes a stream of ByteString-based rows to a CSV file writeToCSVRaw :: MonadResource m => FilePath -- ^ the path to the CSV file to write to -> ConduitT [BS.ByteString] o m () writeToCSVRaw file = encodeRawRows .| sinkFile file -- | encode a CSV stream row by row, each element in the list read -- represents a field, with the entire list representing a row encodeRows :: Monad m => ConduitT [T.Text] BS.ByteString m () encodeRows = mapC (map encodeUtf8) .| encodeRawRows -- | encode raw CSV stream row by row, each element in the list read -- represents a field, with the entire list representing a row encodeRawRows :: Monad m => ConduitT [BS.ByteString] BS.ByteString m () encodeRawRows = await >>= \case Just fs-> do encodeFields fs encodeRawRows Nothing -> return () -- | read a CSV stream, using the first row as a header containing -- field labels labelFields :: (Monad m, Ord a) => ConduitT [a] (M.Map a a) m () labelFields = await >>= \case Just headers -> labelLoop headers Nothing -> return () -- | decode the rows from a stream of ByteStrings decodeRows :: Monad m => ConduitT BS.ByteString [T.Text] m () decodeRows = decodeRawRows .| mapC (map $ fromMaybe "" . decodeUTF8) -- | decode the rows returning raw ByteStrings instead of text decodeRawRows :: Monad m => ConduitT BS.ByteString [BS.ByteString] m () decodeRawRows = toBytes .| evalStateT decodeLoop newDecodeState -- | decode a raw ByteString into Text (if possible) decodeUTF8 :: BS.ByteString -> Maybe T.Text decodeUTF8 bs = case decodeUtf8' bs of Left _ -> Nothing Right txt -> Just txt -- | convert a stream to ByteStrings to a string of bytes toBytes :: Monad m => ConduitT BS.ByteString Word8 m () toBytes = await >>= \case Just bs -> do let bytes = BS.unpack bs mapM_ yield bytes toBytes Nothing -> return () -- Internal data DecodeState = DecodeState { isQuoted :: Bool , fields :: [BS.ByteString] , collected :: BS.ByteString } deriving (Eq, Show) type Decoder m = StateT DecodeState (ConduitT Word8 [BS.ByteString] m) () type Modifier = DecodeState -> DecodeState newDecodeState :: DecodeState newDecodeState = DecodeState { isQuoted = False , fields = [] , collected = "" } -- Conduits encodeFields :: Monad m => [BS.ByteString] -> ConduitT [BS.ByteString] BS.ByteString m () encodeFields [] = yield "\r\n" encodeFields [f] = yield $ escapeField f `BS.append` "\r\n" encodeFields (f:fs) = do yield $ escapeField f `BS.append` "," encodeFields fs labelLoop :: (Monad m, Ord a) => [a] -> ConduitT [a] (M.Map a a) m () labelLoop headers = await >>= \case Just values -> do yield $ M.fromList $ zip headers values labelLoop headers Nothing -> return () -- Decoders decodeLoop :: Monad m => Decoder m decodeLoop = lift await >>= \case Just byte -> case byte of 0x22 -> processQuote 0x2c -> processComma 0x0d -> processCR 0x0a -> processLF _ -> performAction $ addByte byte Nothing -> cleanup processQuote :: Monad m => Decoder m processQuote = gets isQuoted >>= \case True -> processQuotedQuote False -> processUnquotedQuote processComma :: Monad m => Decoder m processComma = gets isQuoted >>= performAction . \case True -> addByte 0x2c False -> commitField processCR :: Monad m => Decoder m processCR = gets isQuoted >>= \case True -> performAction $ addByte 0xd False -> endRow processLF :: Monad m => Decoder m processLF = gets isQuoted >>= \case True -> performAction $ addByte 0xa False -> endRow processQuotedQuote :: Monad m => Decoder m processQuotedQuote = lift await >>= \case Just byte -> case byte of 0x22 -> performAction $ addByte 0x22 -- quote 0x2c -> performAction commitField -- comma 0x0d -> commitRow -- carriage return 0x0a -> commitRow -- line feed _ -> corruptedField Nothing -> cleanup processUnquotedQuote :: Monad m => Decoder m processUnquotedQuote = gets (BS.null . collected) >>= \case True -> performAction setQuoted False -> corruptedField endRow :: Monad m => Decoder m endRow = do s <- get if null (fields s) && BS.null (collected s) then decodeLoop else commitRow commitRow :: Monad m => Decoder m commitRow = do modify commitField gets fields >>= lift . yield performAction dropFields corruptedField :: Monad m => Decoder m corruptedField = do modify dropField ignoreField ignoreField :: Monad m => Decoder m ignoreField = lift await >>= \case Just byte -> case byte of 0x2c -> performAction commitField -- comma 0x0d -> commitRow _ -> ignoreField Nothing -> cleanup cleanup :: Monad m => Decoder m cleanup = do gets isQuoted >>= \case True -> modify $ commitField . dropField False -> gets (BS.null . collected) >>= \case True -> return () False -> modify commitField fs <- gets fields unless (null fs) $ lift $ yield fs performAction :: Monad m => Modifier -> Decoder m performAction f = do modify f decodeLoop -- Modifiers addByte :: Word8 -> Modifier addByte b s = let collected' = BS.snoc (collected s) b in s { collected = collected' } commitField :: Modifier commitField s = let isQuoted' = False fields' = fields s ++ [collected s] collected' = "" in s { isQuoted = isQuoted' , fields = fields' , collected = collected' } dropFields :: Modifier dropFields s = s { fields = [] } dropField :: Modifier dropField s = s { isQuoted = False , collected = "" } setQuoted :: Modifier setQuoted s = s { isQuoted = True } -- Helpers escapeField :: BS.ByteString -> BS.ByteString escapeField field = let bytes = BS.unpack field in BS.concat [ "\"" , BS.pack $ escapeLoop bytes , "\"" ] escapeLoop :: [Word8] -> [Word8] escapeLoop [] = [] escapeLoop (0x22:bs) = [0x22, 0x22] ++ escapeLoop bs -- escape quote escapeLoop (b:bs) = b : escapeLoop bs --jl