{-# LANGUAGE BangPatterns, CPP #-}

-- | A CSV parser. The parser defined here is RFC 4180 compliant, with
-- the following extensions:
--
--  * Empty lines are ignored.
--
--  * Non-escaped fields may contain any characters except
--    double-quotes, commas, carriage returns, and newlines.
--
--  * Escaped fields may contain any characters (but double-quotes
--    need to be escaped).
--
-- The functions in this module can be used to implement e.g. a
-- resumable parser that is fed input incrementally.
module Data.Csv.Parser
    ( DecodeOptions(..)
    , defaultDecodeOptions
    , csv
    , csvWithHeader
    , header
    , record
    , name
    , field
    ) where

import Data.ByteString.Builder (byteString, toLazyByteString, charUtf8)
import Control.Applicative (optional)
import Data.Attoparsec.ByteString.Char8 (char, endOfInput)
import qualified Data.Attoparsec.ByteString as A
import qualified Data.Attoparsec.Lazy as AL
import qualified Data.Attoparsec.Zepto as Z
import qualified Data.ByteString as S
import qualified Data.ByteString.Unsafe as S
import qualified Data.Vector as V
import Data.Word (Word8)

import Data.Csv.Types
import Data.Csv.Util ((<$!>), blankLine, endOfLine, liftM2', cr, newline, doubleQuote, toStrict)

-- | Options that controls how data is decoded. These options can be
-- used to e.g. decode tab-separated data instead of comma-separated
-- data.
--
-- To avoid having your program stop compiling when new fields are
-- added to 'DecodeOptions', create option records by overriding
-- values in 'defaultDecodeOptions'. Example:
--
-- > myOptions = defaultDecodeOptions {
-- >       decDelimiter = fromIntegral (ord '\t')
-- >     }
data DecodeOptions = DecodeOptions
    { -- | Field delimiter.
      DecodeOptions -> Word8
decDelimiter  :: {-# UNPACK #-} !Word8
    } deriving (DecodeOptions -> DecodeOptions -> Bool
(DecodeOptions -> DecodeOptions -> Bool)
-> (DecodeOptions -> DecodeOptions -> Bool) -> Eq DecodeOptions
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: DecodeOptions -> DecodeOptions -> Bool
== :: DecodeOptions -> DecodeOptions -> Bool
$c/= :: DecodeOptions -> DecodeOptions -> Bool
/= :: DecodeOptions -> DecodeOptions -> Bool
Eq, Int -> DecodeOptions -> ShowS
[DecodeOptions] -> ShowS
DecodeOptions -> String
(Int -> DecodeOptions -> ShowS)
-> (DecodeOptions -> String)
-> ([DecodeOptions] -> ShowS)
-> Show DecodeOptions
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> DecodeOptions -> ShowS
showsPrec :: Int -> DecodeOptions -> ShowS
$cshow :: DecodeOptions -> String
show :: DecodeOptions -> String
$cshowList :: [DecodeOptions] -> ShowS
showList :: [DecodeOptions] -> ShowS
Show)

-- | Decoding options for parsing CSV files.
defaultDecodeOptions :: DecodeOptions
defaultDecodeOptions :: DecodeOptions
defaultDecodeOptions = DecodeOptions
    { decDelimiter :: Word8
decDelimiter = Word8
44  -- comma
    }

-- | Parse a CSV file that does not include a header.
csv :: DecodeOptions -> AL.Parser Csv
csv :: DecodeOptions -> Parser Csv
csv !DecodeOptions
opts = do
    [Record]
vals <- Parser Record -> Parser [Record]
forall a. Parser a -> Parser [a]
sepByEndOfLine1' (Word8 -> Parser Record
record (DecodeOptions -> Word8
decDelimiter DecodeOptions
opts))
    Maybe ()
_ <- Parser ByteString () -> Parser ByteString (Maybe ())
forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional Parser ByteString ()
endOfLine
    Parser ByteString ()
forall t. Chunk t => Parser t ()
endOfInput
    let nonEmpty :: [Record]
nonEmpty = [Record] -> [Record]
removeBlankLines [Record]
vals
    Csv -> Parser Csv
forall a. a -> Parser ByteString a
forall (m :: * -> *) a. Monad m => a -> m a
return (Csv -> Parser Csv) -> Csv -> Parser Csv
forall a b. (a -> b) -> a -> b
$! [Record] -> Csv
forall a. [a] -> Vector a
V.fromList [Record]
nonEmpty
{-# INLINE csv #-}

-- | Specialized version of 'sepBy1'' which is faster due to not
-- accepting an arbitrary separator.
sepByDelim1' :: AL.Parser a
             -> Word8  -- ^ Field delimiter
             -> AL.Parser [a]
sepByDelim1' :: forall a. Parser a -> Word8 -> Parser [a]
sepByDelim1' Parser a
p !Word8
delim = (a -> [a] -> [a])
-> Parser a -> Parser ByteString [a] -> Parser ByteString [a]
forall (m :: * -> *) a b c.
Monad m =>
(a -> b -> c) -> m a -> m b -> m c
liftM2' (:) Parser a
p Parser ByteString [a]
loop
  where
    loop :: Parser ByteString [a]
loop = do
        Maybe Word8
mb <- Parser (Maybe Word8)
A.peekWord8
        case Maybe Word8
mb of
            Just Word8
b | Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
delim -> (a -> [a] -> [a])
-> Parser a -> Parser ByteString [a] -> Parser ByteString [a]
forall (m :: * -> *) a b c.
Monad m =>
(a -> b -> c) -> m a -> m b -> m c
liftM2' (:) (Parser Word8
A.anyWord8 Parser Word8 -> Parser a -> Parser a
forall a b.
Parser ByteString a -> Parser ByteString b -> Parser ByteString b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser a
p) Parser ByteString [a]
loop
            Maybe Word8
_                   -> [a] -> Parser ByteString [a]
forall a. a -> Parser ByteString a
forall (f :: * -> *) a. Applicative f => a -> f a
pure []
{-# INLINE sepByDelim1' #-}

-- | Specialized version of 'sepBy1'' which is faster due to not
-- accepting an arbitrary separator.
sepByEndOfLine1' :: AL.Parser a
                 -> AL.Parser [a]
sepByEndOfLine1' :: forall a. Parser a -> Parser [a]
sepByEndOfLine1' Parser a
p = (a -> [a] -> [a])
-> Parser a -> Parser ByteString [a] -> Parser ByteString [a]
forall (m :: * -> *) a b c.
Monad m =>
(a -> b -> c) -> m a -> m b -> m c
liftM2' (:) Parser a
p Parser ByteString [a]
loop
  where
    loop :: Parser ByteString [a]
loop = do
        Maybe Word8
mb <- Parser (Maybe Word8)
A.peekWord8
        case Maybe Word8
mb of
            Just Word8
b | Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
cr ->
                (a -> [a] -> [a])
-> Parser a -> Parser ByteString [a] -> Parser ByteString [a]
forall (m :: * -> *) a b c.
Monad m =>
(a -> b -> c) -> m a -> m b -> m c
liftM2' (:) (Parser Word8
A.anyWord8 Parser Word8 -> Parser Word8 -> Parser Word8
forall a b.
Parser ByteString a -> Parser ByteString b -> Parser ByteString b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Word8 -> Parser Word8
A.word8 Word8
newline Parser Word8 -> Parser a -> Parser a
forall a b.
Parser ByteString a -> Parser ByteString b -> Parser ByteString b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser a
p) Parser ByteString [a]
loop
                   | Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
newline ->
                (a -> [a] -> [a])
-> Parser a -> Parser ByteString [a] -> Parser ByteString [a]
forall (m :: * -> *) a b c.
Monad m =>
(a -> b -> c) -> m a -> m b -> m c
liftM2' (:) (Parser Word8
A.anyWord8 Parser Word8 -> Parser a -> Parser a
forall a b.
Parser ByteString a -> Parser ByteString b -> Parser ByteString b
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser a
p) Parser ByteString [a]
loop
            Maybe Word8
_ -> [a] -> Parser ByteString [a]
forall a. a -> Parser ByteString a
forall (f :: * -> *) a. Applicative f => a -> f a
pure []
{-# INLINE sepByEndOfLine1' #-}

-- | Parse a CSV file that includes a header.
csvWithHeader :: DecodeOptions -> AL.Parser (Header, V.Vector NamedRecord)
csvWithHeader :: DecodeOptions -> Parser (Record, Vector NamedRecord)
csvWithHeader !DecodeOptions
opts = do
    !Record
hdr <- Word8 -> Parser Record
header (DecodeOptions -> Word8
decDelimiter DecodeOptions
opts)
    [NamedRecord]
vals <- (Record -> NamedRecord) -> [Record] -> [NamedRecord]
forall a b. (a -> b) -> [a] -> [b]
map (Record -> Record -> NamedRecord
toNamedRecord Record
hdr) ([Record] -> [NamedRecord])
-> ([Record] -> [Record]) -> [Record] -> [NamedRecord]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Record] -> [Record]
removeBlankLines ([Record] -> [NamedRecord])
-> Parser [Record] -> Parser ByteString [NamedRecord]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>
            Parser Record -> Parser [Record]
forall a. Parser a -> Parser [a]
sepByEndOfLine1' (Word8 -> Parser Record
record (DecodeOptions -> Word8
decDelimiter DecodeOptions
opts))
    Maybe ()
_ <- Parser ByteString () -> Parser ByteString (Maybe ())
forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional Parser ByteString ()
endOfLine
    Parser ByteString ()
forall t. Chunk t => Parser t ()
endOfInput
    let !v :: Vector NamedRecord
v = [NamedRecord] -> Vector NamedRecord
forall a. [a] -> Vector a
V.fromList [NamedRecord]
vals
    (Record, Vector NamedRecord) -> Parser (Record, Vector NamedRecord)
forall a. a -> Parser ByteString a
forall (m :: * -> *) a. Monad m => a -> m a
return (Record
hdr, Vector NamedRecord
v)

-- | Parse a header, including the terminating line separator.
header :: Word8  -- ^ Field delimiter
       -> AL.Parser Header
header :: Word8 -> Parser Record
header !Word8
delim = [ByteString] -> Record
forall a. [a] -> Vector a
V.fromList ([ByteString] -> Record)
-> Parser ByteString [ByteString] -> Parser Record
forall (m :: * -> *) a b. Monad m => (a -> b) -> m a -> m b
<$!> Word8 -> Parser ByteString
name Word8
delim Parser ByteString -> Word8 -> Parser ByteString [ByteString]
forall a. Parser a -> Word8 -> Parser [a]
`sepByDelim1'` Word8
delim Parser Record -> Parser ByteString () -> Parser Record
forall a b.
Parser ByteString a -> Parser ByteString b -> Parser ByteString a
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* Parser ByteString ()
endOfLine

-- | Parse a header name. Header names have the same format as regular
-- 'field's.
name :: Word8 -> AL.Parser Name
name :: Word8 -> Parser ByteString
name !Word8
delim = Word8 -> Parser ByteString
field Word8
delim

removeBlankLines :: [Record] -> [Record]
removeBlankLines :: [Record] -> [Record]
removeBlankLines = (Record -> Bool) -> [Record] -> [Record]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool) -> (Record -> Bool) -> Record -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Record -> Bool
blankLine)

-- | Parse a record, not including the terminating line separator. The
-- terminating line separate is not included as the last record in a
-- CSV file is allowed to not have a terminating line separator. You
-- most likely want to use the 'endOfLine' parser in combination with
-- this parser.
record :: Word8  -- ^ Field delimiter
       -> AL.Parser Record
record :: Word8 -> Parser Record
record !Word8
delim = [ByteString] -> Record
forall a. [a] -> Vector a
V.fromList ([ByteString] -> Record)
-> Parser ByteString [ByteString] -> Parser Record
forall (m :: * -> *) a b. Monad m => (a -> b) -> m a -> m b
<$!> Word8 -> Parser ByteString
field Word8
delim Parser ByteString -> Word8 -> Parser ByteString [ByteString]
forall a. Parser a -> Word8 -> Parser [a]
`sepByDelim1'` Word8
delim
{-# INLINE record #-}

-- | Parse a field. The field may be in either the escaped or
-- non-escaped format. The return value is unescaped.
field :: Word8 -> AL.Parser Field
field :: Word8 -> Parser ByteString
field !Word8
delim = do
    Maybe Word8
mb <- Parser (Maybe Word8)
A.peekWord8
    -- We purposely don't use <|> as we want to commit to the first
    -- choice if we see a double quote.
    case Maybe Word8
mb of
        Just Word8
b | Word8
b Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
doubleQuote -> Parser ByteString
escapedField
        Maybe Word8
_                         -> Word8 -> Parser ByteString
unescapedField Word8
delim
{-# INLINE field #-}

escapedField :: AL.Parser S.ByteString
escapedField :: Parser ByteString
escapedField = do
    Char
_ <- Parser Char
dquote
    -- The scan state is 'True' if the previous character was a double
    -- quote.  We need to drop a trailing double quote left by scan.
    ByteString
s <- HasCallStack => ByteString -> ByteString
ByteString -> ByteString
S.init (ByteString -> ByteString)
-> Parser ByteString -> Parser ByteString
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Bool -> (Bool -> Word8 -> Maybe Bool) -> Parser ByteString
forall s. s -> (s -> Word8 -> Maybe s) -> Parser ByteString
A.scan Bool
False ((Bool -> Word8 -> Maybe Bool) -> Parser ByteString)
-> (Bool -> Word8 -> Maybe Bool) -> Parser ByteString
forall a b. (a -> b) -> a -> b
$ \Bool
s Word8
c -> if Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
doubleQuote
                                            then Bool -> Maybe Bool
forall a. a -> Maybe a
Just (Bool -> Bool
not Bool
s)
                                            else if Bool
s then Maybe Bool
forall a. Maybe a
Nothing
                                                 else Bool -> Maybe Bool
forall a. a -> Maybe a
Just Bool
False)
    if Word8
doubleQuote Word8 -> ByteString -> Bool
`S.elem` ByteString
s
        then case Parser ByteString -> ByteString -> Either String ByteString
forall a. Parser a -> ByteString -> Either String a
Z.parse Parser ByteString
unescape ByteString
s of
            Right ByteString
r  -> ByteString -> Parser ByteString
forall a. a -> Parser ByteString a
forall (m :: * -> *) a. Monad m => a -> m a
return ByteString
r
            Left String
err -> String -> Parser ByteString
forall a. String -> Parser ByteString a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
err
        else ByteString -> Parser ByteString
forall a. a -> Parser ByteString a
forall (m :: * -> *) a. Monad m => a -> m a
return ByteString
s

unescapedField :: Word8 -> AL.Parser S.ByteString
unescapedField :: Word8 -> Parser ByteString
unescapedField !Word8
delim = (Word8 -> Bool) -> Parser ByteString
A.takeWhile (\ Word8
c -> Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
doubleQuote Bool -> Bool -> Bool
&&
                                            Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
newline Bool -> Bool -> Bool
&&
                                            Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
delim Bool -> Bool -> Bool
&&
                                            Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
cr)

dquote :: AL.Parser Char
dquote :: Parser Char
dquote = Char -> Parser Char
char Char
'"'

unescape :: Z.Parser S.ByteString
unescape :: Parser ByteString
unescape = (LazyByteString -> ByteString
toStrict (LazyByteString -> ByteString)
-> (Builder -> LazyByteString) -> Builder -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Builder -> LazyByteString
toLazyByteString) (Builder -> ByteString)
-> ZeptoT Identity Builder -> Parser ByteString
forall (m :: * -> *) a b. Monad m => (a -> b) -> m a -> m b
<$!> Builder -> ZeptoT Identity Builder
forall {m :: * -> *}. Monad m => Builder -> ZeptoT m Builder
go Builder
forall a. Monoid a => a
mempty where
  go :: Builder -> ZeptoT m Builder
go Builder
acc = do
    ByteString
h <- (Word8 -> Bool) -> ZeptoT m ByteString
forall (m :: * -> *).
Monad m =>
(Word8 -> Bool) -> ZeptoT m ByteString
Z.takeWhile (Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
/= Word8
doubleQuote)
    let rest :: ZeptoT m Builder
rest = do
          ByteString
start <- Int -> ZeptoT m ByteString
forall (m :: * -> *). Monad m => Int -> ZeptoT m ByteString
Z.take Int
2
          if (ByteString -> Word8
S.unsafeHead ByteString
start Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
doubleQuote Bool -> Bool -> Bool
&&
              ByteString -> Int -> Word8
S.unsafeIndex ByteString
start Int
1 Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
doubleQuote)
              then Builder -> ZeptoT m Builder
go (Builder
acc Builder -> Builder -> Builder
forall a. Monoid a => a -> a -> a
`mappend` ByteString -> Builder
byteString ByteString
h Builder -> Builder -> Builder
forall a. Monoid a => a -> a -> a
`mappend` Char -> Builder
charUtf8 Char
'"')
              else String -> ZeptoT m Builder
forall a. String -> ZeptoT m a
forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
"invalid CSV escape sequence"
    Bool
done <- ZeptoT m Bool
forall (m :: * -> *). Monad m => ZeptoT m Bool
Z.atEnd
    if Bool
done
      then Builder -> ZeptoT m Builder
forall a. a -> ZeptoT m a
forall (m :: * -> *) a. Monad m => a -> m a
return (Builder
acc Builder -> Builder -> Builder
forall a. Monoid a => a -> a -> a
`mappend` ByteString -> Builder
byteString ByteString
h)
      else ZeptoT m Builder
rest