{-# LANGUAGE BangPatterns, CPP #-}
module Data.Csv.Parser
( DecodeOptions(..)
, defaultDecodeOptions
, csv
, csvWithHeader
, header
, record
, name
, field
) where
import Blaze.ByteString.Builder (fromByteString, toByteString)
import Blaze.ByteString.Builder.Char.Utf8 (fromChar)
import Control.Applicative (optional)
import Data.Attoparsec.ByteString.Char8 (char, endOfInput)
import qualified Data.Attoparsec.ByteString as A
import qualified Data.Attoparsec.Lazy as AL
import qualified Data.Attoparsec.Zepto as Z
import qualified Data.ByteString as S
import qualified Data.ByteString.Unsafe as S
import qualified Data.Vector as V
import Data.Word (Word8)
import Data.Csv.Types
import Data.Csv.Util ((<$!>), blankLine, endOfLine, liftM2', cr, newline, doubleQuote)
#if !MIN_VERSION_base(4,8,0)
import Control.Applicative ((<$>), (*>), (<*), pure)
import Data.Monoid (mappend, mempty)
#endif
data DecodeOptions = DecodeOptions
{
decDelimiter :: {-# UNPACK #-} !Word8
} deriving (Eq, Show)
defaultDecodeOptions :: DecodeOptions
defaultDecodeOptions = DecodeOptions
{ decDelimiter = 44
}
csv :: DecodeOptions -> AL.Parser Csv
csv !opts = do
vals <- sepByEndOfLine1' (record (decDelimiter opts))
_ <- optional endOfLine
endOfInput
let nonEmpty = removeBlankLines vals
return $! V.fromList nonEmpty
{-# INLINE csv #-}
sepByDelim1' :: AL.Parser a
-> Word8
-> AL.Parser [a]
sepByDelim1' p !delim = liftM2' (:) p loop
where
loop = do
mb <- A.peekWord8
case mb of
Just b | b == delim -> liftM2' (:) (A.anyWord8 *> p) loop
_ -> pure []
{-# INLINE sepByDelim1' #-}
sepByEndOfLine1' :: AL.Parser a
-> AL.Parser [a]
sepByEndOfLine1' p = liftM2' (:) p loop
where
loop = do
mb <- A.peekWord8
case mb of
Just b | b == cr ->
liftM2' (:) (A.anyWord8 *> A.word8 newline *> p) loop
| b == newline ->
liftM2' (:) (A.anyWord8 *> p) loop
_ -> pure []
{-# INLINE sepByEndOfLine1' #-}
csvWithHeader :: DecodeOptions -> AL.Parser (Header, V.Vector NamedRecord)
csvWithHeader !opts = do
!hdr <- header (decDelimiter opts)
vals <- map (toNamedRecord hdr) . removeBlankLines <$>
sepByEndOfLine1' (record (decDelimiter opts))
_ <- optional endOfLine
endOfInput
let !v = V.fromList vals
return (hdr, v)
header :: Word8
-> AL.Parser Header
header !delim = V.fromList <$!> name delim `sepByDelim1'` delim <* endOfLine
name :: Word8 -> AL.Parser Name
name !delim = field delim
removeBlankLines :: [Record] -> [Record]
removeBlankLines = filter (not . blankLine)
record :: Word8
-> AL.Parser Record
record !delim = V.fromList <$!> field delim `sepByDelim1'` delim
{-# INLINE record #-}
field :: Word8 -> AL.Parser Field
field !delim = do
mb <- A.peekWord8
case mb of
Just b | b == doubleQuote -> escapedField
_ -> unescapedField delim
{-# INLINE field #-}
escapedField :: AL.Parser S.ByteString
escapedField = do
_ <- dquote
s <- S.init <$> (A.scan False $ \s c -> if c == doubleQuote
then Just (not s)
else if s then Nothing
else Just False)
if doubleQuote `S.elem` s
then case Z.parse unescape s of
Right r -> return r
Left err -> fail err
else return s
unescapedField :: Word8 -> AL.Parser S.ByteString
unescapedField !delim = A.takeWhile (\ c -> c /= doubleQuote &&
c /= newline &&
c /= delim &&
c /= cr)
dquote :: AL.Parser Char
dquote = char '"'
unescape :: Z.Parser S.ByteString
unescape = toByteString <$!> go mempty where
go acc = do
h <- Z.takeWhile (/= doubleQuote)
let rest = do
start <- Z.take 2
if (S.unsafeHead start == doubleQuote &&
S.unsafeIndex start 1 == doubleQuote)
then go (acc `mappend` fromByteString h `mappend` fromChar '"')
else fail "invalid CSV escape sequence"
done <- Z.atEnd
if done
then return (acc `mappend` fromByteString h)
else rest