module Data.Csv.Parser
( DecodeOptions(..)
, defaultDecodeOptions
, csv
, csvWithHeader
, header
, record
, name
, field
) where
import Blaze.ByteString.Builder (fromByteString, toByteString)
import Blaze.ByteString.Builder.Char.Utf8 (fromChar)
import Control.Applicative
import Data.Attoparsec.Char8 hiding (Parser, Result, parse)
import qualified Data.Attoparsec as A
import qualified Data.Attoparsec.Lazy as AL
import qualified Data.Attoparsec.Zepto as Z
import qualified Data.ByteString as S
import qualified Data.ByteString.Unsafe as S
import qualified Data.HashMap.Strict as HM
import Data.Monoid
import qualified Data.Vector as V
import Data.Word
import Data.Csv.Types
-- | Options that control how CSV data is decoded.
data DecodeOptions = DecodeOptions
    { decDelimiter :: !Word8
      -- ^ Byte that separates the fields of a record.
    }
-- | Default decoding options: fields are separated by byte 44 (an ASCII
-- comma).
defaultDecodeOptions :: DecodeOptions
defaultDecodeOptions =
    DecodeOptions
        { decDelimiter = 44  -- ASCII ','
        }
-- | Parse a complete CSV document without a header.
--
-- Records are separated by line endings. A single optional trailing line
-- ending is accepted, after which the input must be exhausted. Blank lines
-- (records consisting of one empty field) are dropped before the remaining
-- records are collected into a vector.
csv :: DecodeOptions -> AL.Parser Csv
csv !opts =
    (V.fromList . removeBlankLines <$> rows)
        <* optional endOfLine
        <* endOfInput
  where
    -- One or more records, each split on the configured delimiter.
    rows = record (decDelimiter opts) `sepBy1` endOfLine
-- | Parse a complete CSV document whose first line is a header.
--
-- Returns the header together with every non-blank record, each converted
-- to a 'NamedRecord' by pairing its fields with the header names. A single
-- optional trailing line ending is accepted, after which the input must be
-- exhausted.
csvWithHeader :: DecodeOptions -> AL.Parser (Header, V.Vector NamedRecord)
csvWithHeader !opts = do
    hdr  <- header delim
    rows <- record delim `sepBy1` endOfLine
    _    <- optional endOfLine
    endOfInput
    let named = [toNamedRecord hdr row | row <- removeBlankLines rows]
    return (hdr, V.fromList named)
  where
    delim = decDelimiter opts
-- | Pair each header name with the field at the same position and collect
-- the pairs into a map.
--
-- Pairing stops at the end of the shorter of the two vectors, so surplus
-- names or surplus fields are dropped.
toNamedRecord :: V.Vector S.ByteString -> Record -> NamedRecord
toNamedRecord hdr v = HM.fromList (zip names values)
  where
    names  = V.toList hdr
    values = V.toList v
-- | Parse a header line: one or more names separated by the delimiter and
-- terminated by a line ending.
header :: Word8  -- ^ Field delimiter
       -> AL.Parser Header
header !delim =
    V.fromList <$> (fieldWith delim `sepBy1` A.word8 delim) <* endOfLine

-- | Parse a single header name. Same as 'field', so an unquoted name ends
-- at a comma.
name :: AL.Parser Field
name = field

-- | Drop records that came from blank lines. A blank line parses as a
-- record holding exactly one empty field.
removeBlankLines :: [Record] -> [Record]
removeBlankLines = filter (not . blankLine)
  where
    -- 'V.head' is safe here: it is only reached when the length is 1.
    blankLine v = V.length v == 1 && S.null (V.head v)

-- | Parse one record: one or more fields separated by the delimiter. The
-- line ending is not consumed.
record :: Word8  -- ^ Field delimiter
       -> AL.Parser Record
record !delim = V.fromList <$> (fieldWith delim `sepBy1` A.word8 delim)

-- | Parse a single field, quoted or unquoted, using a comma as the
-- delimiter that ends an unquoted field.
field :: AL.Parser Field
field = fieldWith commaB

-- | Parse a single field for the given delimiter. A field that starts with
-- a double quote is parsed as a quoted field; anything else (including end
-- of input) is parsed as an unquoted field.
fieldWith :: Word8  -- ^ Field delimiter
          -> AL.Parser Field
fieldWith !delim = do
    next <- A.peekWord8
    case next of
        Just b | b == doubleQuote -> escapedField
        _                         -> unescapedField delim

-- | Parse a quoted field and return its contents with the surrounding
-- quotes removed and doubled quotes collapsed to single quotes.
--
-- Fails if the input ends before the closing quote is seen.
escapedField :: AL.Parser S.ByteString
escapedField = do
    _ <- dquote
    -- The scan state is 'True' while the last byte seen was a quote that is
    -- not yet known to be either the closing quote or the first half of a
    -- doubled quote. The scan stops at the first non-quote byte that
    -- follows such a quote, so on well-formed input the result ends with
    -- the closing quote.
    scanned <- A.scan False $ \pendingQuote c ->
        if c == doubleQuote
            then Just (not pendingQuote)
            else if pendingQuote
                then Nothing
                else Just False
    -- If the input ran out first, the result is empty or ends with a data
    -- byte. Reject it instead of calling 'S.init' on an empty string or
    -- silently dropping the last data byte.
    if S.null scanned || S.last scanned /= doubleQuote
        then fail "unterminated quoted field"
        else do
            let s = S.init scanned
            if doubleQuote `S.elem` s
                then case Z.parse unescape s of
                    Right r  -> return r
                    Left err -> fail err
                else return s

-- | Parse an unquoted field: the possibly empty run of bytes up to the next
-- double quote, line feed, carriage return or delimiter.
unescapedField :: Word8  -- ^ Field delimiter
               -> AL.Parser S.ByteString
unescapedField !delim = A.takeWhile $ \c ->
    c /= doubleQuote &&
    c /= newline &&
    c /= delim &&
    c /= cr
-- | Match a single double-quote character.
dquote :: AL.Parser Char
dquote =
    char '"'
-- | Collapse every doubled double quote in the contents of a quoted field
-- into a single double quote.
--
-- Fails with @invalid CSV escape sequence@ if a double quote is followed by
-- a byte other than a second double quote. A lone double quote at the very
-- end also fails, because two bytes cannot be taken.
unescape :: Z.Parser S.ByteString
unescape = toByteString <$> loop mempty
  where
    -- The only valid escape: two consecutive double quotes.
    quotePair = S.pack [doubleQuote, doubleQuote]

    loop built = do
        chunk <- Z.takeWhile (/= doubleQuote)
        let built' = built `mappend` fromByteString chunk
        finished <- Z.atEnd
        if finished
            then return built'
            else do
                -- Not at the end, so the next byte is a double quote; it
                -- must be followed by a second one.
                pair <- Z.take 2
                if pair == quotePair
                    then loop (built' `mappend` fromChar '"')
                    else fail "invalid CSV escape sequence"
-- | ASCII double quote (@\"@).
doubleQuote :: Word8
doubleQuote = 0x22

-- | ASCII line feed.
newline :: Word8
newline = 0x0A

-- | ASCII comma.
commaB :: Word8
commaB = 0x2C

-- | ASCII carriage return.
cr :: Word8
cr = 0x0D