Text/CSV/Lazy/ByteString.hs

-- | The CSV (comma-separated value) format is defined by RFC 4180,
--   \"Common Format and MIME Type for Comma-Separated Values (CSV) Files\",
--   <http://www.rfc-editor.org/rfc/rfc4180.txt>
--
--   This lazy parser can report all CSV formatting errors, whilst also
--   returning all the valid data, so the user can choose whether to
--   continue, to show warnings, or to halt on error.
--
--   Valid fields retain information about their original location in the
--   input, so a secondary parser from textual fields to typed values
--   can give intelligent error messages.
--
--   In a valid CSV file, all rows must have the same number of columns.
--   This parser will flag a row with the wrong number of columns as a error.
--   (But the error type contains the actual data, so the user can recover
--   it if desired.)  Completely blank lines are also treated as errors,
--   and again the user is free either to filter these out or convert them
--   to a row of actual null fields.

module Text.CSV.Lazy.ByteString
  ( -- * CSV types
    CSVTable
  , CSVRow
  , CSVField(..)
    -- * CSV parsing
  , CSVError(..)
  , CSVResult
  , csvErrors
  , csvTable
  , csvTableFull
  , csvTableHeader
  , parseCSV
  , parseDSV
    -- * Pretty-printing
  , ppCSVError
  , ppCSVField
  , ppCSVTable
  , ppDSVTable
    -- * Conversion between standard and simple representations
  , fromCSVTable
  , toCSVTable
    -- * Selection, validation, and algebra of CSV tables
  , selectFields
  , expectFields
  , mkEmptyColumn
  , joinCSV
  , mkCSVField
  ) where

--  , ppCSVTableAsTuples

import Data.List     (groupBy, partition, elemIndex, intercalate, takeWhile
                     ,deleteFirstsBy, nub)
import Data.Function (on)
import Data.Maybe    (fromJust)
import qualified Data.ByteString.Lazy.Char8 as BS
import Data.ByteString.Lazy.Char8 (ByteString)

-- | A CSV table is a sequence of rows.  All rows have the same number
--   of fields.
type CSVTable   = [CSVRow]

-- | A CSV row is just a sequence of fields.
type CSVRow     = [CSVField]

-- | A CSV field's content is stored with its logical row and column number,
--   as well as its textual extent.  This information is necessary if you
--   want to generate good error messages in a secondary parsing stage,
--   should you choose to convert the textual fields to typed data values.
data CSVField   = CSVField       { csvRowNum        :: !Int
                                 , csvColNum        :: !Int
                                 , csvTextStart     :: !(Int,Int)
                                 , csvTextEnd       :: !(Int,Int)
                                 , csvFieldContent  :: !ByteString
                                 , csvFieldQuoted   :: !Bool }
                | CSVFieldError  { csvRowNum        :: !Int
                                 , csvColNum        :: !Int
                                 , csvTextStart     :: !(Int,Int)
                                 , csvTextEnd       :: !(Int,Int)
                                 , csvFieldError    :: !String }
                                                    deriving (Eq,Show)

-- | A structured error type for CSV formatting mistakes.
data CSVError   = IncorrectRow   { csvRow           :: Int
                                 , csvColsExpected  :: Int
                                 , csvColsActual    :: Int
                                 , csvFields        :: [CSVField] }
                | BlankLine      { csvRow           :: !Int
                                 , csvColsExpected  :: !Int
                                 , csvColsActual    :: !Int
                                 , csvField         :: CSVField }
                | FieldError     { csvField         :: CSVField }
                | DuplicateHeader{ csvColsExpected  :: !Int
                                 , csvHeaderSerial  :: !Int
                                 , csvDuplicate     :: !String }
                | NoData
                                                    deriving (Eq,Show)

-- | The result of parsing a CSV input is a mixed collection of errors
--   and valid rows.  This way of representing things is crucial to the
--   ability to parse lazily whilst still catching format errors.
type CSVResult  = [ Either [CSVError] [CSVField] ]

-- | Extract just the valid portions of a CSV parse.
csvTable    :: CSVResult -> CSVTable
csvTable  r  = [ row | Right row <- r ]
-- | Extract just the errors from a CSV parse.
csvErrors   :: CSVResult -> [CSVError]
csvErrors r  = concat [ err | Left err  <- r ]
-- | Extract the full table, including invalid rows, with padding, and
--   de-duplicated headers.
csvTableFull:: CSVResult -> CSVTable
csvTableFull = map beCareful . deduplicate
    where beCareful (Right row) = row
          beCareful (Left (r@IncorrectRow{}:_)) =
              csvFields r ++
              replicate (csvColsExpected r - csvColsActual r)
                        (mkCSVField (csvRow r) 0 BS.empty)
          beCareful (Left (r@BlankLine{}:_)) =
              replicate (csvColsExpected r)
                        (mkCSVField (csvRow r) 0 BS.empty)
          beCareful (Left (r@DuplicateHeader{}:_)) = -- obsolete with deduping
              replicate (csvColsExpected r)
                        (mkCSVField 0 0 BS.empty)
          beCareful (Left (FieldError{}:r))      = beCareful (Left r)
          beCareful (Left (NoData:_))            = []
          beCareful (Left [])                    = []

          deduplicate (Left (errs@(DuplicateHeader{}:_)):Right heads:rows) = 
--               Right (reverse $ foldl replace [] heads)
                 Right (replaceInOrder errs (zip heads [0..]))
                 : rows
          deduplicate rows = rows
{-
          replace output header
              | headerName `elem` map csvFieldContent output
                          = header{ csvFieldContent = headerName
                                            `BS.append` BS.pack "_duplicate" }
                                  : output
              | otherwise = header: output
              where headerName = csvFieldContent header
-}
          replaceInOrder []       headers        = map fst headers
          replaceInOrder _        []             = []
          replaceInOrder (d:dups) ((h,n):headers)
              | csvHeaderSerial d == n = h{ csvFieldContent = BS.pack
                                                (csvDuplicate d++"_"++show n) }
                                          : replaceInOrder dups     headers
              | otherwise              = h: replaceInOrder (d:dups) headers

-- | The header row of the CSV table, assuming it is non-empty.
csvTableHeader :: CSVResult -> [String]
csvTableHeader = map (BS.unpack . csvFieldContent) . firstRow
    where firstRow (Left _: rest) = firstRow rest
          firstRow (Right x: _)   = x


-- | A first-stage parser for CSV (comma-separated values) data.
--   The individual fields remain as text, but errors in CSV formatting
--   are reported.  Errors (containing unrecognisable rows/fields) are
--   interspersed with the valid rows/fields.
parseCSV :: ByteString -> CSVResult
parseCSV = parseDSV True ','

-- | Sometimes CSV is not comma-separated, but delimiter-separated
--   values (DSV).  The choice of delimiter is arbitrary, but semi-colon
--   is common in locales where comma is used as a decimal point, and tab
--   is also common.  The Boolean argument is
--   whether newlines should be accepted within quoted fields.  The CSV RFC
--   says newlines can occur in quotes, but other DSV formats might say
--   otherwise.  You can often get better error messages if newlines are
--   disallowed.
parseDSV :: Bool -> Char -> ByteString -> CSVResult
parseDSV qn delim = validate
                    . groupBy ((==)`on`csvRowNum)
                    . lexCSV qn delim

validate          :: [CSVRow] -> CSVResult
validate []        = [Left [NoData]]
validate xs@(x:_)  = checkDuplicateHeaders x $ map (extractErrs (length x)) xs

extractErrs       :: Int -> CSVRow -> Either [CSVError] CSVRow
extractErrs size row
    | length row0 == size && null errs0    = Right row0
    | length row0 == 1    && empty field0  = Left [blankLine field0]
    | otherwise                            = Left (map convert errs0
                                                   ++ validateColumns row0)
  where
  (row0,errs0)  = partition isField row
  (field0:_)    = row0

  isField (CSVField{})       = True
  isField (CSVFieldError{})  = False

  empty   f@(CSVField{})    = BS.null (csvFieldContent f)
  empty   _                 = False

  convert err = FieldError {csvField = err}

  validateColumns r  =
      if length r == size then []
      else [ IncorrectRow{ csvRow  = if null r then 0 else csvRowNum (head r)
                         , csvColsExpected  = size
                         , csvColsActual    = length r
                         , csvFields        = r } ]
  blankLine f = BlankLine{ csvRow           = csvRowNum f
                         , csvColsExpected  = size
                         , csvColsActual    = 1
                         , csvField         = f }

checkDuplicateHeaders :: CSVRow -> CSVResult -> CSVResult
checkDuplicateHeaders row result =
    let headers = [ f | f@(CSVField{}) <- row ]
        dups    = deleteFirstsBy ((==)`on`csvFieldContent)
                                 headers (nub headers)
        n       = length headers
    in if null dups then result
       else Left (map (\d-> DuplicateHeader
                              { csvColsExpected = n
                              , csvHeaderSerial = csvColNum d
                              , csvDuplicate = BS.unpack (csvFieldContent d)})
                      dups)
            : result



-- Reading CSV data is essentially lexical, and can be implemented with a
-- simple finite state machine.  We keep track of logical row number,
-- logical column number (in tabular terms), and textual position (row,col)
-- to enable good error messages.
-- Positional data is retained even after successful lexing, in case a
-- second-stage field parser wants to complain.
--
-- A double-quoted CSV field may contain commas, newlines, and double quotes.

data CSVState  = CSVState  { tableRow, tableCol  :: !Int
                           , textRow,  textCol   :: !Int }
    deriving Show

incTableRow, incTableCol, incTextRow :: CSVState -> CSVState
incTableRow  st = st { tableRow  = tableRow  st + 1 , tableCol = 1 }
incTableCol  st = st { tableCol  = tableCol  st + 1 }
incTextRow   st = st { textRow   = textRow   st + 1 , textCol = 1 }

incTextCol :: Int -> CSVState -> CSVState
incTextCol n st = st { textCol   = textCol   st + n }

here :: CSVState -> (Int,Int)
here st = (textRow st, textCol st)

-- Lexer is a small finite state machine.
lexCSV :: Bool -> Char -> ByteString -> [CSVField]
lexCSV qn delim =
    getFields qn delim
              (CSVState{tableRow=1,tableCol=1,textRow=1,textCol=1}) (1,1)

getFields :: Bool -> Char -> CSVState -> (Int, Int) -> ByteString -> [CSVField]
getFields q d state begin bs0
 = case BS.uncons bs0 of
   Nothing -> []
   Just ('"', bs1) -> doStringFieldContent q d (incTextCol 1 state) begin
                                           BS.empty bs1
   _ ->
       case BS.break interestingChar bs0 of
       (fieldBs, bs1) ->
           let field   = mkField end begin fieldBs False
               end     = incTextCol (len-1) $ state
               state'  = incTableCol $ incTextCol 2 end
               stateNL = incTableRow . incTextRow $ state
               len     = fromIntegral $ BS.length fieldBs
           in case BS.uncons bs1 of
              Just (c,bs2)
                   | c==d     -> field: getFields q d state' (here state') bs2
              Just ('\r',bs2) ->
                  case BS.uncons bs2 of
                  Just ('\n',bs3)
                              -> field: getFields q d stateNL (here stateNL) bs3
                                 -- XXX This could be an error instead:
                  _           -> field: getFields q d stateNL (here stateNL) bs2
              Just ('\n',bs2) -> field: getFields q d stateNL (here stateNL) bs2
              Just ('"', _)   -> field:
                                 mkError state' begin
                                         "unexpected quote, resync at EOL":
                                 getFields q d stateNL (here stateNL)
                                           (BS.dropWhile (/='\n') bs1)
              Just _          -> [mkError state' begin "XXX Can't happen"]
              Nothing         -> field: getFields q d stateNL (here stateNL) bs1
 where interestingChar '\r' = True
       interestingChar '\n' = True
       interestingChar '"'  = True
       interestingChar c    | c==d = True
       interestingChar _    = False

doStringFieldContent :: Bool -> Char -> CSVState -> (Int, Int) -> ByteString
                     -> ByteString -> [CSVField]
doStringFieldContent q d state begin acc bs1
 = case BS.break interestingCharInsideString bs1 of
   (newBs, bs2) ->
       let fieldBs = acc `BS.append` newBs
           field   = mkField end  begin fieldBs True
           end     = incTextCol (len-1) state
           state'  = incTableCol $ incTextCol 3 end
           stateNL = incTableRow . incTextRow $ state
           len     = fromIntegral $ BS.length newBs
       in case BS.uncons bs2 of
          Just ('\r',bs3) ->
              case BS.uncons bs3 of
              Just ('\n',bs4)  | q ->
                   doStringFieldContent q d (incTextRow end) begin
                                     (fieldBs `BS.append` BS.singleton '\n') bs4
              _ -> doStringFieldContent q d end begin
                                     (fieldBs `BS.append` BS.singleton '\r') bs3
          Just ('\n',bs3) | q ->
                   doStringFieldContent q d (incTextRow end) begin
                                     (fieldBs `BS.append` BS.singleton '\n') bs3
          Just ('\n',bs3) ->
                   field:
                   mkError end begin "Found newline within quoted field":
                   getFields q d stateNL (here stateNL) bs3
          Just ('"', bs3) ->
              case BS.uncons bs3 of
              Just (c,bs4)
                   | c==d     -> field: getFields q d state' (here state') bs4
              Just ('\r',bs4) ->
                  case BS.uncons bs4 of
                  Just ('\n',bs5) ->
                       field: getFields q d stateNL (here stateNL) bs5
                       -- XXX This could be an error instead:
                  _ -> field: getFields q d stateNL (here stateNL) bs4
              Just ('\n',bs4) -> field: getFields q d stateNL (here stateNL) bs4
              Just ('"',bs4)  ->
                  doStringFieldContent q d (incTextCol 3 end) begin
                                      (fieldBs `BS.append` BS.singleton '"') bs4
              Just _  -> field:
                         mkError state' begin "End-quote not followed by comma":
                         getFields q d state' (here state') bs3
              Nothing -> field: getFields q d stateNL (here stateNL) bs3
          Just _  -> [mkError state' begin "XXX Can't happen (string field)"]
          Nothing -> field:
                     mkError state' begin "CSV data ends within a quoted string"
                     :[]
 where interestingCharInsideString '\r' = True
       interestingCharInsideString '\n' = True
       interestingCharInsideString '"'  = True
       interestingCharInsideString _    = False

mkField :: CSVState -> (Int, Int) -> ByteString -> Bool -> CSVField
mkField st begin bs q =   CSVField { csvRowNum       = tableRow st
                                   , csvColNum       = tableCol st
                                   , csvTextStart    = begin
                                   , csvTextEnd      = (textRow st,textCol st)
                                   , csvFieldContent = bs
                                   , csvFieldQuoted  = q }

mkError :: CSVState -> (Int, Int) -> String -> CSVField
mkError st begin e = CSVFieldError { csvRowNum     = tableRow st
                                   , csvColNum     = tableCol st
                                   , csvTextStart  = begin
                                   , csvTextEnd    = (textRow st,textCol st)
                                   , csvFieldError = e }


-- Some pretty-printing for structured CSV errors.
ppCSVError :: CSVError -> String
ppCSVError (err@IncorrectRow{}) =
        "\nRow "++show (csvRow err)++" has wrong number of fields."++
        "\n    Expected "++show (csvColsExpected err)++" but got "++
        show (csvColsActual err)++"."++
        "\n    The fields are:"++
        indent 8 (concatMap ppCSVField (csvFields err))
ppCSVError (err@BlankLine{}) =
        "\nRow "++show (csvRow err)++" is blank."++
        "\n    Expected "++show (csvColsExpected err)++" fields."
ppCSVError (err@FieldError{}) = ppCSVField (csvField err)
ppCSVError (err@DuplicateHeader{}) =
        "\nThere are two (or more) identical column headers: "++
        show (csvDuplicate err)++"."++
        "\n    Column number "++show (csvHeaderSerial err)
ppCSVError (NoData{})         =
        "\nNo usable data (after accounting for any other errors)."

-- | Pretty-printing for CSV fields, shows positional information in addition
--   to the textual content.
ppCSVField :: CSVField -> String
ppCSVField (f@CSVField{}) =
        "\n"++BS.unpack (quoted (csvFieldQuoted f) (csvFieldContent f))++
        "\nin row "++show (csvRowNum f)++" at column "++show (csvColNum f)++
        " (textually from "++show (csvTextStart f)++" to "++
        show (csvTextEnd f)++")"
ppCSVField (f@CSVFieldError{}) =
        "\n"++csvFieldError f++
        "\nin row "++show (csvRowNum f)++" at column "++show (csvColNum f)++
        " (textually from "++show (csvTextStart f)++" to "++
        show (csvTextEnd f)


-- | Output a table back to a lazily-constructed string.  There are lots of
--   possible design decisions one could take, e.g. to re-arrange columns
--   back into something resembling their original order, but here we just
--   take the given table without looking at Row and Field numbers etc.
ppCSVTable :: CSVTable -> ByteString
ppCSVTable = BS.unlines . map (BS.intercalate (BS.pack ",") . map ppField)
  where ppField f = quoted (csvFieldQuoted f) (csvFieldContent f)

-- | Output a table back to a lazily-constructed bytestring, using the given
--   delimiter char.  The Boolean argument is to repair fields containing
--   newlines, by replacing the nl with a space.
ppDSVTable :: Bool -> Char -> CSVTable -> ByteString
ppDSVTable nl d = BS.unlines . map (BS.intercalate (BS.pack [d]) . map ppField)
  where ppField f = quoted (csvFieldQuoted f) (doNL $ csvFieldContent f)
        doNL | nl        = replaceNL
             | otherwise = id

{-
-- | Output a table back to a string, but using Haskell list-of-tuple notation
--   rather than CSV.
ppCSVTableAsTuples :: CSVTable -> String
ppCSVTableAsTuples = indent 4 . unlines . map ( (", ("++) . (++")")
                                              . intercalate ", " . map ppField )
  where ppField f = quoted (csvFieldQuoted f) (BS.unpack (csvFieldContent f))
-}

-- Some pp helpers - indent and quoted - should live elsewhere, in a
-- pretty-printing package.

indent :: Int -> String -> String
indent n = unlines . map (replicate n ' ' ++) . lines

quoted :: Bool -> ByteString -> ByteString
quoted False  s  = s
quoted True   s  = BS.concat [BS.pack "\"", escape s, BS.pack"\""]
  where escape s = let (good,next) = BS.span (/='"') s
                   in if BS.null next then good
                    else BS.concat [ good, BS.pack "\"\"", escape (BS.tail next) ]

replaceNL :: ByteString -> ByteString
replaceNL s = let (good,next) = BS.span (/='\n') s
              in if BS.null next then good
                 else if BS.null good then replaceNL (BS.tail next)
                 else BS.concat [ good, BS.pack " ", replaceNL next ]


-- | Convert a CSV table to a simpler representation, by dropping all
--   the original location information.
fromCSVTable :: CSVTable -> [[ByteString]]
fromCSVTable = map (map csvFieldContent)

-- | Convert a simple list of lists into a CSVTable by the addition of
--   logical locations.  (Textual locations are not so useful.)
--   Rows of varying lengths generate errors.  Fields that need
--   quotation marks are automatically marked as such.
toCSVTable   :: [[ByteString]] -> ([CSVError], CSVTable)
toCSVTable []         = ([NoData], [])
toCSVTable rows@(r:_) = (\ (a,b)-> (concat a, b)) $
                        unzip (zipWith walk [1..] rows)
  where
    n            = length r
    walk        :: Int -> [ByteString] -> ([CSVError], CSVRow)
    walk rnum [] = ( [blank rnum]
                   , map (\c-> mkCSVField rnum c (BS.empty)) [1..n])
    walk rnum cs = ( if length cs /= n then [bad rnum cs] else []
                   , zipWith (mkCSVField rnum) [1..n] cs )

    blank rnum =  BlankLine{ csvRow          = rnum
                           , csvColsExpected = n
                           , csvColsActual   = 0
                           , csvField        = mkCSVField rnum 0 BS.empty
                           }
    bad r cs = IncorrectRow{ csvRow          = r
                           , csvColsExpected = n
                           , csvColsActual   = length cs
                           , csvFields       = zipWith (mkCSVField r) [1..] cs
                           }


-- | Select and/or re-arrange columns from a CSV table, based on names in the
--   header row of the table.  The original header row is re-arranged too.
--   The result is either a list of column names that were not present, or
--   the (possibly re-arranged) sub-table.
selectFields :: [String] -> CSVTable -> Either [String] CSVTable
selectFields names table
    | null table          = Left names
    | not (null missing)  = Left missing
    | otherwise           = Right (map select table)
  where
    header         = map (BS.unpack . csvFieldContent) (head table)
    missing        = filter (`notElem` header) names
    reordering     = map (fromJust . (\n-> elemIndex n header)) names
    select fields  = map (fields!!) reordering

-- | Validate that the columns of a table have exactly the names and
--   ordering given in the argument.
expectFields :: [String] -> CSVTable -> Either [String] CSVTable
expectFields names table
    | null table          = Left ["CSV table is empty"]
    | not (null missing)  = Left (map ("CSV table is missing field: "++)
                                      missing)
    | header /= names     = Left ["CSV columns are in the wrong order"
                                 ,"Expected: "++intercalate ", " names
                                 ,"Found:    "++intercalate ", " header]
    | otherwise           = Right table
  where
    header         = map (BS.unpack . csvFieldContent) (head table)
    missing        = filter (`notElem` header) names

-- | A join operator, adds the columns of two tables together.
--   Precondition: the tables have the same number of rows.
joinCSV :: CSVTable -> CSVTable -> CSVTable
joinCSV = zipWith (++)

-- | A generator for a new CSV column, of arbitrary length.
--   The result can be joined to an existing table if desired.
mkEmptyColumn :: String -> CSVTable
mkEmptyColumn header = [headField] : map ((:[]).emptyField) [2..]
  where
    headField = (emptyField 1) { csvFieldContent = BS.pack header
                               , csvFieldQuoted  = True }
    emptyField n = CSVField { csvRowNum       = n
                            , csvColNum       = 0
                            , csvTextStart    = (0,0)
                            , csvTextEnd      = (0,0)
                            , csvFieldContent = BS.empty
                            , csvFieldQuoted  = False
                            }

-- | Generate a fresh field with the given textual content.
--   The quoting flag is set automatically based on the text.
--   Textual extents are not particularly useful, since there was no original
--   input to refer to.
mkCSVField :: Int -> Int -> ByteString -> CSVField
mkCSVField n c text =
        CSVField { csvRowNum       = n
                 , csvColNum       = c
                 , csvTextStart    = (0,0)
                 , csvTextEnd      = ( fromIntegral
                                             . BS.length 
                                             . BS.filter (=='\n')
                                             $ text
                                     , fromIntegral
                                             . BS.length
                                             . BS.takeWhile (/='\n')
                                             . BS.reverse $ text )
                 , csvFieldContent = text
                 , csvFieldQuoted  = any (`elem`"\",\n\r") (BS.unpack text)
                 }