module Text.CSV.Lazy.ByteString
(
CSVTable
, CSVRow
, CSVField(..)
, CSVError(..)
, CSVResult
, csvErrors
, csvTable
, csvTableFull
, csvTableHeader
, parseCSV
, parseDSV
, ppCSVError
, ppCSVField
, ppCSVTable
, ppDSVTable
, fromCSVTable
, toCSVTable
, selectFields
, expectFields
, mkEmptyColumn
, joinCSV
, mkCSVField
) where
import Data.List (groupBy, partition, elemIndex, intercalate, takeWhile
,deleteFirstsBy, nub)
import Data.Function (on)
import Data.Maybe (fromJust)
import qualified Data.ByteString.Lazy.Char8 as BS
import Data.ByteString.Lazy.Char8 (ByteString)
type CSVTable = [CSVRow]
type CSVRow = [CSVField]
data CSVField = CSVField { csvRowNum :: !Int
, csvColNum :: !Int
, csvTextStart :: !(Int,Int)
, csvTextEnd :: !(Int,Int)
, csvFieldContent :: !ByteString
, csvFieldQuoted :: !Bool }
| CSVFieldError { csvRowNum :: !Int
, csvColNum :: !Int
, csvTextStart :: !(Int,Int)
, csvTextEnd :: !(Int,Int)
, csvFieldError :: !String }
deriving (Eq,Show)
data CSVError = IncorrectRow { csvRow :: Int
, csvColsExpected :: Int
, csvColsActual :: Int
, csvFields :: [CSVField] }
| BlankLine { csvRow :: !Int
, csvColsExpected :: !Int
, csvColsActual :: !Int
, csvField :: CSVField }
| FieldError { csvField :: CSVField }
| DuplicateHeader{ csvColsExpected :: !Int
, csvHeaderSerial :: !Int
, csvDuplicate :: !String }
| NoData
deriving (Eq,Show)
type CSVResult = [ Either [CSVError] [CSVField] ]
csvTable :: CSVResult -> CSVTable
csvTable r = [ row | Right row <- r ]
csvErrors :: CSVResult -> [CSVError]
csvErrors r = concat [ err | Left err <- r ]
csvTableFull:: CSVResult -> CSVTable
csvTableFull = map beCareful . deduplicate
where beCareful (Right row) = row
beCareful (Left (r@IncorrectRow{}:_)) =
csvFields r ++
replicate (csvColsExpected r csvColsActual r)
(mkCSVField (csvRow r) 0 BS.empty)
beCareful (Left (r@BlankLine{}:_)) =
replicate (csvColsExpected r)
(mkCSVField (csvRow r) 0 BS.empty)
beCareful (Left (r@DuplicateHeader{}:_)) =
replicate (csvColsExpected r)
(mkCSVField 0 0 BS.empty)
beCareful (Left (FieldError{}:r)) = beCareful (Left r)
beCareful (Left (NoData:_)) = []
beCareful (Left []) = []
deduplicate (Left (errs@(DuplicateHeader{}:_)):Right heads:rows) =
Right (replaceInOrder errs (zip heads [0..]))
: rows
deduplicate rows = rows
replaceInOrder [] headers = map fst headers
replaceInOrder _ [] = []
replaceInOrder (d:dups) ((h,n):headers)
| csvHeaderSerial d == n = h{ csvFieldContent = BS.pack
(csvDuplicate d++"_"++show n) }
: replaceInOrder dups headers
| otherwise = h: replaceInOrder (d:dups) headers
csvTableHeader :: CSVResult -> [String]
csvTableHeader = map (BS.unpack . csvFieldContent) . firstRow
where firstRow (Left _: rest) = firstRow rest
firstRow (Right x: _) = x
parseCSV :: ByteString -> CSVResult
parseCSV = parseDSV True ','
parseDSV :: Bool -> Char -> ByteString -> CSVResult
parseDSV qn delim = validate
. groupBy ((==)`on`csvRowNum)
. lexCSV qn delim
validate :: [CSVRow] -> CSVResult
validate [] = [Left [NoData]]
validate xs@(x:_) = checkDuplicateHeaders x $ map (extractErrs (length x)) xs
extractErrs :: Int -> CSVRow -> Either [CSVError] CSVRow
extractErrs size row
| length row0 == size && null errs0 = Right row0
| length row0 == 1 && empty field0 = Left [blankLine field0]
| otherwise = Left (map convert errs0
++ validateColumns row0)
where
(row0,errs0) = partition isField row
(field0:_) = row0
isField (CSVField{}) = True
isField (CSVFieldError{}) = False
empty f@(CSVField{}) = BS.null (csvFieldContent f)
empty _ = False
convert err = FieldError {csvField = err}
validateColumns r =
if length r == size then []
else [ IncorrectRow{ csvRow = if null r then 0 else csvRowNum (head r)
, csvColsExpected = size
, csvColsActual = length r
, csvFields = r } ]
blankLine f = BlankLine{ csvRow = csvRowNum f
, csvColsExpected = size
, csvColsActual = 1
, csvField = f }
checkDuplicateHeaders :: CSVRow -> CSVResult -> CSVResult
checkDuplicateHeaders row result =
let headers = [ f | f@(CSVField{}) <- row ]
dups = deleteFirstsBy ((==)`on`csvFieldContent)
headers (nub headers)
n = length headers
in if null dups then result
else Left (map (\d-> DuplicateHeader
{ csvColsExpected = n
, csvHeaderSerial = csvColNum d
, csvDuplicate = BS.unpack (csvFieldContent d)})
dups)
: result
data CSVState = CSVState { tableRow, tableCol :: !Int
, textRow, textCol :: !Int }
deriving Show
incTableRow, incTableCol, incTextRow :: CSVState -> CSVState
incTableRow st = st { tableRow = tableRow st + 1 , tableCol = 1 }
incTableCol st = st { tableCol = tableCol st + 1 }
incTextRow st = st { textRow = textRow st + 1 , textCol = 1 }
incTextCol :: Int -> CSVState -> CSVState
incTextCol n st = st { textCol = textCol st + n }
here :: CSVState -> (Int,Int)
here st = (textRow st, textCol st)
lexCSV :: Bool -> Char -> ByteString -> [CSVField]
lexCSV qn delim =
getFields qn delim
(CSVState{tableRow=1,tableCol=1,textRow=1,textCol=1}) (1,1)
getFields :: Bool -> Char -> CSVState -> (Int, Int) -> ByteString -> [CSVField]
getFields q d state begin bs0
= case BS.uncons bs0 of
Nothing -> []
Just ('"', bs1) -> doStringFieldContent q d (incTextCol 1 state) begin
BS.empty bs1
_ ->
case BS.break interestingChar bs0 of
(fieldBs, bs1) ->
let field = mkField end begin fieldBs False
end = incTextCol (len1) $ state
state' = incTableCol $ incTextCol 2 end
stateNL = incTableRow . incTextRow $ state
len = fromIntegral $ BS.length fieldBs
in case BS.uncons bs1 of
Just (c,bs2)
| c==d -> field: getFields q d state' (here state') bs2
Just ('\r',bs2) ->
case BS.uncons bs2 of
Just ('\n',bs3)
-> field: getFields q d stateNL (here stateNL) bs3
_ -> field: getFields q d stateNL (here stateNL) bs2
Just ('\n',bs2) -> field: getFields q d stateNL (here stateNL) bs2
Just ('"', _) -> field:
mkError state' begin
"unexpected quote, resync at EOL":
getFields q d stateNL (here stateNL)
(BS.dropWhile (/='\n') bs1)
Just _ -> [mkError state' begin "XXX Can't happen"]
Nothing -> field: getFields q d stateNL (here stateNL) bs1
where interestingChar '\r' = True
interestingChar '\n' = True
interestingChar '"' = True
interestingChar c | c==d = True
interestingChar _ = False
doStringFieldContent :: Bool -> Char -> CSVState -> (Int, Int) -> ByteString
-> ByteString -> [CSVField]
doStringFieldContent q d state begin acc bs1
= case BS.break interestingCharInsideString bs1 of
(newBs, bs2) ->
let fieldBs = acc `BS.append` newBs
field = mkField end begin fieldBs True
end = incTextCol (len1) state
state' = incTableCol $ incTextCol 3 end
stateNL = incTableRow . incTextRow $ state
len = fromIntegral $ BS.length newBs
in case BS.uncons bs2 of
Just ('\r',bs3) ->
case BS.uncons bs3 of
Just ('\n',bs4) | q ->
doStringFieldContent q d (incTextRow end) begin
(fieldBs `BS.append` BS.singleton '\n') bs4
_ -> doStringFieldContent q d end begin
(fieldBs `BS.append` BS.singleton '\r') bs3
Just ('\n',bs3) | q ->
doStringFieldContent q d (incTextRow end) begin
(fieldBs `BS.append` BS.singleton '\n') bs3
Just ('\n',bs3) ->
field:
mkError end begin "Found newline within quoted field":
getFields q d stateNL (here stateNL) bs3
Just ('"', bs3) ->
case BS.uncons bs3 of
Just (c,bs4)
| c==d -> field: getFields q d state' (here state') bs4
Just ('\r',bs4) ->
case BS.uncons bs4 of
Just ('\n',bs5) ->
field: getFields q d stateNL (here stateNL) bs5
_ -> field: getFields q d stateNL (here stateNL) bs4
Just ('\n',bs4) -> field: getFields q d stateNL (here stateNL) bs4
Just ('"',bs4) ->
doStringFieldContent q d (incTextCol 3 end) begin
(fieldBs `BS.append` BS.singleton '"') bs4
Just _ -> field:
mkError state' begin "End-quote not followed by comma":
getFields q d state' (here state') bs3
Nothing -> field: getFields q d stateNL (here stateNL) bs3
Just _ -> [mkError state' begin "XXX Can't happen (string field)"]
Nothing -> field:
mkError state' begin "CSV data ends within a quoted string"
:[]
where interestingCharInsideString '\r' = True
interestingCharInsideString '\n' = True
interestingCharInsideString '"' = True
interestingCharInsideString _ = False
mkField :: CSVState -> (Int, Int) -> ByteString -> Bool -> CSVField
mkField st begin bs q = CSVField { csvRowNum = tableRow st
, csvColNum = tableCol st
, csvTextStart = begin
, csvTextEnd = (textRow st,textCol st)
, csvFieldContent = bs
, csvFieldQuoted = q }
mkError :: CSVState -> (Int, Int) -> String -> CSVField
mkError st begin e = CSVFieldError { csvRowNum = tableRow st
, csvColNum = tableCol st
, csvTextStart = begin
, csvTextEnd = (textRow st,textCol st)
, csvFieldError = e }
ppCSVError :: CSVError -> String
ppCSVError (err@IncorrectRow{}) =
"\nRow "++show (csvRow err)++" has wrong number of fields."++
"\n Expected "++show (csvColsExpected err)++" but got "++
show (csvColsActual err)++"."++
"\n The fields are:"++
indent 8 (concatMap ppCSVField (csvFields err))
ppCSVError (err@BlankLine{}) =
"\nRow "++show (csvRow err)++" is blank."++
"\n Expected "++show (csvColsExpected err)++" fields."
ppCSVError (err@FieldError{}) = ppCSVField (csvField err)
ppCSVError (err@DuplicateHeader{}) =
"\nThere are two (or more) identical column headers: "++
show (csvDuplicate err)++"."++
"\n Column number "++show (csvHeaderSerial err)
ppCSVError (NoData{}) =
"\nNo usable data (after accounting for any other errors)."
ppCSVField :: CSVField -> String
ppCSVField (f@CSVField{}) =
"\n"++BS.unpack (quoted (csvFieldQuoted f) (csvFieldContent f))++
"\nin row "++show (csvRowNum f)++" at column "++show (csvColNum f)++
" (textually from "++show (csvTextStart f)++" to "++
show (csvTextEnd f)++")"
ppCSVField (f@CSVFieldError{}) =
"\n"++csvFieldError f++
"\nin row "++show (csvRowNum f)++" at column "++show (csvColNum f)++
" (textually from "++show (csvTextStart f)++" to "++
show (csvTextEnd f)
ppCSVTable :: CSVTable -> ByteString
ppCSVTable = BS.unlines . map (BS.intercalate (BS.pack ",") . map ppField)
where ppField f = quoted (csvFieldQuoted f) (csvFieldContent f)
ppDSVTable :: Bool -> Char -> CSVTable -> ByteString
ppDSVTable nl d = BS.unlines . map (BS.intercalate (BS.pack [d]) . map ppField)
where ppField f = quoted (csvFieldQuoted f) (doNL $ csvFieldContent f)
doNL | nl = replaceNL
| otherwise = id
indent :: Int -> String -> String
indent n = unlines . map (replicate n ' ' ++) . lines
quoted :: Bool -> ByteString -> ByteString
quoted False s = s
quoted True s = BS.concat [BS.pack "\"", escape s, BS.pack"\""]
where escape s = let (good,next) = BS.span (/='"') s
in if BS.null next then good
else BS.concat [ good, BS.pack "\"\"", escape (BS.tail next) ]
replaceNL :: ByteString -> ByteString
replaceNL s = let (good,next) = BS.span (/='\n') s
in if BS.null next then good
else if BS.null good then replaceNL (BS.tail next)
else BS.concat [ good, BS.pack " ", replaceNL next ]
fromCSVTable :: CSVTable -> [[ByteString]]
fromCSVTable = map (map csvFieldContent)
toCSVTable :: [[ByteString]] -> ([CSVError], CSVTable)
toCSVTable [] = ([NoData], [])
toCSVTable rows@(r:_) = (\ (a,b)-> (concat a, b)) $
unzip (zipWith walk [1..] rows)
where
n = length r
walk :: Int -> [ByteString] -> ([CSVError], CSVRow)
walk rnum [] = ( [blank rnum]
, map (\c-> mkCSVField rnum c (BS.empty)) [1..n])
walk rnum cs = ( if length cs /= n then [bad rnum cs] else []
, zipWith (mkCSVField rnum) [1..n] cs )
blank rnum = BlankLine{ csvRow = rnum
, csvColsExpected = n
, csvColsActual = 0
, csvField = mkCSVField rnum 0 BS.empty
}
bad r cs = IncorrectRow{ csvRow = r
, csvColsExpected = n
, csvColsActual = length cs
, csvFields = zipWith (mkCSVField r) [1..] cs
}
selectFields :: [String] -> CSVTable -> Either [String] CSVTable
selectFields names table
| null table = Left names
| not (null missing) = Left missing
| otherwise = Right (map select table)
where
header = map (BS.unpack . csvFieldContent) (head table)
missing = filter (`notElem` header) names
reordering = map (fromJust . (\n-> elemIndex n header)) names
select fields = map (fields!!) reordering
expectFields :: [String] -> CSVTable -> Either [String] CSVTable
expectFields names table
| null table = Left ["CSV table is empty"]
| not (null missing) = Left (map ("CSV table is missing field: "++)
missing)
| header /= names = Left ["CSV columns are in the wrong order"
,"Expected: "++intercalate ", " names
,"Found: "++intercalate ", " header]
| otherwise = Right table
where
header = map (BS.unpack . csvFieldContent) (head table)
missing = filter (`notElem` header) names
joinCSV :: CSVTable -> CSVTable -> CSVTable
joinCSV = zipWith (++)
mkEmptyColumn :: String -> CSVTable
mkEmptyColumn header = [headField] : map ((:[]).emptyField) [2..]
where
headField = (emptyField 1) { csvFieldContent = BS.pack header
, csvFieldQuoted = True }
emptyField n = CSVField { csvRowNum = n
, csvColNum = 0
, csvTextStart = (0,0)
, csvTextEnd = (0,0)
, csvFieldContent = BS.empty
, csvFieldQuoted = False
}
mkCSVField :: Int -> Int -> ByteString -> CSVField
mkCSVField n c text =
CSVField { csvRowNum = n
, csvColNum = c
, csvTextStart = (0,0)
, csvTextEnd = ( fromIntegral
. BS.length
. BS.filter (=='\n')
$ text
, fromIntegral
. BS.length
. BS.takeWhile (/='\n')
. BS.reverse $ text )
, csvFieldContent = text
, csvFieldQuoted = any (`elem`"\",\n\r") (BS.unpack text)
}