module Text.CSV.Lazy.String
(
CSVTable
, CSVRow
, CSVField(..)
, CSVError(..)
, CSVResult
, csvErrors
, csvTable
, csvTableFull
, csvTableHeader
, parseCSV
, parseDSV
, ppCSVError
, ppCSVField
, ppCSVTable
, ppDSVTable
, fromCSVTable
, toCSVTable
, selectFields
, expectFields
, mkEmptyColumn
, joinCSV
, mkCSVField
) where
import Data.List (groupBy, partition, elemIndex, intercalate, takeWhile
,deleteFirstsBy, nub)
import Data.Function (on)
import Data.Maybe (fromJust)
type CSVTable = [CSVRow]
type CSVRow = [CSVField]
data CSVField = CSVField { csvRowNum :: !Int
, csvColNum :: !Int
, csvTextStart :: !(Int,Int)
, csvTextEnd :: !(Int,Int)
, csvFieldContent :: !String
, csvFieldQuoted :: !Bool }
| CSVFieldError { csvRowNum :: !Int
, csvColNum :: !Int
, csvTextStart :: !(Int,Int)
, csvTextEnd :: !(Int,Int)
, csvFieldError :: !String }
deriving (Eq,Show)
data CSVError = IncorrectRow { csvRow :: !Int
, csvColsExpected :: !Int
, csvColsActual :: !Int
, csvFields :: [CSVField] }
| BlankLine { csvRow :: !Int
, csvColsExpected :: !Int
, csvColsActual :: !Int
, csvField :: CSVField }
| FieldError { csvField :: CSVField }
| DuplicateHeader{ csvColsExpected :: !Int
, csvHeaderSerial :: !Int
, csvDuplicate :: !String }
| NoData
deriving (Eq,Show)
type CSVResult = [Either [CSVError] CSVRow]
csvTable :: CSVResult -> CSVTable
csvTable r = [ v | Right v <- r ]
csvErrors :: CSVResult -> [CSVError]
csvErrors r = concat [ v | Left v <- r ]
csvTableFull:: CSVResult -> CSVTable
csvTableFull = map beCareful . deduplicate
where beCareful (Right row) = row
beCareful (Left (r@IncorrectRow{}:_)) =
csvFields r ++
replicate (csvColsExpected r csvColsActual r)
(mkCSVField (csvRow r) 0 "")
beCareful (Left (r@BlankLine{}:_)) =
replicate (csvColsExpected r)
(mkCSVField (csvRow r) 0 "")
beCareful (Left (r@DuplicateHeader{}:_)) =
replicate (csvColsExpected r)
(mkCSVField 0 0 "")
beCareful (Left (FieldError{}:r)) = beCareful (Left r)
beCareful (Left (NoData:_)) = []
beCareful (Left []) = []
deduplicate (Left (errs@(DuplicateHeader{}:_)):Right heads:rows) =
Right (replaceInOrder errs (zip heads [0..]))
: rows
deduplicate rows = rows
replaceInOrder [] headers = map fst headers
replaceInOrder _ [] = []
replaceInOrder (d:dups) ((h,n):headers)
| csvHeaderSerial d == n = h{ csvFieldContent =
(csvDuplicate d++"_"++show n) }
: replaceInOrder dups headers
| otherwise = h: replaceInOrder (d:dups) headers
csvTableHeader :: CSVResult -> [String]
csvTableHeader = map csvFieldContent . firstRow
where firstRow (Left _: rest) = firstRow rest
firstRow (Right x: _) = x
parseCSV :: String -> CSVResult
parseCSV = parseDSV True ','
parseDSV :: Bool -> Char -> String -> CSVResult
parseDSV qn delim = validate
. groupBy ((==)`on`csvRowNum)
. lexCSV qn delim
validate :: [CSVRow] -> CSVResult
validate [] = [Left [NoData]]
validate xs@(x:_) = checkDuplicateHeaders x $ map (extractErrs (length x)) xs
extractErrs :: Int -> CSVRow -> Either [CSVError] CSVRow
extractErrs size row
| length row0 == size && null errs0 = Right row0
| length row0 == 1 && empty field0 = Left [blankLine field0]
| otherwise = Left (map convert errs0
++ validateColumns row0)
where
(row0,errs0) = partition isField row
(field0:_) = row0
isField (CSVField{}) = True
isField (CSVFieldError{}) = False
empty f@(CSVField{}) = null (csvFieldContent f)
empty _ = False
convert err = FieldError {csvField = err}
validateColumns r =
if length r == size then []
else [ IncorrectRow{ csvRow = if null r then 0 else csvRowNum (head r)
, csvColsExpected = size
, csvColsActual = length r
, csvFields = r } ]
blankLine f = BlankLine{ csvRow = csvRowNum f
, csvColsExpected = size
, csvColsActual = 1
, csvField = f }
checkDuplicateHeaders :: CSVRow -> CSVResult -> CSVResult
checkDuplicateHeaders row result =
let headers = [ f | f@(CSVField{}) <- row ]
dups = deleteFirstsBy ((==)`on`csvFieldContent)
headers (nub headers)
n = length headers
in if null dups then result
else Left (map (\d-> DuplicateHeader
{ csvColsExpected = n
, csvHeaderSerial = csvColNum d
, csvDuplicate = csvFieldContent d })
dups)
: result
data CSVState = CSVState { tableRow, tableCol :: !Int
, textRow, textCol :: !Int }
incTableRow, incTableCol, incTextRow, incTextCol :: CSVState -> CSVState
incTableRow st = st { tableRow = tableRow st + 1 }
incTableCol st = st { tableCol = tableCol st + 1 }
incTextRow st = st { textRow = textRow st + 1 }
incTextCol st = st { textCol = textCol st + 1 }
lexCSV :: Bool -> Char -> [Char] -> [CSVField]
lexCSV quotedNewline delim =
simple CSVState{tableRow=1,tableCol=1,textRow=1,textCol=1} (1,1) []
where
simple :: CSVState -> (Int,Int) -> String -> String -> [CSVField]
simple _ _ [] [] = []
simple s begin acc [] = mkField s begin acc False : []
simple s begin acc (c:cs)
| not (interesting c) = simple (incTextCol $! s) begin (c:acc) cs
simple s begin acc (c:'"':cs)
| c==delim = mkField s begin acc False :
string s' (textRow s',textCol s') [] cs
where s' = incTextCol . incTextCol .
incTableCol $! s
simple s begin acc (c:cs)
| c==delim = mkField s begin acc False :
simple s' (textRow s',textCol s') [] cs
where s' = incTableCol . incTextCol $! s
simple s begin acc ('\r':'\n':cs)
= mkField s begin acc False :
simple s' (textRow s',1) [] cs
where s' = incTableRow . incTextRow $!
s {tableCol=1, textCol=1}
simple s begin acc ('\n' :cs) = mkField s begin acc False :
simple s' (textRow s',1) [] cs
where s' = incTableRow . incTextRow $!
s {tableCol=1, textCol=1}
simple s begin acc ('\r' :cs) = mkField s begin acc False :
simple s' (textRow s',1) [] cs
where s' = incTableRow . incTextRow $!
s {tableCol=1, textCol=1}
simple s begin [] ('"' :cs) = string (incTextCol $! s) begin [] cs
simple s begin acc ('"' :cs) = mkError s begin
"Start-quote not next to comma":
string (incTextCol $! s) begin acc cs
string :: CSVState -> (Int,Int) -> String -> String -> [CSVField]
string s begin [] [] = mkError s begin "Data ends at start-quote":
[]
string s begin acc [] = mkError s begin "Data ends in quoted field":
[]
string s begin acc (c:cs)
| not (interestingInString c) = string (incTextCol $! s) begin (c:acc) cs
string s begin acc ('"':'"':cs) = string s' begin ('"':acc) cs
where s' = incTextCol . incTextCol $! s
string s begin acc ('"':c:'"':cs)
| c==delim = mkField s begin acc True :
string s' (textRow s',textCol s') [] cs
where s' = incTextCol . incTextCol .
incTextCol . incTableCol $! s
string s begin acc ('"':c:cs)
| c==delim = mkField s begin acc True :
simple s' (textRow s',textCol s') [] cs
where s' = incTextCol . incTextCol .
incTableCol $! s
string s begin acc ('"':'\n':cs)= mkField s begin acc True :
simple s' (textRow s',1) [] cs
where s' = incTableRow . incTextRow $!
s {tableCol=1, textCol=1}
string s begin acc ('"':'\r':'\n':cs)
= mkField s begin acc True :
simple s' (textRow s',1) [] cs
where s' = incTableRow . incTextRow $!
s {tableCol=1, textCol=1}
string s begin acc ('"':[]) = mkField s begin acc True : []
string s begin acc ('"':cs) = mkError s begin
"End-quote not followed by comma":
simple (incTextCol $! s) begin acc cs
string s begin acc ('\r':'\n':cs)
| quotedNewline = string s' begin ('\n':acc) cs
| otherwise = mkError s begin
"Found newline within quoted field":
simple s'' (textRow s'',textCol s'') [] cs
where s' = incTextRow $! s {textCol=1}
s'' = incTableRow . incTextRow $!
s {textCol=1, tableCol=1}
string s begin acc ('\n' :cs)
| quotedNewline = string s' begin ('\n':acc) cs
| otherwise = mkError s begin
"Found newline within quoted field":
simple s'' (textRow s'',textCol s'') [] cs
where s' = incTextRow $! s {textCol=1}
s'' = incTableRow . incTextRow $!
s {textCol=1, tableCol=1}
interesting :: Char -> Bool
interesting '\n' = True
interesting '\r' = True
interesting '"' = True
interesting c = c==delim
interestingInString :: Char -> Bool
interestingInString '\n' = True
interestingInString '\r' = True
interestingInString '"' = True
interestingInString _ = False
mkField st begin f q = CSVField { csvRowNum = tableRow st
, csvColNum = tableCol st
, csvTextStart = begin
, csvTextEnd = (textRow st,textCol st)
, csvFieldContent = reverse f
, csvFieldQuoted = q }
mkError st begin e = CSVFieldError { csvRowNum = tableRow st
, csvColNum = tableCol st
, csvTextStart = begin
, csvTextEnd = (textRow st,textCol st)
, csvFieldError = e }
ppCSVError :: CSVError -> String
ppCSVError (err@IncorrectRow{}) =
"\nRow "++show (csvRow err)++" has wrong number of fields."++
"\n Expected "++show (csvColsExpected err)++" but got "++
show (csvColsActual err)++"."++
"\n The fields are:"++
indent 8 (concatMap ppCSVField (csvFields err))
ppCSVError (err@BlankLine{}) =
"\nRow "++show (csvRow err)++" is blank."++
"\n Expected "++show (csvColsExpected err)++" fields."
ppCSVError (err@FieldError{}) = ppCSVField (csvField err)
ppCSVError (err@DuplicateHeader{}) =
"\nThere are two (or more) identical column headers: "++
show (csvDuplicate err)++"."++
"\n Column number "++show (csvHeaderSerial err)
ppCSVError (err@NoData{}) =
"\nNo usable data (after accounting for any other errors)."
ppCSVField :: CSVField -> String
ppCSVField (f@CSVField{}) =
"\n"++quoted (csvFieldQuoted f) (csvFieldContent f)++
"\nin row "++show (csvRowNum f)++" at column "++show (csvColNum f)++
" (textually from "++show (csvTextStart f)++" to "++
show (csvTextEnd f)++")"
ppCSVField (f@CSVFieldError{}) =
"\n"++csvFieldError f++
"\nin row "++show (csvRowNum f)++" at column "++show (csvColNum f)++
" (textually from "++show (csvTextStart f)++" to "++
show (csvTextEnd f)++")"
ppCSVTable :: CSVTable -> String
ppCSVTable = unlines . map (intercalate "," . map ppField)
where ppField f = quoted (csvFieldQuoted f) (csvFieldContent f)
ppDSVTable :: Bool -> Char -> CSVTable -> String
ppDSVTable nl delim = unlines . map (intercalate [delim] . map ppField)
where ppField f = quoted (csvFieldQuoted f) (doNL $ csvFieldContent f)
doNL | nl = replaceNL
| otherwise = id
indent :: Int -> String -> String
indent n = unlines . map (replicate n ' ' ++) . lines
quoted :: Bool -> String -> String
quoted False s = s
quoted True s = '"': escape s ++"\""
where escape ('"':cs) = '"':'"': escape cs
escape (c:cs) = c: escape cs
escape [] = []
replaceNL :: String -> String
replaceNL ('\n':s) = ' ':replaceNL s
replaceNL (c:s) = c: replaceNL s
replaceNL [] = []
fromCSVTable :: CSVTable -> [[String]]
fromCSVTable = map (map csvFieldContent)
toCSVTable :: [[String]] -> ([CSVError], CSVTable)
toCSVTable [] = ([NoData], [])
toCSVTable rows@(r:_) = (\ (a,b)-> (concat a, b)) $
unzip (zipWith walk [1..] rows)
where
n = length r
walk :: Int -> [String] -> ([CSVError], CSVRow)
walk rnum [] = ( [blank rnum]
, map (\c-> mkCSVField rnum c "") [1..n])
walk rnum cs = ( if length cs /= n then [bad rnum cs] else []
, zipWith (mkCSVField rnum) [1..n] cs )
blank rnum = BlankLine{ csvRow = rnum
, csvColsExpected = n
, csvColsActual = 0
, csvField = mkCSVField rnum 0 ""
}
bad r cs = IncorrectRow{ csvRow = r
, csvColsExpected = n
, csvColsActual = length cs
, csvFields = zipWith (mkCSVField r) [1..] cs
}
selectFields :: [String] -> CSVTable -> Either [String] CSVTable
selectFields names table
| null table = Left names
| not (null missing) = Left missing
| otherwise = Right (map select table)
where
header = map csvFieldContent (head table)
missing = filter (`notElem` header) names
reordering = map (fromJust . (\n-> elemIndex n header)) names
select fields = map (fields!!) reordering
expectFields :: [String] -> CSVTable -> Either [String] CSVTable
expectFields names table
| null table = Left ["CSV table is empty"]
| not (null missing) = Left (map ("CSV table is missing field: "++)
missing)
| header /= names = Left ["CSV columns are in the wrong order"
,"Expected: "++intercalate ", " names
,"Found: "++intercalate ", " header]
| otherwise = Right table
where
header = map csvFieldContent (head table)
missing = filter (`notElem` header) names
joinCSV :: CSVTable -> CSVTable -> CSVTable
joinCSV = zipWith (++)
mkEmptyColumn :: String -> CSVTable
mkEmptyColumn header = [mkCSVField 1 0 header] :
map (\n-> [mkCSVField n 0 ""]) [2..]
mkCSVField :: Int -> Int -> String -> CSVField
mkCSVField n c text =
CSVField { csvRowNum = n
, csvColNum = c
, csvTextStart = (0,0)
, csvTextEnd = (length (filter (=='\n') text)
,length . takeWhile (/='\n')
. reverse $ text )
, csvFieldContent = text
, csvFieldQuoted = any (`elem`"\",\n\r") text
}