{-| Module : Data.Svfactor.Parse.Internal Copyright : (C) CSIRO 2017-2018 License : BSD3 Maintainer : George Wilson Stability : experimental Portability : non-portable This module contains internal implementation details of svfactor's parser. As the Internal name suggests, this file is exempt from the PVP. Depend on this module at your own risk! -} module Data.Svfactor.Parse.Internal ( separatedValues , header , field , singleQuotedField , doubleQuotedField , unquotedField , spaced , spacedField , record , records , ending ) where import Control.Applicative (Applicative ((<*>), pure), (*>), (<*), (<$), Alternative ((<|>), empty), optional) import Control.Lens (review, view) import Data.CharSet (CharSet, (\\)) import qualified Data.CharSet as CharSet (fromList, insert, singleton) import Data.Functor (($>), (<$>), void) import Data.List.NonEmpty (NonEmpty ((:|))) import Data.Maybe (fromMaybe) import Data.Semigroup ((<>)) import qualified Data.Vector as V import Text.Parser.Char (CharParsing, char, notChar, noneOfSet, oneOfSet, string) import Text.Parser.Combinators (between, choice, eof, many, notFollowedBy, sepEndBy, try) import Data.Svfactor.Syntax.Sv (Sv (Sv), Header, mkHeader, noHeader, Headedness (Unheaded, Headed), headedness, Separator) import Data.Svfactor.Syntax.Field (Field (Unquoted, Quoted)) import Data.Svfactor.Syntax.Record (Record (Record), Records (Records, EmptyRecords)) import Data.Svfactor.Parse.Options (ParseOptions, separator, endOnBlankLine, encodeString) import Data.Svfactor.Vector.NonEmpty as V import Data.Svfactor.Text.Escape (Unescaped (Unescaped)) import Data.Svfactor.Text.Newline (Newline (CR, CRLF, LF)) import Data.Svfactor.Text.Space (HorizontalSpace (Space, Tab), Spaces, Spaced, betwixt) import Data.Svfactor.Text.Quote (Quote (SingleQuote, DoubleQuote), quoteChar) -- | This function is in newer versions of the parsers package, but in -- order to maintain compatibility with older versions I've left it here. sepEndByNonEmpty :: Alternative m => m a -> m sep -> m (NonEmpty a) sepEndByNonEmpty p sep = (:|) <$> p <*> ((sep *> sepEndBy p sep) <|> pure []) -- | Parse a field surrounded by single quotes singleQuotedField :: CharParsing m => (String -> s) -> m (Field s) singleQuotedField = quotedField SingleQuote -- | Parse a field surrounded by double quotes doubleQuotedField :: CharParsing m => (String -> s) -> m (Field s) doubleQuotedField = quotedField DoubleQuote -- | Given a quote, parse its escaped form (which is it repeated twice) escapeQuote :: CharParsing m => Quote -> m Char escapeQuote q = let c = review quoteChar q in try (string (two c)) $> c two :: a -> [a] two a = [a,a] quotedField :: CharParsing m => Quote -> (String -> s) -> m (Field s) quotedField quote str = let q = review quoteChar quote c = char q cc = escapeQuote quote in Quoted quote . Unescaped . str <$> between c c (many (cc <|> notChar q)) -- | Parse a field that is not surrounded by quotes unquotedField :: CharParsing m => Separator -> (String -> s) -> m (Field s) unquotedField sep str = let spaceSet = CharSet.fromList " \t" \\ CharSet.singleton sep oneSpace = oneOfSet spaceSet nonSpaceFieldChar = noneOfSet (newlineOr sep <> spaceSet) terminalWhitespace = many oneSpace *> fieldEnder fieldEnder = void (oneOfSet (newlineOr sep)) <|> eof in Unquoted . str <$> many ( nonSpaceFieldChar <|> (notFollowedBy (try terminalWhitespace)) *> oneSpace ) -- | Parse a field, be it quoted or unquoted field :: CharParsing m => Separator -> (String -> s) -> m (Field s) field sep str = choice [ singleQuotedField str , doubleQuotedField str , unquotedField sep str ] -- | Parse a field with its surrounding spacing spacedField :: CharParsing m => Separator -> (String -> s) -> m (Spaced (Field s)) spacedField sep str = spaced sep (field sep str) newlineOr :: Char -> CharSet newlineOr c = CharSet.insert c newlines newlines :: CharSet newlines = CharSet.fromList "\r\n" newline :: CharParsing m => m Newline newline = CRLF <$ try (string "\r\n") <|> CR <$ char '\r' <|> LF <$ char '\n' space :: CharParsing m => Separator -> m HorizontalSpace space sep = let removeIfSep c s = if sep == c then empty else char c $> s in removeIfSep ' ' Space <|> removeIfSep '\t' Tab spaces :: CharParsing m => Separator -> m Spaces spaces = fmap V.fromList . many . space -- | Combinator to parse some data surrounded by spaces spaced :: CharParsing m => Separator -> m a -> m (Spaced a) spaced sep p = betwixt <$> spaces sep <*> p <*> spaces sep -- | Parse an entire record, or "row" record :: CharParsing m => ParseOptions s -> m (Record s) record opts = let sep = view separator opts str = view encodeString opts in Record . V.fromNel <$> (spacedField sep str `sepEndByNonEmpty` char sep) -- | Parse many records, or "rows" records :: CharParsing m => ParseOptions s -> m (Records s) records opts = let manyV = fmap V.fromList . many in fromMaybe EmptyRecords <$> optional (Records <$> firstRecord opts <*> manyV (subsequentRecord opts)) firstRecord :: CharParsing m => ParseOptions s -> m (Record s) firstRecord opts = notFollowedBy (try (ending opts)) *> record opts subsequentRecord :: CharParsing m => ParseOptions s -> m (Newline, Record s) subsequentRecord opts = (,) <$> (notFollowedBy (try (ending opts)) *> newline) -- ((if view endOnBlankLine opts then (void newline) else empty) <|> eof)) <*> record opts -- | Parse zero or many newlines ending :: CharParsing m => ParseOptions s -> m [Newline] ending opts = let end = if view endOnBlankLine opts then pure [] else many newline in [] <$ eof <|> try (pure <$> newline <* eof) <|> (:) <$> newline <*> ((:) <$> newline <*> end) -- | Maybe parse the header row of a CSV file, depending on the given 'Headedness' header :: CharParsing m => ParseOptions s -> m (Maybe (Header s)) header opts = case view headedness opts of Unheaded -> pure noHeader Headed -> mkHeader <$> record opts <*> newline -- | Parse an 'Sv' separatedValues :: CharParsing m => ParseOptions s -> m (Sv s) separatedValues opts = Sv (view separator opts) <$> header opts <*> records opts <*> ending opts