The parser uses a separate lexer for two reasons:
1. sql syntax is very awkward to parse, the separate lexer makes it
easier to handle this in most places (in some places it makes it
harder or impossible, the fix is to switch to something better than
parsec)
2. using a separate lexer gives a huge speed boost because it reduces
backtracking. (We could get this by making the parsing code a lot more
complex also.)
3. we can test the lexer relatively exhaustively, then even when we
don't do nearly as comprehensive testing on the syntax level, we still
have a relatively high assurance of the low level of bugs. This is
much more difficult to get parity with when testing the syntax parser
directly without the separately testing lexing stage.
>
> {-# LANGUAGE TupleSections #-}
> module Language.SQL.SimpleSQL.Lex
> (Token(..)
> ,lexSQL
> ,prettyToken
> ,prettyTokens
> ,ParseError(..)
> ,tokenListWillPrintAndLex
> ,ansi2011
> ) where
> import Language.SQL.SimpleSQL.Dialect
> import Text.Parsec (option,string,manyTill,anyChar
> ,try,string,many1,oneOf,digit,(<|>),choice,char,eof
> ,many,runParser,lookAhead,satisfy
> ,setPosition,getPosition
> ,setSourceColumn,setSourceLine
> ,sourceName, setSourceName
> ,sourceLine, sourceColumn
> ,notFollowedBy)
> import Language.SQL.SimpleSQL.Combinators
> import Language.SQL.SimpleSQL.Errors
> import Control.Applicative hiding ((<|>), many)
> import Data.Char
> import Control.Monad
> import Prelude hiding (takeWhile)
> import Text.Parsec.String (Parser)
> import Data.Maybe
>
> data Token
>
>
>
>
>
> = Symbol String
>
>
>
>
> | Identifier (Maybe (String,String)) String
>
>
>
> | PrefixedVariable Char String
>
>
> | PositionalArg Int
>
>
>
>
>
> | SqlString String String String
>
>
>
> | SqlNumber String
>
>
> | Whitespace String
>
>
>
>
>
> | LineComment String
>
>
> | BlockComment String
>
> deriving (Eq,Show)
>
>
> prettyToken :: Dialect -> Token -> String
> prettyToken _ (Symbol s) = s
> prettyToken _ (Identifier Nothing t) = t
> prettyToken _ (Identifier (Just (q1,q2)) t) = q1 ++ t ++ q2
> prettyToken _ (PrefixedVariable c p) = c:p
> prettyToken _ (PositionalArg p) = '$':show p
> prettyToken _ (SqlString s e t) = s ++ t ++ e
> prettyToken _ (SqlNumber r) = r
> prettyToken _ (Whitespace t) = t
> prettyToken _ (LineComment l) = l
> prettyToken _ (BlockComment c) = c
> prettyTokens :: Dialect -> [Token] -> String
> prettyTokens d ts = concat $ map (prettyToken d) ts
TODO: try to make all parsers applicative only
>
> lexSQL :: Dialect
>
> -> FilePath
>
> -> Maybe (Int,Int)
>
>
> -> String
>
> -> Either ParseError [((String,Int,Int),Token)]
> lexSQL dialect fn' p src =
> let (l',c') = fromMaybe (1,1) p
> in either (Left . convParseError src) Right
> $ runParser (setPos (fn',l',c') *> many (sqlToken dialect) <* eof) () fn' src
> where
> setPos (fn,l,c) = do
> fmap (flip setSourceName fn
> . flip setSourceLine l
> . flip setSourceColumn c) getPosition
> >>= setPosition
>
> sqlToken :: Dialect -> Parser ((String,Int,Int),Token)
> sqlToken d = do
> p' <- getPosition
> let p = (sourceName p',sourceLine p', sourceColumn p')
The order of parsers is important: strings and quoted identifiers can
start out looking like normal identifiers, so we try to parse these
first and use a little bit of try. Line and block comments start like
symbols, so we try these before symbol. Numbers can start with a . so
this is also tried before symbol (a .1 will be parsed as a number, but
. otherwise will be parsed as a symbol).
> (p,) <$> choice [sqlString d
> ,identifier d
> ,lineComment d
> ,blockComment d
> ,sqlNumber d
> ,positionalArg d
> ,dontParseEndBlockComment d
> ,prefixedVariable d
> ,symbol d
> ,sqlWhitespace d]
Parses identifiers:
simple_identifier_23
u&"unicode quoted identifier"
"quoted identifier"
"quoted identifier "" with double quote char"
`mysql quoted identifier`
> identifier :: Dialect -> Parser Token
> identifier d =
> choice
> [quotedIden
> ,unicodeQuotedIden
> ,regularIden
> ,guard (diBackquotedIden d) >> mySqlQuotedIden
> ,guard (diSquareBracketQuotedIden d) >> sqlServerQuotedIden
> ]
> where
> regularIden = Identifier Nothing <$> identifierString
> quotedIden = Identifier (Just ("\"","\"")) <$> qidenPart
> mySqlQuotedIden = Identifier (Just ("`","`"))
> <$> (char '`' *> takeWhile1 (/='`') <* char '`')
> sqlServerQuotedIden = Identifier (Just ("[","]"))
> <$> (char '[' *> takeWhile1 (`notElem` "[]") <* char ']')
>
>
> unicodeQuotedIden = Identifier
> <$> (f <$> try (oneOf "uU" <* string "&"))
> <*> qidenPart
> where f x = Just (x: "&\"", "\"")
> qidenPart = char '"' *> qidenSuffix ""
> qidenSuffix t = do
> s <- takeTill (=='"')
> void $ char '"'
>
> choice [do
> void $ char '"'
> qidenSuffix $ concat [t,s,"\"\""]
> ,return $ concat [t,s]]
This parses a valid identifier without quotes.
> identifierString :: Parser String
> identifierString =
> startsWith (\c -> c == '_' || isAlpha c) isIdentifierChar
this can be moved to the dialect at some point
> isIdentifierChar :: Char -> Bool
> isIdentifierChar c = c == '_' || isAlphaNum c
use try because : and @ can be part of other things also
> prefixedVariable :: Dialect -> Parser Token
> prefixedVariable d = try $ choice
> [PrefixedVariable <$> char ':' <*> identifierString
> ,guard (diAtIdentifier d) >>
> PrefixedVariable <$> char '@' <*> identifierString
> ,guard (diHashIdentifier d) >>
> PrefixedVariable <$> char '#' <*> identifierString
> ]
> positionalArg :: Dialect -> Parser Token
> positionalArg d =
> guard (diPositionalArg d) >>
>
> PositionalArg <$> try (char '$' *> (read <$> many1 digit))
Parse a SQL string. Examples:
'basic string'
'string with '' a quote'
n'international text'
b'binary string'
x'hexidecimal string'
> sqlString :: Dialect -> Parser Token
> sqlString d = dollarString <|> csString <|> normalString
> where
> dollarString = do
> guard $ diDollarString d
>
>
> delim <- (\x -> concat ["$",x,"$"])
> <$> try (char '$' *> option "" identifierString <* char '$')
> SqlString delim delim <$> manyTill anyChar (try $ string delim)
> normalString = SqlString "'" "'" <$> (char '\'' *> normalStringSuffix False "")
> normalStringSuffix allowBackslash t = do
> s <- takeTill $ if allowBackslash
> then (`elem` "'\\")
> else (== '\'')
>
> choice [do
> ctu <- choice ["''" <$ try (string "''")
> ,"\\'" <$ string "\\'"
> ,"\\" <$ char '\\']
> normalStringSuffix allowBackslash $ concat [t,s,ctu]
> ,concat [t,s] <$ char '\'']
>
>
>
>
>
>
>
>
> csString
> | diEString d =
> choice [SqlString <$> try (string "e'" <|> string "E'")
> <*> return "'" <*> normalStringSuffix True ""
> ,csString']
> | otherwise = csString'
> csString' = SqlString
> <$> try cs
> <*> return "'"
> <*> normalStringSuffix False ""
> csPrefixes = "nNbBxX"
> cs = choice $ (map (\x -> string ([x] ++ "'")) csPrefixes)
> ++ [string "u&'"
> ,string "U&'"]
numbers
digits
digits.[digits][e[+-]digits]
[digits].digits[e[+-]digits]
digitse[+-]digits
where digits is one or more decimal digits (0 through 9). At least one
digit must be before or after the decimal point, if one is used. At
least one digit must follow the exponent marker (e), if one is
present. There cannot be any spaces or other characters embedded in
the constant. Note that any leading plus or minus sign is not actually
considered part of the constant; it is an operator applied to the
constant.
> sqlNumber :: Dialect -> Parser Token
> sqlNumber d =
> SqlNumber <$> completeNumber
>
> <* choice [
> guard (diPostgresSymbols d)
> *> (void $ lookAhead $ try $ string "..")
> <|> void (notFollowedBy (oneOf "eE."))
> ,notFollowedBy (oneOf "eE.")
> ]
> where
> completeNumber =
> (int <??> (pp dot <??.> pp int)
>
>
>
>
> <|> try ((++) <$> dot <*> int))
> <??> pp expon
> int = many1 digit
>
>
>
> dot = let p = string "." <* notFollowedBy (char '.')
> in if diPostgresSymbols d
> then try p
> else p
> expon = (:) <$> oneOf "eE" <*> sInt
> sInt = (++) <$> option "" (string "+" <|> string "-") <*> int
> pp = (<$$> (++))
Symbols
A symbol is an operator, or one of the misc symbols which include:
. .. := : :: ( ) ? ; , { } (for odbc)
The postgresql operator syntax allows a huge range of operators
compared with ansi and other dialects
> symbol :: Dialect -> Parser Token
> symbol d = Symbol <$> choice (concat
> [dots
> ,if diPostgresSymbols d
> then postgresExtraSymbols
> else []
> ,miscSymbol
> ,if diOdbc d then odbcSymbol else []
> ,if diPostgresSymbols d
> then generalizedPostgresqlOperator
> else basicAnsiOps
> ])
> where
> dots = [many1 (char '.')]
> odbcSymbol = [string "{", string "}"]
> postgresExtraSymbols =
> [try (string ":=")
>
> ,try (string "::" <* notFollowedBy (char ':'))
> ,try (string ":" <* notFollowedBy (char ':'))]
> miscSymbol = map (string . (:[])) $
> case () of
> _ | diSqlServerSymbols d -> ",;():?"
> | diPostgresSymbols d -> "[],;()"
> | otherwise -> "[],;():?"
try is used because most of the first characters of the two character
symbols can also be part of a single character symbol
> basicAnsiOps = map (try . string) [">=","<=","!=","<>"]
> ++ map (string . (:[])) "+-^*/%~&<>="
> ++ pipes
> pipes =
>
>
> [char '|' *>
> choice ["||" <$ char '|' <* notFollowedBy (char '|')
> ,return "|"]]
postgresql generalized operators
this includes the custom operators that postgres supports,
plus all the standard operators which could be custom operators
according to their grammar
rules
An operator name is a sequence of up to NAMEDATALEN-1 (63 by default) characters from the following list:
+ - * / < > = ~ ! @ # % ^ & | ` ?
There are a few restrictions on operator names, however:
A multiple-character operator name cannot end in + or -, unless the name also contains at least one of these characters:
~ ! @ # % ^ & | ` ?
which allows the last character of a multi character symbol to be + or
-
> generalizedPostgresqlOperator :: [Parser String]
> generalizedPostgresqlOperator = [singlePlusMinus,opMoreChars]
> where
> allOpSymbols = "+-*/<>=~!@#%^&|`?"
>
>
> exceptionOpSymbols = "~!@#%^&|`?"
>
> singlePlusMinus = try $ do
> c <- oneOf "+-"
> notFollowedBy $ oneOf allOpSymbols
> return [c]
>
>
>
> moreOpCharsException = do
> c <- oneOf (filter (`notElem` "-/*") allOpSymbols)
>
>
> <|> try (char '/' <* notFollowedBy (char '*'))
> <|> try (char '-' <* notFollowedBy (char '-'))
>
>
> <|> try (char '*' <* notFollowedBy (char '/'))
> (c:) <$> option [] moreOpCharsException
> opMoreChars = choice
> [
> (:)
> <$> oneOf exceptionOpSymbols
> <*> option [] moreOpCharsException
> ,(:)
> <$> (
> try (char '+' <* lookAhead (oneOf allOpSymbols))
> <|>
>
> try (char '-'
> <* notFollowedBy (char '-')
> <* lookAhead (oneOf allOpSymbols))
> <|>
> try (char '/' <* notFollowedBy (char '*'))
> <|>
> try (char '*' <* notFollowedBy (char '/'))
> <|>
> oneOf "<>=")
> <*> option [] opMoreChars
> ]
> sqlWhitespace :: Dialect -> Parser Token
> sqlWhitespace _ = Whitespace <$> many1 (satisfy isSpace)
> lineComment :: Dialect -> Parser Token
> lineComment _ =
> (\s -> LineComment $ concat ["--",s]) <$>
>
>
> (try (string "--") *> (
>
> conc <$> manyTill anyChar (lookAhead lineCommentEnd) <*> lineCommentEnd))
> where
> conc a Nothing = a
> conc a (Just b) = a ++ b
> lineCommentEnd =
> Just "\n" <$ char '\n'
> <|> Nothing <$ eof
Try is used in the block comment for the two symbol bits because we
want to backtrack if we read the first symbol but the second symbol
isn't there.
> blockComment :: Dialect -> Parser Token
> blockComment _ =
> (\s -> BlockComment $ concat ["/*",s]) <$>
> (try (string "/*") *> commentSuffix 0)
> where
> commentSuffix :: Int -> Parser String
> commentSuffix n = do
>
> x <- takeWhile (\e -> e /= '/' && e /= '*')
> choice [
>
> try (string "*/") *> let t = concat [x,"*/"]
> in if n == 0
> then return t
> else (\s -> concat [t,s]) <$> commentSuffix (n - 1)
>
> ,try (string "/*") *> ((\s -> concat [x,"/*",s]) <$> commentSuffix (n + 1))
>
> ,(\c s -> x ++ [c] ++ s) <$> anyChar <*> commentSuffix n]
This is to improve user experience: provide an error if we see */
outside a comment. This could potentially break postgres ops with */
in them (which is a stupid thing to do). In other cases, the user
should write * / instead (I can't think of any cases when this would
be valid syntax though).
> dontParseEndBlockComment :: Dialect -> Parser Token
> dontParseEndBlockComment _ =
>
> try (string "*/") *> fail "comment end without comment start"
Some helper combinators
> startsWith :: (Char -> Bool) -> (Char -> Bool) -> Parser String
> startsWith p ps = do
> c <- satisfy p
> choice [(:) c <$> (takeWhile1 ps)
> ,return [c]]
> takeWhile1 :: (Char -> Bool) -> Parser String
> takeWhile1 p = many1 (satisfy p)
> takeWhile :: (Char -> Bool) -> Parser String
> takeWhile p = many (satisfy p)
> takeTill :: (Char -> Bool) -> Parser String
> takeTill p = manyTill anyChar (peekSatisfy p)
> peekSatisfy :: (Char -> Bool) -> Parser ()
> peekSatisfy p = void $ lookAhead (satisfy p)
This utility function will accurately report if the two tokens are
pretty printed, if they should lex back to the same two tokens. This
function is used in testing (and can be used in other places), and
must not be implemented by actually trying to print both tokens and
then lex them back from a single string (because then we would have
the risk of thinking two tokens cannot be together when there is bug
in the lexer, which the testing is supposed to find).
maybe do some quick checking to make sure this function only gives
true negatives: check pairs which return false actually fail to lex or
give different symbols in return: could use quickcheck for this
a good sanity test for this function is to change it to always return
true, then check that the automated tests return the same number of
successes. I don't think it succeeds this test at the moment
>
>
>
> tokenListWillPrintAndLex :: Dialect -> [Token] -> Bool
> tokenListWillPrintAndLex _ [] = True
> tokenListWillPrintAndLex _ [_] = True
> tokenListWillPrintAndLex d (a:b:xs) =
> tokensWillPrintAndLex d a b && tokenListWillPrintAndLex d (b:xs)
> tokensWillPrintAndLex :: Dialect -> Token -> Token -> Bool
> tokensWillPrintAndLex d a b
a : followed by an identifier character will look like a host param
followed by = or : makes a different symbol
> | Symbol ":" <- a
> , checkFirstBChar (\x -> isIdentifierChar x || x `elem` ":=") = False
two symbols next to eachother will fail if the symbols can combine and
(possibly just the prefix) look like a different symbol
> | diPostgresSymbols d
> , Symbol a' <- a
> , Symbol b' <- b
> , b' `notElem` ["+", "-"] || or (map (`elem` a') "~!@#%^&|`?") = False
check two adjacent symbols in non postgres where the combination
possibilities are much more limited. This is ansi behaviour, it might
be different when the other dialects are done properly
> | Symbol a' <- a
> , Symbol b' <- b
> , (a',b') `elem` [("<",">")
> ,("<","=")
> ,(">","=")
> ,("!","=")
> ,("|","|")
> ,("||","|")
> ,("|","||")
> ,("||","||")
> ,("<",">=")
> ] = False
two whitespaces will be combined
> | Whitespace {} <- a
> , Whitespace {} <- b = False
line comment without a newline at the end will eat the next token
> | LineComment {} <- a
> , checkLastAChar (/='\n') = False
check the last character of the first token and the first character of
the second token forming a comment start or end symbol
> | let f '-' '-' = True
> f '/' '*' = True
> f '*' '/' = True
> f _ _ = False
> in checkBorderChars f = False
a symbol will absorb a following .
TODO: not 100% on this always being bad
> | Symbol {} <- a
> , checkFirstBChar (=='.') = False
cannot follow a symbol ending in : with another token starting with :
> | let f ':' ':' = True
> f _ _ = False
> in checkBorderChars f = False
unquoted identifier followed by an identifier letter
> | Identifier Nothing _ <- a
> , checkFirstBChar isIdentifierChar = False
a quoted identifier using ", followed by a " will fail
> | Identifier (Just (_,"\"")) _ <- a
> , checkFirstBChar (=='"') = False
prefixed variable followed by an identifier char will be absorbed
> | PrefixedVariable {} <- a
> , checkFirstBChar isIdentifierChar = False
a positional arg will absorb a following digit
> | PositionalArg {} <- a
> , checkFirstBChar isDigit = False
a string ending with ' followed by a token starting with ' will be absorbed
> | SqlString _ "'" _ <- a
> , checkFirstBChar (=='\'') = False
a number followed by a . will fail or be absorbed
> | SqlNumber {} <- a
> , checkFirstBChar (=='.') = False
a number followed by an e or E will fail or be absorbed
> | SqlNumber {} <- a
> , checkFirstBChar (\x -> x =='e' || x == 'E') = False
two numbers next to eachother will fail or be absorbed
> | SqlNumber {} <- a
> , SqlNumber {} <- b = False
> | otherwise = True
> where
> prettya = prettyToken d a
> prettyb = prettyToken d b
>
>
>
> checkBorderChars f
> | (_:_) <- prettya
> , (fb:_) <- prettyb
> , la <- last prettya
> = f la fb
> checkBorderChars _ = False
> checkFirstBChar f = case prettyb of
> (b':_) -> f b'
> _ -> False
> checkLastAChar f = case prettya of
> (_:_) -> f $ last prettya
> _ -> False
TODO:
make the tokenswill print more dialect accurate. Maybe add symbol
chars and identifier chars to the dialect definition and use them from
here
start adding negative / different parse dialect tests
add token tables and tests for oracle, sql server
review existing tables
look for refactoring opportunities, especially the token
generation tables in the tests
do some user documentation on lexing, and lexing/dialects
start thinking about a more separated design for the dialect handling
lexing tests are starting to take a really long time, so split the
tests so it is much easier to run all the tests except the lexing
tests which only need to be run when working on the lexer (which
should be relatively uncommon), or doing a commit or finishing off a
series of commits,
start writing the error message tests:
generate/write a large number of syntax errors
create a table with the source and the error message
try to compare some different versions of code to compare the
quality of the error messages by hand
get this checked in so improvements and regressions in the error
message quality can be tracked a little more easily (although it will
still be manual)
try again to add annotation to the ast