{ {-| Module : Toml.Lexer Description : TOML lexical analyzer Copyright : (c) Eric Mertens, 2023 License : ISC Maintainer : emertens@gmail.com This module parses a TOML file into a lazy sequence of tokens. The lexer is aware of nested brackets and equals signs in order to handle TOML's context-sensitive lexing requirements. This context enables the lexer to distinguish between bare keys and various values like: floating-point literals, integer literals, and date literals. This module uses actions and lexical hooks defined in "LexerUtils". -} module Toml.Lexer (Context(..), scanToken, lexValue, Token(..)) where import Toml.Lexer.Token import Toml.Lexer.Utils import Toml.Located import Toml.Position } $non_ascii = \x1 $wschar = [\ \t] @ws = $wschar* @newline = \r? \n $bindig = [0-1] $octdig = [0-7] $digit = [0-9] $hexdig = [ $digit A-F a-f ] $basic_unescaped = [ $wschar \x21 \x23-\x5B \x5D-\x7E $non_ascii ] $comment_start_symbol = \# @barekey = [0-9 A-Z a-z \- _]+ @unsigned_dec_int = $digit | [1-9] ($digit | _ $digit)+ @dec_int = [\-\+]? @unsigned_dec_int @zero_prefixable_int = $digit ($digit | _ $digit)* @hex_int = "0x" $hexdig ($hexdig | _ $hexdig)* @oct_int = "0o" $octdig ($octdig | _ $octdig)* @bin_int = "0b" $bindig ($bindig | _ $bindig)* @frac = "." @zero_prefixable_int @float_exp_part = [\+\-]? @zero_prefixable_int @special_float = [\+\-]? ("inf" | "nan") @exp = [Ee] @float_exp_part @float_int_part = @dec_int @float = @float_int_part ( @exp | @frac @exp? ) | @special_float @bad_dec_int = [\-\+]? 0 ($digit | _ $digit)+ $non_eol = [\x09 \x20-\x7E $non_ascii] @comment = $comment_start_symbol $non_eol* $literal_char = [\x09 \x20-\x26 \x28-\x7E $non_ascii] $mll_char = [\x09 \x20-\x26 \x28-\x7E] @mll_content = $mll_char | @newline @mlb_escaped_nl = \\ @ws @newline ($wschar | @newline)* $unescaped = [$wschar \x21 \x23-\x5B \x5D-\x7E $non_ascii] @date_fullyear = $digit {4} @date_month = $digit {2} @date_mday = $digit {2} $time_delim = [Tt\ ] @time_hour = $digit {2} @time_minute = $digit {2} @time_second = $digit {2} @time_secfrac = "." $digit+ @time_numoffset = [\+\-] @time_hour ":" @time_minute @time_offset = [Zz] | @time_numoffset @partial_time = @time_hour ":" @time_minute ":" @time_second @time_secfrac? @full_date = @date_fullyear "-" @date_month "-" @date_mday @full_time = @partial_time @time_offset @offset_date_time = @full_date $time_delim @full_time @local_date_time = @full_date $time_delim @partial_time @local_date = @full_date @local_time = @partial_time toml :- { @bad_dec_int { failure "leading zero prohibited" } @dec_int { token mkDecInteger } @hex_int { token mkHexInteger } @oct_int { token mkOctInteger } @bin_int { token mkBinInteger } @float { token mkFloat } "true" { token_ TokTrue } "false" { token_ TokFalse } @offset_date_time { timeValue "offset date-time" offsetDateTimePatterns TokOffsetDateTime } @local_date { timeValue "local date" localDatePatterns TokLocalDate } @local_date_time { timeValue "local date-time" localDateTimePatterns TokLocalDateTime } @local_time { timeValue "local time" localTimePatterns TokLocalTime } } <0> { "[[" { token_ Tok2SquareO } "]]" { token_ Tok2SquareC } } <0,val,tab> { @newline { token_ TokNewline } @comment; $wschar+; "=" { token_ TokEquals } "." { token_ TokPeriod } "," { token_ TokComma } "[" { token_ TokSquareO } "]" { token_ TokSquareC } "{" { token_ TokCurlyO } "}" { token_ TokCurlyC } @barekey { token TokBareKey } \"{3} @newline? { startMlBstr } \" { startBstr } "'''" @newline? { startMlLstr } "'" { startLstr } } { $literal_char+ { strFrag } "'" { endStr . fmap (drop 1) } } { $unescaped+ { strFrag } \" { endStr . fmap (drop 1) } } { @mll_content+ { strFrag } "'" {1,2} { strFrag } "'" {3,5} { endStr . fmap (drop 3) } } { @mlb_escaped_nl; ($unescaped | @newline)+ { strFrag } \" {1,2} { strFrag } \" {3,5} { endStr . fmap (drop 3) } } { \\ U $hexdig{8} { unicodeEscape } \\ U { failure "\\U requires exactly 8 hex digits"} \\ u $hexdig{4} { unicodeEscape } \\ u { failure "\\u requires exactly 4 hex digits"} \\ n { strFrag . ("\n" <$) } \\ t { strFrag . ("\t" <$) } \\ r { strFrag . ("\r" <$) } \\ f { strFrag . ("\f" <$) } \\ b { strFrag . ("\b" <$) } \\ \\ { strFrag . ("\\" <$) } \\ \" { strFrag . ("\"" <$) } } { type AlexInput = Located String alexGetByte :: AlexInput -> Maybe (Int, AlexInput) alexGetByte = locatedUncons -- | Get the next token from a located string. This function can be total -- because one of the possible token outputs is an error token. scanToken :: Context -> Located String -> Either (Located String) (Located Token, Located String) scanToken st str = case alexScan str (stateInt st) of AlexEOF -> eofToken st str AlexError str' -> Left (mkError <$> str') AlexSkip str' _ -> scanToken st str' AlexToken str' n action -> case action (take n <$> str) st of Resume st' -> scanToken st' str' LexerError e -> Left e EmitToken t -> Right (t, str') stateInt :: Context -> Int stateInt TopContext = 0 stateInt TableContext = tab stateInt ValueContext = val stateInt BstrContext {} = bstr stateInt MlBstrContext{} = mlbstr stateInt LstrContext {} = lstr stateInt MlLstrContext{} = mllstr -- | Lex a single token in a value context. This is mostly useful for testing. lexValue :: String -> Either String Token lexValue str = case scanToken ValueContext Located{ locPosition = startPos, locThing = str } of Left e -> Left (locThing e) Right (t,_) -> Right (locThing t) }