{-# LANGUAGE Safe #-} {-# LANGUAGE ViewPatterns #-} {-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-} {-| Module : Text.Gigaparsec.Errors.TokenExtractors Description : This module contains implementations of token extractors that can be used in the @ErrorBuilder@ to decide how to extract unexpected tokens from the residual input left over from a parse error. License : BSD-3-Clause Maintainer : Jamie Willis, Gigaparsec Maintainers Stability : stable This module contains implementations of token extractors that can be used in the "Text.Gigaparsec.Errors.ErrorBuilder" to decide how to extract unexpected tokens from the residual input left over from a parse error. These are common strategies, and something here is likely to be what is needed. They are all careful to handle unprintable characters and whitespace in a sensible way, and account for unicode codepoints that are wider than a single 16-bit character. @since 0.2.5.0 -} module Text.Gigaparsec.Errors.TokenExtractors ( Token(..), TokenExtractor, tillNextWhitespace, singleChar, matchParserDemand, lexToken, lexTokenWithSelect ) where import Text.Gigaparsec ( Parsec, Result(Success), parse, atomic, lookAhead, notFollowedBy, some, (<+>), (<~>), mapMaybeS ) import Text.Gigaparsec.Char (item) import Text.Gigaparsec.Combinator (option) import Text.Gigaparsec.Position (offset) import Data.Char (generalCategory, ord, GeneralCategory(Format, Surrogate, PrivateUse, NotAssigned, Control)) import Data.Char qualified as Char (isSpace) import Data.List.NonEmpty (NonEmpty((:|)), nonEmpty) import Data.List.NonEmpty qualified as NonEmpty (toList) import Data.Void (Void) import Data.Foldable (maximumBy) import Numeric (showHex) import Data.Function (on) import Data.Maybe (catMaybes) {-| Type alias for token extractors, matches the shape of 'Text.Gigaparsec.Errors.ErrorBuilder.unexpectedToken'. @since 0.2.5.0 -} type TokenExtractor :: * type TokenExtractor = NonEmpty Char -- ^ the remaining input, @cs@, at point of failure. -> Word -- ^ the input the parser tried to read when it failed -- (this is __not__ guaranteed to be smaller than the length of -- @cs@, but is __guaranteed to be greater than 0__). -> Bool -- ^ was this error generated as part of \"lexing\", or in a wider parser (see 'Text.Gigaparsec.Errors.Combinator.markAsToken'). -> Token -- ^ a token extracted from @cs@ that will be used as part of the unexpected message. {-| This type represents an extracted token returned by 'Text.Gigaparsec.Errors.ErrorBuilder.unexpectedToken' in 'Text.Gigaparsec.Errors.ErrorBuilder.ErrorBuilder'. There is deliberately no analogue for @EndOfInput@ because we guarantee that non-empty residual input is provided to token extraction. @since 0.2.5.0 -} type Token :: * data Token = Raw -- ^ This is a token that is directly extracted from the residual input itself. !String -- ^ the input extracted. | Named -- ^ This is a token that has been given a name, and is treated like a labelled item. !String -- ^ the description of the token. {-# UNPACK #-} !Word -- ^ the amount of residual input this token ate. {-# INLINABLE tillNextWhitespace #-} {-| This extractor provides an implementation for 'Text.Gigaparsec.ErrorBuilder.unexpectedToken': it will construct a token that extends to the next available whitespace in the remaining input. It can be configured to constrict this token to the minimum of the next whitespace or whatever the parser demanded. In the case of unprintable characters or whitespace, this extractor will favour reporting a more meaningful name. @since 0.2.5.0 -} tillNextWhitespace :: Bool -- ^ should the extractor cap the token to the amount of input the parser demanded? -> (Char -> Bool) -- ^ what counts as a space character -> TokenExtractor tillNextWhitespace _ _ (whitespaceOrUnprintable -> Just tok) _ _ = tok tillNextWhitespace trimToDemand isSpace (c :| cs) parserDemanded _ | trimToDemand = Raw (take (fromIntegral parserDemanded) (tillSpace (c:cs))) | otherwise = Raw (tillSpace (c:cs)) where tillSpace = takeWhile (not . isSpace) {-| This extractor provides an implementation for 'Text.Gigaparsec.ErrorBuilder.unexpectedToken': it will unconditionally report the first character in the remaining input as the problematic token. In the case of unprintable characters or whitespace, this extractor will favour reporting a more meaningful name. @since 0.2.5.0 -} {-# INLINABLE singleChar #-} singleChar :: TokenExtractor singleChar (whitespaceOrUnprintable -> Just tok) _ _ = tok singleChar (c :| _) _ _ = Raw [c] {-| This extractor provides an implementation for 'Text.Gigaparsec.ErrorBuilder.unexpectedToken': it will make a token as wide as the amount of input the parser tried to consume when it failed. In the case of unprintable characters or whitespace, this extractor will favour reporting a more meaningful name. @since 0.2.5.0 -} {-# INLINABLE matchParserDemand #-} matchParserDemand :: TokenExtractor matchParserDemand (whitespaceOrUnprintable -> Just tok) _ _ = tok matchParserDemand (c :| cs) parserDemanded _ = Raw (take (fromIntegral parserDemanded) (c:cs)) {-# INLINABLE whitespaceOrUnprintable #-} whitespaceOrUnprintable :: NonEmpty Char -> Maybe Token whitespaceOrUnprintable ('\n' :| _) = Just $ Named "newline" 1 whitespaceOrUnprintable ('\r' :| _) = Just $ Named "carriage return" 1 whitespaceOrUnprintable ('\t' :| _) = Just $ Named "tab" 1 whitespaceOrUnprintable (' ' :| _) = Just $ Named "space" 1 whitespaceOrUnprintable (c :| _) | Char.isSpace c = Just $ Named "whitespace character" 1 | otherwise = case generalCategory c of Format -> unprintable Surrogate -> unprintable PrivateUse -> unprintable NotAssigned -> unprintable Control -> unprintable _ -> Nothing where unprintable = Just $ Named ("non-printable character (\\x" ++ showHex (ord c) ")") 1 {-| This extractor provides an implementation for 'Text.Gigaparsec.ErrorBuilder.unexpectedToken': it will try and parse the residual input to identify a valid lexical token to report. When parsing a grammar that as a dedicated lexical distinction, it is nice to be able to report problematic tokens relevant to that grammar as opposed to generic input lifted straight from the input stream. The easiest way of doing this would be having a pre-lexing pass and parsing based on tokens, but this is deliberately not how Parsley is designed. Instead, this extractor can try and parse the remaining input to try and identify a token on demand. If the @lexicalError@ flag of the @unexpectedToken@ function is not set, which would indicate a problem within a token reported by a classical lexer and not the parser, the extractor will try to parse each of the provided @tokens@ in turn: whichever is the longest matched of these tokens will be reported as the problematic one, where an earlier token arbitrates ties ('lexTokenWithSelect' can alter which is chosen). For best effect, these tokens should not consume whitespace (which would otherwise be included at the end of the token!): this means that, if using the @Lexer@, the functionality in __@nonlexeme@__ should be used. If one of the givens tokens cannot be parsed, the input until the /next/ valid parsable token (or end of input) is returned as a @Raw@. If @lexicalError@ is true, then the given token extractor will be used instead to extract a default token. @since 0.2.5.0 -} {-# INLINABLE lexToken #-} lexToken :: [Parsec String] -- ^ The tokens that should be recognised by this extractor: each parser should return the -- intended name of the token exactly as it should appear in the "Named" token. -- -- This /should/ include a whitespace parser for "unexpected whitespace". However, with the -- exception of the whitespace parser, these tokens should not consume trailing (and -- certainly not leading) whitespace: if using definitions from "Text.Gigaparsec.Token.Lexer" -- functionality, the @nonlexeme@ versions of the tokens should be used. -> TokenExtractor -- ^ If the parser failed during the parsing of a token, this -- function extracts the problematic item from the remaining input. -> TokenExtractor lexToken = lexTokenWithSelect (maximumBy (compare `on` snd)) {-| This extractor provides an implementation for 'Text.Gigaparsec.ErrorBuilder.unexpectedToken': it will try and parse the residual input to identify a valid lexical token to report. When parsing a grammar that as a dedicated lexical distinction, it is nice to be able to report problematic tokens relevant to that grammar as opposed to generic input lifted straight from the input stream. The easiest way of doing this would be having a pre-lexing pass and parsing based on tokens, but this is deliberately not how Parsley is designed. Instead, this extractor can try and parse the remaining input to try and identify a token on demand. If the @lexicalError@ flag of the @unexpectedToken@ function is not set, which would indicate a problem within a token reported by a classical lexer and not the parser, the extractor will try to parse each of the provided @tokens@ in turn: the given function is used to select which is returned. For best effect, these tokens should not consume whitespace (which would otherwise be included at the end of the token!): this means that, if using the @Lexer@, the functionality in __@nonlexeme@__ should be used. If one of the givens tokens cannot be parsed, the input until the /next/ valid parsable token (or end of input) is returned as a @Raw@. If @lexicalError@ is true, then the given token extractor will be used instead to extract a default token. @since 0.2.5.0 -} {-# INLINABLE lexTokenWithSelect #-} lexTokenWithSelect :: (NonEmpty (String, Word) -> (String, Word)) -- ^ If the extractor is successful in identifying tokens that can be parsed from -- the residual input, this function will select /one/ of them to report back. -> [Parsec String] -- ^ The tokens that should be recognised by this extractor: each parser should return the -- intended name of the token exactly as it should appear in the "Named" token. -- -- This /should/ include a whitespace parser for "unexpected whitespace". However, with the -- exception of the whitespace parser, these tokens should not consume trailing (and -- certainly not leading) whitespace: if using definitions from "Text.Gigaparsec.Token.Lexer" -- functionality, the @nonlexeme@ versions of the tokens should be used. -> TokenExtractor -- ^ If the parser failed during the parsing of a token, this -- function extracts the problematic item from the remaining input. -> TokenExtractor lexTokenWithSelect selectToken tokens extractItem cs parserDemanded lexical | lexical = extractItem cs parserDemanded True | otherwise = extractToken cs where extractToken :: NonEmpty Char -> Token extractToken inp = let Success rawOrToks = parse @Void parser (NonEmpty.toList inp) in either (uncurry Named . selectToken) Raw rawOrToks parser :: Parsec (Either (NonEmpty (String, Word)) String) parser = let toks = mapMaybeS (nonEmpty . catMaybes) (traverse (\t -> option (lookAhead (atomic t <~> offset))) tokens) rawTok = some (traverse notFollowedBy tokens *> item) in toks <+> rawTok