Safe Haskell | None |
---|---|
Language | Haskell2010 |
This module implements a Parser
supporting custom error types. If you need efficient indentation
parsing, use FlatParse.Stateful instead.
Many internals are exposed for hacking on and extending. These are generally
denoted by a #
hash suffix.
Synopsis
- newtype Parser e a = Parser {
- runParser# :: ForeignPtrContents -> Addr# -> Addr# -> Res# e a
- type Res# e a = (# (# a, Addr# #) | (# #) | (# e #) #)
- pattern OK# :: a -> Addr# -> Res# e a
- pattern Fail# :: Res# e a
- pattern Err# :: e -> Res# e a
- data Result e a
- = OK a !ByteString
- | Fail
- | Err !e
- runParser :: Parser e a -> ByteString -> Result e a
- runParserS :: Parser e a -> String -> Result e a
- empty :: Parser e a
- err :: e -> Parser e a
- lookahead :: Parser e a -> Parser e a
- fails :: Parser e a -> Parser e ()
- try :: Parser e a -> Parser e a
- optional :: Parser e a -> Parser e (Maybe a)
- optional_ :: Parser e a -> Parser e ()
- withOption :: Parser e a -> (a -> Parser e b) -> Parser e b -> Parser e b
- cut :: Parser e a -> e -> Parser e a
- cutting :: Parser e a -> e -> (e -> e -> e) -> Parser e a
- eof :: Parser e ()
- takeBs :: Int -> Parser e ByteString
- takeRestBs :: Parser e ByteString
- skip :: Int -> Parser e ()
- char :: Char -> Q Exp
- byte :: Word8 -> Parser e ()
- bytes :: [Word] -> Q Exp
- string :: String -> Q Exp
- switch :: Q Exp -> Q Exp
- switchWithPost :: Maybe (Q Exp) -> Q Exp -> Q Exp
- rawSwitchWithPost :: Maybe (Q Exp) -> [(String, Q Exp)] -> Maybe (Q Exp) -> Q Exp
- satisfy :: (Char -> Bool) -> Parser e Char
- satisfy_ :: (Char -> Bool) -> Parser e ()
- satisfyASCII :: (Char -> Bool) -> Parser e Char
- satisfyASCII_ :: (Char -> Bool) -> Parser e ()
- fusedSatisfy :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> Parser e Char
- fusedSatisfy_ :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> Parser e ()
- anyWord8 :: Parser e Word8
- anyWord8_ :: Parser e ()
- anyWord16 :: Parser e Word16
- anyWord16_ :: Parser e ()
- anyWord32 :: Parser e Word32
- anyWord32_ :: Parser e ()
- anyWord64 :: Parser e Word64
- anyWord64_ :: Parser e ()
- anyWord :: Parser e Word
- anyWord_ :: Parser e ()
- anyInt8 :: Parser e Int8
- anyInt16 :: Parser e Int16
- anyInt32 :: Parser e Int32
- anyInt64 :: Parser e Int64
- anyInt :: Parser e Int
- anyChar :: Parser e Char
- anyChar_ :: Parser e ()
- anyCharASCII :: Parser e Char
- anyCharASCII_ :: Parser e ()
- isDigit :: Char -> Bool
- isGreekLetter :: Char -> Bool
- isLatinLetter :: Char -> Bool
- readInt :: Parser e Int
- readInteger :: Parser e Integer
- anyCString :: Parser e ByteString
- anyWord16le :: Parser e Word16
- anyWord16be :: Parser e Word16
- anyWord32le :: Parser e Word32
- anyWord32be :: Parser e Word32
- anyWord64le :: Parser e Word64
- anyWord64be :: Parser e Word64
- anyInt16le :: Parser e Int16
- anyInt16be :: Parser e Int16
- anyInt32le :: Parser e Int32
- anyInt32be :: Parser e Int32
- anyInt64le :: Parser e Int64
- anyInt64be :: Parser e Int64
- (<|>) :: Parser e a -> Parser e a -> Parser e a
- branch :: Parser e a -> Parser e b -> Parser e b -> Parser e b
- chainl :: (b -> a -> b) -> Parser e b -> Parser e a -> Parser e b
- chainr :: (a -> b -> b) -> Parser e a -> Parser e b -> Parser e b
- many :: Parser e a -> Parser e [a]
- many_ :: Parser e a -> Parser e ()
- some :: Parser e a -> Parser e [a]
- some_ :: Parser e a -> Parser e ()
- notFollowedBy :: Parser e a -> Parser e b -> Parser e a
- isolate :: Int -> Parser e a -> Parser e a
- newtype Pos = Pos Int
- data Span = Span !Pos !Pos
- getPos :: Parser e Pos
- setPos :: Pos -> Parser e ()
- endPos :: Pos
- spanOf :: Parser e a -> Parser e Span
- withSpan :: Parser e a -> (a -> Span -> Parser e b) -> Parser e b
- byteStringOf :: Parser e a -> Parser e ByteString
- withByteString :: Parser e a -> (a -> ByteString -> Parser e b) -> Parser e b
- inSpan :: Span -> Parser e a -> Parser e a
- validPos :: ByteString -> Pos -> Bool
- posLineCols :: ByteString -> [Pos] -> [(Int, Int)]
- unsafeSpanToByteString :: Span -> Parser e ByteString
- unsafeSlice :: ByteString -> Span -> ByteString
- mkPos :: ByteString -> (Int, Int) -> Pos
- lines :: ByteString -> [String]
- takeLine :: Parser e String
- traceLine :: Parser e String
- takeRest :: Parser e String
- traceRest :: Parser e String
- packUTF8 :: String -> ByteString
- unpackUTF8 :: ByteString -> String
- ensureBytes# :: Int -> Parser e ()
- takeBs# :: Int# -> Parser e ByteString
- atSkip# :: Int# -> Parser e a -> Parser e a
- setBack# :: Int -> Parser e ()
- withAddr# :: (Addr# -> Parser e a) -> Parser e a
- takeBsOffAddr# :: Addr# -> Int# -> Int# -> Parser e ByteString
- lookaheadFromAddr# :: Addr# -> Parser e a -> Parser e a
- atAddr# :: Addr# -> Parser e a -> Parser e a
- withAnyWord8# :: (Word8'# -> Parser e a) -> Parser e a
- withAnyWord16# :: (Word16'# -> Parser e a) -> Parser e a
- withAnyWord32# :: (Word32'# -> Parser e a) -> Parser e a
- withAnyWord64# :: (Word# -> Parser e a) -> Parser e a
- withAnyInt8# :: (Int8'# -> Parser e a) -> Parser e a
- withAnyInt16# :: (Int16'# -> Parser e a) -> Parser e a
- withAnyInt32# :: (Int32'# -> Parser e a) -> Parser e a
- withAnyInt64# :: (Int# -> Parser e a) -> Parser e a
- anyCStringUnsafe :: Parser e ByteString
- scan8# :: Word8 -> Parser e ()
- scan16# :: Word16 -> Parser e ()
- scan32# :: Word32 -> Parser e ()
- scan64# :: Word -> Parser e ()
- scanAny8# :: Parser e Word8
- scanBytes# :: [Word] -> Q Exp
Parser types and constructors
Parser e a
has an error type e
and a return type a
.
Parser | |
|
pattern OK# :: a -> Addr# -> Res# e a Source #
Contains return value and a pointer to the rest of the input buffer.
Higher-level boxed data type for parsing results.
OK a !ByteString | Contains return value and unconsumed input. |
Fail | Recoverable-by-default failure. |
Err !e | Unrecoverable-by-default error. |
Running parsers
runParserS :: Parser e a -> String -> Result e a Source #
Run a parser on a String
input. Reminder: OverloadedStrings
for ByteString
does not
yield a valid UTF-8 encoding! For non-ASCII ByteString
literal input, use runParserS
or
packUTF8
for testing.
Errors and failures
The failing parser. By default, parser choice (<|>)
arbitrarily backtracks
on parser failure.
lookahead :: Parser e a -> Parser e a Source #
Save the parsing state, then run a parser, then restore the state.
optional :: Parser e a -> Parser e (Maybe a) Source #
Convert a parsing failure to a Maybe
. If possible, use withOption
instead.
cutting :: Parser e a -> e -> (e -> e -> e) -> Parser e a Source #
Run the parser. If we get a failure, throw the given error; if we get an error, merge the
inner and the newly given errors using the e -> e -> e
function. This can be useful for
implementing parsing errors which may propagate hints or accumulate contextual information.
Basic lexing and parsing
takeBs :: Int -> Parser e ByteString Source #
Read the given number of bytes as a ByteString
.
Throws a runtime error if given a negative integer.
takeRestBs :: Parser e ByteString Source #
Consume the rest of the input. May return the empty bytestring.
skip :: Int -> Parser e () Source #
Skip forward n
bytes. Fails if fewer than n
bytes are available.
Throws a runtime error if given a negative integer.
char :: Char -> Q Exp Source #
Parse a UTF-8 character literal. This is a template function, you can use it as
$(char 'x')
, for example, and the splice in this case has type Parser e ()
.
bytes :: [Word] -> Q Exp Source #
Read a sequence of bytes. This is a template function, you can use it as $(bytes [3, 4, 5])
,
for example, and the splice has type Parser e ()
.
string :: String -> Q Exp Source #
Parse a UTF-8 string literal. This is a template function, you can use it as $(string "foo")
,
for example, and the splice has type Parser e ()
.
switch :: Q Exp -> Q Exp Source #
This is a template function which makes it possible to branch on a collection of string literals in
an efficient way. By using switch
, such branching is compiled to a trie of primitive parsing
operations, which has optimized control flow, vectorized reads and grouped checking for needed input
bytes.
The syntax is slightly magical, it overloads the usual case
expression. An example:
$(switch [| case _ of
    "foo" -> pure True
    "bar" -> pure False |])
The underscore is mandatory in case _ of
. Each branch must be a string literal, but optionally
we may have a default case, like in
$(switch [| case _ of
    "foo" -> pure 10
    "bar" -> pure 20
    _     -> pure 30 |])
All case right hand sides must be parsers with the same type. That type is also the type
of the whole switch
expression.
A switch
has longest match semantics, and the order of cases does not matter, except for
the default case, which may only appear as the last case.
If a switch
does not have a default case, and no case matches the input, then it returns with
failure, without having consumed any input. A fallthrough to the default case also does not
consume any input.
switchWithPost :: Maybe (Q Exp) -> Q Exp -> Q Exp Source #
Switch expression with an optional first argument for performing a post-processing action after
every successful branch matching, not including the default branch. For example, if we have
ws :: Parser e ()
for a whitespace parser, we might want to consume whitespace after matching
on any of the switch cases. For that case, we can define a "lexeme" version of switch
as
follows.
switch' :: Q Exp -> Q Exp
switch' = switchWithPost (Just [| ws |])
Note that this switch'
function cannot be used in the same module it's defined in, because of the
stage restriction of Template Haskell.
rawSwitchWithPost :: Maybe (Q Exp) -> [(String, Q Exp)] -> Maybe (Q Exp) -> Q Exp Source #
Version of switchWithPost
without syntactic sugar. The second argument is the
list of cases, the third is the default case.
fusedSatisfy :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> Parser e Char Source #
This is a variant of satisfy
which allows more optimization. We can pick four testing
functions for the four cases for the possible number of bytes in the UTF-8 character. So in
fusedSatisfy f1 f2 f3 f4
, if we read a one-byte character, the result is scrutinized with
f1
, for two-bytes, with f2
, and so on. This can result in dramatic lexing speedups.
For example, if we want to accept any letter, the naive solution would be to use
isLetter
, but this accesses a large lookup table of Unicode character classes. We
can do better with fusedSatisfy isLatinLetter isLetter isLetter isLetter
, since here the
isLatinLetter
is inlined into the UTF-8 decoding, and it probably handles a great majority of
all cases without accessing the character table.
fusedSatisfy_ :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> Parser e () Source #
Skipping variant of fusedSatisfy
.
anyWord16_ :: Parser e () Source #
Skip any Word16
.
anyWord32_ :: Parser e () Source #
Skip any Word32
.
anyWord64_ :: Parser e () Source #
Skip any Word64
.
anyCharASCII :: Parser e Char Source #
anyCharASCII_ :: Parser e () Source #
isGreekLetter :: Char -> Bool Source #
isGreekLetter c = ('Α' <= c && c <= 'Ω') || ('α' <= c && c <= 'ω')
isLatinLetter :: Char -> Bool Source #
isLatinLetter c = ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z')
readInteger :: Parser e Integer Source #
Read a non-negative Integer
from the input, as a non-empty digit
sequence.
anyCString :: Parser e ByteString Source #
Read a null-terminated bytestring (a C-style string).
Consumes the null terminator.
Explicit-endianness machine integers
Combinators
(<|>) :: Parser e a -> Parser e a -> Parser e a infixr 6 Source #
Choose between two parsers. If the first parser fails, try the second one, but if the first one throws an error, propagate the error.
branch :: Parser e a -> Parser e b -> Parser e b -> Parser e b Source #
Branch on a parser: if the first argument succeeds, continue with the second, else with the third.
This can produce slightly more efficient code than (<|>)
. Moreover, branch
does not
backtrack from the true/false cases.
many :: Parser e a -> Parser e [a] Source #
Run a parser zero or more times, collect the results in a list. Note: for optimal performance, try to avoid this. Often it is possible to get rid of the intermediate list by using a combinator or a custom parser.
some :: Parser e a -> Parser e [a] Source #
Run a parser one or more times, collect the results in a list. Note: for optimal performance, try to avoid this. Often it is possible to get rid of the intermediate list by using a combinator or a custom parser.
notFollowedBy :: Parser e a -> Parser e b -> Parser e a Source #
Succeed if the first parser succeeds and the second one fails.
isolate :: Int -> Parser e a -> Parser e a Source #
isolate n p
runs the parser p
isolated to the next n
bytes. All
isolated bytes must be consumed.
Throws a runtime error if given a negative integer.
Positions and spans
Byte offset counted backwards from the end of the buffer.
A pair of positions.
withSpan :: Parser e a -> (a -> Span -> Parser e b) -> Parser e b Source #
Bind the result together with the span of the result. CPS'd version of spanOf
for better unboxing.
byteStringOf :: Parser e a -> Parser e ByteString Source #
Return the ByteString
consumed by a parser. Note: it's more efficient to use spanOf
and
withSpan
instead.
withByteString :: Parser e a -> (a -> ByteString -> Parser e b) -> Parser e b Source #
CPS'd version of byteStringOf
. Can be more efficient, because the result is more eagerly unboxed
by GHC. It's more efficient to use spanOf
or withSpan
instead.
inSpan :: Span -> Parser e a -> Parser e a Source #
Run a parser in a given input span. The input position
is restored after
the parser is finished, so inSpan
does not consume input and has no side effect. Warning:
this operation may crash if the given span points outside the current parsing buffer. It's
always safe to use inSpan
if the span comes from a previous withSpan
or spanOf
call on
the current input.
Position and span conversions
validPos :: ByteString -> Pos -> Bool Source #
Check whether a Pos
points into a ByteString
.
posLineCols :: ByteString -> [Pos] -> [(Int, Int)] Source #
Compute corresponding line and column numbers for each Pos
in a list. Throw an error
on invalid positions. Note: computing lines and columns may traverse the ByteString
,
but it traverses it only once regardless of the length of the position list.
unsafeSpanToByteString :: Span -> Parser e ByteString Source #
Create a ByteString
from a Span
. The result is invalid if the Span
points
outside the current buffer, or if the Span
start is greater than the end position.
unsafeSlice :: ByteString -> Span -> ByteString Source #
Slice into a ByteString
using a Span
. The result is invalid if the Span
is not a valid slice of the first argument.
mkPos :: ByteString -> (Int, Int) -> Pos Source #
Create a Pos
from a line and column number. Throws an error on out-of-bounds
line and column numbers.
lines :: ByteString -> [String] Source #
Break a UTF-8-encoded ByteString
into lines. Throws an error on invalid input.
This is mostly useful for grabbing specific source lines for displaying error
messages.
Getting the rest of the input as a String
takeLine :: Parser e String Source #
Parse the rest of the current line as a String
. Assumes UTF-8 encoding,
throws an error if the encoding is invalid.
traceLine :: Parser e String Source #
Parse the rest of the current line as a String
, but restore the parsing state.
Assumes UTF-8 encoding. This can be used for debugging.
traceRest :: Parser e String Source #
Get the rest of the input as a String
, but restore the parsing state. Assumes UTF-8 encoding.
This can be used for debugging.
String
conversions
packUTF8 :: String -> ByteString Source #
Convert a String
to a UTF-8-encoded ByteString
.
unpackUTF8 :: ByteString -> String Source #
Convert a UTF-8-encoded ByteString
to a String
.
Internal functions
ensureBytes# :: Int -> Parser e () Source #
Check that the input has at least the given number of bytes.
Unboxed arguments
takeBs# :: Int# -> Parser e ByteString Source #
Read the given number of bytes as a ByteString
.
Throws a runtime error if given a negative integer.
atSkip# :: Int# -> Parser e a -> Parser e a Source #
Skip forward n
bytes and run the given parser. Fails if fewer than n
bytes are available.
Throws a runtime error if given a negative integer.
Location & address primitives
setBack# :: Int -> Parser e () Source #
Decrease the current input position by the given number of bytes.
withAddr# :: (Addr# -> Parser e a) -> Parser e a Source #
Run a parser, passing it the current address the parser is at.
Useful for parsing offset-based data tables. For example, you may use this to save the base address to use together with various 0-indexed offsets.
takeBsOffAddr# :: Addr# -> Int# -> Int# -> Parser e ByteString Source #
takeBsOffAddr# addr# offset# len#
moves to addr#
, skips offset#
bytes, reads len#
bytes into a ByteString
, and restores the original
address.
The Addr#
should be from withAddr#
.
Useful for parsing offset-based data tables. For example, you may use this
together with withAddr#
to jump to an offset in your input and read some
data.
atAddr# :: Addr# -> Parser e a -> Parser e a Source #
Run a parser at the given address.
The Addr#
should be from withAddr#
.
This is a highly internal function -- you likely want lookaheadFromAddr#
,
which will reset the address after running the parser.
Machine integer continuation parsers
Unsafe
anyCStringUnsafe :: Parser e ByteString Source #
Read a null-terminated bytestring (a C-style string), where the bytestring is known to be null-terminated somewhere in the input.
Highly unsafe. Unless you have a guarantee that the string will be null
terminated before the input ends, use anyCString
instead. Honestly, I'm not
sure if this is a good function to define. But here it is.
Fails on GHC versions older than 9.0, since we make use of the
cstringLength#
primop introduced in GHC 9.0, and we aren't very useful
without it.
Consumes the null terminator.
scan8# :: Word8 -> Parser e () Source #
Unsafely read a concrete byte from the input. It's not checked that the input has enough bytes.
scan16# :: Word16 -> Parser e () Source #
Unsafely read two concrete bytes from the input. It's not checked that the input has enough bytes.
scan32# :: Word32 -> Parser e () Source #
Unsafely read four concrete bytes from the input. It's not checked that the input has enough bytes.
scan64# :: Word -> Parser e () Source #
Unsafely read eight concrete bytes from the input. It's not checked that the input has enough bytes.