{-# LANGUAGE BangPatterns, CPP, FlexibleInstances, TypeSynonymInstances #-} {-# OPTIONS_GHC -fno-warn-warnings-deprecations #-} -- | -- Module : Data.Attoparsec.Text -- Copyright : Bryan O'Sullivan 2007-2014 -- License : BSD3 -- -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : unknown -- -- Simple, efficient combinator parsing for 'Text' strings, -- loosely based on the Parsec library. module Data.Attoparsec.Text ( -- * Differences from Parsec -- $parsec -- * Incremental input -- $incremental -- * Performance considerations -- $performance -- * Parser types Parser , Result , T.IResult(..) , I.compareResults -- * Running parsers , parse , feed , I.parseOnly , parseWith , parseTest -- ** Result conversion , maybeResult , eitherResult -- * Parsing individual characters , I.char , I.anyChar , I.notChar , I.satisfy , I.satisfyWith , I.skip -- ** Lookahead , I.peekChar , I.peekChar' -- ** Special character parsers , digit , letter , space -- ** Character classes , I.inClass , I.notInClass -- * Efficient string handling , I.string , I.stringCI , I.asciiCI , skipSpace , I.skipWhile , I.scan , I.take , I.takeWhile , I.takeWhile1 , I.takeTill -- ** String combinators -- $specalt , (.*>) , (<*.) -- ** Consume all remaining input , I.takeText , I.takeLazyText -- * Text parsing , I.endOfLine , isEndOfLine , isHorizontalSpace -- * Numeric parsers , decimal , hexadecimal , signed , double , Number(..) , number , rational , scientific -- * Combinators , try , (<?>) , choice , count , option , many' , many1 , many1' , manyTill , manyTill' , sepBy , sepBy' , sepBy1 , sepBy1' , skipMany , skipMany1 , eitherP , I.match -- * State observation and manipulation functions , I.endOfInput , I.atEnd ) where #if !MIN_VERSION_base(4,8,0) import Control.Applicative (pure, (*>), (<*)) import Data.Word (Word) #endif import Control.Applicative ((<$>), (<|>)) import Data.Attoparsec.Combinator import Data.Attoparsec.Number (Number(..)) import Data.Scientific (Scientific) import qualified Data.Scientific as Sci import Data.Attoparsec.Text.Internal (Parser, Result, parse, takeWhile1) import Data.Bits (Bits, (.|.), shiftL) import Data.Char (isAlpha, isDigit, isSpace, ord) import Data.Int (Int8, Int16, Int32, Int64) import Data.Text (Text) import Data.Word (Word8, Word16, Word32, Word64) import qualified Data.Attoparsec.Internal as I import qualified Data.Attoparsec.Internal.Types as T import qualified Data.Attoparsec.Text.Internal as I import qualified Data.Text as T -- $parsec -- -- Compared to Parsec 3, attoparsec makes several tradeoffs. It is -- not intended for, or ideal for, all possible uses. -- -- * While attoparsec can consume input incrementally, Parsec cannot. -- Incremental input is a huge deal for efficient and secure network -- and system programming, since it gives much more control to users -- of the library over matters such as resource usage and the I/O -- model to use. -- -- * Much of the performance advantage of attoparsec is gained via -- high-performance parsers such as 'I.takeWhile' and 'I.string'. -- If you use complicated combinators that return lists of -- characters, there is less performance difference between the two -- libraries. -- -- * Unlike Parsec 3, attoparsec does not support being used as a -- monad transformer. -- -- * attoparsec is specialised to deal only with strict 'Text' -- input. Efficiency concerns rule out both lists and lazy text. -- The usual use for lazy text would be to allow consumption of very -- large input without a large footprint. For this need, -- attoparsec's incremental input provides an excellent substitute, -- with much more control over when input takes place. If you must -- use lazy text, see the 'Lazy' module, which feeds lazy chunks to -- a regular parser. -- -- * Parsec parsers can produce more helpful error messages than -- attoparsec parsers. This is a matter of focus: attoparsec avoids -- the extra book-keeping in favour of higher performance. -- $incremental -- -- attoparsec supports incremental input, meaning that you can feed it -- a 'Text' that represents only part of the expected total amount -- of data to parse. If your parser reaches the end of a fragment of -- input and could consume more input, it will suspend parsing and -- return a 'T.Partial' continuation. -- -- Supplying the 'T.Partial' continuation with another string will -- resume parsing at the point where it was suspended, with the string -- you supplied used as new input at the end of the existing -- input. You must be prepared for the result of the resumed parse to -- be another 'Partial' continuation. -- -- To indicate that you have no more input, supply the 'Partial' -- continuation with an 'T.empty' 'Text'. -- -- Remember that some parsing combinators will not return a result -- until they reach the end of input. They may thus cause 'T.Partial' -- results to be returned. -- -- If you do not need support for incremental input, consider using -- the 'I.parseOnly' function to run your parser. It will never -- prompt for more input. -- -- /Note/: incremental input does /not/ imply that attoparsec will -- release portions of its internal state for garbage collection as it -- proceeds. Its internal representation is equivalent to a single -- 'Text': if you feed incremental input to an a parser, it will -- require memory proportional to the amount of input you supply. -- (This is necessary to support arbitrary backtracking.) -- $performance -- -- If you write an attoparsec-based parser carefully, it can be -- realistic to expect it to perform similarly to a hand-rolled C -- parser (measuring megabytes parsed per second). -- -- To actually achieve high performance, there are a few guidelines -- that it is useful to follow. -- -- Use the 'Text'-oriented parsers whenever possible, -- e.g. 'I.takeWhile1' instead of 'many1' 'I.anyChar'. There is -- about a factor of 100 difference in performance between the two -- kinds of parser. -- -- For very simple character-testing predicates, write them by hand -- instead of using 'I.inClass' or 'I.notInClass'. For instance, both -- of these predicates test for an end-of-line character, but the -- first is much faster than the second: -- -- >endOfLine_fast c = c == '\r' || c == '\n' -- >endOfLine_slow = inClass "\r\n" -- -- Make active use of benchmarking and profiling tools to measure, -- find the problems with, and improve the performance of your parser. -- | Run a parser and print its result to standard output. parseTest :: (Show a) => I.Parser a -> Text -> IO () parseTest p s = print (parse p s) -- | Run a parser with an initial input string, and a monadic action -- that can supply more input if needed. parseWith :: Monad m => (m Text) -- ^ An action that will be executed to provide the parser -- with more input, if necessary. The action must return an -- 'T.empty' string when there is no more input available. -> I.Parser a -> Text -- ^ Initial input for the parser. -> m (Result a) parseWith refill p s = step $ parse p s where step (T.Partial k) = (step . k) =<< refill step r = return r {-# INLINE parseWith #-} -- | Convert a 'Result' value to a 'Maybe' value. A 'Partial' result -- is treated as failure. maybeResult :: Result r -> Maybe r maybeResult (T.Done _ r) = Just r maybeResult _ = Nothing -- | Convert a 'Result' value to an 'Either' value. A 'Partial' result -- is treated as failure. eitherResult :: Result r -> Either String r eitherResult (T.Done _ r) = Right r eitherResult (T.Fail _ _ msg) = Left msg eitherResult _ = Left "Result: incomplete input" -- | A predicate that matches either a carriage return @\'\\r\'@ or -- newline @\'\\n\'@ character. isEndOfLine :: Char -> Bool isEndOfLine c = c == '\n' || c == '\r' {-# INLINE isEndOfLine #-} -- | A predicate that matches either a space @\' \'@ or horizontal tab -- @\'\\t\'@ character. isHorizontalSpace :: Char -> Bool isHorizontalSpace c = c == ' ' || c == '\t' {-# INLINE isHorizontalSpace #-} -- | Parse and decode an unsigned hexadecimal number. The hex digits -- @\'a\'@ through @\'f\'@ may be upper or lower case. -- -- This parser does not accept a leading @\"0x\"@ string. hexadecimal :: (Integral a, Bits a) => Parser a hexadecimal = T.foldl' step 0 `fmap` takeWhile1 isHexDigit where isHexDigit c = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') step a c | w >= 48 && w <= 57 = (a `shiftL` 4) .|. fromIntegral (w - 48) | w >= 97 = (a `shiftL` 4) .|. fromIntegral (w - 87) | otherwise = (a `shiftL` 4) .|. fromIntegral (w - 55) where w = ord c {-# SPECIALISE hexadecimal :: Parser Int #-} {-# SPECIALISE hexadecimal :: Parser Int8 #-} {-# SPECIALISE hexadecimal :: Parser Int16 #-} {-# SPECIALISE hexadecimal :: Parser Int32 #-} {-# SPECIALISE hexadecimal :: Parser Int64 #-} {-# SPECIALISE hexadecimal :: Parser Integer #-} {-# SPECIALISE hexadecimal :: Parser Word #-} {-# SPECIALISE hexadecimal :: Parser Word8 #-} {-# SPECIALISE hexadecimal :: Parser Word16 #-} {-# SPECIALISE hexadecimal :: Parser Word32 #-} {-# SPECIALISE hexadecimal :: Parser Word64 #-} -- | Parse and decode an unsigned decimal number. decimal :: Integral a => Parser a decimal = T.foldl' step 0 `fmap` takeWhile1 isDecimal where step a c = a * 10 + fromIntegral (ord c - 48) {-# SPECIALISE decimal :: Parser Int #-} {-# SPECIALISE decimal :: Parser Int8 #-} {-# SPECIALISE decimal :: Parser Int16 #-} {-# SPECIALISE decimal :: Parser Int32 #-} {-# SPECIALISE decimal :: Parser Int64 #-} {-# SPECIALISE decimal :: Parser Integer #-} {-# SPECIALISE decimal :: Parser Word #-} {-# SPECIALISE decimal :: Parser Word8 #-} {-# SPECIALISE decimal :: Parser Word16 #-} {-# SPECIALISE decimal :: Parser Word32 #-} {-# SPECIALISE decimal :: Parser Word64 #-} isDecimal :: Char -> Bool isDecimal c = c >= '0' && c <= '9' {-# INLINE isDecimal #-} -- | Parse a number with an optional leading @\'+\'@ or @\'-\'@ sign -- character. signed :: Num a => Parser a -> Parser a {-# SPECIALISE signed :: Parser Int -> Parser Int #-} {-# SPECIALISE signed :: Parser Int8 -> Parser Int8 #-} {-# SPECIALISE signed :: Parser Int16 -> Parser Int16 #-} {-# SPECIALISE signed :: Parser Int32 -> Parser Int32 #-} {-# SPECIALISE signed :: Parser Int64 -> Parser Int64 #-} {-# SPECIALISE signed :: Parser Integer -> Parser Integer #-} signed p = (negate <$> (I.char '-' *> p)) <|> (I.char '+' *> p) <|> p -- | Parse a rational number. -- -- The syntax accepted by this parser is the same as for 'double'. -- -- /Note/: this parser is not safe for use with inputs from untrusted -- sources. An input with a suitably large exponent such as -- @"1e1000000000"@ will cause a huge 'Integer' to be allocated, -- resulting in what is effectively a denial-of-service attack. -- -- In most cases, it is better to use 'double' or 'scientific' -- instead. rational :: Fractional a => Parser a {-# SPECIALIZE rational :: Parser Double #-} {-# SPECIALIZE rational :: Parser Float #-} {-# SPECIALIZE rational :: Parser Rational #-} {-# SPECIALIZE rational :: Parser Scientific #-} rational = scientifically realToFrac -- | Parse a rational number. -- -- This parser accepts an optional leading sign character, followed by -- at least one decimal digit. The syntax similar to that accepted by -- the 'read' function, with the exception that a trailing @\'.\'@ or -- @\'e\'@ /not/ followed by a number is not consumed. -- -- Examples with behaviour identical to 'read', if you feed an empty -- continuation to the first result: -- -- >rational "3" == Done 3.0 "" -- >rational "3.1" == Done 3.1 "" -- >rational "3e4" == Done 30000.0 "" -- >rational "3.1e4" == Done 31000.0, "" -- -- Examples with behaviour identical to 'read': -- -- >rational ".3" == Fail "input does not start with a digit" -- >rational "e3" == Fail "input does not start with a digit" -- -- Examples of differences from 'read': -- -- >rational "3.foo" == Done 3.0 ".foo" -- >rational "3e" == Done 3.0 "e" -- -- This function does not accept string representations of \"NaN\" or -- \"Infinity\". double :: Parser Double double = scientifically Sci.toRealFloat -- | Parse a number, attempting to preserve both speed and precision. -- -- The syntax accepted by this parser is the same as for 'double'. -- -- This function does not accept string representations of \"NaN\" or -- \"Infinity\". number :: Parser Number number = scientifically $ \s -> let e = Sci.base10Exponent s c = Sci.coefficient s in if e >= 0 then I (c * 10 ^ e) else D (Sci.toRealFloat s) {-# DEPRECATED number "Use 'scientific' instead." #-} -- | Parse a scientific number. -- -- The syntax accepted by this parser is the same as for 'double'. scientific :: Parser Scientific scientific = scientifically id -- A strict pair data SP = SP !Integer {-# UNPACK #-}!Int {-# INLINE scientifically #-} scientifically :: (Scientific -> a) -> Parser a scientifically h = do !positive <- ((== '+') <$> I.satisfy (\c -> c == '-' || c == '+')) <|> pure True n <- decimal let f fracDigits = SP (T.foldl' step n fracDigits) (negate $ T.length fracDigits) step a c = a * 10 + fromIntegral (ord c - 48) SP c e <- (I.satisfy (=='.') *> (f <$> I.takeWhile isDigit)) <|> pure (SP n 0) let !signedCoeff | positive = c | otherwise = -c (I.satisfy (\w -> w == 'e' || w == 'E') *> fmap (h . Sci.scientific signedCoeff . (e +)) (signed decimal)) <|> return (h $ Sci.scientific signedCoeff e) -- | Parse a single digit, as recognised by 'isDigit'. digit :: Parser Char digit = I.satisfy isDigit <?> "digit" {-# INLINE digit #-} -- | Parse a letter, as recognised by 'isAlpha'. letter :: Parser Char letter = I.satisfy isAlpha <?> "letter" {-# INLINE letter #-} -- | Parse a space character, as recognised by 'isSpace'. space :: Parser Char space = I.satisfy isSpace <?> "space" {-# INLINE space #-} -- | Skip over white space. skipSpace :: Parser () skipSpace = I.skipWhile isSpace {-# INLINE skipSpace #-} -- $specalt -- -- If you enable the @OverloadedStrings@ language extension, you can -- use the '*>' and '<*' combinators to simplify the common task of -- matching a statically known string, then immediately parsing -- something else. -- -- Instead of writing something like this: -- -- @ --'I.string' \"foo\" '*>' wibble -- @ -- -- Using @OverloadedStrings@, you can omit the explicit use of -- 'I.string', and write a more compact version: -- -- @ -- \"foo\" '*>' wibble -- @ -- -- (Note: the '.*>' and '<*.' combinators that were originally -- provided for this purpose are obsolete and unnecessary, and will be -- removed in the next major version.) -- | /Obsolete/. A type-specialized version of '*>' for 'Text'. Use -- '*>' instead. (.*>) :: Text -> Parser a -> Parser a s .*> f = I.string s *> f {-# DEPRECATED (.*>) "This is no longer necessary, and will be removed. Use '*>' instead." #-} -- | /Obsolete/. A type-specialized version of '<*' for 'Text'. Use -- '*>' instead. (<*.) :: Parser a -> Text -> Parser a f <*. s = f <* I.string s {-# DEPRECATED (<*.) "This is no longer necessary, and will be removed. Use '<*' instead." #-}