Language/Parse.hs

{-# LANGUAGE FlexibleContexts #-}
{-| Utility library that provides parsers for commonly-occuring programming
    constructs such as identifiers, numbers and characters.
-}
module Language.Parse (
    -- * Combinators
    -- ** Composable
      string
    , lookAhead
    , manyTill
    , manyThru
    , (<|>)
    , choice
    -- ** Extra
    , many2
    , between2
    , isEof
    , spaces1
    , charICase
    , stringICase
    -- * Identifiers
    , blacklistChar
    -- * Numbers
    -- ** Prepackaged Parsers
    , anyNumber
    -- ** Number Parts
    , signLiteral
    , baseLiteral
    , naturalLiteral
    , mantissaLiteral
    , exponentLiteral
    , denominatorLiteral
    , xDigit
    -- ** Convert Strings to Numbers
    , stringToInteger
    , stringToMantissa
    -- * Characters
    , literalChar
    , maybeLiteralChar
    ) where

import Control.Monad
import Control.Applicative ((<$>), (<*>), (*>), (<*))

import Data.Maybe
import Data.Ratio
import Data.Char
import Text.Parsec ( ParsecT
                   , satisfy, char, oneOf, eof
                   , try, (<?>), parserZero)
import qualified Text.Parsec as P



--FIXME put this in Parsec.Combinators.Composable
------ Composable Combinators ------
{-| Parse a string, but don't consume input on failure. -}
string :: (Monad m, P.Stream s m Char) => String -> ParsecT s u m String
string = try . P.string

{-| Lookahead without consuming any input. -}
lookAhead :: (Monad m, P.Stream s m t) => ParsecT s u m a -> ParsecT s u m a
lookAhead = try . P.lookAhead

{-| Use @manyTill p e@ to apply parser @p@ many times, stopping as soon as
    @e@ is next to parse. Note that @e@ is not consumed.
-}
manyTill :: (Monad m, P.Stream s m t) => ParsecT s u m a -> ParsecT s u m b -> ParsecT s u m [a]
manyTill p e = P.manyTill p (lookAhead e)

{-| Use @manyThru p e@ to apply parser @p@ many times, stopping as soon as
    @e@ is consumed. Unlike Parsec's @manyTill@, if @e@ fails, it does not
    consume input.
-}
manyThru :: (Monad m, P.Stream s m t) => ParsecT s u m a -> ParsecT s u m b -> ParsecT s u m [a]
manyThru p e = P.manyTill p (try e)

{-| Use @a <|> b@ to parse @a@ or @b@. If @a@ fails, no input is consumed. -}
(<|>) :: (Monad m, P.Stream s m t) => ParsecT s u m a -> ParsecT s u m a -> ParsecT s u m a
a <|> b = try a P.<|> b

{-| Parse the first of the passed combinators that succeeds. If any
    parser fails, it does not consume input.
-}
choice :: (Monad m, P.Stream s m t) => [ParsecT s u m a] -> ParsecT s u m a
choice = P.choice . map try

--TODO sepBy &co


------ Useful Combinators ------
{-| Use @many2 a b@ to parse an @a@ followed by zero or more @b@s. -}
many2 :: (Monad m, P.Stream s m t) => ParsecT s u m a -> ParsecT s u m a -> ParsecT s u m [a]
many2 p ps = do
    car <- p
    cdr <- P.many ps
    return (car:cdr)

{-| Use @between2 a p@ to parse an @a@, then a @p@, then an @a@. Return the
    results of the @p@ parser.
-}
between2 :: (Monad m, P.Stream s m t) => ParsecT s u m a -> ParsecT s u m b -> ParsecT s u m b
between2 e p = P.between e e p

{-| Detect end of file as a boolean. -}
isEof :: (Show t, Monad m, P.Stream s m t) => ParsecT s u m Bool
isEof = (eof >> return True) P.<|> return False


{-| One or more spaces. -}
spaces1 :: (Monad m, P.Stream s m Char) => ParsecT s u m ()
spaces1 = void $ P.many1 P.space

{-| Parse one character, case-insensitive. -}
charICase :: (Monad m, P.Stream s m Char) => Char -> ParsecT s u m Char
charICase c = satisfy $ (== toLower c) . toLower

{-| Parse a string, case-insensitive. If this parser fails, it consumes no input. -}
stringICase :: (Monad m, P.Stream s m Char) => String -> ParsecT s u m String
stringICase str = try $ mapM charICase str


------ Parsing Identifiers ------
{-| Parses a wide variety of characters, excepting those which meet
    the passed predicate. Specifically, we accept all of Unicode except:
        
        * Space
        
        * LineSeparator
        
        * ParagraphSeparator
        
        * Control
        
        * Format
        
        * Surrogate
        
        * PrivateUse

        * NotAssigned
-}
blacklistChar :: (Monad m, P.Stream s m Char) => (Char -> Bool) -> ParsecT s u m Char
blacklistChar p = satisfy $ \c -> not (p c) && case generalCategory c of
    Space -> False
    LineSeparator -> False
    ParagraphSeparator -> False
    Control -> False
    Format -> False
    Surrogate -> False
    PrivateUse -> False
    NotAssigned -> False
    _ -> True --Letter, Mark, Number, Punctuation/Quote, Symbol

--TODO maybe normal c-like identifiers, maybe identifiers that could be word-based vs. symbol-based


------ Parsing Numbers ------
--TODO common combinations of the number part parsers
{-| Optional sign, then an integer number in scientific notation
    or ratio, in base 2, 8, 10 or 16. If in scientific notation,
    the exponent may be in base 10 or 16
-}
anyNumber :: (Monad m, P.Stream s m Char) => ParsecT s u m Rational
anyNumber = (<?> "number") $ try $ do
    sign <- P.option 1 signLiteral
    base <- baseLiteral
    whole <- naturalLiteral base
    n <- choice [ scientificNotation whole base
                , fractionNotation whole base
                , return (whole % 1)
                ]
    return $ fromIntegral sign * n
    where
    scientificNotation whole base = do
        mantissa <- mantissaLiteral base
        (expbase, exponent) <- P.option (1,0) (decimalExp <|> hexExp)
        return $ ((whole % 1) + mantissa) * (fromIntegral expbase ^^ exponent)
    fractionNotation whole base = (whole %) . denominator <$> denominatorLiteral base
    decimalExp = (,) 10 <$> exponentLiteral 10
    hexExp = (,) 16 <$> exponentLiteral 16 


{-| Parse a minus or plus sign and return the appropriate multiplier. -}
signLiteral :: (Monad m, P.Stream s m Char) => ParsecT s u m Integer
signLiteral = (<?> "sign") $ (char '-' >> return (-1)) P.<|> (char '+' >> return 1)

{-| Parse \"0x\", \"0o\", or \"0b\" case-insensitive and return the appropriate base.
    If none of these parse, return base 10.
-}
baseLiteral :: (Monad m, P.Stream s m Char) => ParsecT s u m Int
baseLiteral = choice [ (stringICase "0x") >> return 16
                     , (stringICase "0o") >> return  8
                     , (stringICase "0b") >> return  2
                     ,                       return 10
                     ]

{-| Parse many digits in the passed base and return the corresponding integer. -}
naturalLiteral :: (Monad m, P.Stream s m Char) => Int -> ParsecT s u m Integer
naturalLiteral base = (<?> "natural number") $ stringToInteger base <$> P.many1 (xDigit base)

{-| Parse a dot followed by many digits in the passed base and return
    the corresponding ratio.
-}
mantissaLiteral :: (Monad m, P.Stream s m Char) => Int -> ParsecT s u m Rational
mantissaLiteral base = (<?> "mantissa") $ do
    char '.'
    stringToMantissa base <$> P.many1 (xDigit base)

{-| In base 10, parse an 'e' and a decimal integer.
    In base 16, parse an 'h' and a hexadecimal integer.
-}
exponentLiteral :: (Monad m, P.Stream s m Char) => Int -> ParsecT s u m Integer
exponentLiteral base = (<?> "exponent") (go base)
    where
    body = (*) <$> P.option 1 signLiteral <*> naturalLiteral base
    go 10 = charICase 'e' >> body
    go 16 = charICase 'h' >> body
    go _ = error "unrecognized base in Language.Parser.exponentLiteral (accepts only 10 or 16)"

{-| Parse a '/' and a natural in the passed base. Return the
    reciprocal of that number.
-}
denominatorLiteral :: (Monad m, P.Stream s m Char) => Int -> ParsecT s u m Rational
denominatorLiteral base = (<?> "denominator") $ do
    denom <- char '/' >> naturalLiteral base
    if denom == 0 then parserZero else return (1%denom)


{-| Parse a digit in the passed base: 2, 8, 10 or 16. -}
xDigit :: (Monad m, P.Stream s m Char) => Int -> ParsecT s u m Char
xDigit base = case base of
    2  -> oneOf "01"
    8  -> P.octDigit
    10 -> P.digit
    16 -> P.hexDigit
    _ -> error "unrecognized base in Language.Parser.xDigit (accepts only 2, 8, 10, or 16)"

{-| Interpret a string as an integer in the passed base. -}
stringToInteger :: Int -> String -> Integer
stringToInteger base = foldl impl 0
    where impl acc x = acc * fromIntegral base + (fromIntegral . digitToInt) x

{-| Interpret a string as a mantissa in the passed base. -}
stringToMantissa :: Int -> String -> Ratio Integer
stringToMantissa base = (/ (fromIntegral base%1)) . foldr impl (0 % 1)
    where impl x acc = acc / (fromIntegral base%1) + (((%1) . fromIntegral . digitToInt) x)


------ Parsing Character Literals ------
{-| Parse a single character as if in a string literal. This should be applicable
    to both character and string literals.

    Here's the list of what characters are accepted:

    * Any single unicode character that is not an ASCII control character, backslash, or double-quote.

    * Line continuation: backslash, then advance over whitespace
        (including newlines and comments) through the next backslash.

    * Octal or hexadecimal ASCII escapes: a sequence in @\/\\\\(x[0-9a-fA-F]{2}|o[0-7]{3})\/@.

    * Unicode escapes: a sequence in @\/\\\\(u|U0[0-9a-fA-F]|U10)[0-9a-fA-F]{4}\/@.

    * Special escape: a sequence in @\/\\\\[0abefnrtv\'\"]\/@.
        For reference, the meanings of special escapes are:
    
@
\\0: nul             (ASCII 0,  0x00)
\\a: bell            (ASCII 7,  0x07)
\\b: backspace       (ASCII 8,  0x08)
\\e: escape          (ASCII 27, 0x1B)
\\f: form feed       (ASCII 12, 0x0C)
\\n: line feed       (ASCII 10, 0x0A)
\\r: carriage return (ASCII 13, 0x0D)
\\t: horizontal tab  (ASCII 9,  0x09)
\\v: vertical tab    (ASCII 11, 0x0B)
\\\': single quote    (ASCII 39, 0x27)
\\\": double quote    (ASCII 34, 0x22)
@
-}
literalChar :: (Monad m, P.Stream s m Char) => ParsecT s u m Char
literalChar = (satisfy isNormalChar <?> "printing character") P.<|> (escape <?> "escape sequence")
    where
    isNormalChar c = c >= ' ' && c `notElem` "\DEL\'\"\\" --FIXME limit this slightly more
    escape = char '\\' >> P.choice [specialEscape, numericalEscape]
    specialEscape = fromJust . flip lookup table <$> oneOf (map fst table)
        where table = [ ('0' , '\0')
                      , ('a' , '\a')
                      , ('b' , '\b')
                      , ('e' , '\27')
                      , ('f' , '\f')
                      , ('n' , '\n')
                      , ('r' , '\r')
                      , ('t' , '\t')
                      , ('\'', '\'')
                      , ('\"', '\"')
                      , ('\\', '\\')
                      ]
    numericalEscape = chr . fromInteger <$> P.choice [ascii16, uni4, ascii8, uni6]
    ascii8  = stringToInteger 8  <$> (oneOf "oO" >> P.count 3 P.octDigit)
    ascii16 = stringToInteger 16 <$> (oneOf "xX" >> P.count 2 P.hexDigit)
    uni4    = stringToInteger 16 <$> (char  'u'  >> P.count 4 P.hexDigit)
    uni6    =                         char   'U' >> (high P.<|> low)
        where
        low  =                 stringToInteger 16 <$> (char    '0' >> P.count 5 P.hexDigit)
        high =  (+ 0x100000) . stringToInteger 16 <$> (string "10" >> P.count 4 P.hexDigit)

{-| Parse any character accepted by 'literalChar', but also accept two empty characters:
    
    * @\\&@ The eplicit empty character.
    
    * Backslash-whitespace-backslash.
-}
maybeLiteralChar :: (Monad m, P.Stream s m Char) => ParsecT s u m (Maybe Char)
maybeLiteralChar = (Just <$> literalChar) P.<|> (const Nothing <$> (string "\\&" P.<|> lineContinue)) 
    where
    lineContinue = between2 (char '\\') (P.many $ oneOf " \t\n\r") --FIXME more types of whitespace could be allowed