{-# LANGUAGE BangPatterns #-} {-# LANGUAGE BinaryLiterals #-} module Djot.Parse ( Parser , Chunk(..) , parse , asciiChar , satisfyByte , skipSatisfyByte , satisfy , anyChar , skipMany , skipSome , eof , getState , updateState , lookahead , peek , peekBack , fails , failed , withByteString , byteStringOf , notFollowedBy , optional_ , byteString , getOffset , sourceLine , sourceColumn , branch , endline , restOfLine , ws , followedByWhitespace , followedByBlankLine , spaceOrTab , isWs , strToUtf8 , utf8ToStr ) where import qualified Data.ByteString as B import qualified Data.ByteString.Char8 as B8 import Data.ByteString (ByteString) import Control.Applicative import Control.Monad (void, MonadPlus(..)) import Data.Char (chr) import Data.Bits import Data.Maybe (fromMaybe) import Data.Text.Encoding (decodeUtf8With, encodeUtf8) import Data.Text.Encoding.Error (lenientDecode) import qualified Data.Text as T -- import Text.Printf -- import Debug.Trace newtype Parser s a = Parser{ runParser :: ParserState s -> Maybe (ParserState s, a) } instance Functor (Parser s) where fmap f g = Parser $ \s -> case runParser g s of Nothing -> Nothing Just (s', !x) -> Just (s', f x) instance Applicative (Parser s) where pure x = Parser (\s -> Just (s, x)) liftA2 f g h = Parser $ \s -> case runParser g s of Nothing -> Nothing Just (s', x) -> case runParser h s' of Nothing -> Nothing Just (s'', y) -> Just (s'', f x y) instance Monad (Parser s) where return = pure f >>= g = Parser $ \s -> case runParser f s of Nothing -> Nothing Just (s', x) -> runParser (g x) s' instance Alternative (Parser s) where empty = Parser (const Nothing) f <|> g = Parser $ \s -> case runParser f s of Just (s', x) -> Just (s', x) Nothing -> runParser g s instance MonadPlus (Parser s) where mzero = empty mplus = (<|>) data Chunk = Chunk{ chunkLine :: Int , chunkColumn :: Int , chunkBytes :: ByteString } deriving (Show, Eq, Ord) data ParserState a = ParserState { chunks :: [Chunk] , subject :: !ByteString , offset :: !Int , line :: !Int , column :: !Int , userState :: !a } deriving (Show) -- | Apply a parser to a bytestring with a given user state. -- Returns @Nothing@ on failure, @Just result@ on success. parse :: Parser s a -> s -> [Chunk] -> Maybe a parse parser ustate chunks'' = snd <$> runParser parser ParserState { chunks = chunks' , subject = bs , offset = 0 , line = startline , column = startcol , userState = ustate } where (chunks', bs, startline, startcol) = case chunks'' of [] -> ([], mempty, 1, 0) (c:cs) -> (cs, chunkBytes c, chunkLine c, chunkColumn c) -- | Given a number of bytes, advances the offset and updates line/column. unsafeAdvance :: Int -> ParserState s -> ParserState s unsafeAdvance 0 = id unsafeAdvance !n = unsafeAdvance (n - 1) . unsafeAdvanceByte -- | Advance the offset and line/column for consuming a given byte. unsafeAdvanceByte :: ParserState s -> ParserState s unsafeAdvanceByte st | offset st + 1 >= B.length (subject st) , c:cs <- chunks st = st{ chunks = cs , subject = chunkBytes c , offset = 0 , line = chunkLine c , column = chunkColumn c } | otherwise = case B.index (subject st) (offset st) of 10 -> st{ offset = offset st + 1 , line = line st + 1 , column = 1 } 9 -> st{ offset = offset st + 1 , column = column st + (4 - (column st `mod` 4)) } !w | w < 0x80 -> st{ offset = offset st + 1 , column = column st + 1 } -- utf8 multibyte: only count byte 1: | w >= 0b11000000 -> st{ offset = offset st + 1 , column = column st + 1 } | otherwise -> st{ offset = offset st + 1 } -- | Returns current byte as Char. current :: ParserState s -> Maybe Char current st = subject st B8.!? offset st -- | Returns current byte as Char. peek :: Parser s (Maybe Char) peek = Parser $ \st -> Just (st, current st) -- | Returns previous byte as Char. Doesn't cross chunk boundaries. peekBack :: Parser s (Maybe Char) peekBack = Parser $ \st -> Just (st, subject st B8.!? (offset st - 1)) -- | Parse a byte satisfying a predicate. satisfyByte :: (Char -> Bool) -> Parser s Char satisfyByte f = Parser $ \st -> case current st of Just c | f c -> Just (unsafeAdvanceByte st, c) _ -> Nothing -- | Skip byte satisfying a predicate. skipSatisfyByte :: (Char -> Bool) -> Parser s () skipSatisfyByte f = Parser $ \st -> case current st of Just c | f c -> Just (unsafeAdvanceByte st, ()) _ -> Nothing -- | Parse a (possibly multibyte) Char satisfying a predicate. -- Assumes UTF-8 encoding. satisfy :: (Char -> Bool) -> Parser s Char satisfy f = Parser $ \st -> let peekWord !n = subject st B.!? (offset st + n) b2 = fromMaybe 0 $ peekWord 1 b3 = fromMaybe 0 $ peekWord 2 b4 = fromMaybe 0 $ peekWord 3 in case peekWord 0 of Nothing -> Nothing Just b1 | b1 < 0b10000000 , !c <- chr (fromIntegral b1) , f c -> Just (unsafeAdvanceByte st, c) | b1 .&. 0b11100000 == 0b11000000 , b2 >= 0b10000000 , !c <- chr (toCodePoint2 b1 b2) , f c -> Just (unsafeAdvance 2 st, c) | b1 .&. 0b11110000 == 0b11100000 , b2 >= 0b10000000 , b3 >= 0b10000000 , !c <- chr (toCodePoint3 b1 b2 b3) , f c -> Just (unsafeAdvance 3 st, c) | b1 .&. 0b11111000 == 0b11110000 , b2 >= 0b10000000 , b3 >= 0b10000000 , b4 >= 0b10000000 , !c <- chr (toCodePoint4 b1 b2 b3 b4) , f c -> Just (unsafeAdvance 4 st, c) | otherwise -> Nothing where toCodePoint2 a b = (fromIntegral (a .&. 0b00011111) `shiftL` 6) + fromIntegral (b .&. 0b00111111) toCodePoint3 a b c = (fromIntegral (a .&. 0b00001111) `shiftL` 12) + (fromIntegral (b .&. 0b00111111) `shiftL` 6) + fromIntegral (c .&. 0b00111111) toCodePoint4 a b c d = (fromIntegral (a .&. 0b00000111) `shiftL` 18) + (fromIntegral (b .&. 0b00111111) `shiftL` 12) + (fromIntegral (c .&. 0b00111111) `shiftL` 6) + fromIntegral (d .&. 0b00111111) -- | Parse any character. Assumes UTF-8 encoding. anyChar :: Parser s Char anyChar = satisfy (const True) -- | Parse an ASCII character. asciiChar :: Char -> Parser s () asciiChar !c = Parser $ \st -> case current st of Just d | d == c -> Just (unsafeAdvanceByte st, ()) _ -> Nothing -- | Apply parser 0 or more times, discarding result. skipMany :: Parser s a -> Parser s () skipMany parser = Parser go where go st = case runParser parser st of Nothing -> Just (st, ()) Just (st',_) -> go st' -- | Apply parser 1 or more times, discarding result. skipSome :: Parser s a -> Parser s () skipSome parser = parser *> skipMany parser -- | Succeeds if no more input. eof :: Parser s () eof = Parser $ \st -> case current st of Nothing -> Just (st, ()) Just _ -> Nothing -- | Returns current user state. getState :: Parser s s getState = Parser $ \st -> Just (st, userState st) -- | Updates user state. updateState :: (s -> s) -> Parser s () updateState f = Parser $ \st -> Just (st{ userState = f (userState st) }, ()) -- | Apply a parser, returning its result but not changing state -- or advancing. lookahead :: Parser s a -> Parser s a lookahead pa = Parser $ \st -> case runParser pa st of Just (_, x) -> Just (st, x) Nothing -> Nothing -- | Succeeds if parser fails. fails :: Parser s a -> Parser s () fails pa = Parser $ \st -> case runParser pa st of Just _ -> Nothing Nothing -> Just (st, ()) -- | Always fails. failed :: Parser s a failed = Parser $ const Nothing -- | Returns result of parse together with the bytestring -- consumed. withByteString :: Parser s a -> Parser s (a, ByteString) withByteString pa = Parser $ \st -> case runParser pa st of Just (st', x) -> Just (st', (x, B8.take (offset st' - offset st) (B8.drop (offset st) (subject st)))) Nothing -> Nothing -- | Returns bytestring consumed by parse. byteStringOf :: Parser s a -> Parser s ByteString byteStringOf pa = Parser $ \st -> case runParser pa st of Just (st', _) -> Just (st', case length (chunks st) - length (chunks st') of 0 -> B8.take (offset st' - offset st) (B8.drop (offset st) (subject st)) n -> B8.drop (offset st) (subject st) <> foldMap chunkBytes (take (n - 1) (chunks st)) <> B8.take (offset st') (subject st')) Nothing -> Nothing -- | Succeeds if first parser succeeds and second fails, returning -- first parser's value. notFollowedBy :: Parser s a -> Parser s b -> Parser s a notFollowedBy pa pb = pa <* fails pb -- | Apply parser but still succeed if it doesn't succeed. optional_ :: Parser s a -> Parser s () optional_ pa = void pa <|> pure () -- | Parse a bytestring. byteString :: ByteString -> Parser s () byteString bs = Parser $ \st -> if bs `B8.isPrefixOf` B8.drop (offset st) (subject st) then Just (unsafeAdvance (B.length bs) st, ()) else Nothing -- | Returns byte offset in input. getOffset :: Parser s Int getOffset = Parser $ \st -> Just (st, offset st) -- | Returns the line number. sourceLine :: Parser s Int sourceLine = Parser $ \st -> Just (st, line st) -- | Returns the source column number. (Tab stop is computed at 4.) sourceColumn :: Parser st Int sourceColumn = Parser $ \st -> Just (st, column st) -- | Try the first parser: if it succeeds, apply the second, -- returning its result, otherwise the third. branch :: Parser s b -> Parser s a -> Parser s a -> Parser s a branch pa pb pc = Parser $ \st -> case runParser pa st of Just (st',_) -> runParser pb st' Nothing -> runParser pc st -- | Parse an end of line sequence. endline :: Parser s () endline = branch (asciiChar '\r') (optional_ (asciiChar '\n')) (asciiChar '\n') -- | Return the rest of line (including the end of line). restOfLine :: Parser s ByteString restOfLine = byteStringOf $ skipMany (skipSatisfyByte (\c -> c /= '\n' && c /= '\r')) <* optional_ endline {-# INLINE isWs #-} -- | Is space, tab, `\r`, or `\n`. isWs :: Char -> Bool isWs c = c == ' ' || c == '\t' || c == '\r' || c == '\n' -- | Skip one space or tab. spaceOrTab :: Parser s () spaceOrTab = Parser $ \st -> case current st of Just ' ' -> Just (unsafeAdvanceByte st, ()) Just '\t' -> Just (unsafeAdvanceByte st, ()) _ -> Nothing -- | Skip 1 or more ASCII whitespace. ws :: Parser s () ws = skipSome (satisfyByte isWs) -- | Next character is ASCII whitespace. followedByWhitespace :: Parser s () followedByWhitespace = Parser $ \st -> case current st of Just c | isWs c -> Just (st, ()) _ -> Nothing -- | Followed by 0 or more spaces/tabs and endline or eof. followedByBlankLine :: Parser s () followedByBlankLine = Parser $ \st -> let subj = subject st !len = B8.length subj go !off | off >= len = Just (st, ()) | otherwise = case B8.index subj off of ' ' -> go (off + 1) '\r' -> go (off + 1) '\t' -> go (off + 1) '\n' -> Just (st, ()) _ -> Nothing in go (offset st) strToUtf8 :: String -> ByteString strToUtf8 = encodeUtf8 . T.pack utf8ToStr :: ByteString -> String utf8ToStr = T.unpack . decodeUtf8With lenientDecode