{-# LANGUAGE CPP #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE BinaryLiterals #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} -- | SAX parser and API for XML. module Xeno.SAX ( process , Process(..) , StringLike(..) , fold , validate , validateEx , dump , skipDoctype ) where import Control.Exception (throw) import Control.Monad (unless) import Control.Monad.ST (ST, runST) import Control.Monad.State.Strict (State, evalStateT, execState, modify', lift, get, put) import Control.Spork (spork) import Data.Bits (testBit) import Data.ByteString (ByteString) import qualified Data.ByteString as S import qualified Data.ByteString.Char8 as S8 import qualified Data.ByteString.Unsafe as SU import Data.Char (isSpace) import Data.Functor.Identity (Identity(..)) import Data.Semigroup () import Data.STRef (newSTRef, modifySTRef', readSTRef) import Data.Word (Word8, Word64) import Xeno.Types class StringLike str where s_index' :: str -> Int -> Word8 elemIndexFrom' :: Word8 -> str -> Int -> Maybe Int drop' :: Int -> str -> str substring' :: str -> Int -> Int -> ByteString toBS :: str -> ByteString instance StringLike ByteString where s_index' = s_index {-# INLINE s_index' #-} elemIndexFrom' = elemIndexFrom {-# INLINE elemIndexFrom' #-} drop' = S.drop {-# INLINE drop' #-} substring' = substring {-# INLINE substring' #-} toBS = id {-# INLINE toBS #-} instance StringLike ByteStringZeroTerminated where s_index' (BSZT ps) n = ps `SU.unsafeIndex` n {-# INLINE s_index' #-} elemIndexFrom' w (BSZT bs) i = elemIndexFrom w bs i {-# INLINE elemIndexFrom' #-} drop' i (BSZT bs) = BSZT $ S.drop i bs {-# INLINE drop' #-} substring' (BSZT bs) s t = substring bs s t {-# INLINE substring' #-} toBS (BSZT bs) = bs {-# INLINE toBS #-} -- | Parameters to the 'process' function data Process a = Process { openF :: !(ByteString -> a) -- ^ Open tag. , attrF :: !(ByteString -> ByteString -> a) -- ^ Tag attribute. , endOpenF :: !(ByteString -> a) -- ^ End open tag. , textF :: !(ByteString -> a) -- ^ Text. , closeF :: !(ByteString -> a) -- ^ Close tag. , cdataF :: !(ByteString -> a) -- ^ CDATA. } -------------------------------------------------------------------------------- -- Helpful interfaces to the parser -- | Parse the XML but return no result, process no events. -- -- N.B.: Only the lexical correctness of the input string is checked, not its XML semantics (e.g. only if tags are well formed, not whether tags are properly closed) -- -- > > :set -XOverloadedStrings -- > > validate "" -- > True -- -- > > validate " False validate :: (StringLike str) => str -> Bool validate s = case spork (runIdentity (process Process { openF = \_ -> pure () , attrF = \_ _ -> pure () , endOpenF = \_ -> pure () , textF = \_ -> pure () , closeF = \_ -> pure () , cdataF = \_ -> pure () } s)) of Left (_ :: XenoException) -> False Right _ -> True -- It must be inlined or specialised to ByteString/ByteStringZeroTerminated {-# INLINE validate #-} {-# SPECIALISE validate :: ByteString -> Bool #-} {-# SPECIALISE validate :: ByteStringZeroTerminated -> Bool #-} -- | Parse the XML and checks tags nesting. -- validateEx :: (StringLike str) => str -> Bool validateEx s = case spork (runST $ do tags <- newSTRef [] (process Process { openF = \tag -> modifySTRef' tags (tag:) , attrF = \_ _ -> pure () , endOpenF = \_ -> pure () , textF = \_ -> pure () , closeF = \tag -> modifySTRef' tags $ \case [] -> fail $ "Unexpected close tag \"" ++ show tag ++ "\"" (expectedTag:tags') -> if expectedTag == tag then tags' else fail $ "Unexpected close tag. Expected \"" ++ show expectedTag ++ "\", but got \"" ++ show tag ++ "\"" , cdataF = \_ -> pure () } s) readSTRef tags >>= \case [] -> return () tags' -> error $ "Not all tags closed: " ++ show tags' ) of Left (_ :: XenoException) -> False Right _ -> True {-# INLINE validateEx #-} {-# SPECIALISE validateEx :: ByteString -> Bool #-} {-# SPECIALISE validateEx :: ByteStringZeroTerminated -> Bool #-} -- | Parse the XML and pretty print it to stdout. dump :: ByteString -> IO () dump str = evalStateT (process Process { openF = \name -> do level <- get lift (S8.putStr (S8.replicate level ' ' <> "<" <> name <> "")) , attrF = \key value -> lift (S8.putStr (" " <> key <> "=\"" <> value <> "\"")) , endOpenF = \_ -> do level <- get let !level' = level + 2 put level' lift (S8.putStrLn (">")) , textF = \text -> do level <- get lift (S8.putStrLn (S8.replicate level ' ' <> S8.pack (show text))) , closeF = \name -> do level <- get let !level' = level - 2 put level' lift (S8.putStrLn (S8.replicate level' ' ' <> " name <> ">")) , cdataF = \cdata -> do level <- get lift (S8.putStrLn (S8.replicate level ' ' <> "CDATA: " <> S8.pack (show cdata))) } str) (0 :: Int) -- | Fold over the XML input. fold :: (s -> ByteString -> s) -- ^ Open tag. -> (s -> ByteString -> ByteString -> s) -- ^ Attribute key/value. -> (s -> ByteString -> s) -- ^ End of open tag. -> (s -> ByteString -> s) -- ^ Text. -> (s -> ByteString -> s) -- ^ Close tag. -> (s -> ByteString -> s) -- ^ CDATA. -> s -> ByteString -> Either XenoException s fold openF attrF endOpenF textF closeF cdataF s str = spork (execState (process Process { openF = \name -> modify' (\s' -> openF s' name) , attrF = \key value -> modify' (\s' -> attrF s' key value) , endOpenF = \name -> modify' (\s' -> endOpenF s' name) , textF = \text -> modify' (\s' -> textF s' text) , closeF = \name -> modify' (\s' -> closeF s' name) , cdataF = \cdata -> modify' (\s' -> cdataF s' cdata) } str) s) -------------------------------------------------------------------------------- -- Main parsing function -- | Process events with callbacks in the XML input. process :: (Monad m, StringLike str) => Process (m ()) -> str -> m () process !(Process {openF, attrF, endOpenF, textF, closeF, cdataF}) str = findLT 0 where findLT index = case elemIndexFrom' openTagChar str index of Nothing -> unless (S.null text) (textF text) where text = toBS $ drop' index str Just fromLt -> do unless (S.null text) (textF text) checkOpenComment (fromLt + 1) where text = substring' str index fromLt -- Find open comment, CDATA or tag name. checkOpenComment index | s_index' this 0 == bangChar -- ! , s_index' this 1 == commentChar -- - , s_index' this 2 == commentChar -- - = findCommentEnd (index + 3) | s_index' this 0 == bangChar -- ! , s_index' this 1 == openAngleBracketChar -- [ , s_index' this 2 == 67 -- C , s_index' this 3 == 68 -- D , s_index' this 4 == 65 -- A , s_index' this 5 == 84 -- T , s_index' this 6 == 65 -- A , s_index' this 7 == openAngleBracketChar -- [ = findCDataEnd (index + 8) (index + 8) | otherwise = findTagName index where this = drop' index str findCommentEnd index = case elemIndexFrom' commentChar str index of Nothing -> throw $ XenoParseError index "Couldn't find the closing comment dash." Just fromDash -> if s_index' this 0 == commentChar && s_index' this 1 == closeTagChar then findLT (fromDash + 2) else findCommentEnd (fromDash + 1) where this = drop' index str findCDataEnd cdata_start index = case elemIndexFrom' closeAngleBracketChar str index of Nothing -> throw $ XenoParseError index "Couldn't find closing angle bracket for CDATA." Just fromCloseAngleBracket -> if s_index' str (fromCloseAngleBracket + 1) == closeAngleBracketChar then do cdataF (substring' str cdata_start fromCloseAngleBracket) findLT (fromCloseAngleBracket + 3) -- Start after ]]> else -- We only found one ], that means that we need to keep searching. findCDataEnd cdata_start (fromCloseAngleBracket + 1) findTagName index0 | s_index' str index0 == questionChar = case elemIndexFrom' closeTagChar str spaceOrCloseTag of Nothing -> throw $ XenoParseError index "Couldn't find the end of the tag." Just fromGt -> do findLT (fromGt + 1) | s_index' str spaceOrCloseTag == closeTagChar = do let tagname = substring' str index spaceOrCloseTag if s_index' str index0 == slashChar then closeF tagname else do openF tagname endOpenF tagname findLT (spaceOrCloseTag + 1) | otherwise = do let tagname = substring' str index spaceOrCloseTag openF tagname result <- findAttributes spaceOrCloseTag endOpenF tagname case result of Right closingTag -> findLT (closingTag + 1) Left closingPair -> do closeF tagname findLT (closingPair + 2) where index = if s_index' str index0 == slashChar then index0 + 1 else index0 spaceOrCloseTag = parseName str index findAttributes index0 -- case: /> | s_index' str index == slashChar , s_index' str (index + 1) == closeTagChar = pure (Left index) -- case: > | s_index' str index == closeTagChar = pure (Right index) -- case: attr=' or attr=" | s_index' str afterAttrName == equalChar , usedChar == quoteChar || usedChar == doubleQuoteChar = case elemIndexFrom' usedChar str (quoteIndex + 1) of Nothing -> throw (XenoParseError index "Couldn't find the matching quote character.") Just endQuoteIndex -> do attrF (substring' str index afterAttrName) (substring' str (quoteIndex + 1) (endQuoteIndex)) findAttributes (endQuoteIndex + 1) -- case: attr= without following quote | s_index' str afterAttrName == equalChar = throw (XenoParseError index("Expected ' or \", got: " <> S.singleton usedChar)) | otherwise = throw (XenoParseError index ("Expected =, got: " <> S.singleton (s_index' str afterAttrName) <> " at character index: " <> (S8.pack . show) afterAttrName)) where index = skipSpaces str index0 #ifdef WHITESPACE_AROUND_EQUALS afterAttrName = skipSpaces str (parseName str index) quoteIndex = skipSpaces str (afterAttrName + 1) #else afterAttrName = parseName str index quoteIndex = afterAttrName + 1 #endif usedChar = s_index' str quoteIndex {-# INLINE process #-} {-# SPECIALISE process :: Process (Identity ()) -> ByteString -> Identity () #-} {-# SPECIALISE process :: Process (State s ()) -> ByteString -> State s () #-} {-# SPECIALISE process :: Process (ST s ()) -> ByteString -> ST s () #-} {-# SPECIALISE process :: Process (IO ()) -> ByteString -> IO () #-} {-# SPECIALISE process :: Process (Identity ()) -> ByteStringZeroTerminated -> Identity () #-} {-# SPECIALISE process :: Process (State s ()) -> ByteStringZeroTerminated -> State s () #-} {-# SPECIALISE process :: Process (ST s ()) -> ByteStringZeroTerminated -> ST s () #-} {-# SPECIALISE process :: Process (IO ()) -> ByteStringZeroTerminated -> IO () #-} -------------------------------------------------------------------------------- -- ByteString utilities -- | /O(1)/ 'ByteString' index (subscript) operator, starting from 0. s_index :: ByteString -> Int -> Word8 s_index ps n | n < 0 = throw (XenoStringIndexProblem n ps) | n >= S.length ps = throw (XenoStringIndexProblem n ps) | otherwise = ps `SU.unsafeIndex` n {-# INLINE s_index #-} -- | A fast space skipping function. skipSpaces :: (StringLike str) => str -> Int -> Int skipSpaces str i = if isSpaceChar (s_index' str i) then skipSpaces str (i + 1) else i {-# INLINE skipSpaces #-} -- | Get a substring of a string. substring :: ByteString -> Int -> Int -> ByteString substring s start end = S.take (end - start) (S.drop start s) {-# INLINE substring #-} -- | Basically @findIndex (not . isNameChar)@, but doesn't allocate. parseName :: (StringLike str) => str -> Int -> Int parseName str index = if not (isNameChar1 (s_index' str index)) then index else parseName' str (index + 1) {-# INLINE parseName #-} -- | Basically @findIndex (not . isNameChar)@, but doesn't allocate. parseName' :: (StringLike str) => str -> Int -> Int parseName' str index = if not (isNameChar (s_index' str index)) then index else parseName' str (index + 1) {-# INLINE parseName' #-} -- | Get index of an element starting from offset. elemIndexFrom :: Word8 -> ByteString -> Int -> Maybe Int elemIndexFrom c str offset = fmap (+ offset) (S.elemIndex c (S.drop offset str)) -- Without the INLINE below, the whole function is twice as slow and -- has linear allocation. See git commit with this comment for -- results. {-# INLINE elemIndexFrom #-} -------------------------------------------------------------------------------- -- Character types isSpaceChar :: Word8 -> Bool isSpaceChar = testBit (0b100000000000000000010011000000000 :: Int) . fromIntegral -- | | || bits: -- | | |+-- 9 -- | | +--- 10 -- | +------ 13 -- +------------------------- 32 {-# INLINE isSpaceChar #-} -- | Is the character a valid first tag/attribute name constituent? -- 'a'-'z', 'A'-'Z', '_', ':' isNameChar1 :: Word8 -> Bool isNameChar1 c = (c >= 97 && c <= 122) || (c >= 65 && c <= 90) || c == 95 || c == 58 {-# INLINE isNameChar1 #-} -- isNameCharOriginal :: Word8 -> Bool -- isNameCharOriginal c = -- (c >= 97 && c <= 122) || (c >= 65 && c <= 90) || c == 95 || c == 58 || -- c == 45 || c == 46 || (c >= 48 && c <= 57) -- {-# INLINE isNameCharOriginal #-} -- -- TODO Strange, but highMaskIsNameChar, lowMaskIsNameChar don't calculate fast... FIX IT -- highMaskIsNameChar, lowMaskIsNameChar :: Word64 -- (highMaskIsNameChar, lowMaskIsNameChar) = -- foldl (\(hi,low) char -> (hi `setBit` (char - 64), low `setBit` char)) -- NB: `setBit` can process overflowed values (where char < 64; -- TODO fix it -- (0::Word64, 0::Word64) -- (map fromIntegral (filter isNameCharOriginal [0..128])) -- {-# INLINE highMaskIsNameChar #-} -- {-# INLINE lowMaskIsNameChar #-} -- | Is the character a valid tag/attribute name constituent? -- isNameChar1 + '-', '.', '0'-'9' isNameChar :: Word8 -> Bool isNameChar char = (lowMaskIsNameChar `testBit` char'low) || (highMaskIsNameChar `testBit` char'high) -- TODO 1) change code to use W# instead of Word64 -- 2) Document `ii - 64` -- there is underflow, but `testBit` can process this! where char'low = fromIntegral char char'high = fromIntegral (char - 64) highMaskIsNameChar :: Word64 highMaskIsNameChar = 0b11111111111111111111111111010000111111111111111111111111110 -- ------------+------------- | ------------+------------- -- | | | bits: -- | | +-- 65-90 -- | +------------------- 95 -- +---------------------------------- 97-122 lowMaskIsNameChar :: Word64 lowMaskIsNameChar = 0b11111111111011000000000000000000000000000000000000000000000 -- -----+----- || -- | || bits: -- | |+-- 45 -- | +--- 46 -- +---------- 48-58 {-# INLINE isNameChar #-} -- | Char for '\''. quoteChar :: Word8 quoteChar = 39 -- | Char for '"'. doubleQuoteChar :: Word8 doubleQuoteChar = 34 -- | Char for '='. equalChar :: Word8 equalChar = 61 -- | Char for '?'. questionChar :: Word8 questionChar = 63 -- | Char for '/'. slashChar :: Word8 slashChar = 47 -- | Exclaimation character !. bangChar :: Word8 bangChar = 33 -- | The dash character. commentChar :: Word8 commentChar = 45 -- '-' -- | Open tag character. openTagChar :: Word8 openTagChar = 60 -- '<' -- | Close tag character. closeTagChar :: Word8 closeTagChar = 62 -- '>' -- | Open angle bracket character. openAngleBracketChar :: Word8 openAngleBracketChar = 91 -- | Close angle bracket character. closeAngleBracketChar :: Word8 closeAngleBracketChar = 93 -- | Skip initial DOCTYPE declaration skipDoctype :: ByteString -> ByteString skipDoctype arg = if "" `S8.breakSubstring` bs in skipBlanks $ S8.drop 1 rest else bs where bs = skipBlanks arg skipBlanks = S8.dropWhile isSpace