module Text.XML.Expat.SAX (
Encoding(..),
XMLParseError(..),
XMLParseLocation(..),
ParserOptions(..),
SAXEvent(..),
textFromCString,
parse,
parseLocations,
parseLocationsThrowing,
parseThrowing,
defaultParserOptions,
XMLParseException(..),
parseSAX,
parseSAXLocations,
parseSAXLocationsThrowing,
parseSAXThrowing,
GenericXMLString(..)
) where
import Text.XML.Expat.IO hiding (parse)
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Internal as I
import Data.IORef
import Data.ByteString.Internal (c2w, w2c, c_strlen)
import qualified Data.Monoid as M
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Codec.Binary.UTF8.String as U8
import Data.Typeable
import Control.Exception.Extensible as Exc
import Control.Applicative
import Control.Parallel.Strategies
import Control.Monad
import System.IO.Unsafe
import Foreign.C.String
import Foreign.Ptr
data ParserOptions tag text = ParserOptions
{ parserEncoding :: Maybe Encoding
, entityDecoder :: Maybe (tag -> Maybe text)
}
defaultParserOptions :: ParserOptions tag text
defaultParserOptions = ParserOptions Nothing Nothing
class (M.Monoid s, Eq s) => GenericXMLString s where
gxNullString :: s -> Bool
gxToString :: s -> String
gxFromString :: String -> s
gxFromChar :: Char -> s
gxHead :: s -> Char
gxTail :: s -> s
gxBreakOn :: Char -> s -> (s, s)
gxFromCStringLen :: CStringLen -> IO s
gxToByteString :: s -> B.ByteString
instance GenericXMLString String where
gxNullString = null
gxToString = id
gxFromString = id
gxFromChar c = [c]
gxHead = head
gxTail = tail
gxBreakOn c = break (==c)
gxFromCStringLen cstr = U8.decodeString <$> peekCStringLen cstr
gxToByteString = B.pack . map c2w . U8.encodeString
instance GenericXMLString B.ByteString where
gxNullString = B.null
gxToString = U8.decodeString . map w2c . B.unpack
gxFromString = B.pack . map c2w . U8.encodeString
gxFromChar = B.singleton . c2w
gxHead = w2c . B.head
gxTail = B.tail
gxBreakOn c = B.break (== c2w c)
gxFromCStringLen = peekByteStringLen
gxToByteString = id
instance GenericXMLString T.Text where
gxNullString = T.null
gxToString = T.unpack
gxFromString = T.pack
gxFromChar = T.singleton
gxHead = T.head
gxTail = T.tail
gxBreakOn c = T.breakBy (==c)
gxFromCStringLen cstr = TE.decodeUtf8 <$> peekByteStringLen cstr
gxToByteString = TE.encodeUtf8
peekByteStringLen :: CStringLen -> IO B.ByteString
peekByteStringLen (cstr, len) =
I.create (fromIntegral len) $ \ptr ->
I.memcpy ptr (castPtr cstr) (fromIntegral len)
data SAXEvent tag text =
StartElement tag [(tag, text)] |
EndElement tag |
CharacterData text |
FailDocument XMLParseError
deriving (Eq, Show)
instance (NFData tag, NFData text) => NFData (SAXEvent tag text) where
rnf (StartElement tag atts) = rnf (tag, atts)
rnf (EndElement tag) = rnf tag
rnf (CharacterData text) = rnf text
rnf (FailDocument err) = rnf err
textFromCString :: GenericXMLString text => CString -> IO text
textFromCString cstr = do
len <- c_strlen cstr
gxFromCStringLen (cstr, fromIntegral len)
setEntityDecoder :: (GenericXMLString tag, GenericXMLString text)
=> Parser
-> IORef [SAXEvent tag text]
-> (tag -> Maybe text)
-> IO ()
setEntityDecoder parser queueRef decoder = do
setUseForeignDTD parser True
setExternalEntityRefHandler parser eh
setSkippedEntityHandler parser skip
where
skip _ _ 1 = return False
skip _ entityName 0 = do
en <- textFromCString entityName
let mbt = decoder en
maybe (return False)
(\t -> do
modifyIORef queueRef (CharacterData t:)
return True)
mbt
skip _ _ _ = undefined
eh p ctx _ systemID publicID =
if systemID == nullPtr && publicID == nullPtr
then withCStringLen "" $ \c -> do
parseExternalEntityReference p ctx Nothing c
else return False
setEntityDecoderLoc :: (GenericXMLString tag, GenericXMLString text)
=> Parser
-> IORef [(SAXEvent tag text, XMLParseLocation)]
-> (tag -> Maybe text)
-> IO ()
setEntityDecoderLoc parser queueRef decoder = do
setUseForeignDTD parser True
setExternalEntityRefHandler parser eh
setSkippedEntityHandler parser skip
where
skip _ _ 1 = return False
skip pp entityName 0 = do
en <- textFromCString entityName
let mbt = decoder en
maybe (return False)
(\t -> do
loc <- getParseLocation pp
modifyIORef queueRef ((CharacterData t,loc):)
return True)
mbt
skip _ _ _ = undefined
eh p ctx _ systemID publicID =
if systemID == nullPtr && publicID == nullPtr
then withCStringLen "" $ \c -> do
parseExternalEntityReference p ctx Nothing c
else return False
parse :: (GenericXMLString tag, GenericXMLString text) =>
ParserOptions tag text
-> L.ByteString
-> [SAXEvent tag text]
parse opts input = unsafePerformIO $ do
let enc = parserEncoding opts
let mEntityDecoder = entityDecoder opts
parser <- newParser enc
queueRef <- newIORef []
maybe (return ())
(setEntityDecoder parser queueRef)
mEntityDecoder
setStartElementHandler parser $ \_ cName cAttrs -> do
name <- textFromCString cName
attrs <- forM cAttrs $ \(cAttrName,cAttrValue) -> do
attrName <- textFromCString cAttrName
attrValue <- textFromCString cAttrValue
return (attrName, attrValue)
modifyIORef queueRef (StartElement name attrs:)
return True
setEndElementHandler parser $ \_ cName -> do
name <- textFromCString cName
modifyIORef queueRef (EndElement name:)
return True
setCharacterDataHandler parser $ \_ cText -> do
txt <- gxFromCStringLen cText
modifyIORef queueRef (CharacterData txt:)
return True
let runParser inp = unsafeInterleaveIO $ do
rema <- withParser parser $ \pp -> case inp of
(c:cs) -> do
mError <- parseChunk pp c False
case mError of
Just err -> return [FailDocument err]
Nothing -> runParser cs
[] -> do
mError <- parseChunk pp B.empty True
case mError of
Just err -> return [FailDocument err]
Nothing -> return []
queue <- readIORef queueRef
writeIORef queueRef []
return $ reverse queue ++ rema
runParser $ L.toChunks input
parseSAX :: (GenericXMLString tag, GenericXMLString text) =>
Maybe Encoding
-> L.ByteString
-> [SAXEvent tag text]
parseSAX enc = parse (ParserOptions enc Nothing)
data XMLParseException = XMLParseException XMLParseError
deriving (Eq, Show, Typeable)
instance Exception XMLParseException where
parseLocations :: (GenericXMLString tag, GenericXMLString text) =>
ParserOptions tag text
-> L.ByteString
-> [(SAXEvent tag text, XMLParseLocation)]
parseLocations opts input = unsafePerformIO $ do
let enc = parserEncoding opts
let mEntityDecoder = entityDecoder opts
parser <- newParser enc
queueRef <- newIORef []
maybe (return ())
(setEntityDecoderLoc parser queueRef)
mEntityDecoder
setStartElementHandler parser $ \pp cName cAttrs -> do
name <- textFromCString cName
attrs <- forM cAttrs $ \(cAttrName,cAttrValue) -> do
attrName <- textFromCString cAttrName
attrValue <- textFromCString cAttrValue
return (attrName, attrValue)
loc <- getParseLocation pp
modifyIORef queueRef ((StartElement name attrs,loc):)
return True
setEndElementHandler parser $ \pp cName -> do
name <- textFromCString cName
loc <- getParseLocation pp
modifyIORef queueRef ((EndElement name, loc):)
return True
setCharacterDataHandler parser $ \pp cText -> do
txt <- gxFromCStringLen cText
loc <- getParseLocation pp
modifyIORef queueRef ((CharacterData txt, loc):)
return True
let runParser [] = return []
runParser (c:cs) = unsafeInterleaveIO $ do
queue <- readIORef queueRef
writeIORef queueRef []
rema <- withParser parser $ \pp -> do
mError <- parseChunk pp c (null cs)
case mError of
Just err -> do
loc <- getParseLocation pp
return [(FailDocument err, loc)]
Nothing -> runParser cs
return $ reverse queue ++ rema
runParser $ L.toChunks input
parseSAXLocations :: (GenericXMLString tag, GenericXMLString text) =>
Maybe Encoding
-> L.ByteString
-> [(SAXEvent tag text, XMLParseLocation)]
parseSAXLocations enc = parseLocations (ParserOptions enc Nothing)
parseThrowing :: (GenericXMLString tag, GenericXMLString text) =>
ParserOptions tag text
-> L.ByteString
-> [SAXEvent tag text]
parseThrowing opts bs = map freakOut $ parse opts bs
where
freakOut (FailDocument err) = Exc.throw $ XMLParseException err
freakOut other = other
parseSAXThrowing :: (GenericXMLString tag, GenericXMLString text) =>
Maybe Encoding
-> L.ByteString
-> [SAXEvent tag text]
parseSAXThrowing mEnc = parseThrowing (ParserOptions mEnc Nothing)
parseLocationsThrowing :: (GenericXMLString tag, GenericXMLString text) =>
ParserOptions tag text
-> L.ByteString
-> [(SAXEvent tag text, XMLParseLocation)]
parseLocationsThrowing opts bs = map freakOut $ parseLocations opts bs
where
freakOut (FailDocument err, _) = Exc.throw $ XMLParseException err
freakOut other = other
parseSAXLocationsThrowing :: (GenericXMLString tag, GenericXMLString text) =>
Maybe Encoding
-> L.ByteString
-> [(SAXEvent tag text, XMLParseLocation)]
parseSAXLocationsThrowing mEnc =
parseLocationsThrowing (ParserOptions mEnc Nothing)