module Text.XML.Expat.SAX (
Encoding(..),
XMLParseError(..),
XMLParseLocation(..),
ParseOptions(..),
SAXEvent(..),
textFromCString,
parse,
parseLocations,
parseLocationsThrowing,
parseThrowing,
defaultParseOptions,
XMLParseException(..),
setEntityDecoder,
GenericXMLString(..),
parseSAX,
parseSAXLocations,
parseSAXLocationsThrowing,
parseSAXThrowing,
ParserOptions,
defaultParserOptions
) where
import Text.XML.Expat.Internal.IO hiding (parse)
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Internal as I
import Data.IORef
import Data.ByteString.Internal (c2w, w2c, c_strlen)
import qualified Data.Monoid as M
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Codec.Binary.UTF8.String as U8
import Data.Typeable
import Control.Exception.Extensible as Exc
import Control.Applicative
import Control.DeepSeq
import Control.Monad
import System.IO.Unsafe
import Foreign.C.String
import Foreign.Ptr
data ParseOptions tag text = ParseOptions
{ overrideEncoding :: Maybe Encoding
, entityDecoder :: Maybe (tag -> Maybe text)
}
type ParserOptions tag text = ParseOptions tag text
defaultParseOptions :: ParseOptions tag text
defaultParseOptions = ParseOptions Nothing Nothing
defaultParserOptions :: ParseOptions tag text
defaultParserOptions = defaultParseOptions
class (M.Monoid s, Eq s) => GenericXMLString s where
gxNullString :: s -> Bool
gxToString :: s -> String
gxFromString :: String -> s
gxFromChar :: Char -> s
gxHead :: s -> Char
gxTail :: s -> s
gxBreakOn :: Char -> s -> (s, s)
gxFromCStringLen :: CStringLen -> IO s
gxToByteString :: s -> B.ByteString
instance GenericXMLString String where
gxNullString = null
gxToString = id
gxFromString = id
gxFromChar c = [c]
gxHead = head
gxTail = tail
gxBreakOn c = break (==c)
gxFromCStringLen cstr = U8.decodeString <$> peekCStringLen cstr
gxToByteString = B.pack . map c2w . U8.encodeString
instance GenericXMLString B.ByteString where
gxNullString = B.null
gxToString = U8.decodeString . map w2c . B.unpack
gxFromString = B.pack . map c2w . U8.encodeString
gxFromChar = B.singleton . c2w
gxHead = w2c . B.head
gxTail = B.tail
gxBreakOn c = B.break (== c2w c)
gxFromCStringLen = peekByteStringLen
gxToByteString = id
instance GenericXMLString T.Text where
gxNullString = T.null
gxToString = T.unpack
gxFromString = T.pack
gxFromChar = T.singleton
gxHead = T.head
gxTail = T.tail
#if MIN_VERSION_text(0,11,0)
gxBreakOn c = T.break (==c)
#elif MIN_VERSION_text(0,10,0)
gxBreakOn c t = (T.takeWhile (/=c) t, T.dropWhile (/=c) t)
#else
gxBreakOn c = T.breakBy (==c)
#endif
gxFromCStringLen cstr = TE.decodeUtf8 <$> peekByteStringLen cstr
gxToByteString = TE.encodeUtf8
peekByteStringLen :: CStringLen -> IO B.ByteString
peekByteStringLen (cstr, len) =
I.create (fromIntegral len) $ \ptr ->
I.memcpy ptr (castPtr cstr) (fromIntegral len)
data SAXEvent tag text =
XMLDeclaration text (Maybe text) (Maybe Bool) |
StartElement tag [(tag, text)] |
EndElement tag |
CharacterData text |
StartCData |
EndCData |
ProcessingInstruction text text |
Comment text |
FailDocument XMLParseError
deriving (Eq, Show)
instance (NFData tag, NFData text) => NFData (SAXEvent tag text) where
rnf (XMLDeclaration ver mEnc mSD) = rnf ver `seq` rnf mEnc `seq` rnf mSD
rnf (StartElement tag atts) = rnf tag `seq` rnf atts
rnf (EndElement tag) = rnf tag
rnf (CharacterData text) = rnf text
rnf StartCData = ()
rnf EndCData = ()
rnf (ProcessingInstruction target text) = rnf target `seq` rnf text
rnf (Comment text) = rnf text
rnf (FailDocument err) = rnf err
textFromCString :: GenericXMLString text => CString -> IO text
textFromCString cstr = do
len <- c_strlen cstr
gxFromCStringLen (cstr, fromIntegral len)
setEntityDecoder :: (GenericXMLString tag, GenericXMLString text)
=> Parser
-> (tag -> Maybe text)
-> (ParserPtr -> text -> IO ())
-> IO ()
setEntityDecoder parser decoder insertText = do
setUseForeignDTD parser True
setExternalEntityRefHandler parser eh
setSkippedEntityHandler parser skip
where
skip _ _ 1 = return False
skip pp entityName 0 = do
en <- textFromCString entityName
let mbt = decoder en
maybe (return False)
(\t -> do
insertText pp t
return True)
mbt
skip _ _ _ = undefined
eh pp ctx _ systemID publicID =
if systemID == nullPtr && publicID == nullPtr
then withCStringLen "" $ \c -> do
parseExternalEntityReference pp ctx Nothing c
else return False
parse :: (GenericXMLString tag, GenericXMLString text) =>
ParseOptions tag text
-> L.ByteString
-> [SAXEvent tag text]
parse opts input = unsafePerformIO $ do
let enc = overrideEncoding opts
let mEntityDecoder = entityDecoder opts
parser <- newParser enc
queueRef <- newIORef []
case mEntityDecoder of
Just deco -> setEntityDecoder parser deco $ \_ txt -> do
modifyIORef queueRef (CharacterData txt:)
Nothing -> return ()
setXMLDeclarationHandler parser $ \_ cVer cEnc cSd -> do
ver <- textFromCString cVer
mEnc <- if cEnc == nullPtr
then return Nothing
else Just <$> textFromCString cEnc
let sd = if cSd < 0
then Nothing
else Just $ if cSd /= 0 then True else False
modifyIORef queueRef (XMLDeclaration ver mEnc sd:)
return True
setStartElementHandler parser $ \_ cName cAttrs -> do
name <- textFromCString cName
attrs <- forM cAttrs $ \(cAttrName,cAttrValue) -> do
attrName <- textFromCString cAttrName
attrValue <- textFromCString cAttrValue
return (attrName, attrValue)
modifyIORef queueRef (StartElement name attrs:)
return True
setEndElementHandler parser $ \_ cName -> do
name <- textFromCString cName
modifyIORef queueRef (EndElement name:)
return True
setCharacterDataHandler parser $ \_ cText -> do
txt <- gxFromCStringLen cText
modifyIORef queueRef (CharacterData txt:)
return True
setStartCDataHandler parser $ \_ -> do
modifyIORef queueRef (StartCData :)
return True
setEndCDataHandler parser $ \_ -> do
modifyIORef queueRef (EndCData :)
return True
setProcessingInstructionHandler parser $ \_ cTarget cText -> do
target <- textFromCString cTarget
txt <- textFromCString cText
modifyIORef queueRef (ProcessingInstruction target txt :)
return True
setCommentHandler parser $ \_ cText -> do
txt <- textFromCString cText
modifyIORef queueRef (Comment txt :)
return True
let runParser inp = unsafeInterleaveIO $ do
rema <- withParser parser $ \pp -> case inp of
(c:cs) -> do
mError <- parseChunk pp c False
case mError of
Just err -> return [FailDocument err]
Nothing -> runParser cs
[] -> do
mError <- parseChunk pp B.empty True
case mError of
Just err -> return [FailDocument err]
Nothing -> return []
queue <- readIORef queueRef
writeIORef queueRef []
return $ reverse queue ++ rema
runParser $ L.toChunks input
parseSAX :: (GenericXMLString tag, GenericXMLString text) =>
Maybe Encoding
-> L.ByteString
-> [SAXEvent tag text]
parseSAX enc = parse (ParseOptions enc Nothing)
data XMLParseException = XMLParseException XMLParseError
deriving (Eq, Show, Typeable)
instance Exception XMLParseException where
parseLocations :: (GenericXMLString tag, GenericXMLString text) =>
ParseOptions tag text
-> L.ByteString
-> [(SAXEvent tag text, XMLParseLocation)]
parseLocations opts input = unsafePerformIO $ do
let enc = overrideEncoding opts
let mEntityDecoder = entityDecoder opts
parser <- newParser enc
queueRef <- newIORef []
case mEntityDecoder of
Just deco -> setEntityDecoder parser deco $ \pp txt -> do
loc <- getParseLocation pp
modifyIORef queueRef ((CharacterData txt, loc):)
Nothing -> return ()
setXMLDeclarationHandler parser $ \pp cVer cEnc cSd -> do
ver <- textFromCString cVer
mEnc <- if cEnc == nullPtr
then return Nothing
else Just <$> textFromCString cEnc
let sd = if cSd < 0
then Nothing
else Just $ if cSd /= 0 then True else False
loc <- getParseLocation pp
modifyIORef queueRef ((XMLDeclaration ver mEnc sd,loc):)
return True
setStartElementHandler parser $ \pp cName cAttrs -> do
name <- textFromCString cName
attrs <- forM cAttrs $ \(cAttrName,cAttrValue) -> do
attrName <- textFromCString cAttrName
attrValue <- textFromCString cAttrValue
return (attrName, attrValue)
loc <- getParseLocation pp
modifyIORef queueRef ((StartElement name attrs,loc):)
return True
setEndElementHandler parser $ \pp cName -> do
name <- textFromCString cName
loc <- getParseLocation pp
modifyIORef queueRef ((EndElement name, loc):)
return True
setCharacterDataHandler parser $ \pp cText -> do
txt <- gxFromCStringLen cText
loc <- getParseLocation pp
modifyIORef queueRef ((CharacterData txt, loc):)
return True
setStartCDataHandler parser $ \pp -> do
loc <- getParseLocation pp
modifyIORef queueRef ((StartCData, loc):)
return True
setEndCDataHandler parser $ \pp -> do
loc <- getParseLocation pp
modifyIORef queueRef ((EndCData, loc):)
return True
setProcessingInstructionHandler parser $ \pp cTarget cText -> do
target <- textFromCString cTarget
txt <- textFromCString cText
loc <- getParseLocation pp
modifyIORef queueRef ((ProcessingInstruction target txt, loc) :)
return True
setCommentHandler parser $ \pp cText -> do
txt <- textFromCString cText
loc <- getParseLocation pp
modifyIORef queueRef ((Comment txt, loc) :)
return True
let runParser inp = unsafeInterleaveIO $ do
rema <- withParser parser $ \pp -> case inp of
(c:cs) -> do
mError <- parseChunk pp c False
case mError of
Just err -> do
loc <- getParseLocation pp
return [(FailDocument err, loc)]
Nothing -> runParser cs
[] -> do
mError <- parseChunk pp B.empty True
case mError of
Just err -> do
loc <- getParseLocation pp
return [(FailDocument err, loc)]
Nothing -> return []
queue <- readIORef queueRef
writeIORef queueRef []
return $ reverse queue ++ rema
runParser $ L.toChunks input
parseSAXLocations :: (GenericXMLString tag, GenericXMLString text) =>
Maybe Encoding
-> L.ByteString
-> [(SAXEvent tag text, XMLParseLocation)]
parseSAXLocations enc = parseLocations (ParseOptions enc Nothing)
parseThrowing :: (GenericXMLString tag, GenericXMLString text) =>
ParseOptions tag text
-> L.ByteString
-> [SAXEvent tag text]
parseThrowing opts bs = map freakOut $ parse opts bs
where
freakOut (FailDocument err) = Exc.throw $ XMLParseException err
freakOut other = other
parseSAXThrowing :: (GenericXMLString tag, GenericXMLString text) =>
Maybe Encoding
-> L.ByteString
-> [SAXEvent tag text]
parseSAXThrowing mEnc = parseThrowing (ParseOptions mEnc Nothing)
parseLocationsThrowing :: (GenericXMLString tag, GenericXMLString text) =>
ParseOptions tag text
-> L.ByteString
-> [(SAXEvent tag text, XMLParseLocation)]
parseLocationsThrowing opts bs = map freakOut $ parseLocations opts bs
where
freakOut (FailDocument err, _) = Exc.throw $ XMLParseException err
freakOut other = other
parseSAXLocationsThrowing :: (GenericXMLString tag, GenericXMLString text) =>
Maybe Encoding
-> L.ByteString
-> [(SAXEvent tag text, XMLParseLocation)]
parseSAXLocationsThrowing mEnc =
parseLocationsThrowing (ParseOptions mEnc Nothing)