{-# LANGUAGE OverloadedStrings #-}
module Text.HTML.DOM
( eventConduit
, sinkDoc
, readFile
, parseLBS
, parseBSChunks
, eventConduitText
, sinkDocText
, parseLT
, parseSTChunks
) where
import Control.Monad.Trans.Resource
import Prelude hiding (readFile)
import qualified Data.ByteString as S
import qualified Text.HTML.TagStream as TS
import qualified Data.XML.Types as XT
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Conduit.List as CL
import Control.Arrow ((***))
import qualified Data.Set as Set
import qualified Text.XML as X
import Conduit
import qualified Data.ByteString.Lazy as L
import Data.Maybe (mapMaybe)
import qualified Data.Map.Strict as Map
eventConduit :: Monad m => ConduitT S.ByteString XT.Event m ()
eventConduit = decodeUtf8LenientC .| eventConduit'
eventConduitText :: Monad m => ConduitT T.Text XT.Event m ()
eventConduitText = eventConduit'
eventConduit' :: Monad m => ConduitT T.Text XT.Event m ()
eventConduit' =
TS.tokenStream .| go []
where
go stack = do
mx <- await
case mx of
Nothing -> closeStack stack
Just (TS.TagOpen local _ _) | "?" `T.isPrefixOf` local -> go stack
Just (TS.TagOpen local attrs isClosed) -> do
let name = toName local
attrs' = map (toName *** return . XT.ContentText) $ Map.toList attrs
yield $ XT.EventBeginElement name attrs'
if isClosed || isVoid local
then yield (XT.EventEndElement name) >> go stack
else go $ name : stack
Just (TS.TagClose name)
| toName name `elem` stack ->
let loop [] = go []
loop (n:ns) = do
yield $ XT.EventEndElement n
if n == toName name
then go ns
else loop ns
in loop stack
| otherwise -> go stack
Just (TS.Text t) -> do
yield $ XT.EventContent $ XT.ContentText t
go stack
Just (TS.Comment t) -> do
yield $ XT.EventComment t
go stack
Just TS.Special{} -> go stack
Just TS.Incomplete{} -> go stack
toName l = XT.Name l Nothing Nothing
closeStack = mapM_ (yield . XT.EventEndElement)
isVoid = flip Set.member $ Set.fromList
[ "area"
, "base"
, "br"
, "col"
, "command"
, "embed"
, "hr"
, "img"
, "input"
, "keygen"
, "link"
, "meta"
, "param"
, "source"
, "track"
, "wbr"
]
sinkDoc :: MonadThrow m => ConduitT S.ByteString o m X.Document
sinkDoc = sinkDoc' eventConduit
sinkDocText :: MonadThrow m => ConduitT T.Text o m X.Document
sinkDocText = sinkDoc' eventConduitText
sinkDoc'
:: MonadThrow m
=> ConduitT a XT.Event m ()
-> ConduitT a o m X.Document
sinkDoc' f =
fmap stripDummy $ mapOutput ((,) Nothing) f .| addDummyWrapper .| X.fromEvents
where
addDummyWrapper = do
yield (Nothing, XT.EventBeginElement "html" [])
awaitForever yield
yield (Nothing, XT.EventEndElement "html")
stripDummy doc@(X.Document pro (X.Element _ _ nodes) epi) =
case mapMaybe toElement nodes of
[root] -> X.Document pro root epi
_ -> doc
toElement (X.NodeElement e) = Just e
toElement _ = Nothing
readFile :: FilePath -> IO X.Document
readFile fp = withSourceFile fp $ \src -> runConduit $ src .| sinkDoc
parseLBS :: L.ByteString -> X.Document
parseLBS = parseBSChunks . L.toChunks
parseBSChunks :: [S.ByteString] -> X.Document
parseBSChunks tss =
case runConduit $ CL.sourceList tss .| sinkDoc of
Left e -> error $ "Unexpected exception in parseBSChunks: " ++ show e
Right x -> x
parseLT :: TL.Text -> X.Document
parseLT = parseSTChunks . TL.toChunks
parseSTChunks :: [T.Text] -> X.Document
parseSTChunks tss =
case runConduit $ CL.sourceList tss .| sinkDocText of
Left e -> error $ "Unexpected exception in parseSTChunks: " ++ show e
Right x -> x