{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections     #-}

module Text.XmlHtml.HTML.Parse where

import           Control.Applicative
import           Control.Monad
import           Data.Char
import           Data.List
import           Data.Maybe
import           Text.XmlHtml.Common
import           Text.XmlHtml.HTML.Meta
import           Text.XmlHtml.TextParser
import qualified Text.XmlHtml.XML.Parse as XML

import qualified Text.Parsec as P

import qualified Data.HashSet as S
import qualified Data.HashMap.Strict as M
import qualified Data.Map as Map

import           Data.Text (Text)
import qualified Data.Text as T


------------------------------------------------------------------------------
-- | HTML version of document fragment parsing rule  It differs only in that
-- it parses the HTML version of 'content' and returns an 'HtmlDocument'.
docFragment :: Encoding -> Parser Document
docFragment :: Encoding -> Parser Document
docFragment Encoding
e = do
    (Maybe DocType
dt, [Node]
nodes1)      <- Parser (Maybe DocType, [Node])
prolog
    ([Node]
nodes2, ElemResult
Matched) <- Maybe Text -> Parser ([Node], ElemResult)
content forall a. Maybe a
Nothing
    forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ Encoding -> Maybe DocType -> [Node] -> Document
HtmlDocument Encoding
e Maybe DocType
dt ([Node]
nodes1 forall a. [a] -> [a] -> [a]
++ [Node]
nodes2)


------------------------------------------------------------------------------
-- Parsing code                                                             --
------------------------------------------------------------------------------

{-
    The following are the differences between this code and the straight XML
    parsing code.

    1. HTML void tags (area, base, etc.) are always treated as empty tags,
       regardless of whether they have the empty-tag slash.

    2. HTML raw text tags (script and style) are parsed as straight text
       with neither markup nor references, except that they end at the first
       syntactically valid matching end tag.

    3. End tags need only match their corresponding start tags in a case
       insensitive comparison.  In case they are different, the start tag is
       used for the element tag name.

    4. Hexadecimal char references may use &#X...; (capital X)  -- DONE

    5. Attribute names are allowed to consist of any text except for control
       characters, space, '\"', '\'', '>', '/', or '='.

    6. Empty attribute syntax is allowed (an attribute not followed by an eq).
       In this case, the attribute value is considered to be the empty string.

    7. Quoted attribute syntax is relaxed to allow any character except for
       the matching quote.  References are allowed.

    8. Attribute values may be unquoted.  In this case, the attribute value
       may not contain space, single or double quotes, '=', '<', '>', or '`',
       and may not be the empty string.  It can still contain references.

    9. There are many more character references available.

    10. Only "ambiguous" ampersands are prohibited in character data.  This
        means ampersands that parse like character or entity references.

    11. Omittable end tags are inserted automatically.

    12. DOCTYPE tags matched with case insensitive keywords.
-}


------------------------------------------------------------------------------
prolog :: Parser (Maybe DocType, [Node])
prolog :: Parser (Maybe DocType, [Node])
prolog = do
    Maybe (Maybe Text)
_      <- forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional Parser (Maybe Text)
XML.xmlDecl
    [Maybe Node]
nodes1 <- forall (f :: * -> *) a. Alternative f => f a -> f [a]
many Parser (Maybe Node)
XML.misc
    Maybe (DocType, [Maybe Node])
rest   <- forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional forall a b. (a -> b) -> a -> b
$ do
        DocType
dt     <- Parser DocType
docTypeDecl
        [Maybe Node]
nodes2 <- forall (f :: * -> *) a. Alternative f => f a -> f [a]
many Parser (Maybe Node)
XML.misc
        forall (m :: * -> *) a. Monad m => a -> m a
return (DocType
dt, [Maybe Node]
nodes2)
    case Maybe (DocType, [Maybe Node])
rest of
        Maybe (DocType, [Maybe Node])
Nothing           -> forall (m :: * -> *) a. Monad m => a -> m a
return (forall a. Maybe a
Nothing, forall a. [Maybe a] -> [a]
catMaybes [Maybe Node]
nodes1)
        Just (DocType
dt, [Maybe Node]
nodes2) -> forall (m :: * -> *) a. Monad m => a -> m a
return (forall a. a -> Maybe a
Just DocType
dt, forall a. [Maybe a] -> [a]
catMaybes ([Maybe Node]
nodes1 forall a. [a] -> [a] -> [a]
++ [Maybe Node]
nodes2))


------------------------------------------------------------------------------
-- | Internal subset is parsed, but ignored since we don't have data types to
-- store it.
docTypeDecl :: Parser DocType
docTypeDecl :: Parser DocType
docTypeDecl = do
    forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
P.try forall a b. (a -> b) -> a -> b
$ do
        Text
_      <- Text -> Parser Text
text Text
"<!"
        Text
decl   <- Parser Text
XML.name
        forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Text -> Text
T.toLower Text
decl forall a. Eq a => a -> a -> Bool
/= Text
"doctype") forall a b. (a -> b) -> a -> b
$ forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
"Expected DOCTYPE"
    ParsecT Text () Identity ()
XML.whiteSpace
    Text
tag    <- Parser Text
XML.name
    Maybe ()
_      <- forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional ParsecT Text () Identity ()
XML.whiteSpace
    ExternalID
extid  <- Parser ExternalID
externalID
    Maybe ()
_      <- forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional ParsecT Text () Identity ()
XML.whiteSpace
    InternalSubset
intsub <- Parser InternalSubset
XML.internalDoctype
    Char
_      <- forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'>'
    forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> ExternalID -> InternalSubset -> DocType
DocType Text
tag ExternalID
extid InternalSubset
intsub)


------------------------------------------------------------------------------
externalID :: Parser ExternalID
externalID :: Parser ExternalID
externalID = do
    Maybe Text
tok  <- forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional forall a b. (a -> b) -> a -> b
$ Text -> Text
T.toLower forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Text
XML.name
    case Maybe Text
tok of
        Just Text
"system" -> Parser ExternalID
systemID
        Just Text
"public" -> Parser ExternalID
publicID
        Just Text
_        -> forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
"Expected SYSTEM or PUBLIC"
        Maybe Text
Nothing       -> forall (m :: * -> *) a. Monad m => a -> m a
return ExternalID
NoExternalID
  where
    systemID :: Parser ExternalID
systemID = do
        ParsecT Text () Identity ()
XML.whiteSpace
        Text -> ExternalID
System forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Text
XML.systemLiteral
    publicID :: Parser ExternalID
publicID = do
        ParsecT Text () Identity ()
XML.whiteSpace
        Text
pid <- Parser Text
XML.pubIdLiteral
        ParsecT Text () Identity ()
XML.whiteSpace
        Text
sid <- Parser Text
XML.systemLiteral
        forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> Text -> ExternalID
Public Text
pid Text
sid)


------------------------------------------------------------------------------
-- | When parsing an element, three things can happen (besides failure):
--
-- (1) The end tag matches the start tag.  This is a Matched.
--
-- (2) The end tag does not match, but the element has an end tag that can be
-- omitted when there is no more content in its parent.  This is an
-- ImplicitLast.  In this case, we need to remember the tag name of the
-- end tag that we did find, so as to match it later.
--
-- (3) A start tag is found such that it implicitly ends the current element.
-- This is an ImplicitNext.  In this case, we parse and remember the
-- entire element that comes next, so that it can be inserted after the
-- element being parsed.
data ElemResult = Matched
                | ImplicitLast Text
                | ImplicitNext Text Text [(Text, Text)] Bool


------------------------------------------------------------------------------
finishElement :: Text -> Text -> [(Text, Text)] -> Bool
              -> Parser (Node, ElemResult)
finishElement :: Text -> Text -> [(Text, Text)] -> Bool -> Parser (Node, ElemResult)
finishElement Text
t Text
tbase [(Text, Text)]
a Bool
b = do
    if Bool
b then forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [(Text, Text)] -> [Node] -> Node
Element Text
t [(Text, Text)]
a [], ElemResult
Matched)
         else Parser (Node, ElemResult)
nonEmptyElem
  where
    nonEmptyElem :: Parser (Node, ElemResult)
nonEmptyElem
        | Text -> [(Text, Text)] -> Bool
isRawText Text
tbase [(Text, Text)]
a = do
            Node
c <- forall a. String -> Parser a -> Parser Node
XML.cdata  String
"<"  forall a b. (a -> b) -> a -> b
$ forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
P.try (Text -> Parser ElemResult
endTag Text
t)
            forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [(Text, Text)] -> [Node] -> Node
Element Text
t [(Text, Text)]
a [Node
c], ElemResult
Matched)
        | Text
tbase forall a. (Eq a, Hashable a) => a -> HashSet a -> Bool
`S.member` HashSet Text
endOmittableLast = (Parser ElemResult -> ParsecT Text () Identity (Maybe ElemResult))
-> Parser (Node, ElemResult)
tagContents forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional
        | Bool
otherwise = (Parser ElemResult -> ParsecT Text () Identity (Maybe ElemResult))
-> Parser (Node, ElemResult)
tagContents (forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap forall a. a -> Maybe a
Just)
    tagContents :: (Parser ElemResult -> ParsecT Text () Identity (Maybe ElemResult))
-> Parser (Node, ElemResult)
tagContents Parser ElemResult -> ParsecT Text () Identity (Maybe ElemResult)
modifier = do
        ([Node]
c,ElemResult
r1) <- Maybe Text -> Parser ([Node], ElemResult)
content (forall a. a -> Maybe a
Just Text
tbase)
        case ElemResult
r1 of
            ElemResult
Matched -> do
                Maybe ElemResult
r2 <- Parser ElemResult -> ParsecT Text () Identity (Maybe ElemResult)
modifier (Text -> Parser ElemResult
endTag Text
t)
                case Maybe ElemResult
r2 of
                    Maybe ElemResult
Nothing -> forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [(Text, Text)] -> [Node] -> Node
Element Text
t [(Text, Text)]
a [Node]
c, ElemResult
Matched)
                    Just ElemResult
rr -> forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [(Text, Text)] -> [Node] -> Node
Element Text
t [(Text, Text)]
a [Node]
c, ElemResult
rr)
            ImplicitLast Text
tag | Text -> Text
T.toCaseFold Text
tag forall a. Eq a => a -> a -> Bool
== Text -> Text
T.toCaseFold Text
t -> do
                forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [(Text, Text)] -> [Node] -> Node
Element Text
t [(Text, Text)]
a [Node]
c, ElemResult
Matched)
            ElemResult
end -> do
                forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> [(Text, Text)] -> [Node] -> Node
Element Text
t [(Text, Text)]
a [Node]
c, ElemResult
end)


------------------------------------------------------------------------------
emptyOrStartTag :: Parser (Text, Text, [(Text, Text)], Bool)
emptyOrStartTag :: Parser (Text, Text, [(Text, Text)], Bool)
emptyOrStartTag = do
    Text
t <- forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
P.try forall a b. (a -> b) -> a -> b
$ forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'<' forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser Text
XML.name
    let tbase :: Text
tbase = Text -> Text
T.toLower forall a b. (a -> b) -> a -> b
$ forall a b. (a, b) -> b
snd forall a b. (a -> b) -> a -> b
$ Text -> Text -> (Text, Text)
T.breakOnEnd Text
":" Text
t
    [(Text, Text)]
a <- forall (f :: * -> *) a. Alternative f => f a -> f [a]
many forall a b. (a -> b) -> a -> b
$ forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
P.try forall a b. (a -> b) -> a -> b
$ do
        ParsecT Text () Identity ()
XML.whiteSpace
        ParsecT Text () Identity (Text, Text)
attribute
    forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (forall {a} {b}. Eq a => [(a, b)] -> Bool
hasDups [(Text, Text)]
a) forall a b. (a -> b) -> a -> b
$ forall (m :: * -> *) a. MonadFail m => String -> m a
fail String
"Duplicate attribute names in element"
    Maybe ()
_ <- forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional ParsecT Text () Identity ()
XML.whiteSpace
    Bool
e <- forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap forall a. Maybe a -> Bool
isJust forall a b. (a -> b) -> a -> b
$ forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional (forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'/')
    let e' :: Bool
e' = Bool
e Bool -> Bool -> Bool
|| (Text
tbase forall a. (Eq a, Hashable a) => a -> HashSet a -> Bool
`S.member` HashSet Text
voidTags)
    Char
_ <- forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'>'
    forall (m :: * -> *) a. Monad m => a -> m a
return (Text
t, Text
tbase, [(Text, Text)]
a, Bool
e')
  where
    hasDups :: [(a, b)] -> Bool
hasDups [(a, b)]
a = forall (t :: * -> *) a. Foldable t => t a -> Int
length (forall a. Eq a => [a] -> [a]
nub (forall a b. (a -> b) -> [a] -> [b]
map forall a b. (a, b) -> a
fst [(a, b)]
a)) forall a. Ord a => a -> a -> Bool
< forall (t :: * -> *) a. Foldable t => t a -> Int
length [(a, b)]
a


------------------------------------------------------------------------------
attrName :: Parser Text
attrName :: Parser Text
attrName = (Char -> Bool) -> Parser Text
takeWhile1 Char -> Bool
isAttrName
  where isAttrName :: Char -> Bool
isAttrName Char
c | Char
c forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Char
'\0',Char
' ',Char
'"',Char
'\'',Char
'>',Char
'/',Char
'='] = Bool
False
                     | Char -> Bool
isControlChar Char
c       = Bool
False
                     | Bool
otherwise             = Bool
True


------------------------------------------------------------------------------
-- | From 8.2.2.3 of the HTML 5 spec, omitting the very high control
-- characters because they are unlikely to occur and I got tired of typing.
isControlChar :: Char -> Bool
isControlChar :: Char -> Bool
isControlChar Char
c | Char
c forall a. Ord a => a -> a -> Bool
>= Char
'\x007F' Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'\x009F' = Bool
True
                | Char
c forall a. Ord a => a -> a -> Bool
>= Char
'\xFDD0' Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'\xFDEF' = Bool
True
                | Bool
otherwise                      = Bool
False


------------------------------------------------------------------------------
quotedAttrValue :: Parser Text
quotedAttrValue :: Parser Text
quotedAttrValue = Parser Text
singleQuoted forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text
doubleQuoted
  where
    singleQuoted :: Parser Text
singleQuoted = forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'\'' forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> forall {t :: * -> *}. Foldable t => t Char -> Parser Text
refTill [Char
'&',Char
'\''] forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'\''
    doubleQuoted :: Parser Text
doubleQuoted = forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'"'  forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> forall {t :: * -> *}. Foldable t => t Char -> Parser Text
refTill [Char
'&',Char
'"']  forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'"'
    refTill :: t Char -> Parser Text
refTill t Char
end = [Text] -> Text
T.concat forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall (f :: * -> *) a. Alternative f => f a -> f [a]
many
        ((Char -> Bool) -> Parser Text
takeWhile1 (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. (forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` t Char
end)) forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text
reference)


------------------------------------------------------------------------------
unquotedAttrValue :: Parser Text
unquotedAttrValue :: Parser Text
unquotedAttrValue = forall {t :: * -> *}. Foldable t => t Char -> Parser Text
refTill [Char
' ',Char
'"',Char
'\'',Char
'=',Char
'<',Char
'>',Char
'&',Char
'`']
  where
    refTill :: t Char -> Parser Text
refTill t Char
end = [Text] -> Text
T.concat forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall (f :: * -> *) a. Alternative f => f a -> f [a]
some
        ((Char -> Bool) -> Parser Text
takeWhile1 (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. (forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` t Char
end)) forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text
reference)


------------------------------------------------------------------------------
attrValue :: Parser Text
attrValue :: Parser Text
attrValue = Parser Text
quotedAttrValue forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Parser Text
unquotedAttrValue


------------------------------------------------------------------------------
attribute :: Parser (Text, Text)
attribute :: ParsecT Text () Identity (Text, Text)
attribute = do
    Text
n <- Parser Text
attrName
    Maybe Text
v <- forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional forall a b. (a -> b) -> a -> b
$ do
        Char
_ <- forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
P.try forall a b. (a -> b) -> a -> b
$ do
            Maybe ()
_ <- forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional ParsecT Text () Identity ()
XML.whiteSpace
            forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'='
        Maybe ()
_ <- forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional ParsecT Text () Identity ()
XML.whiteSpace
        Parser Text
attrValue
    forall (m :: * -> *) a. Monad m => a -> m a
return forall a b. (a -> b) -> a -> b
$ forall b a. b -> (a -> b) -> Maybe a -> b
maybe (Text
n,Text
"") (Text
n,) Maybe Text
v


------------------------------------------------------------------------------
endTag :: Text -> Parser ElemResult
endTag :: Text -> Parser ElemResult
endTag Text
s = do
    Text
_ <- Text -> Parser Text
text Text
"</"
    Text
t <- Parser Text
XML.name
    let sbase :: Text
sbase = Text -> Text
T.toLower forall a b. (a -> b) -> a -> b
$ forall a b. (a, b) -> b
snd forall a b. (a -> b) -> a -> b
$ Text -> Text -> (Text, Text)
T.breakOnEnd Text
":" Text
s
    ElemResult
r <- if (Text -> Text
T.toCaseFold Text
s forall a. Eq a => a -> a -> Bool
== Text -> Text
T.toCaseFold Text
t)
            then forall (m :: * -> *) a. Monad m => a -> m a
return ElemResult
Matched
            else if Text
sbase forall a. (Eq a, Hashable a) => a -> HashSet a -> Bool
`S.member` HashSet Text
endOmittableLast
                then forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> ElemResult
ImplicitLast Text
t)
                else forall (m :: * -> *) a. MonadFail m => String -> m a
fail forall a b. (a -> b) -> a -> b
$ String
"mismatched tags: </" forall a. [a] -> [a] -> [a]
++ Text -> String
T.unpack Text
t forall a. [a] -> [a] -> [a]
++
                            String
"> found inside <" forall a. [a] -> [a] -> [a]
++ Text -> String
T.unpack Text
s forall a. [a] -> [a] -> [a]
++ String
"> tag"
    Maybe ()
_ <- forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional ParsecT Text () Identity ()
XML.whiteSpace
    Text
_ <- Text -> Parser Text
text Text
">"
    forall (m :: * -> *) a. Monad m => a -> m a
return ElemResult
r


------------------------------------------------------------------------------
content :: Maybe Text -> Parser ([Node], ElemResult)
content :: Maybe Text -> Parser ([Node], ElemResult)
content Maybe Text
parent = do
    ([Maybe Node]
ns, ElemResult
end) <- ParsecT Text () Identity ([Maybe Node], ElemResult)
readText
    forall (m :: * -> *) a. Monad m => a -> m a
return ([Node] -> [Node]
coalesceText (forall a. [Maybe a] -> [a]
catMaybes [Maybe Node]
ns), ElemResult
end)
  where
    readText :: ParsecT Text () Identity ([Maybe Node], ElemResult)
readText     = do
        Maybe Node
s <- forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional Parser Node
XML.charData
        Maybe ([Maybe Node], ElemResult)
t <- forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional ParsecT Text () Identity ([Maybe Node], ElemResult)
whileMatched
        case Maybe ([Maybe Node], ElemResult)
t of
            Maybe ([Maybe Node], ElemResult)
Nothing      -> forall (m :: * -> *) a. Monad m => a -> m a
return ([Maybe Node
s], ElemResult
Matched)
            Just ([Maybe Node]
tt, ElemResult
m) -> forall (m :: * -> *) a. Monad m => a -> m a
return (Maybe Node
sforall a. a -> [a] -> [a]
:[Maybe Node]
tt, ElemResult
m)

    whileMatched :: ParsecT Text () Identity ([Maybe Node], ElemResult)
whileMatched = do
        ([Maybe Node]
n,ElemResult
end) <- (,ElemResult
Matched) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (forall a. a -> [a] -> [a]
:[]) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall a. a -> Maybe a
Just forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Text -> Node
TextNode forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Text
reference
               forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (,ElemResult
Matched) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (forall a. a -> [a] -> [a]
:[]) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser (Maybe Node)
XML.cdSect
               forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (,ElemResult
Matched) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (forall a. a -> [a] -> [a]
:[]) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser (Maybe Node)
XML.processingInstruction
               forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (,ElemResult
Matched) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (forall a. a -> [a] -> [a]
:[]) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser (Maybe Node)
XML.comment
               forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> ParsecT Text () Identity ([Maybe Node], ElemResult)
doElement
        case ElemResult
end of
            ElemResult
Matched -> do
                ([Maybe Node]
ns, ElemResult
end') <- ParsecT Text () Identity ([Maybe Node], ElemResult)
readText
                forall (m :: * -> *) a. Monad m => a -> m a
return ([Maybe Node]
n forall a. [a] -> [a] -> [a]
++ [Maybe Node]
ns, ElemResult
end')
            ElemResult
_ -> do
                forall (m :: * -> *) a. Monad m => a -> m a
return ([Maybe Node]
n, ElemResult
end)

    doElement :: ParsecT Text () Identity ([Maybe Node], ElemResult)
doElement = do
        (Text
t,Text
tb, [(Text, Text)]
a,Bool
b) <- Parser (Text, Text, [(Text, Text)], Bool)
emptyOrStartTag
        Text
-> Text
-> [(Text, Text)]
-> Bool
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
handle Text
t Text
tb [(Text, Text)]
a Bool
b

    handle :: Text
-> Text
-> [(Text, Text)]
-> Bool
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
handle Text
t Text
tb [(Text, Text)]
a Bool
b = do
        if Text -> Maybe Text -> Bool
breaksTag Text
tb Maybe Text
parent
            then forall (m :: * -> *) a. Monad m => a -> m a
return ([forall a. Maybe a
Nothing], Text -> Text -> [(Text, Text)] -> Bool -> ElemResult
ImplicitNext Text
t Text
tb [(Text, Text)]
a Bool
b)
            else do
                (Node
n,ElemResult
end) <- Text -> Text -> [(Text, Text)] -> Bool -> Parser (Node, ElemResult)
finishElement Text
t Text
tb [(Text, Text)]
a Bool
b
                case ElemResult
end of
                    ImplicitNext Text
t' Text
tb' [(Text, Text)]
a' Bool
b' -> do
                        ([Maybe Node]
ns,ElemResult
end') <- Text
-> Text
-> [(Text, Text)]
-> Bool
-> ParsecT Text () Identity ([Maybe Node], ElemResult)
handle Text
t' Text
tb' [(Text, Text)]
a' Bool
b'
                        forall (m :: * -> *) a. Monad m => a -> m a
return (forall a. a -> Maybe a
Just Node
n forall a. a -> [a] -> [a]
: [Maybe Node]
ns, ElemResult
end')
                    ElemResult
_ -> forall (m :: * -> *) a. Monad m => a -> m a
return ([forall a. a -> Maybe a
Just Node
n], ElemResult
end)

    breaksTag :: Text -> Maybe Text -> Bool
breaksTag Text
_     Maybe Text
Nothing       = Bool
False
    breaksTag Text
child (Just Text
tag) = case forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Text
tag HashMap Text (HashSet Text)
endOmittableNext of
        Maybe (HashSet Text)
Nothing -> Bool
False
        Just HashSet Text
s  -> forall a. (Eq a, Hashable a) => a -> HashSet a -> Bool
S.member Text
child HashSet Text
s

    coalesceText :: [Node] -> [Node]
coalesceText (TextNode Text
s : TextNode Text
t : [Node]
ns)
        = [Node] -> [Node]
coalesceText (Text -> Node
TextNode (Text -> Text -> Text
T.append Text
s Text
t) forall a. a -> [a] -> [a]
: [Node]
ns)
    coalesceText (Node
n:[Node]
ns)
        = Node
n forall a. a -> [a] -> [a]
: [Node] -> [Node]
coalesceText [Node]
ns
    coalesceText []
        = []


------------------------------------------------------------------------------
reference :: Parser Text
reference :: Parser Text
reference = do
    Char
_    <- forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'&'
    Either Char Text
r    <- (forall a b. a -> Either a b
Left  forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
P.try ParsecT Text () Identity Char
finishCharRef)
        forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall a b. b -> Either a b
Right forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall s u (m :: * -> *) a. ParsecT s u m a -> ParsecT s u m a
P.try Parser Text
finishEntityRef)
        forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall a b. a -> Either a b
Left  forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall (m :: * -> *) a. Monad m => a -> m a
return Char
'&')
    case Either Char Text
r of
        Left Char
c   -> do
            forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Bool -> Bool
not (Char -> Bool
isValidChar Char
c)) forall a b. (a -> b) -> a -> b
$ forall (m :: * -> *) a. MonadFail m => String -> m a
fail forall a b. (a -> b) -> a -> b
$
                String
"Reference is not a valid character"
            forall (m :: * -> *) a. Monad m => a -> m a
return (Char -> Text
T.singleton Char
c)
        Right Text
nm -> case forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Text
nm Map Text Text
predefinedRefs of
            Maybe Text
Nothing -> forall (m :: * -> *) a. MonadFail m => String -> m a
fail forall a b. (a -> b) -> a -> b
$ String
"Unknown entity reference: " forall a. [a] -> [a] -> [a]
++ Text -> String
T.unpack Text
nm
            Just Text
t  -> forall (m :: * -> *) a. Monad m => a -> m a
return Text
t


------------------------------------------------------------------------------
finishCharRef :: Parser Char
finishCharRef :: ParsecT Text () Identity Char
finishCharRef = forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'#' forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> (forall {u}. ParsecT Text u Identity Char
hexCharRef forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> forall {u}. ParsecT Text u Identity Char
decCharRef)
  where
    decCharRef :: ParsecT Text u Identity Char
decCharRef = do
        [Int]
ds <- forall (f :: * -> *) a. Alternative f => f a -> f [a]
some forall {u}. ParsecT Text u Identity Int
digit
        Char
_ <- forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
';'
        let c :: Char
c = Int -> Char
chr forall a b. (a -> b) -> a -> b
$ forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' (\Int
a Int
b -> Int
10 forall a. Num a => a -> a -> a
* Int
a forall a. Num a => a -> a -> a
+ Int
b) Int
0 [Int]
ds
        forall (m :: * -> *) a. Monad m => a -> m a
return Char
c
      where
        digit :: ParsecT Text u Identity Int
digit = do
            Char
d <- forall s (m :: * -> *) u.
Stream s m Char =>
(Char -> Bool) -> ParsecT s u m Char
P.satisfy (\Char
c -> Char
c forall a. Ord a => a -> a -> Bool
>= Char
'0' Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'9')
            forall (m :: * -> *) a. Monad m => a -> m a
return (Char -> Int
ord Char
d forall a. Num a => a -> a -> a
- Char -> Int
ord Char
'0')
    hexCharRef :: ParsecT Text u Identity Char
hexCharRef = do
        Char
_ <- forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'x' forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
'X'
        [Int]
ds <- forall (f :: * -> *) a. Alternative f => f a -> f [a]
some forall {u}. ParsecT Text u Identity Int
digit
        Char
_ <- forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
';'
        let c :: Char
c = Int -> Char
chr forall a b. (a -> b) -> a -> b
$ forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' (\Int
a Int
b -> Int
16 forall a. Num a => a -> a -> a
* Int
a forall a. Num a => a -> a -> a
+ Int
b) Int
0 [Int]
ds
        forall (m :: * -> *) a. Monad m => a -> m a
return Char
c
      where
        digit :: ParsecT Text u Identity Int
digit = forall {u}. ParsecT Text u Identity Int
num forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> forall {u}. ParsecT Text u Identity Int
upper forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> forall {u}. ParsecT Text u Identity Int
lower
        num :: ParsecT Text u Identity Int
num = do
            Char
d <- forall s (m :: * -> *) u.
Stream s m Char =>
(Char -> Bool) -> ParsecT s u m Char
P.satisfy (\Char
c -> Char
c forall a. Ord a => a -> a -> Bool
>= Char
'0' Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'9')
            forall (m :: * -> *) a. Monad m => a -> m a
return (Char -> Int
ord Char
d forall a. Num a => a -> a -> a
- Char -> Int
ord Char
'0')
        upper :: ParsecT Text u Identity Int
upper = do
            Char
d <- forall s (m :: * -> *) u.
Stream s m Char =>
(Char -> Bool) -> ParsecT s u m Char
P.satisfy (\Char
c -> Char
c forall a. Ord a => a -> a -> Bool
>= Char
'A' Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'F')
            forall (m :: * -> *) a. Monad m => a -> m a
return (Int
10 forall a. Num a => a -> a -> a
+ Char -> Int
ord Char
d forall a. Num a => a -> a -> a
- Char -> Int
ord Char
'A')
        lower :: ParsecT Text u Identity Int
lower = do
            Char
d <- forall s (m :: * -> *) u.
Stream s m Char =>
(Char -> Bool) -> ParsecT s u m Char
P.satisfy (\Char
c -> Char
c forall a. Ord a => a -> a -> Bool
>= Char
'a' Bool -> Bool -> Bool
&& Char
c forall a. Ord a => a -> a -> Bool
<= Char
'f')
            forall (m :: * -> *) a. Monad m => a -> m a
return (Int
10 forall a. Num a => a -> a -> a
+ Char -> Int
ord Char
d forall a. Num a => a -> a -> a
- Char -> Int
ord Char
'a')


------------------------------------------------------------------------------
finishEntityRef :: Parser Text
finishEntityRef :: Parser Text
finishEntityRef = Parser Text
XML.name forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* forall s (m :: * -> *) u.
Stream s m Char =>
Char -> ParsecT s u m Char
P.char Char
';'