src/Text/XmlHtml/Common.hs

{-# LANGUAGE OverloadedStrings     #-}
{-# LANGUAGE FlexibleInstances     #-}
{-# LANGUAGE MultiParamTypeClasses #-}

module Text.XmlHtml.Common where

import           Data.ByteString (ByteString)
import           Blaze.ByteString.Builder
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString.Builder as B
import           Data.Char (isAscii, isLatin1)
import qualified Data.HashMap.Strict as M
import qualified Data.HashSet as S
import qualified Data.Map.Strict as Map
import           Data.Maybe

import           Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.Text.Encoding.Error as TE
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TL

import           Text.XmlHtml.HTML.Meta (reversePredefinedRefs,
                                         explicitAttributes)


------------------------------------------------------------------------------
-- | Represents a document fragment, including the format, encoding, and
-- document type declaration as well as its content.
data Document = XmlDocument  {
                    docEncoding :: !Encoding,
                    docType     :: !(Maybe DocType),
                    docContent  :: ![Node]
                }
              | HtmlDocument {
                    docEncoding :: !Encoding,
                    docType     :: !(Maybe DocType),
                    docContent  :: ![Node]
                }
    deriving (Eq, Show)


------------------------------------------------------------------------------
-- | A node of a document structure.  A node can be text, a comment, or an
-- element.  XML processing instructions are intentionally omitted as a
-- simplification, and CDATA and plain text are both text nodes, since they
-- ought to be semantically interchangeable.
data Node = TextNode !Text
          | Comment  !Text
          | Element {
                elementTag      :: !Text,
                elementAttrs    :: ![(Text, Text)],
                elementChildren :: ![Node]
            }
    deriving (Eq, Show)


------------------------------------------------------------------------------
-- | Rendering options
data RenderOptions = RenderOptions {
      roAttributeSurround :: AttrSurround
      -- ^ Single or double-quotes used around attribute values

    , roAttributeResolveInternal :: AttrResolveInternalQuotes
      -- ^ Quotes inside attribute values that conflict with the surround
      -- are escaped, or the outer quotes are changed to avoid conflicting
      -- with the internal ones

    , roExplicitEmptyAttrs :: Maybe (M.HashMap Text (S.HashSet Text))
      -- ^ Attributes in the whitelist with empty values are
      -- rendered as <div example="">
      -- 'Nothing' applies this rule to all attributes with empty values

    } deriving (Eq, Show)

data AttrSurround = SurroundDoubleQuote | SurroundSingleQuote
    deriving (Eq, Ord, Show)

data AttrResolveInternalQuotes = AttrResolveByEscape | AttrResolveAvoidEscape
    deriving (Eq, Ord, Show)

defaultRenderOptions :: RenderOptions
defaultRenderOptions = RenderOptions
    { roAttributeSurround        = SurroundSingleQuote
    , roAttributeResolveInternal = AttrResolveAvoidEscape
    , roExplicitEmptyAttrs       = Just explicitAttributes
    }

------------------------------------------------------------------------------
-- | Determines whether the node is text or not.
isTextNode :: Node -> Bool
isTextNode (TextNode _) = True
isTextNode _            = False


------------------------------------------------------------------------------
-- | Determines whether the node is a comment or not.
isComment :: Node -> Bool
isComment (Comment _) = True
isComment _           = False


------------------------------------------------------------------------------
-- | Determines whether the node is an element or not.
isElement :: Node -> Bool
isElement (Element _ _ _) = True
isElement _               = False


------------------------------------------------------------------------------
-- | Gives the tag name of an element, or 'Nothing' if the node isn't an
-- element.
tagName :: Node -> Maybe Text
tagName (Element t _ _) = Just t
tagName _               = Nothing


------------------------------------------------------------------------------
-- | Retrieves the attribute with the given name.  If the 'Node' is not an
-- element, the result is always 'Nothing'
getAttribute :: Text -> Node -> Maybe Text
getAttribute name (Element _ attrs _) = lookup name attrs
getAttribute _    _                   = Nothing


------------------------------------------------------------------------------
-- | Checks if a given attribute exists in a 'Node'.
hasAttribute :: Text -> Node -> Bool
hasAttribute name = isJust . getAttribute name


------------------------------------------------------------------------------
-- | Sets the attribute name to the given value.  If the 'Node' is not an
-- element, this is the identity.
setAttribute :: Text -> Text -> Node -> Node
setAttribute name val (Element t a c) = Element t newAttrs c
  where newAttrs = (name, val) : filter ((/= name) . fst) a
setAttribute _    _   n                   = n


------------------------------------------------------------------------------
-- | Gives the entire text content of a node, ignoring markup.
nodeText :: Node -> Text
nodeText (TextNode t)    = t
nodeText (Comment _)     = ""
nodeText (Element _ _ c) = T.concat (map nodeText c)


------------------------------------------------------------------------------
-- | Gives the child nodes of the given node.  Only elements have child nodes.
childNodes :: Node -> [Node]
childNodes (Element _ _ c) = c
childNodes _               = []


------------------------------------------------------------------------------
-- | Gives the child elements of the given node.
childElements :: Node -> [Node]
childElements = filter isElement . childNodes


------------------------------------------------------------------------------
-- | Gives all of the child elements of the node with the given tag
-- name.
childElementsTag :: Text -> Node -> [Node]
childElementsTag tag = filter ((== Just tag) . tagName) . childNodes


------------------------------------------------------------------------------
-- | Gives the first child element of the node with the given tag name,
-- or 'Nothing' if there is no such child element.
childElementTag :: Text -> Node -> Maybe Node
childElementTag tag = listToMaybe . childElementsTag tag


------------------------------------------------------------------------------
-- | Gives the descendants of the given node in the order that they begin in
-- the document.
descendantNodes :: Node -> [Node]
descendantNodes = concatMap (\n -> n : descendantNodes n) . childNodes

------------------------------------------------------------------------------
-- | Gives the descendant elements of the given node, in the order that their
-- start tags appear in the document.
descendantElements :: Node -> [Node]
descendantElements = filter isElement . descendantNodes


------------------------------------------------------------------------------
-- | Gives the descendant elements with a given tag name.
descendantElementsTag :: Text -> Node -> [Node]
descendantElementsTag tag = filter ((== Just tag) . tagName) . descendantNodes


------------------------------------------------------------------------------
-- | Gives the first descendant element of the node with the given tag name,
-- or 'Nothing' if there is no such element.
descendantElementTag :: Text -> Node -> Maybe Node
descendantElementTag tag = listToMaybe . descendantElementsTag tag


------------------------------------------------------------------------------
-- | A document type declaration.  Note that DTD internal subsets are
-- currently unimplemented.
data DocType = DocType !Text !ExternalID !InternalSubset
    deriving (Eq, Show)


------------------------------------------------------------------------------
-- | An external ID, as in a document type declaration.  This can be a
-- SYSTEM identifier, or a PUBLIC identifier, or can be omitted.
data ExternalID = Public !Text !Text
                | System !Text
                | NoExternalID
    deriving (Eq, Show)


------------------------------------------------------------------------------
-- | The internal subset is unparsed, but preserved in case it's actually
-- wanted.
data InternalSubset = InternalText !Text
                    | NoInternalSubset
    deriving (Eq, Show)


------------------------------------------------------------------------------
-- | The character encoding of a document.  Currently only the required
-- character encodings are implemented.
data Encoding = UTF8 | UTF16BE | UTF16LE | ISO_8859_1 deriving (Eq, Show)


------------------------------------------------------------------------------
-- | Retrieves the preferred name of a character encoding for embedding in
-- a document.
encodingName :: Encoding -> Text
encodingName UTF8       = "UTF-8"
encodingName UTF16BE    = "UTF-16"
encodingName UTF16LE    = "UTF-16"
encodingName ISO_8859_1 = "ISO-8859-1"


------------------------------------------------------------------------------
-- | Gets the encoding function from 'Text' to 'ByteString' for an encoding.
encoder :: Encoding -> Text -> ByteString
encoder UTF8       = T.encodeUtf8
encoder UTF16BE    = T.encodeUtf16BE
encoder UTF16LE    = T.encodeUtf16LE
encoder ISO_8859_1 = encodeIso_8859_1


------------------------------------------------------------------------------
-- | Encodes UTF-8 Text into bytestring with only latin1 characters
-- UTF-8 characters found in the input and present in the
-- 'Text.XmlHtml.Meta.references' map are mapped to their escape sequences,
-- and any other UTF-8 characters are replaced with ascii "?"
encodeIso_8859_1 :: Text -> ByteString
encodeIso_8859_1 t = T.encodeUtf8 . T.concat . map toAsciiChunk $
                     T.groupBy asciiSplits t
  where

    -- Identify long strings of all-acceptable or all-unacceptable characters
    -- Acceptable strings are passed through
    -- Unacceptable strings are mapped to ASCII character by character
    toAsciiChunk sub =
      if T.any isAscii sub
      then sub
      else T.concat . map toAsciiChar $ T.unpack sub
    asciiSplits x y = isAscii x == isAscii y

    -- A character's mapping to ascii goes through html entity escaping
    -- if that character is in the references table
    -- Otherwise its unicode index is printed to decimal and "&#" is appended
    toAsciiChar c = maybe
        (uniEscape c) (\esc -> T.concat ["&", esc, ";"])
        (Map.lookup (T.singleton c) reversePredefinedRefs)

    uniEscape = T.append "&#" . flip T.snoc ';' . T.pack .
                (show :: Int -> String) . fromEnum


------------------------------------------------------------------------------
-- | Gets the decoding function from 'ByteString' to 'Text' for an encoding.
decoder :: Encoding -> ByteString -> Text
decoder UTF8       = T.decodeUtf8With    (TE.replace '\xFFFF')
decoder UTF16BE    = T.decodeUtf16BEWith (TE.replace '\xFFFF')
decoder UTF16LE    = T.decodeUtf16LEWith (TE.replace '\xFFFF')
decoder ISO_8859_1 = T.decodeLatin1 .
                     BS.map (\c -> if isLatin1 c then c else '?')


------------------------------------------------------------------------------
isUTF16 :: Encoding -> Bool
isUTF16 e = e == UTF16BE || e == UTF16LE


------------------------------------------------------------------------------
fromText :: Encoding -> Text -> Builder
fromText e t = fromByteString (encoder e t)


bmap :: (Text -> Text) -> B.Builder -> B.Builder
bmap f   = B.byteString
               . T.encodeUtf8
               . f
               . TL.toStrict
               . TL.decodeUtf8
               . B.toLazyByteString