module Text.Html.IsLink
( isLinkAttr
, allLinkAttrs
-- * Example with HXT
-- $example
) where
import Data.HashSet (HashSet)
import qualified Data.HashSet as HS
-- | Decide whether an attribute of an HTML element refers to an external
-- resource.
--
-- @isLinkAttr tag attr@ is 'True' exactly when the pair @(tag, attr)@ is a
-- member of 'allLinkAttrs'. For instance, @isLinkAttr \"a\" \"href\"@ is
-- 'True', while @isLinkAttr \"a\" \"class\"@ is 'False'.
--
-- The lookup compares the strings as given, and every entry of
-- 'allLinkAttrs' is written in lowercase, so both arguments have to be
-- lowercase as well: @isLinkAttr \"A\" \"HREF\"@ is 'False'.
isLinkAttr :: String -> String -> Bool
isLinkAttr tag attr = candidate `HS.member` allLinkAttrs
  where
    -- the (tag name, attribute name) pair to look up
    candidate = (tag, attr)
-- sources:
-- * The HTML 4.01 transitional DTD
-- * The HTML 4.01 strict DTD
-- * The HTML 3.2 DTD
-- * The HTML 3.0 DTD
-- * The XMLmind XML Editor 6.0.0 Evaluation Edition contains
-- a BSD licensed W3C XML Schema file for HTML 5
-- * The HTML::Tagset perl module (version 3.20)
-- see the scripts/ directory for more details
-- | The 'HashSet' of every @(tag name, attribute name)@ pair whose attribute
-- value is a link. All names are stored in lowercase.
allLinkAttrs :: HashSet (String, String)
allLinkAttrs = HS.fromList
    [ (tag, attr)
    | (tag, attrs) <- linkAttrsByTag
    , attr <- attrs
    ]
  where
    -- Link-carrying attributes grouped by the tag they belong to; the
    -- comprehension above flattens this table into individual pairs.
    linkAttrsByTag :: [(String, [String])]
    linkAttrsByTag =
        [ ("a",          ["href"])
        , ("applet",     ["archive", "code", "codebase"])
        , ("area",       ["href"])
        , ("audio",      ["src"])
        , ("base",       ["href"])
        , ("bgsound",    ["src"])
        , ("blockquote", ["cite"])
        , ("body",       ["background"])
        , ("button",     ["formaction"])
        , ("command",    ["icon"])
        , ("del",        ["cite"])
        , ("embed",      ["pluginspage", "src"])
        , ("fig",        ["src"])
        , ("form",       ["action"])
        , ("frame",      ["longdesc", "src"])
        , ("head",       ["profile"])
        , ("hr",         ["src"])
        , ("html",       ["manifest"])
        , ("iframe",     ["longdesc", "src"])
        , ("ilayer",     ["background"])
        , ("img",        ["longdesc", "lowsrc", "src", "usemap"])
        , ("input",      ["formaction", "src", "usemap"])
        , ("ins",        ["cite"])
        , ("isindex",    ["action"])
        , ("layer",      ["background", "src"])
        , ("link",       ["href"])
        , ("note",       ["src"])
        , ("object",     ["archive", "classid", "codebase", "data", "usemap"])
        , ("overlay",    ["src"])
        , ("q",          ["cite"])
        , ("script",     ["for", "src"])
        , ("source",     ["src"])
        , ("table",      ["background"])
        , ("td",         ["background"])
        , ("th",         ["background"])
        , ("track",      ["src"])
        , ("tr",         ["background"])
        , ("video",      ["poster", "src"])
        , ("xmp",        ["href"])
        ]
-- $example
-- Here's an example illustrating how to use 'isLinkAttr' with @hxt@ in
-- order to extract all links from an HTML document:
--
-- > {-# LANGUAGE Arrows #-}
-- >
-- > import Text.Html.IsLink
-- > import Text.XML.HXT.Core
-- >
-- > -- returns a list of tuples containing the tag name, attribute name, and
-- > -- attribute value of all links
-- > getAllLinks :: FilePath -> IO [(String, String, String)]
-- > getAllLinks path = runX $ doc >>> multi getLink
-- >   where
-- >     doc = readDocument [withParseHTML yes, withWarnings no] path
-- >
-- > getLink :: ArrowXml a => a XmlTree (String, String, String)
-- > getLink = proc node -> do
-- >     tag <- getName -< node
-- >     attrbNode <- getAttrl -< node
-- >     attrb <- getName -< attrbNode
-- >     val <- xshow getChildren -< attrbNode
-- >     isLinkA -< (tag, attrb, val)
-- >   where
-- >     isLinkA = isLink `guardsP` this
-- >     isLink (tag, attrb, _) = isLinkAttr tag attrb