Safe Haskell	Safe-Inferred
Language	Haskell2010

Text.HTML.Parser

Contents

Parsing
Types
Rendering, text canonicalization

Description

This is a performance-oriented HTML tokenizer aimed at web-crawling applications. It follows the HTML5 parsing specification quite closely, so it behaves reasonably well on ill-formed documents from the open Web.

Synopsis

parseTokens :: Text -> [Token]
parseTokensLazy :: Text -> [Token]
token :: Parser Token
data Token
- = TagOpen !TagName [Attr]
- | TagSelfClose !TagName [Attr]
- | TagClose !TagName
- | ContentText !Text
- | ContentChar !Char
- | Comment !Builder
- | Doctype !Text
type TagName = Text
type AttrName = Text
type AttrValue = Text
data Attr = Attr !AttrName !AttrValue
renderTokens :: [Token] -> Text
renderToken :: Token -> Text
renderAttrs :: [Attr] -> Text
renderAttr :: Attr -> Text
canonicalizeTokens :: [Token] -> [Token]

Parsing

parseTokens :: Text -> [Token] Source #

Parse a lazy list of tokens from strict Text.

parseTokensLazy :: Text -> [Token] Source #

Parse a lazy list of tokens from lazy Text.

token :: Parser Token Source #

Parse a single Token.

Types

data Token Source #

An HTML token

Constructors

TagOpen !TagName [Attr]	An opening tag. Attribute ordering is arbitrary. Void elements have a `TagOpen` but no corresponding `TagClose`. See `nonClosing`.
TagSelfClose !TagName [Attr]	A self-closing tag.
TagClose !TagName	A closing tag.
ContentText !Text	The content between tags.
ContentChar !Char	A single character of content
Comment !Builder	Contents of a comment.
Doctype !Text	Doctype

Instances

Instances details

Generic Token Source #
Instance details Defined in Text.HTML.Parser Associated Types type Rep Token :: Type -> Type # Methods from :: Token -> Rep Token x # to :: Rep Token x -> Token #
Show Token Source #
Instance details Defined in Text.HTML.Parser Methods showsPrec :: Int -> Token -> ShowS # show :: Token -> String # showList :: [Token] -> ShowS #
NFData Token Source #
Instance details Defined in Text.HTML.Parser Methods rnf :: Token -> () #
Eq Token Source #
Instance details Defined in Text.HTML.Parser Methods (==) :: Token -> Token -> Bool # (/=) :: Token -> Token -> Bool #
Ord Token Source #
Instance details Defined in Text.HTML.Parser Methods compare :: Token -> Token -> Ordering # (<) :: Token -> Token -> Bool # (<=) :: Token -> Token -> Bool # (>) :: Token -> Token -> Bool # (>=) :: Token -> Token -> Bool # max :: Token -> Token -> Token # min :: Token -> Token -> Token #
type Rep Token Source #
Instance details Defined in Text.HTML.Parser type Rep Token = D1 ('MetaData "Token" "Text.HTML.Parser" "html-parse-0.2.1.0-7W3SxQLdixH36WLOIIkWCU" 'False) ((C1 ('MetaCons "TagOpen" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'SourceStrict 'DecidedStrict) (Rec0 TagName) :*: S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 [Attr])) :+: (C1 ('MetaCons "TagSelfClose" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'SourceStrict 'DecidedStrict) (Rec0 TagName) :*: S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 [Attr])) :+: C1 ('MetaCons "TagClose" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'SourceStrict 'DecidedStrict) (Rec0 TagName)))) :+: ((C1 ('MetaCons "ContentText" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'SourceStrict 'DecidedStrict) (Rec0 Text)) :+: C1 ('MetaCons "ContentChar" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'SourceStrict 'DecidedUnpack) (Rec0 Char))) :+: (C1 ('MetaCons "Comment" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'SourceStrict 'DecidedStrict) (Rec0 Builder)) :+: C1 ('MetaCons "Doctype" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'SourceStrict 'DecidedStrict) (Rec0 Text)))))

type TagName = Text Source #

A tag name (e.g. body)

type AttrName = Text Source #

An attribute name (e.g. href)

type AttrValue = Text Source #

The value of an attribute

data Attr Source #

An attribute of a tag

Constructors

Attr !AttrName !AttrValue

Instances

Instances details

Show Attr Source #
Instance details Defined in Text.HTML.Parser Methods showsPrec :: Int -> Attr -> ShowS # show :: Attr -> String # showList :: [Attr] -> ShowS #
Eq Attr Source #
Instance details Defined in Text.HTML.Parser Methods (==) :: Attr -> Attr -> Bool # (/=) :: Attr -> Attr -> Bool #
Ord Attr Source #
Instance details Defined in Text.HTML.Parser Methods compare :: Attr -> Attr -> Ordering # (<) :: Attr -> Attr -> Bool # (<=) :: Attr -> Attr -> Bool # (>) :: Attr -> Attr -> Bool # (>=) :: Attr -> Attr -> Bool # max :: Attr -> Attr -> Attr # min :: Attr -> Attr -> Attr #

Rendering, text canonicalization

renderTokens :: [Token] -> Text Source #

See renderToken.

renderToken :: Token -> Text Source #

(Somewhat) canonical string representation of Token.

renderAttrs :: [Attr] -> Text Source #

See renderAttr.

renderAttr :: Attr -> Text Source #

Does not escape quotation marks in attribute values!

canonicalizeTokens :: [Token] -> [Token] Source #

Meld neighboring ContentChar and ContentText constructors together and drop empty text elements.

Key	Shortcut
s	Open this search box
esc	Close this search box
↓,ctrl + j	Move down in search results
↑,ctrl + k	Move up in search results
↵	Go to active search result