{-# LANGUAGE DeriveDataTypeable #-} module Text.HTML.TagSoup.Options where import Data.Typeable import Text.HTML.TagSoup.Type import Text.HTML.TagSoup.Entity import Text.StringLike -- | These options control how 'parseTags' works. The 'ParseOptions' type is usually generated by one of -- 'parseOptions', 'parseOptionsFast' or 'parseOptionsEntities', then selected fields may be overriden. -- -- The options 'optTagPosition' and 'optTagWarning' specify whether to generate -- 'TagPosition' or 'TagWarning' elements respectively. Usually these options should be set to @False@ -- to simplify future stages, unless you rely on position information or want to give malformed HTML -- messages to the end user. -- -- The options 'optEntityData' and 'optEntityAttrib' control how entities, for example @ @ are handled. -- Both take a string, and a boolean, where @True@ indicates that the entity ended with a semi-colon @;@. -- Inside normal text 'optEntityData' will be called, and the results will be inserted in the tag stream. -- Inside a tag attribute 'optEntityAttrib' will be called, and the first component of the result will be used -- in the attribute, and the second component will be appended after the 'TagOpen' value (usually the second -- component is @[]@). As an example, to not decode any entities, pass: -- -- > parseOptions -- > {optEntityData=\(str,b) -> [TagText $ "&" ++ str ++ [';' | b]] -- > ,optEntityAttrib\(str,b) -> ("&" ++ str ++ [';' | b], []) -- The 'optTagTextMerge' value specifies if you always want adjacent 'TagText' values to be merged. -- Merging adjacent pieces of text has a small performance penalty, but will usually make subsequent analysis -- simpler. Contiguous runs of characters without entities or tags will also be generated as single 'TagText' -- values. data ParseOptions str = ParseOptions {optTagPosition :: Bool -- ^ Should 'TagPosition' values be given before some items (default=False,fast=False). ,optTagWarning :: Bool -- ^ Should 'TagWarning' values be given (default=False,fast=False) ,optEntityData :: (str,Bool) -> [Tag str] -- ^ How to lookup an entity (Bool = has ending @';'@) ,optEntityAttrib :: (str,Bool) -> (str,[Tag str]) -- ^ How to lookup an entity in an attribute (Bool = has ending @';'@?) ,optTagTextMerge :: Bool -- ^ Require no adjacent 'TagText' values (default=True,fast=False) } deriving Typeable -- | A 'ParseOptions' structure using a custom function to lookup attributes. Any attribute -- that is not found will be left intact, and a 'TagWarning' given (if 'optTagWarning' is set). -- -- If you do not want to resolve any entities, simpliy pass @const Nothing@ for the lookup function. parseOptionsEntities :: StringLike str => (str -> Maybe str) -> ParseOptions str parseOptionsEntities lookupEntity = ParseOptions False False entityData entityAttrib True where entityData x = TagText a : b where (a,b) = entityAttrib x entityAttrib ~(x,b) = let x' = x `append` fromString [';'|b] in case lookupEntity x' of Just y -> (y, []) Nothing -> (fromChar '&' `append` x' ,[TagWarning $ fromString "Unknown entity: " `append` x]) -- | The default parse options value, described in 'ParseOptions'. Equivalent to -- @'parseOptionsEntities' 'lookupEntity'@. parseOptions :: StringLike str => ParseOptions str parseOptions = parseOptionsEntities $ fmap fromString . lookupEntity . toString -- | A 'ParseOptions' structure optimised for speed, following the fast options. parseOptionsFast :: StringLike str => ParseOptions str parseOptionsFast = parseOptions{optTagTextMerge=False} -- | Change the underlying string type of a 'ParseOptions' value. fmapParseOptions :: (StringLike from, StringLike to) => ParseOptions from -> ParseOptions to fmapParseOptions (ParseOptions a b c d e) = ParseOptions a b c2 d2 e where c2 ~(x,y) = map (fmap castString) $ c (castString x, y) d2 ~(x,y) = (castString r, map (fmap castString) s) where (r,s) = d (castString x, y)