{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE OverloadedStrings #-}
module Text.HTML.TagStream.Entities
(Dec(..)
,isNameChar
,isNameStart
,decodeEntities)
where
import Data.Char
import Data.Monoid
import Data.String
import Data.Conduit
import Text.HTML.TagStream.Types
import qualified Data.Conduit.List as CL
import Data.Maybe (fromMaybe, isJust)
import Control.Arrow (first,second)
decodeEntities :: (Monad m
,Monoid builder
,Monoid string
,IsString string
,Eq string)
=> Dec builder string
-> Conduit (Token' string) m (Token' string)
decodeEntities dec =
start
where
start = await >>= maybe (return ()) (\token -> start' token >> start)
start' (Text t) = (yield t >> yieldWhileText) =$= decodeEntities' dec =$= CL.mapMaybe go
start' (TagOpen name attrs bool) = yield (TagOpen name (map (second (decodeString dec)) attrs) bool)
start' token = yield token
go t
| t == "" = Nothing
| otherwise = Just (Text t)
decodeString
:: (Eq a, IsString a, Monoid builder, Monoid a)
=> Dec builder a -> a -> a
decodeString dec input =
case makeEntityDecoder dec input of
(value', remainder)
| value' /= mempty -> value' <> decodeString dec remainder
| otherwise -> input
decodeEntities' :: (Monad m
,Monoid string
,IsString string
,Monoid builder
,Eq string)
=> Dec builder string
-> Conduit string m string
decodeEntities' dec =
loop id
where
loop accum = do
mchunk <- await
let chunk = accum $ fromMaybe mempty mchunk
(newStr, remainder) = makeEntityDecoder dec chunk
yield newStr
if isJust mchunk
then loop (mappend remainder)
else yield remainder
yieldWhileText :: Monad m => Conduit (Token' string) m string
yieldWhileText =
loop
where
loop = await >>= maybe (return ()) go
go (Text t) = yield t >> loop
go token = leftover token
data Dec builder string = Dec
{ decToS :: builder -> string
, decBreak :: (Char -> Bool) -> string -> (string,string)
, decBuilder :: string -> builder
, decDrop :: Int -> string -> string
, decEntity :: string -> Maybe string
, decUncons :: string -> Maybe (Char,string)
}
makeEntityDecoder :: (IsString string,Monoid builder,Eq string,Monoid string)
=> Dec builder string -> string -> (string, string)
makeEntityDecoder Dec{..} = first decToS . go
where
go s =
case decBreak (=='&') s of
(_,"") -> (decBuilder s, "")
(before,restPlusAmp@(decDrop 1 -> rest)) ->
case decBreak (not . (\c -> isNameChar c || c == '#')) rest of
(_,"") -> (decBuilder before, restPlusAmp)
(entity,after) -> (before1 <> before2, after')
where
before1 = decBuilder before
(before2, after') =
case mdecoded of
Nothing -> first ((decBuilder "&" <> decBuilder entity) <>) (go after)
Just (decBuilder -> decoded) ->
case decUncons after of
Just (';',validAfter) -> first (decoded <>) (go validAfter)
Just (_invalid,_rest) -> first (decoded <>) (go after)
Nothing -> (mempty, s)
mdecoded =
if entity == mempty
then Nothing
else decEntity entity
isNameStart :: Char -> Bool
isNameStart c =
c == ':' ||
c == '_' ||
isAsciiUpper c ||
isAsciiLower c ||
(c >= '\xC0' && c <= '\xD6') ||
(c >= '\xD8' && c <= '\xF6') ||
(c >= '\xF8' && c <= '\x2FF') ||
(c >= '\x370' && c <= '\x37D') ||
(c >= '\x37F' && c <= '\x1FFF') ||
(c >= '\x200C' && c <= '\x200D') ||
(c >= '\x2070' && c <= '\x218F') ||
(c >= '\x2C00' && c <= '\x2FEF') ||
(c >= '\x3001' && c <= '\xD7FF') ||
(c >= '\xF900' && c <= '\xFDCF') ||
(c >= '\xFDF0' && c <= '\xFFFD') ||
(c >= '\x10000' && c <= '\xEFFFF')
isNameChar :: Char -> Bool
isNameChar c =
c == '-' ||
c == '.' ||
c == '\xB7' ||
isDigit c ||
isNameStart c ||
(c >= '\x0300' && c <= '\x036F') ||
(c >= '\x203F' && c <= '\x2040')