{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE OverloadedStrings #-}

-- | HTML entity decoding.

module Text.HTML.TagStream.Entities
  (Dec(..)
  ,isNameChar
  ,isNameStart
  ,decodeEntities)
  where

import Data.Char
import Data.Monoid
import Data.String
import Data.Conduit
import Text.HTML.TagStream.Types

import qualified Data.Conduit.List as CL
import Data.Maybe (fromMaybe, isJust)
import Control.Arrow (first,second)

-- | A conduit to decode entities from a stream of tokens into a new stream of tokens.
decodeEntities :: (Monad m
                  ,Monoid builder
                  ,Monoid string
                  ,IsString string
                  ,Eq string)
               => Dec builder string
               -> Conduit (Token' string) m (Token' string)
decodeEntities dec =
    start
  where
    start = await >>= maybe (return ()) (\token -> start' token >> start)
    start' (Text t) = (yield t >> yieldWhileText) =$= decodeEntities' dec =$= CL.mapMaybe go
    start' (TagOpen name attrs bool) = yield (TagOpen name (map (second (decodeString dec)) attrs) bool)
    start' token = yield token

    go t
        | t == ""   = Nothing
        | otherwise = Just (Text t)

-- | Decode entities in a complete string.
decodeString
  :: (Eq a, IsString a, Monoid builder, Monoid a)
  => Dec builder a -> a -> a
decodeString dec input =
  case makeEntityDecoder dec input of
    (value', remainder)
      | value' /= mempty -> value' <> decodeString dec remainder
      | otherwise -> input

decodeEntities' :: (Monad m
                   ,Monoid string
                   ,IsString string
                   ,Monoid builder
                   ,Eq string)
                => Dec builder string
                -> Conduit string m string
decodeEntities' dec =
    loop id
  where
    loop accum = do
        mchunk <- await
        let chunk = accum $ fromMaybe mempty mchunk
            (newStr, remainder) = makeEntityDecoder dec chunk
        yield newStr
        if isJust mchunk
            then loop (mappend remainder)
            else yield remainder

-- | Yield contiguous text tokens as strings.
yieldWhileText :: Monad m => Conduit (Token' string) m string
yieldWhileText =
    loop
  where
    loop = await >>= maybe (return ()) go
    go (Text t) = yield t >> loop
    go token = leftover token

-- | A decoder.
data Dec builder string = Dec
  { decToS     :: builder -> string
  , decBreak   :: (Char -> Bool) -> string -> (string,string)
  , decBuilder :: string -> builder
  , decDrop    :: Int -> string -> string
  , decEntity  :: string -> Maybe string
  , decUncons  :: string -> Maybe (Char,string)
  }

-- | Decode the entities in a string type with a decoder.
makeEntityDecoder :: (IsString string,Monoid builder,Eq string,Monoid string)
                  => Dec builder string -> string -> (string, string)
makeEntityDecoder Dec{..} = first decToS . go
  where
    go s =
      case decBreak (=='&') s of
        (_,"") -> (decBuilder s, "")
        (before,restPlusAmp@(decDrop 1 -> rest)) ->
          case decBreak (not . (\c -> isNameChar c || c == '#')) rest of
            (_,"") -> (decBuilder before, restPlusAmp)
            (entity,after) -> (before1 <> before2, after')
              where
                before1 = decBuilder before
                (before2, after') =
                  case mdecoded of
                    Nothing -> first ((decBuilder "&" <> decBuilder entity) <>) (go after)
                    Just (decBuilder -> decoded) ->
                      case decUncons after of
                        Just (';',validAfter) -> first (decoded <>) (go validAfter)
                        Just (_invalid,_rest) -> first (decoded <>) (go after)
                        Nothing -> (mempty, s)
                mdecoded =
                  if entity == mempty
                     then Nothing
                     else decEntity entity

-- | Is the character a valid Name starter?
isNameStart :: Char -> Bool
isNameStart c =
  c == ':' ||
  c == '_' ||
  isAsciiUpper c ||
  isAsciiLower c ||
  (c >= '\xC0' && c <= '\xD6') ||
  (c >= '\xD8' && c <= '\xF6') ||
  (c >= '\xF8' && c <= '\x2FF') ||
  (c >= '\x370' && c <= '\x37D') ||
  (c >= '\x37F' && c <= '\x1FFF') ||
  (c >= '\x200C' && c <= '\x200D') ||
  (c >= '\x2070' && c <= '\x218F') ||
  (c >= '\x2C00' && c <= '\x2FEF') ||
  (c >= '\x3001' && c <= '\xD7FF') ||
  (c >= '\xF900' && c <= '\xFDCF') ||
  (c >= '\xFDF0' && c <= '\xFFFD') ||
  (c >= '\x10000' && c <= '\xEFFFF')

-- | Is the character valid in a Name?
isNameChar :: Char -> Bool
isNameChar c =
  c == '-' ||
  c == '.' ||
  c == '\xB7' ||
  isDigit c ||
  isNameStart c ||
  (c >= '\x0300' && c <= '\x036F') ||
  (c >= '\x203F' && c <= '\x2040')