{- |
   Module      : Text.HTML.Parser.Util
   Description : Utility functions for the html-parse library
   Copyright   : (c) Neil Mitchell  2006–2019 (TagSoup),
                     Tony Zorman    2020–2022 (port to html-parse)
   License     : BSD-3
   Maintainer  : Tony Zorman <soliditsallgood@mailbox.org>
   Stability   : experimental
   Portability : non-portable

Utility functions to make working with @html-parse@ as easy as working
with TagSoup!  Most functions are one-to-one replacements for their
respective TagSoup analogues and work the same way.
-}
module Text.HTML.Parser.Util
    ( -- * Conversion
      toToken            -- :: Text -> Token
    , toTokenDefault     -- :: Token -> Text -> Token

      -- * Tag identification
    , isTagOpen          -- :: Token -> Bool
    , isTagClose         -- :: Token -> Bool
    , isTagSelfClose     -- :: Token -> Bool
    , isContentText      -- :: Token -> Bool
    , isContentChar      -- :: Token -> Bool
    , isComment          -- :: Token -> Bool
    , isDoctype          -- :: Token -> Bool
    , isTagOpenName      -- :: Text -> Token -> Bool
    , isTagCloseName     -- :: Text -> Token -> Bool

      -- * Extraction
    , fromContentText    -- :: Token -> Text
    , maybeContentText   -- :: Token -> Maybe Text
    , fromAttrib         -- :: Attr -> Token -> Attr
    , maybeAttrib        -- :: Attr -> Token -> Maybe Attr
    , innerText          -- :: [Token] -> Text
    , toHeadContentText  -- :: [Token] -> Text
    , between            -- :: Token -> Token -> [Token] -> [Token]
    , dropHeader         -- :: [Attr] -> [Token] -> [Token]
    , allContentText     -- :: [Token] -> [Text]

      -- * Utility
    , sections           -- :: (a -> Bool) -> [a] -> [[a]]
    , section            -- :: (a -> Bool) -> [a] -> [a]
    , partitions         -- :: (a -> Bool) -> [a] -> [[a]]

      -- * Combinators
    , (~==)              -- :: Token -> Token -> Bool
    , (~/=)              -- :: Token -> Token -> Bool
    ) where

import qualified Data.Attoparsec.Text as A
import qualified Data.List.NonEmpty   as NE
import qualified Data.Text            as T

import Data.Either (fromRight)
import Data.List (groupBy, tails)
import Data.Maybe (fromMaybe, mapMaybe)
import Data.Text (Text)
import Text.HTML.Parser (Attr (Attr), Token (Comment, ContentChar, ContentText, Doctype, TagClose, TagOpen, TagSelfClose), token)


-- | Like 'toTokenDefault', but with a supplied default value.
--
-- >>> toToken "text"
-- ContentText "text"
toToken :: Text -> Token
toToken :: AttrName -> Token
toToken = Token -> AttrName -> Token
toTokenDefault (AttrName -> Token
Doctype AttrName
"Could not parse string into token.")

-- | Convert 'Text' to 'Token', with a default in case of a parse failure.
toTokenDefault :: Token -> Text -> Token
toTokenDefault :: Token -> AttrName -> Token
toTokenDefault Token
d = Token -> Either [Char] Token -> Token
forall b a. b -> Either a b -> b
fromRight Token
d (Either [Char] Token -> Token)
-> (AttrName -> Either [Char] Token) -> AttrName -> Token
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Parser Token -> AttrName -> Either [Char] Token
forall a. Parser a -> AttrName -> Either [Char] a
A.parseOnly Parser Token
token

-- | This function takes a list, and returns all suffixes whose first item
-- matches the predicate.
--
-- >>> sections (== 'c') "abc cba ccb"
-- ["c cba ccb","cba ccb","ccb","cb"]
sections :: (a -> Bool) -> [a] -> [[a]]
sections :: forall a. (a -> Bool) -> [a] -> [[a]]
sections a -> Bool
p = ([a] -> Bool) -> [[a]] -> [[a]]
forall a. (a -> Bool) -> [a] -> [a]
filter (a -> Bool
p (a -> Bool) -> ([a] -> a) -> [a] -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [a] -> a
forall a. HasCallStack => [a] -> a
head) ([[a]] -> [[a]]) -> ([a] -> [[a]]) -> [a] -> [[a]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [[a]] -> [[a]]
forall a. HasCallStack => [a] -> [a]
init ([[a]] -> [[a]]) -> ([a] -> [[a]]) -> [a] -> [[a]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [a] -> [[a]]
forall a. [a] -> [[a]]
tails

-- | Like 'sections', but return the head element.  Returns an empty list if no
-- head element is present.
--
-- >>> section (== 'c') "abc cba ccb"
-- "c cba ccb"
section :: (a -> Bool) -> [a] -> [a]
section :: forall a. (a -> Bool) -> [a] -> [a]
section a -> Bool
f = \case
  [] -> []
  [a]
xs -> [a] -> (NonEmpty [a] -> [a]) -> Maybe (NonEmpty [a]) -> [a]
forall b a. b -> (a -> b) -> Maybe a -> b
maybe [] NonEmpty [a] -> [a]
forall a. NonEmpty a -> a
NE.head ([[a]] -> Maybe (NonEmpty [a])
forall a. [a] -> Maybe (NonEmpty a)
NE.nonEmpty ((a -> Bool) -> [a] -> [[a]]
forall a. (a -> Bool) -> [a] -> [[a]]
sections a -> Bool
f [a]
xs))

-- | This function is similar to 'sections', but splits the list so no element
-- appears in any two partitions.
--
-- >>> partitions (== 'c') "abc cba ccb"
-- ["c ","cba ","c","cb"]
partitions :: (a -> Bool) -> [a] -> [[a]]
partitions :: forall a. (a -> Bool) -> [a] -> [[a]]
partitions a -> Bool
p = (a -> a -> Bool) -> [a] -> [[a]]
forall a. (a -> a -> Bool) -> [a] -> [[a]]
groupBy ((a -> Bool) -> a -> a -> Bool
forall a b. a -> b -> a
const a -> Bool
notp) ([a] -> [[a]]) -> ([a] -> [a]) -> [a] -> [[a]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (a -> Bool) -> [a] -> [a]
forall a. (a -> Bool) -> [a] -> [a]
dropWhile a -> Bool
notp
 where notp :: a -> Bool
notp = Bool -> Bool
not (Bool -> Bool) -> (a -> Bool) -> a -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> Bool
p

-- | Get the first 'ContentText' element from a list of 'Token's.  If no tag
-- could be found, return an empty string.
toHeadContentText :: [Token] -> Text
toHeadContentText :: [Token] -> AttrName
toHeadContentText = AttrName
-> (NonEmpty AttrName -> AttrName)
-> Maybe (NonEmpty AttrName)
-> AttrName
forall b a. b -> (a -> b) -> Maybe a -> b
maybe AttrName
"" NonEmpty AttrName -> AttrName
forall a. NonEmpty a -> a
NE.head (Maybe (NonEmpty AttrName) -> AttrName)
-> ([Token] -> Maybe (NonEmpty AttrName)) -> [Token] -> AttrName
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [AttrName] -> Maybe (NonEmpty AttrName)
forall a. [a] -> Maybe (NonEmpty a)
NE.nonEmpty ([AttrName] -> Maybe (NonEmpty AttrName))
-> ([Token] -> [AttrName]) -> [Token] -> Maybe (NonEmpty AttrName)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Token] -> [AttrName]
allContentText

-- | Get all 'Token's between @start@ and @end@.
between :: Token -> Token -> [Token] -> [Token]
between :: Token -> Token -> [Token] -> [Token]
between Token
start Token
end = (Token -> Bool) -> [Token] -> [Token]
forall a. (a -> Bool) -> [a] -> [a]
takeWhile (Token -> Token -> Bool
~/= Token
end  )
                  ([Token] -> [Token]) -> ([Token] -> [Token]) -> [Token] -> [Token]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [Token] -> [Token]
forall a. Int -> [a] -> [a]
drop Int
1                 -- drop the tag
                  ([Token] -> [Token]) -> ([Token] -> [Token]) -> [Token] -> [Token]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Token -> Bool) -> [Token] -> [Token]
forall a. (a -> Bool) -> [a] -> [a]
dropWhile (Token -> Token -> Bool
~/= Token
start)

-- | Drop an HTML header (i.e. the header tags and everything in between), as
-- well as everything before it, from a list of 'Token's.
dropHeader :: [Attr] -> [Token] -> [Token]
dropHeader :: [Attr] -> [Token] -> [Token]
dropHeader [Attr]
attr = Int -> [Token] -> [Token]
forall a. Int -> [a] -> [a]
drop Int
1                   -- drop </header>
                ([Token] -> [Token]) -> ([Token] -> [Token]) -> [Token] -> [Token]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Token -> Bool) -> [Token] -> [Token]
forall a. (a -> Bool) -> [a] -> [a]
dropWhile (Token -> Token -> Bool
~/= AttrName -> Token
TagClose AttrName
"header"     )
                ([Token] -> [Token]) -> ([Token] -> [Token]) -> [Token] -> [Token]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Token -> Bool) -> [Token] -> [Token]
forall a. (a -> Bool) -> [a] -> [a]
dropWhile (Token -> Token -> Bool
~/= AttrName -> [Attr] -> Token
TagOpen  AttrName
"header" [Attr]
attr)

-- | Get all 'ContentText' entries from a list of 'Token's and extract their
-- content.
allContentText :: [Token] -> [Text]
allContentText :: [Token] -> [AttrName]
allContentText = (Token -> Maybe AttrName) -> [Token] -> [AttrName]
forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe Token -> Maybe AttrName
maybeContentText

-- | Test if a 'Token' is a 'TagOpen'.
isTagOpen :: Token -> Bool
isTagOpen :: Token -> Bool
isTagOpen = \case
  TagOpen{} -> Bool
True
  Token
_         -> Bool
False

-- | Test if a 'Token' is a 'TagClose'.
isTagClose :: Token -> Bool
isTagClose :: Token -> Bool
isTagClose = \case
  TagClose{} -> Bool
True
  Token
_          -> Bool
False

-- | Test if a 'Token' is a 'ContentText'.
isContentText :: Token -> Bool
isContentText :: Token -> Bool
isContentText = \case
  ContentText{} -> Bool
True
  Token
_             -> Bool
False

-- | Extract the string from within 'ContentText', otherwise return 'Nothing'.
maybeContentText :: Token -> Maybe Text
maybeContentText :: Token -> Maybe AttrName
maybeContentText = \case
  ContentText AttrName
t -> AttrName -> Maybe AttrName
forall a. a -> Maybe a
Just AttrName
t
  Token
_             -> Maybe AttrName
forall a. Maybe a
Nothing

-- | Extract the string from within 'ContentText', crashes if not a
-- 'ContentText'.
fromContentText :: Token -> Text
fromContentText :: Token -> AttrName
fromContentText = \case
  ContentText AttrName
t -> AttrName
t
  Token
t             -> [Char] -> AttrName
forall a. HasCallStack => [Char] -> a
error ([Char] -> AttrName) -> [Char] -> AttrName
forall a b. (a -> b) -> a -> b
$ [Char]
"(" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ Token -> [Char]
forall a. Show a => a -> [Char]
show Token
t [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char]
") is not a ContentText"

-- | Extract all text content from a list of Tokens (similar to Verbatim found
-- in HaXml).
innerText :: [Token] -> Text
innerText :: [Token] -> AttrName
innerText = [AttrName] -> AttrName
forall a. Monoid a => [a] -> a
mconcat ([AttrName] -> AttrName)
-> ([Token] -> [AttrName]) -> [Token] -> AttrName
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Token -> Maybe AttrName) -> [Token] -> [AttrName]
forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe Token -> Maybe AttrName
maybeContentText

-- | Test if a 'Token' is a 'TagSelfClose'.
isTagSelfClose :: Token -> Bool
isTagSelfClose :: Token -> Bool
isTagSelfClose = \case
  TagSelfClose{} -> Bool
True
  Token
_              -> Bool
False

-- | Test if a 'Token' is a 'ContentChar'.
isContentChar :: Token -> Bool
isContentChar :: Token -> Bool
isContentChar = \case
  ContentChar{} -> Bool
True
  Token
_             -> Bool
False

-- | Test if a 'Token' is a 'Comment'.
isComment :: Token -> Bool
isComment :: Token -> Bool
isComment = \case
  Comment{} -> Bool
True
  Token
_         -> Bool
False

-- | Test if a 'Token' is a 'Doctype'.
isDoctype :: Token -> Bool
isDoctype :: Token -> Bool
isDoctype = \case
  Doctype{} -> Bool
True
  Token
_         -> Bool
False

-- | Returns True if the 'Token' is 'TagOpen' and matches the given name.
isTagOpenName :: Text -> Token -> Bool
isTagOpenName :: AttrName -> Token -> Bool
isTagOpenName AttrName
name (TagOpen AttrName
n [Attr]
_) = AttrName
n AttrName -> AttrName -> Bool
forall a. Eq a => a -> a -> Bool
== AttrName
name
isTagOpenName AttrName
_    Token
_             = Bool
False

-- | Returns True if the 'Token' is 'TagClose' and matches the given name.
isTagCloseName :: Text -> Token -> Bool
isTagCloseName :: AttrName -> Token -> Bool
isTagCloseName AttrName
name (TagClose AttrName
n) = AttrName
n AttrName -> AttrName -> Bool
forall a. Eq a => a -> a -> Bool
== AttrName
name
isTagCloseName AttrName
_    Token
_            = Bool
False

-- | Extract an attribute; crashes if not a 'TagOpen'.  Returns @Attr \"\" \"\"@
-- if no attribute present.
--
-- Warning: does not distinguish between missing attribute and present
-- attribute with values @\"\"@.
fromAttrib :: Attr -> Token -> Attr
fromAttrib :: Attr -> Token -> Attr
fromAttrib Attr
att Token
tag = Attr -> Maybe Attr -> Attr
forall a. a -> Maybe a -> a
fromMaybe (AttrName -> AttrName -> Attr
Attr AttrName
"" AttrName
"") (Maybe Attr -> Attr) -> Maybe Attr -> Attr
forall a b. (a -> b) -> a -> b
$ Attr -> Token -> Maybe Attr
maybeAttrib Attr
att Token
tag

-- | Extract an attribute; crashes if not a 'TagOpen'.  Returns
-- 'Nothing' if no attribute present.
maybeAttrib :: Attr -> Token -> Maybe Attr
maybeAttrib :: Attr -> Token -> Maybe Attr
maybeAttrib Attr
att (TagOpen AttrName
_ [Attr]
atts)
  | Attr
att Attr -> [Attr] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Attr]
atts = Attr -> Maybe Attr
forall a. a -> Maybe a
Just Attr
att
  | Bool
otherwise       = Maybe Attr
forall a. Maybe a
Nothing
maybeAttrib Attr
_ Token
t = [Char] -> Maybe Attr
forall a. HasCallStack => [Char] -> a
error ([Char]
"(" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ Token -> [Char]
forall a. Show a => a -> [Char]
show Token
t [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char]
") is not a TagOpen")

infixl 9 ~==
-- | Performs an inexact match, the first item should be the thing to
-- match.
--
-- >>> ContentText "test" ~== ContentText ""
-- True
--
-- >>> TagOpen "div" [Attr "class" "division ", Attr "id" "dd"] ~== TagOpen "div" [Attr "class" "division "]
-- True
(~==) :: Token -> Token -> Bool
~== :: Token -> Token -> Bool
(~==) = Token -> Token -> Bool
f
 where
  f :: Token -> Token -> Bool
f (ContentText AttrName
y) (ContentText AttrName
x) = AttrName -> Bool
T.null AttrName
x             Bool -> Bool -> Bool
|| AttrName
x AttrName -> AttrName -> Bool
forall a. Eq a => a -> a -> Bool
== AttrName
y
  f (TagClose    AttrName
y) (TagClose    AttrName
x) = AttrName -> Bool
T.null AttrName
x             Bool -> Bool -> Bool
|| AttrName
x AttrName -> AttrName -> Bool
forall a. Eq a => a -> a -> Bool
== AttrName
y
  f (Comment     Builder
x) (Comment     Builder
y) = Builder
x Builder -> Builder -> Bool
forall a. Eq a => a -> a -> Bool
== Builder
forall a. Monoid a => a
mempty          Bool -> Bool -> Bool
|| Builder
x Builder -> Builder -> Bool
forall a. Eq a => a -> a -> Bool
== Builder
y
  f (TagOpen  AttrName
y [Attr]
ys) (TagOpen  AttrName
x [Attr]
xs) = (AttrName -> Bool
T.null AttrName
x Bool -> Bool -> Bool
|| AttrName
x AttrName -> AttrName -> Bool
forall a. Eq a => a -> a -> Bool
== AttrName
y) Bool -> Bool -> Bool
&& (Attr -> Bool) -> [Attr] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all Attr -> Bool
g [Attr]
xs
   where
    g :: Attr -> Bool
    g :: Attr -> Bool
g nv :: Attr
nv@(Attr AttrName
name AttrName
val)
      | AttrName -> Bool
T.null AttrName
name = AttrName
val  AttrName -> [AttrName] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` (Attr -> AttrName) -> [Attr] -> [AttrName]
forall a b. (a -> b) -> [a] -> [b]
map (\(Attr AttrName
o AttrName
_) -> AttrName
o) [Attr]
ys
      | AttrName -> Bool
T.null AttrName
val  = AttrName
name AttrName -> [AttrName] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` (Attr -> AttrName) -> [Attr] -> [AttrName]
forall a b. (a -> b) -> [a] -> [b]
map (\(Attr AttrName
_ AttrName
t) -> AttrName
t) [Attr]
ys
      | Bool
otherwise   = Attr
nv   Attr -> [Attr] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Attr]
ys
  f Token
_ Token
_ = Bool
False

infixl 9 ~/=
-- | Negation of '(~==)'.
(~/=) :: Token -> Token -> Bool
~/= :: Token -> Token -> Bool
(~/=) Token
a Token
b = Bool -> Bool
not (Token
a Token -> Token -> Bool
~== Token
b)