{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}

module Readability.Clean
  ( cleanElement,
    paradivs,
    sanitizeNode,
  )
where

import qualified Data.Map.Strict as Map
import Data.Map.Strict (Map)
import Data.Maybe (fromMaybe, mapMaybe)
import qualified Data.Text as T
import Data.Text (Text)
import Readability.Helper
import Readability.Metrics
import Readability.Types
import Text.XML
import Text.XML.Cursor

-- | Remove elements that do not contribute to the article.
--
-- Removes:
--
--   * headings that are part of ads, menus, widgets etc.
--   * forms, textareas
--   * iframes
--   * block elements (div, section, table etc.) with enough textual content
--
-- Preserves:
--
--   * textual content
sanitizeNode :: Settings -> Scores -> Node -> Maybe Node
sanitizeNode s _ n@(NodeElement e@(Element t as children))
  | t `elem` ["h1", "h2", "h3", "h4", "h5", "h6"] =
    guarded (const $ classWeight e >= 0 && linkDensity (fromNode n) <= 0.33) (NodeElement (Element t (sanitizeAttributes s as) children))
sanitizeNode _ _ (NodeElement e) | e `elin` ["form", "textarea"] = Nothing
sanitizeNode _ _ (NodeElement e)
  | e `elin` ["iframe"] =
    case Map.lookup "src" $ elementAttributes e of
      Just v | "videoRe" `T.isInfixOf` v -> Just (NodeContent "VIDEO")
      _ -> Nothing
sanitizeNode s m (NodeElement e@(Element t as children))
  | e `elin` ["table", "ul", "div", "aside", "header", "footer", "section"] =
    sanitizeContent m (NodeElement $ Element t (sanitizeAttributes s as) (mapMaybe (sanitizeNode s m) children))
sanitizeNode s m (NodeElement (Element t as children)) =
  Just $ NodeElement $ Element t (sanitizeAttributes s as) (mapMaybe (sanitizeNode s m) children)
sanitizeNode _ _ c@(NodeContent _) = Just c
sanitizeNode _ _ _ = Nothing

-- | Remove content of block elements that do not have enough textual content.
sanitizeContent :: Scores -> Node -> Maybe Node
sanitizeContent m n
  | score + weight < 0 = Nothing
  | T.count "," txt >= 10 = Just n
  | cntP > 0 && (fromIntegral cntImg :: Double) > 1.0 + fromIntegral cntP * 1.3 = Nothing
  | cntLi > cntP && not isList = Nothing
  | (fromIntegral cntInput :: Double) > fromIntegral cntP / 3 = Nothing
  | len < 25 && cntImg == 0 = Nothing
  | len < 25 && cntImg > 2 = Nothing
  | weight < 25 && density > 0.2 = Nothing
  | weight >= 25 && density > 0.5 = Nothing
  | (cntEmbed == 1 && len < 75) || cntEmbed > 1 = Nothing
  | len > 0 && preds + succs > 1000 = Just n
  | otherwise = Just n
  where
    score = fromMaybe 0 $ lookupScore n m
    weight = maybe 0 classWeight $ getElement n
    cursor = fromMaybe (fromNode n) $ lookupCursor n m
    txt = innerText cursor
    density = linkDensity cursor
    len = T.length txt
    cntP = count "p" cursor
    cntImg = count "img" cursor
    cntLi = count "li" cursor - 100
    cntEmbed = count "embed" cursor
    cntInput = count "input" cursor - cntInputHidden
    cntInputHidden = length (checkElement isHiddenInput cursor)
    isList = maybe False (`elin` ["ol", "ul"]) $ getElement n
    count :: Name -> Cursor -> Int
    count en = length . checkName (== en)
    preds = sum $ take 1 $ T.length . innerText <$> precedingSibling cursor
    succs = sum $ take 1 $ T.length . innerText <$> followingSibling cursor

-- | Remove element attributes.
sanitizeAttributes :: Settings -> Map Name T.Text -> Map Name T.Text
sanitizeAttributes Settings {..} = Map.filterWithKey (\k _ -> not $ reRemoveAttributes k)

-- | Tests whether element is `input` of type `hidden`.
isHiddenInput :: Element -> Bool
isHiddenInput (Element n as _) = n == "input" && Map.lookup "type" as == Just "hidden"

cleanNode :: Bool -> Node -> Maybe Node
cleanNode _ (NodeElement e) | elementName e `elem` ["script", "style", "link"] = Nothing
cleanNode True (NodeElement e) | isUnlikely e = Nothing
cleanNode r (NodeElement e) = Just $ NodeElement $ cleanElement r e
cleanNode _ c@(NodeContent _) = Just c
cleanNode _ _ = Nothing

cleanElement :: Bool -> Element -> Element
cleanElement ruthless (Element t as children) =
  Element
    t
    (Map.filterWithKey (\k _ -> k /= "style") as)
    (mapMaybe (cleanNode ruthless) children)

unlikely :: Text -> Bool
unlikely t = any (`T.isInfixOf` t) ["combx", "comment", "community", "disqus", "extra", "foot", "header", "menu", "remark", "rss", "shoutbox", "sidebar", "sponsor", "ad-break", "agegate", "pagination", "pager", "popup", "tweet", "twitter"]

likely :: Text -> Bool
likely t = any (`T.isInfixOf` t) ["and", "article", "body", "column", "main", "shadow"]

isUnlikely :: Element -> Bool
isUnlikely e = maybe False unl (attr "class" <> Just " " <> attr "id")
  where
    attr = flip Map.lookup (elementAttributes e)
    unl s = unlikely s && not (likely s) && ((nameLocalName . elementName) e `notElem` ["html", "body"])

{- This works slightly differently than 'transform_misused_divs_into_paragraphs',
   mainly because I did not understand how that was supposed to work. But also,
   this implementation gives better results for our testcases. -}
paradivs :: Node -> Node
paradivs n@(NodeElement (Element "div" as children)) =
  if null $ fromNode n $// checkName (`elem` ["blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul"])
    then NodeElement $ Element "p" as children
    else NodeElement $ Element "div" as (paradivs <$> children)
paradivs (NodeElement (Element n as children)) = NodeElement $ Element n as (paradivs <$> children)
paradivs e = e