{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}

module Readability.Metrics
  ( classWeight,
    contentScore,
    linkDensity,
    scoreAncestor,
    scoreParagraph,
    scoreParagraphs,
    textualSibling,
  )
where

import qualified Data.Map.Strict as Map
import Data.Monoid (Sum (..))
import Data.Text (Text)
import qualified Data.Text as T
import Readability.Helper
import Readability.Types
import Text.XML
import Text.XML.Cursor

positive :: Text -> Bool
positive t = any (`T.isInfixOf` t) ["article", "body", "content", "entry", "hentry", "main", "page", "pagination", "post", "text", "blog", "story"]

negative :: Text -> Bool
negative t = any (`T.isInfixOf` t) ["combx", "comment", "com-", "contact", "foot", "footer", "footnote", "masthead", "media", "meta", "outbrain", "promo", "related", "scroll", "shoutbox", "sidebar", "sponsor", "shopping", "tags", "tool", "widget"]

linkDensity :: Cursor -> Double
linkDensity c = links / (total `max` 1)
  where
    len = fromIntegral . T.length . T.strip
    total = len $ innerText c
    links = len $ T.concat $ c $// element "a" &// content'

classWeight :: Element -> Double
classWeight e = maybe 0 getSum (sa "class" `mappend` sa "id")
  where
    sa :: Name -> Maybe (Sum Double)
    sa attr = Sum . st <$> Map.lookup attr (elementAttributes e)
    st :: Text -> Double
    st t
      | positive t = 25
      | negative t = -25
    st _ = 0

contentScore :: Cursor -> Maybe Double
contentScore c = if len < 25 then Nothing else Just score
  where
    text = innerText c
    -- TODO clean(elem.text_content() or "")
    len = T.length text
    score =
      1
        + fromIntegral (length $ T.split (== ',') text)
        + min 3 (fromIntegral len / 100)

scoreAncestor :: Cursor -> Double
scoreAncestor cursor = maybe 0 (\e -> score e + classWeight e) elm
  where
    elm = getElement $ node cursor
    score :: Element -> Double
    score e = case T.toLower $ nameLocalName $ elementName e of
      n
        | n `elem` ["div", "article"] -> 5
        | n `elem` ["pre", "td", "blockquote"] -> 3
        | n `elem` ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"] -> -3
        | n `elem` ["h1", "h2", "h3", "h4", "h5", "h6", "th", "header", "footer", "nav"] -> -5
      _ -> 0

scoreParagraphs :: [Cursor] -> Maybe Scores
scoreParagraphs cs = if nullScores scores then Nothing else Just scores
  where
    scores = mapScores (\c s -> (c, s * (1 - linkDensity c))) $ foldl scoreParagraph emptyScores cs

scoreParagraph :: Scores -> Cursor -> Scores
scoreParagraph scores cursor =
  case contentScore cursor of
    Nothing -> scores
    Just score ->
      case take 2 $ ancestor cursor of
        [pc, gpc] -> insert (scoreAncestor pc) score pc $ insert (scoreAncestor gpc) (score / 2) gpc scores
        [pc] -> insert (scoreAncestor pc) score pc scores
        _ -> scores
  where
    insert ns cs c =
      alterScores
        ( \case
            Nothing -> Just (c, ns + cs)
            Just (c', v) -> Just (c', v + cs)
        )
        (node c)

textualSibling :: Cursor -> Bool
textualSibling c = (nodelen > 80 && ld < 0.25) || (nodelen <= 80 && ld == 0 && (". " `T.isInfixOf` nodetxt || "." `T.isSuffixOf` nodetxt))
  where
    nodetxt = innerText c
    nodelen = T.length nodetxt
    ld = linkDensity c