{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
module Readability.Metrics
( classWeight,
contentScore,
linkDensity,
scoreAncestor,
scoreParagraph,
scoreParagraphs,
textualSibling,
)
where
import qualified Data.Map.Strict as Map
import Data.Monoid (Sum (..))
import Data.Text (Text)
import qualified Data.Text as T
import Readability.Helper
import Readability.Types
import Text.XML
import Text.XML.Cursor
positive :: Text -> Bool
positive t = any (`T.isInfixOf` t) ["article", "body", "content", "entry", "hentry", "main", "page", "pagination", "post", "text", "blog", "story"]
negative :: Text -> Bool
negative t = any (`T.isInfixOf` t) ["combx", "comment", "com-", "contact", "foot", "footer", "footnote", "masthead", "media", "meta", "outbrain", "promo", "related", "scroll", "shoutbox", "sidebar", "sponsor", "shopping", "tags", "tool", "widget"]
linkDensity :: Cursor -> Double
linkDensity c = links / (total `max` 1)
where
len = fromIntegral . T.length . T.strip
total = len $ innerText c
links = len $ T.concat $ c $// element "a" &// content'
classWeight :: Element -> Double
classWeight e = maybe 0 getSum (sa "class" `mappend` sa "id")
where
sa :: Name -> Maybe (Sum Double)
sa attr = Sum . st <$> Map.lookup attr (elementAttributes e)
st :: Text -> Double
st t
| positive t = 25
| negative t = -25
st _ = 0
contentScore :: Cursor -> Maybe Double
contentScore c = if len < 25 then Nothing else Just score
where
text = innerText c
len = T.length text
score =
1
+ fromIntegral (length $ T.split (== ',') text)
+ min 3 (fromIntegral len / 100)
scoreAncestor :: Cursor -> Double
scoreAncestor cursor = maybe 0 (\e -> score e + classWeight e) elm
where
elm = getElement $ node cursor
score :: Element -> Double
score e = case T.toLower $ nameLocalName $ elementName e of
n
| n `elem` ["div", "article"] -> 5
| n `elem` ["pre", "td", "blockquote"] -> 3
| n `elem` ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"] -> -3
| n `elem` ["h1", "h2", "h3", "h4", "h5", "h6", "th", "header", "footer", "nav"] -> -5
_ -> 0
scoreParagraphs :: [Cursor] -> Maybe Scores
scoreParagraphs cs = if nullScores scores then Nothing else Just scores
where
scores = mapScores (\c s -> (c, s * (1 - linkDensity c))) $ foldl scoreParagraph emptyScores cs
scoreParagraph :: Scores -> Cursor -> Scores
scoreParagraph scores cursor =
case contentScore cursor of
Nothing -> scores
Just score ->
case take 2 $ ancestor cursor of
[pc, gpc] -> insert (scoreAncestor pc) score pc $ insert (scoreAncestor gpc) (score / 2) gpc scores
[pc] -> insert (scoreAncestor pc) score pc scores
_ -> scores
where
insert ns cs c =
alterScores
( \case
Nothing -> Just (c, ns + cs)
Just (c', v) -> Just (c', v + cs)
)
(node c)
textualSibling :: Cursor -> Bool
textualSibling c = (nodelen > 80 && ld < 0.25) || (nodelen <= 80 && ld == 0 && (". " `T.isInfixOf` nodetxt || "." `T.isSuffixOf` nodetxt))
where
nodetxt = innerText c
nodelen = T.length nodetxt
ld = linkDensity c