{-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} module Readability.Metrics ( classWeight, contentScore, linkDensity, scoreAncestor, scoreParagraph, scoreParagraphs, textualSibling, ) where import qualified Data.Map.Strict as Map import Data.Monoid (Sum (..)) import Data.Text (Text) import qualified Data.Text as T import Readability.Helper import Readability.Types import Text.XML import Text.XML.Cursor positive :: Text -> Bool positive t = any (`T.isInfixOf` t) ["article", "body", "content", "entry", "hentry", "main", "page", "pagination", "post", "text", "blog", "story"] negative :: Text -> Bool negative t = any (`T.isInfixOf` t) ["combx", "comment", "com-", "contact", "foot", "footer", "footnote", "masthead", "media", "meta", "outbrain", "promo", "related", "scroll", "shoutbox", "sidebar", "sponsor", "shopping", "tags", "tool", "widget"] linkDensity :: Cursor -> Double linkDensity c = links / (total `max` 1) where len = fromIntegral . T.length . T.strip total = len $ innerText c links = len $ T.concat $ c $// element "a" &// content' classWeight :: Element -> Double classWeight e = maybe 0 getSum (sa "class" `mappend` sa "id") where sa :: Name -> Maybe (Sum Double) sa attr = Sum . st <$> Map.lookup attr (elementAttributes e) st :: Text -> Double st t | positive t = 25 | negative t = -25 st _ = 0 contentScore :: Cursor -> Maybe Double contentScore c = if len < 25 then Nothing else Just score where text = innerText c -- TODO clean(elem.text_content() or "") len = T.length text score = 1 + fromIntegral (length $ T.split (== ',') text) + min 3 (fromIntegral len / 100) scoreAncestor :: Cursor -> Double scoreAncestor cursor = maybe 0 (\e -> score e + classWeight e) elm where elm = getElement $ node cursor score :: Element -> Double score e = case T.toLower $ nameLocalName $ elementName e of n | n `elem` ["div", "article"] -> 5 | n `elem` ["pre", "td", "blockquote"] -> 3 | n `elem` ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"] -> -3 | n `elem` ["h1", "h2", "h3", "h4", "h5", "h6", "th", "header", "footer", "nav"] -> -5 _ -> 0 scoreParagraphs :: [Cursor] -> Maybe Scores scoreParagraphs cs = if nullScores scores then Nothing else Just scores where scores = mapScores (\c s -> (c, s * (1 - linkDensity c))) $ foldl scoreParagraph emptyScores cs scoreParagraph :: Scores -> Cursor -> Scores scoreParagraph scores cursor = case contentScore cursor of Nothing -> scores Just score -> case take 2 $ ancestor cursor of [pc, gpc] -> insert (scoreAncestor pc) score pc $ insert (scoreAncestor gpc) (score / 2) gpc scores [pc] -> insert (scoreAncestor pc) score pc scores _ -> scores where insert ns cs c = alterScores ( \case Nothing -> Just (c, ns + cs) Just (c', v) -> Just (c', v + cs) ) (node c) textualSibling :: Cursor -> Bool textualSibling c = (nodelen > 80 && ld < 0.25) || (nodelen <= 80 && ld == 0 && (". " `T.isInfixOf` nodetxt || "." `T.isSuffixOf` nodetxt)) where nodetxt = innerText c nodelen = T.length nodetxt ld = linkDensity c