{-# LANGUAGE OverloadedStrings #-} module Readability.Internal ( Readability.Internal.summary, Readability.Title.shortTitle, Readability.Title.title, rootSummary, ) where import qualified Data.Map.Strict as Map import qualified Data.Text as T import Readability.Clean import Readability.Helper import Readability.Metrics import Readability.Title import Readability.Types import Text.XML import Text.XML.Cursor import Prelude hiding (div) summary :: Settings -> Document -> Maybe Document summary s doc = document <$> rootSummary s True (documentRoot doc) rootSummary :: Settings -> Bool -> Element -> Maybe Element rootSummary s ruthless root = if ruthless && T.length (innerText clean) <= 250 then rootSummary s False root else cleanedArticle where clean = fromNode $ paradivs $ NodeElement (cleanElement ruthless root) candidates = clean $// checkName (`elem` ["p", "pre", "td"]) cleanedArticle = case scoreParagraphs candidates of Nothing -> Nothing Just scores -> let (bestCursor, score) = maxScore scores article = getArticle bestCursor score scores in sanitizeNode s scores article >>= getElement getArticle :: Cursor -> Double -> Scores -> Node getArticle best score scores = html $ body $ div $ node <$> filter f siblings where siblings = precedingSibling best ++ best : followingSibling best threshold = 10 `max` score * 0.2 f c = node c == node best || any (>= threshold) (lookupScore (node c) scores) || (any (\e -> elementName e == "p") (getElement (node c)) && textualSibling c) html, body :: Node -> Node html n = NodeElement $ Element "html" Map.empty [n] body n = NodeElement $ Element "body" Map.empty [n] div :: [Node] -> Node div ns = NodeElement $ Element "div" Map.empty ns document :: Element -> Document document e = Document (Prologue [] Nothing []) e []