{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RecordWildCards #-} module Readability.Clean ( cleanElement, paradivs, sanitizeNode, ) where import qualified Data.Map.Strict as Map import Data.Map.Strict (Map) import Data.Maybe (fromMaybe, mapMaybe) import qualified Data.Text as T import Data.Text (Text) import Readability.Helper import Readability.Metrics import Readability.Types import Text.XML import Text.XML.Cursor -- | Remove elements that do not contribute to the article. -- -- Removes: -- -- * headings that are part of ads, menus, widgets etc. -- * forms, textareas -- * iframes -- * block elements (div, section, table etc.) with enough textual content -- -- Preserves: -- -- * textual content sanitizeNode :: Settings -> Scores -> Node -> Maybe Node sanitizeNode s _ n@(NodeElement e@(Element t as children)) | t `elem` ["h1", "h2", "h3", "h4", "h5", "h6"] = guarded (const $ classWeight e >= 0 && linkDensity (fromNode n) <= 0.33) (NodeElement (Element t (sanitizeAttributes s as) children)) sanitizeNode _ _ (NodeElement e) | e `elin` ["form", "textarea"] = Nothing sanitizeNode _ _ (NodeElement e) | e `elin` ["iframe"] = case Map.lookup "src" $ elementAttributes e of Just v | "videoRe" `T.isInfixOf` v -> Just (NodeContent "VIDEO") _ -> Nothing sanitizeNode s m (NodeElement e@(Element t as children)) | e `elin` ["table", "ul", "div", "aside", "header", "footer", "section"] = sanitizeContent m (NodeElement $ Element t (sanitizeAttributes s as) (mapMaybe (sanitizeNode s m) children)) sanitizeNode s m (NodeElement (Element t as children)) = Just $ NodeElement $ Element t (sanitizeAttributes s as) (mapMaybe (sanitizeNode s m) children) sanitizeNode _ _ c@(NodeContent _) = Just c sanitizeNode _ _ _ = Nothing -- | Remove content of block elements that do not have enough textual content. sanitizeContent :: Scores -> Node -> Maybe Node sanitizeContent m n | score + weight < 0 = Nothing | T.count "," txt >= 10 = Just n | cntP > 0 && (fromIntegral cntImg :: Double) > 1.0 + fromIntegral cntP * 1.3 = Nothing | cntLi > cntP && not isList = Nothing | (fromIntegral cntInput :: Double) > fromIntegral cntP / 3 = Nothing | len < 25 && cntImg == 0 = Nothing | len < 25 && cntImg > 2 = Nothing | weight < 25 && density > 0.2 = Nothing | weight >= 25 && density > 0.5 = Nothing | (cntEmbed == 1 && len < 75) || cntEmbed > 1 = Nothing | len > 0 && preds + succs > 1000 = Just n | otherwise = Just n where score = fromMaybe 0 $ lookupScore n m weight = maybe 0 classWeight $ getElement n cursor = fromMaybe (fromNode n) $ lookupCursor n m txt = innerText cursor density = linkDensity cursor len = T.length txt cntP = count "p" cursor cntImg = count "img" cursor cntLi = count "li" cursor - 100 cntEmbed = count "embed" cursor cntInput = count "input" cursor - cntInputHidden cntInputHidden = length (checkElement isHiddenInput cursor) isList = maybe False (`elin` ["ol", "ul"]) $ getElement n count :: Name -> Cursor -> Int count en = length . checkName (== en) preds = sum $ take 1 $ T.length . innerText <$> precedingSibling cursor succs = sum $ take 1 $ T.length . innerText <$> followingSibling cursor -- | Remove element attributes. sanitizeAttributes :: Settings -> Map Name T.Text -> Map Name T.Text sanitizeAttributes Settings {..} = Map.filterWithKey (\k _ -> not $ reRemoveAttributes k) -- | Tests whether element is `input` of type `hidden`. isHiddenInput :: Element -> Bool isHiddenInput (Element n as _) = n == "input" && Map.lookup "type" as == Just "hidden" cleanNode :: Bool -> Node -> Maybe Node cleanNode _ (NodeElement e) | elementName e `elem` ["script", "style", "link"] = Nothing cleanNode True (NodeElement e) | isUnlikely e = Nothing cleanNode r (NodeElement e) = Just $ NodeElement $ cleanElement r e cleanNode _ c@(NodeContent _) = Just c cleanNode _ _ = Nothing cleanElement :: Bool -> Element -> Element cleanElement ruthless (Element t as children) = Element t (Map.filterWithKey (\k _ -> k /= "style") as) (mapMaybe (cleanNode ruthless) children) unlikely :: Text -> Bool unlikely t = any (`T.isInfixOf` t) ["combx", "comment", "community", "disqus", "extra", "foot", "header", "menu", "remark", "rss", "shoutbox", "sidebar", "sponsor", "ad-break", "agegate", "pagination", "pager", "popup", "tweet", "twitter"] likely :: Text -> Bool likely t = any (`T.isInfixOf` t) ["and", "article", "body", "column", "main", "shadow"] isUnlikely :: Element -> Bool isUnlikely e = maybe False unl (attr "class" <> Just " " <> attr "id") where attr = flip Map.lookup (elementAttributes e) unl s = unlikely s && not (likely s) && ((nameLocalName . elementName) e `notElem` ["html", "body"]) {- This works slightly differently than 'transform_misused_divs_into_paragraphs', mainly because I did not understand how that was supposed to work. But also, this implementation gives better results for our testcases. -} paradivs :: Node -> Node paradivs n@(NodeElement (Element "div" as children)) = if null $ fromNode n $// checkName (`elem` ["blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul"]) then NodeElement $ Element "p" as children else NodeElement $ Element "div" as (paradivs <$> children) paradivs (NodeElement (Element n as children)) = NodeElement $ Element n as (paradivs <$> children) paradivs e = e