{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
module Readability.Clean
( cleanElement,
paradivs,
sanitizeNode,
)
where
import qualified Data.Map.Strict as Map
import Data.Map.Strict (Map)
import Data.Maybe (fromMaybe, mapMaybe)
import qualified Data.Text as T
import Data.Text (Text)
import Readability.Helper
import Readability.Metrics
import Readability.Types
import Text.XML
import Text.XML.Cursor
sanitizeNode :: Settings -> Scores -> Node -> Maybe Node
sanitizeNode s _ n@(NodeElement e@(Element t as children))
| t `elem` ["h1", "h2", "h3", "h4", "h5", "h6"] =
guarded (const $ classWeight e >= 0 && linkDensity (fromNode n) <= 0.33) (NodeElement (Element t (sanitizeAttributes s as) children))
sanitizeNode _ _ (NodeElement e) | e `elin` ["form", "textarea"] = Nothing
sanitizeNode _ _ (NodeElement e)
| e `elin` ["iframe"] =
case Map.lookup "src" $ elementAttributes e of
Just v | "videoRe" `T.isInfixOf` v -> Just (NodeContent "VIDEO")
_ -> Nothing
sanitizeNode s m (NodeElement e@(Element t as children))
| e `elin` ["table", "ul", "div", "aside", "header", "footer", "section"] =
sanitizeContent m (NodeElement $ Element t (sanitizeAttributes s as) (mapMaybe (sanitizeNode s m) children))
sanitizeNode s m (NodeElement (Element t as children)) =
Just $ NodeElement $ Element t (sanitizeAttributes s as) (mapMaybe (sanitizeNode s m) children)
sanitizeNode _ _ c@(NodeContent _) = Just c
sanitizeNode _ _ _ = Nothing
sanitizeContent :: Scores -> Node -> Maybe Node
sanitizeContent m n
| score + weight < 0 = Nothing
| T.count "," txt >= 10 = Just n
| cntP > 0 && (fromIntegral cntImg :: Double) > 1.0 + fromIntegral cntP * 1.3 = Nothing
| cntLi > cntP && not isList = Nothing
| (fromIntegral cntInput :: Double) > fromIntegral cntP / 3 = Nothing
| len < 25 && cntImg == 0 = Nothing
| len < 25 && cntImg > 2 = Nothing
| weight < 25 && density > 0.2 = Nothing
| weight >= 25 && density > 0.5 = Nothing
| (cntEmbed == 1 && len < 75) || cntEmbed > 1 = Nothing
| len > 0 && preds + succs > 1000 = Just n
| otherwise = Just n
where
score = fromMaybe 0 $ lookupScore n m
weight = maybe 0 classWeight $ getElement n
cursor = fromMaybe (fromNode n) $ lookupCursor n m
txt = innerText cursor
density = linkDensity cursor
len = T.length txt
cntP = count "p" cursor
cntImg = count "img" cursor
cntLi = count "li" cursor - 100
cntEmbed = count "embed" cursor
cntInput = count "input" cursor - cntInputHidden
cntInputHidden = length (checkElement isHiddenInput cursor)
isList = maybe False (`elin` ["ol", "ul"]) $ getElement n
count :: Name -> Cursor -> Int
count en = length . checkName (== en)
preds = sum $ take 1 $ T.length . innerText <$> precedingSibling cursor
succs = sum $ take 1 $ T.length . innerText <$> followingSibling cursor
sanitizeAttributes :: Settings -> Map Name T.Text -> Map Name T.Text
sanitizeAttributes Settings {..} = Map.filterWithKey (\k _ -> not $ reRemoveAttributes k)
isHiddenInput :: Element -> Bool
isHiddenInput (Element n as _) = n == "input" && Map.lookup "type" as == Just "hidden"
cleanNode :: Bool -> Node -> Maybe Node
cleanNode _ (NodeElement e) | elementName e `elem` ["script", "style", "link"] = Nothing
cleanNode True (NodeElement e) | isUnlikely e = Nothing
cleanNode r (NodeElement e) = Just $ NodeElement $ cleanElement r e
cleanNode _ c@(NodeContent _) = Just c
cleanNode _ _ = Nothing
cleanElement :: Bool -> Element -> Element
cleanElement ruthless (Element t as children) =
Element
t
(Map.filterWithKey (\k _ -> k /= "style") as)
(mapMaybe (cleanNode ruthless) children)
unlikely :: Text -> Bool
unlikely t = any (`T.isInfixOf` t) ["combx", "comment", "community", "disqus", "extra", "foot", "header", "menu", "remark", "rss", "shoutbox", "sidebar", "sponsor", "ad-break", "agegate", "pagination", "pager", "popup", "tweet", "twitter"]
likely :: Text -> Bool
likely t = any (`T.isInfixOf` t) ["and", "article", "body", "column", "main", "shadow"]
isUnlikely :: Element -> Bool
isUnlikely e = maybe False unl (attr "class" <> Just " " <> attr "id")
where
attr = flip Map.lookup (elementAttributes e)
unl s = unlikely s && not (likely s) && ((nameLocalName . elementName) e `notElem` ["html", "body"])
paradivs :: Node -> Node
paradivs n@(NodeElement (Element "div" as children)) =
if null $ fromNode n $// checkName (`elem` ["blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul"])
then NodeElement $ Element "p" as children
else NodeElement $ Element "div" as (paradivs <$> children)
paradivs (NodeElement (Element n as children)) = NodeElement $ Element n as (paradivs <$> children)
paradivs e = e