{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RecordWildCards #-} module Readability.Title ( title, shortTitle, ) where import Data.Foldable (find, maximumBy, minimumBy) import qualified Data.Map.Strict as Map import Data.Maybe (fromMaybe, listToMaybe) import Data.Ord (comparing) import qualified Data.Text as T import Text.XML import Text.XML.Cursor -- | Extracts title (@@) from HTML document and normalizes by removing exceeding white spaces. title :: Document -> Maybe T.Text title doc = fmap (T.unwords . T.words) $ find (not . T.null) $ fromDocument doc $// checkName (== "title") &/ content -- | Attempts to find shortened version of title without page name. shortTitle :: Document -> Maybe T.Text shortTitle doc = find (\c -> T.length c >= 15 && T.length c <= 150) (findShortTitle <$> title doc) where findShortTitle t = case candidates t of [] -> extractTitle t cs -> minimumBy (comparing T.length) cs candidates t = filter ( \c -> length (T.words c) >= 2 && T.length c >= 15 && c `T.isInfixOf` t ) $ fromDocument doc $// checkElement maybeTitle &/ content -- | Determines whether an element can hold a title of article. -- These are considered as potential candidates (as CSS selector): -- * h1, h2, h3 -- * #title, #head, #heading -- * .pageTitle, .news_title, .title, .head, .heading, .contentheading, .small_header_red maybeTitle :: Element -> Bool maybeTitle Element {..} = elementName `elem` ["h1", "h2", "h3"] || maybe False (`elem` ["title", "head", "heading"]) (Map.lookup "id" elementAttributes) || maybe False (any (`elem` ["pageTitle", "news_article", "title", "head", "heading", "contentheading", "small_header_red"]) . T.words) (Map.lookup "class" elementAttributes) -- | Attempts to extract article name from page title, for example "Chancellor Alistair Darling on brink of second bailout for banks | The Times" would detect @|@ as delimiter and would return "Chancellor Alistair Darling on brink of second bailout for banks". extractTitle :: T.Text -> T.Text extractTitle t = fromMaybe z y where delim = [" | ", " - ", " :: ", " – ", " — ", " / "] x d = T.unwords $ maximumBy (comparing length) $ filter (\l -> length l >= 4) $ T.words <$> T.splitOn d t y = listToMaybe (x <$> filter (`T.isInfixOf` t) delim) z = let (_, s) = T.breakOnEnd t ": " in if length (T.words s) >= 4 then s else T.drop 2 (fst $ T.breakOn t ": ")