{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
module Readability.Title
( title,
shortTitle,
)
where
import Data.Foldable (find, maximumBy, minimumBy)
import qualified Data.Map.Strict as Map
import Data.Maybe (fromMaybe, listToMaybe)
import Data.Ord (comparing)
import qualified Data.Text as T
import Text.XML
import Text.XML.Cursor
title :: Document -> Maybe T.Text
title doc =
fmap (T.unwords . T.words) $
find (not . T.null) $
fromDocument doc $// checkName (== "title") &/ content
shortTitle :: Document -> Maybe T.Text
shortTitle doc = find (\c -> T.length c >= 15 && T.length c <= 150) (findShortTitle <$> title doc)
where
findShortTitle t = case candidates t of
[] -> extractTitle t
cs -> minimumBy (comparing T.length) cs
candidates t =
filter
( \c ->
length (T.words c) >= 2
&& T.length c >= 15
&& c `T.isInfixOf` t
)
$ fromDocument doc $// checkElement maybeTitle &/ content
maybeTitle :: Element -> Bool
maybeTitle Element {..} =
elementName `elem` ["h1", "h2", "h3"]
|| maybe False (`elem` ["title", "head", "heading"]) (Map.lookup "id" elementAttributes)
|| maybe False (any (`elem` ["pageTitle", "news_article", "title", "head", "heading", "contentheading", "small_header_red"]) . T.words) (Map.lookup "class" elementAttributes)
extractTitle :: T.Text -> T.Text
extractTitle t = fromMaybe z y
where
delim = [" | ", " - ", " :: ", " – ", " — ", " / "]
x d = T.unwords $ maximumBy (comparing length) $ filter (\l -> length l >= 4) $ T.words <$> T.splitOn d t
y = listToMaybe (x <$> filter (`T.isInfixOf` t) delim)
z =
let (_, s) = T.breakOnEnd t ": "
in if length (T.words s) >= 4 then s else T.drop 2 (fst $ T.breakOn t ": ")