{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}

module Readability.Title
  ( title,
    shortTitle,
  )
where

import Data.Foldable (find, maximumBy, minimumBy)
import qualified Data.Map.Strict as Map
import Data.Maybe (fromMaybe, listToMaybe)
import Data.Ord (comparing)
import qualified Data.Text as T
import Text.XML
import Text.XML.Cursor

-- | Extracts title (@<title>@) from HTML document and normalizes by removing exceeding white spaces.
title :: Document -> Maybe T.Text
title doc =
  fmap (T.unwords . T.words) $
    find (not . T.null) $
      fromDocument doc $// checkName (== "title") &/ content

-- | Attempts to find shortened version of title without page name.
shortTitle :: Document -> Maybe T.Text
shortTitle doc = find (\c -> T.length c >= 15 && T.length c <= 150) (findShortTitle <$> title doc)
  where
    findShortTitle t = case candidates t of
      [] -> extractTitle t
      cs -> minimumBy (comparing T.length) cs
    candidates t =
      filter
        ( \c ->
            length (T.words c) >= 2
              && T.length c >= 15
              && c `T.isInfixOf` t
        )
        $ fromDocument doc $// checkElement maybeTitle &/ content

-- | Determines whether an element can hold a title of article.
-- These are considered as potential candidates (as CSS selector):
--  * h1, h2, h3
--  * #title, #head, #heading
--  * .pageTitle, .news_title, .title, .head, .heading, .contentheading, .small_header_red
maybeTitle :: Element -> Bool
maybeTitle Element {..} =
  elementName `elem` ["h1", "h2", "h3"]
    || maybe False (`elem` ["title", "head", "heading"]) (Map.lookup "id" elementAttributes)
    || maybe False (any (`elem` ["pageTitle", "news_article", "title", "head", "heading", "contentheading", "small_header_red"]) . T.words) (Map.lookup "class" elementAttributes)

-- | Attempts to extract article name from page title, for example "Chancellor Alistair Darling on brink of second bailout for banks | The Times" would detect @|@ as delimiter and would return "Chancellor Alistair Darling on brink of second bailout for banks".
extractTitle :: T.Text -> T.Text
extractTitle t = fromMaybe z y
  where
    delim = [" | ", " - ", " :: ", " – ", " — ", " / "]
    x d = T.unwords $ maximumBy (comparing length) $ filter (\l -> length l >= 4) $ T.words <$> T.splitOn d t
    y = listToMaybe (x <$> filter (`T.isInfixOf` t) delim)
    z =
      let (_, s) = T.breakOnEnd t ": "
       in if length (T.words s) >= 4 then s else T.drop 2 (fst $ T.breakOn t ": ")