{-# LANGUAGE DataKinds #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DerivingStrategies #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE PolyKinds #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TypeApplications #-} {-# LANGUAGE TypeFamilies #-} {-# LANGUAGE TypeOperators #-} module Servant.Seo.Sitemap where import Control.Lens import Control.Monad.IO.Class (MonadIO) import qualified Data.Binary.Builder as Builder import Data.ByteString.Builder (Builder) import qualified Data.ByteString.Lazy as BSL import Data.Coerce (coerce) import qualified Data.Foldable as F import Data.Map.Strict (Map) import qualified Data.Map.Strict as Map import Data.Maybe (catMaybes) import Data.String (IsString) import Data.Text (Text) import qualified Data.Text as Text import qualified Data.Text.Encoding as Text import GHC.Generics (Generic) import GHC.TypeLits (KnownNat, KnownSymbol, natVal, symbolVal) import Numeric (showFFloat) import Servant import Servant.HTML.Blaze (HTML) import Text.Blaze (ToMarkup) import Text.XML import Servant.Seo.Combinators -- * Sitemap -- ** SitemapInfo -- | Intermediate structure representing @sitemap.xml@ file. -- During compilation API analysis performed. -- All 'HTML' pages with 'Get' verb would be populated into 'SitemapInfo' and then updated to list of URLs, unless 'Disallow' happened in the path to page. -- It consists of list of API branches and internal flag. data SitemapInfo = SitemapInfo { _sitemapInfoEntries :: [SitemapEntry] , _sitemapInfoPresent :: Maybe () } deriving (Show, Eq, Generic, Ord) instance Monoid SitemapInfo where mempty = SitemapInfo [] Nothing instance Semigroup SitemapInfo where SitemapInfo e1 p1 <> SitemapInfo e2 p2 = SitemapInfo (e1 <> e2) (p1 <> p2) -- | Represents single API branch. -- Could contain multiple values based on user decision. -- See 'ToSitemapParamPart' or 'ToSitemapPathPiece' for more details. -- Also contain optional 'Frequency' and 'Priority'. data SitemapEntry = SitemapEntry { _sitemapPathPieces :: [PathPiece] , _sitemapQueryParts :: [(ParamName, [ParamValue])] , _sitemapFrequency :: Maybe Period , _sitemapPriority :: Maybe Text } deriving (Show, Eq, Generic, Ord) instance Monoid SitemapEntry where mempty = SitemapEntry [] [] Nothing Nothing instance Semigroup SitemapEntry where SitemapEntry p1 q1 f1 r1 <> SitemapEntry p2 q2 f2 r2 = SitemapEntry (p1 <> p2) (q1 <> q2) (maximum [f1, f2]) (maximum [r1, r2]) newtype ParamName = ParamName Text deriving stock (Show, Eq, Generic) deriving newtype (Ord, ToMarkup, ToHttpApiData) newtype ParamValue = ParamValue Text deriving stock (Show, Eq, Generic) deriving newtype (Ord, ToMarkup, ToHttpApiData) -- | Could be either path piece obtained from @path :: Symbol@ or list of possible captured values provided by user. data PathPiece = UrlPathPiece Text | CaptureValues [Text] deriving (Show, Eq, Generic, Ord) -- | Checks that path piece is contained. isEmpty :: PathPiece -> Bool isEmpty (UrlPathPiece "") = True isEmpty (CaptureValues []) = True isEmpty _ = False -- ** User dependent decisions for sitemap. -- | How to get all possible values that should (or should not) be the parts of sitemap from query parameter values. -- -- Example (in combination with @Get '[HTML]@ will produce sitemap URLs): -- -- >>> newtype ArticleId = ArticleId Int deriving newtype (Show, Eq, Enum, Ord, Num, ToHttpApiData) -- >>> instance ToSitemapParamPart ArticleId where getParamsForIndexing _ _ = pure [0..100500] -- -- Another example (params that have no affect on sitemap): -- -- >>> newtype SortBy = SortBy Text deriving newtype (Show, Eq, ToHttpApiData) -- >>> instance ToSitemapParamPart SortBy where getParamsForIndexing _ _ = pure mempty -- class ToHttpApiData a => ToSitemapParamPart a where -- ^ Should be provided by user. getParamsForIndexing :: MonadIO m => Proxy a -> app -> m [a] getParamsForIndexing _ _ = pure mempty -- ^ Default implementation of conversion to '[ParamValue]'. toParamPart :: MonadIO m => Proxy a -> env -> m [ParamValue] toParamPart proxy env = do values <- getParamsForIndexing proxy env pure $ ParamValue . toUrlPiece <$> values -- | How to get all possible captured values that should (or should not) be the parts of sitemap. -- -- Example (in combination with @Get '[HTML]@ will produce sitemap URLs): -- -- >>> newtype UserId = UserId Int deriving newtype (Show, Eq, Enum, Ord, Num, ToHttpApiData) -- >>> instance ToSitemapPathPiece UserId where getPathPiecesForIndexing _ _ = pure [1..200000] -- -- Another example (captured values that have no affect on sitemap): -- -- >>> newtype Username = Username Text deriving newtype (Show, Eq, ToHttpApiData) -- >>> instance ToSitemapPathPiece Username where getPathPiecesForIndexing _ _ = pure mempty -- class ToHttpApiData a => ToSitemapPathPiece a where -- ^ Should be provided by user. getPathPiecesForIndexing :: MonadIO m => Proxy a -> app -> m [a] getPathPiecesForIndexing _ _ = pure mempty -- ^ Default implementation of conversion to 'PathPiece'. toPathPiece :: MonadIO m => Proxy a -> env -> m PathPiece toPathPiece proxy env = do values <- getPathPiecesForIndexing proxy env pure $ CaptureValues (toUrlPiece <$> values) makeLenses ''SitemapInfo makeLenses ''SitemapEntry -- ** Transforming API to @SitemapInfo@ -- | Servant API extension. -- It describes how to build 'SitemapInfo' representation from servant API. -- There are plenty of types that add nothing to it. -- -- __WARNING__: Do not derive this using @DeriveAnyClass@ as the generated -- instance will loop indefinitely. class HasSitemap a where toSitemapInfo :: MonadIO m => Proxy a -> m SitemapInfo toSitemapInfo proxy = toSitemapInfoWith () proxy toSitemapInfoWith :: MonadIO m => env -> Proxy a -> m SitemapInfo toSitemapInfoWith _ = toSitemapInfo {-# MINIMAL toSitemapInfo | toSitemapInfoWith #-} instance HasSitemap Raw where toSitemapInfo _ = pure mempty instance HasSitemap EmptyAPI where toSitemapInfo _ = pure mempty -- | Collect multiple API branches together. instance (HasSitemap a, HasSitemap b) => HasSitemap (a :<|> b) where toSitemapInfo _ = do sitemapA <- toSitemapInfo (Proxy :: Proxy a) sitemapB <- toSitemapInfo (Proxy :: Proxy b) pure (sitemapA <> sitemapB) instance HasSitemap sub => HasSitemap (WithNamedContext x c sub) where toSitemapInfo _ = toSitemapInfo (Proxy :: Proxy sub) instance HasSitemap sub => HasSitemap (HttpVersion :> sub) where toSitemapInfo _ = toSitemapInfo (Proxy :: Proxy sub) instance HasSitemap sub => HasSitemap (StreamBody' mods fr ct a :> sub) where toSitemapInfo _ = toSitemapInfo (Proxy :: Proxy sub) instance HasSitemap sub => HasSitemap (ReqBody' mods cs a :> sub) where toSitemapInfo _ = toSitemapInfo (Proxy :: Proxy sub) instance HasSitemap sub => HasSitemap (RemoteHost :> sub) where toSitemapInfo _ = toSitemapInfo (Proxy :: Proxy sub) -- | Extract all possible values under 'QueryParam'' and append them if sitemap is present for this particular branch. instance (ToSitemapParamPart a, HasSitemap sub, KnownSymbol sym) => HasSitemap (QueryParam' mods sym a :> sub) where toSitemapInfoWith env _ = do sitemap <- toSitemapInfo (Proxy :: Proxy sub) vals <- toParamPart (Proxy :: Proxy a) env let newSitemap | sitemap == mempty = mempty | null vals = sitemap | otherwise = sitemap & sitemapInfoEntries .~ newEntries sitemap vals pure newSitemap where param = (ParamName . Text.pack . symbolVal) (Proxy :: Proxy sym) addQueryParts paramName paramValues x = x & sitemapQueryParts .~ new where new = x ^. sitemapQueryParts . to ((paramName, paramValues) :) newEntries x vals = if x ^. sitemapInfoEntries . to null then [addQueryParts param vals mempty] else x ^. sitemapInfoEntries . to (fmap (addQueryParts param vals)) instance HasSitemap sub => HasSitemap (QueryFlag sym :> sub) where toSitemapInfo _ = toSitemapInfo (Proxy :: Proxy sub) instance HasSitemap sub => HasSitemap (Header' mods sym a :> sub) where toSitemapInfo _ = toSitemapInfo (Proxy :: Proxy sub) instance HasSitemap sub => HasSitemap (IsSecure :> sub) where toSitemapInfo _ = toSitemapInfo (Proxy :: Proxy sub) instance HasSitemap api => HasSitemap (Summary desc :> api) where toSitemapInfo _ = toSitemapInfo (Proxy :: Proxy api) instance HasSitemap api => HasSitemap (Description desc :> api) where toSitemapInfo _ = toSitemapInfo (Proxy :: Proxy api) -- | Extract all possible values under 'Capture'' and append them if sitemap is present for this particular branch. instance (HasSitemap sub, ToSitemapPathPiece a) => HasSitemap (Capture' mods sym a :> sub) where toSitemapInfoWith env _ = do sitemap <- toSitemapInfo (Proxy :: Proxy sub) value <- toPathPiece (Proxy @a) env pure $ if sitemap == mempty then mempty else if isEmpty value then sitemap else sitemap & sitemapInfoEntries .~ newEntries sitemap value where addPathPieces capturedValue x = x & sitemapPathPieces .~ new where new = x ^. sitemapPathPieces . to (capturedValue :) newEntries x capturedValue = if x ^. sitemapInfoEntries . to null then [addPathPieces capturedValue mempty] else x ^. sitemapInfoEntries . to (fmap (addPathPieces capturedValue)) instance HasSitemap sub => HasSitemap (CaptureAll sym a :> sub) where toSitemapInfo _ = toSitemapInfo (Proxy :: Proxy sub) instance HasSitemap sub => HasSitemap (Vault :> sub) where toSitemapInfo _ = toSitemapInfo (Proxy :: Proxy sub) instance (HasSitemap sub, KnownSymbol sym) => HasSitemap (sym :> sub) where toSitemapInfo _ = do sitemap <- toSitemapInfo (Proxy :: Proxy sub) pure $ if sitemap == mempty then mempty else sitemap & sitemapInfoEntries .~ newEntries sitemap where newEntries x = if x ^. sitemapInfoEntries . to null then [addPathPiece piece mempty] else x ^. sitemapInfoEntries . to (fmap (addPathPiece piece)) piece = (UrlPathPiece . Text.pack . symbolVal) (Proxy :: Proxy sym) addPathPiece p x = x & sitemapPathPieces .~ new where new = x ^. sitemapPathPieces . to (p :) -- | 'Disallow' combinator invalidates sitemap for particular API branch. instance (HasSitemap sub, KnownSymbol sym) => HasSitemap (Disallow sym :> sub) where toSitemapInfo _ = pure mempty instance {-# OVERLAPPABLE #-} (KnownNat status) => HasSitemap (Verb method status cs a) where toSitemapInfo _ = pure mempty -- | @Get '[HTML]@ enables sitemap for particular API branch. instance {-# OVERLAPPING #-} (ToMarkup a) => HasSitemap (Get '[HTML] a) where toSitemapInfo _ = pure (SitemapInfo [mempty] (Just ())) -- | Extracts 'Frequency' from API branch. instance (HasPeriod period, HasSitemap api) => HasSitemap (Frequency period :> api) where -- FIXME: compare with previous values, choose frequent one. toSitemapInfo _ = do sitemap <- toSitemapInfo (Proxy :: Proxy api) pure $ sitemap & sitemapInfoEntries . each %~ (sitemapFrequency . _Just .~ period') where period' = getPeriod (Proxy :: Proxy period) -- | Extracts 'Priority' from API branch. instance (KnownNat n, KnownNat m, HasSitemap api) => HasSitemap (Priority '(n, m) :> api) where toSitemapInfo _ = do sitemap <- toSitemapInfo (Proxy :: Proxy api) -- FIXME: compare with previous values, choose greater one. pure $ sitemap & sitemapInfoEntries . each %~ (sitemapPriority . _Just .~ priority') where n' = natVal (Proxy :: Proxy n) & fromInteger @Float m' = natVal (Proxy :: Proxy m) & fromInteger @Float priority' = (n' * 10 + m') / 10 & min 1.0 & showFloat & Text.pack showFloat x = showFFloat (Just 1) x "" -- ** Rendering -- | Populated during 'SitemapInfo' processing. -- 'SitemapUrl' would be rendered in XML node in runtime. -- -- Example: @\\https:\/\/example.com\/some\/url\<\/loc\>@. data SitemapUrl = SitemapUrl { _sitemapUrlLoc :: [SitemapLoc] , _sitemapUrlFrequency :: Maybe Period , _sitemapUrlPriority :: Maybe Text } deriving (Show, Eq, Generic, Ord) -- | Represents single URL listed in @sitemap.xml@. newtype SitemapLoc = SitemapLoc Text deriving stock (Show, Eq, Generic) deriving newtype (Ord, ToMarkup) -- | Server prefix without trailing slash. newtype ServerUrl = ServerUrl Text deriving stock (Show, Eq, Generic) deriving newtype (Ord, ToMarkup, IsString) -- | If sitemap consists of more than 50000 URLs, it should be indexed. type SitemapIndex = (Int, [SitemapUrl]) newtype SitemapIx = SitemapIx Int deriving stock (Generic) deriving newtype (Eq, Ord, Show, FromHttpApiData, ToHttpApiData) instance ToSitemapPathPiece SitemapIx makeLenses ''SitemapUrl -- | Transform single sitemap entry to list of URLs. sitemapEntryToUrlList :: ServerUrl -> SitemapEntry -> SitemapUrl sitemapEntryToUrlList server SitemapEntry {..} = SitemapUrl { _sitemapUrlLoc = locs , _sitemapUrlFrequency = _sitemapFrequency , _sitemapUrlPriority = _sitemapPriority } where locs = case (null paths, null queries) of (False, False) -> [ SitemapLoc (coerce server <> path <> query) | path <- paths, query <- queries ] (True, False) -> [ SitemapLoc (coerce server <> "/" <> query) | query <- queries ] (False, True) -> [ SitemapLoc (coerce server <> path) | path <- paths ] _ -> [ SitemapLoc (coerce server) ] paths = _sitemapPathPieces & F.foldr combinePaths [] & fmap (escapeXmlEntities . toText) where combinePaths :: PathPiece -> [Builder] -> [Builder] combinePaths p [] = case p of UrlPathPiece piece -> [prepare piece] CaptureValues pieces -> prepare <$> pieces combinePaths p xs = case p of UrlPathPiece piece -> prepend piece <$> xs CaptureValues pieces -> concatMap (\piece -> prepend piece <$> xs) pieces prepare = Builder.append (Builder.fromByteString "/") . toEncodedUrlPiece prepend x = Builder.append (prepare x) queries = _sitemapQueryParts & F.foldl' combineParams [] & fmap (escapeXmlEntities . fixQuery . toText) where combineParams :: [Builder] -> (ParamName, [ParamValue]) -> [Builder] combineParams [] (param, values) = values & fmap toEncodedUrlPiece & fmap (combine param) combineParams xs paramWithValues = combineParams [] paramWithValues & concatMap (\piece -> Builder.append piece <$> xs) combine p v = Builder.fromByteString "&" <> toEncodedUrlPiece p <> Builder.fromByteString "=" <> v fixQuery q = if Text.null q then q else Text.cons '?' (Text.tail q) toText :: Builder -> Text toText = Text.decodeUtf8 . BSL.toStrict . Builder.toLazyByteString escapeXmlEntities txt = txt & Text.replace "&" "&" & Text.replace "'" "'" & Text.replace "\"" """ & Text.replace ">" ">" & Text.replace "<" "<" -- | Transform list of URLs to list of XML nodes. sitemapUrlToNodes :: SitemapUrl -> [Node] sitemapUrlToNodes SitemapUrl{..} = mkUrlNode "url" extra <$> _sitemapUrlLoc where extra = [ frequencyNode, priorityNode ] & catMaybes frequencyNode = fromFrequency <$> _sitemapUrlFrequency priorityNode = fromPriority <$> _sitemapUrlPriority fromFrequency = mkNodeWithContent "changefreq" . Text.toLower . Text.pack . show fromPriority = mkNodeWithContent "priority" . coerce -- | Several XML rendering helpers. mkNode :: Name -> [Node] -> Node mkNode name childNodes = NodeElement (Element name Map.empty childNodes) mkNodeWithContent :: Name -> Text -> Node mkNodeWithContent name content = mkNode name [NodeContent content] mkUrlNode :: Name -> [Node] -> SitemapLoc -> Node mkUrlNode name extra loc = mkNode name (locNode : extra) where locNode = mkNodeWithContent "loc" (coerce loc) -- | Transform list of URLs to sitemap index XML nodes. sitemapIndexUrlToNodes :: SitemapUrl -> [Node] sitemapIndexUrlToNodes SitemapUrl{..} = mkUrlNode "sitemap" [] <$> _sitemapUrlLoc -- | Transform bunch of list of URLs to XML document. sitemapUrlsToDocument :: ServerUrl -> [SitemapUrl] -> Document sitemapUrlsToDocument server urlparts = Document { documentPrologue = Prologue [] Nothing [] , documentRoot = Element "urlset" (Map.fromList [namespace]) childNodes , documentEpilogue = [] } where namespace = ("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9") childNodes = concatMap sitemapUrlToNodes urlparts -- | Transform list of sitemap index URLs to XML document. sitemapIndexToDocument :: ServerUrl -> [SitemapUrl] -> Document sitemapIndexToDocument server urlparts = Document { documentPrologue = Prologue [] Nothing [] , documentRoot = Element "sitemapindex" (Map.fromList [namespace]) childNodes , documentEpilogue = [] } where namespace = ("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9") childNodes = concatMap sitemapIndexUrlToNodes urlparts sitemapUrlsToRootLBS :: ServerUrl -> [SitemapUrl] -> BSL.ByteString sitemapUrlsToRootLBS serverUrl urls = if urls & concatMap _sitemapUrlLoc & length & (<= 50000) then render sitemapUrlsToDocument urls else render sitemapIndexToDocument indeces where render x ys = renderSitemapWith x serverUrl ys indeces = urls & concatMap countLocGroups & length & pred & mkIndexList & fmap mkIndexUrl countLocGroups :: SitemapUrl -> [Int] countLocGroups x = x & _sitemapUrlLoc & length & (realToFrac @_ @Double) & (/ 50000) & truncate & mkIndexList mkIndexList x = [0 .. x] mkIndexUrl x = SitemapUrl { _sitemapUrlLoc = pure $ SitemapLoc $ coerce serverUrl <> "/" <> Text.pack (show x) <> "/sitemap.xml" , _sitemapUrlFrequency = Nothing , _sitemapUrlPriority = Nothing } sitemapUrlsToSitemapMap :: ServerUrl -> [SitemapUrl] -> Map Int BSL.ByteString sitemapUrlsToSitemapMap serverUrl urls = urls & concatMap (splitUrlTo50KLocs []) & zip [0..] & Map.fromList where splitUrlTo50KLocs xs s = if s ^. sitemapUrlLoc . to length . to (<= 50000) then mkLBS [s] : xs else splitUrlTo50KLocs (currentLocsSitemap : xs) restLocsSitemap where currentLocsSitemap = mkLBS [s & sitemapUrlLoc %~ take 50000] restLocsSitemap = s & sitemapUrlLoc %~ drop 50000 mkLBS = renderSitemapWith sitemapUrlsToDocument serverUrl renderSitemapWith :: (ServerUrl -> [SitemapUrl] -> Document) -> ServerUrl -> [SitemapUrl] -> BSL.ByteString renderSitemapWith renderer serverUrl urls = renderer serverUrl urls & renderLBS def -- $setup -- >>> :set -XDerivingStrategies -XGeneralizedNewtypeDeriving -- >>>