{-# LANGUAGE ExplicitForAll #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} module Hakyll.Convert.Blogger (FullPost (..), readPosts, distill) where import Control.Arrow import Control.Monad import Data.Function import Data.List import Data.Maybe import qualified Data.Text as T import Data.Time.Format (defaultTimeLocale, parseTimeM) import Data.XML.Types (Element (..), Name (..), Node (..), elementChildren) import Hakyll.Convert.Common import Text.Atom.Feed import Text.Atom.Feed.Import import qualified Text.XML as XML -- | A post and its comments data FullPost = FullPost { fpPost :: Entry, fpComments :: [Entry], fpUri :: T.Text } deriving (Show) -- | An entry is assumed to be either a post, or a comment. -- If it's a post, it should be associated with the URI -- that visitors would use to read the post (on the old blog) -- If it's a comment, it should be the URI for the corresponding -- post. data BloggerEntry = Post {beUri_ :: T.Text, beEntry :: Entry} | Comment {beUri_ :: T.Text, beEntry :: Entry} | Orphan {beEntry :: Entry} deriving (Show) beUri :: BloggerEntry -> Maybe T.Text beUri (Orphan _) = Nothing beUri (Post u _) = Just u beUri (Comment u _) = Just u -- --------------------------------------------------------------------- -- Feed to helper type -- --------------------------------------------------------------------- -- | Returns only published posts readPosts :: FilePath -> IO (Maybe [FullPost]) readPosts f = do doc <- XML.readFile (XML.def :: XML.ParseSettings) f let root = XML.toXMLElement $ XML.documentRoot doc return $ fmap (extractPosts . feedEntries) $ elementFeed $ deleteDrafts root -- has to be done on the XML level as our atom lib doesn't understand -- the blogger-specific XML for drafts deleteDrafts :: Element -> Element deleteDrafts e = e {elementNodes = filter isInnocent (elementNodes e)} where isInnocent (NodeElement element) = not (isDraft element) isInnocent _ = True isDraft :: Element -> Bool isDraft e = not . null $ findElements draft e where draft = Name { nameLocalName = "draft", nameNamespace = Just "http://purl.org/atom/app#", namePrefix = Just "app" } -- | Warning: this silently ignores orphans, templates, settings extractPosts :: [Entry] -> [FullPost] extractPosts entries = map toFullPost blocks where toFullPost (uri, blockEntries) = FullPost { fpPost = post, fpComments = comments, fpUri = uri } where post = case [e | Post _ e <- blockEntries] of [] -> huh "Block of entries with no post?!" [p] -> p _ps -> huh "Block of entries with more than one post?!" comments = [e | Comment _ e <- blockEntries] huh msg = error . unlines $ msg : map (txtToString . entryTitle . beEntry) blockEntries blocks = [(u, xs) | (Just u, xs) <- blocks_] -- drop orphans blocks_ = buckets beUri $ map identifyEntry $ filter isInteresting entries -- | Contains actual meat (posts, comments; but not eg. templates) isInteresting :: Entry -> Bool isInteresting e = not $ any isBoring cats where isBoring c = any (\t -> isBloggerCategoryOfType t c) ["settings", "template"] cats = entryCategories e -- | Tag an entry from the blogger feed as either being a post, -- a comment, or an "orphan" (a comment without an associated post) identifyEntry :: Entry -> BloggerEntry identifyEntry e = if isPost e then case getLink "self" `mplus` getLink "alternate" of Just l -> Post (postUrl l) e Nothing -> entryError e oopsSelf else case getLink "alternate" of Just l -> Comment (postUrl l) e Nothing -> Orphan e where isPost = any (isBloggerCategoryOfType "post") . entryCategories postUrl = T.takeWhile (/= '?') . replaceSchema . linkHref replaceSchema url = maybe url (T.append "https://") (T.stripPrefix "http://" url) getLink ty = case filter (isLink ty) $ entryLinks e of [] -> Nothing [x] -> Just x _xs -> entryError e (oopsLink ty) isLink ty l = linkRel l == Just (Right ty) && linkType l == Just "text/html" oopsSelf = "Was expecting blog posts to have a self link" oopsLink ty = T.append "Was expecting entries have at most one link of type " ty isBloggerCategory :: Category -> Bool isBloggerCategory = (== Just "http://schemas.google.com/g/2005#kind") . catScheme isBloggerCategoryOfType :: -- | \"comment\", \"post\", etc T.Text -> Category -> Bool isBloggerCategoryOfType ty c = isBloggerCategory c && catTerm c == T.append "http://schemas.google.com/blogger/2008/kind#" ty -- --------------------------------------------------------------------- -- -- --------------------------------------------------------------------- distill :: Bool -> FullPost -> DistilledPost distill extractComments fp = DistilledPost { dpBody = body fpost, dpUri = fpUri fp, dpTitle = title fpost, dpTags = tags fpost, dpCategories = [], dpDate = date fpost } where fpost = fpPost fp fcomments = fpComments fp -- body post = let article = fromContent $ entryContent post comments = T.intercalate "\n" $ map formatComment fcomments in if extractComments then T.intercalate "\n" [ article, "", "

Comments

", comments ] else article fromContent (Just (HTMLContent x)) = x fromContent _ = error "Hakyll.Convert.Blogger.distill expecting HTML" formatComment c = T.intercalate "\n" [ "
", T.concat [ "

", "On ", pubdate, ", ", author, " wrote:", "

" ], "
", comment, "
", "
" ] where pubdate = fromMaybe "unknown date" (entryPublished c) author = T.unwords $ map personName (entryAuthors c) comment = fromContent $ entryContent c -- title p = case txtToString (entryTitle p) of "" -> Nothing t -> Just (T.pack t) tags = map catTerm . filter (not . isBloggerCategory) . entryCategories date x = case parseTime' =<< entryPublished x of Nothing -> fromJust $ parseTime' "1970-01-01T00:00:00Z" Just d -> d parseTime' d = msum $ map (\f -> parseTimeM True defaultTimeLocale f (T.unpack d)) [ "%FT%T%Q%z", -- with time zone "%FT%T%QZ" -- zulu time ] -- --------------------------------------------------------------------- -- odds and ends -- --------------------------------------------------------------------- entryError :: forall a. Entry -> T.Text -> a entryError e msg = error $ (T.unpack msg) ++ " [on entry " ++ (T.unpack (entryId e)) ++ "]\n" ++ show e buckets :: Ord b => (a -> b) -> [a] -> [(b, [a])] buckets f = map (first head . unzip) . groupBy ((==) `on` fst) . sortBy (compare `on` fst) . map (\x -> (f x, x)) -- | Find all non-nested elements which are named `name`, starting with `root`. -- ("Non-nested" means we don't search sub-elements of an element that's named -- `name`.) findElements :: Name -> Element -> [Element] findElements name element = if elementName element == name then [element] else concatMap (findElements name) (elementChildren element)