{-# Language OverloadedStrings #-} -- todo [release] date? module Main where import Control.Applicative import System.Environment import Network.HTTP.Simple import Text.HTML.Scalpel.Core import Text.RSS.Syntax import Text.RSS.Export import qualified Paths_gscholar_rss as P import qualified Control.Exception as E import qualified Data.List as L import qualified Data.String as DS import qualified Data.Text as T import qualified Data.Text.Lazy.IO as TLI import qualified Data.Text.Encoding as TE import qualified Data.Version as V import qualified Data.XML.Types as XT import qualified Text.Atom.Feed as AF import qualified Text.URI as U main :: IO () main = errorPress work where work :: IO () work = -- get args map T.pack <$> getArgs >>= \as -> -- malformed input if null as || length as > 1 || head as `elem` ["--help", "-h", "-v", "--version"] then putStrLn helps else -- scrape let url = head as in scrapeGet url (page url) >>= \p -> printRSS url p scrapeGet :: URL -> Scraper T.Text a -> IO a scrapeGet url s = getURLBody url >>= \b -> let r = maybe (error "Nothing in scrapeGet") id (scrapeStringLike b s) in return r ----------- -- TYPES -- ----------- type Title = T.Text type URL = T.Text data Page = Page Title URL [Article] deriving (Show, Eq) -- URL: url of the feed/page type Author = T.Text type Description = T.Text data Article = Article Title URL Author Description deriving (Show, Eq) ------------ -- SCRAPE -- ------------ page :: URL -> Scraper T.Text Page page u = Page <$> text "title" <*> pure u <*> chroot ("div" @: ["id" @= "gs_res_ccl_mid"]) entries entries :: Scraper T.Text [Article] entries = chroots ("div" @: [hasClass "gs_ri"]) paper where paper = Article <$> text ("h3" @: [hasClass "gs_rt"]) <*> (cleanGBooks <$> attr "href" "a") <*> text ("div" @: [hasClass "gs_a"]) <*> description description :: Scraper T.Text Description description = text ("div" @: [hasClass "gs_rs"]) <|> pure "No description available." -- todo rimuovi span? -- todo aggiungi test -- Title: channel title buildRSS :: Title -> Page -> RSS buildRSS pt (Page _ u as) = (nullRSS "NULL-WRONG-T" "NULL-WRONG-L") { rssAttrs = [dublinNamespace], rssChannel = (nullChannel pt u) {rssItems = map buildItem as} } where buildItem :: Article -> RSSItem buildItem (Article t wu a d) = (nullItem t) { rssItemLink = Just wu, -- rssItemAuthor = Just $ a, rssItemOther = [dcCreator a], rssItemDescription = Just d } -- http://www.rssboard.org/rss-profile#namespace-elements-dublin-creator dcCreator :: Author -> XT.Element dcCreator a = XT.Element "dc:creator" [] [XT.NodeContent $ XT.ContentText a] -- needed for dcCreator dublinNamespace :: AF.Attr dublinNamespace = ("xmlns:dc", [XT.ContentText "http://purl.org/dc/elements/1.1/"]) ------------- -- GET/PUT -- ------------- getURLBody :: URL -> IO T.Text getURLBody url = getResponseBody <$> httpBS req >>= \b -> return (TE.decodeLatin1 b) where req = DS.fromString (T.unpack url) -- todo non funziona con q dopo ?, usa new-uri (uri-modern? scordato) feedTitle :: URL -> Title feedTitle url = case lookup "q" (U.queryToPairs $ T.unpack url) of Nothing -> "Wrong: title not found" Just t -> "gscholar-rss: " <> T.pack t -- removes 'ots' and 'sig' from a google books url cleanGBooks :: URL -> URL cleanGBooks url | condA = let qry = U.uriQueryItems uri fqr = filter filtFun qry u' = uri { U.uriQuery = Just (U.pairsToQuery fqr)} in T.pack . show $ u' | otherwise = url where uri = maybe (error "misparse in gclean") id (U.parseURI $ T.unpack url) condA = case U.uriRegName uri of Nothing -> False -- citation, no uri Just rn -> L.isPrefixOf "books.google" rn filtFun :: (String, String) -> Bool filtFun (l, _) = notElem l ["ots", "sig"] printRSS :: URL -> Page -> IO () printRSS url s = case textRSS . buildRSS (feedTitle url) $ s of Nothing -> error "Nothing in printRSS" Just ms -> TLI.putStrLn ms helps :: String helps = unlines ["invoke as: gscholar-rss ", "manual and examples: http://ariis.it/static/articles/gscholar-rss/page.html", version ++ " -- GPLv3"] -- error handling lifted from ansi-terminal-game -- wraps error call with version/contact info errorPress :: IO () -> IO () errorPress m = E.catch m errorDisplay where errorDisplay :: E.SomeException -> IO () errorDisplay se = putStrLn . unlines $ [show se, "gscholar-rss " ++ version, "Can't fix this? Please report to "] version :: String version = 'v' : V.showVersion P.version