{- This file is part of feed-collect. - - Written in 2015 by fr33domlover . - Date parser format strings written originally by koral - for the imm package and were copied here (imm is released as WTFPL). - - ♡ Copying is an act of love. Please copy, reuse and share. - - The author(s) have dedicated all copyright and related and neighboring - rights to this software to the public domain worldwide. This software is - distributed without any warranty. - - You should have received a copy of the CC0 Public Domain Dedication along - with this software. If not, see - . -} -- | = Intro -- -- This module allows you to run a loop which visits web feeds (RSS, Atom, -- etc.) periodically and reports new items using a function you provide. You -- can also insert control commands into the loop, such as adding and removing -- a feed, so that stopping and restarting it isn't required. -- -- Both http and https URLs are supported. -- -- The original use case which motivated the creation of this library is -- . -- -- = Running -- -- The 'run' function runs the loop, and it takes a collector callback function -- (as one of its parameters) to be called when new feed items are found. -- -- Here is a simple usage example. -- -- > import Data.Time.Units -- > import Web.Feed.Collect -- > -- > collect :: Label -> URL -> Feed -> Item -> IO () -- > collect label url feed item = do -- > putStrLn $ label ++ " : " ++ url -- > putStrLn "Got a new feed item!" -- > putStrLn $ showFeed feed -- > putStrLn $ showItem item -- > -- > logError :: Error -> IO () -- > logError = print -- > -- > feeds :: [(Label, Url)] -- > feeds = [("democ-now", "http://www.democracynow.org/democracynow.rss")] -- > -- > main :: IO () -- > main = run collect Nothing logError Nothing (1 :: Minute) 3 feeds -- -- For quick testing, you can use one of the predefined collector functions. -- For example, there is 'collectorNull' which discards the feed items and does -- nothing (you can use it e.g. when testing success of HTTP requests or -- debugging the library itself), and 'collectorPretty' which writes a short -- nicely formatted entry to stdout for each new feed item. And there are more. -- -- If new items aren't being detected correctly, you can try using 'runDebug' -- in place of 'run'. It takes and same arguments and runs the same loop, but -- also writes a detailed log of the detection process into a file -- \"debug.log\". You can use @tail -f debug.log@ from your terminal to watch -- new log entries get appended to the file. -- -- = Using Control Commands -- -- Now let's see how to push control commands into the loop. The 4th parameter -- 'run' takes is an optional command queue. In the example above we just -- passed 'Nothing'. Now let's provide an actual queue and use it. -- -- Suppose we are writing a program with a command-line interface. It watches -- news feeds in the background, and can take commands from the user at the -- same time on the command-line. Feeds can be added, removed, etc. without -- restarting the program. -- -- Assume we've written a function named @parseCommand@, which takes a line of -- user input and returns a command ready to push into the queue. The parsed -- commands are of type 'Command' and are created using functions like -- 'addFeed', 'removeFeed' and so on, which this module provides. Using our -- @parseCommand@ we can write the program like this: -- -- > import Control.Concurrent (forkIO) -- > import Data.Time.Units -- > import System.IO -- > import Web.Feed.Collect -- > -- > collect :: Label -> URL -> Feed -> Item -> IO () -- > collect = collectPretty -- > -- > logE :: Error -> IO () -- > logE = print -- > -- > feeds :: [(Label, Url)] -- > feeds = [("fsf-news", "https://www.fsf.org/static/fsforg/rss/news.xml")] -- > -- > parseCommand :: String -> Maybe Command -- > parseCommand line = {- ... - } -- > -- > main :: IO () -- > main = do -- > cqueue <- newCommandQueue -- > forkIO $ run collect Nothing logE (Just cqueue) (1 :: Minute) 3 feeds -- > let loop = do -- > line <- getLine -- > if line == "quit" -- > then putStrLn "Bye!" -- > else do -- > case parseCommand line of -- > Just cmd -> sendCommand cqueue cmd -- > Nothing -> putStrLn "Invalid input" -- > loop -- > loop module Web.Feed.Collect ( -- * Types Label , Url , Command () , CommandQueue () , Error (..) -- * Collectors , collectorNull , collectorPrint , collectorPretty , collectorLog -- * Running , run , runDebug -- * Control commands , newCommandQueue , sendCommand , sendCommands , addFeed , removeFeed , feedActive , setInterval , setMaxPerVisit -- * Utilities , showFeed , showItem ) where import Control.Applicative.Util (applyIf') import Control.Concurrent (threadDelay) import Control.Concurrent.MVar import Control.Exception (catch) import Control.Monad (liftM, when) import Control.Monad.IO.Class (MonadIO, liftIO) import qualified Data.ByteString.Lazy.UTF8 as BU import Data.Maybe (catMaybes, fromMaybe, listToMaybe) import Data.Time.Clock (UTCTime) import Data.Time.Format import Data.Time.LocalTime (getZonedTime, zonedTimeToUTC) import Data.Time.RFC2822 (parseTimeRFC2822) import Data.Time.RFC3339 (parseTimeRFC3339) import Data.Time.RFC822 (parseTimeRFC822) import Data.Time.Units (TimeUnit (..)) import Network.HTTP.Client import Network.HTTP.Client.TLS (tlsManagerSettings) import System.IO import Text.Feed.Import (parseFeedString) import Text.Feed.Query import Text.Feed.Types (Item, Feed (..)) -- | A short name tag for a feed, for quick reference and internal use. type Label = String -- | An HTTP or HTTPS URL of a feed or a feed item. type Url = String -- | A control command sent to the feed watching loop, and affecting its -- behavior. data Command -- | Add a new feed to watch. = AddFeed Label Url -- | Remove a previously added feed. | RemoveFeed Label -- | Set whether a given feed should be watched (active) or not (inactive). | FeedActive Label Bool -- | Set the interval, in microseconds, between feed scans. | SetInterval Int -- | Set the maximal number of feed items to be collected per feed per -- scan. If more new items are found, they wait for the next scan. | SetMaxPerVisit Int -- | A queue of control commands for the 'run' loop to execute while running. -- -- Commands can be pushed concurrently from different threads, but there should -- be only one 'run' invocation reading from the queue (i.e. although this is -- unlikely anyway, don't pass the same queue to multiple concurrent 'run' -- calls). newtype CommandQueue = CommandQueue { cqMVar :: MVar [Command] } -- Since no item field is guaranteed to be provided, we need some way to attach -- a practically-mostly-unique ID to each feed item, so that we can remember -- the items we already collected. Item fields we can expect to be mostly -- unique are: -- -- * ID (this is meant to serve as a unique identifier) -- * Title (usually an item doesn't repeat a past item's title), -- * Date (usually you don't publish 2 items in the very same microsecond) -- * Summary (same idea as the title) -- -- The other fields are based on these, or are likely to be, or aren't expected -- to be unique. -- -- In the (hopefully very rare) case none of these is available, just assume we -- never saw this ID before. -- -- We won't be checking for cases the available fields change over time, e.g. -- suddenly feed items get their IDs published in the XML. If the need ever -- arises, code to handle this can be added. data ItemID = ByID String | ByTitle String | ByTime String | BySummary String | Unique deriving Show instance Eq ItemID where (ByID i) == (ByID j) = i == j (ByTitle i) == (ByTitle j) = i == j (ByTime i) == (ByTime j) = i == j (BySummary i) == (BySummary j) = i == j _ == _ = False data FeedRecord = FeedRecord { feedName :: String -- A short identifier for easy reference , feedUrl :: String -- The feed's URL , feedOn :: Bool -- Whether the feed is active , feedPrevIDs :: [ItemID] -- Previously collected feed item IDs , feedUpdated :: Maybe UTCTime -- Last time a feed item was published } deriving Show data State = State { usecInterval :: Int -- Microseconds between polls , maxItemsPerVisit :: Int -- Per-feed items to collect per poll , records :: [FeedRecord] -- Per-feed state } deriving Show -- | An error occuring while reading from a news feed. data Error -- | Error while creating an HTTP request or receiving a response. = HttpError HttpException -- | Error while parsing the HTTP response body into feed content | FeedParsingFailed Label Url deriving Show showFeedKind :: Feed -> String showFeedKind (AtomFeed _) = "Atom" showFeedKind (RSSFeed _) = "RSS" showFeedKind (RSS1Feed _) = "RSS1" showFeedKind (XMLFeed _) = "XML" -- | A short one-line description of a feed. showFeed :: Feed -> String showFeed feed = unwords [kind, title, "by", author, "at", home] where kind = '(' : showFeedKind feed ++ ")" title = getFeedTitle feed author = fromMaybe none $ getFeedAuthor feed home = fromMaybe none $ getFeedHome feed none = "[?]" -- | A short one-line description of a feed item. showItem :: Item -> String showItem item = unwords [title, "by", author, "at", date] where title = fromMaybe none $ getItemTitle item author = fromMaybe none $ getItemAuthor item date = fromMaybe none $ getItemDate item none = "[?]" -- | An item collector which discards the item, i.e. does nothing. collectorNull :: Label -> Url -> Feed -> Item -> IO () collectorNull _ _ _ _ = return () -- | An item collector which prints feed and item fields to @stdout@. collectorPrint :: Label -> Url -> Feed -> Item -> IO () collectorPrint _label _url feed item = print feed >> print item -- | An item collector which prints short friendly feed and item descriptions -- to @stdout@. collectorPretty :: Label -> Url -> Feed -> Item -> IO () collectorPretty _label _url feed item = do putStrLn $ showFeed feed putStrLn $ showItem item -- | An item collector which writes friendly descriptions into a log file, -- determining the log file's name using the function given as the first -- argument (the feed label, i.e. second argument, is also used as the argument -- for that function). collectorLog :: (Label -> FilePath) -> Label -> Url -> Feed -> Item -> IO () collectorLog getPath label url feed item = withFile (getPath $ getFeedTitle feed) AppendMode $ \ h -> do hPutStrLn h $ label ++ " : " ++ url hPutStrLn h $ showFeed feed hPutStrLn h $ showItem item hPutChar h '\n' -- Maximal number of item IDs to remember maxNumIDs :: Int maxNumIDs = 200 -- Find the value in the first 'Just' available in a list of 'Maybe's findJust :: [Maybe a] -> Maybe a findJust = listToMaybe . catMaybes -- Using the available item fields, get the best hopefully-unique ID we can. itemID :: Item -> ItemID itemID i = fromMaybe Unique $ findJust [ fmap (ByID . snd) $ getItemId i , fmap ByTitle $ getItemTitle i , fmap ByTime $ getItemDate i , fmap BySummary $ getItemSummary i ] -- Parse and get an item's date. If not available, return 'Nothing'. -- The format examples may be partial, i.e. not demonstrate the formats fully. -- -- Since the availability of item date isn't guaranteed, use 2 possible -- representations. One, use the date if available. Two, otherwise, assume a -- timeless value which always compares in a way causing us to find new items -- to collect. itemTime :: Item -> Maybe UTCTime itemTime item = let mdate = getItemDate item dateParsers = -- Sun, 15 Nov 2015 02:45:26 -0200 fmap zonedTimeToUTC . parseTimeRFC2822 : -- 2015-11-15T02:45:26-02:00 fmap zonedTimeToUTC . parseTimeRFC3339 : -- 15 Nov 2015 02:45 -0200 fmap zonedTimeToUTC . parseTimeRFC822 : map (parseTimeM True defaultTimeLocale) [ "%a, %d %b %G %T" -- Sun, 15 Nov 2015 02:45:26 , "%Y-%m-%d" -- 2015-11-15 , "%e %b %Y" -- 15 Nov 2015 , "%a, %e %b %Y %k:%M:%S %z" -- Sun, 15 Nov 2015 2:45:26 -0200 , "%a, %e %b %Y %T %Z" -- Sun, 15 Nov 2015 02:45:26 -0200 ] results = maybe [] (\ date -> map ($ date) dateParsers) mdate in findJust results -- Check whether one 'ItemTime' is more recent than another, for the purpose of -- identifying new feed items. newerThan :: Maybe UTCTime -> Maybe UTCTime -> Bool (Just u) `newerThan` (Just v) = u > v _ `newerThan` _ = True -- Find recent items we haven't collected yet detectNewItems :: Int -> FeedRecord -> Feed -> ([Item], FeedRecord, IO ()) detectNewItems maxItems rec feed = let items = feedItems feed ids = map itemID items times = map itemTime items iids = zip3 items ids times new (_i, iid, t) = iid `notElem` feedPrevIDs rec && t `newerThan` feedUpdated rec iidsAllNew = filter new iids iids' = drop (length iidsAllNew - maxItems) iidsAllNew (items', ids', _times') = unzip3 iids' rec' = rec { feedPrevIDs = take maxNumIDs $ ids' ++ feedPrevIDs rec , feedUpdated = case iids' of (_i, _iid, t) : _ -> t [] -> feedUpdated rec } report = do let showTime :: FormatTime t => t -> String showTime = formatTime defaultTimeLocale rfc822DateFormat updated' = fmap showTime . feedUpdated prevIDs = feedPrevIDs rec prevIDsS = take 5 prevIDs itemsS = take maxItems items iidsS = take maxItems iids prevIDsFinal = feedPrevIDs rec' prevIDsFinalS = take 5 prevIDsFinal h <- openFile "debug.log" AppendMode let line = hPutStrLn h nl = hPutChar h '\n' printIT (_, i, t) = line $ show t ++ " " ++ show i t <- getZonedTime line $ replicate 79 '-' line $ showTime t line $ replicate 79 '-' line $ "Label " ++ feedName rec line $ "URL " ++ feedUrl rec line "----------- (1) Before changes ------------" line $ "Active " ++ show (feedOn rec) ++ " (should be True!)" line $ "Updated " ++ fromMaybe "[?]" (updated' rec) line $ "At most " ++ show maxItems ++ " are reported per visit" line $ showFeed feed line $ show (length prevIDs) ++ " previous item IDs logged" nl line $ "Most recent " ++ show (length prevIDsS) ++ " are:" mapM_ (hPrint h) prevIDsS nl line "------------ (2) While running ------------" line $ "Feed has " ++ show (length items) ++ " items" nl line $ "First " ++ show maxItems ++ " from the top are:" mapM_ (line . showItem) itemsS nl line "Their computed times and IDs are:" mapM_ printIT iidsS nl line "Out of all feed items, the following have newly seen IDs:" mapM_ printIT $ filter (\ (_, i, _) -> i `notElem` prevIDs) iids nl line "Out of them, the following are also newer than last update:" mapM_ printIT iidsAllNew nl line "--------------- (3) Result ----------------" line "Out of them, the following have been collected:" mapM_ printIT iids' nl line $ "Updated " ++ fromMaybe "[?]" (updated' rec') line $ show (length prevIDsFinal) ++ " previous item IDs logged" nl line $ "Most recent " ++ show (length prevIDsFinalS) ++ " are:" mapM_ (hPrint h) prevIDsFinalS nl hClose h in (items', rec', report) fetchRaw :: Manager -> String -> IO (Either HttpException String) fetchRaw manager url = let action = do request <- parseUrl url response <- httpLbs request manager return $ Right $ BU.toString $ responseBody response handler e = return $ Left (e :: HttpException) in action `catch` handler -- Try to download a feed from its URL fetch :: Manager -> Label -> Url -> IO (Either Error Feed) fetch manager label url = do ebody <- fetchRaw manager url return $ case ebody of Left err -> Left $ HttpError err Right body -> case parseFeedString body of Just feed -> Right feed Nothing -> Left $ FeedParsingFailed label url -- Fill initial feed record initRec :: MonadIO m => (Error -> m ()) -> Manager -> Label -> Url -> m FeedRecord initRec logError manager label url = do efeed <- liftIO $ fetch manager label url case efeed of Right feed -> let items = feedItems feed in return $ FeedRecord { feedName = label , feedUrl = url , feedOn = True , feedPrevIDs = map itemID $ take maxNumIDs items , feedUpdated = listToMaybe items >>= itemTime } Left e -> do logError e return $ FeedRecord { feedName = label , feedUrl = url , feedOn = True , feedPrevIDs = [] , feedUpdated = Nothing } -- Execute a control command exec :: MonadIO m => (Error -> m ()) -> Manager -> Command -> State -> m State exec logError manager command state@State { records = rs } = case command of AddFeed label url -> do rec <- initRec logError manager label url return state { records = rs ++ [rec] } RemoveFeed label -> return state { records = filter ((== label) . feedName) rs } FeedActive label active -> let rs' = [r { feedOn = active } | r <- rs, feedName r == label] in return state { records = rs' } SetInterval usec -> return state { usecInterval = usec } SetMaxPerVisit nitems -> return state { maxItemsPerVisit = nitems } foldrM :: Monad m => (a -> b -> m b) -> b -> [a] -> m b foldrM _ v [] = return v foldrM f v (x:xs) = f x =<< foldrM f v xs -- Execute all commands waiting in the queue execAll :: MonadIO m => (Error -> m ()) -> Manager -> CommandQueue -> State -> m State execAll logError manager cq state = do cmds <- liftIO $ modifyMVar (cqMVar cq) $ \ l -> return ([], l) foldrM (exec logError manager) state cmds run' :: (TimeUnit t, MonadIO m) => Bool -> (Label -> Url -> Feed -> Item -> m ()) -> Maybe (Label -> Url -> Feed -> [Item] -> m ()) -> (Error -> m ()) -> Maybe CommandQueue -> t -> Int -> [(Label, Url)] -> m () run' debug collect mCollectMany logError mq interval maxPerVisit sources = do let collectMany = fromMaybe (\ l u f -> mapM_ $ collect l u f) mCollectMany manager <- liftIO $ newManager tlsManagerSettings initialRecords <- mapM (\ (label, url) -> initRec logError manager label url) sources let initialState = State { usecInterval = fromInteger $ toMicroseconds interval , maxItemsPerVisit = maxPerVisit , records = initialRecords } execCmds st = case mq of Just cq -> execAll logError manager cq st Nothing -> return st visit record = applyIf' record feedOn $ \ rec -> do efeed <- liftIO $ fetch manager (feedName rec) (feedUrl rec) case efeed of Right feed -> do let (items, rec', report) = detectNewItems maxPerVisit rec feed ritems = reverse items when debug $ liftIO report collectMany (feedName rec) (feedUrl rec) feed ritems return rec' Left e -> do logError e return rec loop state = do liftIO $ threadDelay $ usecInterval state recs <- mapM visit $ records state let stateCollected = state { records = recs } stateExec <- execCmds stateCollected loop stateExec loop initialState -- | Watch feeds and perform the given action on received feed items. run :: (TimeUnit t, MonadIO m) => (Label -> Url -> Feed -> Item -> m ()) -- ^ Collector, i.e. action to perform when receiving a feed item. -> Maybe (Label -> Url -> Feed -> [Item] -> m ()) -- ^ Action to perform when receiving feed items. This is just a chance to -- provide an efficient shortcut instead of repeated use of the previous -- parameter. If there is no such shortcut, simply pass 'Nothing'. -> (Error -> m ()) -- ^ Error logging action. It will be called when an error occurs while -- trying to download and parse a feed. Such an error doesn't cause -- anything to stop. Execution simply goes on to read the next feed, and -- will try the erronous feed again in the next round. -> Maybe CommandQueue -- ^ A command queue you can use to change settings while the program runs. -- For example, you can add a new feed or change the interval between polls -- without relaunching 'run'. -> t -- ^ Time interval between visits of a watched feed. -> Int -- ^ Maximal number of items to collect per visit (if more are available, -- they will be collected in the next visit) -> [(Label, Url)] -- ^ List of short feed labels (for easy reference and logging), and their -- URLs to watch. -> m () run = run' False -- | Like 'run', but writes detailed reports into a file named \"debug.log\". -- This should help you understand which feeds items are being received and why -- new items aren't being detected in the way you expect. -- -- For some feeds, the new item detection method used by this library may fail. -- In that case, please report the issue, and then the method can be updated -- and extended to support more cases. runDebug :: (TimeUnit t, MonadIO m) => (Label -> Url -> Feed -> Item -> m ()) -> Maybe (Label -> Url -> Feed -> [Item] -> m ()) -> (Error -> m ()) -> Maybe CommandQueue -> t -> Int -> [(Label, Url)] -> m () runDebug = run' True -- | Create a new empty command queue. newCommandQueue :: IO CommandQueue newCommandQueue = liftM CommandQueue $ newMVar [] -- | Send a command into the queue. sendCommand :: CommandQueue -> Command -> IO () sendCommand cq cmd = modifyMVar_ (cqMVar cq) $ \ l -> return $ cmd : l -- | Send a series of commands into the queue. sendCommands :: CommandQueue -> [Command] -> IO () sendCommands cq cmds = modifyMVar_ (cqMVar cq) $ \ l -> return $ reverse cmds ++ l -- | Add a new feed to watch. addFeed :: Label -> Url -> Command addFeed = AddFeed -- | Remove a previously added feed. removeFeed :: Label -> Command removeFeed = RemoveFeed -- | Set whether a given feed should be watched (active) or not (inactive). feedActive :: Label -> Bool -> Command feedActive = FeedActive -- | Set the interval, in microseconds, between feed scans. setInterval :: TimeUnit t => t -> Command setInterval = SetInterval . fromInteger . toMicroseconds -- | Set the maximal number of feed items to be collected per feed per scan. If -- more new items are found, they wait for the next scan. setMaxPerVisit :: Int -> Command setMaxPerVisit = SetMaxPerVisit