module Web.Feed.Collect
(
Label
, Url
, Command ()
, CommandQueue ()
, Error (..)
, collectorNull
, collectorPrint
, collectorPretty
, collectorLog
, run
, runDebug
, newCommandQueue
, sendCommand
, sendCommands
, addFeed
, removeFeed
, feedActive
, setInterval
, setMaxPerVisit
, showFeed
, showItem
)
where
import Control.Applicative.Util (applyIf')
import Control.Concurrent (threadDelay)
import Control.Concurrent.MVar
import Control.Exception (catch)
import Control.Monad (liftM, when)
import Control.Monad.IO.Class (MonadIO, liftIO)
import qualified Data.ByteString.Lazy.UTF8 as BU
import Data.Maybe (catMaybes, fromMaybe, listToMaybe)
import Data.Time.Clock (UTCTime)
import Data.Time.Format
import Data.Time.LocalTime (getZonedTime, zonedTimeToUTC)
import Data.Time.RFC2822 (parseTimeRFC2822)
import Data.Time.RFC3339 (parseTimeRFC3339)
import Data.Time.RFC822 (parseTimeRFC822)
import Data.Time.Units (TimeUnit (..))
import Network.HTTP.Client
import Network.HTTP.Client.TLS (tlsManagerSettings)
import System.IO
import Text.Feed.Import (parseFeedString)
import Text.Feed.Query
import Text.Feed.Types (Item, Feed (..))
type Label = String
type Url = String
data Command
= AddFeed Label Url
| RemoveFeed Label
| FeedActive Label Bool
| SetInterval Int
| SetMaxPerVisit Int
newtype CommandQueue = CommandQueue { cqMVar :: MVar [Command] }
data ItemID
= ByID String | ByTitle String | ByTime String | BySummary String | Unique
deriving Show
instance Eq ItemID where
(ByID i) == (ByID j) = i == j
(ByTitle i) == (ByTitle j) = i == j
(ByTime i) == (ByTime j) = i == j
(BySummary i) == (BySummary j) = i == j
_ == _ = False
data FeedRecord = FeedRecord
{ feedName :: String
, feedUrl :: String
, feedOn :: Bool
, feedPrevIDs :: [ItemID]
, feedUpdated :: Maybe UTCTime
}
deriving Show
data State = State
{ usecInterval :: Int
, maxItemsPerVisit :: Int
, records :: [FeedRecord]
}
deriving Show
data Error
= HttpError HttpException
| FeedParsingFailed Label Url
deriving Show
showFeedKind :: Feed -> String
showFeedKind (AtomFeed _) = "Atom"
showFeedKind (RSSFeed _) = "RSS"
showFeedKind (RSS1Feed _) = "RSS1"
showFeedKind (XMLFeed _) = "XML"
showFeed :: Feed -> String
showFeed feed = unwords [kind, title, "by", author, "at", home]
where
kind = '(' : showFeedKind feed ++ ")"
title = getFeedTitle feed
author = fromMaybe none $ getFeedAuthor feed
home = fromMaybe none $ getFeedHome feed
none = "[?]"
showItem :: Item -> String
showItem item = unwords [title, "by", author, "at", date]
where
title = fromMaybe none $ getItemTitle item
author = fromMaybe none $ getItemAuthor item
date = fromMaybe none $ getItemDate item
none = "[?]"
collectorNull :: Label -> Url -> Feed -> Item -> IO ()
collectorNull _ _ _ _ = return ()
collectorPrint :: Label -> Url -> Feed -> Item -> IO ()
collectorPrint _label _url feed item = print feed >> print item
collectorPretty :: Label -> Url -> Feed -> Item -> IO ()
collectorPretty _label _url feed item = do
putStrLn $ showFeed feed
putStrLn $ showItem item
collectorLog :: (Label -> FilePath) -> Label -> Url -> Feed -> Item -> IO ()
collectorLog getPath label url feed item =
withFile (getPath $ getFeedTitle feed) AppendMode $ \ h -> do
hPutStrLn h $ label ++ " : " ++ url
hPutStrLn h $ showFeed feed
hPutStrLn h $ showItem item
hPutChar h '\n'
maxNumIDs :: Int
maxNumIDs = 200
findJust :: [Maybe a] -> Maybe a
findJust = listToMaybe . catMaybes
itemID :: Item -> ItemID
itemID i = fromMaybe Unique $ findJust
[ fmap (ByID . snd) $ getItemId i
, fmap ByTitle $ getItemTitle i
, fmap ByTime $ getItemDate i
, fmap BySummary $ getItemSummary i
]
itemTime :: Item -> Maybe UTCTime
itemTime item =
let mdate = getItemDate item
dateParsers =
fmap zonedTimeToUTC . parseTimeRFC2822 :
fmap zonedTimeToUTC . parseTimeRFC3339 :
fmap zonedTimeToUTC . parseTimeRFC822 :
map
(parseTimeM True defaultTimeLocale)
[ "%a, %d %b %G %T"
, "%Y-%m-%d"
, "%e %b %Y"
, "%a, %e %b %Y %k:%M:%S %z"
, "%a, %e %b %Y %T %Z"
]
results = maybe [] (\ date -> map ($ date) dateParsers) mdate
in findJust results
newerThan :: Maybe UTCTime -> Maybe UTCTime -> Bool
(Just u) `newerThan` (Just v) = u > v
_ `newerThan` _ = True
detectNewItems :: Int -> FeedRecord -> Feed -> ([Item], FeedRecord, IO ())
detectNewItems maxItems rec feed =
let items = feedItems feed
ids = map itemID items
times = map itemTime items
iids = zip3 items ids times
new (_i, iid, t) =
iid `notElem` feedPrevIDs rec && t `newerThan` feedUpdated rec
iidsAllNew = filter new iids
iids' = drop (length iidsAllNew maxItems) iidsAllNew
(items', ids', _times') = unzip3 iids'
rec' = rec
{ feedPrevIDs = take maxNumIDs $ ids' ++ feedPrevIDs rec
, feedUpdated =
case iids' of
(_i, _iid, t) : _ -> t
[] -> feedUpdated rec
}
report = do
let showTime :: FormatTime t => t -> String
showTime = formatTime defaultTimeLocale rfc822DateFormat
updated' = fmap showTime . feedUpdated
prevIDs = feedPrevIDs rec
prevIDsS = take 5 prevIDs
itemsS = take maxItems items
iidsS = take maxItems iids
prevIDsFinal = feedPrevIDs rec'
prevIDsFinalS = take 5 prevIDsFinal
h <- openFile "debug.log" AppendMode
let line = hPutStrLn h
nl = hPutChar h '\n'
printIT (_, i, t) = line $ show t ++ " " ++ show i
t <- getZonedTime
line $ replicate 79 '-'
line $ showTime t
line $ replicate 79 '-'
line $ "Label " ++ feedName rec
line $ "URL " ++ feedUrl rec
line "----------- (1) Before changes ------------"
line $ "Active " ++ show (feedOn rec) ++ " (should be True!)"
line $ "Updated " ++ fromMaybe "[?]" (updated' rec)
line $ "At most " ++ show maxItems ++ " are reported per visit"
line $ showFeed feed
line $ show (length prevIDs) ++ " previous item IDs logged"
nl
line $ "Most recent " ++ show (length prevIDsS) ++ " are:"
mapM_ (hPrint h) prevIDsS
nl
line "------------ (2) While running ------------"
line $ "Feed has " ++ show (length items) ++ " items"
nl
line $ "First " ++ show maxItems ++ " from the top are:"
mapM_ (line . showItem) itemsS
nl
line "Their computed times and IDs are:"
mapM_ printIT iidsS
nl
line "Out of all feed items, the following have newly seen IDs:"
mapM_ printIT $ filter (\ (_, i, _) -> i `notElem` prevIDs) iids
nl
line "Out of them, the following are also newer than last update:"
mapM_ printIT iidsAllNew
nl
line "--------------- (3) Result ----------------"
line "Out of them, the following have been collected:"
mapM_ printIT iids'
nl
line $ "Updated " ++ fromMaybe "[?]" (updated' rec')
line $ show (length prevIDsFinal) ++ " previous item IDs logged"
nl
line $ "Most recent " ++ show (length prevIDsFinalS) ++ " are:"
mapM_ (hPrint h) prevIDsFinalS
nl
hClose h
in (items', rec', report)
fetchRaw :: Manager -> String -> IO (Either HttpException String)
fetchRaw manager url =
let action = do
request <- parseUrl url
response <- httpLbs request manager
return $ Right $ BU.toString $ responseBody response
handler e = return $ Left (e :: HttpException)
in action `catch` handler
fetch :: Manager -> Label -> Url -> IO (Either Error Feed)
fetch manager label url = do
ebody <- fetchRaw manager url
return $ case ebody of
Left err -> Left $ HttpError err
Right body ->
case parseFeedString body of
Just feed -> Right feed
Nothing -> Left $ FeedParsingFailed label url
initRec :: MonadIO m
=> (Error -> m ())
-> Manager
-> Label
-> Url
-> m FeedRecord
initRec logError manager label url = do
efeed <- liftIO $ fetch manager label url
case efeed of
Right feed ->
let items = feedItems feed
in return $ FeedRecord
{ feedName = label
, feedUrl = url
, feedOn = True
, feedPrevIDs = map itemID $ take maxNumIDs items
, feedUpdated = listToMaybe items >>= itemTime
}
Left e -> do
logError e
return $ FeedRecord
{ feedName = label
, feedUrl = url
, feedOn = True
, feedPrevIDs = []
, feedUpdated = Nothing
}
exec :: MonadIO m => (Error -> m ()) -> Manager -> Command -> State -> m State
exec logError manager command state@State { records = rs } =
case command of
AddFeed label url -> do
rec <- initRec logError manager label url
return state { records = rs ++ [rec] }
RemoveFeed label ->
return state { records = filter ((== label) . feedName) rs }
FeedActive label active ->
let rs' = [r { feedOn = active } | r <- rs, feedName r == label]
in return state { records = rs' }
SetInterval usec -> return state { usecInterval = usec }
SetMaxPerVisit nitems -> return state { maxItemsPerVisit = nitems }
foldrM :: Monad m => (a -> b -> m b) -> b -> [a] -> m b
foldrM _ v [] = return v
foldrM f v (x:xs) = f x =<< foldrM f v xs
execAll :: MonadIO m
=> (Error -> m ())
-> Manager
-> CommandQueue
-> State
-> m State
execAll logError manager cq state = do
cmds <- liftIO $ modifyMVar (cqMVar cq) $ \ l -> return ([], l)
foldrM (exec logError manager) state cmds
run' :: (TimeUnit t, MonadIO m)
=> Bool
-> (Label -> Url -> Feed -> Item -> m ())
-> Maybe (Label -> Url -> Feed -> [Item] -> m ())
-> (Error -> m ())
-> Maybe CommandQueue
-> t
-> Int
-> [(Label, Url)]
-> m ()
run' debug collect mCollectMany logError mq interval maxPerVisit sources = do
let collectMany
= fromMaybe (\ l u f -> mapM_ $ collect l u f) mCollectMany
manager <- liftIO $ newManager tlsManagerSettings
initialRecords <-
mapM (\ (label, url) -> initRec logError manager label url) sources
let initialState = State
{ usecInterval = fromInteger $ toMicroseconds interval
, maxItemsPerVisit = maxPerVisit
, records = initialRecords
}
execCmds st =
case mq of
Just cq -> execAll logError manager cq st
Nothing -> return st
visit record = applyIf' record feedOn $ \ rec -> do
efeed <- liftIO $ fetch manager (feedName rec) (feedUrl rec)
case efeed of
Right feed -> do
let (items, rec', report) =
detectNewItems maxPerVisit rec feed
ritems = reverse items
when debug $ liftIO report
collectMany (feedName rec) (feedUrl rec) feed ritems
return rec'
Left e -> do
logError e
return rec
loop state = do
liftIO $ threadDelay $ usecInterval state
recs <- mapM visit $ records state
let stateCollected = state { records = recs }
stateExec <- execCmds stateCollected
loop stateExec
loop initialState
run :: (TimeUnit t, MonadIO m)
=> (Label -> Url -> Feed -> Item -> m ())
-> Maybe (Label -> Url -> Feed -> [Item] -> m ())
-> (Error -> m ())
-> Maybe CommandQueue
-> t
-> Int
-> [(Label, Url)]
-> m ()
run = run' False
runDebug :: (TimeUnit t, MonadIO m)
=> (Label -> Url -> Feed -> Item -> m ())
-> Maybe (Label -> Url -> Feed -> [Item] -> m ())
-> (Error -> m ())
-> Maybe CommandQueue
-> t
-> Int
-> [(Label, Url)]
-> m ()
runDebug = run' True
newCommandQueue :: IO CommandQueue
newCommandQueue = liftM CommandQueue $ newMVar []
sendCommand :: CommandQueue -> Command -> IO ()
sendCommand cq cmd = modifyMVar_ (cqMVar cq) $ \ l -> return $ cmd : l
sendCommands :: CommandQueue -> [Command] -> IO ()
sendCommands cq cmds =
modifyMVar_ (cqMVar cq) $ \ l -> return $ reverse cmds ++ l
addFeed :: Label -> Url -> Command
addFeed = AddFeed
removeFeed :: Label -> Command
removeFeed = RemoveFeed
feedActive :: Label -> Bool -> Command
feedActive = FeedActive
setInterval :: TimeUnit t => t -> Command
setInterval = SetInterval . fromInteger . toMicroseconds
setMaxPerVisit :: Int -> Command
setMaxPerVisit = SetMaxPerVisit