{- This file is part of feed-collect.
 -
 - Written in 2015 by fr33domlover <fr33domlover@rel4tion.org>.
 - Date parser format strings written originally by koral <koral@mailoo.org>
 - for the imm package and were copied here (imm is released as WTFPL).
 -
 - ♡ Copying is an act of love. Please copy, reuse and share.
 -
 - The author(s) have dedicated all copyright and related and neighboring
 - rights to this software to the public domain worldwide. This software is
 - distributed without any warranty.
 -
 - You should have received a copy of the CC0 Public Domain Dedication along
 - with this software. If not, see
 - <http://creativecommons.org/publicdomain/zero/1.0/>.
 -}

-- | = Intro
--
-- This module allows you to run a loop which visits web feeds (RSS, Atom,
-- etc.) periodically and reports new items using a function you provide. You
-- can also insert control commands into the loop, such as adding and removing
-- a feed, so that stopping and restarting it isn't required.
--
-- Both http and https URLs are supported.
--
-- The original use case which motivated the creation of this library is
-- <http://rel4tion.org/projects/funbot FunBot>.
--
-- = Running
--
-- The 'run' function runs the loop, and it takes a collector callback function
-- (as one of its parameters) to be called when new feed items are found.
--
-- Here is a simple usage example.
--
-- > import Data.Time.Units
-- > import Web.Feed.Collect
-- >
-- > collect :: Label -> URL -> Feed -> Item -> IO ()
-- > collect label url feed item = do
-- >     putStrLn $ label ++ " : " ++ url
-- >     putStrLn "Got a new feed item!"
-- >     putStrLn $ showFeed feed
-- >     putStrLn $ showItem item
-- >
-- > logError :: Error -> IO ()
-- > logError = print
-- >
-- > feeds :: [(Label, Url)]
-- > feeds = [("democ-now", "http://www.democracynow.org/democracynow.rss")]
-- >
-- > main :: IO ()
-- > main = run collect Nothing logError Nothing (1 :: Minute) 3 feeds
--
-- For quick testing, you can use one of the predefined collector functions.
-- For example, there is 'collectorNull' which discards the feed items and does
-- nothing (you can use it e.g. when testing success of HTTP requests or
-- debugging the library itself), and 'collectorPretty' which writes a short
-- nicely formatted entry to stdout for each new feed item. And there are more.
--
-- If new items aren't being detected correctly, you can try using 'runDebug'
-- in place of 'run'. It takes and same arguments and runs the same loop, but
-- also writes a detailed log of the detection process into a file
-- \"debug.log\". You can use @tail -f debug.log@ from your terminal to watch
-- new log entries get appended to the file.
--
-- = Using Control Commands
--
-- Now let's see how to push control commands into the loop. The 4th parameter
-- 'run' takes is an optional command queue. In the example above we just
-- passed 'Nothing'. Now let's provide an actual queue and use it.
--
-- Suppose we are writing a program with a command-line interface. It watches
-- news feeds in the background, and can take commands from the user at the
-- same time on the command-line. Feeds can be added, removed, etc. without
-- restarting the program.
--
-- Assume we've written a function named @parseCommand@, which takes a line of
-- user input and returns a command ready to push into the queue. The parsed
-- commands are of type 'Command' and are created using functions like
-- 'addFeed', 'removeFeed' and so on, which this module provides. Using our
-- @parseCommand@ we can write the program like this:
--
-- > import Control.Concurrent (forkIO)
-- > import Data.Time.Units
-- > import System.IO
-- > import Web.Feed.Collect
-- >
-- > collect :: Label -> URL -> Feed -> Item -> IO ()
-- > collect = collectPretty
-- >
-- > logE :: Error -> IO ()
-- > logE = print
-- >
-- > feeds :: [(Label, Url)]
-- > feeds = [("fsf-news", "https://www.fsf.org/static/fsforg/rss/news.xml")]
-- >
-- > parseCommand :: String -> Maybe Command
-- > parseCommand line = {- ... - }
-- >
-- > main :: IO ()
-- > main = do
-- >     cqueue <- newCommandQueue
-- >     forkIO $ run collect Nothing logE (Just cqueue) (1 :: Minute) 3 feeds
-- >     let loop = do
-- >         line <- getLine
-- >         if line == "quit"
-- >             then putStrLn "Bye!"
-- >             else do
-- >                 case parseCommand line of
-- >                     Just cmd -> sendCommand cqueue cmd
-- >                     Nothing  -> putStrLn "Invalid input"
-- >                 loop
-- >     loop
module Web.Feed.Collect
    ( -- * Types
      Label
    , Url
    , Command ()
    , CommandQueue ()
    , Error (..)
      -- * Collectors
    , collectorNull
    , collectorPrint
    , collectorPretty
    , collectorLog
      -- * Running
    , run
    , runDebug
      -- * Control commands
    , newCommandQueue
    , sendCommand
    , sendCommands
    , addFeed
    , removeFeed
    , feedActive
    , setInterval
    , setMaxPerVisit
      -- * Utilities
    , showFeed
    , showItem
    )
where

import           Control.Applicative.Util (applyIf')
import           Control.Concurrent (threadDelay)
import           Control.Concurrent.MVar
import           Control.Exception (catch)
import           Control.Monad (liftM, when)
import           Control.Monad.IO.Class (MonadIO, liftIO)
import qualified Data.ByteString.Lazy.Char8 as BC
import           Data.Maybe (catMaybes, fromMaybe, listToMaybe)
import           Data.Time.Clock (UTCTime)
import           Data.Time.Format
import           Data.Time.LocalTime (getZonedTime, zonedTimeToUTC)
import           Data.Time.RFC2822 (parseTimeRFC2822)
import           Data.Time.RFC3339 (parseTimeRFC3339)
import           Data.Time.RFC822 (parseTimeRFC822)
import           Data.Time.Units (TimeUnit (..))
import           Network.HTTP.Client
import           Network.HTTP.Client.TLS (tlsManagerSettings)
import           System.IO
import           Text.Feed.Import (parseFeedString)
import           Text.Feed.Query
import           Text.Feed.Types (Item, Feed (..))

-- | A short name tag for a feed, for quick reference and internal use.
type Label = String

-- | An HTTP or HTTPS URL of a feed or a feed item.
type Url = String

-- | A control command sent to the feed watching loop, and affecting its
-- behavior.
data Command
    -- | Add a new feed to watch.
    = AddFeed Label Url
    -- | Remove a previously added feed.
    | RemoveFeed Label
    -- | Set whether a given feed should be watched (active) or not (inactive).
    | FeedActive Label Bool
    -- | Set the interval, in microseconds, between feed scans.
    | SetInterval Int
    -- | Set the maximal number of feed items to be collected per feed per
    -- scan. If more new items are found, they wait for the next scan.
    | SetMaxPerVisit Int

-- | A queue of control commands for the 'run' loop to execute while running.
--
-- Commands can be pushed concurrently from different threads, but there should
-- be only one 'run' invocation reading from the queue (i.e. although this is
-- unlikely anyway, don't pass the same queue to multiple concurrent 'run'
-- calls).
newtype CommandQueue = CommandQueue { cqMVar :: MVar [Command] }

-- Since no item field is guaranteed to be provided, we need some way to attach
-- a practically-mostly-unique ID to each feed item, so that we can remember
-- the items we already collected. Item fields we can expect to be mostly
-- unique are:
--
-- * ID      (this is meant to serve as a unique identifier)
-- * Title   (usually an item doesn't repeat a past item's title),
-- * Date    (usually you don't publish 2 items in the very same microsecond)
-- * Summary (same idea as the title)
--
-- The other fields are based on these, or are likely to be, or aren't expected
-- to be unique.
--
-- In the (hopefully very rare) case none of these is available, just assume we
-- never saw this ID before.
--
-- We won't be checking for cases the available fields change over time, e.g.
-- suddenly feed items get their IDs published in the XML. If the need ever
-- arises, code to handle this can be added.
data ItemID
    = ByID String | ByTitle String | ByTime String | BySummary String | Unique
    deriving Show

instance Eq ItemID where
    (ByID i)      == (ByID j)      = i == j
    (ByTitle i)   == (ByTitle j)   = i == j
    (ByTime i)    == (ByTime j)    = i == j
    (BySummary i) == (BySummary j) = i == j
    _             == _             = False

data FeedRecord = FeedRecord
    { feedName    :: String        -- A short identifier for easy reference
    , feedUrl     :: String        -- The feed's URL
    , feedOn      :: Bool          -- Whether the feed is active
    , feedPrevIDs :: [ItemID]      -- Previously collected feed item IDs
    , feedUpdated :: Maybe UTCTime -- Last time a feed item was published
    }
    deriving Show

data State = State
    { usecInterval     :: Int          -- Microseconds between polls
    , maxItemsPerVisit :: Int          -- Per-feed items to collect per poll
    , records          :: [FeedRecord] -- Per-feed state
    }
    deriving Show

-- | An error occuring while reading from a news feed.
data Error
    -- | Error while creating an HTTP request or receiving a response.
    = HttpError HttpException
    -- | Error while parsing the HTTP response body into feed content
    | FeedParsingFailed Label Url
    deriving Show

showFeedKind :: Feed -> String
showFeedKind (AtomFeed _) = "Atom"
showFeedKind (RSSFeed _)  = "RSS"
showFeedKind (RSS1Feed _) = "RSS1"
showFeedKind (XMLFeed _)  = "XML"

-- | A short one-line description of a feed.
showFeed :: Feed -> String
showFeed feed = unwords [kind, title, "by", author, "at", home]
    where
    kind = '(' : showFeedKind feed ++ ")"
    title = getFeedTitle feed
    author = fromMaybe none $ getFeedAuthor feed
    home = fromMaybe none $ getFeedHome feed
    none = "[?]"

-- | A short one-line description of a feed item.
showItem :: Item -> String
showItem item = unwords [title, "by", author, "at", date]
    where
    title = fromMaybe none $ getItemTitle item
    author = fromMaybe none $ getItemAuthor item
    date = fromMaybe none $ getItemDate item
    none = "[?]"

-- | An item collector which discards the item, i.e. does nothing.
collectorNull :: Label -> Url -> Feed -> Item -> IO ()
collectorNull _ _ _ _ = return ()

-- | An item collector which prints feed and item fields to @stdout@.
collectorPrint :: Label -> Url -> Feed -> Item -> IO ()
collectorPrint _label _url feed item = print feed >> print item

-- | An item collector which prints short friendly feed and item descriptions
-- to @stdout@.
collectorPretty :: Label -> Url -> Feed -> Item -> IO ()
collectorPretty _label _url feed item = do
    putStrLn $ showFeed feed
    putStrLn $ showItem item

-- | An item collector which writes friendly descriptions into a log file,
-- determining the log file's name using the function given as the first
-- argument (the feed label, i.e. second argument, is also used as the argument
-- for that function).
collectorLog :: (Label -> FilePath) -> Label -> Url -> Feed -> Item -> IO ()
collectorLog getPath label url feed item =
    withFile (getPath $ getFeedTitle feed) AppendMode $ \ h -> do
        hPutStrLn h $ label ++ " : " ++ url
        hPutStrLn h $ showFeed feed
        hPutStrLn h $ showItem item
        hPutChar h '\n'

-- Maximal number of item IDs to remember
maxNumIDs :: Int
maxNumIDs = 200

-- Find the value in the first 'Just' available in a list of 'Maybe's
findJust :: [Maybe a] -> Maybe a
findJust = listToMaybe . catMaybes

-- Using the available item fields, get the best hopefully-unique ID we can.
itemID :: Item -> ItemID
itemID i = fromMaybe Unique $ findJust
    [ fmap (ByID . snd) $ getItemId i
    , fmap ByTitle $ getItemTitle i
    , fmap ByTime $ getItemDate i
    , fmap BySummary $ getItemSummary i
    ]

-- Parse and get an item's date. If not available, return 'Nothing'.
-- The format examples may be partial, i.e. not demonstrate the formats fully.
--
-- Since the availability of item date isn't guaranteed, use 2 possible
-- representations. One, use the date if available. Two, otherwise, assume a
-- timeless value which always compares in a way causing us to find new items
-- to collect.
itemTime :: Item -> Maybe UTCTime
itemTime item =
    let mdate = getItemDate item
        dateParsers =
            -- Sun, 15 Nov 2015 02:45:26 -0200
            fmap zonedTimeToUTC . parseTimeRFC2822 :
            -- 2015-11-15T02:45:26-02:00
            fmap zonedTimeToUTC . parseTimeRFC3339 :
            -- 15 Nov 2015 02:45 -0200
            fmap zonedTimeToUTC . parseTimeRFC822 :
            map
                (parseTimeM True defaultTimeLocale)
                [ "%a, %d %b %G %T"          -- Sun, 15 Nov 2015 02:45:26
                , "%Y-%m-%d"                 -- 2015-11-15
                , "%e %b %Y"                 -- 15 Nov 2015
                , "%a, %e %b %Y %k:%M:%S %z" -- Sun, 15 Nov 2015 2:45:26 -0200
                , "%a, %e %b %Y %T %Z"       -- Sun, 15 Nov 2015 02:45:26 -0200
                ]
        results = maybe [] (\ date -> map ($ date) dateParsers) mdate
    in  findJust results

-- Check whether one 'ItemTime' is more recent than another, for the purpose of
-- identifying new feed items.
newerThan :: Maybe UTCTime -> Maybe UTCTime -> Bool
(Just u) `newerThan` (Just v) = u > v
_        `newerThan` _        = True

-- Find recent items we haven't collected yet
detectNewItems :: Int -> FeedRecord -> Feed -> ([Item], FeedRecord, IO ())
detectNewItems maxItems rec feed =
    let items = feedItems feed
        ids = map itemID items
        times = map itemTime items
        iids = zip3 items ids times
        new (_i, iid, t) =
            iid `notElem` feedPrevIDs rec  &&  t `newerThan` feedUpdated rec
        iidsAllNew = filter new iids
        iids' = drop (length iidsAllNew - maxItems) iidsAllNew
        (items', ids', _times') = unzip3 iids'
        rec' = rec
            { feedPrevIDs = take maxNumIDs $ ids' ++ feedPrevIDs rec
            , feedUpdated =
                case iids' of
                    (_i, _iid, t) : _ -> t
                    []                -> feedUpdated rec
            }
        report = do
            let showTime :: FormatTime t => t -> String
                showTime = formatTime defaultTimeLocale rfc822DateFormat
                updated' = fmap showTime . feedUpdated
                prevIDs = feedPrevIDs rec
                prevIDsS = take 5 prevIDs
                itemsS = take maxItems items
                iidsS = take maxItems iids
                prevIDsFinal = feedPrevIDs rec'
                prevIDsFinalS = take 5 prevIDsFinal
            h <- openFile "debug.log" AppendMode
            let line = hPutStrLn h
                nl = hPutChar h '\n'
                printIT (_, i, t) = line $ show t ++ " " ++ show i
            t <- getZonedTime
            line $ replicate 79 '-'
            line $ showTime t
            line $ replicate 79 '-'
            line $ "Label   " ++ feedName rec
            line $ "URL     " ++ feedUrl rec
            line "----------- (1) Before changes ------------"
            line $ "Active  " ++ show (feedOn rec) ++ " (should be True!)"
            line $ "Updated " ++ fromMaybe "[?]" (updated' rec)
            line $ "At most " ++ show maxItems ++ " are reported per visit"
            line $ showFeed feed
            line $ show (length prevIDs) ++ " previous item IDs logged"
            nl
            line $ "Most recent " ++ show (length prevIDsS) ++ " are:"
            mapM_ (hPrint h) prevIDsS
            nl
            line "------------ (2) While running ------------"
            line $ "Feed has " ++ show (length items) ++ " items"
            nl
            line $ "First " ++ show maxItems ++ " from the top are:"
            mapM_ (line . showItem) itemsS
            nl
            line "Their computed times and IDs are:"
            mapM_ printIT iidsS
            nl
            line "Out of all feed items, the following have newly seen IDs:"
            mapM_ printIT $ filter (\ (_, i, _) -> i `notElem` prevIDs) iids
            nl
            line "Out of them, the following are also newer than last update:"
            mapM_ printIT iidsAllNew
            nl
            line "--------------- (3) Result ----------------"
            line "Out of them, the following have been collected:"
            mapM_ printIT iids'
            nl
            line $ "Updated " ++ fromMaybe "[?]" (updated' rec')
            line $ show (length prevIDsFinal) ++ " previous item IDs logged"
            nl
            line $ "Most recent " ++ show (length prevIDsFinalS) ++ " are:"
            mapM_ (hPrint h) prevIDsFinalS
            nl
            hClose h
    in  (items', rec', report)

fetchRaw :: Manager -> String -> IO (Either HttpException String)
fetchRaw manager url =
    let action = do
            request <- parseUrl url
            response <- httpLbs request manager
            return $ Right $ BC.unpack $ responseBody response
        handler e = return $ Left (e :: HttpException)
    in  action `catch` handler

-- Try to download a feed from its URL
fetch :: Manager -> Label -> Url -> IO (Either Error Feed)
fetch manager label url = do
    ebody <- fetchRaw manager url
    return $ case ebody of
        Left err   -> Left $ HttpError err
        Right body ->
            case parseFeedString body of
                Just feed -> Right feed
                Nothing   -> Left $ FeedParsingFailed label url

-- Fill initial feed record
initRec :: MonadIO m
        => (Error -> m ())
        -> Manager
        -> Label
        -> Url
        -> m FeedRecord
initRec logError manager label url = do
    efeed <- liftIO $ fetch manager label url
    case efeed of
        Right feed ->
            let items = feedItems feed
            in  return $ FeedRecord
                    { feedName    = label
                    , feedUrl     = url
                    , feedOn      = True
                    , feedPrevIDs = map itemID $ take maxNumIDs items
                    , feedUpdated = listToMaybe items >>= itemTime
                    }
        Left e -> do
            logError e
            return $ FeedRecord
                { feedName    = label
                , feedUrl     = url
                , feedOn      = True
                , feedPrevIDs = []
                , feedUpdated = Nothing
                }

-- Execute a control command
exec :: MonadIO m => (Error -> m ()) -> Manager -> Command -> State -> m State
exec logError manager command state@State { records = rs } =
    case command of
        AddFeed label url       -> do
            rec <- initRec logError manager label url
            return state { records = rs ++ [rec] }
        RemoveFeed label        ->
            return state { records = filter ((== label) . feedName) rs }
        FeedActive label active ->
            let rs' = [r { feedOn = active } | r <- rs, feedName r == label]
            in  return state { records = rs' }
        SetInterval usec        -> return state { usecInterval = usec }
        SetMaxPerVisit nitems   -> return state { maxItemsPerVisit = nitems }

foldrM :: Monad m => (a -> b -> m b) -> b -> [a] -> m b
foldrM _ v []     = return v
foldrM f v (x:xs) = f x =<< foldrM f v xs

-- Execute all commands waiting in the queue
execAll :: MonadIO m
        => (Error -> m ())
        -> Manager
        -> CommandQueue
        -> State
        -> m State
execAll logError manager cq state = do
    cmds <- liftIO $ modifyMVar (cqMVar cq) $ \ l -> return ([], l)
    foldrM (exec logError manager) state cmds

run' :: (TimeUnit t, MonadIO m)
     => Bool
     -> (Label -> Url -> Feed -> Item -> m ())
     -> Maybe (Label -> Url -> Feed -> [Item] -> m ())
     -> (Error -> m ())
     -> Maybe CommandQueue
     -> t
     -> Int
     -> [(Label, Url)]
     -> m ()
run' debug collect mCollectMany logError mq interval maxPerVisit sources = do
    let collectMany
            = fromMaybe (\ l u f -> mapM_ $ collect l u f) mCollectMany
    manager <- liftIO $ newManager tlsManagerSettings
    initialRecords <-
        mapM (\ (label, url) -> initRec logError manager label url) sources
    let initialState = State
            { usecInterval     = fromInteger $ toMicroseconds interval
            , maxItemsPerVisit = maxPerVisit
            , records          = initialRecords
            }
        execCmds st =
            case mq of
                Just cq -> execAll logError manager cq st
                Nothing -> return st
        visit record = applyIf' record feedOn $ \ rec -> do
            efeed <- liftIO $ fetch manager (feedName rec) (feedUrl rec)
            case efeed of
                Right feed -> do
                    let (items, rec', report) =
                            detectNewItems maxPerVisit rec feed
                        ritems = reverse items
                    when debug $ liftIO report
                    collectMany (feedName rec) (feedUrl rec) feed ritems
                    return rec'
                Left e -> do
                    logError e
                    return rec
        loop state = do
            liftIO $ threadDelay $ usecInterval state
            recs <- mapM visit $ records state
            let stateCollected = state { records = recs }
            stateExec <- execCmds stateCollected
            loop stateExec
    loop initialState

-- | Watch feeds and perform the given action on received feed items.
run :: (TimeUnit t, MonadIO m)
    => (Label -> Url -> Feed -> Item -> m ())
    -- ^ Collector, i.e. action to perform when receiving a feed item.
    -> Maybe (Label -> Url -> Feed -> [Item] -> m ())
    -- ^ Action to perform when receiving feed items. This is just a chance to
    -- provide an efficient shortcut instead of repeated use of the previous
    -- parameter. If there is no such shortcut, simply pass 'Nothing'.
    -> (Error -> m ())
    -- ^ Error logging action. It will be called when an error occurs while
    -- trying to download and parse a feed. Such an error doesn't cause
    -- anything to stop. Execution simply goes on to read the next feed, and
    -- will try the erronous feed again in the next round.
    -> Maybe CommandQueue
    -- ^ A command queue you can use to change settings while the program runs.
    -- For example, you can add a new feed or change the interval between polls
    -- without relaunching 'run'.
    -> t
    -- ^ Time interval between visits of a watched feed.
    -> Int
    -- ^ Maximal number of items to collect per visit (if more are available,
    -- they will be collected in the next visit)
    -> [(Label, Url)]
    -- ^ List of short feed labels (for easy reference and logging), and their
    -- URLs to watch.
    -> m ()
run = run' False

-- | Like 'run', but writes detailed reports into a file named \"debug.log\".
-- This should help you understand which feeds items are being received and why
-- new items aren't being detected in the way you expect.
--
-- For some feeds, the new item detection method used by this library may fail.
-- In that case, please report the issue, and then the method can be updated
-- and extended to support more cases.
runDebug :: (TimeUnit t, MonadIO m)
         => (Label -> Url -> Feed -> Item -> m ())
         -> Maybe (Label -> Url -> Feed -> [Item] -> m ())
         -> (Error -> m ())
         -> Maybe CommandQueue
         -> t
         -> Int
         -> [(Label, Url)]
         -> m ()
runDebug = run' True

-- | Create a new empty command queue.
newCommandQueue :: IO CommandQueue
newCommandQueue = liftM CommandQueue $ newMVar []

-- | Send a command into the queue.
sendCommand :: CommandQueue -> Command -> IO ()
sendCommand cq cmd = modifyMVar_ (cqMVar cq) $ \ l -> return $ cmd : l

-- | Send a series of commands into the queue.
sendCommands :: CommandQueue -> [Command] -> IO ()
sendCommands cq cmds =
    modifyMVar_ (cqMVar cq) $ \ l -> return $ reverse cmds ++ l

-- | Add a new feed to watch.
addFeed :: Label -> Url -> Command
addFeed = AddFeed

-- | Remove a previously added feed.
removeFeed :: Label -> Command
removeFeed = RemoveFeed

-- | Set whether a given feed should be watched (active) or not (inactive).
feedActive :: Label -> Bool -> Command
feedActive = FeedActive

-- | Set the interval, in microseconds, between feed scans.
setInterval :: TimeUnit t => t -> Command
setInterval = SetInterval . fromInteger . toMicroseconds

-- | Set the maximal number of feed items to be collected per feed per scan. If
-- more new items are found, they wait for the next scan.
setMaxPerVisit :: Int -> Command
setMaxPerVisit = SetMaxPerVisit