{-# LANGUAGE BangPatterns #-} {-# LANGUAGE DuplicateRecordFields #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE TypeApplications #-} {-# LANGUAGE UnboxedTuples #-} {- | Parse RFC 3164 messages. For example: > <133>Feb 25 14:09:07 webserver syslogd: restart > <0>Oct 22 10:52:01 scapegoat.dmz.example.org sched[0]: That's All Folks! This library assumes that the @TAG@ field described by section 5.3 of RFC 3164 is a process name. It also assumes that the optional bracketed number that follows it is a process id. This library also addresses three common extensions to RFC 3164: * Some vendors include a year after the timestamp. For example: @<14>Oct 15 11:14:59 2019 example.com ...@. When present, the year is parsed and provided to the user. * Some vendors include a priority that preceeds the process name. For example: @<133>Aug 10 09:05:14 my-host notice tmsh[4726]: ...@. The Linux man page for @syslog.conf@ lists these options for priority: @debug@, @info@, @notice@, @warning@, @warn@, @err@, @error@, @crit@, @alert@, @emerg@, @panic@. If a process name begins with any of these keywords (followed by a space), the keyword and the trailing space are removed from the process name, and the keyword is made available in the @priority@ field. * Cisco ASAs omit the hostname sometimes. This is totally bizarre and leads to messages that looks like: @<190>Jun 08 2022 14:46:28: message@. In this case, the hostname is set to the empty string. -} module Syslog.Bsd ( -- * Types Message (..) , Process (..) , Timestamp (..) -- * Full Decode , decode , parser -- * Parsing Fragments , takePriority , takeTimestamp , takeHostname , takeProcess ) where import Prelude hiding (id) import Control.Monad (when) import Data.Bytes.Parser (Parser) import Data.Bytes.Types (Bytes (Bytes)) import Data.Word (Word32, Word8) import qualified Chronos import qualified Data.Bytes as Bytes import qualified Data.Bytes.Parser as Parser import qualified Data.Bytes.Parser.Latin as Latin import qualified Data.Bytes.Parser.Unsafe as Unsafe import qualified Data.Bytes.Text.Latin1 as Latin1 import qualified Data.Maybe.Unpacked.Numeric.Word32 as Word32 data Message = Message { priority :: !Word32 , timestamp :: !Timestamp , hostname :: {-# UNPACK #-} !Bytes , process :: !(Maybe Process) , message :: {-# UNPACK #-} !Bytes } deriving (Show) data Timestamp = Timestamp { month :: !Chronos.Month , day :: !Chronos.DayOfMonth , hour :: !Word8 , minute :: !Word8 , second :: !Word8 , year :: {-# UNPACK #-} !Word32.Maybe -- ^ Section 5.1 of RFC 3164 notes that some software appends -- a four-character year after the time of day. Since hostnames -- cannot start with digits, we can parse this unambiguously. We -- extend RFC 3164 to handle these nonstandard years. } deriving (Show) data Process = Process { priority :: {-# UNPACK #-} !Bytes -- ^ Priority is nonstandard. This field is the empty byte sequence -- when the priority is not present. , name :: {-# UNPACK #-} !Bytes , id :: {-# UNPACK #-} !Word32.Maybe } deriving (Show) -- | Run the RFC 3164 parser. See 'parser'. decode :: Bytes -> Maybe Message decode = Parser.parseBytesMaybe parser {- | Parse a RFC 3164 message. Note that this is just @takePriority@, @takeTimestamp@, @takeHostname, and @takeProcess@ called in sequence, followed by skipping whitespace and then treating the remaining input as the original message. -} parser :: Parser () s Message parser = do priority <- takePriority () timestamp <- takeTimestamp () Latin.trySatisfy (== ':') >>= \case True -> do Latin.skipChar ' ' message <- Parser.remaining pure Message {priority, timestamp, hostname = Bytes.empty, process = Nothing, message} False -> do hostname <- takeHostname () -- Watchguard includes a serial number and an ISO8601-encoded datetime -- in parenthesis before the process name. If we detect this, we skip over it. -- Note that, in compliant BSD-style syslog, there is no way for -- an open parenthesis to appear in this position. So, by doing -- this, we do not reject any good logs. Latin.peek' () >>= \case c | c >= 'A' && c <= 'Z' -> Parser.orElse ( do Latin.skipWhile (\x -> (x >= 'A' && x <= 'Z') || (x >= '0' && x <= '9')) Latin.char () ' ' Latin.char () '(' Latin.skipDigits1 () Latin.char () '-' Latin.skipDigits1 () Latin.char () '-' Latin.skipDigits1 () Latin.char () 'T' Latin.skipDigits1 () Latin.char () ':' Latin.skipDigits1 () Latin.char () ':' Latin.skipDigits1 () Latin.char () ')' Latin.char () ' ' ) (pure ()) _ -> pure () process <- Latin.trySatisfy (== ':') >>= \case True -> pure Nothing False -> do p <- takeProcess () pure (Just p) Latin.skipChar ' ' message <- Parser.remaining pure Message {priority, timestamp, hostname, process, message} {- | Consume the angle-bracketed priority. RFC 3164 does not allow a space to follow the priority, so this does not consume a trailing space. -} takePriority :: e -> Parser e s Word32 takePriority e = do Latin.char e '<' priority <- Latin.decWord32 e Latin.char e '>' pure priority {- | Consume the hostname and the space that follows it. Returns the hostname. -} takeHostname :: e -> Parser e s Bytes takeHostname e = -- TODO: This should actually use a takeWhile1. Latin.takeTrailedBy e ' ' {- | Consume the timestamp and the trailing space character if a trailing space exists. Returns the parsed timestamp. This allows two extensions to the RFC 3164 datetime format. The year may be present either right after the day of the month or after the time of day. -} takeTimestamp :: e -> Parser e s Timestamp takeTimestamp e = do monthBytes <- Parser.take e 3 month <- case resolveMonth monthBytes of Chronos.Month 12 -> Parser.fail e m -> pure m -- There might be two spaces here since single-digit days get -- padded with a space. Latin.skipChar1 e ' ' dayRaw <- Latin.decWord8 e day <- if dayRaw < 32 then pure (Chronos.DayOfMonth (fromIntegral dayRaw)) else Parser.fail e Latin.char e ' ' hourOrYear <- Latin.decWord32 e Latin.any e >>= \case -- We interpret the number as a year if it is followed by -- a space, and we interpret it as an hour if it is followed -- by a colon. ' ' -> do hour <- Latin.decWord8 e when (hour > 23) (Parser.fail e) Latin.char e ':' minute <- Latin.decWord8 e when (minute > 59) (Parser.fail e) Latin.char e ':' second <- Latin.decWord8 e when (second > 59) (Parser.fail e) _ <- Latin.trySatisfy (== ' ') pure Timestamp {month, day, hour, minute, second, year = Word32.just hourOrYear} ':' -> do when (hourOrYear > 23) (Parser.fail e) let hour = fromIntegral @Word32 @Word8 hourOrYear minute <- Latin.decWord8 e when (minute > 59) (Parser.fail e) Latin.char e ':' second <- Latin.decWord8 e when (second > 59) (Parser.fail e) Latin.trySatisfy (== ' ') >>= \case False -> pure Timestamp {month, day, hour, minute, second, year = Word32.nothing} True -> do -- The only good way to allow a year is with backtracking. We do not -- learn until we encounter the space following the decimal number -- whether it was a year or part of a hostname (likely an ip address). Parser.orElse ( do y <- Latin.decWord32 e Latin.char e ' ' pure Timestamp {month, day, hour, minute, second, year = Word32.just y} ) (pure Timestamp {month, day, hour, minute, second, year = Word32.nothing}) _ -> Parser.fail e {- | Take the process name and the process id and consume the colon that follows them. Does not consume any space after the colon. -} takeProcess :: e -> Parser e s Process takeProcess e = do processStart <- Unsafe.cursor hasPid <- Parser.skipTrailedBy2 e 0x3A 0x5B processEndSucc <- Unsafe.cursor arr <- Unsafe.expose let name0 = Bytes arr processStart ((processEndSucc - 1) - processStart) !(# name, priority #) = case Bytes.split1 0x20 name0 of Just (pre, post) | Latin1.equals3 'e' 'r' 'r' pre -> (# post, pre #) | Latin1.equals4 'c' 'r' 'i' 't' pre -> (# post, pre #) | Latin1.equals4 'i' 'n' 'f' 'o' pre -> (# post, pre #) | Latin1.equals4 'w' 'a' 'r' 'n' pre -> (# post, pre #) | Latin1.equals5 'a' 'l' 'e' 'r' 't' pre -> (# post, pre #) | Latin1.equals5 'd' 'e' 'b' 'u' 'g' pre -> (# post, pre #) | Latin1.equals5 'e' 'm' 'e' 'r' 'g' pre -> (# post, pre #) | Latin1.equals5 'e' 'r' 'r' 'o' 'r' pre -> (# post, pre #) | Latin1.equals5 'p' 'a' 'n' 'i' 'c' pre -> (# post, pre #) | Latin1.equals6 'n' 'o' 't' 'i' 'c' 'e' pre -> (# post, pre #) | Latin1.equals7 'w' 'a' 'r' 'n' 'i' 'n' 'g' pre -> (# post, pre #) _ -> (# name0, Bytes arr 0 0 #) case hasPid of False -> pure Process {priority, name, id = Word32.nothing} True -> do pid <- Latin.decWord32 e Latin.char2 e ']' ':' pure Process {priority, name, id = Word32.just pid} -- Precondition: length of bytes is 3 resolveMonth :: Bytes -> Chronos.Month resolveMonth b | Latin1.equals3 'A' 'p' 'r' b = Chronos.april | Latin1.equals3 'A' 'u' 'g' b = Chronos.august | Latin1.equals3 'D' 'e' 'c' b = Chronos.december | Latin1.equals3 'F' 'e' 'b' b = Chronos.february | Latin1.equals3 'J' 'a' 'n' b = Chronos.january | Latin1.equals3 'J' 'u' 'l' b = Chronos.july | Latin1.equals3 'J' 'u' 'n' b = Chronos.june | Latin1.equals3 'M' 'a' 'r' b = Chronos.march | Latin1.equals3 'M' 'a' 'y' b = Chronos.may | Latin1.equals3 'N' 'o' 'v' b = Chronos.november | Latin1.equals3 'O' 'c' 't' b = Chronos.october | Latin1.equals3 'S' 'e' 'p' b = Chronos.september | otherwise = Chronos.Month 12