{-# LANGUAGE OverloadedStrings , ScopedTypeVariables , StandaloneDeriving #-} {-| The CLTokens module describes non-overlapping classes of strings that are useful for disambiguating arguments to command line programs. Many common string formats -- environment variable assignments, URLs, option strings -- are recognized by this module's utilities. -} module System.Posix.ARX.CLI.CLTokens where import Prelude hiding (takeWhile) import Control.Applicative import Data.ByteString (ByteString) import Data.ByteString.Char8 () import Data.Either import Data.Map (Map) import qualified Data.Map as Map import Data.Attoparsec.ByteString.Char8 ( char8, choice, decimal, endOfInput, inClass, isDigit, parseOnly, Parser, satisfy, string, takeWhile, takeWhile1, try ) {-| Non-overlapping classes of command line argument strings. -} data Class = EnvBinding -- ^ An 'EnvBinding' has the form -- @=@. For example, -- @SENDIN=the_clowns@. | QualifiedPath -- ^ A 'QualifiedPath' is a file path starting with -- @/@, @./@, or @../@. | DashDash -- ^ A 'DashDash' is a string of two dashes, @--@, -- commonly used to indicate the end of options -- processing. | LongOption -- ^ A 'LongOption' is a string beginning with two -- dashes and then at least one non-dash. | Dash -- ^ A 'Dash' is a single dash, @-@, commonly used -- to indicate input from @stdin@ or output to -- @stdout@. | ShortOption -- ^ A 'ShortOption' is a beginning with a dash and -- then at least one non-dash. | URL -- ^ A 'URL' is a scheme, separated from the -- resource, represented as an arbitrary string, -- by @://@. The scheme consists of ASCII, -- lower-case letters and digits, and may be -- multi-part, with each part separated by a @+@ -- or @/@ (for example, @git+ssh@). An example -- URL: @http://example.com/?q=special@. | HexNum -- ^ A 'HexNum' is a sequence of hexadecimal -- digits, upper or lower case, beginning with -- @0x@; for example: @0x01a3@. | DecimalNum -- ^ A 'DecimalNum' is a string of decimal digits: -- @123123@. | Size -- ^ A 'Size' is a decimal number followed by a -- multiplicative suffix, in the manner of @dd@ -- or @head@. Note that counts in terms of bytes -- require @B@ (unlike @dd@ or @head@). For a -- full list of suffixes, see 'sizes' below. deriving instance Eq Class deriving instance Ord Class deriving instance Show Class {-| Determine if a particular 'ByteString' matches the given 'Class' of token. -} match :: Class -> ByteString -> Bool match = (e2b .) . parseOnly . recognizer where e2b (Left _) = False e2b (Right _) = True {-| Determine if a particular 'ByteString' matches any 'Class' of token. -} recognize :: ByteString -> Maybe Class recognize = e2m . parseOnly (choice recognizers) where e2m (Left _) = Nothing e2m (Right x) = Just x recognizeIt x = x <$ recognizer x recognizers = recognizeIt <$> [ EnvBinding, QualifiedPath, DashDash, LongOption, Dash, ShortOption, URL, HexNum, DecimalNum ] {-| A ByteString stand-in that demoes each token class. -} exemplar :: Class -> ByteString exemplar cls = case cls of EnvBinding -> "VAR=value" QualifiedPath -> "./qualified/path" DashDash -> "--" LongOption -> "--long-option" Dash -> "-" ShortOption -> "-shortopt" URL -> "scheme://url-to-resource" HexNum -> "0xA12FE" DecimalNum -> "0123456789" Size -> "4MiB" {-| The recognizer appropriate to each token class. Parses successfully if a the token class is recognized, returning '()'. Most token types are defined in terms of a prefix of the input -- for example, 'QualifiedPath' -- and the parsers for these tokens naturally return as soon as the prefix is recognized. -} recognizer :: Class -> Parser () recognizer cls = case cls of EnvBinding -> () <$ do satisfy varFirst takeWhile varBody char8 '=' QualifiedPath -> () <$ do string "/" <|> string "./" <|> string "../" DashDash -> string "--" *> endOfInput LongOption -> () <$ (string "--" >> satisfy (/= '-')) Dash -> char8 '-' *> endOfInput ShortOption -> () <$ (char8 '-' >> satisfy (/= '-')) URL -> () <$ do takeWhile1 isURLSchemeChar many $ do char8 '+' <|> char8 '/' takeWhile1 isURLSchemeChar string "://" HexNum -> string "0x" >> takeWhile1 isHexDigit *> endOfInput DecimalNum -> takeWhile1 isDigit *> endOfInput Size -> () <$ size schemeSeparator = char8 '+' <|> char8 '/' varFirst = inClass "a-zA-Z_" varBody = inClass "a-zA-Z_0-9" isLongOptionChar = inClass "a-zA-Z0-9-" isShortOptionChar = inClass "a-zA-Z0-9!?" isSchemeChar = inClass "a-z0-9" isHexDigit = inClass "0-9a-fA-F" isURLSchemeChar = inClass "a-z0-9" {-| A map from suffixes to sizes, following the conventions of command line tools (GNU @dd@ or @head@ and many others) as well as the standard for binary sizes established by the IEC. @ B = 1 K = KiB = 1024B kB = 1000B M = MiB = 1024K MB = 1000kB G = GiB = 1024M GB = 1000MB T = TiB = 1024G TB = 1000GB P = PiB = 1024T PB = 1000TB E = EiB = 1024P EB = 1000PB Z = ZiB = 1024E ZB = 1000EB Y = YiB = 1024Z YB = 1000ZB @ -} sizes :: Map ByteString Integer sizes = Map.fromList [ ("B", 1), ("K", 2^10), ("KiB", 2^10), ("kB", 10^03), ("M", 2^20), ("MiB", 2^20), ("MB", 10^06), ("G", 2^30), ("GiB", 2^30), ("GB", 10^09), ("T", 2^40), ("TiB", 2^40), ("TB", 10^12), ("P", 2^50), ("PiB", 2^50), ("PB", 10^15), ("E", 2^60), ("EiB", 2^60), ("EB", 10^18), ("Z", 2^70), ("ZiB", 2^70), ("ZB", 10^21), ("Y", 2^80), ("YiB", 2^80), ("YB", 10^24) ] {-| Parse a size, consuming the entire input string. -} size :: Parser Integer size = (*) <$> decimal <*> suffix where asSuffix (k, v) = v <$ try (string k <* endOfInput) suffix = choice (asSuffix <$> Map.toList sizes) {-| Parse a size, consuming the entire input string, with the final result bounded by the maximum of a 'Bounded' type. -} sizeBounded :: forall b . (Bounded b, Integral b) => Parser b sizeBounded = fromInteger . min (toInteger (maxBound :: b)) <$> size