-- | Tokenization breaks a 'String' into pieces of whitespace, -- constants, symbols, and identifiers. module Hpp.Tokens where import Data.Char (isAlphaNum, isDigit, isSpace) -- | Tokenization is 'words' except the white space is tagged rather -- than discarded. data Token = Important String -- ^ Identifiers, symbols, and constants | Other String -- ^ White space, etc. deriving (Eq,Ord,Show) -- | Extract the contents of a 'Token'. detok :: Token -> String detok (Important s) = s detok (Other s) = s {-# INLINE detok #-} -- | 'True' if the given 'Token' is 'Important'; 'False' otherwise. isImportant :: Token -> Bool isImportant (Important _) = True isImportant _ = False -- | 'True' if the given 'Token' is /not/ 'Important'; 'False' -- otherwise. notImportant :: Token -> Bool notImportant (Other _) = True notImportant _ = False -- | Return the contents of only 'Important' (non-space) tokens. importants :: [Token] -> [String] importants = map detok . filter isImportant -- | Trim 'Other' 'Token's from both ends of a list of 'Token's. trimUnimportant :: [Token] -> [Token] trimUnimportant = aux id . dropWhile (not . isImportant) where aux _ [] = [] aux acc (t@(Important _) : ts) = acc (t : aux id ts) aux acc (t@(Other _) : ts) = aux (acc . (t:)) ts -- | Is a 'Token' a newline character? newLine :: Token -> Bool newLine (Other "\n") = True newLine _ = False -- | Break a 'String' into space and non-whitespace runs. tokWords :: String -> [Token] tokWords [] = [] tokWords (c:cs) | isSpace c = let (spaces,rst) = break (not . isSpace) cs in Other (c : spaces) : tokWords rst | c == '\'' && isCharLit = goCharLit | c == '"' = flip skipLiteral cs $ \str rst -> Important (str []) : tokWords rst | otherwise = let (chars,rst) = break (not . validIdentifierChar) cs in Important (c:chars) : tokWords rst where (isCharLit, goCharLit) = case cs of (c':'\'':cs') -> (True, Important ['\'',c','\''] : tokWords cs') _ -> (False, []) -- | If you encounter a string literal, call this helper with a -- double-barreled continuation and the rest of your input. The -- continuation will expect the remainder of the string literal as the -- first argument, and the remaining input as the second argument. skipLiteral :: ((String -> String) -> String -> r) -> String -> r skipLiteral k = go ('"':) where go acc ('\\':'\\':cs) = go (acc . ("\\\\"++)) cs go acc ('\\':'"':cs) = go (acc . ("\\\""++)) cs go acc ('"':cs) = k (acc . ('"':)) cs go acc (c:cs) = go (acc . (c :)) cs go acc [] = k acc [] {-# INLINE skipLiteral #-} -- | @splits isDelimiter str@ tokenizes @str@ using @isDelimiter@ as a -- delimiter predicate. Leading whitespace is also stripped from -- tokens. splits :: (Char -> Bool) -> String -> [String] splits isDelim = filter (not . null) . go . dropWhile isSpace where go s = case break isDelim s of (h,[]) -> [dropWhile isSpace h] (h,d:t) -> dropWhile isSpace h : [d] : go t -- | Predicate on space characters based on something approximating -- valid identifier syntax. This is used to break apart non-space -- characters. validIdentifierChar :: Char -> Bool validIdentifierChar c = isAlphaNum c || c == '_' || c == '\'' -- | Something like @12E+FOO@ is a single pre-processor token, so -- @FOO@ should not be macro expanded. fixExponents :: [Token] -> [Token] fixExponents [] = [] fixExponents (Important (t1@(d1:_)):Important [c]:Important (d2:t2):ts) | elem c "-+" && isDigit d1 && elem (last t1) "eE" && isAlphaNum d2 = Important (t1++c:d2:t2) : fixExponents ts fixExponents (t:ts) = t : fixExponents ts -- | Break an input 'String' into a sequence of 'Tokens'. Warning: -- This may not exactly correspond to your target language's -- definition of a valid identifier! tokenize :: String -> [Token] tokenize = fixExponents . concatMap seps . tokWords where seps t@(Other _) = [t] seps t@(Important ('"':_)) = [t] seps t@(Important ('\'':_)) = [t] seps (Important s) = map Important $ splits (not . validIdentifierChar) s -- | Collapse a sequence of 'Tokens' back into a 'String'. @detokenize -- . tokenize == id@. detokenize :: [Token] -> String detokenize = concatMap detok {-# INLINE detokenize #-}