-- | -- Module : Cryptol.Parser.LexerUtils -- Copyright : (c) 2013-2016 Galois, Inc. -- License : BSD3 -- Maintainer : cryptol@galois.com -- Stability : provisional -- Portability : portable {-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE PatternGuards #-} module Cryptol.Parser.LexerUtils where import Cryptol.Parser.Position import Cryptol.Parser.Unlit(PreProc(None)) import Cryptol.Utils.PP import Cryptol.Utils.Panic import Data.Char(toLower,generalCategory,isAscii,ord,isSpace) import qualified Data.Char as Char import Data.Text(Text) import qualified Data.Text as T import Data.Word(Word8) import GHC.Generics (Generic) import Control.DeepSeq data Config = Config { cfgSource :: !FilePath -- ^ File that we are working on , cfgLayout :: !Layout -- ^ Settings for layout processing , cfgPreProc :: PreProc -- ^ Preprocessor settings , cfgAutoInclude :: [FilePath] -- ^ Implicit includes , cfgModuleScope :: Bool -- ^ When we do layout processing -- should we add a vCurly (i.e., are -- we parsing a list of things). } defaultConfig :: Config defaultConfig = Config { cfgSource = "" , cfgLayout = Layout , cfgPreProc = None , cfgAutoInclude = [] , cfgModuleScope = True } type Action = Config -> Position -> Text -> LexS -> (Maybe (Located Token), LexS) data LexS = Normal | InComment Bool Position ![Position] [Text] | InString Position Text | InChar Position Text startComment :: Bool -> Action startComment isDoc _ p txt s = (Nothing, InComment d p stack chunks) where (d,stack,chunks) = case s of Normal -> (isDoc, [], [txt]) InComment doc q qs cs -> (doc, q : qs, txt : cs) _ -> panic "[Lexer] startComment" ["in a string"] endComment :: Action endComment cfg p txt s = case s of InComment d f [] cs -> (Just (mkToken d f cs), Normal) InComment d _ (q:qs) cs -> (Nothing, InComment d q qs (txt : cs)) _ -> panic "[Lexer] endComment" ["outside comment"] where mkToken isDoc f cs = let r = Range { from = f, to = moves p txt, source = cfgSource cfg } str = T.concat $ reverse $ txt : cs tok = if isDoc then DocStr else BlockComment in Located { srcRange = r, thing = Token (White tok) str } addToComment :: Action addToComment _ _ txt s = (Nothing, InComment doc p stack (txt : chunks)) where (doc, p, stack, chunks) = case s of InComment d q qs cs -> (d,q,qs,cs) _ -> panic "[Lexer] addToComment" ["outside comment"] startEndComment :: Action startEndComment cfg p txt s = case s of Normal -> (Just tok, Normal) where tok = Located { srcRange = Range { from = p , to = moves p txt , source = cfgSource cfg } , thing = Token (White BlockComment) txt } InComment d p1 ps cs -> (Nothing, InComment d p1 ps (txt : cs)) _ -> panic "[Lexer] startEndComment" ["in string or char?"] startString :: Action startString _ p txt _ = (Nothing,InString p txt) endString :: Action endString cfg pe txt s = case s of InString ps str -> (Just (mkToken ps str), Normal) _ -> panic "[Lexer] endString" ["outside string"] where parseStr s1 = case reads s1 of [(cs, "")] -> StrLit cs _ -> Err InvalidString mkToken ps str = Located { srcRange = Range { from = ps , to = moves pe txt , source = cfgSource cfg } , thing = Token { tokenType = parseStr (T.unpack tokStr) , tokenText = tokStr } } where tokStr = str `T.append` txt addToString :: Action addToString _ _ txt s = case s of InString p str -> (Nothing,InString p (str `T.append` txt)) _ -> panic "[Lexer] addToString" ["outside string"] startChar :: Action startChar _ p txt _ = (Nothing,InChar p txt) endChar :: Action endChar cfg pe txt s = case s of InChar ps str -> (Just (mkToken ps str), Normal) _ -> panic "[Lexer] endString" ["outside character"] where parseChar s1 = case reads s1 of [(cs, "")] -> ChrLit cs _ -> Err InvalidChar mkToken ps str = Located { srcRange = Range { from = ps , to = moves pe txt , source = cfgSource cfg } , thing = Token { tokenType = parseChar (T.unpack tokStr) , tokenText = tokStr } } where tokStr = str `T.append` txt addToChar :: Action addToChar _ _ txt s = case s of InChar p str -> (Nothing,InChar p (str `T.append` txt)) _ -> panic "[Lexer] addToChar" ["outside character"] mkIdent :: Action mkIdent cfg p s z = (Just Located { srcRange = r, thing = Token t s }, z) where r = Range { from = p, to = moves p s, source = cfgSource cfg } t = Ident [] s mkQualIdent :: Action mkQualIdent cfg p s z = (Just Located { srcRange = r, thing = Token t s}, z) where r = Range { from = p, to = moves p s, source = cfgSource cfg } t = Ident ns i (ns,i) = splitQual s mkQualOp :: Action mkQualOp cfg p s z = (Just Located { srcRange = r, thing = Token t s}, z) where r = Range { from = p, to = moves p s, source = cfgSource cfg } t = Op (Other ns i) (ns,i) = splitQual s emit :: TokenT -> Action emit t cfg p s z = (Just Located { srcRange = r, thing = Token t s }, z) where r = Range { from = p, to = moves p s, source = cfgSource cfg } emitS :: (Text -> TokenT) -> Action emitS t cfg p s z = emit (t s) cfg p s z -- | Split out the prefix and name part of an identifier/operator. splitQual :: T.Text -> ([T.Text], T.Text) splitQual t = case splitNS (T.filter (not . isSpace) t) of [] -> panic "[Lexer] mkQualIdent" ["invalid qualified name", show t] [i] -> ([], i) xs -> (init xs, last xs) where -- split on the namespace separator, `::` splitNS s = case T.breakOn "::" s of (l,r) | T.null r -> [l] | otherwise -> l : splitNS (T.drop 2 r) -------------------------------------------------------------------------------- numToken :: Int {- ^ base -} -> Text -> TokenT numToken rad ds = Num (toVal ds') rad (T.length ds') where ds' = T.filter (/= '_') ds toVal = T.foldl' (\x c -> toInteger rad * x + fromDigit c) 0 fromDigit :: Char -> Integer fromDigit x' | 'a' <= x && x <= 'z' = toInteger (10 + fromEnum x - fromEnum 'a') | otherwise = toInteger (fromEnum x - fromEnum '0') where x = toLower x' -- XXX: For now we just keep the number as a rational. -- It might be better to keep the exponent representation, -- to avoid making huge numbers, and using up all the memory though... fnumToken :: Int -> Text -> TokenT fnumToken rad ds = Frac ((wholenNum + fracNum) * (eBase ^^ expNum)) rad where radI = fromIntegral rad :: Integer radR = fromIntegral rad :: Rational (whole,rest) = T.break (== '.') ds digits = T.filter (/= '_') expSym e = if rad == 10 then toLower e == 'e' else toLower e == 'p' (frac,mbExp) = T.break expSym (T.drop 1 rest) wholenNum = fromInteger $ T.foldl' (\x c -> radI * x + fromDigit c) 0 $ digits whole fracNum = T.foldl' (\x c -> (x + fromInteger (fromDigit c)) / radR) 0 $ T.reverse $ digits frac expNum = case T.uncons mbExp of Nothing -> 0 :: Integer Just (_,es) -> case T.uncons es of Just ('+', more) -> read $ T.unpack more _ -> read $ T.unpack es eBase = if rad == 10 then 10 else 2 :: Rational ------------------------------------------------------------------------------- data AlexInput = Inp { alexPos :: !Position , alexInputPrevChar :: !Char , input :: !Text } deriving Show alexGetByte :: AlexInput -> Maybe (Word8, AlexInput) alexGetByte i = do (c,rest) <- T.uncons (input i) let i' = i { alexPos = move (alexPos i) c, input = rest } b = byteForChar c return (b,i') data Layout = Layout | NoLayout -------------------------------------------------------------------------------- -- | Drop white-space tokens from the input. dropWhite :: [Located Token] -> [Located Token] dropWhite = filter (notWhite . tokenType . thing) where notWhite (White w) = w == DocStr notWhite _ = True data Block = Virtual Int -- ^ Virtual layout block | Explicit TokenT -- ^ An explicit layout block, expecting this ending -- token. deriving (Show) isExplicit :: Block -> Bool isExplicit Explicit{} = True isExplicit Virtual{} = False startsLayout :: TokenT -> Bool startsLayout (KW KW_where) = True startsLayout (KW KW_private) = True startsLayout (KW KW_parameter) = True startsLayout _ = False -- Add separators computed from layout layout :: Config -> [Located Token] -> [Located Token] layout cfg ts0 = loop False implicitScope [] ts0 where (_pos0,implicitScope) = case ts0 of t : _ -> (from (srcRange t), cfgModuleScope cfg && tokenType (thing t) /= KW KW_module) _ -> (start,False) loop :: Bool -> Bool -> [Block] -> [Located Token] -> [Located Token] loop afterDoc startBlock stack (t : ts) | startsLayout ty = toks ++ loop False True stack' ts -- We don't do layout within these delimeters | Sym ParenL <- ty = toks ++ loop False False (Explicit (Sym ParenR) : stack') ts | Sym CurlyL <- ty = toks ++ loop False False (Explicit (Sym CurlyR) : stack') ts | Sym BracketL <- ty = toks ++ loop False False (Explicit (Sym BracketR) : stack') ts | EOF <- ty = toks | White DocStr <- ty = toks ++ loop True False stack' ts | otherwise = toks ++ loop False False stack' ts where ty = tokenType (thing t) pos = srcRange t (toks,offStack) | afterDoc = ([t], stack) | otherwise = offsides startToks t stack -- add any block start tokens, and push a level on the stack (startToks,stack') | startBlock && ty == EOF = ( [ virt cfg (to pos) VCurlyR , virt cfg (to pos) VCurlyL ] , offStack ) | startBlock = ( [ virt cfg (to pos) VCurlyL ], Virtual (col (from pos)) : offStack ) | otherwise = ( [], offStack ) loop _ _ _ [] = panic "[Lexer] layout" ["Missing EOF token"] offsides :: [Located Token] -> Located Token -> [Block] -> ([Located Token], [Block]) offsides startToks t = go startToks where go virts stack = case stack of -- delimit or close a layout block Virtual c : rest -- commas only close to an explicit marker, so if there is none, the -- comma doesn't close anything | Sym Comma == ty -> if any isExplicit rest then go (virt cfg (to pos) VCurlyR : virts) rest else done virts stack | closingToken -> go (virt cfg (to pos) VCurlyR : virts) rest | col (from pos) == c -> done (virt cfg (to pos) VSemi : virts) stack | col (from pos) < c -> go (virt cfg (to pos) VCurlyR : virts) rest -- close an explicit block Explicit close : rest | close == ty -> done virts rest | Sym Comma == ty -> done virts stack _ -> done virts stack ty = tokenType (thing t) pos = srcRange t done ts s = (reverse (t:ts), s) closingToken = ty `elem` [ Sym ParenR, Sym BracketR, Sym CurlyR ] virt :: Config -> Position -> TokenV -> Located Token virt cfg pos x = Located { srcRange = Range { from = pos , to = pos , source = cfgSource cfg } , thing = t } where t = Token (Virt x) $ case x of VCurlyL -> "beginning of layout block" VCurlyR -> "end of layout block" VSemi -> "layout block separator" -------------------------------------------------------------------------------- data Token = Token { tokenType :: !TokenT, tokenText :: !Text } deriving (Show, Generic, NFData) -- | Virtual tokens, inserted by layout processing. data TokenV = VCurlyL| VCurlyR | VSemi deriving (Eq, Show, Generic, NFData) data TokenW = BlockComment | LineComment | Space | DocStr deriving (Eq, Show, Generic, NFData) data TokenKW = KW_else | KW_extern | KW_fin | KW_if | KW_private | KW_include | KW_inf | KW_lg2 | KW_lengthFromThen | KW_lengthFromThenTo | KW_max | KW_min | KW_module | KW_newtype | KW_pragma | KW_property | KW_then | KW_type | KW_where | KW_let | KW_x | KW_import | KW_as | KW_hiding | KW_infixl | KW_infixr | KW_infix | KW_primitive | KW_parameter | KW_constraint | KW_Prop deriving (Eq, Show, Generic, NFData) -- | The named operators are a special case for parsing types, and 'Other' is -- used for all other cases that lexed as an operator. data TokenOp = Plus | Minus | Mul | Div | Exp | Mod | Equal | LEQ | GEQ | Complement | Hash | At | Other [T.Text] T.Text deriving (Eq, Show, Generic, NFData) data TokenSym = Bar | ArrL | ArrR | FatArrR | Lambda | EqDef | Comma | Semi | Dot | DotDot | DotDotDot | Colon | BackTick | ParenL | ParenR | BracketL | BracketR | CurlyL | CurlyR | TriL | TriR | Underscore deriving (Eq, Show, Generic, NFData) data TokenErr = UnterminatedComment | UnterminatedString | UnterminatedChar | InvalidString | InvalidChar | LexicalError deriving (Eq, Show, Generic, NFData) data TokenT = Num !Integer !Int !Int -- ^ value, base, number of digits | Frac !Rational !Int -- ^ value, base. | ChrLit !Char -- ^ character literal | Ident ![T.Text] !T.Text -- ^ (qualified) identifier | StrLit !String -- ^ string literal | KW !TokenKW -- ^ keyword | Op !TokenOp -- ^ operator | Sym !TokenSym -- ^ symbol | Virt !TokenV -- ^ virtual token (for layout) | White !TokenW -- ^ white space token | Err !TokenErr -- ^ error token | EOF deriving (Eq, Show, Generic, NFData) instance PP Token where ppPrec _ (Token _ s) = text (T.unpack s) -- | Collapse characters into a single Word8, identifying ASCII, and classes of -- unicode. This came from: -- -- https://github.com/glguy/config-value/blob/master/src/Config/LexerUtils.hs -- -- Which adapted: -- -- https://github.com/ghc/ghc/blob/master/compiler/parser/Lexer.x byteForChar :: Char -> Word8 byteForChar c | c <= '\6' = non_graphic | isAscii c = fromIntegral (ord c) | otherwise = case generalCategory c of Char.LowercaseLetter -> lower Char.OtherLetter -> lower Char.UppercaseLetter -> upper Char.TitlecaseLetter -> upper Char.DecimalNumber -> digit Char.OtherNumber -> digit Char.ConnectorPunctuation -> symbol Char.DashPunctuation -> symbol Char.OtherPunctuation -> symbol Char.MathSymbol -> symbol Char.CurrencySymbol -> symbol Char.ModifierSymbol -> symbol Char.OtherSymbol -> symbol Char.Space -> sp Char.ModifierLetter -> other Char.NonSpacingMark -> other Char.SpacingCombiningMark -> other Char.EnclosingMark -> other Char.LetterNumber -> other Char.OpenPunctuation -> other Char.ClosePunctuation -> other Char.InitialQuote -> other Char.FinalQuote -> tick _ -> non_graphic where non_graphic = 0 upper = 1 lower = 2 digit = 3 symbol = 4 sp = 5 other = 6 tick = 7