{ {-# LANGUAGE DeriveFunctor #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DeriveTraversable #-} {-# LANGUAGE StandaloneDeriving #-} {-# LANGUAGE StrictData #-} {-# OPTIONS_GHC -Wno-unused-imports #-} module Language.Cimple.Lexer ( Alex , AlexPosn (..) , alexError , alexScanTokens , alexMonadScan , Lexeme (..) , lexemeClass , lexemePosn , lexemeText , lexemeLine , runAlex ) where import Data.Aeson (FromJSON, ToJSON) import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as LBS import Data.Text (Text) import qualified Data.Text as Text import qualified Data.Text.Encoding as Text import GHC.Generics (Generic) import Language.Cimple.Tokens (LexemeClass (..)) } %wrapper "monad-bytestring" tokens :- -- Ignore attributes. <0> "GNU_PRINTF" { mkL KwGnuPrintf } <0> "VLA" { mkL KwVla } -- Winapi functions. <0> "FormatMessageA" { mkL IdVar } <0> "GetAdaptersInfo" { mkL IdVar } <0> "GetSystemTimeAsFileTime" { mkL IdVar } <0> "GetTickCount" { mkL IdVar } <0> "LocalFree" { mkL IdVar } <0> "QueryPerformanceCounter" { mkL IdVar } <0> "QueryPerformanceFrequency" { mkL IdVar } <0> "SecureZeroMemory" { mkL IdVar } <0> "WSAAddressToString" { mkL IdVar } <0> "WSACleanup" { mkL IdVar } <0> "WSAGetLastError" { mkL IdVar } <0> "WSAStartup" { mkL IdVar } <0> "WSAStringToAddress" { mkL IdVar } -- Winapi struct members. <0> "GatewayList" { mkL IdVar } <0> "IpAddressList" { mkL IdVar } <0> "IpAddress" { mkL IdVar } <0> "IpMask" { mkL IdVar } <0> "Next" { mkL IdVar } <0> "QuadPart" { mkL IdVar } <0> "String" { mkL IdVar } -- Windows typedefs. <0> "DWORD" { mkL IdStdType } <0> "FILETIME" { mkL IdStdType } <0> "INT" { mkL IdStdType } <0> "LPSOCKADDR" { mkL IdStdType } <0> "IP_ADAPTER_INFO" { mkL IdStdType } <0> "LPTSTR" { mkL IdStdType } <0> "u_long" { mkL IdStdType } <0> "LARGE_INTEGER" { mkL IdStdType } <0> "SOCKET" { mkL IdStdType } <0> "WSADATA" { mkL IdStdType } -- System struct types. <0> "addrinfo" { mkL IdSueType } <0> "ifconf" { mkL IdSueType } <0> "ifreq" { mkL IdSueType } <0> "epoll_event" { mkL IdSueType } <0> "ev_async" { mkL IdSueType } <0> "ev_io" { mkL IdSueType } <0> "ev_loop" { mkL IdSueType } <0> "fd_set" { mkL IdSueType } <0> "in_addr" { mkL IdSueType } <0> "in6_addr" { mkL IdSueType } <0> "ipv6_mreq" { mkL IdSueType } <0> "sockaddr" { mkL IdSueType } <0> "sockaddr_in" { mkL IdSueType } <0> "sockaddr_in6" { mkL IdSueType } <0> "sockaddr_storage" { mkL IdSueType } <0> "timespec" { mkL IdSueType } <0> "timeval" { mkL IdSueType } -- Msgpack struct types. <0> "msgpack_iovec" { mkL IdSueType } <0> "msgpack_object_array" { mkL IdSueType } <0> "msgpack_object_bin" { mkL IdSueType } <0> "msgpack_object_ext" { mkL IdSueType } <0> "msgpack_object_kv" { mkL IdSueType } <0> "msgpack_object_map" { mkL IdSueType } <0> "msgpack_object" { mkL IdSueType } <0> "msgpack_object_str" { mkL IdSueType } <0> "msgpack_object_type" { mkL IdSueType } <0> "msgpack_packer" { mkL IdSueType } <0> "msgpack_packer_write" { mkL IdFuncType } <0> "msgpack_sbuffer" { mkL IdSueType } <0> "msgpack_timestamp" { mkL IdSueType } <0> "msgpack_unpacked" { mkL IdSueType } <0> "msgpack_unpacker" { mkL IdSueType } <0> "msgpack_unpack_return" { mkL IdSueType } <0> "msgpack_vrefbuffer" { mkL IdSueType } <0> "msgpack_zbuffer" { mkL IdSueType } <0> "msgpack_zone" { mkL IdSueType } -- Sodium constants. <0,ppSC> "crypto_"[a-z0-9_]+[A-Z][A-Z0-9_]* { mkL IdConst } -- Standard C (ish). defined { mkL PpDefined } \"[^\"]*\" { mkL LitString } \n { mkL PpNewline `andBegin` 0 } \\\n ; " " ; $white { mkE ErrorToken } "//!TOKSTYLE+" { mkL IgnEnd `andBegin` 0 } ([^\/]+|.|\n) { mkL IgnBody } <0,ppSC> "//"\n ; <0,ppSC> "// ".* ; <0> [\ \n]+ ; <0> $white { mkE ErrorToken } <0> "//!TOKSTYLE-" { mkL IgnStart `andBegin` ignoreSC } <0> "/*!" { mkL CmtStartCode } <0> "*/" { mkL CmtEnd } <0> "/*" { mkL CmtStart `andBegin` cmtSC } <0> "/**" { mkL CmtStartDoc `andBegin` cmtSC } <0> "/** @{" { mkL CmtStartDocSection `andBegin` cmtSC } <0> "/** @} */" { mkL CmtEndDocSection } <0> "/**""*"+ { mkL CmtStartBlock `andBegin` cmtSC } <0,cmtSC> \"(\\.|[^\"])*\" { mkL LitString } <0> '(\\|[^'])*' { mkL LitChar } <0> "<"[a-z0-9\.\/_]+">" { mkL LitSysInclude } <0> "#if" { mkL PpIf `andBegin` ppSC } <0> "#ifdef" { mkL PpIfdef } <0> "#ifndef" { mkL PpIfndef } <0> "#elif" { mkL PpElif `andBegin` ppSC } <0> "#else" { mkL PpElse } <0> "#endif" { mkL PpEndif } <0> "#define" { mkL PpDefine `andBegin` ppSC } <0> "#undef" { mkL PpUndef } <0> "#include" { mkL PpInclude } <0,ppSC> "tox_"?"bitwise" { mkL KwBitwise } <0,ppSC> "tox_"?"force" { mkL KwForce } <0,ppSC> "tox_"?"non_null" { mkL KwNonNull } <0,ppSC> "tox_"?"nullable" { mkL KwNullable } <0,ppSC> "tox_"?"owner" { mkL KwOwner } <0,ppSC> "break" { mkL KwBreak } <0,ppSC> "case" { mkL KwCase } <0,ppSC> "const" { mkL KwConst } <0,ppSC> "continue" { mkL KwContinue } <0,ppSC> "default" { mkL KwDefault } <0,ppSC> "do" { mkL KwDo } <0,ppSC> "else" { mkL KwElse } <0,ppSC> "enum" { mkL KwEnum } <0,ppSC> "extern" { mkL KwExtern } <0,ppSC> "for" { mkL KwFor } <0,ppSC> "goto" { mkL KwGoto } <0,ppSC> "if" { mkL KwIf } <0,ppSC> "return" { mkL KwReturn } <0,ppSC> "sizeof" { mkL KwSizeof } <0,ppSC> "static" { mkL KwStatic } <0,ppSC> "static_assert" { mkL KwStaticAssert } <0,ppSC> "struct" { mkL KwStruct } <0,ppSC> "switch" { mkL KwSwitch } <0,ppSC> "typedef" { mkL KwTypedef } <0,ppSC> "union" { mkL KwUnion } <0,ppSC> "void" { mkL KwVoid } <0,ppSC> "while" { mkL KwWhile } <0,ppSC> "bool" { mkL IdStdType } <0,ppSC> "char" { mkL IdStdType } <0,ppSC> "double" { mkL IdStdType } <0,ppSC> "float" { mkL IdStdType } <0,ppSC> "int" { mkL IdStdType } <0,ppSC> "long int" { mkL IdStdType } <0,ppSC> "long signed int" { mkL IdStdType } <0,ppSC> "long" { mkL IdStdType } <0,ppSC> "signed int" { mkL IdStdType } <0,ppSC> "unsigned int" { mkL IdStdType } <0,ppSC> "unsigned long" { mkL IdStdType } <0,ppSC> "unsigned long long" { mkL IdStdType } <0,ppSC> "unsigned" { mkL IdStdType } <0,ppSC> "va_list" { mkL IdStdType } <0,ppSC> "false" { mkL LitFalse } <0,ppSC> "true" { mkL LitTrue } <0,ppSC> "__func__" { mkL IdVar } <0,ppSC> "nullptr" { mkL IdConst } <0,ppSC> "__"[a-zA-Z]+"__"? { mkL IdConst } <0,ppSC> [A-Z][A-Z0-9_]{1,2} { mkL IdSueType } <0,ppSC> _*[A-Z][A-Z0-9_]* { mkL IdConst } <0,ppSC> [A-Z][A-Za-z0-9_]*[a-z][A-Za-z0-9_]* { mkL IdSueType } <0,ppSC> "cmp_"[a-z][a-z0-9_]*_[suet] { mkL IdSueType } <0,ppSC> [a-z][a-z0-9_]*_t { mkL IdStdType } <0,ppSC> [a-z][a-z0-9_]*_cb { mkL IdFuncType } <0,ppSC> "cmp_"("reader"|"writer"|"skipper") { mkL IdFuncType } <0,ppSC> [a-z][A-Za-z0-9_]* { mkL IdVar } <0,ppSC,cmtSC> [0-9]+[LU]* { mkL LitInteger } <0,ppSC,cmtSC> [0-9]+"."[0-9]+[Ff]? { mkL LitInteger } <0,ppSC> 0x[0-9a-fA-F]+[LU]* { mkL LitInteger } <0,ppSC,cmtSC> "=" { mkL PctEq } <0,ppSC,cmtSC> "==" { mkL PctEqEq } <0,ppSC> "&" { mkL PctAmpersand } <0,ppSC> "&&" { mkL PctAmpersandAmpersand } <0,ppSC> "&=" { mkL PctAmpersandEq } <0,ppSC> "->" { mkL PctArrow } <0,ppSC,cmtSC> "," { mkL PctComma } <0,ppSC,cmtSC> "+" { mkL PctPlus } <0,ppSC> "++" { mkL PctPlusPlus } <0,ppSC> "+=" { mkL PctPlusEq } <0,ppSC,cmtSC> "-" { mkL PctMinus } <0,ppSC> "--" { mkL PctMinusMinus } <0,ppSC> "-=" { mkL PctMinusEq } <0,ppSC> "~" { mkL PctTilde } <0,ppSC,cmtSC> "/" { mkL PctSlash } <0,ppSC> "/=" { mkL PctSlashEq } <0,ppSC,cmtSC> "." { mkL PctPeriod } <0,ppSC,cmtSC> "..." { mkL PctEllipsis } <0,ppSC> "%" { mkL PctPercent } <0,ppSC> "%=" { mkL PctPercentEq } <0,ppSC,cmtSC> ";" { mkL PctSemicolon } <0,ppSC,cmtSC> ":" { mkL PctColon } <0,ppSC,cmtSC> "<" { mkL PctLess } <0,ppSC> "<<" { mkL PctLessLess } <0,ppSC> "<<=" { mkL PctLessLessEq } <0,ppSC> "<=" { mkL PctLessEq } <0,ppSC,cmtSC> ">" { mkL PctGreater } <0,ppSC> ">>" { mkL PctGreaterGreater } <0,ppSC> ">>=" { mkL PctGreaterGreaterEq } <0,ppSC,cmtSC> ">=" { mkL PctGreaterEq } <0,ppSC> "|" { mkL PctPipe } <0,ppSC> "||" { mkL PctPipePipe } <0,ppSC> "|=" { mkL PctPipeEq } <0,ppSC> "[" { mkL PctLBrack } <0,ppSC> "]" { mkL PctRBrack } <0,ppSC> "{" { mkL PctLBrace } <0,ppSC> "}" { mkL PctRBrace } <0,ppSC,cmtSC> "(" { mkL PctLParen } <0,ppSC,cmtSC> ")" { mkL PctRParen } <0,ppSC,cmtSC> "?" { mkL PctQMark } <0,ppSC,cmtSC> "!" { mkL PctEMark } <0,ppSC,cmtSC> "!=" { mkL PctEMarkEq } <0,ppSC> "*" { mkL PctAsterisk } <0,ppSC> "*=" { mkL PctAsteriskEq } <0,ppSC> "^" { mkL PctCaret } <0,ppSC> "^=" { mkL PctCaretEq } -- Comments. "Copyright ©" { mkL CmtSpdxCopyright } "SPDX-License-Identifier:" { mkL CmtSpdxLicense } "GPL-3.0-or-later" { mkL CmtWord } "TODO("[^\)]+"):" { mkL CmtWord } [Ee]".g." { mkL CmtWord } "etc." { mkL CmtWord } [Ii]".e." { mkL CmtWord } [0-2][0-9](":"[0-5][0-9]){2}"."[0-9]{3} { mkL CmtWord } "v"?[0-9]+("."[0-9]+)+ { mkL CmtWord } [A-Z][A-Za-z]+"::"[a-z_]+ { mkL CmtWord } ([a-z]+"/")+[A-Za-z]+("."[a-z_]+)+ { mkL CmtWord } [a-z]+("."[a-z_]+)+ { mkL CmtWord } [a-z]+("-"[a-z_]+)+ { mkL CmtWord } "@code" { mkL CmtCode `andBegin` codeSC } "" { mkL CmtCode `andBegin` codeSC } "["[^\]]+"]" { mkL CmtAttr } "@retval" { mkL CmtCommand `andBegin` retvalSC } [@\\][a-z]+ { mkL CmtCommand } "*"[A-Za-z][A-Za-z0-9_']*"*" { mkL CmtWord } "#"[A-Za-z][A-Za-z0-9_]* { mkL CmtRef } "_"*[A-Za-z][A-Za-z0-9_']* { mkL CmtWord } "#"[0-9]+ { mkL CmtWord } "http://"[^\ ]+ { mkL CmtWord } [0-9]+"%" { mkL LitInteger } "-1" { mkL LitInteger } "`"([^`]|"\`")+"`" { mkL CmtCode } "${"([^\}])+"}" { mkL CmtCode } "-"+ { mkL CmtWord } [&\|]+ { mkL CmtWord } "–" { mkL CmtWord } "*/" { mkL CmtEnd `andBegin` 0 } \n { mkL PpNewline `andBegin` cmtNewlineSC } " "+ ; [^\ ]+ { mkL CmtWord `andBegin` cmtSC } " "+ ; " "+"*"+"/" { mkL CmtEnd `andBegin` 0 } " "+"*" { begin cmtStartSC } " "+ { mkL CmtIndent `andBegin` cmtSC } \n { mkL PpNewline `andBegin` cmtNewlineSC } " "+ { mkL CmtIndent `andBegin` codeSC } \n { mkL PpNewline `andBegin` codeNewlineSC } " "+"*" { begin codeStartSC } -- blocks in comments. "@endcode" { mkL CmtCode `andBegin` cmtSC } "" { mkL CmtCode `andBegin` cmtSC } \n { mkL PpNewline `andBegin` codeNewlineSC } [^@\<]+ { mkL CmtCode } -- Error handling. <0,ppSC,cmtSC,codeSC> . { mkL ErrorToken } { deriving instance Generic AlexPosn instance FromJSON AlexPosn instance ToJSON AlexPosn data Lexeme text = L AlexPosn LexemeClass text deriving (Eq, Show, Generic, Functor, Foldable, Traversable) instance FromJSON text => FromJSON (Lexeme text) instance ToJSON text => ToJSON (Lexeme text) mkL :: LexemeClass -> AlexInput -> Int64 -> Alex (Lexeme Text) mkL c (p, _, str, _) len = pure $ L p c (piece str) where piece = Text.decodeUtf8 . LBS.toStrict . LBS.take len mkE :: LexemeClass -> AlexInput -> Int64 -> Alex (Lexeme Text) mkE c (p, _, str, _) len = alexError $ ": " <> show (L p c (piece str)) where piece = Text.decodeUtf8 . LBS.toStrict . LBS.take len lexemePosn :: Lexeme text -> AlexPosn lexemePosn (L p _ _) = p lexemeClass :: Lexeme text -> LexemeClass lexemeClass (L _ c _) = c lexemeText :: Lexeme text -> text lexemeText (L _ _ s) = s lexemeLine :: Lexeme text -> Int lexemeLine (L (AlexPn _ l _) _ _) = l alexEOF :: Alex (Lexeme Text) alexEOF = return (L (AlexPn 0 0 0) Eof Text.empty) alexScanTokens :: LBS.ByteString -> Either String [Lexeme Text] alexScanTokens str = runAlex str $ loop [] where loop toks = do tok@(L _ cl _) <- alexMonadScan if cl == Eof then return $ reverse toks else loop $! (tok:toks) }