{-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE TupleSections #-} {-# OPTIONS_GHC -fno-warn-missing-fields #-} -- | -- Module : GHC.SyntaxHighlighter -- Copyright : © 2018–present Mark Karpov -- License : BSD 3 clause -- -- Maintainer : Mark Karpov -- Stability : experimental -- Portability : portable -- -- The module allows you to decompose a 'Text' stream containing Haskell -- source code into a stream of 'Text' chunks tagged with 'Token'. -- -- This library uses the GHC's lexer, so the result is guaranteed to be 100% -- correct, as if it were parsed by GHC itself. module GHC.SyntaxHighlighter ( Token (..), Loc (..), tokenizeHaskell, tokenizeHaskellLoc, ) where import Control.Monad import Data.List (unfoldr) import Data.Maybe (isJust) import Data.Text (Text) import qualified Data.Text as T import qualified GHC.Data.EnumSet as ES import GHC.Data.FastString (mkFastString) import GHC.Data.StringBuffer import GHC.LanguageExtensions import qualified GHC.Parser.Lexer as L import GHC.Types.SrcLoc import GHC.Utils.Error (DiagOpts (..)) import GHC.Utils.Outputable (defaultSDocContext) ---------------------------------------------------------------------------- -- Data types -- | Token types that are used as tags to mark spans of source code. data Token = -- | Keyword KeywordTok | -- | Pragmas PragmaTok | -- | Symbols (punctuation that is not an operator) SymbolTok | -- | Variable name (term level) VariableTok | -- | Data\/type constructor ConstructorTok | -- | Operator OperatorTok | -- | Character CharTok | -- | String StringTok | -- | Integer IntegerTok | -- | Rational number RationalTok | -- | Comment (including Haddocks) CommentTok | -- | Space filling SpaceTok | -- | Something else? OtherTok deriving (Eq, Ord, Enum, Bounded, Show) -- | The start and end positions of a span. The arguments of the data -- constructor contain in order: -- -- * Line number of start position of a span -- * Column number of start position of a span -- * Line number of end position of a span -- * Column number of end position of a span -- -- @since 0.0.2.0 data Loc = Loc !Int !Int !Int !Int deriving (Eq, Ord, Show) ---------------------------------------------------------------------------- -- High-level API -- | Tokenize Haskell source code. If the code cannot be parsed, return -- 'Nothing'. Otherwise return the original input tagged by 'Token's. -- 'Nothing' is rarely returned, if ever, because it looks like the lexer is -- capable of interpreting almost any text as a stream of GHC tokens. -- -- The parser does not require the input source code to form a valid Haskell -- program, so as long as the lexer can decompose your input (most of the -- time), it'll return something in 'Just'. tokenizeHaskell :: Text -> Maybe [(Token, Text)] tokenizeHaskell input = sliceInputStream input <$> tokenizeHaskellLoc input -- | Replace 'Loc' locations with actual chunks of input 'Text'. sliceInputStream :: Text -> [(Token, Loc)] -> [(Token, Text)] sliceInputStream input toks = unfoldr sliceOnce (initText' input, toks) where sliceOnce (txt, []) = do (txt', chunk) <- tryFetchRest txt return ((SpaceTok, chunk), (txt', [])) sliceOnce (txt, tss@((t, l) : ts)) = case tryFetchSpace txt l of Nothing -> let (txt', chunk) = fetchSpan txt l t' = case t of CommentTok -> if isHeaderPragma chunk then PragmaTok else CommentTok tok -> tok in Just ((t', chunk), (txt', ts)) Just (txt', chunk) -> Just ((SpaceTok, chunk), (txt', tss)) -- | Similar to 'tokenizeHaskell', but instead of 'Text' chunks provides -- locations of corresponding spans in the given input stream. -- -- @since 0.0.2.0 tokenizeHaskellLoc :: Text -> Maybe [(Token, Loc)] tokenizeHaskellLoc input = case L.unP pLexer parseState of L.PFailed {} -> Nothing L.POk _ x -> Just x where location = mkRealSrcLoc (mkFastString "") 1 1 buffer = stringToStringBuffer (T.unpack input) parseState = L.initParserState parserOpts buffer location parserOpts = L.mkParserOpts (ES.fromList enabledExts) diagOpts [] True -- safe imports True -- keep Haddock tokens True -- keep comment tokens False -- lex LINE and COLUMN pragmas diagOpts = DiagOpts { diag_warning_flags = ES.empty, diag_fatal_warning_flags = ES.empty, diag_warn_is_error = False, diag_reverse_errors = False, diag_max_errors = Nothing, diag_ppr_ctx = defaultSDocContext } -- | The Haskell lexer. pLexer :: L.P [(Token, Loc)] pLexer = go where go = do r <- L.lexer False return case r of L _ L.ITeof -> return [] _ -> case fixupToken r of Nothing -> go Just x -> (x :) <$> go -- | Convert @'Located' 'L.Token'@ representation into a more convenient for -- us form. fixupToken :: Located L.Token -> Maybe (Token, Loc) fixupToken (L srcSpan tok) = (classifyToken tok,) <$> srcSpanToLoc srcSpan -- | Convert 'SrcSpan' into 'Loc'. srcSpanToLoc :: SrcSpan -> Maybe Loc srcSpanToLoc (RealSrcSpan s _) = let srcSpanSLine = srcSpanStartLine s srcSpanSCol = srcSpanStartCol s srcSpanELine = srcSpanEndLine s srcSpanECol = srcSpanEndCol s start = (srcSpanSLine, srcSpanSCol) end = (srcSpanELine, srcSpanECol) in if start == end then Nothing -- NOTE Some magic auto-generated tokens that do not -- actually appear in the input stream. Drop them. else Just $ Loc srcSpanSLine srcSpanSCol srcSpanELine srcSpanECol srcSpanToLoc _ = Nothing -- | Classify a 'L.Token' in terms of 'Token'. classifyToken :: L.Token -> Token classifyToken = \case -- Keywords L.ITas -> KeywordTok L.ITcase -> KeywordTok L.ITlcases -> KeywordTok L.ITclass -> KeywordTok L.ITdata -> KeywordTok L.ITdefault -> KeywordTok L.ITderiving -> KeywordTok L.ITdo _ -> KeywordTok L.ITelse -> KeywordTok L.IThiding -> KeywordTok L.ITforeign -> KeywordTok L.ITif -> KeywordTok L.ITimport -> KeywordTok L.ITin -> KeywordTok L.ITinfix -> KeywordTok L.ITinfixl -> KeywordTok L.ITinfixr -> KeywordTok L.ITinstance -> KeywordTok L.ITlet -> KeywordTok L.ITmodule -> KeywordTok L.ITnewtype -> KeywordTok L.ITof -> KeywordTok L.ITqualified -> KeywordTok L.ITthen -> KeywordTok L.ITtype -> KeywordTok L.ITwhere -> KeywordTok L.ITforall _ -> KeywordTok L.ITexport -> KeywordTok L.ITlabel -> KeywordTok L.ITdynamic -> KeywordTok L.ITsafe -> KeywordTok L.ITinterruptible -> KeywordTok L.ITunsafe -> KeywordTok L.ITstdcallconv -> KeywordTok L.ITccallconv -> KeywordTok L.ITcapiconv -> KeywordTok L.ITprimcallconv -> KeywordTok L.ITjavascriptcallconv -> KeywordTok L.ITmdo _ -> KeywordTok L.ITfamily -> KeywordTok L.ITrole -> KeywordTok L.ITgroup -> KeywordTok L.ITby -> KeywordTok L.ITusing -> KeywordTok L.ITpattern -> KeywordTok L.ITstatic -> KeywordTok L.ITstock -> KeywordTok L.ITanyclass -> KeywordTok L.ITvia -> KeywordTok L.ITunit -> KeywordTok L.ITsignature -> KeywordTok L.ITdependency -> KeywordTok L.ITrequires -> KeywordTok -- Pragmas L.ITinline_prag {} -> PragmaTok L.ITspec_prag _ -> PragmaTok L.ITspec_inline_prag {} -> PragmaTok L.ITsource_prag _ -> PragmaTok L.ITrules_prag _ -> PragmaTok L.ITwarning_prag _ -> PragmaTok L.ITdeprecated_prag _ -> PragmaTok L.ITline_prag _ -> PragmaTok L.ITcolumn_prag _ -> PragmaTok L.ITscc_prag _ -> PragmaTok L.ITunpack_prag _ -> PragmaTok L.ITnounpack_prag _ -> PragmaTok L.ITann_prag _ -> PragmaTok L.ITcomplete_prag _ -> PragmaTok L.ITclose_prag -> PragmaTok L.IToptions_prag _ -> PragmaTok L.ITinclude_prag _ -> PragmaTok L.ITlanguage_prag -> PragmaTok L.ITminimal_prag _ -> PragmaTok L.IToverlappable_prag _ -> PragmaTok L.IToverlapping_prag _ -> PragmaTok L.IToverlaps_prag _ -> PragmaTok L.ITincoherent_prag _ -> PragmaTok L.ITctype _ -> PragmaTok L.ITcomment_line_prag -> PragmaTok L.ITopaque_prag _ -> PragmaTok -- Reserved symbols L.ITdotdot -> SymbolTok L.ITcolon -> SymbolTok L.ITdcolon _ -> SymbolTok L.ITequal -> SymbolTok L.ITlam -> SymbolTok L.ITlcase -> SymbolTok L.ITvbar -> SymbolTok L.ITlarrow _ -> SymbolTok L.ITrarrow _ -> SymbolTok L.ITlolly -> SymbolTok L.ITat -> SymbolTok L.ITtilde -> SymbolTok L.ITdarrow _ -> SymbolTok L.ITbang -> SymbolTok L.ITstar _ -> SymbolTok L.ITbiglam -> SymbolTok L.ITocurly -> SymbolTok L.ITccurly -> SymbolTok L.ITvocurly -> SymbolTok L.ITvccurly -> SymbolTok L.ITobrack -> SymbolTok L.ITopabrack -> SymbolTok L.ITcpabrack -> SymbolTok L.ITcbrack -> SymbolTok L.IToparen -> SymbolTok L.ITcparen -> SymbolTok L.IToubxparen -> SymbolTok L.ITcubxparen -> SymbolTok L.ITsemi -> SymbolTok L.ITcomma -> SymbolTok L.ITunderscore -> SymbolTok L.ITbackquote -> SymbolTok L.ITsimpleQuote -> SymbolTok L.ITpercent -> SymbolTok L.ITproj _ -> SymbolTok -- NOTE GHC thinks these are reserved symbols, but I classify them as -- operators. L.ITminus -> OperatorTok L.ITprefixminus -> OperatorTok L.ITdot -> OperatorTok -- Identifiers L.ITvarid _ -> VariableTok L.ITconid _ -> ConstructorTok L.ITvarsym _ -> OperatorTok L.ITconsym _ -> OperatorTok L.ITqvarid _ -> VariableTok L.ITqconid _ -> ConstructorTok L.ITqvarsym _ -> OperatorTok L.ITqconsym _ -> OperatorTok L.ITdupipvarid _ -> VariableTok L.ITlabelvarid _ -> VariableTok -- Basic types L.ITchar _ _ -> CharTok L.ITstring _ _ -> StringTok L.ITinteger _ -> IntegerTok L.ITrational _ -> RationalTok L.ITprimchar _ _ -> CharTok L.ITprimstring _ _ -> StringTok L.ITprimint _ _ -> IntegerTok L.ITprimword _ _ -> IntegerTok L.ITprimfloat _ -> RationalTok L.ITprimdouble _ -> RationalTok -- Template Haskell extension tokens L.ITopenExpQuote _ _ -> SymbolTok L.ITopenPatQuote -> SymbolTok L.ITopenDecQuote -> SymbolTok L.ITopenTypQuote -> SymbolTok L.ITcloseQuote _ -> SymbolTok L.ITopenTExpQuote _ -> SymbolTok L.ITcloseTExpQuote -> SymbolTok L.ITtyQuote -> SymbolTok L.ITquasiQuote _ -> SymbolTok L.ITqQuasiQuote _ -> SymbolTok L.ITdollar -> SymbolTok L.ITdollardollar -> SymbolTok -- Arrow notation L.ITproc -> KeywordTok L.ITrec -> KeywordTok L.IToparenbar _ -> SymbolTok L.ITcparenbar _ -> SymbolTok L.ITlarrowtail _ -> SymbolTok L.ITrarrowtail _ -> SymbolTok L.ITLarrowtail _ -> SymbolTok L.ITRarrowtail _ -> SymbolTok -- Type application L.ITtypeApp -> SymbolTok -- Special L.ITunknown _ -> OtherTok L.ITeof -> OtherTok -- normally is not included in results -- Documentation annotations L.ITdocComment {} -> CommentTok L.ITdocOptions {} -> CommentTok L.ITlineComment {} -> CommentTok L.ITblockComment {} -> CommentTok ---------------------------------------------------------------------------- -- Text traversing -- | A type for 'Text' with line\/column location attached. data Text' = Text' {-# UNPACK #-} !Int {-# UNPACK #-} !Int {-# UNPACK #-} !Text deriving (Show) -- | Create 'Text'' from 'Text'. initText' :: Text -> Text' initText' = Text' 1 1 -- | Try to fetch white space before start of span at 'Loc'. tryFetchSpace :: Text' -> Loc -> Maybe (Text', Text) tryFetchSpace txt (Loc sl sc _ _) = let (txt', r) = reachLoc txt sl sc in if T.null r then Nothing else Just (txt', r) -- | Try to fetch the rest of 'Text'' stream. tryFetchRest :: Text' -> Maybe (Text', Text) tryFetchRest (Text' l c txt) = if T.null txt then Nothing else Just (Text' l c "", txt) -- | Fetch a span at 'Loc'. fetchSpan :: Text' -> Loc -> (Text', Text) fetchSpan txt (Loc _ _ el ec) = reachLoc txt el ec -- | Reach given line\/column location and return 'Text' that has been -- traversed. reachLoc :: Text' -> -- | Line number to reach Int -> -- | Column number to reach Int -> (Text', Text) reachLoc txt@(Text' _ _ original) l c = let chunk = T.unfoldr f txt f (Text' l' c' s) = do guard (l' < l || c' < c) (ch, s') <- T.uncons s let (l'', c'') = case ch of '\n' -> (l' + 1, 1) '\t' -> (l', c' + 8 - ((c' - 1) `rem` 8)) _ -> (l', c' + 1) return (ch, Text' l'' c'' s') in (Text' l c (T.drop (T.length chunk) original), chunk) ---------------------------------------------------------------------------- -- Pragmas detection -- | Detect file header pragma. isHeaderPragma :: Text -> Bool isHeaderPragma txt0 = isJust $ do txt1 <- T.stripStart <$> T.stripPrefix "{-#" txt0 guard (T.isPrefixOf "LANGUAGE" txt1 || T.isPrefixOf "OPTIONS_GHC" txt1) ---------------------------------------------------------------------------- -- Language extensions -- | Language extensions we enable by default. enabledExts :: [Extension] enabledExts = [ ForeignFunctionInterface, InterruptibleFFI, CApiFFI, Arrows, TemplateHaskell, TemplateHaskellQuotes, ImplicitParams, OverloadedLabels, ExplicitForAll, BangPatterns, PatternSynonyms, MagicHash, RecursiveDo, UnicodeSyntax, UnboxedTuples, UnboxedSums, DatatypeContexts, TransformListComp, QuasiQuotes, LambdaCase, BinaryLiterals, NegativeLiterals, HexFloatLiterals, TypeApplications, StaticPointers, NumericUnderscores, StarIsType ]