{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE PartialTypeSignatures #-} {-# LANGUAGE ViewPatterns #-} {-# LANGUAGE FlexibleContexts #-} -- | Skylighting code tokenizer module Token.Skylighting(lookupTokenizer, tokenizer) where import Control.Arrow(first) import Text.Pandoc.JSON () import Text.Pandoc.Definition () import Data.Maybe(listToMaybe, catMaybes) import Data.String (IsString) import Data.Text (Text) import qualified Data.Text as T import Prelude hiding(getLine) import Optics.Core import qualified Skylighting.Types as Sky import Skylighting.Types (TokenType(..), Syntax, SourceLine, Token) import qualified Skylighting.Syntax as Sky(defaultSyntaxMap) import qualified Skylighting.Tokenizer as Sky(tokenize, TokenizerConfig(..)) import qualified Skylighting.Core as Sky(lookupSyntax, syntaxByShortName) import Token ( MyLoc(MyLoc), MyTok(..), unTikzMark, mark ) rightToMaybe (Left err ) = Nothing rightToMaybe (Right result) = Just result lookupTokenizer :: [Text] -> Maybe Syntax lookupTokenizer = listToMaybe . catMaybes . fmap (Sky.syntaxByShortName Sky.defaultSyntaxMap) -- * Haskell tokenizer frontend -- | Attempt to tokenize input, -- returns `Nothing` if unsuccessful, -- so the processor can just pass input -- further when tokenizer fails. tokenizer :: Syntax -- Skylighting syntax description -> Text -- ^ Input text of code block -> Maybe [(MyTok, MyLoc, Text)] tokenizer syntax = fmap ( joinEscapedOperators . splitTokens . restoreLocations . recognizeTokens ) . rightToMaybe . Sky.tokenize tokenizerOpts syntax where tokenizerOpts = Sky.TokenizerConfig Sky.defaultSyntaxMap False -- | Recognize tokens from all source lines. recognizeTokens :: [SourceLine] -> [[(MyTok, Text)]] recognizeTokens = map $ map $ first skyTok -- | Convert token type of `ghc-lib` into tokens recognized by the filter. skyTok :: TokenType -> MyTok skyTok FloatTok = TNum skyTok DecValTok = TNum skyTok BaseNTok = TNum skyTok StringTok = TString skyTok CharTok = TString skyTok FunctionTok = TVar skyTok AttributeTok = TBlank skyTok VerbatimStringTok = TString skyTok SpecialStringTok = TCons skyTok ConstantTok = TCons skyTok KeywordTok = TKeyword skyTok BuiltInTok = TKeyword skyTok PreprocessorTok = TBlank skyTok CommentTok = TBlank skyTok DocumentationTok = TBlank skyTok CommentTok = TBlank skyTok OperatorTok = TOperator skyTok SpecialCharTok = TOperator skyTok RegionMarkerTok = TOperator skyTok AnnotationTok = TBlank skyTok ControlFlowTok = TKeyword skyTok VariableTok = TVar skyTok DataTypeTok = TCons skyTok other = TOther -- FIXME: generalize for GHC tokenizer and Skylighting -- | Restore locations -- TESTME: test -- 1. Without newlines should return a list of indices up to length -- 2. Of the same length as number of tokens -- 3. With newlines should return line indices up to number of lines. -- 4. Same for a list of lists of words without newlines joined as lines restoreLocations :: [[(MyTok, Text)]] -> [(MyTok, MyLoc, Text)] restoreLocations srcLines = concat $ zipWith (`go` 1) [1..] srcLines where go line col [] = [] go line col ((tok, txt):ls) = (tok, MyLoc line col (isMark tok), txt):go newLine newCol ls where isMark TBlank = False isMark _ = True newLine = line + lineIncr lineIncr = T.length $ T.filter (=='\n') txt newCol | lineIncr == 0 = col + T.length txt | otherwise = (+1) $ T.length $ fst $ T.break (=='\n') $ T.reverse txt -- * Likely common with other tokenizers -- | Split tokens into one blank per line. -- TESTME: assures that no token has '\n' before the end of text. splitTokens :: [(MyTok, MyLoc, Text)] -> [(MyTok, MyLoc, Text)] splitTokens = mconcat . fmap splitter where splitter :: (MyTok, MyLoc, Text) -> [(MyTok, MyLoc, Text)] splitter (TBlank, loc@(MyLoc line _ _), txt) | T.filter (=='\n') txt /= "" = withLocs withNewLines where split, withNewLines :: [Text] split = T.lines txt withNewLines = fmap (<>"\n") (init split) <> [last split] withLocs :: [Text] -> [(MyTok, MyLoc, Text)] withLocs (l:ls) = (TBlank, set mark True $ loc, l) : zipWith mkEntry [line+1..] ls mkEntry :: Int -> Text -> (MyTok, MyLoc, Text) mkEntry i t = (TBlank, MyLoc i 1 True, t) splitter other@(_, loc@(MyLoc line 1 x), txt) = [set (_2 % mark) True other] splitter other = [other] unmark :: Field2 a a MyLoc MyLoc => a -> a unmark = set (_2 % mark) False -- FIXME: use no-indent-mark instead. joinEscapedOperators :: (Eq c, IsString c, Semigroup c) => [(MyTok, MyLoc, c)] -> [(MyTok, MyLoc, c)] joinEscapedOperators (a@(_, _, "("):b@(_, _, _):c@(_, _, ")"):rest) = a:unmark b:unmark c:joinEscapedOperators rest joinEscapedOperators (a@(_, loc, "`"):b@(_, _, _):c@(_, _, "`"):rest) = a:unmark b:unmark c:joinEscapedOperators rest joinEscapedOperators (a@(_, _, "("):b@(TOperator, _, _):rest) = a:unmark b:joinEscapedOperators rest joinEscapedOperators (a@(_, _, _):b@(_, _, ")"):rest) = a:unmark b:joinEscapedOperators rest joinEscapedOperators (tok:rest) = tok:joinEscapedOperators rest joinEscapedOperators [] = [] {- -- | Recognize token using both token type from `ghc-lib`, -- and text content. -- Only TikZ marks are recognized by looking up text content. recognizeToken :: [Token] -> (MyTok, Text) recognizeToken (CommentTok, tokText@(unTikzMark -> Just mark)) = (TTikz mark, tokText) recognizeToken (tokType, tokText) = (skyTok tokType, tokText) -}