{-# LANGUAGE DerivingVia #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TypeFamilies #-} -- | -- SPDX-License-Identifier: BSD-3-Clause -- -- Simple Markdown AST and related utilities. -- -- Parametrising 'Document' with the type of -- inline code and code blocks allows us to -- inspect and validate Swarm code in descriptions. -- -- See 'Swarm.TUI.View.Util.drawMarkdown' for -- rendering the descriptions as brick widgets. module Swarm.Language.Text.Markdown ( -- ** Markdown document Document (..), Paragraph (..), Node (..), TxtAttr (..), fromTextM, fromText, docToText, docToMark, -- ** Token stream StreamNode' (..), StreamNode, ToStream (..), toText, -- ** Utilities findCode, chunksOf, ) where import Commonmark qualified as Mark import Commonmark.Extensions qualified as Mark (rawAttributeSpec) import Control.Applicative ((<|>)) import Control.Arrow (left) import Control.Lens ((%~), (&), _head, _last) import Data.Char (isSpace) import Data.Functor.Identity (Identity (..)) import Data.List.Split (chop) import Data.Maybe (catMaybes) import Data.Set (Set) import Data.Set qualified as Set import Data.Text (Text) import Data.Text qualified as T import Data.Tuple.Extra (both, first) import Data.Vector (toList) import Data.Yaml import GHC.Exts qualified (IsList (..), IsString (..)) import Swarm.Language.Parse (readTerm) import Swarm.Language.Pipeline (processParsedTerm) import Swarm.Language.Pretty (PrettyPrec (..), prettyText, prettyTextLine, prettyTypeErrText) import Swarm.Language.Syntax (Syntax) -- | The top-level markdown document. newtype Document c = Document {paragraphs :: [Paragraph c]} deriving (Eq, Show, Functor, Foldable, Traversable) deriving (Semigroup, Monoid) via [Paragraph c] -- | Markdown paragraphs that contain inline leaf nodes. -- -- The idea is that paragraphs do not have line breaks, -- and so the inline elements follow each other. -- In particular inline code can be followed by text without -- space between them (e.g. @\`logger\`s@). newtype Paragraph c = Paragraph {nodes :: [Node c]} deriving (Eq, Show, Functor, Foldable, Traversable) deriving (Semigroup, Monoid) via [Node c] mapP :: (Node c -> Node c) -> Paragraph c -> Paragraph c mapP f (Paragraph ns) = Paragraph (map f ns) pureP :: Node c -> Paragraph c pureP = Paragraph . (: []) -- | Inline leaf nodes. -- -- The raw node is from the raw_annotation extension, -- and can be used for types/entities/invalid code. data Node c = LeafText (Set TxtAttr) Text | LeafRaw String Text | LeafCode c | LeafCodeBlock String c deriving (Eq, Show, Functor, Foldable, Traversable) txt :: Text -> Node c txt = LeafText mempty addTextAttribute :: TxtAttr -> Node c -> Node c addTextAttribute a (LeafText as t) = LeafText (Set.insert a as) t addTextAttribute _ n = n normalise :: (Eq c, Semigroup c) => Paragraph c -> Paragraph c normalise (Paragraph a) = Paragraph $ go a where go = \case [] -> [] (n : ns) -> let (n', ns') = mergeSame n ns in n' : go ns' mergeSame = \case l@(LeafText attrs1 t1) -> \case (LeafText attrs2 t2 : rss) | attrs1 == attrs2 -> mergeSame (LeafText attrs1 $ t1 <> t2) rss rs -> (l, rs) l -> (l,) -- | Simple text attributes that make it easier to find key info in descriptions. data TxtAttr = Strong | Emphasis deriving (Eq, Show, Ord) instance Mark.Rangeable (Paragraph c) where ranged _ = id instance Mark.HasAttributes (Paragraph c) where addAttributes _ = id instance Mark.Rangeable (Document c) where ranged _ = id instance Mark.HasAttributes (Document c) where addAttributes _ = id instance GHC.Exts.IsList (Document a) where type Item (Document a) = Paragraph a toList = paragraphs fromList = Document instance GHC.Exts.IsString (Document Syntax) where fromString = fromText . T.pack instance GHC.Exts.IsString (Paragraph Syntax) where fromString s = case paragraphs $ GHC.Exts.fromString s of [] -> mempty [p] -> p ps -> error $ "Error: expected one paragraph, but found " <> show (length ps) -- | Surround some text in double quotes if it is not empty. quoteMaybe :: Text -> Text quoteMaybe t = if T.null t then t else T.concat ["\"", t, "\""] instance Mark.IsInline (Paragraph Text) where lineBreak = pureP $ txt "\n" softBreak = pureP $ txt " " str = pureP . txt entity = Mark.str escapedChar c = Mark.str $ T.pack ['\\', c] emph = mapP $ addTextAttribute Emphasis strong = mapP $ addTextAttribute Strong link dest title desc = pureP (txt "[") <> desc <> pureP (txt $ "](" <> dest <> quoteMaybe title <> ")") image dest title desc = pureP (txt "!") <> Mark.link dest title desc code = pureP . LeafCode rawInline (Mark.Format f) = pureP . LeafRaw (T.unpack f) instance Mark.IsBlock (Paragraph Text) (Document Text) where paragraph = Document . (: []) plain = Mark.paragraph thematicBreak = mempty blockQuote (Document ns) = Document $ map Mark.emph ns codeBlock f = Mark.plain . pureP . LeafCodeBlock (T.unpack f) heading _lvl = Mark.plain . Mark.strong rawBlock (Mark.Format f) t = error . T.unpack $ "Unsupported raw " <> f <> " block:\n" <> t referenceLinkDefinition = mempty list _type _spacing = mconcat parseSyntax :: Text -> Either String Syntax parseSyntax t = case readTerm t of Left e -> Left (T.unpack e) Right Nothing -> Left "empty code" Right (Just s) -> case processParsedTerm s of -- Just run the typechecker etc. to make sure the term typechecks Left e -> Left (T.unpack $ prettyTypeErrText t e) -- ...but if it does, we just go back to using the original parsed -- (*unelaborated*) AST. See #1496. Right _ -> Right s findCode :: Document Syntax -> [Syntax] findCode = catMaybes . concatMap (map codeOnly . nodes) . paragraphs where codeOnly = \case LeafCode s -> Just s LeafCodeBlock _i s -> Just s _l -> Nothing instance ToJSON (Paragraph Syntax) where toJSON = String . toText instance ToJSON (Document Syntax) where toJSON = String . docToMark instance FromJSON (Document Syntax) where parseJSON v = parseDoc v <|> parsePars v where parseDoc = withText "markdown" fromTextM parsePars = withArray "markdown paragraphs" $ \a -> do (ts :: [Text]) <- mapM parseJSON $ toList a fromTextM $ T.intercalate "\n\n" ts -- | Parse Markdown document, but throw on invalid code. fromText :: Text -> Document Syntax fromText = either error id . fromTextE -- | Read Markdown document and parse&validate the code. -- -- If you want only the document with code as `Text`, -- use the 'fromTextPure' function. fromTextM :: MonadFail m => Text -> m (Document Syntax) fromTextM = either fail pure . fromTextE fromTextE :: Text -> Either String (Document Syntax) fromTextE t = fromTextPure t >>= traverse parseSyntax -- | Read Markdown document without code validation. fromTextPure :: Text -> Either String (Document Text) fromTextPure t = do let spec = Mark.rawAttributeSpec <> Mark.defaultSyntaxSpec <> Mark.rawAttributeSpec let runSimple = left show . runIdentity Document tokenizedDoc <- runSimple $ Mark.commonmarkWith spec "markdown" t return . Document $ normalise <$> tokenizedDoc -------------------------------------------------------------- -- DIY STREAM -------------------------------------------------------------- -- | Convert 'Document' to 'Text'. -- -- Note that this will strip some markdown, emphasis and bold marks. -- If you want to get markdown again, use 'docToMark'. docToText :: PrettyPrec a => Document a -> Text docToText = T.intercalate "\n\n" . map toText . paragraphs -- | This is the naive and easy way to get text from markdown document. toText :: ToStream a => a -> Text toText = streamToText . toStream -- | Token stream that can be easily converted to text or brick widgets. -- -- TODO: #574 Code blocks should probably be handled separately. data StreamNode' t = TextNode (Set TxtAttr) t | CodeNode t | RawNode String t deriving (Eq, Show, Functor) type StreamNode = StreamNode' Text unStream :: StreamNode' t -> (t -> StreamNode' t, t) unStream = \case TextNode a t -> (TextNode a, t) CodeNode t -> (CodeNode, t) RawNode a t -> (RawNode a, t) -- | Get chunks of nodes not exceeding length and broken at word boundary. chunksOf :: Int -> [StreamNode] -> [[StreamNode]] chunksOf n = chop (splitter True n) where nodeLength :: StreamNode -> Int nodeLength = T.length . snd . unStream splitter :: Bool -> Int -> [StreamNode] -> ([StreamNode], [StreamNode]) splitter start i = \case [] -> ([], []) (tn : ss) -> let l = nodeLength tn in if l <= i then first (tn :) $ splitter False (i - l) ss else let (tn1, tn2) = cut start i tn in ([tn1], tn2 : ss) cut :: Bool -> Int -> StreamNode -> (StreamNode, StreamNode) cut start i tn = let (con, t) = unStream tn endSpace = T.takeWhileEnd isSpace t startSpace = T.takeWhile isSpace t twords = T.words t & _head %~ (startSpace <>) & _last %~ (<> endSpace) in case splitWordsAt i twords of ([], []) -> (con "", con "") ([], ws@(ww : wws)) -> both (con . T.unwords) $ -- In case single word (e.g. web link) does not fit on line we must put -- it there and guarantee progress (otherwise chop will cycle) if start then ([T.take i ww], T.drop i ww : wws) else ([], ws) splitted -> both (con . T.unwords) splitted splitWordsAt :: Int -> [Text] -> ([Text], [Text]) splitWordsAt i = \case [] -> ([], []) (w : ws) -> let l = T.length w in if l < i then first (w :) $ splitWordsAt (i - l - 1) ws else ([], w : ws) streamToText :: [StreamNode] -> Text streamToText = T.concat . map nodeToText where nodeToText = \case TextNode _a t -> t RawNode _s t -> t CodeNode stx -> stx -- | Convert elements to one dimensional stream of nodes, -- that is easy to format and layout. -- -- If you want to split the stream at line length, use -- the 'chunksOf' function afterward. class ToStream a where toStream :: a -> [StreamNode] instance PrettyPrec a => ToStream (Node a) where toStream = \case LeafText a t -> [TextNode a t] LeafCode t -> [CodeNode (prettyTextLine t)] LeafRaw s t -> [RawNode s t] LeafCodeBlock _i t -> [CodeNode (prettyText t)] instance PrettyPrec a => ToStream (Paragraph a) where toStream = concatMap toStream . nodes -------------------------------------------------------------- -- Markdown -------------------------------------------------------------- nodeToMark :: PrettyPrec a => Node a -> Text nodeToMark = \case LeafText a t -> foldl attr t a LeafRaw _ c -> wrap "`" c LeafCode c -> wrap "`" (prettyText c) LeafCodeBlock f c -> codeBlock f $ prettyText c where codeBlock f t = wrap "```" $ T.pack f <> "\n" <> t <> "\n" wrap c t = c <> t <> c attr t a = case a of Emphasis -> wrap "_" t Strong -> wrap "**" t paragraphToMark :: PrettyPrec a => Paragraph a -> Text paragraphToMark = foldMap nodeToMark . nodes -- | Convert 'Document' to markdown text. docToMark :: PrettyPrec a => Document a -> Text docToMark = T.intercalate "\n\n" . map paragraphToMark . paragraphs