{-# LANGUAGE BangPatterns #-} {-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DerivingStrategies #-} -- | -- Module : Data.Org -- Copyright : (c) Colin Woodbury, 2020 -- License : BSD3 -- Maintainer: Colin Woodbury -- -- This library parses text in the format. -- -- Use the `org` function to parse a `T.Text` value. module Data.Org ( -- * Types OrgFile(..) , emptyOrgFile , OrgDoc(..) , emptyDoc , Section(..) , Block(..) , Words(..) , ListItems(..) , Item(..) , Row(..) , Column(..) , URL(..) , Language(..) -- * Parsing , org -- ** Internal Parsers -- | These are exposed for testing purposes. , orgFile , meta , orgP , section , paragraph , table , list , line -- * Pretty Printing , prettyOrgFile , prettyOrg , prettyWords ) where import Control.Applicative.Combinators.NonEmpty import Control.Monad (void, when) import Data.Bool (bool) import Data.Functor (($>)) import Data.Hashable (Hashable(..)) import Data.List.NonEmpty (NonEmpty(..)) import qualified Data.List.NonEmpty as NEL import qualified Data.Map.Strict as M import Data.Semigroup (sconcat) import Data.Text (Text) import qualified Data.Text as T import Data.Void (Void) import GHC.Generics (Generic) import System.FilePath (takeExtension) import Text.Megaparsec hiding (sepBy1, sepEndBy1, some, someTill) import Text.Megaparsec.Char import qualified Text.Megaparsec.Char.Lexer as L -------------------------------------------------------------------------------- -- Types -- | A complete @.org@ file with metadata. data OrgFile = OrgFile { orgMeta :: M.Map Text Text -- ^ Top-level fields like: -- -- @ -- #+TITLE: Curing Cancer with Haskell -- #+DATE: 2020-02-25 -- #+AUTHOR: Colin -- @ , orgDoc :: OrgDoc } deriving stock (Eq, Show, Generic) emptyOrgFile :: OrgFile emptyOrgFile = OrgFile mempty emptyDoc -- | A recursive Org document. These are zero or more blocks of markup, followed -- by zero or more subsections. -- -- @ -- This is some top-level text. -- -- * Important heading -- -- ** Less important subheading -- @ data OrgDoc = OrgDoc { docBlocks :: [Block] , docSections :: [Section] } deriving stock (Eq, Show, Generic) deriving anyclass (Hashable) emptyDoc :: OrgDoc emptyDoc = OrgDoc [] [] -- | Some logically distinct block of Org content. data Block = Quote Text | Example Text | Code (Maybe Language) Text | List ListItems | Table (NonEmpty Row) | Paragraph (NonEmpty Words) deriving stock (Eq, Show, Generic) deriving anyclass (Hashable) -- | A subsection, marked by a heading line and followed recursively by an -- `OrgDoc`. -- -- @ -- * This is a Heading -- -- This is content in the sub ~OrgDoc~. -- @ data Section = Section { sectionHeading :: NonEmpty Words , sectionTags :: [Text] , sectionDoc :: OrgDoc } deriving stock (Eq, Show, Generic) deriving anyclass (Hashable) -- | An org list constructed of @-@ characters. -- -- @ -- - Feed the cat -- - The good stuff -- - Feed the dog -- - He'll eat anything -- - Feed the bird -- - Feed the alligator -- - Feed the elephant -- @ newtype ListItems = ListItems (NonEmpty Item) deriving stock (Eq, Show, Generic) deriving anyclass (Hashable) -- | A line in a bullet-list. Can contain sublists, as shown in `ListItems`. data Item = Item (NonEmpty Words) (Maybe ListItems) deriving stock (Eq, Show, Generic) deriving anyclass (Hashable) -- | A row in an org table. Can have content or be a horizontal rule. -- -- @ -- | A | B | C | -- |---+---+---| -- | D | E | F | -- @ data Row = Break | Row (NonEmpty Column) deriving stock (Eq, Show, Generic) deriving anyclass (Hashable) -- | A possibly empty column in an org table. data Column = Empty | Column (NonEmpty Words) deriving stock (Eq, Show, Generic) deriving anyclass (Hashable) -- | The fundamental unit of Org text content. `Plain` units are split -- word-by-word. data Words = Bold Text | Italic Text | Highlight Text | Underline Text | Verbatim Text | Strike Text | Link URL (Maybe Text) | Image URL | Tags (NonEmpty Text) | Punct Char | Plain Text deriving stock (Eq, Show, Generic) deriving anyclass (Hashable) -- | The url portion of a link. newtype URL = URL Text deriving stock (Eq, Show, Generic) deriving anyclass (Hashable) -- | The programming language some source code block was written in. newtype Language = Language Text deriving stock (Eq, Show, Generic) deriving anyclass (Hashable) -------------------------------------------------------------------------------- -- Parser -- | Attempt to parse an `OrgFile`. org :: Text -> Maybe OrgFile org = parseMaybe orgFile type Parser = Parsec Void Text orgFile :: Parser OrgFile orgFile = space *> L.lexeme space (OrgFile <$> meta <*> orgP) <* eof meta :: Parser (M.Map Text Text) meta = L.lexeme space $ M.fromList <$> keyword `sepEndBy` newline where keyword :: Parser (Text, Text) keyword = do void $ string "#+" key <- someTill' ':' void $ string ": " val <- someTillEnd pure (key, val) orgP :: Parser OrgDoc orgP = orgP' 1 orgP' :: Int -> Parser OrgDoc orgP' depth = L.lexeme space $ OrgDoc <$> many block <*> many (try $ section depth) where block :: Parser Block block = choice [ try code , try example , try quote , try list , try table , paragraph ] -- TODO Paragraph needs to fail if it detects a heading. -- | If a line stars with @*@ and a space, it is a `Section` heading. heading :: Parser (T.Text, NonEmpty Words, [Text]) heading = do stars <- someOf '*' <* char ' ' ws <- line '\n' case nelUnsnoc ws of (Tags ts, Just rest) -> pure (stars, rest, NEL.toList ts) _ -> pure (stars, ws, []) section :: Int -> Parser Section section depth = L.lexeme space $ do (stars, ws, ts) <- heading -- Fail if we've found a parent heading -- when (T.length stars < depth) $ failure Nothing mempty -- Otherwise continue -- void space Section ws ts <$> orgP' (succ depth) quote :: Parser Block quote = L.lexeme space $ do void top <* newline ls <- manyTill (manyTillEnd <* newline) bot pure . Quote $ T.intercalate "\n" ls where top = string "#+" *> (string "BEGIN_QUOTE" <|> string "begin_quote") bot = string "#+" *> (string "END_QUOTE" <|> string "end_quote") example :: Parser Block example = L.lexeme space $ do void top <* newline ls <- manyTill (manyTillEnd <* newline) bot pure . Example $ T.intercalate "\n" ls where top = string "#+" *> (string "BEGIN_EXAMPLE" <|> string "begin_example") bot = string "#+" *> (string "END_EXAMPLE" <|> string "end_example") code :: Parser Block code = L.lexeme space $ do lang <- top *> optional lng <* newline ls <- manyTill (manyTillEnd <* newline) bot pure . Code (Language <$> lang) $ T.intercalate "\n" ls where top = string "#+" *> (string "BEGIN_SRC" <|> string "begin_src") bot = string "#+" *> (string "END_SRC" <|> string "end_src") lng = char ' ' *> someTillEnd list :: Parser Block list = L.lexeme space $ List <$> listItems 0 listItems :: Int -> Parser ListItems listItems indent = ListItems <$> sepBy1 (item indent) (try $ newline *> lookAhead (nextItem indent)) nextItem :: Int -> Parser () nextItem indent = do void . string $ T.replicate indent " " void $ string "- " -- | Conditions for ending the current bullet: -- -- 1. You find two '\n' at the end of a line. -- 2. The first two non-space characters of the next line are "- ". item :: Int -> Parser Item item indent = do leading <- string $ T.replicate indent " " void $ string "- " l <- bullet let !nextInd = T.length leading + 2 Item l <$> optional (try $ newline *> listItems nextInd) where bullet :: Parser (NonEmpty Words) bullet = do l <- line '\n' try (lookAhead keepGoing *> space *> ((l <>) <$> bullet)) <|> pure l keepGoing :: Parser () keepGoing = void $ char '\n' *> manyOf ' ' *> noneOf ['-', '\n'] table :: Parser Block table = L.lexeme space $ Table <$> sepEndBy1 row (char '\n') where row :: Parser Row row = do void $ char '|' brk <|> (Row <$> sepEndBy1 column (char '|')) -- | If the line starts with @|-@, assume its a break regardless of what -- chars come after that. brk :: Parser Row brk = char '-' *> manyTillEnd $> Break column :: Parser Column column = do void $ someOf ' ' (lookAhead (char '|') $> Empty) <|> (Column <$> line '|') paragraph :: Parser Block paragraph = L.lexeme space $ do notFollowedBy heading Paragraph . sconcat <$> sepEndBy1 (line '\n') newline line :: Char -> Parser (NonEmpty Words) line end = sepEndBy1 (wordChunk end) (manyOf ' ') -- | RULES -- -- 1. In-lined markup is not recognized: This is not*bold*. Neither is *this*here. -- 2. Punctuation immediately after markup close /is/ allowed: *This*, in fact, is bold. -- 3. Otherwise, a space, newline or EOF is necessary after the close. -- 4. Any char after a link is fine. -- 5. When rerendering, a space must not appear between the end of a markup close and -- a punctuation/newline character. -- 6. But any other character must have a space before it. wordChunk :: Char -> Parser Words wordChunk end = choice [ try $ Bold <$> between (char '*') (char '*') (someTill' '*') <* pOrS , try $ Italic <$> between (char '/') (char '/') (someTill' '/') <* pOrS , try $ Highlight <$> between (char '~') (char '~') (someTill' '~') <* pOrS , try $ Verbatim <$> between (char '=') (char '=') (someTill' '=') <* pOrS , try $ Underline <$> between (char '_') (char '_') (someTill' '_') <* pOrS , try $ Strike <$> between (char '+') (char '+') (someTill' '+') <* pOrS , try image , try link , try tags , try $ Punct <$> oneOf punc , Plain <$> takeWhile1P (Just "plain text") (\c -> c /= ' ' && c /= end) ] where pOrS :: Parser () pOrS = lookAhead $ void (oneOf $ end : ' ' : punc) <|> eof punc :: String punc = ".,!?():;'" tags :: Parser Words tags = do void $ char ':' Tags <$> (T.pack . NEL.toList <$> some letterChar) `sepEndBy1` char ':' image :: Parser Words image = between (char '[') (char ']') $ between (char '[') (char ']') $ do path <- someTill' ']' let !ext = takeExtension $ T.unpack path when (ext `notElem` [".jpg", ".jpeg", ".png"]) $ failure Nothing mempty pure . Image $ URL path link :: Parser Words link = between (char '[') (char ']') $ Link <$> between (char '[') (char ']') (URL <$> someTill' ']') <*> optional (between (char '[') (char ']') (someTill' ']')) someTillEnd :: Parser Text someTillEnd = someTill' '\n' manyTillEnd :: Parser Text manyTillEnd = takeWhileP (Just "many until the end of the line") (/= '\n') someTill' :: Char -> Parser Text someTill' c = takeWhile1P (Just $ "some until " <> [c]) (/= c) -- | Fast version of `some` specialized to `Text`. someOf :: Char -> Parser Text someOf c = takeWhile1P (Just $ "some of " <> [c]) (== c) manyOf :: Char -> Parser Text manyOf c = takeWhileP (Just $ "many of " <> [c]) (== c) -------------------------------------------------------------------------------- -- Pretty Printing prettyOrgFile :: OrgFile -> Text prettyOrgFile (OrgFile m os) = metas <> "\n\n" <> prettyOrg os where metas = T.intercalate "\n" $ map (\(l, t) -> "#+" <> l <> ": " <> t) $ M.toList m prettyOrg :: OrgDoc -> Text prettyOrg = prettyOrg' 1 prettyOrg' :: Int -> OrgDoc -> Text prettyOrg' depth (OrgDoc bs ss) = T.intercalate "\n\n" $ map prettyBlock bs <> map (prettySection depth) ss prettySection :: Int -> Section -> Text prettySection depth (Section ws ts od) = headig <> "\n\n" <> subdoc where -- TODO There is likely a punctuation bug here. headig = T.unwords $ T.replicate depth "*" : NEL.toList (NEL.map prettyWords ws) <> bool [":" <> T.intercalate ":" ts <> ":"] [] (null ts) subdoc :: Text subdoc = prettyOrg' (succ depth) od prettyBlock :: Block -> Text prettyBlock o = case o of Code l t -> "#+begin_src" <> maybe "" (\(Language l') -> " " <> l' <> "\n") l <> t <> "\n#+end_src" Quote t -> "#+begin_quote\n" <> t <> "\n#+end_quote" Example t -> "#+begin_example\n" <> t <> "\n#+end_example" Paragraph ht -> par ht List items -> lis 0 items Table rows -> T.intercalate "\n" . map row $ NEL.toList rows where lis :: Int -> ListItems -> Text lis indent (ListItems is) = T.intercalate "\n" . map (f indent) $ NEL.toList is f :: Int -> Item -> Text f indent (Item ws li) = T.replicate indent " " <> "- " <> par ws <> maybe "" (\is -> "\n" <> lis (indent + 2) is) li par :: NonEmpty Words -> Text par (h :| t) = prettyWords h <> para h t -- | Stick punctuation directly behind the chars in front of it, while -- paying special attention to parentheses. para :: Words -> [Words] -> Text para _ [] = "" para pr (w:ws) = case pr of Punct '(' -> prettyWords w <> para w ws _ -> case w of Punct '(' -> " " <> prettyWords w <> para w ws Punct _ -> prettyWords w <> para w ws _ -> " " <> prettyWords w <> para w ws row :: Row -> Text row Break = "|-|" row (Row cs) = "| " <> (T.intercalate " | " . map col $ NEL.toList cs) <> " |" col :: Column -> Text col Empty = "" col (Column ws) = T.unwords . map prettyWords $ NEL.toList ws prettyWords :: Words -> Text prettyWords w = case w of Bold t -> "*" <> t <> "*" Italic t -> "/" <> t <> "/" Highlight t -> "~" <> t <> "~" Underline t -> "_" <> t <> "_" Verbatim t -> "=" <> t <> "=" Strike t -> "+" <> t <> "+" Link (URL url) Nothing -> "[[" <> url <> "]]" Link (URL url) (Just t) -> "[[" <> url <> "][" <> t <> "]]" Image (URL url) -> "[[" <> url <> "]]" Tags ts -> ":" <> T.intercalate ":" (NEL.toList ts) <> ":" Punct c -> T.singleton c Plain t -> t nelUnsnoc :: NonEmpty a -> (a, Maybe (NonEmpty a)) nelUnsnoc ne = (NEL.last ne, NEL.nonEmpty $ NEL.init ne)