{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE BangPatterns #-}

module Commonmark.Tokens
  ( Tok(..)
  , TokType(..)
  , SourcePos
  , tokenize
  , untokenize
  ) where

import           Unicode.Char    (isAlphaNum)
import           Unicode.Char.General.Compat  (isSpace)
import           Data.Text       (Text)
import qualified Data.Text       as T
import           Data.Data       (Data, Typeable)
import           Text.Parsec.Pos
import           Data.Text.Normalize (normalize, NormalizationMode(NFC))

data Tok = Tok { Tok -> TokType
tokType     :: !TokType
               , Tok -> SourcePos
tokPos      :: !SourcePos
               , Tok -> Text
tokContents :: {-# UNPACK #-} !Text
               }
               deriving (Int -> Tok -> ShowS
[Tok] -> ShowS
Tok -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Tok] -> ShowS
$cshowList :: [Tok] -> ShowS
show :: Tok -> String
$cshow :: Tok -> String
showsPrec :: Int -> Tok -> ShowS
$cshowsPrec :: Int -> Tok -> ShowS
Show, Tok -> Tok -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Tok -> Tok -> Bool
$c/= :: Tok -> Tok -> Bool
== :: Tok -> Tok -> Bool
$c== :: Tok -> Tok -> Bool
Eq, Typeable Tok
Tok -> DataType
Tok -> Constr
(forall b. Data b => b -> b) -> Tok -> Tok
forall a.
Typeable a
-> (forall (c :: * -> *).
    (forall d b. Data d => c (d -> b) -> d -> c b)
    -> (forall g. g -> c g) -> a -> c a)
-> (forall (c :: * -> *).
    (forall b r. Data b => c (b -> r) -> c r)
    -> (forall r. r -> c r) -> Constr -> c a)
-> (a -> Constr)
-> (a -> DataType)
-> (forall (t :: * -> *) (c :: * -> *).
    Typeable t =>
    (forall d. Data d => c (t d)) -> Maybe (c a))
-> (forall (t :: * -> * -> *) (c :: * -> *).
    Typeable t =>
    (forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c a))
-> ((forall b. Data b => b -> b) -> a -> a)
-> (forall r r'.
    (r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> a -> r)
-> (forall r r'.
    (r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> a -> r)
-> (forall u. (forall d. Data d => d -> u) -> a -> [u])
-> (forall u. Int -> (forall d. Data d => d -> u) -> a -> u)
-> (forall (m :: * -> *).
    Monad m =>
    (forall d. Data d => d -> m d) -> a -> m a)
-> (forall (m :: * -> *).
    MonadPlus m =>
    (forall d. Data d => d -> m d) -> a -> m a)
-> (forall (m :: * -> *).
    MonadPlus m =>
    (forall d. Data d => d -> m d) -> a -> m a)
-> Data a
forall u. Int -> (forall d. Data d => d -> u) -> Tok -> u
forall u. (forall d. Data d => d -> u) -> Tok -> [u]
forall r r'.
(r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> Tok -> r
forall r r'.
(r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> Tok -> r
forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> Tok -> m Tok
forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Tok -> m Tok
forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c Tok
forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> Tok -> c Tok
forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c Tok)
forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c Tok)
gmapMo :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Tok -> m Tok
$cgmapMo :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Tok -> m Tok
gmapMp :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Tok -> m Tok
$cgmapMp :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> Tok -> m Tok
gmapM :: forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> Tok -> m Tok
$cgmapM :: forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> Tok -> m Tok
gmapQi :: forall u. Int -> (forall d. Data d => d -> u) -> Tok -> u
$cgmapQi :: forall u. Int -> (forall d. Data d => d -> u) -> Tok -> u
gmapQ :: forall u. (forall d. Data d => d -> u) -> Tok -> [u]
$cgmapQ :: forall u. (forall d. Data d => d -> u) -> Tok -> [u]
gmapQr :: forall r r'.
(r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> Tok -> r
$cgmapQr :: forall r r'.
(r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> Tok -> r
gmapQl :: forall r r'.
(r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> Tok -> r
$cgmapQl :: forall r r'.
(r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> Tok -> r
gmapT :: (forall b. Data b => b -> b) -> Tok -> Tok
$cgmapT :: (forall b. Data b => b -> b) -> Tok -> Tok
dataCast2 :: forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c Tok)
$cdataCast2 :: forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c Tok)
dataCast1 :: forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c Tok)
$cdataCast1 :: forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c Tok)
dataTypeOf :: Tok -> DataType
$cdataTypeOf :: Tok -> DataType
toConstr :: Tok -> Constr
$ctoConstr :: Tok -> Constr
gunfold :: forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c Tok
$cgunfold :: forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c Tok
gfoldl :: forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> Tok -> c Tok
$cgfoldl :: forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> Tok -> c Tok
Data, Typeable)

data TokType =
       Spaces
     | UnicodeSpace
     | LineEnd
     | WordChars
     | Symbol {-# UNPACK #-} !Char
     deriving (Int -> TokType -> ShowS
[TokType] -> ShowS
TokType -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [TokType] -> ShowS
$cshowList :: [TokType] -> ShowS
show :: TokType -> String
$cshow :: TokType -> String
showsPrec :: Int -> TokType -> ShowS
$cshowsPrec :: Int -> TokType -> ShowS
Show, TokType -> TokType -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: TokType -> TokType -> Bool
$c/= :: TokType -> TokType -> Bool
== :: TokType -> TokType -> Bool
$c== :: TokType -> TokType -> Bool
Eq, Eq TokType
TokType -> TokType -> Bool
TokType -> TokType -> Ordering
TokType -> TokType -> TokType
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: TokType -> TokType -> TokType
$cmin :: TokType -> TokType -> TokType
max :: TokType -> TokType -> TokType
$cmax :: TokType -> TokType -> TokType
>= :: TokType -> TokType -> Bool
$c>= :: TokType -> TokType -> Bool
> :: TokType -> TokType -> Bool
$c> :: TokType -> TokType -> Bool
<= :: TokType -> TokType -> Bool
$c<= :: TokType -> TokType -> Bool
< :: TokType -> TokType -> Bool
$c< :: TokType -> TokType -> Bool
compare :: TokType -> TokType -> Ordering
$ccompare :: TokType -> TokType -> Ordering
Ord, Typeable TokType
TokType -> DataType
TokType -> Constr
(forall b. Data b => b -> b) -> TokType -> TokType
forall a.
Typeable a
-> (forall (c :: * -> *).
    (forall d b. Data d => c (d -> b) -> d -> c b)
    -> (forall g. g -> c g) -> a -> c a)
-> (forall (c :: * -> *).
    (forall b r. Data b => c (b -> r) -> c r)
    -> (forall r. r -> c r) -> Constr -> c a)
-> (a -> Constr)
-> (a -> DataType)
-> (forall (t :: * -> *) (c :: * -> *).
    Typeable t =>
    (forall d. Data d => c (t d)) -> Maybe (c a))
-> (forall (t :: * -> * -> *) (c :: * -> *).
    Typeable t =>
    (forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c a))
-> ((forall b. Data b => b -> b) -> a -> a)
-> (forall r r'.
    (r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> a -> r)
-> (forall r r'.
    (r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> a -> r)
-> (forall u. (forall d. Data d => d -> u) -> a -> [u])
-> (forall u. Int -> (forall d. Data d => d -> u) -> a -> u)
-> (forall (m :: * -> *).
    Monad m =>
    (forall d. Data d => d -> m d) -> a -> m a)
-> (forall (m :: * -> *).
    MonadPlus m =>
    (forall d. Data d => d -> m d) -> a -> m a)
-> (forall (m :: * -> *).
    MonadPlus m =>
    (forall d. Data d => d -> m d) -> a -> m a)
-> Data a
forall u. Int -> (forall d. Data d => d -> u) -> TokType -> u
forall u. (forall d. Data d => d -> u) -> TokType -> [u]
forall r r'.
(r -> r' -> r)
-> r -> (forall d. Data d => d -> r') -> TokType -> r
forall r r'.
(r' -> r -> r)
-> r -> (forall d. Data d => d -> r') -> TokType -> r
forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> TokType -> m TokType
forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> TokType -> m TokType
forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c TokType
forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> TokType -> c TokType
forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c TokType)
forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c TokType)
gmapMo :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> TokType -> m TokType
$cgmapMo :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> TokType -> m TokType
gmapMp :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> TokType -> m TokType
$cgmapMp :: forall (m :: * -> *).
MonadPlus m =>
(forall d. Data d => d -> m d) -> TokType -> m TokType
gmapM :: forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> TokType -> m TokType
$cgmapM :: forall (m :: * -> *).
Monad m =>
(forall d. Data d => d -> m d) -> TokType -> m TokType
gmapQi :: forall u. Int -> (forall d. Data d => d -> u) -> TokType -> u
$cgmapQi :: forall u. Int -> (forall d. Data d => d -> u) -> TokType -> u
gmapQ :: forall u. (forall d. Data d => d -> u) -> TokType -> [u]
$cgmapQ :: forall u. (forall d. Data d => d -> u) -> TokType -> [u]
gmapQr :: forall r r'.
(r' -> r -> r)
-> r -> (forall d. Data d => d -> r') -> TokType -> r
$cgmapQr :: forall r r'.
(r' -> r -> r)
-> r -> (forall d. Data d => d -> r') -> TokType -> r
gmapQl :: forall r r'.
(r -> r' -> r)
-> r -> (forall d. Data d => d -> r') -> TokType -> r
$cgmapQl :: forall r r'.
(r -> r' -> r)
-> r -> (forall d. Data d => d -> r') -> TokType -> r
gmapT :: (forall b. Data b => b -> b) -> TokType -> TokType
$cgmapT :: (forall b. Data b => b -> b) -> TokType -> TokType
dataCast2 :: forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c TokType)
$cdataCast2 :: forall (t :: * -> * -> *) (c :: * -> *).
Typeable t =>
(forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c TokType)
dataCast1 :: forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c TokType)
$cdataCast1 :: forall (t :: * -> *) (c :: * -> *).
Typeable t =>
(forall d. Data d => c (t d)) -> Maybe (c TokType)
dataTypeOf :: TokType -> DataType
$cdataTypeOf :: TokType -> DataType
toConstr :: TokType -> Constr
$ctoConstr :: TokType -> Constr
gunfold :: forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c TokType
$cgunfold :: forall (c :: * -> *).
(forall b r. Data b => c (b -> r) -> c r)
-> (forall r. r -> c r) -> Constr -> c TokType
gfoldl :: forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> TokType -> c TokType
$cgfoldl :: forall (c :: * -> *).
(forall d b. Data d => c (d -> b) -> d -> c b)
-> (forall g. g -> c g) -> TokType -> c TokType
Data, Typeable)

-- | Convert a 'Text' into a list of 'Tok'. The first parameter
-- species the source name.
tokenize :: String -> Text -> [Tok]
tokenize :: String -> Text -> [Tok]
tokenize String
name =
  {-# SCC tokenize #-} SourcePos -> [Text] -> [Tok]
go (String -> SourcePos
initialPos String
name) forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Char -> Bool) -> Text -> [Text]
T.groupBy Char -> Char -> Bool
f forall b c a. (b -> c) -> (a -> b) -> a -> c
. NormalizationMode -> Text -> Text
normalize NormalizationMode
NFC
  where
    -- We group \r\n, consecutive spaces, and consecutive alphanums;
    -- everything else gets in a token by itself.
    f :: Char -> Char -> Bool
f Char
'\r' Char
'\n' = Bool
True
    f Char
' ' Char
' '   = Bool
True
    f Char
x   Char
y     = Char -> Bool
isAlphaNum Char
x Bool -> Bool -> Bool
&& Char -> Bool
isAlphaNum Char
y

    go :: SourcePos -> [Text] -> [Tok]
go !SourcePos
_pos [] = []
    go !SourcePos
pos (!Text
t:[Text]
ts) = -- note that t:ts are guaranteed to be nonempty
      case Text -> Char
T.head Text
t of
         Char
' ' ->  TokType -> SourcePos -> Text -> Tok
Tok TokType
Spaces SourcePos
pos Text
t forall a. a -> [a] -> [a]
:
                 SourcePos -> [Text] -> [Tok]
go (SourcePos -> Int -> SourcePos
incSourceColumn SourcePos
pos (Text -> Int
T.length Text
t)) [Text]
ts
         Char
'\t' -> TokType -> SourcePos -> Text -> Tok
Tok TokType
Spaces SourcePos
pos Text
t forall a. a -> [a] -> [a]
:
                 SourcePos -> [Text] -> [Tok]
go (SourcePos -> Int -> SourcePos
incSourceColumn SourcePos
pos
                       (Int
4 forall a. Num a => a -> a -> a
- (SourcePos -> Int
sourceColumn SourcePos
pos forall a. Num a => a -> a -> a
- Int
1) forall a. Integral a => a -> a -> a
`mod` Int
4)) [Text]
ts
         Char
'\r' -> TokType -> SourcePos -> Text -> Tok
Tok TokType
LineEnd SourcePos
pos Text
t forall a. a -> [a] -> [a]
:
                 SourcePos -> [Text] -> [Tok]
go (SourcePos -> Int -> SourcePos
incSourceLine (SourcePos -> Int -> SourcePos
setSourceColumn SourcePos
pos Int
1) Int
1) [Text]
ts
         Char
'\n' -> TokType -> SourcePos -> Text -> Tok
Tok TokType
LineEnd SourcePos
pos Text
t forall a. a -> [a] -> [a]
:
                 SourcePos -> [Text] -> [Tok]
go (SourcePos -> Int -> SourcePos
incSourceLine (SourcePos -> Int -> SourcePos
setSourceColumn SourcePos
pos Int
1) Int
1) [Text]
ts
         Char
thead
           | Char -> Bool
isAlphaNum Char
thead ->
                 TokType -> SourcePos -> Text -> Tok
Tok TokType
WordChars SourcePos
pos Text
t forall a. a -> [a] -> [a]
:
                 SourcePos -> [Text] -> [Tok]
go (SourcePos -> Int -> SourcePos
incSourceColumn SourcePos
pos (Text -> Int
T.length Text
t)) [Text]
ts
           | Char -> Bool
isSpace Char
thead ->
                 TokType -> SourcePos -> Text -> Tok
Tok TokType
UnicodeSpace SourcePos
pos Text
t forall a. a -> [a] -> [a]
:
                 SourcePos -> [Text] -> [Tok]
go (SourcePos -> Int -> SourcePos
incSourceColumn SourcePos
pos Int
1) [Text]
ts
           | Bool
otherwise ->
                 TokType -> SourcePos -> Text -> Tok
Tok (Char -> TokType
Symbol Char
thead) SourcePos
pos Text
t forall a. a -> [a] -> [a]
:
                 SourcePos -> [Text] -> [Tok]
go (SourcePos -> Int -> SourcePos
incSourceColumn SourcePos
pos Int
1) [Text]
ts

-- | Reverses 'tokenize'.  @untokenize . tokenize@ should be
-- the identity.
untokenize :: [Tok] -> Text
untokenize :: [Tok] -> Text
untokenize = {-# SCC untokenize #-} forall a. Monoid a => [a] -> a
mconcat forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map Tok -> Text
tokContents