module Data.Char.Devanagari.Tokenizer
  ( selectTokenizerByContent,
    tokenize,
    fromDevanagari,
    fromIso,
    fromHarvard,
    fromIast,
    Tokenizer,
  )
where

{-
  This module contains Tokenizers for the Devanagari script and its roman transliterations Harvard-Kyoto, IAST and ISO15919.
  It also contains a generic tokenize function that detects the input encoding and automatically selects the right
  tokenizer to use.
-}

import           Control.Monad                         (join)
import           Data.Char.Devanagari.DevanagariTokens
import           Data.Char.Devanagari.TokenTables
import           Data.Map.Strict                       (Map)
import qualified Data.Map.Strict                       as Map
import           Data.Maybe                            (isJust)
import           Data.Sequence                         (Seq (Empty, (:<|)),
                                                        empty, (|>))
import           Data.Text                             (Text)
import qualified Data.Text                       as T

-- | A Tokenizer is a function that takes a Text as input and produces a Sequence of DevanagariToken instances as output.
type Tokenizer = (Text -> Seq DevanagariToken)

-- | A ParseMap is a Map from a Text to a DevanagariToken.
type ParseMap = Map Text DevanagariToken

harvardKyotoParseMap :: ParseMap
harvardKyotoParseMap :: ParseMap
harvardKyotoParseMap = [(Text, DevanagariToken)] -> ParseMap
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList ([(Text, DevanagariToken)]
harvardKyotoTable [(Text, DevanagariToken)]
-> [(Text, DevanagariToken)] -> [(Text, DevanagariToken)]
forall a. [a] -> [a] -> [a]
++ [(Text, DevanagariToken)]
inputVirams)

iastParseMap :: ParseMap
iastParseMap :: ParseMap
iastParseMap = [(Text, DevanagariToken)] -> ParseMap
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList ([(Text, DevanagariToken)]
iastTable [(Text, DevanagariToken)]
-> [(Text, DevanagariToken)] -> [(Text, DevanagariToken)]
forall a. [a] -> [a] -> [a]
++ [(Text, DevanagariToken)]
inputVirams)

isoParseMap :: ParseMap
isoParseMap :: ParseMap
isoParseMap = [(Text, DevanagariToken)] -> ParseMap
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList ([(Text, DevanagariToken)]
isoTable [(Text, DevanagariToken)]
-> [(Text, DevanagariToken)] -> [(Text, DevanagariToken)]
forall a. [a] -> [a] -> [a]
++ [(Text, DevanagariToken)]
inputVirams)

devanagariParseMap :: ParseMap
devanagariParseMap :: ParseMap
devanagariParseMap = [(Text, DevanagariToken)] -> ParseMap
forall k a. Ord k => [(k, a)] -> Map k a
Map.fromList ([(Text, DevanagariToken)]
devanagariIndependentTable [(Text, DevanagariToken)]
-> [(Text, DevanagariToken)] -> [(Text, DevanagariToken)]
forall a. [a] -> [a] -> [a]
++ [(Text, DevanagariToken)]
devanagariDependentTable [(Text, DevanagariToken)]
-> [(Text, DevanagariToken)] -> [(Text, DevanagariToken)]
forall a. [a] -> [a] -> [a]
++ [(Text, DevanagariToken)]
inputVirams)

-- | parse a Text into a Sequence of DevanagariToken instances using a ParseMap.
parse :: ParseMap -> Text -> Seq DevanagariToken
parse :: ParseMap -> Text -> Seq DevanagariToken
parse ParseMap
pMap Text
s = Text -> ParseMap -> Seq DevanagariToken -> Seq DevanagariToken
parse1 Text
s ParseMap
pMap Seq DevanagariToken
forall a. Seq a
empty
  where
    parse1 :: Text -> ParseMap -> Seq DevanagariToken -> Seq DevanagariToken
    parse1 :: Text -> ParseMap -> Seq DevanagariToken -> Seq DevanagariToken
parse1 Text
str ParseMap
_ Seq DevanagariToken
tokens
      | Text
str Text -> Text -> Bool
forall a. Eq a => a -> a -> Bool
== Text
T.empty = Seq DevanagariToken
tokens
    parse1 Text
str ParseMap
parseMap Seq DevanagariToken
tokens =
      case Text -> Int -> ParseMap -> Maybe (DevanagariToken, Text)
tryMatch Text
str Int
3 ParseMap
parseMap of
        Just (DevanagariToken
token, Text
rest) -> Text -> ParseMap -> Seq DevanagariToken -> Seq DevanagariToken
parse1 Text
rest ParseMap
parseMap (Seq DevanagariToken
tokens Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> DevanagariToken
token)
        Maybe (DevanagariToken, Text)
Nothing ->
          case Text -> Int -> ParseMap -> Maybe (DevanagariToken, Text)
tryMatch Text
str Int
2 ParseMap
parseMap of
            Just (DevanagariToken
token, Text
rest) -> Text -> ParseMap -> Seq DevanagariToken -> Seq DevanagariToken
parse1 Text
rest ParseMap
parseMap (Seq DevanagariToken
tokens Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> DevanagariToken
token)
            Maybe (DevanagariToken, Text)
Nothing ->
              case Text -> Int -> ParseMap -> Maybe (DevanagariToken, Text)
tryMatch Text
str Int
1 ParseMap
parseMap of
                Just (DevanagariToken
token, Text
rest) -> Text -> ParseMap -> Seq DevanagariToken -> Seq DevanagariToken
parse1 Text
rest ParseMap
parseMap (Seq DevanagariToken
tokens Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> DevanagariToken
token)
                Maybe (DevanagariToken, Text)
Nothing -> Text -> ParseMap -> Seq DevanagariToken -> Seq DevanagariToken
parse1 (Int -> Text -> Text
T.drop Int
1 Text
str) ParseMap
parseMap (Seq DevanagariToken
tokens Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> Char -> DevanagariToken
Unmapped ([Char] -> Char
forall a. HasCallStack => [a] -> a
head ([Char] -> Char) -> [Char] -> Char
forall a b. (a -> b) -> a -> b
$ Text -> [Char]
T.unpack (Text -> [Char]) -> Text -> [Char]
forall a b. (a -> b) -> a -> b
$ Int -> Text -> Text
T.take Int
1 Text
str))

    tryMatch :: Text -> Int -> ParseMap -> Maybe (DevanagariToken, Text)
    tryMatch :: Text -> Int -> ParseMap -> Maybe (DevanagariToken, Text)
tryMatch Text
str Int
n ParseMap
parseMap =
      let tok :: Text
tok = Int -> Text -> Text
T.take Int
n Text
str
          rest :: Text
rest = (Text, Text) -> Text
forall a b. (a, b) -> b
snd ((Text, Text) -> Text) -> (Text, Text) -> Text
forall a b. (a -> b) -> a -> b
$ Int -> Text -> (Text, Text)
T.splitAt (Text -> Int
T.length Text
tok) Text
str
          maybeToken :: Maybe DevanagariToken
maybeToken = Text -> ParseMap -> Maybe DevanagariToken
forall k a. Ord k => k -> Map k a -> Maybe a
Map.lookup Text
tok ParseMap
parseMap
       in case Maybe DevanagariToken
maybeToken of
            Just DevanagariToken
token -> (DevanagariToken, Text) -> Maybe (DevanagariToken, Text)
forall a. a -> Maybe a
Just (DevanagariToken
token, Text
rest)
            Maybe DevanagariToken
Nothing    -> Maybe (DevanagariToken, Text)
forall a. Maybe a
Nothing

-- | a tokenizer function that parses a Text containing IAST encoded Devanagari script into a Sequence of DevanagariToken instances.
fromIast :: Tokenizer
fromIast :: Text -> Seq DevanagariToken
fromIast = ParseMap -> Text -> Seq DevanagariToken
parse ParseMap
iastParseMap

-- | a tokenizer function that parses a Text containing ISO15919 encoded Devanagari script into a Sequence of DevanagariToken instances.
fromIso :: Tokenizer
fromIso :: Text -> Seq DevanagariToken
fromIso = ParseMap -> Text -> Seq DevanagariToken
parse ParseMap
isoParseMap

-- | a tokenizer function that parses a Text containing Harvard-Kyoto encoded Devanagari script into a Sequence of DevanagariToken instances.
fromHarvard :: Tokenizer
fromHarvard :: Text -> Seq DevanagariToken
fromHarvard = ParseMap -> Text -> Seq DevanagariToken
parse ParseMap
harvardKyotoParseMap

-- | a tokenizer function that parses a Text containing Devanagari script into a Sequence of DevanagariToken instances.
fromDevanagari :: Tokenizer
fromDevanagari :: Text -> Seq DevanagariToken
fromDevanagari Text
s = Seq DevanagariToken -> Seq DevanagariToken -> Seq DevanagariToken
addExplicitVowA Seq DevanagariToken
forall a. Seq a
empty (ParseMap -> Text -> Seq DevanagariToken
parse ParseMap
devanagariParseMap Text
s)
  where
    addExplicitVowA :: Seq DevanagariToken -> Seq DevanagariToken -> Seq DevanagariToken
    addExplicitVowA :: Seq DevanagariToken -> Seq DevanagariToken -> Seq DevanagariToken
addExplicitVowA Seq DevanagariToken
acc Seq DevanagariToken
Empty = Seq DevanagariToken
acc
    addExplicitVowA Seq DevanagariToken
acc (cons :: DevanagariToken
cons@(Cons Consonant
_) :<| DevanagariToken
Virama :<| Seq DevanagariToken
xs) = Seq DevanagariToken -> Seq DevanagariToken -> Seq DevanagariToken
addExplicitVowA (Seq DevanagariToken
acc Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> DevanagariToken
cons) Seq DevanagariToken
xs
    addExplicitVowA Seq DevanagariToken
acc (cons :: DevanagariToken
cons@(Cons Consonant
_) :<| vow :: DevanagariToken
vow@(Vow Vowel
_) :<| Seq DevanagariToken
xs) = Seq DevanagariToken -> Seq DevanagariToken -> Seq DevanagariToken
addExplicitVowA (Seq DevanagariToken
acc Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> DevanagariToken
cons Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> DevanagariToken
vow) Seq DevanagariToken
xs
    addExplicitVowA Seq DevanagariToken
acc (cons :: DevanagariToken
cons@(Cons Consonant
_) :<| Seq DevanagariToken
xs) = Seq DevanagariToken -> Seq DevanagariToken -> Seq DevanagariToken
addExplicitVowA (Seq DevanagariToken
acc Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> DevanagariToken
cons Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> Vowel -> DevanagariToken
Vow Vowel
A) Seq DevanagariToken
xs
    addExplicitVowA Seq DevanagariToken
acc (DevanagariToken
x :<| Seq DevanagariToken
xs) = Seq DevanagariToken -> Seq DevanagariToken -> Seq DevanagariToken
addExplicitVowA (Seq DevanagariToken
acc Seq DevanagariToken -> DevanagariToken -> Seq DevanagariToken
forall a. Seq a -> a -> Seq a
|> DevanagariToken
x) Seq DevanagariToken
xs

-- | tokenize a string of Text into a sequence of DevanagariTokens.
-- The actual tokenizer is selected based on the content of the input string.
-- This tokenizer is then applied to the input string.
tokenize :: Tokenizer
tokenize :: Text -> Seq DevanagariToken
tokenize = (Text -> Text -> Seq DevanagariToken)
-> Text -> Seq DevanagariToken
forall (m :: * -> *) a. Monad m => m (m a) -> m a
join Text -> Text -> Seq DevanagariToken
selectTokenizerByContent

-- | select the correct tokenizer based on the content of the input string.
selectTokenizerByContent :: Text -> Tokenizer
selectTokenizerByContent :: Text -> Text -> Seq DevanagariToken
selectTokenizerByContent Text
str
  | Text -> Bool
containsDevanagari Text
str = Text -> Seq DevanagariToken
fromDevanagari
  | Text -> Bool
containsIso Text
str = Text -> Seq DevanagariToken
fromIso
  | Text -> Bool
containsIast Text
str = Text -> Seq DevanagariToken
fromIast
  | Bool
otherwise = Text -> Seq DevanagariToken
fromHarvard
  where
    containsDevanagari :: Text -> Bool
containsDevanagari = [Char] -> Text -> Bool
containsAnyOf ([Char
'\x900' .. Char
'\x963'] [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char
'\x966' .. Char
'\x97F']) -- Unicode section for Devanagari
    containsIso :: Text -> Bool
containsIso = [Char] -> Text -> Bool
containsAnyOf ([Char]
"ēōṁ" [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char
'\0325', Char
'\0304']) -- ISO15919 diacritics
    containsIast :: Text -> Bool
containsIast = [Char] -> Text -> Bool
containsAnyOf ([Char
'\241' .. Char
'\363'] [Char] -> [Char] -> [Char]
forall a. [a] -> [a] -> [a]
++ [Char
'\7693' .. Char
'\7789']) -- IAST diacritics
    containsAnyOf :: [Char] -> Text -> Bool
    containsAnyOf :: [Char] -> Text -> Bool
containsAnyOf [Char]
chars Text
text = (Char -> Bool) -> Text -> Bool
T.any (Char -> Text -> Bool
`isInfixOf` Text
text) (Text -> Bool) -> Text -> Bool
forall a b. (a -> b) -> a -> b
$ [Char] -> Text
T.pack [Char]
chars
    isInfixOf :: Char -> Text -> Bool
isInfixOf Char
c Text
text = Maybe Char -> Bool
forall a. Maybe a -> Bool
isJust ((Char -> Bool) -> Text -> Maybe Char
T.find (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
c) Text
text)