-- |
-- Module      :  Languages.Phonetic.Ukrainian.Syllable
-- Copyright   :  (c) OleksandrZhabenko 2020
-- License     :  MIT
-- Stability   :  Experimental
-- Maintainer  :  olexandr543@yahoo.com
--
-- This module works with syllable segmentation in Ukrainian. It is rewritten 
-- module MMSyn7.Syllable from the @mmsyn7s@ package : https://hackage.haskell.org/package/mmsyn7s
--

{-# LANGUAGE DeriveDataTypeable, FlexibleInstances #-}

module Languages.Phonetic.Ukrainian.Syllable  where

import Prelude hiding (mappend)
import Data.Monoid
import Data.Typeable
import qualified Data.Vector.Unboxed as V
import qualified Data.Vector as VB
import qualified Data.List as L (groupBy)
import Melodics.ByteString.Ukrainian
import CaseBi.Unboxed (getBFst')
import qualified CaseBi as X (getBFst')
import Data.List.InnToOut.Basic (mapI)

-- Inspired by: https://github.com/OleksandrZhabenko/mm1/releases/tag/0.2.0.0

-- CAUTION: Please, do not mix with the show7s functions, they are not interoperable.

data UZPP a b = UZ a b deriving ( Eq, Typeable )

instance (Ord a, Ord b) => Ord (UZPP a b) where
  compare (UZ x1 y1) (UZ x2 y2) =
    case compare x1 x2 of
      EQ -> compare y1 y2
      ~z -> z

data PhoneticType = W | S | O | D | K | L | M | N | E deriving ( Eq, Ord, Typeable )

type UZPP2 = UZPP Char PhoneticType

instance Show (UZPP Char PhoneticType) where
  show (UZ x y)
   | y `notElem` [O,K,M] =
       X.getBFst' ("", VB.fromList [('-'," "),('0'," "),('1'," "),('A',"дз"),('B',"ж"),('C',"й"),('D',"сь"),('E',"ч"),('F',"ш"),('G',"щ"),('L',"\700"),('M',"\8217"),
        ('a',"а"),('b',"б"),('c',"ц"),('d',"д"),('e',"е"),('f',"ф"),('g',"ґ"),('h',"г"),('i',"і"),('j',"дж"),('k',"к"),('l',"л"),('m',"м"),('n',"н"),('o',"о"),('p',"п"),('q',"ь"),
          ('r',"р"),('s',"с"),('t',"т"),('u',"у"),('v',"в"),('w',"ць"),('x',"х"),('y',"и"),('z',"з")]) x
   | otherwise =
       X.getBFst' ("", VB.fromList [('-'," "),('0'," "),('1'," "),('A',"дзь"),('B',"жь"),('E',"чь"),('F',"шь"),('G',"щь"),('b',"бь"),('d',"дь"),('f',"фь"),('g',"ґь"),
        ('h',"гь"),('j',"джь"),('k',"кь"),('l',"ль"),('m',"мь"),('n',"нь"),('p',"пь"),('q',"ь"),('r',"рь"),('t',"ть"),('v',"вь"),('x',"хь"),('z',"зь")]) x

phoneType :: UZPP2 -> PhoneticType
phoneType (UZ _ y) = y
{-# INLINE phoneType #-}

charUkr :: UZPP2 -> Char
charUkr (UZ x _) = x
{-# INLINE charUkr #-}

vec2UZPP2s :: V.Vector Char -> [UZPP2]
vec2UZPP2s v
  | V.null v = []
  | getBFst' (False, V.fromList [('a',True),('e',True),('i',True),('o',True),('u',True),('y',True)]) . V.unsafeHead $ v = UZ (V.unsafeHead v) W:vec2UZPP2s (V.unsafeTail v)
  | V.unsafeHead v == 'D' || V.unsafeHead v == 'w' = UZ (V.unsafeHead v) N:vec2UZPP2s (V.unsafeTail v)
  | ((V.null . V.unsafeTail $ v) || (V.unsafeIndex v 1 /= 'q')) && getBFst' (False, V.fromList [('C',True),('l',True),('m',True),('n',True),('r',True),('v',True)]) (V.unsafeHead v) =
       UZ (V.unsafeHead v) S:vec2UZPP2s (V.unsafeTail v)
  | ((V.null . V.unsafeTail $ v) || (V.unsafeIndex v 1 /= 'q')) &&
      getBFst' (False, V.fromList [('A',True),('B',True),('b',True),('d',True),('g',True),('h',True),('j',True),('z',True)]) (V.unsafeHead v) =
        UZ (V.unsafeHead v) D:vec2UZPP2s (V.unsafeTail v)
  | ((V.null . V.unsafeTail $ v) || (V.unsafeIndex v 1 /= 'q')) = UZ (V.unsafeHead v) L:vec2UZPP2s (V.unsafeTail v)
  | getBFst' (False, V.fromList [('l',True),('m',True),('n',True),('r',True),('v',True)]) (V.unsafeHead v) = UZ (V.unsafeHead v) O:vec2UZPP2s (V.unsafeDrop 2 v)
  | getBFst' (False, V.fromList [('A',True),('B',True),('b',True),('d',True),('g',True),('h',True),('j',True),('z',True)]) (V.unsafeHead v) =
      UZ (V.unsafeHead v) K:vec2UZPP2s (V.unsafeDrop 2 v)
  | otherwise = UZ (V.unsafeHead v) M:vec2UZPP2s (V.unsafeDrop 2 v)

-- | Function-predicate 'isVowel1' checks whether its argument is a vowel representation in the 'UZPP2' format.
isVowel1 :: UZPP2 -> Bool
isVowel1 = (== W) . phoneType
{-# INLINE isVowel1 #-}

-- | Function-predicate 'isVwl' checks whether its argument is a vowel representation in the 'Char' format.
isVwl :: Char -> Bool
isVwl = getBFst' (False, (V.fromList [('a',True),('e',True),('i',True),('o',True),('u',True),('y',True)]))
{-# INLINE isVwl #-}

-- | Function-predicate 'isSonorous1' checks whether its argument is a sonorous consonant representation in the 'UZPP2' format.
isSonorous1 :: UZPP2 -> Bool
isSonorous1 =  (`elem` [S,O]) . phoneType
{-# INLINE isSonorous1 #-}

-- | Function-predicate 'isVoicedC1' checks whether its argument is a voiced consonant representation in the 'UZPP2' format.
isVoicedC1 ::  UZPP2 -> Bool
isVoicedC1 = (`elem` [D,K]) . phoneType
{-# INLINE isVoicedC1 #-}

-- | Function-predicate 'isVoiceless1' checks whether its argument is a voiceless consonant representation in the 'UZPP2' format.
isVoicelessC1 ::  UZPP2 -> Bool
isVoicelessC1 =  (`elem` [L,M]) . phoneType
{-# INLINE isVoicelessC1 #-}

-- | Binary function-predicate 'isNotVowel2' checks whether its arguments are both consonant representations in the 'UZPP2' format.
isNotVowel2 :: UZPP2 -> UZPP2 -> Bool
isNotVowel2 x y
  | phoneType x == W || phoneType y == W = False
  | otherwise = True
{-# INLINE isNotVowel2 #-}

-- | Binary function-predicate 'notEqC' checks whether its arguments are not the same consonant sound representations (not taking palatalization into account).
notEqC :: UZPP2 -> UZPP2 -> Bool
notEqC x y
  | charUkr x == 's' || charUkr x == 'D' =
      case charUkr y of
        's' -> False
        'D' -> False
        _   -> True
  | charUkr x == 'w' || charUkr x == 'c' =
      case charUkr y of
        'w' -> False
        'c' -> False
        _   -> True
  | otherwise = charUkr x /= charUkr y

-- | Function 'sndGroups' converts a Ukrainian word being a list of 'UZPP2' to the list of phonetically similar (consonants grouped with consonants and each vowel separately)
-- sounds representations in 'UZPP2' format.
sndGroups :: [UZPP2] -> [[UZPP2]]
sndGroups ys@(_:_) = L.groupBy isNotVowel2 ys
sndGroups _ = []

-- | Function 'vecWords' similarly to 'Prelude.words' divides a 'V.Vector' of 'Char' into list of them, each element of which is a Ukrainian word (or its part
-- for dashed and hyphenated words or that ones with an apostrophe).
vecWords :: V.Vector Char -> [V.Vector Char]
vecWords v | V.null v = []
           | V.unsafeHead v == '-' || V.unsafeHead v == '0' || V.unsafeHead v == '1' = vecWords (V.unsafeTail v)
           | otherwise =
  let (v1, v2) = V.break (\x -> x == '-' || x == '0' || x == '1') v
      v3       = snd . V.span (\x -> x == '-' || x == '0' || x == '1') $ v2 in v1:vecWords v3

groupSnds :: [UZPP2] -> [[UZPP2]]
groupSnds = L.groupBy (\x y -> ((== W) . phoneType $ x) == ((== W) . phoneType $ y))

-- | Function 'divCnsnts' is used to divide groups of Ukrainian consonants into two-elements lists that later are made belonging to
-- different neighbour syllables if the group is between two vowels in a word. The group must be not empty, but this is not checked.
-- The phonetical information for the proper performance is taken from the:
-- https://msn.khnu.km.ua/pluginfile.php/302375/mod_resource/content/1/%D0%9B.3.%D0%86%D0%86.%20%D0%A1%D0%BA%D0%BB%D0%B0%D0%B4.%D0%9D%D0%B0%D0%B3%D0%BE%D0%BB%D0%BE%D1%81.pdf
divCnsnts :: [UZPP2] -> ([UZPP2] -> [UZPP2],[UZPP2] -> [UZPP2])
divCnsnts xs@(x:ys@(_:_:_:_))
  | (isSonorous1 x) || (isVoicedC1 x) = ((`mappend` [x]),(ys `mappend`))
  | otherwise = ((id),(xs `mappend`))
divCnsnts xs@(x:ys@(y:zs@(_:_)))
  | isSonorous1 x = ((`mappend` [x]),(ys `mappend`))
  | isSonorous1 y = ((`mappend` [x,y]),(zs `mappend`))
  | otherwise = ((id),(xs `mappend`))
divCnsnts xs@(x:ys@(y:_))
  | ((isSonorous1 x) && (x `notEqC` y)) || ((isVoicedC1 x) && (isVoicelessC1 y)) = ((`mappend` [x]),(ys `mappend`))
  | otherwise = ((id),(xs `mappend`))
divCnsnts xs = ((id),(xs `mappend`))

reSyllableCntnts :: [[UZPP2]] -> [[UZPP2]]
reSyllableCntnts (xs:ys:zs:xss)
  | (/= W) . phoneType . last $ ys = fst (divCnsnts ys) xs:reSyllableCntnts (snd (divCnsnts ys) zs:xss)
  | otherwise = reSyllableCntnts ((xs `mappend` ys):zs:xss)
reSyllableCntnts (xs:ys:_) = [(xs `mappend` ys)]
reSyllableCntnts xss = xss

divVwls :: [[UZPP2]] -> [[UZPP2]]
divVwls = mapI (\ws -> (length . filter ((== W) . phoneType) $ ws) > 1) h3
  where h3 us = [ys `mappend` take 1 zs] `mappend` (L.groupBy (\x y -> phoneType x == W && phoneType y /= W) . drop 1 $ zs)
                  where (ys,zs) = span (\t -> phoneType t /= W) us

createSyllablesUkr :: String -> [[[UZPP2]]]
createSyllablesUkr = map (divVwls . reSyllableCntnts . groupSnds . vec2UZPP2s) . vecWords . V.filter (/='0') . convertToProperUkrainian . map (\x -> if x == '-' then ' ' else x)
{-# INLINE createSyllablesUkr #-}

createSyllablesUkrP :: String -> [[[UZPP2]]]
createSyllablesUkrP = map (map representProlonged . divVwls . reSyllableCntnts . groupSnds . vec2UZPP2s) . vecWords . V.filter (/='0') . convertToProperUkrainian . map (\x -> if x == '-' then ' ' else x)
{-# INLINE createSyllablesUkrP #-}


createSyllablesUkrV :: VB.Vector Char -> [[[UZPP2]]]
createSyllablesUkrV = map (divVwls . reSyllableCntnts . groupSnds . vec2UZPP2s) . vecWords . V.filter (/='0') . convertToProperUkrainianV . VB.map (\x -> if x == '-' then ' ' else x)
{-# INLINE createSyllablesUkrV #-}

createSyllablesUkrVP :: VB.Vector Char -> [[[UZPP2]]]
createSyllablesUkrVP = map (map representProlonged . divVwls . reSyllableCntnts . groupSnds . vec2UZPP2s) . vecWords . V.filter (/='0') . convertToProperUkrainianV . VB.map (\x -> if x == '-' then ' ' else x)
{-# INLINE createSyllablesUkrVP #-}

-- | Function 'representProlonged' converts duplicated consequent in the syllable consonants
-- so that they are represented by just one 'UZPP2'. After applying the function to the list of 'UZPP2' being a syllable all groups of duplicated consequent consonants
-- in every syllable are represented with only one 'UZPP2' respectively.
representProlonged :: [UZPP2] -> [UZPP2]
representProlonged (x:y:xs)
  | isVowel1 x = x:representProlonged (y:xs)
  | not . notEqC x $ y = y:representProlonged xs
  | otherwise = x:representProlonged (y:xs)
representProlonged xs = xs

-- | Is inspired by the DobutokO.Sound.DIS5G6G module from @dobutokO2@ package.
-- See: 'https://hackage.haskell.org/package/dobutokO2-0.43.0.0/docs/DobutokO-Sound-DIS5G6G.html'. The 'Float' data are gotten from there.
str2Durat1 :: String -> Float
str2Durat1 = X.getBFst' ((0.153016), VB.fromList [("-", (0.101995)), ("0", (0.051020)), ("1", (0.153016)), ("а", 0.138231), ("б", 0.057143),
  ("в", 0.082268), ("г", 0.076825), ("д", 0.072063), ("дж", 0.048934), ("дз", 0.055601), ("е", 0.093605), ("ж", 0.070658), ("з", 0.056054),
    ("и", 0.099955), ("й", 0.057143), ("к", 0.045351), ("л", 0.064036), ("м", 0.077370), ("н", 0.074240), ("о", 0.116463), ("п", 0.134830),
      ("р", 0.049206), ("с", 0.074603), ("сь", 0.074558), ("т", 0.110658), ("у", 0.109070), ("ф", 0.062268), ("х", 0.077188), ("ц", 0.053061),
        ("ць", 0.089342), ("ч", 0.057596), ("ш", 0.066077), ("ь", 0.020227), ("і", 0.094150), ("ґ", 0.062948)])

uzpp2Durat1 :: UZPP2 -> Float
uzpp2Durat1 = X.getBFst' (0.051020, VB.fromList [(UZ 'A' D, 0.055601), (UZ 'A' K, 0.055601), (UZ 'B' D, 0.070658), (UZ 'B' K, 0.070658), (UZ 'C' S, 0.057143), (UZ 'D' N, 0.074558),
  (UZ 'E' L, 0.057596), (UZ 'E' M, 0.057596), (UZ 'F' L, 0.066077), (UZ 'F' M, 0.066077), (UZ 'a' W, 0.138231), (UZ 'b' D, 0.057143), (UZ 'b' K, 0.057143), (UZ 'c' D, 0.053061),
   (UZ 'd' D, 0.072063), (UZ 'd' K, 0.072063), (UZ 'e' W, 0.093605), (UZ 'f' L, 0.062268), (UZ 'f' M, 0.062268),  (UZ 'g' D, 0.062948), (UZ 'g' K, 0.062948), (UZ 'h' D, 0.076825),
    (UZ 'h' K, 0.076825), (UZ 'i' W, 0.094150), (UZ 'j' D, 0.048934), (UZ 'j' K, 0.048934), (UZ 'k' L, 0.045351), (UZ 'k' M, 0.045351), (UZ 'l' S, 0.064036), (UZ 'l' O, 0.064036),
     (UZ 'm' S, 0.077370), (UZ 'm' O, 0.077370), (UZ 'n' S, 0.074240), (UZ 'n' O, 0.074240), (UZ 'o' W, 0.116463), (UZ 'p' L, 0.134830), (UZ 'p' M, 0.134830),
      (UZ 'q' E, 0.020227), (UZ 'r' S, 0.049206), (UZ 'r' O, 0.049206), (UZ 's' L, 0.074603),  (UZ 't' L, 0.110658), (UZ 't' M, 0.110658), (UZ 'u' W, 0.109070), (UZ 'v' S, 0.082268),
       (UZ 'v' O, 0.082268), (UZ 'w' N, 0.089342), (UZ 'x' L, 0.077188), (UZ 'x' M, 0.077188), (UZ 'y' W, 0.099955), (UZ 'z' D, 0.056054), (UZ 'z' K, 0.056054)])

-- | Returns list of lists, every inner one of which contains approximate durations of the Ukrainian syllables.
syllableDurations :: [[[UZPP2]]] -> [[Float]]
syllableDurations = fmap (fmap (sum . fmap (uzpp2Durat1)))