{-# OPTIONS_HADDOCK show-extensions #-} {-# OPTIONS_GHC -funbox-strict-fields -fobject-code #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE UnboxedTuples #-} {-# LANGUAGE MagicHash #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE NoImplicitPrelude #-} -- | -- Module : Phladiprelio.General.Syllables -- Copyright : (c) Oleksandr Zhabenko 2021-2024 -- License : MIT -- Stability : Experimental -- Maintainer : oleksandr.zhabenko@yahoo.com -- -- This module works with syllable segmentation. The generalized version for the module -- 'Phladiprelio.Ukrainian.Syllable' from @ukrainian-phonetics-basic-array@ package. -- module Phladiprelio.General.Syllables ( -- * Data types and type synonyms PRS(..) , PhoneticType(..) , CharPhoneticClassification , StringRepresentation , SegmentationInfo1(..) , SegmentationPredFunction(..) , SegmentationPredFData(..) , SegmentationFDP , Eval2Bool(..) , DListFunctionResult , SegmentationLineFunction(..) , SegmentationRules1(..) , SegmentRulesG , DListRepresentation(..) -- * Basic functions , str2PRSs , sndGroups , groupSnds , divCnsnts , reSyllableCntnts , divSylls , createSyllablesPL -- * Auxiliary functions , gBF4 , findC , createsSyllable , isSonorous1 , isVoicedC1 , isVoicelessC1 , notCreatesSyllable2 , notEqC , fromPhoneticType ) where import GHC.Base import GHC.List import qualified Data.List as L (groupBy,find,intercalate,words) import Phladiprelio.General.Base import CaseBi.Arr import GHC.Arr import GHC.Exts import Data.IntermediateStructures1 (mapI) import Data.Maybe (mapMaybe,fromJust) import GHC.Int import Text.Read (Read(..),readMaybe) import Text.Show (Show(..)) import Data.Char (isLetter) import GHC.Num ((-)) import Data.Tuple (fst, snd) import GHC.Enum (fromEnum) -- Inspired by: https://github.com/OleksandrZhabenko/mm1/releases/tag/0.2.0.0 -- CAUTION: Please, do not mix with the show7s functions, they are not interoperable. data PRS = SylS { charS :: {-# UNPACK #-} !Char, -- ^ Phonetic languages phenomenon representation. Usually, a phoneme, but it can be otherwise something different. phoneType :: {-# UNPACK #-} !PhoneticType -- ^ Some encoded type. For the vowels it has reserved value of 'P' 0, for the sonorous consonants - 'P' 1 and 'P' 2, -- for the voiced consonants - 'P' 3 and 'P' 4, for the voiceless consonants - 'P' 5 and 'P' 6. Nevertheless, it is possible to redefine the data by rewriting the -- respective parts of the code here. } deriving ( Eq, Read ) instance Ord PRS where compare (SylS x1 y1) (SylS x2 y2) = case compare x1 x2 of EQ -> compare y1 y2 ~z -> z instance Show PRS where show (SylS c (P x)) = "SylS \'" `mappend` (c:'\'':' ':show x) data PhoneticType = P {-# UNPACK #-} !Int8 deriving (Eq, Ord, Read) instance Show PhoneticType where show (P x) = 'P':' ':show x fromPhoneticType :: PhoneticType -> Int fromPhoneticType (P x) = fromEnum x -- | The 'Array' 'Int' must be sorted in the ascending order to be used in the module correctly. type CharPhoneticClassification = Array Int PRS -- | The 'String' of converted phonetic language representation 'Char' data is converted to this type to apply syllable -- segmentation or other transformations. type StringRepresentation = [PRS] -- | Is somewhat rewritten from the 'CaseBi.Arr.gBF3' function (not exported) from the @mmsyn2-array@ package. gBF4 :: (Ix i) => (# Int#, PRS #) -> (# Int#, PRS #) -> Char -> Array i PRS -> Maybe PRS gBF4 (# !i#, k #) (# !j#, m #) c arr | isTrue# ((j# -# i#) ># 1# ) = case compare c (charS p) of GT -> gBF4 (# n#, p #) (# j#, m #) c arr LT -> gBF4 (# i#, k #) (# n#, p #) c arr _ -> Just p | c == charS m = Just m | c == charS k = Just k | otherwise = Nothing where !n# = (i# +# j#) `quotInt#` 2# !p = unsafeAt arr (I# n#) findC :: Char -> Array Int PRS -> Maybe PRS findC c arr = gBF4 (# i#, k #) (# j#, m #) c arr where !(I# i#,I# j#) = bounds arr !k = unsafeAt arr (I# i#) !m = unsafeAt arr (I# i#) {-# INLINE findC #-} str2PRSs :: CharPhoneticClassification -> String -> StringRepresentation str2PRSs arr = map (\c -> fromJust . findC c $ arr) {-# INLINE str2PRSs #-} -- | Function-predicate 'createsSyllable' checks whether its argument is a phoneme representation that -- every time being presented in the text leads to the creation of the new syllable (in the 'PRS' format). -- Usually it is a vowel, but in some languages there can be syllabic phonemes that are not considered to be -- vowels. createsSyllable :: PRS -> Bool createsSyllable = (== P 0) . phoneType {-# INLINE createsSyllable #-} -- | Function-predicate 'isSonorous1' checks whether its argument is a sonorous consonant representation in the 'PRS' format. isSonorous1 :: PRS -> Bool isSonorous1 = (`elem` [1,2]) . fromPhoneticType . phoneType {-# INLINE isSonorous1 #-} -- | Function-predicate 'isVoicedC1' checks whether its argument is a voiced consonant representation in the 'PRS' format. isVoicedC1 :: PRS -> Bool isVoicedC1 = (`elem` [3,4]) . fromPhoneticType . phoneType {-# INLINE isVoicedC1 #-} -- | Function-predicate 'isVoiceless1' checks whether its argument is a voiceless consonant representation in the 'PRS' format. isVoicelessC1 :: PRS -> Bool isVoicelessC1 = (`elem` [5,6]) . fromPhoneticType . phoneType {-# INLINE isVoicelessC1 #-} -- | Binary function-predicate 'notCreatesSyllable2' checks whether its arguments are both consonant representations in the 'PRS' format. notCreatesSyllable2 :: PRS -> PRS -> Bool notCreatesSyllable2 x y | phoneType x == P 0 || phoneType y == P 0 = False | otherwise = True {-# INLINE notCreatesSyllable2 #-} -- | Binary function-predicate 'notEqC' checks whether its arguments are not the same consonant sound representations (not taking palatalization into account). notEqC :: [(Char,Char)] -- ^ The pairs of the 'Char' that corresponds to the similar phonetic languages consonant phenomenon (e. g. allophones). Must be sorted in the ascending order to be used correctly. -> PRS -> PRS -> Bool notEqC xs x y | (== cy) . getBFstLSorted' cx xs $ cx = False | otherwise = cx /= cy where !cx = charS x !cy = charS y {-# INLINE notEqC #-} -- | Function 'sndGroups' converts a word being a list of 'PRS' to the list of phonetically similar (consonants grouped with consonants and each vowel separately) -- sounds representations in 'PRS' format. sndGroups :: [PRS] -> [[PRS]] sndGroups ys@(_:_) = L.groupBy notCreatesSyllable2 ys sndGroups _ = [] {-# INLINE sndGroups #-} groupSnds :: [PRS] -> [[PRS]] groupSnds = L.groupBy (\x y -> createsSyllable x == createsSyllable y) {-# INLINE groupSnds #-} data SegmentationInfo1 = SI { fieldN :: !Int8, -- ^ Number of fields in the pattern matching that are needed to apply the segmentation rules. Not less than 1. predicateN :: Int8 -- ^ Number of predicates in the definition for the 'fieldN' that are needed to apply the segmentation rules. } deriving (Eq, Read, Show) instance PhoneticElement SegmentationInfo1 where readPEMaybe rs | not . any isLetter $ rs = Nothing | otherwise = let (ys:yss) = L.words rs in case ys of "SI" -> case yss of [xs,ts] -> case (readMaybe xs::Maybe Int8) of Just m -> case (readMaybe ts::Maybe Int8) of Just n -> Just (SI m n) _ -> Nothing _ -> Nothing _ -> Nothing _ -> Nothing -- | We can think of 'SegmentationPredFunction' in terms of @f ('SI' fN pN) ks [x_{1},x_{2},...,x_{i},...,x_{fN}]@. Comparing with -- 'divCnsnts' from the @ukrainian-phonetics-basics-array@ we can postulate that it consists of the following logical terms in -- the symbolic form: -- -- 1) 'phoneType' x_{i} \`'elem'\` (X{...} = 'map' 'P' ['Int8']) -- -- 2) 'notEqC' ks x_{i} x_{j} (j /= i) -- -- combined with the standard logic Boolean operations of '(&&)', '(||)' and 'not'. Further, the 'not' can be transformed into the -- positive (affirmative) form using the notion of the universal set for the task. This transformation needs that the similar -- phonetic phenomenae (e. g. the double sounds -- the prolonged ones) belong to the one syllable and not to the different ones -- (so they are not related to different syllables, but just to the one and the same). Since such assumption has been used, -- we can further represent the function by the following data type and operations with it, see 'SegmentationPredFData'. data SegmentationPredFunction = PF (SegmentationInfo1 -> [(Char, Char)] -> [PRS] -> Bool) data SegmentationPredFData a b = L Int [Int] (Array Int a) | NEC Int Int (Array Int a) [b] | C (SegmentationPredFData a b) (SegmentationPredFData a b) | D (SegmentationPredFData a b) (SegmentationPredFData a b) deriving (Eq, Read, Show) class Eval2Bool a where eval2Bool :: a -> Bool type SegmentationFDP = SegmentationPredFData PRS (Char, Char) instance Eval2Bool (SegmentationPredFData PRS (Char, Char)) where eval2Bool (L i js arr) | all (<= n) js && i <= n && i >= 1 && all (>=1) js = fromPhoneticType (phoneType (unsafeAt arr $ i - 1)) `elem` js | otherwise = error "Phladiprelio.General.Syllables.eval2Bool: 'L' element is not properly defined. " where n = numElements arr eval2Bool (NEC i j arr ks) | i >= 1 && j >= 1 && i /= j && i <= n && j <= n = notEqC ks (unsafeAt arr $ i - 1) (unsafeAt arr $ j - 1) | otherwise = error "Phladiprelio.General.Syllables.eval2Bool: 'NEC' element is not properly defined. " where n = numElements arr eval2Bool (C x y) = eval2Bool x && eval2Bool y eval2Bool (D x y) = eval2Bool x || eval2Bool y type DListFunctionResult = ([PRS] -> [PRS],[PRS] -> [PRS]) class DListRepresentation a b where toDLR :: b -> [a] -> ([a] -> [a], [a] -> [a]) instance DListRepresentation PRS Int8 where toDLR left xs | null xs = (id,id) | null ts = (id,(zs `mappend`)) | null zs = ((`mappend` ts), id) | otherwise = ((`mappend` ts), (zs `mappend`)) where (ts,zs) = splitAt (fromEnum left) xs data SegmentationLineFunction = LFS { infoSP :: SegmentationInfo1, predF :: SegmentationFDP, -- ^ The predicate to check the needed rule for segmentation. resF :: Int8 -- ^ The result argument to be appended to the left of the group of consonants if the 'predF' returns 'True' for its arguments. Is an argument to the 'toDLR'. } deriving (Read, Show) data SegmentationRules1 = SR1 { infoS :: SegmentationInfo1, lineFs :: [SegmentationLineFunction] -- ^ The list must be sorted in the appropriate order of the guards usage for the predicates. -- The length of the list must be equal to the ('fromEnum' . 'predicateN' . 'infoS') value. } deriving (Read, Show) -- | List of the 'SegmentationRules1' sorted in the descending order by the 'fieldN' 'SegmentationInfo1' data and where the -- length of all the 'SegmentationPredFunction' lists of 'PRS' are equal to the 'fieldN' 'SegmentationInfo1' data by definition. type SegmentRulesG = [SegmentationRules1] -- | Function 'divCnsnts' is used to divide groups of consonants into two-elements lists that later are made belonging to -- different neighbour syllables if the group is between two vowels in a word. The group must be not empty, but this is not checked. -- The example phonetical information for the proper performance in Ukrainian can be found from the: -- https://msn.khnu.km.ua/pluginfile.php/302375/mod_resource/content/1/%D0%9B.3.%D0%86%D0%86.%20%D0%A1%D0%BA%D0%BB%D0%B0%D0%B4.%D0%9D%D0%B0%D0%B3%D0%BE%D0%BB%D0%BE%D1%81.pdf -- The example of the 'divCnsnts' can be found at: https://hackage.haskell.org/package/ukrainian-phonetics-basic-array-0.1.2.0/docs/src/Languages.Phonetic.Ukrainian.Syllable.Arr.html#divCnsnts divCnsnts :: [(Char,Char)] -- ^ The pairs of the 'Char' that corresponds to the similar phonetic languages consonant phenomenon (e. g. allophones). Must be sorted in the ascending order to be used correctly. -> SegmentRulesG -> [PRS] -> DListFunctionResult divCnsnts ks gs xs@(_:_) = toDLR left xs where !js = fromJust . L.find ((== length xs) . fromEnum . fieldN . infoS) $ gs -- js :: SegmentationRules1 !left = resF . fromJust . L.find (eval2Bool . predF). lineFs $ js divCnsnts _ _ [] = (id,id) reSyllableCntnts :: [(Char,Char)] -- ^ The pairs of the 'Char' that corresponds to the similar phonetic languages consonant phenomenon (e. g. allophones). Must be sorted in the ascending order to be used correctly. -> SegmentRulesG -> [[PRS]] -> [[PRS]] reSyllableCntnts ks gs (xs:ys:zs:xss) | (/= P 0) . phoneType . last $ ys = fst (divCnsnts ks gs ys) xs:reSyllableCntnts ks gs (snd (divCnsnts ks gs ys) zs:xss) | otherwise = reSyllableCntnts ks gs ((xs `mappend` ys):zs:xss) reSyllableCntnts _ _ (xs:ys:_) = [(xs `mappend` ys)] reSyllableCntnts _ _ xss = xss divSylls :: [[PRS]] -> [[PRS]] divSylls = mapI (\ws -> (length . filter createsSyllable $ ws) > 1) h3 where h3 us = [ys `mappend` take 1 zs] `mappend` (L.groupBy (\x y -> createsSyllable x && phoneType y /= P 0) . drop 1 $ zs) where (ys,zs) = break createsSyllable us {-| The function actually creates syllables using the provided data. Each resulting inner-most list is a phonetic language representation of the syllable according to the rules provided. -} createSyllablesPL :: GWritingSystemPRPLX -- ^ Data used to obtain the phonetic language representation of the text. -> [(Char,Char)] -- ^ The pairs of the 'Char' that corresponds to the similar phonetic languages consonant phenomenon (e. g. allophones). Must be sorted in the ascending order to be used correctly. -> CharPhoneticClassification -> SegmentRulesG -> String -- ^ Corresponds to the 100 delimiter in the @ukrainian-phonetics-basic-array@ package. -> String -- ^ Corresponds to the 101 delimiter in the @ukrainian-phonetics-basic-array@ package. -> String -- ^ Actually the converted 'String'. -> [[[PRS]]] createSyllablesPL wrs ks arr gs us vs = map (divSylls . reSyllableCntnts ks gs . groupSnds . str2PRSs arr) . words1 . mapMaybe g . convertToProperPL . map (\x -> if x == '-' then ' ' else x) where g x | x `elem` us = Nothing | x `notElem` vs = Just x | otherwise = Just ' ' words1 xs = if null ts then [] else w : words1 s'' -- Practically this is an optimized version for this case 'words' function from Prelude. where ts = dropWhile (== ' ') xs (w, s'') = break (== ' ') ts {-# NOINLINE words1 #-} convertToProperPL = concatMap string1 . stringToXG wrs {-# INLINE createSyllablesPL #-}