{-# OPTIONS_HADDOCK show-extensions #-} {-# OPTIONS_GHC -funbox-strict-fields -fobject-code #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE UnboxedTuples #-} {-# LANGUAGE MagicHash #-} -- | -- Module : Data.Phonetic.Languages.Syllables -- Copyright : (c) OleksandrZhabenko 2021 -- License : MIT -- Stability : Experimental -- Maintainer : olexandr543@yahoo.com -- -- This module works with syllable segmentation. The generalized version for the module -- 'Languages.Phonetic.Ukrainian.Syllable.Arr' from @ukrainian-phonetics-basic-array@ package. -- module Data.Phonetic.Languages.Syllables ( -- * Data types and type synonyms PRS(..) , PhoneticType(..) , CharPhoneticClassification , StringRepresentation , SegmentationInfo1(..) , SegmentationPredFunction(..) , DListFunctionResult , SegmentationLineFunction(..) , SegmentationRules1(..) , SegmentRulesG -- * Basic functions , str2PRSs , sndGroups , groupSnds , divCnsnts , reSyllableCntnts , divVwls , createSyllablesPL -- * Auxiliary functions , gBF4 , findC , isVowel1 , isSonorous1 , isVoicedC1 , isVoicelessC1 , isNotVowel2 , notEqC ) where import Prelude hiding (mappend) import Data.Monoid import qualified Data.List as L (groupBy,find) import Data.Phonetic.Languages.Base import CaseBi.Arr import GHC.Arr import GHC.Exts import Data.List.InnToOut.Basic (mapI) import Data.Maybe (mapMaybe,fromJust) import GHC.Int -- Inspired by: https://github.com/OleksandrZhabenko/mm1/releases/tag/0.2.0.0 -- CAUTION: Please, do not mix with the show7s functions, they are not interoperable. data PRS = SylS { charS :: !Char, -- ^ Phonetic languages phenomenon representation. Usually, a phoneme, but it can be otherwise something different. phoneType :: !PhoneticType -- ^ Some encoded type. } deriving ( Eq ) instance Ord PRS where compare (SylS x1 y1) (SylS x2 y2) = case compare x1 x2 of EQ -> compare y1 y2 ~z -> z data PhoneticType = P !Int8 deriving (Eq, Ord) {-| The 'Array' 'Int' must be sorted in the ascending order to be used in the module correctly. -} type CharPhoneticClassification = Array Int PRS {-| The 'String' of converted phonetic language representation 'Char' data is converted to this type to apply syllable segmentation or other transformations. -} type StringRepresentation = [PRS] -- | Is somewhat rewritten from the 'CaseBi.Arr.gBF3' function (not exported) from the @mmsyn2-array@ package. gBF4 :: (Ix i) => (# Int#, PRS #) -> (# Int#, PRS #) -> Char -> Array i PRS -> Maybe PRS gBF4 (# !i#, k #) (# !j#, m #) c arr | isTrue# ((j# -# i#) ># 1# ) = case compare c (charS p) of GT -> gBF4 (# n#, p #) (# j#, m #) c arr LT -> gBF4 (# i#, k #) (# n#, p #) c arr _ -> Just p | c == charS m = Just m | c == charS k = Just k | otherwise = Nothing where !n# = (i# +# j#) `quotInt#` 2# !p = unsafeAt arr (I# n#) {-# INLINABLE gBF4 #-} findC :: Char -> Array Int PRS -> Maybe PRS findC c arr = gBF4 (# i#, k #) (# j#, m #) c arr where !(I# i#,I# j#) = bounds arr !k = unsafeAt arr (I# i#) !m = unsafeAt arr (I# i#) str2PRSs :: CharPhoneticClassification -> String -> StringRepresentation str2PRSs arr = map (\c -> fromJust . findC c $ arr) -- | Function-predicate 'isVowel1' checks whether its argument is a vowel representation in the 'PRS' format. isVowel1 :: PRS -> Bool isVowel1 = (== P 0) . phoneType {-# INLINE isVowel1 #-} -- | Function-predicate 'isSonorous1' checks whether its argument is a sonorous consonant representation in the 'PRS' format. isSonorous1 :: PRS -> Bool isSonorous1 = (`elem` [P 1,P 2]) . phoneType {-# INLINE isSonorous1 #-} -- | Function-predicate 'isVoicedC1' checks whether its argument is a voiced consonant representation in the 'PRS' format. isVoicedC1 :: PRS -> Bool isVoicedC1 = (`elem` [P 3,P 4]) . phoneType {-# INLINE isVoicedC1 #-} -- | Function-predicate 'isVoiceless1' checks whether its argument is a voiceless consonant representation in the 'PRS' format. isVoicelessC1 :: PRS -> Bool isVoicelessC1 = (`elem` [P 5,P 6]) . phoneType {-# INLINE isVoicelessC1 #-} -- | Binary function-predicate 'isNotVowel2' checks whether its arguments are both consonant representations in the 'PRS' format. isNotVowel2 :: PRS -> PRS -> Bool isNotVowel2 x y | phoneType x == P 0 || phoneType y == P 0 = False | otherwise = True {-# INLINE isNotVowel2 #-} -- | Binary function-predicate 'notEqC' checks whether its arguments are not the same consonant sound representations (not taking palatalization into account). notEqC :: [(Char,Char)] -- ^ The pairs of the 'Char' that corresponds to the similar phonetic languages consonant phenomenon (e. g. allophones). Must be sorted in the ascending order to be used correctly. -> PRS -> PRS -> Bool notEqC xs x y | (== cy) . getBFstLSorted' cx xs $ cx = False | otherwise = cx /= cy where !cx = charS x !cy = charS y -- | Function 'sndGroups' converts a word being a list of 'PRS' to the list of phonetically similar (consonants grouped with consonants and each vowel separately) -- sounds representations in 'PRS' format. sndGroups :: [PRS] -> [[PRS]] sndGroups ys@(_:_) = L.groupBy isNotVowel2 ys sndGroups _ = [] groupSnds :: [PRS] -> [[PRS]] groupSnds = L.groupBy (\x y -> ((== P 0) . phoneType $ x) == ((== P 0) . phoneType $ y)) data SegmentationInfo1 = SI { fieldN :: !Int8, -- ^ Number of fields in the pattern matching that are needed to apply the segmentation rules. Not less than 1. predicateN :: Int8 -- ^ Number of predicates in the definition for the 'fieldN' that are needed to apply the segmentation rules. } deriving (Eq) data SegmentationPredFunction = PF (SegmentationInfo1 -> [(Char, Char)] -> [PRS] -> Bool) type DListFunctionResult = ([PRS] -> [PRS],[PRS] -> [PRS]) data SegmentationLineFunction = LFS { predF :: SegmentationPredFunction, -- ^ The predicate to check the needed rule for segmentation. resF :: DListFunctionResult -- ^ The result if the 'predF' returns 'True' for its arguments. } data SegmentationRules1 = SR1 { infoS :: SegmentationInfo1, lineFs :: [SegmentationLineFunction] -- ^ The list must be sorted in the appropriate order of the guards usage for the predicates. -- The length of the list must be equal to the ('fromEnum' . 'predicateN' . 'infoS') value. } {-| List of the 'SegmentationRules1' sorted in the descending order by the 'fieldN' 'SegmentationInfo1' data and where the length of all the 'SegmentationPredFunction' lists of 'PRS' are equal to the 'fieldN' 'SegmentationInfo1' data by definition. -} type SegmentRulesG = [SegmentationRules1] -- | Function 'divCnsnts' is used to divide groups of consonants into two-elements lists that later are made belonging to -- different neighbour syllables if the group is between two vowels in a word. The group must be not empty, but this is not checked. -- The example phonetical information for the proper performance in Ukrainian can be found from the: -- https://msn.khnu.km.ua/pluginfile.php/302375/mod_resource/content/1/%D0%9B.3.%D0%86%D0%86.%20%D0%A1%D0%BA%D0%BB%D0%B0%D0%B4.%D0%9D%D0%B0%D0%B3%D0%BE%D0%BB%D0%BE%D1%81.pdf -- The example of the 'divCnsnts' can be found at: https://hackage.haskell.org/package/ukrainian-phonetics-basic-array-0.1.2.0/docs/src/Languages.Phonetic.Ukrainian.Syllable.Arr.html#divCnsnts divCnsnts :: [(Char,Char)] -> SegmentRulesG -> [PRS] -> DListFunctionResult divCnsnts ks gs xs@(_:_) = resF . fromJust . L.find ((\(PF f) -> f (infoS js) ks xs) . predF). lineFs $ js where !l = length xs !js = fromJust . L.find ((== l) . fromEnum . fieldN . infoS) $ gs -- js :: SegmentationRules1 divCnsnts _ _ [] = (id,id) reSyllableCntnts :: [(Char,Char)] -> SegmentRulesG -> [[PRS]] -> [[PRS]] reSyllableCntnts ks gs (xs:ys:zs:xss) | (/= P 0) . phoneType . last $ ys = fst (divCnsnts ks gs ys) xs:reSyllableCntnts ks gs (snd (divCnsnts ks gs ys) zs:xss) | otherwise = reSyllableCntnts ks gs ((xs `mappend` ys):zs:xss) reSyllableCntnts _ _ (xs:ys:_) = [(xs `mappend` ys)] reSyllableCntnts _ _ xss = xss divVwls :: [[PRS]] -> [[PRS]] divVwls = mapI (\ws -> (length . filter ((== P 0) . phoneType) $ ws) > 1) h3 where h3 us = [ys `mappend` take 1 zs] `mappend` (L.groupBy (\x y -> phoneType x == P 0 && phoneType y /= P 0) . drop 1 $ zs) where (ys,zs) = span (\t -> phoneType t /= P 0) us createSyllablesPL :: GWritingSystemPRPLX -> [(Char,Char)] -> CharPhoneticClassification -> SegmentRulesG -> String -- ^ Corresponds to the \'0\' symbol delimiter in the @ukrainian-phonetics-basic-array@ package. -> String -- ^ Corresponds to the \'1\' and \'-\' symbol delimiters in the @ukrainian-phonetics-basic-array@ package. -> String -- ^ Actually the converted 'String'. -> [[[PRS]]] createSyllablesPL wrs ks arr gs us vs = map (divVwls . reSyllableCntnts ks gs . groupSnds . str2PRSs arr) . words1 . mapMaybe g . convertToProperPL . map (\x -> if x == '-' then ' ' else x) where g x | x `elem` us = Nothing | x `notElem` vs = Just x | otherwise = Just ' ' words1 xs = if null ts then [] else w : words1 s'' -- Practically this is an optimized version for this case 'words' function from Prelude. where ts = dropWhile (== ' ') xs (w, s'') = span (/= ' ') ts {-# NOINLINE words1 #-} convertToProperPL = map char . stringToXG wrs {-# INLINE createSyllablesPL #-}