-- | -- Module : Data.Phonetic.Languages.PrepareText -- Copyright : (c) OleksandrZhabenko 2020-2021 -- License : MIT -- Stability : Experimental -- Maintainer : olexandr543@yahoo.com -- -- Helps to order the 7 or less phonetic language words (or their concatenations) -- to obtain (to some extent) suitable for poetry or music text. -- Earlier it has been a module DobutokO.Poetry.Ukrainian.PrepareText -- from the @dobutokO-poetry@ package. -- In particular, this module can be used to prepare the phonetic language text -- by applying the most needed grammar to avoid misunderstanding -- for the produced text. The attention is paid to the prepositions, pronouns, conjunctions -- and particles that are most commonly connected (or not) in a significant way -- with the next text. -- Uses the information from: -- https://uk.wikipedia.org/wiki/%D0%A1%D0%BF%D0%BE%D0%BB%D1%83%D1%87%D0%BD%D0%B8%D0%BA -- and -- https://uk.wikipedia.org/wiki/%D0%A7%D0%B0%D1%81%D1%82%D0%BA%D0%B0_(%D0%BC%D0%BE%D0%B2%D0%BE%D0%B7%D0%BD%D0%B0%D0%B2%D1%81%D1%82%D0%B2%D0%BE) -- -- Uses arrays instead of vectors. -- A list of basic (but, probably not complete and needed to be extended as needed) English words (the articles, pronouns, -- particles, conjunctions etc.) the corresponding phonetic language translations of which are intended to be used as a -- 'Concatenations' here is written to the file EnglishConcatenated.txt in the source tarball. module Data.Phonetic.Languages.PrepareText ( Concatenations -- * Basic functions , prepareText , prepareTextN , complexWords , splitLines , splitLinesN , isSpC , sort2Concat , toSequentialApp , prepareConcats , complexNWords -- * Used to transform after convertToProperphonetic language from mmsyn6ukr package , isPLL ) where import CaseBi.Arr (getBFstL') import Data.List.InnToOut.Basic (mapI) import Data.Char (isAlpha,toLower) import GHC.Arr import Data.List (sort,sortOn) -- | The lists in the list are sorted in the descending order by the word counts in the inner 'String's. All the 'String's -- in each inner list have the same number of words, and if there is no 'String' with some intermediate number of words (e. g. there -- are not empty 'String's for 4 and 2 words, but there is no one for 3 words 'String's) then such corresponding list is empty, but -- it is, nevertheless, present. Probably the maximum number of words can be no more than 4, and the minimum number can be -- probably no less than 1, but it depends (especially for the maximum). The 'String's in the inner lists must be (unlike the inner -- lists themselves) sorted in the ascending order for the data type to work correctly in the functions of the module. type Concatenations = [[String]] -- | Is used to convert a phonetic language text into list of 'String' each of which is ready to be -- used by the functions from the other modules in the package. -- It applies minimal grammar links and connections between the most commonly used phonetic language -- words that \"should\" be paired and not dealt with separately -- to avoid the misinterpretation and preserve maximum of the semantics for the -- \"phonetic\" language on the phonetic language basis. prepareText :: [[String]] -- ^ Is intended to become a valid 'Concatenations'. -> String -- ^ A sorted 'String' of possible characters in the phonetic language representation. -> String -> [String] prepareText ysss xs = filter (any (isPLL xs)) . splitLines . map (unwords . complexNWords ysss . words . filter (\t -> isAlpha t || isSpC t)) . filter (not . null) . lines sort2Concat :: [[String]] -> Concatenations -- ^ Data used to concatenate the basic grammar preserving words and word sequences to the next word to -- leave the most of the meaning (semantics) of the text available to easy understanding while reading and listening to. sort2Concat xsss | null xsss = [] | otherwise = map sort . reverse . sortOn (map (length . words)) $ xsss toSequentialApp :: Concatenations -- ^ Data used to concatenate the basic grammar preserving words and word sequences to the next word to -- leave the most of the meaning (semantics) of the text available to easy understanding while reading and listening to. -> [Concatenations] toSequentialApp ysss@(xss:xsss) | null xss = toSequentialApp xsss | otherwise = [xss, replicate (n - 1) []] : toSequentialApp xsss where n = length . words . head $ xss toSequentialApp _ = [] prepareConcats :: [[String]] -> [Concatenations] prepareConcats = toSequentialApp . sort2Concat {-# INLINABLE prepareConcats #-} {-| Applies the full complex words concatenations (opposite to the 'complexWords' that applies only partial concatenations). -} complexNWords :: [[String]] -> [String] -> [String] complexNWords xsss yss = complexNWords' tssss yss where tssss = prepareConcats xsss complexNWords' tssss@(ysss:zssss) uss = complexNWords' zssss . complexWords ysss ysss $ uss complexNWords' _ uss = uss -- | Concatenates complex words in phonetic language so that they are not separated further by possible words order rearrangements (because they are treated -- as a single word). This is needed to preserve basic grammar in phonetic languages. complexWords :: Concatenations -> Concatenations -> [String] -> [String] complexWords rsss ysss zss = map (\(ts,_,_) -> ts) . foldr f v $ zss where v = [([],rsss,ysss)] f z rs@((t,rsss,(yss:tsss)):ks) | null yss = f z ((t,rsss,tsss):ks) | getBFstL' False (zip yss . replicate 10000 $ True) uwxs = (filter (/= ' ') uwxs `mappend` t,rsss,rsss):ks | otherwise = f z ((t,rsss,tsss):ks) where y = length . words . head $ yss uwxs = unwords . take y . map (\(q,_,_) -> q) $ rs f z rs@((t,rsss,[]):ks) = (z,rsss,rsss):rs -- | A generalized variant of the 'prepareText' with the arbitrary maximum number of the words in the lines given as the first argument. prepareTextN :: Int -- ^ A maximum number of the words or their concatenations in the resulting list of 'String's. -> [[String]] -- ^ Is intended to become a valid 'Concatenations'. -> String -- ^ A sorted 'String' of possible characters in the phonetic language representation. -> String -> [String] prepareTextN n ysss xs = filter (any (isPLL xs)) . splitLinesN n . map (unwords . complexNWords ysss . words . filter (\t -> isAlpha t || isSpC t)) . filter (not . null) . lines -- | A predicate to check whether the given character is one of the \"\' \\x2019\\x02BC-\". isSpC :: Char -> Bool isSpC x = x == '\'' || x == ' ' || x == '\x2019' || x == '\x02BC' || x == '-' {-# INLINE isSpC #-} -- | The first argument must be a 'String' of sorted 'Char's in the ascending order of all possible symbols that can be -- used for the text in the phonetic language selected. Can be prepared beforehand, or read from the file. isPLL :: String -> Char -> Bool isPLL xs y = getBFstL' False (zip xs . replicate 10000 $ True) y -- | The function is recursive and is applied so that all returned elements ('String') are no longer than 7 words in them. splitLines :: [String] -> [String] splitLines xss | null xss = [] | otherwise = mapI (\xs -> compare (length . words $ xs) 7 == GT) (\xs -> let yss = words xs in splitLines . map unwords . (\(q,r) -> [q,r]) . splitAt (length yss `quot` 2) $ yss) $ xss -- | A generalized variant of the 'splitLines' with the arbitrary maximum number of the words in the lines given as the first argument. splitLinesN :: Int -> [String] -> [String] splitLinesN n xss | null xss || n <= 0 = [] | otherwise = mapI (\xs -> compare (length . words $ xs) n == GT) (\xs -> let yss = words xs in splitLines . map unwords . (\(q,r) -> [q,r]) . splitAt (length yss `quot` 2) $ yss) $ xss