-- |
-- Module      :  Data.Phonetic.Languages.PrepareText
-- Copyright   :  (c) OleksandrZhabenko 2020-2021
-- License     :  MIT
-- Stability   :  Experimental
-- Maintainer  :  olexandr543@yahoo.com
--
-- Helps to order the 7 or less phonetic language words (or their concatenations)
-- to obtain (to some extent) suitable for poetry or music text.
-- Earlier it has been a module DobutokO.Poetry.Ukrainian.PrepareText
-- from the @dobutokO-poetry@ package.
-- In particular, this module can be used to prepare the phonetic language text
-- by applying the most needed grammar to avoid misunderstanding
-- for the produced text. The attention is paid to the prepositions, pronouns, conjunctions
-- and particles that are most commonly connected (or not) in a significant way
-- with the next text.
-- Uses the information from:
-- https://uk.wikipedia.org/wiki/%D0%A1%D0%BF%D0%BE%D0%BB%D1%83%D1%87%D0%BD%D0%B8%D0%BA
-- and
-- https://uk.wikipedia.org/wiki/%D0%A7%D0%B0%D1%81%D1%82%D0%BA%D0%B0_(%D0%BC%D0%BE%D0%B2%D0%BE%D0%B7%D0%BD%D0%B0%D0%B2%D1%81%D1%82%D0%B2%D0%BE)
--
-- Uses arrays instead of vectors.
-- A list of basic (but, probably not complete and needed to be extended as needed) English words (the articles, pronouns,
-- particles, conjunctions etc.) the corresponding phonetic language translations of which are intended to be used as a
-- 'Concatenations' here is written to the file EnglishConcatenated.txt in the source tarball.

module Data.Phonetic.Languages.PrepareText (
  Concatenations
  -- * Basic functions
  , prepareText
  , prepareTextN
  , complexWords
  , splitLines
  , splitLinesN
  , isSpC
  , sort2Concat
  , toSequentialApp
  , prepareConcats
  , complexNWords
  -- * Used to transform after convertToProperphonetic language from mmsyn6ukr package
  , isPLL
) where

import CaseBi.Arr (getBFstL')
import Data.List.InnToOut.Basic (mapI)
import Data.Char (isAlpha,toLower)
import GHC.Arr
import Data.List (sort,sortOn)

-- | The lists in the list are sorted in the descending order by the word counts in the inner 'String's. All the 'String's
-- in each inner list have the same number of words, and if there is no 'String' with some intermediate number of words (e. g. there
-- are not empty 'String's for 4 and 2 words, but there is no one for 3 words 'String's) then such corresponding list is empty, but
-- it is, nevertheless, present. Probably the maximum number of words can be no more than 4, and the minimum number can be
-- probably no less than 1, but it depends (especially for the maximum). The 'String's in the inner lists must be (unlike the inner
-- lists themselves) sorted in the ascending order for the data type to work correctly in the functions of the module.
type Concatenations = [[String]]

-- | Is used to convert a phonetic language text into list of 'String' each of which is ready to be
-- used by the functions from the other modules in the package.
-- It applies minimal grammar links and connections between the most commonly used phonetic language
-- words that \"should\" be paired and not dealt with separately
-- to avoid the misinterpretation and preserve maximum of the semantics for the
-- \"phonetic\" language on the phonetic language basis.
prepareText
  :: [[String]] -- ^ Is intended to become a valid 'Concatenations'.
  -> String -- ^ A sorted 'String' of possible characters in the phonetic language representation.
  -> String
  -> [String]
prepareText ysss xs = filter (any (isPLL xs)) . splitLines . map (unwords . complexNWords ysss . words .
  filter (\t -> isAlpha t || isSpC t)) . filter (not . null) . lines

sort2Concat
 :: [[String]]
 -> Concatenations  -- ^ Data used to concatenate the basic grammar preserving words and word sequences to the next word to
 -- leave the most of the meaning (semantics) of the text available to easy understanding while reading and listening to.
sort2Concat xsss
 | null xsss = []
 | otherwise = map sort . reverse . sortOn (map (length . words)) $ xsss

toSequentialApp
 :: Concatenations -- ^ Data used to concatenate the basic grammar preserving words and word sequences to the next word to
 -- leave the most of the meaning (semantics) of the text available to easy understanding while reading and listening to.
 -> [Concatenations]
toSequentialApp ysss@(xss:xsss)
 | null xss = toSequentialApp xsss
 | otherwise = [xss, replicate (n - 1) []] : toSequentialApp xsss
     where n = length . words . head $ xss
toSequentialApp _ = []

prepareConcats
 :: [[String]]
 -> [Concatenations]
prepareConcats = toSequentialApp . sort2Concat
{-# INLINABLE prepareConcats #-}

{-| Applies the full complex words concatenations (opposite to the 'complexWords' that applies only partial concatenations). 
-}
complexNWords :: [[String]] -> [String] -> [String]
complexNWords xsss yss = complexNWords' tssss yss
  where tssss = prepareConcats xsss
        complexNWords' tssss@(ysss:zssss) uss = complexNWords' zssss . complexWords ysss ysss $ uss
        complexNWords' _ uss = uss

-- | Concatenates complex words in phonetic language so that they are not separated further by possible words order rearrangements (because they are treated
-- as a single word). This is needed to preserve basic grammar in phonetic languages.
complexWords :: Concatenations -> Concatenations -> [String] -> [String]
complexWords rsss ysss zss
 = map (\(ts,_,_) -> ts) . foldr f v $ zss
    where v = [([],rsss,ysss)]
          f z rs@((t,rsss,(yss:tsss)):ks)
            | null yss = f z ((t,rsss,tsss):ks)
            | getBFstL' False (zip yss . replicate 10000 $ True) uwxs = (filter (/= ' ') uwxs `mappend` t,rsss,rsss):ks
            | otherwise = f z ((t,rsss,tsss):ks)
                  where y = length . words . head $ yss
                        uwxs = unwords . take y . map (\(q,_,_) -> q) $ rs
          f z rs@((t,rsss,[]):ks) = (z,rsss,rsss):rs

-- | A generalized variant of the 'prepareText' with the arbitrary maximum number of the words in the lines given as the first argument.
prepareTextN
 :: Int -- ^ A maximum number of the words or their concatenations in the resulting list of 'String's.
 -> [[String]] -- ^ Is intended to become a valid 'Concatenations'.
 -> String -- ^ A sorted 'String' of possible characters in the phonetic language representation.
 -> String
 -> [String]
prepareTextN n ysss xs = filter (any (isPLL xs)) . splitLinesN n . map (unwords . complexNWords ysss . words .
  filter (\t -> isAlpha t || isSpC t)) . filter (not . null) . lines

-- | A predicate to check whether the given character is one of the \"\' \\x2019\\x02BC-\".
isSpC :: Char -> Bool
isSpC x = x == '\'' || x == ' ' || x == '\x2019' || x == '\x02BC' || x == '-'
{-# INLINE isSpC #-}

-- | The first argument must be a 'String' of sorted 'Char's in the ascending order of all possible symbols that can be
-- used for the text in the phonetic language selected. Can be prepared beforehand, or read from the file. 
isPLL :: String -> Char -> Bool
isPLL xs y = getBFstL' False (zip xs . replicate 10000 $ True) y

-- | The function is recursive and is applied so that all returned elements ('String') are no longer than 7 words in them.
splitLines :: [String] -> [String]
splitLines xss
 | null xss = []
 | otherwise = mapI (\xs -> compare (length . words $ xs) 7 == GT) (\xs -> let yss = words xs in
     splitLines . map unwords . (\(q,r) -> [q,r]) . splitAt (length yss `quot` 2) $ yss) $ xss

-- | A generalized variant of the 'splitLines' with the arbitrary maximum number of the words in the lines given as the first argument.
splitLinesN :: Int -> [String] -> [String]
splitLinesN n xss
 | null xss || n <= 0 = []
 | otherwise = mapI (\xs -> compare (length . words $ xs) n == GT) (\xs -> let yss = words xs in
     splitLines . map unwords . (\(q,r) -> [q,r]) . splitAt (length yss `quot` 2) $ yss) $ xss