-- |
-- Module      :  Languages.Phonetic.Ukrainian.PrepareText
-- Copyright   :  (c) OleksandrZhabenko 2020
-- License     :  MIT
-- Stability   :  Experimental
-- Maintainer  :  olexandr543@yahoo.com
--
-- Helps to order 7 or fewer Ukrainian words (or their concatenations)
-- to obtain a text that is (to some extent) suitable for poetry or music.
-- Formerly this was the module DobutokO.Poetry.Ukrainian.PrepareText
-- from the @dobutokO-poetry@ package.
-- In particular, this module can be used to prepare a Ukrainian text
-- by applying the most needed grammar rules, so that the produced text
-- is not misunderstood. Attention is paid to the prepositions, pronouns, conjunctions
-- and particles that are most commonly connected (or not) in a significant way
-- with the text that follows them.
-- Uses the information from: 
-- https://uk.wikipedia.org/wiki/%D0%A1%D0%BF%D0%BE%D0%BB%D1%83%D1%87%D0%BD%D0%B8%D0%BA
-- and 
-- https://uk.wikipedia.org/wiki/%D0%A7%D0%B0%D1%81%D1%82%D0%BA%D0%B0_(%D0%BC%D0%BE%D0%B2%D0%BE%D0%B7%D0%BD%D0%B0%D0%B2%D1%81%D1%82%D0%B2%D0%BE)
-- 

module Languages.Phonetic.Ukrainian.PrepareText where

import CaseBi (getBFst')
import Data.List.InnToOut.Basic (mapI)
import Data.Char (isAlpha,toLower)
import qualified Data.Vector as V

-- | Converts a Ukrainian text into a list of 'String', each of which is ready to be
-- used by the functions from the other modules in the package.
-- It applies minimal grammar links and connections between the most commonly used Ukrainian
-- words that \"should\" be paired and not dealt with separately,
-- to avoid misinterpretation and to preserve as much of the semantics as possible for the
-- \"phonetic\" language on the Ukrainian basis.
--
-- Every input line is processed on its own: each character that is neither
-- alphabetic ('isAlpha') nor accepted by 'isSpC' is removed, the remaining words
-- are passed through 'complexWords', 'auxiliary1' and 'concatenated2' and are
-- joined back with single spaces. Lines that contain no words after this
-- cleaning (blank, whitespace-only or punctuation-only lines) are dropped, and
-- the rest is handed to 'splitLines'.
prepareText :: String -> [String]
prepareText = splitLines . filter (not . null) . map prepareLine . lines
  where
    -- The emptiness filter above runs AFTER cleaning, so a line that only
    -- becomes empty once its punctuation is removed does not leak into the
    -- result as an empty string.
    prepareLine = unwords . concatenated2 . auxiliary1 . complexWords . words . filter keep
    keep t = isAlpha t || isSpC t

-- | Glues fixed multi-word expressions into single \"words\" by concatenating
-- them without any separator.
--
-- The first word of the list is looked up in a table (via 'getBFst''). If it is
-- not a key, it is kept and (when at least four words are present) processing
-- continues with the tail. If it is a key, it is always concatenated with the
-- next word; when the following words complete one of the listed expressions,
-- the longer concatenation described in the table is produced instead.
-- Lists with fewer than three words are returned unchanged; a list of exactly
-- three words uses a smaller table and is not processed further.
complexWords :: [String] -> [String]
complexWords (xs:ys:zs:ts:xss) =
  -- The keys MUST stay in ascending order: 'getBFst'' performs a binary search.
  getBFst' (xs : complexWords (ys:zs:ts:xss), V.fromList
    [ ("\1074", miruTohoYak)                                                   -- в (міру того як)
    , ("\1076\1072\1088\1084\1072", if ys == shcho then joinThree else joinTwo) -- дарма що
    , ("\1076\1083\1103", if ys == toho && zs == shchob then joinFour else joinTwo) -- для того щоб
    , ("\1079", case ys of                                                     -- з
        -- з тим щоб
        -- NOTE(review): unlike the other entries this one is not glued to the
        -- word that follows the expression — confirm this is intended.
        "\1090\1080\1084" | zs == shchob -> joinThree
        -- з того часу як: the last word is written as "йак"
        "\1090\1086\1075\1086" | zs == "\1095\1072\1089\1091" && ts == yak ->
          (xs ++ ys ++ zs ++ "\1081\1072\1082" ++ (concat . take 1 $ xss)) :
            complexWords (drop 1 xss)
        _ -> joinTwo)
    , ("\1079\1072\1084\1110\1089\1090\1100",
        if ys == toho && zs == shchob then joinFour else joinTwo)              -- замість того щоб
    , ("\1087\1086\1087\1088\1080",
        if ys == te && zs == shcho then joinFour else joinTwo)                 -- попри те що
    , ("\1087\1088\1080",
        if ys == "\1094\1100\1086\1084\1091" then joinThree else joinTwo)      -- при цьому
    , ("\1087\1110\1089\1083\1103",
        if ys == toho && zs == yak then joinFour else joinTwo)                 -- після того як
    , ("\1090\1072\1082", if ys == shcho then joinThree else joinTwo)          -- так що
    , ("\1090\1080\1084\1095\1072\1089\1086\1084",
        if ys == yak then joinThree else joinTwo)                              -- тимчасом як
    , ("\1090\1086\1084\1091",                                                 -- тому як -> "йак"
        if ys == yak then (xs ++ "\1081\1072\1082" ++ zs) : complexWords (ts:xss)
        else joinTwo)
    , ("\1091", miruTohoYak)                                                   -- у (міру того як)
      -- NOTE(review): only "хоч" is handled; if "хоча" ("\1093\1086\1095\1072")
      -- should be treated the same way, it needs its own entry right after this one.
    , ("\1093\1086\1095", if ys == "\1073\1080" then joinThree else joinTwo)   -- хоч би
    , ("\1095\1077\1088\1077\1079",
        if ys == te && zs == shcho then joinFour else joinTwo)                 -- через те що
    ]) xs
  where
    -- Words that occur in several expressions.
    toho = "\1090\1086\1075\1086"   -- того
    shchob = "\1097\1086\1073"      -- щоб
    shcho = "\1097\1086"            -- що
    yak = "\1103\1082"              -- як
    te = "\1090\1077"               -- те
    -- The possible result shapes: the first N words are concatenated and the
    -- remaining words are processed recursively.
    joinTwo = (xs ++ ys) : complexWords (zs:ts:xss)
    joinThree = (xs ++ ys ++ zs) : complexWords (ts:xss)
    joinFour = (xs ++ ys ++ zs ++ ts) : complexWords xss
    joinFive = (xs ++ ys ++ zs ++ ts ++ (concat . take 1 $ xss)) : complexWords (drop 1 xss)
    -- в / у + міру того як + the next word (if there is one)
    miruTohoYak
      | ys == "\1084\1110\1088\1091" && zs == toho && ts == yak = joinFive
      | otherwise = joinTwo
-- Reached only for lists of exactly three words (longer ones match above).
complexWords (xs:ys:zs:_) =
  -- The keys MUST stay in ascending order: 'getBFst'' performs a binary search.
  getBFst' ([xs,ys,zs], V.fromList
    [ ("\1076\1072\1088\1084\1072", if ys == shcho then joinThree else joinTwo) -- дарма що
    , ("\1079",                                                                -- з тим щоб
        if ys == "\1090\1080\1084" && zs == "\1097\1086\1073" then joinThree else joinTwo)
    , ("\1087\1088\1080",
        if ys == "\1094\1100\1086\1084\1091" then joinThree else joinTwo)      -- при цьому
    , ("\1090\1072\1082", if ys == shcho then joinThree else joinTwo)          -- так що
    , ("\1090\1080\1084\1095\1072\1089\1086\1084",
        if ys == yak then joinThree else joinTwo)                              -- тимчасом як
    , ("\1090\1086\1084\1091",                                                 -- тому як -> "йак"
        if ys == yak then [xs ++ "\1081\1072\1082" ++ zs] else joinTwo)
    , ("\1093\1086\1095", if ys == "\1073\1080" then joinThree else joinTwo)   -- хоч би
    ]) xs
  where
    shcho = "\1097\1086"   -- що
    yak = "\1103\1082"     -- як
    joinTwo = [xs ++ ys, zs]
    joinThree = [xs ++ ys ++ zs]
complexWords xss = xss

-- | Makes sure that no resulting line has more than 7 words.
--
-- A line with at most 7 words is returned unchanged. A longer line is split
-- between words into two halves (the first half gets @n \`quot\` 2@ of the @n@
-- words), the words of each half are joined with single spaces, and the halves
-- are split again in the same way while they are still longer than 7 words.
-- The order of the words is preserved.
splitLines :: [String] -> [String]
splitLines = concatMap halve
  where
    halve line
      | length ws > 7 = splitLines [unwords front, unwords back]
      | otherwise = [line]
      where
        ws = words line
        -- Split on the word boundary nearest to the middle, so that no word
        -- is ever cut in two.
        (front, back) = splitAt (length ws `quot` 2) ws

-- | Attaches every word recognised by 'isConcatenated' or 'isPreposition' to
-- the word that follows it.
--
-- The following word is first passed through 'jottedConv' as if it were
-- preceded by a space, and is then appended without any separator. With three
-- or more words the second word is examined before the first one. Lists with
-- fewer than two words are returned unchanged.
auxiliary1 :: [String] -> [String]
auxiliary1 ws = case ws of
  (w1:w2:w3:rest)
    | attaches w2 -> auxiliary1 (w1 : auxiliary1 (glue w2 w3 : rest))
    | attaches w1 -> auxiliary1 (glue w1 w2 : w3 : rest)
    | otherwise -> w1 : auxiliary1 (w2 : w3 : rest)
  [w1, w2]
    | attaches w1 -> [glue w1 w2]
    | otherwise -> ws
  _ -> ws
  where
    -- Does this word have to be joined with its successor?
    attaches w = isConcatenated w || isPreposition w
    -- The temporary leading space lets 'jottedConv' treat the first letter of
    -- the second word; the space itself is removed again by @drop 1@.
    glue a b = a ++ (drop 1 . jottedConv $ ' ' : b)

-- | Tells whether the argument is exactly one of the listed words (each one
-- is present in a capitalised and in a lower-case form). Any other 'String'
-- gives 'False'.
isPreposition :: String -> Bool
isPreposition ts = getBFst' (False, V.fromList [ (w, True) | w <- listed ]) ts
  where
    -- The list MUST stay in ascending order: 'getBFst'' performs a binary search.
    listed =
      [ -- capitalised forms
        "\1030\1079", "\1041\1077\1079", "\1041\1110\1083\1103", "\1042"
      , "\1042\1110\1076", "\1044\1083\1103", "\1044\1086", "\1047"
      , "\1047\1072", "\1047\1072\1088\1072\1076\1080", "\1047\1110"
      , "\1050", "\1050\1086\1083\1086", "\1050\1088\1110\1079\1100"
      , "\1050\1088\1110\1084", "\1052\1077\1078", "\1052\1077\1078\1080"
      , "\1052\1110\1078", "\1053\1072", "\1053\1072\1076", "\1054"
      , "\1054\1073", "\1054\1076", "\1054\1082\1088\1110\1084"
      , "\1055\1077\1088\1077\1076", "\1055\1086", "\1055\1088\1080"
      , "\1055\1088\1086", "\1055\1088\1086\1090\1080"
      , "\1055\1110\1076", "\1055\1110\1089\1083\1103"
      , "\1057\1077\1088\1077\1076", "\1057\1077\1088\1077\1076\1080"
      , "\1059", "\1063\1077\1088\1077\1079"
        -- lower-case forms
      , "\1073\1077\1079"
      , "\1073\1110\1083\1103", "\1074", "\1074\1110\1076"
      , "\1076\1083\1103", "\1076\1086", "\1079", "\1079\1072"
      , "\1079\1072\1088\1072\1076\1080", "\1079\1110"
      , "\1082", "\1082\1086\1083\1086", "\1082\1088\1110\1079\1100"
      , "\1082\1088\1110\1084", "\1084\1077\1078", "\1084\1077\1078\1080"
      , "\1084\1110\1078", "\1085\1072", "\1085\1072\1076", "\1086"
      , "\1086\1073", "\1086\1076", "\1086\1082\1088\1110\1084"
      , "\1087\1077\1088\1077\1076", "\1087\1086", "\1087\1088\1080"
      , "\1087\1088\1086", "\1087\1088\1086\1090\1080", "\1087\1110\1076"
      , "\1087\1110\1089\1083\1103", "\1089\1077\1088\1077\1076"
      , "\1089\1077\1088\1077\1076\1080", "\1091"
      , "\1095\1077\1088\1077\1079", "\1110\1079"
      ]
{-# INLINE isPreposition #-}

-- | Tells whether a word has to be attached to the word that follows it.
-- This is the case when the word is not empty, is at most two characters long,
-- is not one of the listed words, and does not begin with @\'\\1031\'@ or
-- @\'\\1111\'@.
--
-- Since the dobutokO-poetry version 0.16.3.0 the conditions are combined with
-- (&&) instead of (||). The idea is that the listed words are the ones that
-- (by the author's understanding) \"should\" be treated as independent words.
isConcatenated :: String -> Bool
isConcatenated [] = False
isConcatenated ts@(t:_) =
  length ts <= 2 && notListed && t `notElem` "\1031\1111"
  where
    notListed = getBFst' (True, V.fromList [ (w, False) | w <- standalone ]) ts
    -- The list MUST stay in ascending order: 'getBFst'' performs a binary search.
    standalone =
      [ "\1028", "\1042\1080", "\1052\1080", "\1058\1080", "\1058\1110"
      , "\1062\1110", "\1071", "\1074\1080", "\1084\1080", "\1090\1080"
      , "\1090\1110", "\1094\1110", "\1103", "\1108"
      ]
{-# INLINE isConcatenated #-}

-- | Attaches every listed word (each one is present in a capitalised and in a
-- lower-case form) to the word that follows it.
--
-- If the first word of the list is one of the listed words, the second word is
-- passed through 'jottedConv' as if it were preceded by a space and is appended
-- to the first one without any separator; processing then continues after the
-- second word. Otherwise the first word is kept and processing continues from
-- the second word. Lists with fewer than two words are returned unchanged.
concatenated2 :: [String] -> [String]
concatenated2 (xs:ys:xss) =
  -- 'repeat' (instead of a fixed number of copies) guarantees that every key
  -- of the list gets a value, however many keys there are.
  getBFst' (separate, V.fromList . zip listed . repeat $ joined) xs
  where
    separate = xs : concatenated2 (ys:xss)
    joined = (xs ++ (drop 1 . jottedConv $ ' ':ys)) : concatenated2 xss
    -- The list MUST stay in ascending order: 'getBFst'' performs a binary search.
    listed =
      [ -- capitalised forms
        "\1040\1073\1086", "\1040\1076\1078\1077"
      , "\1040\1083\1077", "\1040\1085\1110\1078", "\1041\1086\1076\1072\1081"
      , "\1041\1091\1094\1110\1084\1090\1086", "\1042\1078\1077"
      , "\1042\1080\1082\1083\1102\1095\1085\1086"
      , "\1042\1083\1072\1089\1085\1077", "\1042\1090\1110\1084", "\1044\1072\1074\1072\1081"
      , "\1047\1072\1090\1077", "\1050\1086\1083\1080", "\1051\1077\1076\1074\1077"
      , "\1051\1080\1096\1077"
      , "\1052\1072\1081\1078\1077", "\1052\1086\1074", "\1052\1086\1074\1073\1080"
      , "\1052\1086\1074\1073\1080\1090\1086", "\1053\1072\1074\1110\1090\1100"
      , "\1053\1072\1089\1082\1110\1083\1100\1082\1080", "\1053\1072\1095\1077"
      , "\1053\1072\1095\1077\1073"
      , "\1053\1072\1095\1077\1073\1090\1086", "\1053\1077\1074\1078\1077"
      , "\1053\1077\1084\1086\1074"
      , "\1053\1077\1084\1086\1074\1073\1080", "\1053\1077\1084\1086\1074\1073\1080\1090\1086"
      , "\1053\1077\1085\1072\1095\1077", "\1053\1077\1085\1072\1095\1077\1073\1090\1086"
      , "\1053\1077\1093\1072\1081", "\1053\1110\1073\1080"
      , "\1053\1110\1073\1080\1090\1086", "\1053\1110\1078"
        -- Отже, the capitalised counterpart of the lower-case отже below
      , "\1054\1090\1078\1077", "\1054\1090\1086\1078"
      , "\1055\1088\1080\1090\1086\1084\1091", "\1055\1088\1080\1090\1110\1084"
      , "\1055\1088\1080\1095\1086\1084\1091", "\1055\1088\1080\1095\1110\1084"
      , "\1055\1088\1086\1090\1077", "\1057\1072\1084\1077", "\1057\1077\1073\1090\1086"
      , "\1058\1072\1082\1080", "\1058\1086\1073\1090\1086", "\1058\1110\1083\1100\1082\1080"
      , "\1061\1072\1081", "\1061\1086\1095", "\1061\1110\1073\1072", "\1062\1077\1073\1090\1086"
      , "\1065\1086\1073", "\1071\1082\1073\1080", "\1071\1082\1088\1072\1079"
      , "\1071\1082\1097\1086"
        -- lower-case forms
      , "\1072\1073\1086", "\1072\1076\1078\1077", "\1072\1083\1077", "\1072\1085\1110\1078"
      , "\1073\1086\1076\1072\1081", "\1073\1091\1094\1110\1084\1090\1086", "\1074\1078\1077"
      , "\1074\1080\1082\1083\1102\1095\1085\1086", "\1074\1083\1072\1089\1085\1077"
      , "\1074\1090\1110\1084", "\1076\1072\1074\1072\1081", "\1079\1072\1090\1077"
      , "\1082\1086\1083\1080"
      , "\1083\1077\1076\1074\1077", "\1083\1080\1096\1077", "\1084\1072\1081\1078\1077"
      , "\1084\1086\1074"
      , "\1084\1086\1074\1073\1080", "\1084\1086\1074\1073\1080\1090\1086"
      , "\1085\1072\1074\1110\1090\1100"
      , "\1085\1072\1089\1082\1110\1083\1100\1082\1080", "\1085\1072\1095\1077"
      , "\1085\1072\1095\1077\1073"
      , "\1085\1072\1095\1077\1073\1090\1086", "\1085\1077\1074\1078\1077"
      , "\1085\1077\1084\1086\1074"
      , "\1085\1077\1084\1086\1074\1073\1080", "\1085\1077\1084\1086\1074\1073\1080\1090\1086"
      , "\1085\1077\1085\1072\1095\1077", "\1085\1077\1085\1072\1095\1077\1073\1090\1086"
      , "\1085\1077\1093\1072\1081", "\1085\1110\1073\1080", "\1085\1110\1073\1080\1090\1086"
      , "\1085\1110\1078", "\1086\1090\1078\1077", "\1086\1090\1086\1078"
      , "\1087\1088\1080\1090\1086\1084\1091"
      , "\1087\1088\1080\1090\1110\1084", "\1087\1088\1080\1095\1086\1084\1091"
      , "\1087\1088\1080\1095\1110\1084"
      , "\1087\1088\1086\1090\1077", "\1089\1072\1084\1077", "\1089\1077\1073\1090\1086"
      , "\1090\1072\1082\1080"
      , "\1090\1086\1073\1090\1086", "\1090\1110\1083\1100\1082\1080", "\1093\1072\1081"
      , "\1093\1086\1095"
      , "\1093\1110\1073\1072", "\1094\1077\1073\1090\1086", "\1097\1086\1073"
      , "\1103\1082\1073\1080"
      , "\1103\1082\1088\1072\1079", "\1103\1082\1097\1086"
      ]
concatenated2 xss = xss

-- | Tells whether the character is one of: the ASCII apostrophe, the space,
-- @U+2019@, @U+02BC@ or the ASCII hyphen-minus.
isSpC :: Char -> Bool
isSpC x = x `elem` "' \x2019\x02BC-"
{-# INLINE isSpC #-}

-- | Rewrites the listed vowel letters that stand directly after a character
-- accepted by 'isSpC'.
--
-- Such a letter is replaced by two characters: @\'\\1049\'@ (for the capital
-- letters) or @\'\\1081\'@ (for the lower-case letters) followed by a
-- lower-case vowel, as given in the table below. The 'isSpC' character itself
-- is kept. All other characters are copied unchanged, and a string with fewer
-- than two characters is returned as it is.
jottedConv :: String -> String
jottedConv (x:y:xs)
  | isSpC x = x : getBFst' (jottedConv (y:xs), V.fromList . map expand $ table) y
  | otherwise = x : jottedConv (y:xs)
  where
    -- The letter is replaced by a pair and conversion resumes after it.
    expand (letter, jot, vowel) = (letter, jot : vowel : jottedConv xs)
    -- The rows MUST stay in ascending order of the first component:
    -- 'getBFst'' performs a binary search.
    table =
      [ ('\1028', '\1049', '\1077')
      , ('\1031', '\1049', '\1110')
      , ('\1070', '\1049', '\1091')
      , ('\1071', '\1049', '\1072')
      , ('\1102', '\1081', '\1091')
      , ('\1103', '\1081', '\1072')
      , ('\1108', '\1081', '\1077')
      , ('\1111', '\1081', '\1110')
      ]
jottedConv xs = xs