module Languages.Phonetic.Ukrainian.PrepareText where
import CaseBi (getBFst')
import Data.List.InnToOut.Basic (mapI)
import Data.Char (isAlpha,toLower)
import qualified Data.Vector as V
-- | Turns raw text into a list of prepared lines.
--
-- The input is cut into lines and empty lines are dropped. In every remaining
-- line only letters and the characters accepted by 'isSpC' are kept; the line
-- is then split into words, run through 'complexWords', 'auxiliary1' and
-- 'concatenated2' (in that order) and joined back with single spaces.
-- Finally 'splitLines' is applied to the whole list of prepared lines.
prepareText :: String -> [String]
prepareText = splitLines . map prepareLine . nonEmptyLines
  where
    -- Source lines with the completely empty ones removed.
    nonEmptyLines = filter (not . null) . lines
    -- Characters that survive the clean-up of a single line.
    keepChar c = isAlpha c || isSpC c
    -- Clean-up and word-merging passes for a single line.
    prepareLine =
      unwords . concatenated2 . auxiliary1 . complexWords . words . filter keepChar
-- | Merges fixed multi-word expressions into single tokens.
--
-- The first word of the list is looked up with 'getBFst'' in a table whose
-- keys are listed in ascending order. When the first word is not a key it is
-- emitted unchanged and processing continues with the rest of the list.
-- When it is a key, the following words are compared with the expected
-- continuation of the expression and the matching words are concatenated
-- (in most cases together with the word that follows the expression).
--
-- Lists with fewer than three words are returned unchanged. A list with
-- exactly three words is handled by a smaller table and is not processed
-- recursively.
--
-- NOTE(review): when the first word is a key but the expected continuation
-- does not follow, the first two words are still concatenated. That is the
-- existing behaviour and is kept as is -- confirm that it is intended.
--
-- NOTE(review): some expressions insert the spelling "\1081\1072\1082"
-- instead of the original "\1103\1082", while others keep the word as it was
-- written. This is kept as is -- confirm which form the later stages expect.
complexWords :: [String] -> [String]
complexWords (w1:w2:w3:w4:rest) =
  getBFst' (w1 : complexWords (w2:w3:w4:rest), V.fromList
    [ ("\1074", inMeasure)
    , ("\1076\1072\1088\1084\1072", orGlue2 (w2 == shcho) glue3)
    , ("\1076\1083\1103", orGlue2 (w2 == togo && w3 == shchob) glue4)
    , ("\1079", afterZ)
    , ("\1079\1072\1084\1110\1089\1090\1100", orGlue2 (w2 == togo && w3 == shchob) glue4)
    , ("\1087\1086\1087\1088\1080", orGlue2 (w2 == te && w3 == shcho) glue4)
    , ("\1087\1088\1080", orGlue2 (w2 == "\1094\1100\1086\1084\1091") glue3)
    , ("\1087\1110\1089\1083\1103", orGlue2 (w2 == togo && w3 == yak) glue4)
    , ("\1090\1072\1082", orGlue2 (w2 == shcho) glue3)
    , ("\1090\1080\1084\1095\1072\1089\1086\1084", orGlue2 (w2 == yak) glue3)
    , ("\1090\1086\1084\1091", orGlue2 (w2 == yak) ((w1 ++ jak ++ w3) : complexWords (w4:rest)))
    , ("\1091", inMeasure)
      -- This key used to be listed twice with identical values; one entry is enough.
    , ("\1093\1086\1095", orGlue2 (w2 == "\1073\1080") glue3)
    , ("\1095\1077\1088\1077\1079", orGlue2 (w2 == te && w3 == shcho) glue4)
    ]) w1
  where
    -- Words that occur in several expressions.
    togo = "\1090\1086\1075\1086"
    yak = "\1103\1082"
    jak = "\1081\1072\1082"
    shcho = "\1097\1086"
    shchob = "\1097\1086\1073"
    te = "\1090\1077"
    -- Merge the first two / three / four words and continue with the remainder.
    glue2 = (w1 ++ w2) : complexWords (w3:w4:rest)
    glue3 = (w1 ++ w2 ++ w3) : complexWords (w4:rest)
    glue4 = (w1 ++ w2 ++ w3 ++ w4) : complexWords rest
    -- Merge three words, the given spelling of the fourth one and, if present,
    -- the word after it.
    glue5 fourth =
      (w1 ++ w2 ++ w3 ++ fourth ++ concat (take 1 rest)) : complexWords (drop 1 rest)
    -- Use the full merge when the expression matched, otherwise merge two words.
    orGlue2 matched merged = if matched then merged else glue2
    -- Shared by the keys "\1074" and "\1091".
    inMeasure =
      orGlue2 (w2 == "\1084\1110\1088\1091" && w3 == togo && w4 == yak) (glue5 w4)
    -- The key "\1079" starts two different expressions.
    afterZ = case w2 of
      "\1090\1080\1084" -> orGlue2 (w3 == shchob) glue3
      "\1090\1086\1075\1086" -> orGlue2 (w3 == "\1095\1072\1089\1091" && w4 == yak) (glue5 jak)
      _ -> glue2
complexWords [w1, w2, w3] =
  getBFst' ([w1, w2, w3], V.fromList
    [ ("\1076\1072\1088\1084\1072", orGlue2 (w2 == shcho) glue3)
    , ("\1079", orGlue2 (w2 == "\1090\1080\1084" && w3 == "\1097\1086\1073") glue3)
    , ("\1087\1088\1080", orGlue2 (w2 == "\1094\1100\1086\1084\1091") glue3)
    , ("\1090\1072\1082", orGlue2 (w2 == shcho) glue3)
    , ("\1090\1080\1084\1095\1072\1089\1086\1084", orGlue2 (w2 == yak) glue3)
    , ("\1090\1086\1084\1091", orGlue2 (w2 == yak) [w1 ++ "\1081\1072\1082" ++ w3])
      -- This key used to be listed twice with identical values; one entry is enough.
    , ("\1093\1086\1095", orGlue2 (w2 == "\1073\1080") glue3)
    ]) w1
  where
    yak = "\1103\1082"
    shcho = "\1097\1086"
    -- Merge the first two words / all three words.
    glue2 = [w1 ++ w2, w3]
    glue3 = [w1 ++ w2 ++ w3]
    orGlue2 matched merged = if matched then merged else glue2
complexWords xss = xss
-- | Breaks every line that has more than seven words into shorter lines.
--
-- A line with at most seven words is returned untouched. A longer line is
-- divided between two words (the first part gets @n `quot` 2@ of the @n@
-- words) and both parts are checked again, so that no resulting line has more
-- than seven words. The parts of a divided line are joined with single spaces.
--
-- Earlier the division was done at the middle /character/ of the line, which
-- could cut a word into two fragments, and the halves were not checked again.
splitLines :: [String] -> [String]
splitLines = concatMap splitLong
  where
    splitLong line
      | length ws > 7 = splitLines [unwords firstPart, unwords secondPart]
      | otherwise = [line]
      where
        ws = words line
        -- Both parts are non-empty and strictly shorter than @ws@ here,
        -- because @length ws >= 8@; therefore the recursion terminates.
        (firstPart, secondPart) = splitAt (length ws `quot` 2) ws
-- | Attaches short words and prepositions to the word that follows them.
--
-- A word "attaches" when 'isConcatenated' or 'isPreposition' holds for it.
-- Looking at the first three words of the list, the second word is tested
-- first and, if it attaches, it is merged with the third one; otherwise the
-- first word is tested and merged with the second one. If neither attaches,
-- the first word is emitted and the rest is processed. With exactly two words
-- only the first one is tested. Shorter lists are returned unchanged.
--
-- Before merging, the following word is passed through 'jottedConv' as if it
-- were preceded by a space (the space itself is dropped again).
auxiliary1 :: [String] -> [String]
auxiliary1 ws = case ws of
  (w1:w2:w3:rest)
    | attaches w2 -> auxiliary1 (w1 : auxiliary1 (glue w2 w3 : rest))
    | attaches w1 -> auxiliary1 (glue w1 w2 : w3 : rest)
    | otherwise -> w1 : auxiliary1 (w2 : w3 : rest)
  (w1:w2:rest)
    | attaches w1 -> auxiliary1 (glue w1 w2 : rest)
    | otherwise -> ws
  _ -> ws
  where
    attaches w = isConcatenated w || isPreposition w
    -- The leading space makes 'jottedConv' treat the start of @next@ as a
    -- word start; @drop 1@ removes that space afterwards.
    glue w next = w ++ drop 1 (jottedConv (' ' : next))
-- | Tells whether the word is one of the prepositions listed below.
--
-- Every preposition is present in a capitalised and in a lower-case form.
-- The list is kept in ascending order because the lookup is done with
-- 'getBFst''; any word that is not in the list gives 'False'.
isPreposition :: String -> Bool
isPreposition ts = getBFst' (False, V.fromList [(p, True) | p <- prepositions]) ts
  where
    prepositions =
      [ "\1030\1079", "\1041\1077\1079", "\1041\1110\1083\1103", "\1042"
      , "\1042\1110\1076", "\1044\1083\1103", "\1044\1086", "\1047"
      , "\1047\1072", "\1047\1072\1088\1072\1076\1080", "\1047\1110"
      , "\1050", "\1050\1086\1083\1086", "\1050\1088\1110\1079\1100"
      , "\1050\1088\1110\1084", "\1052\1077\1078", "\1052\1077\1078\1080"
      , "\1052\1110\1078", "\1053\1072", "\1053\1072\1076", "\1054"
      , "\1054\1073", "\1054\1076", "\1054\1082\1088\1110\1084"
      , "\1055\1077\1088\1077\1076", "\1055\1086", "\1055\1088\1080"
      , "\1055\1088\1086", "\1055\1088\1086\1090\1080"
      , "\1055\1110\1076", "\1055\1110\1089\1083\1103"
      , "\1057\1077\1088\1077\1076", "\1057\1077\1088\1077\1076\1080"
      , "\1059", "\1063\1077\1088\1077\1079", "\1073\1077\1079"
      , "\1073\1110\1083\1103", "\1074", "\1074\1110\1076"
      , "\1076\1083\1103", "\1076\1086", "\1079", "\1079\1072"
      , "\1079\1072\1088\1072\1076\1080", "\1079\1110"
      , "\1082", "\1082\1086\1083\1086", "\1082\1088\1110\1079\1100"
      , "\1082\1088\1110\1084", "\1084\1077\1078", "\1084\1077\1078\1080"
      , "\1084\1110\1078", "\1085\1072", "\1085\1072\1076", "\1086"
      , "\1086\1073", "\1086\1076", "\1086\1082\1088\1110\1084"
      , "\1087\1077\1088\1077\1076", "\1087\1086", "\1087\1088\1080"
      , "\1087\1088\1086", "\1087\1088\1086\1090\1080", "\1087\1110\1076"
      , "\1087\1110\1089\1083\1103", "\1089\1077\1088\1077\1076"
      , "\1089\1077\1088\1077\1076\1080", "\1091"
      , "\1095\1077\1088\1077\1079", "\1110\1079"
      ]
{-# INLINE isPreposition #-}
-- | Tells whether a short word is to be attached to the following word.
--
-- The result is 'True' only if all of the following hold: the word is not
-- empty, it has at most two characters, it is not one of the excluded words
-- listed below (kept in ascending order for 'getBFst''), and its first
-- character is neither @\'\\1031\'@ nor @\'\\1111\'@.
isConcatenated :: String -> Bool
isConcatenated [] = False
isConcatenated ts@(firstChar:_) =
  length ts <= 2 && notExcluded && firstChar `notElem` "\1031\1111"
  where
    notExcluded = getBFst' (True, V.fromList [(w, False) | w <- excluded]) ts
    excluded =
      [ "\1028", "\1042\1080", "\1052\1080", "\1058\1080", "\1058\1110"
      , "\1062\1110", "\1071", "\1074\1080", "\1084\1080", "\1090\1080", "\1090\1110"
      , "\1094\1110", "\1103", "\1108"
      ]
{-# INLINE isConcatenated #-}
-- | Attaches the words listed below to the word that follows them.
--
-- If the first word of the list is in the table, it is merged with the second
-- word (which is first passed through 'jottedConv' as if it were preceded by
-- a space) and processing continues after the second word. Otherwise the
-- first word is emitted unchanged and processing continues with the second
-- one. Lists with fewer than two words are returned unchanged.
--
-- Two defects of the earlier table are fixed here:
--
-- * the values were produced with @replicate 100@ although there are 108
--   keys, so @zip@ silently dropped the last eight keys; @repeat@ is used now;
--
-- * the capitalised list contained @\"\\1053\\1090\\1078\\1077\"@ where the
--   lower-case list has @\"\\1086\\1090\\1078\\1077\"@; the capitalised
--   counterpart @\"\\1054\\1090\\1078\\1077\"@ is used instead and placed so
--   that the keys stay in ascending order, as 'getBFst'' expects.
concatenated2 :: [String] -> [String]
concatenated2 (xs:ys:xss)
  | getBFst' (False, attachingTable) xs =
      (xs ++ (drop 1 . jottedConv $ ' ':ys)) : concatenated2 xss
  | otherwise = xs : concatenated2 (ys:xss)
  where
    -- Does not depend on the arguments, so it can be shared between calls.
    attachingTable = V.fromList (zip attachingWords (repeat True))
    attachingWords =
      [ "\1040\1073\1086", "\1040\1076\1078\1077", "\1040\1083\1077", "\1040\1085\1110\1078"
      , "\1041\1086\1076\1072\1081", "\1041\1091\1094\1110\1084\1090\1086"
      , "\1042\1078\1077", "\1042\1080\1082\1083\1102\1095\1085\1086"
      , "\1042\1083\1072\1089\1085\1077", "\1042\1090\1110\1084"
      , "\1044\1072\1074\1072\1081", "\1047\1072\1090\1077", "\1050\1086\1083\1080"
      , "\1051\1077\1076\1074\1077", "\1051\1080\1096\1077"
      , "\1052\1072\1081\1078\1077", "\1052\1086\1074", "\1052\1086\1074\1073\1080"
      , "\1052\1086\1074\1073\1080\1090\1086"
      , "\1053\1072\1074\1110\1090\1100", "\1053\1072\1089\1082\1110\1083\1100\1082\1080"
      , "\1053\1072\1095\1077", "\1053\1072\1095\1077\1073", "\1053\1072\1095\1077\1073\1090\1086"
      , "\1053\1077\1074\1078\1077", "\1053\1077\1084\1086\1074", "\1053\1077\1084\1086\1074\1073\1080"
      , "\1053\1077\1084\1086\1074\1073\1080\1090\1086"
      , "\1053\1077\1085\1072\1095\1077", "\1053\1077\1085\1072\1095\1077\1073\1090\1086"
      , "\1053\1077\1093\1072\1081"
      , "\1053\1110\1073\1080", "\1053\1110\1073\1080\1090\1086", "\1053\1110\1078"
      , "\1054\1090\1078\1077", "\1054\1090\1086\1078"
      , "\1055\1088\1080\1090\1086\1084\1091", "\1055\1088\1080\1090\1110\1084"
      , "\1055\1088\1080\1095\1086\1084\1091", "\1055\1088\1080\1095\1110\1084"
      , "\1055\1088\1086\1090\1077"
      , "\1057\1072\1084\1077", "\1057\1077\1073\1090\1086"
      , "\1058\1072\1082\1080", "\1058\1086\1073\1090\1086", "\1058\1110\1083\1100\1082\1080"
      , "\1061\1072\1081", "\1061\1086\1095", "\1061\1110\1073\1072"
      , "\1062\1077\1073\1090\1086", "\1065\1086\1073"
      , "\1071\1082\1073\1080", "\1071\1082\1088\1072\1079", "\1071\1082\1097\1086"
      , "\1072\1073\1086", "\1072\1076\1078\1077", "\1072\1083\1077", "\1072\1085\1110\1078"
      , "\1073\1086\1076\1072\1081", "\1073\1091\1094\1110\1084\1090\1086"
      , "\1074\1078\1077", "\1074\1080\1082\1083\1102\1095\1085\1086"
      , "\1074\1083\1072\1089\1085\1077", "\1074\1090\1110\1084"
      , "\1076\1072\1074\1072\1081", "\1079\1072\1090\1077", "\1082\1086\1083\1080"
      , "\1083\1077\1076\1074\1077", "\1083\1080\1096\1077"
      , "\1084\1072\1081\1078\1077", "\1084\1086\1074", "\1084\1086\1074\1073\1080"
      , "\1084\1086\1074\1073\1080\1090\1086"
      , "\1085\1072\1074\1110\1090\1100", "\1085\1072\1089\1082\1110\1083\1100\1082\1080"
      , "\1085\1072\1095\1077", "\1085\1072\1095\1077\1073", "\1085\1072\1095\1077\1073\1090\1086"
      , "\1085\1077\1074\1078\1077", "\1085\1077\1084\1086\1074", "\1085\1077\1084\1086\1074\1073\1080"
      , "\1085\1077\1084\1086\1074\1073\1080\1090\1086"
      , "\1085\1077\1085\1072\1095\1077", "\1085\1077\1085\1072\1095\1077\1073\1090\1086"
      , "\1085\1077\1093\1072\1081"
      , "\1085\1110\1073\1080", "\1085\1110\1073\1080\1090\1086", "\1085\1110\1078"
      , "\1086\1090\1078\1077", "\1086\1090\1086\1078"
      , "\1087\1088\1080\1090\1086\1084\1091", "\1087\1088\1080\1090\1110\1084"
      , "\1087\1088\1080\1095\1086\1084\1091", "\1087\1088\1080\1095\1110\1084"
      , "\1087\1088\1086\1090\1077"
      , "\1089\1072\1084\1077", "\1089\1077\1073\1090\1086"
      , "\1090\1072\1082\1080", "\1090\1086\1073\1090\1086", "\1090\1110\1083\1100\1082\1080"
      , "\1093\1072\1081", "\1093\1086\1095", "\1093\1110\1073\1072"
      , "\1094\1077\1073\1090\1086", "\1097\1086\1073"
      , "\1103\1082\1073\1080", "\1103\1082\1088\1072\1079", "\1103\1082\1097\1086"
      ]
concatenated2 xss = xss
-- | Tells whether the character is one of the five special characters that
-- are kept in the text besides letters: the ASCII apostrophe, the space,
-- U+2019, U+02BC and the hyphen-minus.
isSpC :: Char -> Bool
isSpC x = x `elem` ['\'', ' ', '\x2019', '\x02BC', '-']
{-# INLINE isSpC #-}
-- | Expands the eight listed letters into two-letter sequences when they
-- directly follow a character accepted by 'isSpC'.
--
-- The four upper-case code points are replaced by @\'\\1049\'@ plus a second
-- letter, the four lower-case ones by @\'\\1081\'@ plus a second letter. All
-- other characters, and listed letters in any other position, are copied
-- unchanged. The first character of the string is never expanded, because
-- nothing precedes it; callers that need this prepend a space.
jottedConv :: String -> String
jottedConv (x:y:xs)
  | isSpC x = x : afterSpecial
  | otherwise = x : jottedConv (y:xs)
  where
    rest = jottedConv xs
    afterSpecial = case y of
      '\1028' -> '\1049':'\1077':rest
      '\1031' -> '\1049':'\1110':rest
      '\1070' -> '\1049':'\1091':rest
      '\1071' -> '\1049':'\1072':rest
      '\1102' -> '\1081':'\1091':rest
      '\1103' -> '\1081':'\1072':rest
      '\1108' -> '\1081':'\1077':rest
      '\1111' -> '\1081':'\1110':rest
      _ -> jottedConv (y:xs)
jottedConv xs = xs