----------------------------------------------------------------------------- -- Module : Convert umlaut written as ae, oe or ue into ä, ö and ü -- deals with lines, preserving the leading spaces and tabs. -- could be improved to use span to break on first non-space character ----------------------------------------------------------------------------- {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeFamilies #-} {-# LANGUAGE TypeSynonymInstances #-} {-# LANGUAGE OverloadedStrings #-} module Lib.ProcWord -- (openMain, htf_thisModuelsTests) where -- import Uniform.Strings -- import Uniform.FileIO import UniformBase -- import Lib.FileHandling import qualified Data.Text as T (commonPrefixes) -- import Control.Monad import Control.Monad.Trans.Writer.Strict -- procTxt2 :: [Text] -> Text -> Text -- called from OneMDfile, not from ProcTxt -- -- change all umlaut in text - yaml header and markdown text -- -- preserve leading blanks of line, or tabs, but not mixture of these -- procTxt2 erl2 = unlines' . map (procLine2 erl2) . lines' procLine2 :: [Text] -> Text -> Text -- process one line preserving spaces or tabs (but not a mix) at start -- improve to use span break on first non-space -- assumes that text is not \n terminated! procLine2 erl2 t = ld <> (procLine erl2 t1) where (ld, t1) = auxProcLine2 t -- case mb1 t of -- Nothing -> case mb2 t of -- Nothing -> ("", t) -- Just (lead2, _, t02) -> (lead2,t02) -- Just (lead, _, t0) -> (lead,t0) -- mb1 tx = T.commonPrefixes " " tx -- mb2 ty = T.commonPrefixes "\t\t\t\t\t\t\t" ty auxProcLine2 :: Text -> (Text,Text) auxProcLine2 t = case mb1 t of Nothing -> case mb2 t of Nothing -> ("", t) Just (lead2, _, t02) -> (lead2,t02) Just (lead, _, t0) -> (lead,t0) where mb1 tx = T.commonPrefixes " " tx mb2 ty = T.commonPrefixes "\t\t\t\t\t\t\t" ty procLine2Rep :: [Text] -> Text -> Writer Text Text -- accumulates the changed woerds for checking procLine2Rep erl2 t = do t2 <- procLineRep erl2 t1 return $ ld <> t2 where (ld,t1) = auxProcLine2 t procLine :: [Text] -> Text -> Text procLine erlaubt ln = unwords' . map (procWord2 erlaubt) . words' $ ln -- process all words in a line -- should be idempotent, as long as text is not n\ terminated procLineRep :: [Text] -> Text -> Writer Text Text procLineRep erlaubt ln = do ln2rep <- mapM (procWord2Rep erlaubt) (words' ln) return . unwords' $ ln2rep procWord2 :: [Text] -> Text -> Text -- replace umlaut unless it is an permitted group procWord2 erlaubt word = if checkErlaubt erlaubt word then word else procWord1 word procWord2Rep :: [Text] -> Text -> Writer Text Text procWord2Rep erlaubt word = do let word1 = procWord2 erlaubt word if word1 == word then return word else do tell ((word1<>" ")::Text) return word1 procWord1 :: Text -> Text -- ^ convert the umlaut in a single word -- no test, no exclusions -- preserve capitalization procWord1 t = replace' "AE" "Ä" . replace' "OE" "Ö" . replace' "UE" "Ü" . replace' "Ae" "Ä" . replace' "Oe" "Ö" . replace' "Ue" "Ü" . replace' "ae" "ä" . replace' "oe" "ö" . replace' "ue" "ü" $ t erlaubt1 :: [Text] -- erlaubte Gruppen - for test only erlaubt1 = map toLower' ["koef", "poet", "poes", "neue", "freue"] checkErlaubt :: [Text] -> Text -> Bool -- ^ enthaelt das Wort eine erlaubte kombination checkErlaubt erlaubt word = any (\e -> isInfixOf' e . toLower' $ word) erlaubt checkErlaubt1 :: Text -> Bool checkErlaubt1 = checkErlaubt erlaubt1 -- mit fester Liste der erlaubten - for test