{-# LANGUAGE OverloadedStrings #-}

module Data.Text.Slugger (toSlug) where


--------------------------------------------------------------------------------
import qualified Data.Char as Char
import qualified Data.Text as T
import qualified Data.Text.ICU.Char as ICUChar
import qualified Data.Text.ICU.Normalize as ICUN


--------------------------------------------------------------------------------
{- | Converts to a US-ASCII, lowercase, hyphenated, URI-friendly "slug"

__Examples:__

@
toSlug (T.pack "Hey there,   world!")
-- "hey-there-world"

toSlug (T.pack "GARÇON - déjà , Forêt — Zoë")
-- "garcon-deja-foret-zoe"
@
-}
toSlug :: T.Text -> T.Text
toSlug = hyphenateWords . clean . normalize


--------------------------------------------------------------------------------
normalize :: T.Text -> T.Text
normalize = ICUN.normalize ICUN.NFKD


--------------------------------------------------------------------------------
clean :: T.Text -> T.Text
clean = T.foldr buildCleanText T.empty


buildCleanText :: Char -> T.Text -> T.Text
buildCleanText x acc
    | isCharModifier x || isSingleQuote x = acc
    | otherwise = T.concat [adjustChar x, acc]


isSingleQuote :: Char -> Bool
isSingleQuote = (== '\'')


isCharModifier :: Char -> Bool
isCharModifier = ICUChar.property ICUChar.Diacritic


adjustChar :: Char -> T.Text
adjustChar 'æ' = "ae"
adjustChar 'Æ' = "ae"
adjustChar 'ð' = "d"
adjustChar 'Ð' = "d"
adjustChar 'ƒ' = "f"
adjustChar 'Ƒ' = "f"
adjustChar 'ø' = "o"
adjustChar 'Ø' = "o"
adjustChar 'œ' = "oe"
adjustChar 'Œ' = "oe"
adjustChar 'ł' = "l"
adjustChar 'Ł' = "l"
adjustChar 'ß' = "ss"
adjustChar 'þ' = "th"
adjustChar 'Þ' = "th"
adjustChar 'ı' = "i"
-- See Note [Turkish I]
adjustChar x
  | isAsciiAlphaNum x = toLowerAsText x
  | otherwise = " "


isAsciiAlphaNum :: Char -> Bool
isAsciiAlphaNum x = Char.isAscii x && Char.isAlphaNum x


toLowerAsText :: Char -> T.Text
toLowerAsText = T.singleton . Char.toLower


--------------------------------------------------------------------------------
hyphenateWords :: T.Text -> T.Text
hyphenateWords = T.intercalate (T.singleton '-') . T.words

{-
Note: [Turkish I]

Turkish has an unusual casing of the letter 'I'. In Turkish, we have
'i' and 'ı', two separate letters. They correspond to uppercase 'İ'
and 'I'. Notice that this is the opposite of most other languages,
where lowercase 'i' correspond to uppercase 'I' (losing the dot for no
good reason).

Unicode gets this correctly, so a Unicode-aware `toLower` function would
convert uppercase 'I' to 'ı' when on Turkish locale. This tend to break
functions like the one we are writing, if we incorrectly assume that every
ASCII uppercase letter would correspond to an ASCII lowercase letter.

The surprise is that; `Data.Char.toLower` function we use is not
locale-aware, `Data.Text.ICU.toLower` is. Only because of this fact `I`
becomes `i` even on Turkish locale on this function. This note is here
so that we do not start using `Data.Text.ICU.toLower` and break the
library on Turkish locale.
-}