\startproduct *
\project j_mangrove

\starttext
\starthaskell
-- |
-- Description:    Mappings between textual names of encoding schemes and the type-safe 'Enum'.
-- 
-- Copyright:      (c) 2020 Samuel May
-- License:        MPL-2.0
-- Maintainer:     ag.eitilt@gmail.com
-- 
Stability:      provisional
Portability:    portable
\stophaskell
\begin{code}
{-# LANGUAGE Trustworthy #-}

{-|
Description:    Mappings between textual names of encoding schemes and the type-safe 'Enum'.

Copyright:      (c) 2020 Samuel May
License:        MPL-2.0
Maintainer:     ag.eitilt@gmail.com

Stability:      provisional
Portability:    portable

The __[Encoding](https://encoding.spec.whatwg.org/)__ spec uses a conceptual
model of an "encoding" as being the function between Unicode values and bytes.
As this is a bit more complex than any content author wants to specify every
document, HTML (and other interfaces) represent them as semi-standardized but
freeform text strings; the standard document then collects the various strings
authors have used across the web and associates the most common as "labels" of
those abstract encodings.

To refer to them internally, however, it also promotes one of the labels of
each encoding as the canonical form; this library implements that set (with
modifications to fit Haskell identifiers) in 'Encoding'.  The labels are
described via a reversible many-to-one mapping with those names, which as
the reverse is rarely used, lends itself well to being adapted as a lookup
table.  This then is a machine-readable formatting of that table.
-}
module Web.Willow.Common.Encoding.Labels
    ( lookupEncoding
    ) where


import qualified Data.Aeson as J
import qualified Data.HashMap.Strict as M
import qualified Data.Maybe as Y
import qualified Data.Text as T

import qualified System.IO.Unsafe as IO.Unsafe

import Paths_mangrove

import Web.Willow.Common.Encoding.Common

import Data.Aeson ( (.:) )
import System.FilePath ( (</>), (<.>) )


-- | __Encoding:__
--      @[get an encoding]
--      (https://encoding.spec.whatwg.org/#concept-encoding-get)@
-- 
-- Given an encoding's case-insensitive label, try to retrieve an appropriate
-- 'Encoding'.  The set prescribed by the HTML specification is smaller than
-- that used by other registries for security and interoperability reasons, and
-- may not always return the expected 'Encoding' if an alternate one has been
-- determined to be more internet-compatible.
lookupEncoding :: T.Text -> Maybe Encoding
lookupEncoding label = M.lookup (T.map toAsciiLower $ strip label) encodingLabels
  where strip = T.dropWhileEnd (`elem` asciiWhitespace) . T.dropWhile (`elem` asciiWhitespace)
        -- Custom @strip@ needed as 'T.strip' also removes non-HTML whitespace.

-- | The actual lookup table, in a form optimized for unidirectional access.
-- Many-to-one; even if two 'M.lookup' calls return the same 'Encoding' value,
-- there's no guarantee that the original 'T.Text's are equal.
-- 
-- While generating a map adds a bit more processing over a simple
-- 'Data.List.lookup', the savings in 'M.lookup' (@O(log n)@) will accumulate
-- over multiple calls -- i.e. in typical web browsing.  While the
-- "Data.HashMap.Strict" only provides @O(n*log n)@ construction to
-- "Data.HashMap.Lazy" with @O(n)@, there's no reason to keep the initializer
-- list around for the latter.
-- 
-- The spec is very insistent that no other labels be recognized, though some
-- browsers may not hew to that proscription for historic reasons.
encodingLabels :: M.HashMap T.Text Encoding
encodingLabels = M.fromList $ concatMap (concatMap unpack . encodings) encodingList
  where unpack desc = zip (labels desc) (repeat $ encoding desc)


-- | Lookup table between the official encoding names used in the @name@ field
-- of @encodings.json@, and the Haskell datatype constructor representing that
-- same encoding scheme.
encodingNames :: M.HashMap String Encoding
encodingNames = M.fromList
    [ ("UTF-8", Utf8)
    , ("IBM866", Ibm866)
    , ("ISO-8859-2", Iso8859_2)
    , ("ISO-8859-3", Iso8859_3)
    , ("ISO-8859-4", Iso8859_4)
    , ("ISO-8859-5", Iso8859_5)
    , ("ISO-8859-6", Iso8859_6)
    , ("ISO-8859-7", Iso8859_7)
    , ("ISO-8859-8", Iso8859_8)
    , ("ISO-8859-8-I", Iso8859_8i)
    , ("ISO-8859-10", Iso8859_10)
    , ("ISO-8859-13", Iso8859_13)
    , ("ISO-8859-14", Iso8859_14)
    , ("ISO-8859-15", Iso8859_15)
    , ("ISO-8859-16", Iso8859_16)
    , ("KOI8-R", Koi8R)
    , ("KOI8-U", Koi8U)
    , ("macintosh", Macintosh)
    , ("windows-874", Windows874)
    , ("windows-1250", Windows1250)
    , ("windows-1251", Windows1251)
    , ("windows-1252", Windows1252)
    , ("windows-1253", Windows1253)
    , ("windows-1254", Windows1254)
    , ("windows-1255", Windows1255)
    , ("windows-1256", Windows1256)
    , ("windows-1257", Windows1257)
    , ("windows-1258", Windows1258)
    , ("x-mac-cyrillic", MacintoshCyrillic)
    , ("GBK", Gbk)
    , ("gb18030", Gb18030)
    , ("Big5", Big5)
    , ("EUC-JP", EucJp)
    , ("ISO-2022-JP", Iso2022Jp)
    , ("Shift_JIS", ShiftJis)
    , ("EUC-KR", EucKr)
    , ("replacement", Replacement)
    , ("UTF-16BE", Utf16be)
    , ("UTF-16LE", Utf16le)
    , ("x-user-defined", UserDefined)
    ]


-- | __Encoding:__
--      @[encodings.json]
--      (https://encoding.spec.whatwg.org/encodings.json)
-- 
-- Load the "non-normative" data resource distributed alongside the standard to
-- avoid duplicating update effort.
-- 
-- Uses 'IO.Unsafe.unsafePerformIO' internally, as the accessed file should
-- never change at runtime, and so every invocation should be pure.
encodingList :: [EncodingTable]
encodingList = IO.Unsafe.unsafePerformIO $ do
    path <- getDataFileName $ "encoding" </> "encodings" <.> "json"
    Y.fromMaybe [] <$> J.decodeFileStrict' path
{-# NOINLINE encodingList #-}


-- | Internal representation of the categorical groupings in the
-- @encodings.json@ file.
newtype EncodingTable = EncodingTable
    { encodings :: [EncodingDesc]
 -- , heading :: T.Text
    }
  deriving ( Eq, Show, Read )
instance J.FromJSON EncodingTable where
    parseJSON = J.withObject "table" $ \v -> EncodingTable
        <$> v .: T.pack "encodings"
     -- <*> v .: T.pack "heading"

-- | Internal representation of the @'Encoding' <-> [label]@ mappings described
-- by the @encodings.json@ file.
data EncodingDesc = EncodingDesc
    { labels :: [T.Text]
    , encoding :: Encoding
    }
  deriving ( Eq, Show, Read )
instance J.FromJSON EncodingDesc where
    parseJSON = J.withObject "encoding" $ \v -> EncodingDesc
        <$> v .: T.pack "labels"
        <*> fmap readEncoding (v .: T.pack "name")
      where readEncoding str = Y.fromMaybe (panic str) $ M.lookup str encodingNames
            panic str = error $ "readEncoding: could not parse '" ++ str ++ "'"
\end{code}
\stoptext

\stopproduct