\startproduct * \project j_mangrove \starttext \starthaskell -- | -- Description: Mappings between textual names of encoding schemes and the type-safe 'Enum'. -- -- Copyright: (c) 2020 Samuel May -- License: MPL-2.0 -- Maintainer: ag.eitilt@gmail.com -- Stability: provisional Portability: portable \stophaskell \begin{code} {-# LANGUAGE Trustworthy #-} {-| Description: Mappings between textual names of encoding schemes and the type-safe 'Enum'. Copyright: (c) 2020 Samuel May License: MPL-2.0 Maintainer: ag.eitilt@gmail.com Stability: provisional Portability: portable The __[Encoding](https://encoding.spec.whatwg.org/)__ spec uses a conceptual model of an "encoding" as being the function between Unicode values and bytes. As this is a bit more complex than any content author wants to specify every document, HTML (and other interfaces) represent them as semi-standardized but freeform text strings; the standard document then collects the various strings authors have used across the web and associates the most common as "labels" of those abstract encodings. To refer to them internally, however, it also promotes one of the labels of each encoding as the canonical form; this library implements that set (with modifications to fit Haskell identifiers) in 'Encoding'. The labels are described via a reversible many-to-one mapping with those names, which as the reverse is rarely used, lends itself well to being adapted as a lookup table. This then is a machine-readable formatting of that table. -} module Web.Willow.Common.Encoding.Labels ( lookupEncoding ) where import qualified Data.Aeson as J import qualified Data.HashMap.Strict as M import qualified Data.Maybe as Y import qualified Data.Text as T import qualified System.IO.Unsafe as IO.Unsafe import Paths_mangrove import Web.Willow.Common.Encoding.Common import Data.Aeson ( (.:) ) import System.FilePath ( (), (<.>) ) -- | __Encoding:__ -- @[get an encoding] -- (https://encoding.spec.whatwg.org/#concept-encoding-get)@ -- -- Given an encoding's case-insensitive label, try to retrieve an appropriate -- 'Encoding'. The set prescribed by the HTML specification is smaller than -- that used by other registries for security and interoperability reasons, and -- may not always return the expected 'Encoding' if an alternate one has been -- determined to be more internet-compatible. lookupEncoding :: T.Text -> Maybe Encoding lookupEncoding label = M.lookup (T.map toAsciiLower $ strip label) encodingLabels where strip = T.dropWhileEnd (`elem` asciiWhitespace) . T.dropWhile (`elem` asciiWhitespace) -- Custom @strip@ needed as 'T.strip' also removes non-HTML whitespace. -- | The actual lookup table, in a form optimized for unidirectional access. -- Many-to-one; even if two 'M.lookup' calls return the same 'Encoding' value, -- there's no guarantee that the original 'T.Text's are equal. -- -- While generating a map adds a bit more processing over a simple -- 'Data.List.lookup', the savings in 'M.lookup' (@O(log n)@) will accumulate -- over multiple calls -- i.e. in typical web browsing. While the -- "Data.HashMap.Strict" only provides @O(n*log n)@ construction to -- "Data.HashMap.Lazy" with @O(n)@, there's no reason to keep the initializer -- list around for the latter. -- -- The spec is very insistent that no other labels be recognized, though some -- browsers may not hew to that proscription for historic reasons. encodingLabels :: M.HashMap T.Text Encoding encodingLabels = M.fromList $ concatMap (concatMap unpack . encodings) encodingList where unpack desc = zip (labels desc) (repeat $ encoding desc) -- | Lookup table between the official encoding names used in the @name@ field -- of @encodings.json@, and the Haskell datatype constructor representing that -- same encoding scheme. encodingNames :: M.HashMap String Encoding encodingNames = M.fromList [ ("UTF-8", Utf8) , ("IBM866", Ibm866) , ("ISO-8859-2", Iso8859_2) , ("ISO-8859-3", Iso8859_3) , ("ISO-8859-4", Iso8859_4) , ("ISO-8859-5", Iso8859_5) , ("ISO-8859-6", Iso8859_6) , ("ISO-8859-7", Iso8859_7) , ("ISO-8859-8", Iso8859_8) , ("ISO-8859-8-I", Iso8859_8i) , ("ISO-8859-10", Iso8859_10) , ("ISO-8859-13", Iso8859_13) , ("ISO-8859-14", Iso8859_14) , ("ISO-8859-15", Iso8859_15) , ("ISO-8859-16", Iso8859_16) , ("KOI8-R", Koi8R) , ("KOI8-U", Koi8U) , ("macintosh", Macintosh) , ("windows-874", Windows874) , ("windows-1250", Windows1250) , ("windows-1251", Windows1251) , ("windows-1252", Windows1252) , ("windows-1253", Windows1253) , ("windows-1254", Windows1254) , ("windows-1255", Windows1255) , ("windows-1256", Windows1256) , ("windows-1257", Windows1257) , ("windows-1258", Windows1258) , ("x-mac-cyrillic", MacintoshCyrillic) , ("GBK", Gbk) , ("gb18030", Gb18030) , ("Big5", Big5) , ("EUC-JP", EucJp) , ("ISO-2022-JP", Iso2022Jp) , ("Shift_JIS", ShiftJis) , ("EUC-KR", EucKr) , ("replacement", Replacement) , ("UTF-16BE", Utf16be) , ("UTF-16LE", Utf16le) , ("x-user-defined", UserDefined) ] -- | __Encoding:__ -- @[encodings.json] -- (https://encoding.spec.whatwg.org/encodings.json) -- -- Load the "non-normative" data resource distributed alongside the standard to -- avoid duplicating update effort. -- -- Uses 'IO.Unsafe.unsafePerformIO' internally, as the accessed file should -- never change at runtime, and so every invocation should be pure. encodingList :: [EncodingTable] encodingList = IO.Unsafe.unsafePerformIO $ do path <- getDataFileName $ "encoding" "encodings" <.> "json" Y.fromMaybe [] <$> J.decodeFileStrict' path {-# NOINLINE encodingList #-} -- | Internal representation of the categorical groupings in the -- @encodings.json@ file. newtype EncodingTable = EncodingTable { encodings :: [EncodingDesc] -- , heading :: T.Text } deriving ( Eq, Show, Read ) instance J.FromJSON EncodingTable where parseJSON = J.withObject "table" $ \v -> EncodingTable <$> v .: T.pack "encodings" -- <*> v .: T.pack "heading" -- | Internal representation of the @'Encoding' <-> [label]@ mappings described -- by the @encodings.json@ file. data EncodingDesc = EncodingDesc { labels :: [T.Text] , encoding :: Encoding } deriving ( Eq, Show, Read ) instance J.FromJSON EncodingDesc where parseJSON = J.withObject "encoding" $ \v -> EncodingDesc <$> v .: T.pack "labels" <*> fmap readEncoding (v .: T.pack "name") where readEncoding str = Y.fromMaybe (panic str) $ M.lookup str encodingNames panic str = error $ "readEncoding: could not parse '" ++ str ++ "'" \end{code} \stoptext \stopproduct