{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveLift #-}

-- | Unicode
module Haskus.Format.Text.Unicode
   ( module Haskus.Format.Text.Unicode.CodePoint
   -- * Planes
   , Plane (..)
   , codePointPlane
   , planeBMP
   , planeSMP
   , planeSIP
   , planeSSP
   , planePrivates
   -- * Encodings
   , UTF8 (..)
   , UTF16_BOM (..)
   , UTF16_BE (..)
   , UTF16_LE (..)
   , UTF32_BOM (..)
   , UTF32_BE (..)
   , UTF32_LE (..)
   )
where

import Haskus.Format.Binary.Word
import Haskus.Format.Binary.Bits
import Haskus.Utils.Types
import Data.Kind
import Haskus.Format.Text.Unicode.CodePoint

--------------------------------------------------
-- Planes
--------------------------------------------------

newtype Plane = Plane Word deriving (Show,Eq)

-- | Get the plan the code-point belongs to
--
-- >>> codePointPlane (CodePoint 0x21234)
-- Plane 2
codePointPlane :: CodePoint -> Plane
codePointPlane (CodePoint x) = Plane (fromIntegral (x `uncheckedShiftR` 16))

-- | Basic Multilingual Plane (BMP)
--
--  The Basic Multilingual Plane (BMP, or Plane 0) contains the common-use
--  characters for all the modern scripts of the world as well as many
--  historical and rare characters. By far the majority of all Unicode
--  characters for almost all textual data can be found in the BMP.
planeBMP :: Plane
planeBMP = Plane 0

-- | Supplementary Multilingual Plane (SMP)
--
-- Supplementary Multilingual Plane. The Supplementary Multilingual Plane (SMP,
-- or Plane 1) is dedicated to the encoding of characters for scripts or symbols
-- which either could not be fit into the BMP or see very infrequent usage. This
-- includes many historic scripts, a number of lesser-used contemporary scripts,
-- special-purpose invented scripts, notational systems or large pictographic
-- symbol sets, and occasionally historic extensions of scripts whose core sets
-- are encoded on the BMP.
--
-- Examples include Gothic (historic), Shavian (special-purpose invented),
-- Musical Symbols (notational system), Domino Tiles (pictographic), and Ancient
-- Greek Numbers (historic extension for Greek). A number of scripts, whether of
-- historic and contemporary use, do not yet have their characters encoded in
-- the Unicode Standard. The majority of scripts currently identified for
-- encoding will eventually be allocated in the SMP. As a result, some areas of
-- the SMP will experience common, frequent usage.
planeSMP :: Plane
planeSMP = Plane 1

-- | Supplementary Ideographic Plane (SIP)
--
-- The Supplementary Ideographic Plane (SIP, or Plane 2) is intended as an
-- additional allocation area for those CJK characters that could not be fit in
-- the blocks set aside for more common CJK characters in the BMP. While there
-- are a small number of common-use CJK characters in the SIP (for example, for
-- Cantonese usage), the vast majority of Plane 2 characters are extremely rare
-- or of historical interest only.
planeSIP :: Plane
planeSIP = Plane 2

-- | Supplementary Special-purpose Plane (SSP)
--
-- The Supplementary Special-purpose Plane (SSP, or Plane 14) is the spillover
-- allocation area for format control characters that do not fit into the small
-- allocation areas for format control characters in the BMP.
planeSSP :: Plane
planeSSP = Plane 14

-- | Private Use Planes
--
-- The two Private Use Planes (Planes 15 and 16) are allocated, in their
-- entirety, for private use. Those two planes contain a total of 131,068
-- characters to supple- ment the 6,400 private-use characters located in the
-- BMP.
planePrivates :: [Plane]
planePrivates = [Plane 15, Plane 16]

--------------------------------------------------
-- Encoding
--------------------------------------------------

-- Unicode encodings
data UTF8      = UTF8
data UTF16_BOM = UTF16_BOM
data UTF16_BE  = UTF16_BE
data UTF16_LE  = UTF16_LE
data UTF32_BOM = UTF32_BOM
data UTF32_BE  = UTF32_BE
data UTF32_LE  = UTF32_LE

-- | Unicode encoding scheme
class Encoding a where
   -- | Code unit type
   type CodeUnit a :: Type

   -- | Maximum number of code unit per code point
   type MaxCodeUnit a :: Nat

   -- | Binary sorting is equivalent to code point sorting
   type BinarySorting a :: Bool

   -- | BOM (byte-order mask) allowed
   type AllowedBOM a :: Bool


instance Encoding UTF8 where
   type CodeUnit        UTF8 = Word8
   type MaxCodeUnit     UTF8 = 4
   type BinarySorting   UTF8 = 'True
   type AllowedBOM      UTF8 = 'True

instance Encoding UTF16_BOM where
   type CodeUnit        UTF16_BOM = Word16
   type MaxCodeUnit     UTF16_BOM = 2
   type BinarySorting   UTF16_BOM = 'False
   type AllowedBOM      UTF16_BOM = 'True

instance Encoding UTF16_BE where
   type CodeUnit        UTF16_BE = Word16
   type MaxCodeUnit     UTF16_BE = 2
   type BinarySorting   UTF16_BE = 'False
   type AllowedBOM      UTF16_BE = 'False

instance Encoding UTF16_LE where
   type CodeUnit        UTF16_LE = Word16
   type MaxCodeUnit     UTF16_LE = 2
   type BinarySorting   UTF16_LE = 'False
   type AllowedBOM      UTF16_LE = 'False

instance Encoding UTF32_BOM where
   type CodeUnit        UTF32_BOM = Word32
   type MaxCodeUnit     UTF32_BOM = 1
   type BinarySorting   UTF32_BOM = 'True
   type AllowedBOM      UTF32_BOM = 'True

instance Encoding UTF32_BE where
   type CodeUnit        UTF32_BE = Word32
   type MaxCodeUnit     UTF32_BE = 1
   type BinarySorting   UTF32_BE = 'True
   type AllowedBOM      UTF32_BE = 'False

instance Encoding UTF32_LE where
   type CodeUnit        UTF32_LE = Word32
   type MaxCodeUnit     UTF32_LE = 1
   type BinarySorting   UTF32_LE = 'True
   type AllowedBOM      UTF32_LE = 'False