module Opentype.Fileformat.Cmap where
import Opentype.Fileformat.Types
import Data.Binary
import Data.Binary.Put
import Data.List (sort, mapAccumL, foldl')
import Data.Either (either)
import Control.Monad
import Data.Traversable (for)
import Data.Foldable (for_, traverse_)
import qualified Data.ByteString.Lazy as Lazy
import qualified Data.ByteString as Strict
import qualified Data.Map as M
import qualified Data.IntSet as IS
import Data.Maybe
import Data.Bits
import Data.Int
-- import Hexdump -- for debugging

-- | This table defines the mapping of character codes to the glyph
-- index values used in the font. It may contain more than one
-- subtable, in order to support more than one character encoding
-- scheme. Character codes that do not correspond to any glyph in the
-- font should be mapped to glyph index 0. The glyph at this location
-- must be a special glyph representing a missing character, commonly
-- known as .notdef.
--
-- The table header indicates the character encodings for which
-- subtables are present. Each subtable is in one of seven possible
-- formats and begins with a format code indicating the format
-- used.
-- 
-- The `platformID` and platform-specific `encodingID` in the header
-- entry (and, in the case of the Macintosh platform, the `macLanguage`
-- field in the subtable itself) are used to specify a particular
-- cmap encoding. Each platform ID, platform-specific encoding ID,
-- and subtable `macLanguage` combination may appear only once in the
-- `CmapTable`.
--
-- When `platformID` is `UnicodePlatform`, `encodingID` is interpreted as follows:
-- 
--  * 0: Default semantics
--  * 1: Version 1.1 semantics
--  * 2: ISO 10646 1993 semantics (deprecated)
--  * 3: Unicode 2.0 or later semantics (BMP only)
--  * 4: Unicode 2.0 or later semantics (non-BMP characters allowed)
--  * 5: Unicode Variation Sequences
--  * 6: Full Unicode coverage (used with type 13.0 cmaps by OpenType)
--
-- When `platformID` `MacintoshPlatform`, the `encodingID` is a QuickDraw script code.
-- 
-- Note that the use of the Macintosh platformID is currently
-- discouraged. Subtables with a Macintosh platformID are only
-- required for backwards compatibility with QuickDraw and will be
-- synthesized from Unicode-based subtables if ever needed.
--
-- When `platformID` is `MicrosoftPlatform`, the `encodingID` is a is interpreted as follows:
--
-- * 0: Symbol
-- * 1: Unicode BMP-only (UCS-2)
-- * 2: Shift-JIS
-- * 3: PRC
-- * 4: BigFive
-- * 5: Johab
-- * 10: Unicode UCS-4

newtype CmapTable = CmapTable {getCmaps :: [CMap]}
  deriving Show

emptyCmapTable :: CmapTable
emptyCmapTable = CmapTable []

data CMap = CMap {
  cmapPlatform :: PlatformID,
  cmapEncoding :: Word16,
  -- | used only in the Macintosh platformID (/DEPRECATED/)
  cmapLanguage :: Word16,
  -- | internal format of the map
  mapFormat :: MapFormat,
  -- | set contains high byte\/word if part of multibyte\/word
  -- character.
  multiByte :: IS.IntSet,
  -- | map from character code to glyph index
  glyphMap :: WordMap GlyphID
  }
  deriving Show

instance Ord CMap where
  compare (CMap pfID encID lang _ _ _) (CMap pfID2 encID2 lang2 _ _ _) =
    compare (pfID, encID, lang) (pfID2, encID2, lang2)

instance Eq CMap where
  (CMap pfID encID lang _ _ _) == (CMap pfID2 encID2 lang2 _ _ _) =
    (pfID, encID, lang) == (pfID2, encID2, lang2)

data MapFormat = 
  -- | 8 bit encoding, contiguous block of bytes.  /LEGACY ONLY./
  MapFormat0 |
  -- | mixed 8\/16 bit encoding with gaps.  /LEGACY ONLY./
  MapFormat2 |
  -- | 16 bit encoding with holes.  This should contain the BMP for a
  -- unicode font.
  MapFormat4 |
  -- | 16 bit single contiguous block (trimmed).
  MapFormat6 |
  -- | mixed 16\/32 bit, for compatibility only, /DO NOT USE/
  MapFormat8 |
  -- | 32 bit single contiguous block (trimmed), for compatibility
  -- only, /DO NOT USE/
  MapFormat10 |
  -- | 32 bit segmented coverage.  This should contain Unicode
  -- encodings with glyphs above 0xFFFF.  It's recommended to save a
  -- subset to format 4, for backwards compatibility.
  MapFormat12
  deriving Show

putCmapTable :: CmapTable -> Put
putCmapTable (CmapTable cmaps_) =
  do putWord16be 0
     putWord16be $ fromIntegral $ length cmaps
     for_ (zip offsets cmaps) $
       \(offset, CMap pfID encID _ _ _ _) ->
         do putPf pfID
            putWord16be encID
            putWord32be offset
     traverse_ putLazyByteString cmapsBs
       where
         cmaps = sort cmaps_
         offsets :: [Word32]
         offsets =
           scanl (+) (fromIntegral $ 4 + 8 * length cmaps) $
           map (fromIntegral . Lazy.length) cmapsBs
         cmapsBs = map (runPut.putCmap) cmaps

readCmapTable :: Strict.ByteString -> Either String CmapTable
readCmapTable bs = do
  version <- index16 bs 0
  when (version /= 0) $
    fail "unsupported cmap version."
  n <- index16 bs 1
  entries <- for [0..n-1] $ \i -> do
    pfID <- toPf =<< (index16 bs $ 2 + i*4)
    encID <- index16 bs $ 2 + i*4 + 1
    offset <- index32 bs $ 2 + fromIntegral i*2
    return (offset, pfID, encID)
  cmaps <- for entries $ \(offset, pfID, encID) -> do
    cm <- readCmap $ Strict.drop (fromIntegral offset) bs
    Right $ cm {cmapPlatform = pfID, cmapEncoding = encID}
  return $ CmapTable cmaps

putCmap :: CMap -> Put
putCmap cmap = case mapFormat cmap of
  MapFormat0 -> putMap0 cmap
  MapFormat2 -> putMap2 cmap
  -- 16 bit encoding with holes
  MapFormat4 -> putMap4 cmap
  -- trimmed 16 bit mapping.
  MapFormat6 -> putMap6 cmap
  -- mixed 16/32 bit encoding (/DEPRECATED/)
  MapFormat8 -> putMap8 cmap
  MapFormat10 -> putMap10 cmap
  -- 32 bit segmented coverage
  MapFormat12 -> putMap12 cmap

readCmap :: Strict.ByteString -> Either String CMap
readCmap bs_ = do
  c <- index16 bs_ 0
  let bs | (c >= 8 && c < 14) = Strict.drop 8 bs_
         | otherwise = Strict.drop 4 bs_
  either fail return $
    case c of 
      0 -> getMap0 bs
      2 -> getMap2 bs
      4 -> getMap4 bs
      6 -> getMap6 bs
      8 -> getMap8 bs
      10 -> getMap10 bs
      12 -> getMap12 bs
      i -> fail $ "unsupported map encoding " ++ show i

subIntMap :: Word32 -> Word32 -> WordMap GlyphID -> WordMap GlyphID
subIntMap from to intMap =
  fst $ M.split (fromIntegral to+1) $ snd $
  M.split (fromIntegral from-1) intMap

asSubtypeFrom :: b -> [(a, b)] -> b
asSubtypeFrom a _ = a

-- Put codes in range, and use zero glyph when no code is found.
putCodes :: Word32 -> Word32 -> [(Word32, GlyphID)] -> Put
putCodes start end _
  | start > end = return ()

putCodes start end [] = do
  putWord16be 0
  putCodes (start+1) end []
      
putCodes start end l@((i, code):rest)
  | start < i = do putWord16be 0
                   putCodes (start+1) end l
  | otherwise = do putWord16be code
                   putCodes (i+1) end rest

-- write subcodes as range, with zero glyph for missing codes.
subCodes :: WordMap GlyphID -> Word32 -> Word32 -> Put
subCodes set start end =
  putCodes start end $ M.toList $ subIntMap start end set

data SubTable2 = SubTable2 {
  highByte :: Word16,
  firstCode :: Word16,
  entryCount :: Word16,
  rangeOffset :: Word16,
  rangeBytes :: Put
  }

putMap0 :: CMap -> PutM ()
putMap0 cmap = do
  putWord16be 0
  putWord16be 262
  putWord16be $ cmapLanguage cmap
  let gm = glyphMap cmap
  for_ [0..255] $ \c ->
    putWord8 $ fromIntegral $
    fromMaybe 0 (M.lookup c gm)

getMap0 :: Strict.ByteString -> Either String CMap
getMap0 bs =
  if Strict.length bs < 258 then
    Left "invalid map format 0"
  else
    do lang <- index16 bs 0
       let gmap = M.fromAscList $
             filter ((/= 0).snd) $
             (flip map) [0..255] $ \c ->
             (fromIntegral c, fromIntegral $ Strict.index bs (c+2))
       Right $ CMap UnicodePlatform 0 lang MapFormat0 IS.empty gmap

putMap2 :: CMap -> PutM ()
putMap2 cmap = do
  putWord16be 2
  putWord16be size
  putWord16be (cmapLanguage cmap)
  putCodes 0 255 $ zip (map (fromIntegral.highByte) subTableCodes) [1::Word16 ..]
  for_ subTables $ \(SubTable2 _ fc ec ro _) ->
    do putWord16be fc
       putWord16be ec
       putWord16be 0
       putWord16be ro
  for_ subTables rangeBytes
    where
      highBytes :: [Int]
      highBytes =
        IS.toList $ fst $
        IS.split 255 (multiByte cmap)
      subTableCodes =
        filter ((/= 0) . entryCount) $ 
        flip map highBytes $ \hb ->
        let subMap = subIntMap (fromIntegral hb `shift` 8)
                     (fromIntegral hb `shift` 8 .|. 0xff) $
                     glyphMap cmap
            (fstCode, lstCode)
              | M.null subMap = (0, -1)
              | otherwise = (fst $ M.findMin subMap,
                             fst $ M.findMax subMap)
            ec = lstCode - fstCode + 1
            rb = subCodes subMap fstCode lstCode
        in SubTable2 (fromIntegral hb) (fromIntegral fstCode) (fromIntegral ec) 0 rb
          where
      subTables = scanl calcOffset firstTable subTableCodes
      firstTable =
        SubTable2 0 0 256 (fromIntegral $ length subTableCodes * 8 + 2) $
        subCodes (glyphMap cmap) 0 255
      size :: Word16
      size = 518 + 8 * (fromIntegral $ length subTables) + 2 * sum (map entryCount subTables)
      calcOffset prev st = st { rangeOffset = rangeOffset prev - 8 + 2*entryCount prev }
        
getMap2 :: Strict.ByteString -> Either String CMap
getMap2 bs = do
  lang <- index16 bs 0
  highBytes <- do
    l <- traverse (index16 bs) [1..256]
    Right $ map fst $ filter ((/=0).snd) $ zip [0::Int ..255] l
  l <- for [0::Word16 .. fromIntegral $ length highBytes] $ \i -> do
    fstCode <- index16 bs (fromIntegral $ 257 + i*4)
    cnt <- index16 bs (fromIntegral $ 257 + i*4 + 1)
    delta <- index16 bs (fromIntegral $ 257 + i*4 + 2)
    ro <- index16 bs (fromIntegral $ 257 + i*4 + 3)
    for [0 .. fromIntegral cnt-1] $ \entry -> do
      p <- index16 bs (fromIntegral $ 257 + i*4 + 3 + ro `quot` 2 + entry)
      Right (fromIntegral $ fstCode + entry, if p == 0 then 0 else p + delta)
  let im = M.fromAscList $ filter ((/= 0).snd) $ concat l
      is = IS.fromAscList $ map fromIntegral highBytes
  Right $ CMap UnicodePlatform 0 lang MapFormat2 is im      
      
data Segment4 = RangeSegment Word16 Word16 Word16
              | CodeSegment Word16 Word16 [Word16]
              deriving Show

findRange :: Word32 -> Int64 -> [(Word32, Word16)] -> (Word32, [(Word32, Word16)])
findRange nextI _ [] =
  (nextI-1, [])
findRange nextI offset l@((i,c):r)
  | i == nextI && c == fromIntegral (fromIntegral nextI+offset) =
    findRange (nextI+1) offset r
  | otherwise = (nextI-1, l)

findCodes :: Word32 -> [(Word32, Word16)] -> ([GlyphID], [(Word32, Word16)])
findCodes _ [] = ([], [])
findCodes prevI l@((i,c):r)
  -- maximum gap is 4
  | i - prevI > 4 = ([], l)
  | otherwise = (replicate (fromIntegral $ i-prevI-1) 0 ++ c:c2, r2)
  where (c2, r2) = findCodes i r

getSegments :: [(Word32, Word16)] -> [Segment4]
getSegments [] = [RangeSegment 0xffff 1 0]
getSegments l@((start, c):_)
  | fromIntegral end - start >= 4 ||
    lc <= end-start+1 = 
      RangeSegment (fromIntegral start) (fromIntegral end-fromIntegral start+1) c :
      getSegments r
  | otherwise =
      CodeSegment (fromIntegral start) (fromIntegral lc) codes :
      getSegments r2
  where
    lc = fromIntegral $ length codes
    (end, r) = findRange start (fromIntegral c - fromIntegral start) l
    (codes, r2) = findCodes (start-1) l

data Segment4layout = Segment4layout {
  s4endCode :: Word16,
  s4startCode :: Word16,
  s4idDelta :: Word16,
  s4idRangeOffset :: Word16,
  s4glyphIndex :: [GlyphID] }
  deriving Show
  
putMap4 :: CMap -> PutM ()
putMap4 cmap = do
  putWord16be 4
  putWord16be size
  putWord16be (cmapLanguage cmap)
  putWord16be (segCount*2)
  putWord16be searchRange
  putWord16be entrySelector
  putWord16be $ 2*segCount - searchRange
  traverse_ (put.s4endCode) layout
  putWord16be 0
  traverse_ (put.s4startCode) layout
  traverse_ (put.s4idDelta) layout
  traverse_ (put.s4idRangeOffset) layout
  traverse_ (traverse_ put.s4glyphIndex) layout
    where
      size, segCount, searchRange, entrySelector :: Word16
      entrySelector = iLog2 segCount
      searchRange = 1 `shift` (fromIntegral $ entrySelector+1)
      segments = getSegments $ M.toList $ subIntMap 0 0xffff $ glyphMap cmap
      (codeSize, layout) = mapAccumL foldLayout (segCount*2) segments
      foldLayout offset (RangeSegment start len code) =
        (offset-2, Segment4layout (fromIntegral $ start+len-1)
                   (fromIntegral start) (code-(fromIntegral start)) 0 [])
      foldLayout offset (CodeSegment start len codes) =
        (offset+fromIntegral len*2-2,
         Segment4layout (fromIntegral $ start+len-1)
         (fromIntegral start) 0 offset codes)
      size = 8*segCount + codeSize + 16
      segCount = fromIntegral $ length segments

getMap4 :: Strict.ByteString -> Either String CMap
getMap4 bs = do
    lang <- index16 bs 0
    segCount <- (`quot` 2) <$> index16 bs 1
    gmap <- fmap (M.fromAscList . filter ((/= 0).snd) . concat ) $
            for [0::Word16 .. segCount-2] $ \i ->
      do idDelta <- index16 bs (i + 6 + segCount*2)
         endCode <- index16 bs (i + 5)
         startCode <- index16 bs (i + 6 + segCount)
         idRangeOffset <- index16 bs (i + 6 + segCount*3)
         if idRangeOffset == 0
           then Right [(fromIntegral c, c+idDelta) | c <- [startCode .. endCode]]
           else for [0..endCode-startCode] $ \j ->
           do glyph <- index16 bs (fromIntegral $ i + 6 + segCount*3 + idRangeOffset`div`2 + j)
              Right (fromIntegral $ startCode + j, glyph)
    Right $ CMap UnicodePlatform 0 lang MapFormat4 IS.empty gmap

putMap6 :: CMap -> PutM ()
putMap6 cmap = do
  putWord16be 6
  putWord16be size
  putWord16be (cmapLanguage cmap)
  putWord16be fCode
  putWord16be eCount
  subCodes (glyphMap cmap) (fromIntegral fCode) (fromIntegral lastCode)
  where
    size, eCount, fCode, lastCode :: Word16
    size = eCount*2 + 10
    eCount = lastCode - fCode + 1
    fCode = fromIntegral $
      min (fromIntegral (maxBound :: Word16)::Word32) $
      fst $ M.findMin (glyphMap cmap)
    lastCode = fromIntegral $
      min (fromIntegral (maxBound :: Word16)::Word32) $
      fst $ M.findMax (glyphMap cmap)
    
getMap6 :: Strict.ByteString -> Either String CMap
getMap6 bs = do
  lang <- index16 bs 0
  fCode <- index16 bs 1
  eCount <- index16 bs 2
  gmap <- fmap (M.fromAscList . filter ((/= 0).snd)) $
          for [0..eCount-1] $ \i -> do
    g <- index16 bs (i+3)
    Right (fromIntegral $ i + fCode, g)
  Right $ CMap UnicodePlatform 0 lang MapFormat6 IS.empty gmap

putPacked :: Int -> [Int] -> Put
putPacked start highBytes
  | start >= 8192 = return ()
  | otherwise = do
      putWord8 $ foldl' (.|.) 0 $ map ((shift 1) . (.&. 7)) bytes
      putPacked (start+1) rest
        where
          (bytes, rest) = span (\b -> b >= (start*8) &&
                                      b < (start+1)*8)
                          highBytes

readPacked :: Strict.ByteString -> [Int]
readPacked bs =
  [i .|. b `shift` 3 |
   (a, b) <- zip (Strict.unpack bs) [0..8191],
   i <- [0..7],
   a .&. (1 `shift` i) /= 0
  ]
      

findRanges :: [(Word32, GlyphID)] -> [(Word32, Word32, GlyphID)]
findRanges [] = []
findRanges l@((i,c):_) = (i, i2, c) : findRanges next
  where (i2, next) = findRange i (fromIntegral c-fromIntegral i) l
  
putMap8 :: CMap -> PutM ()
putMap8 cmap = do
  putWord16be 8
  putWord16be 0
  putWord32be $ fromIntegral size
  putWord32be (fromIntegral $ cmapLanguage cmap)
  putPacked 0 highBytes
  putWord32be $ fromIntegral nGroups
  for_ ranges $ \(start, end, code) -> do
    putWord32be $ fromIntegral start
    putWord32be $ fromIntegral end
    putWord32be $ fromIntegral code
  where
    size = nGroups * 12 + 8208
    highBytes = IS.toList $ multiByte cmap
    ranges = findRanges $ M.toList $ glyphMap cmap
    nGroups = length ranges

getMap8 :: Strict.ByteString -> Either String CMap
getMap8 bs = do
  _ <- index16 bs 0
  lang <- index16 bs 1
  let is = IS.fromAscList $ readPacked (Strict.drop 4 bs)
  nGroups <- index32 bs 2049
  gmap <- fmap (M.fromAscList . concat) $ for [0..nGroups-1] $ \i -> do
    start <- index32 bs (i*3 + 2050)
    end <- index32 bs (i*3 + 2051)
    glyph <- index32 bs (i*3 + 2052)
    return [(fromIntegral c, fromIntegral $ glyph+c-start) | c <- [start .. end]]
  Right $ CMap UnicodePlatform 0 lang MapFormat8 is gmap

getMap10 :: Strict.ByteString -> Either String CMap
getMap10 bs = do
  lang <- index32 bs 0
  fCode <- index32 bs 1
  eCount <- index32 bs 2
  gmap <- fmap (M.fromAscList . filter ((/= 0).snd)) $
          for [0..eCount-1] $ \i -> do
    g <- index16 bs (fromIntegral i+6)
    Right (fromIntegral $ i + fCode, g)
  Right $ CMap UnicodePlatform 0 (fromIntegral lang) MapFormat6 IS.empty gmap


putMap10 :: CMap -> Put
putMap10 cmap = do
  putWord16be 10
  putWord16be 0
  putWord32be size
  putWord32be $ fromIntegral $ cmapLanguage cmap
  putWord32be fCode
  putWord32be eCount
  subCodes (glyphMap cmap) (fromIntegral fCode) (fromIntegral lastCode)
  where
    size, eCount, fCode, lastCode :: Word32
    size = eCount*2 + 20
    eCount = lastCode - fCode  + 1
    fCode = fromIntegral $ fst $ M.findMin $ glyphMap cmap
    lastCode = fromIntegral $ fst $ M.findMax $ glyphMap cmap

putMap12 :: CMap -> PutM ()
putMap12 cmap = do
  putWord16be 12
  putWord16be 0
  putWord32be $ fromIntegral size
  putWord32be (fromIntegral $ cmapLanguage cmap)
  putWord32be $ fromIntegral nGroups
  for_ ranges $ \(start, end, code) -> do
    putWord32be $ fromIntegral start
    putWord32be $ fromIntegral end
    putWord32be $ fromIntegral code
  where
    size = nGroups * 12 + 16
    ranges = findRanges $ M.toList $ glyphMap cmap
    nGroups = length ranges

getMap12 :: Strict.ByteString -> Either String CMap
getMap12 bs = do
  _ <- index16 bs 0
  lang <- index16 bs 1
  nGroups <- index32 bs 1
  gmap <- fmap (M.fromAscList . concat) $ for [0..nGroups-1] $ \i -> do
    start <- index32 bs (i*3 + 2)
    end <- index32 bs (i*3 + 3)
    glyph <- index32 bs (i*3 + 4)
    return [(fromIntegral c, fromIntegral $ glyph+c-start) | c <- [start .. end]]
  Right $ CMap UnicodePlatform 0 lang MapFormat8 IS.empty gmap