{-# LANGUAGE TupleSections #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE CPP #-}
{-# OPTIONS_GHC -fspec-constr-count=5 #-}
-- | Module used for JPEG file loading and writing.
module Codec.Picture.Jpg( decodeJpeg
                        , encodeJpegAtQuality
                        , encodeJpeg
                        ) where

#if !MIN_VERSION_base(4,8,0)
import Control.Applicative( pure )
#endif

import Control.Applicative( (<$>) )

import Control.Arrow( (>>>) )
import Control.Monad( when, forM_ )
import Control.Monad.ST( ST, runST )
import Control.Monad.Trans( lift )
import Control.Monad.Trans.RWS.Strict( RWS, modify, tell, gets, execRWS )

import Data.Bits( (.|.), unsafeShiftL )
import Data.Int( Int16, Int32 )
import Data.Word(Word8, Word32)
import Data.Binary( Binary(..), encode )
import Data.STRef( newSTRef, writeSTRef, readSTRef )

import Data.Vector( (//) )
import Data.Vector.Unboxed( (!) )
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU
import qualified Data.Vector.Storable as VS
import qualified Data.Vector.Storable.Mutable as M
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as L

import Codec.Picture.InternalHelper
import Codec.Picture.BitWriter
import Codec.Picture.Types
import Codec.Picture.Jpg.Types
import Codec.Picture.Jpg.Common
import Codec.Picture.Jpg.Progressive
import Codec.Picture.Jpg.DefaultTable
import Codec.Picture.Jpg.FastDct

quantize :: MacroBlock Int16 -> MutableMacroBlock s Int32
         -> ST s (MutableMacroBlock s Int32)
quantize table block = update 0
  where update 64 = return block
        update idx = do
            val <- block `M.unsafeRead` idx
            let q = fromIntegral (table `VS.unsafeIndex` idx)
                finalValue = (val + (q `div` 2)) `quot` q -- rounded integer division
            (block `M.unsafeWrite` idx) finalValue
            update $ idx + 1


powerOf :: Int32 -> Word32
powerOf 0 = 0
powerOf n = limit 1 0
    where val = abs n
          limit range i | val < range = i
          limit range i = limit (2 * range) (i + 1)

encodeInt :: BoolWriteStateRef s -> Word32 -> Int32 -> ST s ()
{-# INLINE encodeInt #-}
encodeInt st ssss n | n > 0 = writeBits' st (fromIntegral n) (fromIntegral ssss)
encodeInt st ssss n         = writeBits' st (fromIntegral $ n - 1) (fromIntegral ssss)

-- | Assume the macro block is initialized with zeroes
acCoefficientsDecode :: HuffmanPackedTree -> MutableMacroBlock s Int16
                     -> BoolReader s (MutableMacroBlock s Int16)
acCoefficientsDecode acTree mutableBlock = parseAcCoefficient 1 >> return mutableBlock
  where parseAcCoefficient n | n >= 64 = return ()
                             | otherwise = do
            rrrrssss <- decodeRrrrSsss acTree
            case rrrrssss of
                (  0, 0) -> return ()
                (0xF, 0) -> parseAcCoefficient (n + 16)
                (rrrr, ssss) -> do
                    decoded <- fromIntegral <$> decodeInt ssss
                    lift $ (mutableBlock `M.unsafeWrite` (n + rrrr)) decoded
                    parseAcCoefficient (n + rrrr + 1)

-- | Decompress a macroblock from a bitstream given the current configuration
-- from the frame.
decompressMacroBlock :: HuffmanPackedTree   -- ^ Tree used for DC coefficient
                     -> HuffmanPackedTree   -- ^ Tree used for Ac coefficient
                     -> MacroBlock Int16    -- ^ Current quantization table
                     -> MutableMacroBlock s Int16    -- ^ A zigzag table, to avoid allocation
                     -> DcCoefficient       -- ^ Previous dc value
                     -> BoolReader s (DcCoefficient, MutableMacroBlock s Int16)
decompressMacroBlock dcTree acTree quantizationTable zigzagBlock previousDc = do
    dcDeltaCoefficient <- dcCoefficientDecode dcTree
    block <- lift createEmptyMutableMacroBlock
    let neoDcCoefficient = previousDc + dcDeltaCoefficient
    lift $ (block `M.unsafeWrite` 0) neoDcCoefficient
    fullBlock <- acCoefficientsDecode acTree block
    decodedBlock <- lift $ decodeMacroBlock quantizationTable zigzagBlock fullBlock
    return (neoDcCoefficient, decodedBlock)

pixelClamp :: Int16 -> Word8
pixelClamp n = fromIntegral . min 255 $ max 0 n

unpack444Y :: Int -- ^ component index
           -> Int -- ^ x
           -> Int -- ^ y
           -> MutableImage s PixelYCbCr8
           -> MutableMacroBlock s Int16
           -> ST s ()
unpack444Y _ x y (MutableImage { mutableImageWidth = imgWidth, mutableImageData = img })
                 block = blockVert baseIdx 0 zero
  where zero = 0 :: Int
        baseIdx = x * dctBlockSize + y * dctBlockSize * imgWidth

        blockVert        _       _ j | j >= dctBlockSize = return ()
        blockVert writeIdx readingIdx j = blockHoriz writeIdx readingIdx zero
          where blockHoriz   _ readIdx i | i >= dctBlockSize = blockVert (writeIdx + imgWidth) readIdx $ j + 1
                blockHoriz idx readIdx i = do
                    val <- pixelClamp <$> (block `M.unsafeRead` readIdx)
                    (img `M.unsafeWrite` idx) val
                    blockHoriz (idx + 1) (readIdx + 1) $ i + 1

unpack444Ycbcr :: Int -- ^ Component index
              -> Int -- ^ x
              -> Int -- ^ y
              -> MutableImage s PixelYCbCr8
              -> MutableMacroBlock s Int16
              -> ST s ()
unpack444Ycbcr compIdx x y
                 (MutableImage { mutableImageWidth = imgWidth, mutableImageData = img })
                 block = blockVert baseIdx 0 zero
  where zero = 0 :: Int
        baseIdx = (x * dctBlockSize + y * dctBlockSize * imgWidth) * 3 + compIdx

        blockVert   _       _ j | j >= dctBlockSize = return ()
        blockVert idx readIdx j = do
            val0 <- pixelClamp <$> (block `M.unsafeRead` readIdx)
            val1 <- pixelClamp <$> (block `M.unsafeRead` (readIdx + 1))
            val2 <- pixelClamp <$> (block `M.unsafeRead` (readIdx + 2))
            val3 <- pixelClamp <$> (block `M.unsafeRead` (readIdx + 3))
            val4 <- pixelClamp <$> (block `M.unsafeRead` (readIdx + 4))
            val5 <- pixelClamp <$> (block `M.unsafeRead` (readIdx + 5))
            val6 <- pixelClamp <$> (block `M.unsafeRead` (readIdx + 6))
            val7 <- pixelClamp <$> (block `M.unsafeRead` (readIdx + 7))

            (img `M.unsafeWrite` idx) val0
            (img `M.unsafeWrite` (idx + (3    ))) val1
            (img `M.unsafeWrite` (idx + (3 * 2))) val2
            (img `M.unsafeWrite` (idx + (3 * 3))) val3
            (img `M.unsafeWrite` (idx + (3 * 4))) val4
            (img `M.unsafeWrite` (idx + (3 * 5))) val5
            (img `M.unsafeWrite` (idx + (3 * 6))) val6
            (img `M.unsafeWrite` (idx + (3 * 7))) val7

            blockVert (idx + 3 * imgWidth) (readIdx + dctBlockSize) $ j + 1


          {-where blockHoriz   _ readIdx i | i >= 8 = blockVert (writeIdx + imgWidth * 3) readIdx $ j + 1-}
                {-blockHoriz idx readIdx i = do-}
                    {-val <- pixelClamp <$> (block `M.unsafeRead` readIdx) -}
                    {-(img `M.unsafeWrite` idx) val-}
                    {-blockHoriz (idx + 3) (readIdx + 1) $ i + 1-}

unpack421Ycbcr :: Int -- ^ Component index
               -> Int -- ^ x
               -> Int -- ^ y
               -> MutableImage s PixelYCbCr8
               -> MutableMacroBlock s Int16
               -> ST s ()
unpack421Ycbcr compIdx x y
                 (MutableImage { mutableImageWidth = imgWidth,
                                 mutableImageHeight = _, mutableImageData = img })
                 block = blockVert baseIdx 0 zero
  where zero = 0 :: Int
        baseIdx = (x * dctBlockSize + y * dctBlockSize * imgWidth) * 3 + compIdx
        lineOffset = imgWidth * 3

        blockVert        _       _ j | j >= dctBlockSize = return ()
        blockVert idx readIdx j = do
            v0 <- pixelClamp <$> (block `M.unsafeRead` readIdx)
            v1 <- pixelClamp <$> (block `M.unsafeRead` (readIdx + 1))
            v2 <- pixelClamp <$> (block `M.unsafeRead` (readIdx + 2))
            v3 <- pixelClamp <$> (block `M.unsafeRead` (readIdx + 3))
            v4 <- pixelClamp <$> (block `M.unsafeRead` (readIdx + 4))
            v5 <- pixelClamp <$> (block `M.unsafeRead` (readIdx + 5))
            v6 <- pixelClamp <$> (block `M.unsafeRead` (readIdx + 6))
            v7 <- pixelClamp <$> (block `M.unsafeRead` (readIdx + 7))

            (img `M.unsafeWrite` idx)       v0
            (img `M.unsafeWrite` (idx + 3)) v0

            (img `M.unsafeWrite` (idx + 6    ))      v1
            (img `M.unsafeWrite` (idx + 6     + 3))  v1

            (img `M.unsafeWrite` (idx + 6 * 2))      v2
            (img `M.unsafeWrite` (idx + 6 * 2 + 3))  v2

            (img `M.unsafeWrite` (idx + 6 * 3))      v3
            (img `M.unsafeWrite` (idx + 6 * 3 + 3))  v3

            (img `M.unsafeWrite` (idx + 6 * 4))      v4
            (img `M.unsafeWrite` (idx + 6 * 4 + 3))  v4

            (img `M.unsafeWrite` (idx + 6 * 5))      v5
            (img `M.unsafeWrite` (idx + 6 * 5 + 3))  v5

            (img `M.unsafeWrite` (idx + 6 * 6))      v6
            (img `M.unsafeWrite` (idx + 6 * 6 + 3))  v6

            (img `M.unsafeWrite` (idx + 6 * 7))      v7
            (img `M.unsafeWrite` (idx + 6 * 7 + 3))  v7

            blockVert (idx + lineOffset) (readIdx + dctBlockSize) $ j + 1

type Unpacker s = Int -- ^ component index
               -> Int -- ^ x
               -> Int -- ^ y
               -> MutableImage s PixelYCbCr8
               -> MutableMacroBlock s Int16
               -> ST s ()

type JpgScripter s a =
    RWS () [([(JpgUnpackerParameter, Unpacker s)], L.ByteString)] JpgDecoderState a

data JpgDecoderState = JpgDecoderState
    { dcDecoderTables       :: !(V.Vector HuffmanPackedTree)
    , acDecoderTables       :: !(V.Vector HuffmanPackedTree)
    , quantizationMatrices  :: !(V.Vector (MacroBlock Int16))
    , currentRestartInterv  :: !Int
    , currentFrame          :: Maybe JpgFrameHeader
    , minimumComponentIndex :: !Int
    , isProgressive         :: !Bool
    , maximumHorizontalResolution :: !Int
    , maximumVerticalResolution   :: !Int
    , seenBlobs                   :: !Int
    }

emptyDecoderState :: JpgDecoderState
emptyDecoderState = JpgDecoderState
    { dcDecoderTables =
        let (_, dcLuma) = prepareHuffmanTable DcComponent 0 defaultDcLumaHuffmanTable
            (_, dcChroma) = prepareHuffmanTable DcComponent 1 defaultDcChromaHuffmanTable
        in
        V.fromList [ dcLuma, dcChroma, dcLuma, dcChroma ]

    , acDecoderTables =
        let (_, acLuma) = prepareHuffmanTable AcComponent 0 defaultAcLumaHuffmanTable
            (_, acChroma) = prepareHuffmanTable AcComponent 1 defaultAcChromaHuffmanTable
        in
        V.fromList [acLuma, acChroma, acLuma, acChroma]

    , quantizationMatrices = V.replicate 4 (VS.replicate (8 * 8) 1)
    , currentRestartInterv = -1
    , currentFrame         = Nothing
    , minimumComponentIndex = 1
    , isProgressive        = False
    , maximumHorizontalResolution = 0
    , maximumVerticalResolution   = 0
    , seenBlobs = 0
    }

-- | This pseudo interpreter interpret the Jpg frame for the huffman,
-- quant table and restart interval parameters.
jpgMachineStep :: JpgFrame -> JpgScripter s ()
jpgMachineStep (JpgAppFrame _ _) = pure ()
jpgMachineStep (JpgExtension _ _) = pure ()
jpgMachineStep (JpgScanBlob hdr raw_data) = do
    let scanCount = length $ scans hdr
    params <- concat <$> mapM (scanSpecifier scanCount) (scans hdr)

    modify $ \st -> st { seenBlobs = seenBlobs st + 1 }
    tell [(params, raw_data)  ]
  where (selectionLow, selectionHigh) = spectralSelection hdr
        approxHigh = fromIntegral $ successiveApproxHigh hdr
        approxLow = fromIntegral $ successiveApproxLow hdr

        
        scanSpecifier scanCount scanSpec = do
            minimumIndex <- gets minimumComponentIndex
            let maximumHuffmanTable = 4
                dcIndex = min (maximumHuffmanTable - 1) 
                            . fromIntegral $ dcEntropyCodingTable scanSpec
                acIndex = min (maximumHuffmanTable - 1)
                            . fromIntegral $ acEntropyCodingTable scanSpec
                comp = fromIntegral (componentSelector scanSpec) - minimumIndex

            dcTree <- gets $ (V.! dcIndex) . dcDecoderTables
            acTree <- gets $ (V.! acIndex) . acDecoderTables
            isProgressiveImage <- gets isProgressive
            maxiW <- gets maximumHorizontalResolution 
            maxiH <- gets maximumVerticalResolution
            restart <- gets currentRestartInterv
            frameInfo <- gets currentFrame
            blobId <- gets seenBlobs                   
            case frameInfo of
              Nothing -> fail "Jpg decoding error - no previous frame"
              Just v -> do
                 let compDesc = jpgComponents v !! comp
                     compCount = length $ jpgComponents v
                     xSampling = fromIntegral $ horizontalSamplingFactor compDesc
                     ySampling = fromIntegral $ verticalSamplingFactor compDesc
                     componentSubSampling =
                        (maxiW - xSampling + 1, maxiH - ySampling + 1)
                     (xCount, yCount)
                        | scanCount > 1 || isProgressiveImage = (xSampling, ySampling)
                        | otherwise = (1, 1)

                 pure [ (JpgUnpackerParameter
                         { dcHuffmanTree = dcTree
                         , acHuffmanTree = acTree
                         , componentIndex = comp
                         , restartInterval = fromIntegral restart
                         , componentWidth = xSampling
                         , componentHeight = ySampling
                         , subSampling = componentSubSampling
                         , successiveApprox = (approxLow, approxHigh)
                         , readerIndex = blobId
                         , indiceVector =
                             if scanCount == 1 then 0 else 1
                         , coefficientRange =
                             ( fromIntegral selectionLow
                             , fromIntegral selectionHigh )
                         , blockIndex = y * ySampling + x
                         , blockMcuX = x
                         , blockMcuY = y
                         }, unpackerDecision compCount componentSubSampling)
                             | y <- [0 .. yCount - 1]
                             , x <- [0 .. xCount - 1] ]

jpgMachineStep (JpgScans kind hdr) = modify $ \s ->
   s { currentFrame = Just hdr
     , minimumComponentIndex =
          fromIntegral $ minimum [componentIdentifier comp | comp <- jpgComponents hdr]
     , isProgressive = case kind of
            JpgProgressiveDCTHuffman -> True
            _ -> False
     , maximumHorizontalResolution =
         fromIntegral $ maximum horizontalResolutions
     , maximumVerticalResolution =
         fromIntegral $ maximum verticalResolutions
     }
    where components = jpgComponents hdr
          horizontalResolutions = map horizontalSamplingFactor components
          verticalResolutions = map verticalSamplingFactor components
jpgMachineStep (JpgIntervalRestart restart) =
    modify $ \s -> s { currentRestartInterv = fromIntegral restart }
jpgMachineStep (JpgHuffmanTable tables) = mapM_ placeHuffmanTrees tables
  where placeHuffmanTrees (spec, tree) = case huffmanTableClass spec of
            DcComponent -> modify $ \s ->
              if idx >= V.length (dcDecoderTables s) then s
              else
                let neu = dcDecoderTables s // [(idx, tree)] in 
                s { dcDecoderTables = neu }
                    where idx = fromIntegral $ huffmanTableDest spec
                          
            AcComponent -> modify $ \s ->
              if idx >= V.length (acDecoderTables s) then s
              else
                s { acDecoderTables = acDecoderTables s // [(idx, tree)] }
                    where idx = fromIntegral $ huffmanTableDest spec

jpgMachineStep (JpgQuantTable tables) = mapM_ placeQuantizationTables tables
  where placeQuantizationTables table = do
            let idx = fromIntegral $ quantDestination table
                tableData = quantTable table
            modify $ \s ->
                s { quantizationMatrices =  quantizationMatrices s // [(idx, tableData)] }

unpackerDecision :: Int -> (Int, Int) -> Unpacker s
unpackerDecision 1 (1, 1) = unpack444Y
unpackerDecision _ (1, 1) = unpack444Ycbcr
unpackerDecision _ (2, 1) = unpack421Ycbcr
unpackerDecision compCount (xScalingFactor, yScalingFactor) =
    unpackMacroBlock compCount xScalingFactor yScalingFactor

decodeImage :: JpgFrameHeader
            -> V.Vector (MacroBlock Int16)
            -> [([(JpgUnpackerParameter, Unpacker s)], L.ByteString)]
            -> MutableImage s PixelYCbCr8 -- ^ Result image to write into
            -> ST s ()
decodeImage frame quants lst outImage = do
  let compCount = length $ jpgComponents frame
  zigZagArray <- createEmptyMutableMacroBlock
  dcArray <- M.replicate compCount 0  :: ST s (M.STVector s DcCoefficient)
  resetCounter <- newSTRef restartIntervalValue

  forM_ lst $ \(params, str) -> do
    let componentsInfo = V.fromList params
        compReader = initBoolStateJpg . B.concat $ L.toChunks str
        maxiW = maximum [fst $ subSampling c | (c,_) <- params]
        maxiH = maximum [snd $ subSampling c | (c,_) <- params]

        imageBlockWidth = (imgWidth + 7) `div` 8
        imageBlockHeight = (imgHeight + 7) `div` 8

        imageMcuWidth = (imageBlockWidth + (maxiW - 1)) `div` maxiW
        imageMcuHeight = (imageBlockHeight + (maxiH - 1)) `div` maxiH

    execBoolReader compReader $ rasterMap imageMcuWidth imageMcuHeight $ \x y -> do
      resetLeft <- lift $ readSTRef resetCounter
      if resetLeft == 0 then do
        lift $ M.set dcArray 0
        byteAlignJpg
        _restartCode <- decodeRestartInterval
        lift $ resetCounter `writeSTRef` (restartIntervalValue - 1)
      else
        lift $ resetCounter `writeSTRef` (resetLeft - 1)

      V.forM_ componentsInfo $ \(comp, unpack) -> do
        let compIdx = componentIndex comp
            dcTree = dcHuffmanTree comp
            acTree = acHuffmanTree comp
            quantId = fromIntegral .  quantizationTableDest
                    $ jpgComponents frame !! compIdx
            qTable = quants V.! min 3 quantId
            xd = blockMcuX comp
            yd = blockMcuY comp
            (subX, subY) = subSampling comp
        dc <- lift $ dcArray `M.unsafeRead` compIdx
        (dcCoeff, block) <-
              decompressMacroBlock dcTree acTree qTable zigZagArray $ fromIntegral dc
        lift $ (dcArray `M.unsafeWrite` compIdx) dcCoeff
        let verticalLimited = y == imageMcuHeight - 1
        if (x == imageMcuWidth - 1) || verticalLimited then
          lift $ unpackMacroBlock imgComponentCount
                                  subX subY compIdx
                                  (x * maxiW + xd) (y * maxiH + yd) outImage block
        else
          lift $ unpack compIdx (x * maxiW + xd) (y * maxiH + yd) outImage block

  where imgComponentCount = length $ jpgComponents frame

        imgWidth = fromIntegral $ jpgWidth frame
        imgHeight = fromIntegral $ jpgHeight frame
        restartIntervalValue = case lst of
                ((p,_):_,_): _ -> restartInterval p
                _ -> -1

gatherImageKind :: [JpgFrame] -> Maybe JpgImageKind
gatherImageKind lst = case [k | JpgScans k _ <- lst, isDctSpecifier k] of
    [JpgBaselineDCTHuffman] -> Just BaseLineDCT
    [JpgProgressiveDCTHuffman] -> Just ProgressiveDCT
    _ -> Nothing
  where isDctSpecifier JpgProgressiveDCTHuffman = True
        isDctSpecifier JpgBaselineDCTHuffman = True
        isDctSpecifier _ = False

gatherScanInfo :: JpgImage -> (JpgFrameKind, JpgFrameHeader)
gatherScanInfo img = head [(a, b) | JpgScans a b <- jpgFrame img]

-- | Try to decompress a jpeg file and decompress. The colorspace is still
-- YCbCr if you want to perform computation on the luma part. You can
-- convert it to RGB using 'convertImage' from the 'ColorSpaceConvertible'
-- typeclass.
--
-- This function can output the following pixel types :
--
--    * PixelY8
--
--    * PixelYCbCr8
--
decodeJpeg :: B.ByteString -> Either String DynamicImage
decodeJpeg file = case runGetStrict get file of
  Left err -> Left err
  Right img -> case (compCount, imgKind) of
                 (_, Nothing) -> Left "Unknown Jpg kind"
                 (3, Just ProgressiveDCT) -> Right . ImageYCbCr8 $ decodeProgressive
                 (1, Just BaseLineDCT) -> Right . ImageY8 $ Image imgWidth imgHeight pixelData
                 (3, Just BaseLineDCT) -> Right . ImageYCbCr8 $ Image imgWidth imgHeight pixelData
                 _ -> Left "Wrong component count"

      where compCount = length $ jpgComponents scanInfo
            (_,scanInfo) = gatherScanInfo img

            imgKind = gatherImageKind $ jpgFrame img
            imgWidth = fromIntegral $ jpgWidth scanInfo
            imgHeight = fromIntegral $ jpgHeight scanInfo

            imageSize = imgWidth * imgHeight * compCount

            decodeProgressive = runST $ do
                let (st, wrotten) =
                        execRWS (mapM_ jpgMachineStep (jpgFrame img)) () emptyDecoderState
                    Just fHdr = currentFrame st
                progressiveUnpack
                    (maximumHorizontalResolution st, maximumVerticalResolution st)
                    fHdr
                    (quantizationMatrices st)
                    wrotten >>= unsafeFreezeImage

            pixelData = runST $ do
                let (st, wrotten) =
                        execRWS (mapM_ jpgMachineStep (jpgFrame img)) () emptyDecoderState
                    Just fHdr = currentFrame st
                resultImage <- M.new imageSize
                let wrapped = MutableImage imgWidth imgHeight resultImage
                decodeImage 
                    fHdr
                    (quantizationMatrices st)
                    wrotten
                    wrapped
                VS.unsafeFreeze resultImage

extractBlock :: Image PixelYCbCr8       -- ^ Source image
             -> MutableMacroBlock s Int16      -- ^ Mutable block where to put extracted block
             -> Int                     -- ^ Plane
             -> Int                     -- ^ X sampling factor
             -> Int                     -- ^ Y sampling factor
             -> Int                     -- ^ Sample per pixel
             -> Int                     -- ^ Block x
             -> Int                     -- ^ Block y
             -> ST s (MutableMacroBlock s Int16)
extractBlock (Image { imageWidth = w, imageHeight = h, imageData = src })
             block 1 1 sampCount plane bx by | (bx * dctBlockSize) + 7 < w && (by * 8) + 7 < h = do
    let baseReadIdx = (by * dctBlockSize * w) + bx * dctBlockSize
    sequence_ [(block `M.unsafeWrite` (y * dctBlockSize + x)) val
                        | y <- [0 .. dctBlockSize - 1]
                        , let blockReadIdx = baseReadIdx + y * w
                        , x <- [0 .. dctBlockSize - 1]
                        , let val = fromIntegral $ src `VS.unsafeIndex` ((blockReadIdx + x) * sampCount + plane)
                        ]
    return block
extractBlock (Image { imageWidth = w, imageHeight = h, imageData = src })
             block sampWidth sampHeight sampCount plane bx by = do
    let accessPixel x y | x < w && y < h = let idx = (y * w + x) * sampCount + plane in src `VS.unsafeIndex` idx
                        | x >= w = accessPixel (w - 1) y
                        | otherwise = accessPixel x (h - 1)

        pixelPerCoeff = fromIntegral $ sampWidth * sampHeight

        blockVal x y = sum [fromIntegral $ accessPixel (xBase + dx) (yBase + dy)
                                | dy <- [0 .. sampHeight - 1]
                                , dx <- [0 .. sampWidth - 1] ] `div` pixelPerCoeff
            where xBase = blockXBegin + x * sampWidth
                  yBase = blockYBegin + y * sampHeight

        blockXBegin = bx * dctBlockSize * sampWidth
        blockYBegin = by * dctBlockSize * sampHeight

    sequence_ [(block `M.unsafeWrite` (y * dctBlockSize + x)) $ blockVal x y | y <- [0 .. 7], x <- [0 .. 7] ]
    return block

serializeMacroBlock :: BoolWriteStateRef s
                    -> HuffmanWriterCode -> HuffmanWriterCode
                    -> MutableMacroBlock s Int32
                    -> ST s ()
serializeMacroBlock !st !dcCode !acCode !blk =
 (blk `M.unsafeRead` 0) >>= (fromIntegral >>> encodeDc) >> writeAcs (0, 1) >> return ()
  where writeAcs acc@(_, 63) =
            (blk `M.unsafeRead` 63) >>= (fromIntegral >>> encodeAcCoefs acc) >> return ()
        writeAcs acc@(_, i ) =
            (blk `M.unsafeRead`  i) >>= (fromIntegral >>> encodeAcCoefs acc) >>= writeAcs

        encodeDc n = writeBits' st (fromIntegral code) (fromIntegral bitCount)
                        >> when (ssss /= 0) (encodeInt st ssss n)
            where ssss = powerOf $ fromIntegral n
                  (bitCount, code) = dcCode `V.unsafeIndex` fromIntegral ssss

        encodeAc 0         0 = writeBits' st (fromIntegral code) $ fromIntegral bitCount
            where (bitCount, code) = acCode `V.unsafeIndex` 0

        encodeAc zeroCount n | zeroCount >= 16 =
          writeBits' st (fromIntegral code) (fromIntegral bitCount) >>  encodeAc (zeroCount - 16) n
            where (bitCount, code) = acCode `V.unsafeIndex` 0xF0
        encodeAc zeroCount n =
          writeBits' st (fromIntegral code) (fromIntegral bitCount) >> encodeInt st ssss n
            where rrrr = zeroCount `unsafeShiftL` 4
                  ssss = powerOf $ fromIntegral n
                  rrrrssss = rrrr .|. ssss
                  (bitCount, code) = acCode `V.unsafeIndex` fromIntegral rrrrssss

        encodeAcCoefs (            _, 63) 0 = encodeAc 0 0 >> return (0, 64)
        encodeAcCoefs (zeroRunLength,  i) 0 = return (zeroRunLength + 1, i + 1)
        encodeAcCoefs (zeroRunLength,  i) n =
            encodeAc zeroRunLength n >> return (0, i + 1)

encodeMacroBlock :: QuantificationTable
                 -> MutableMacroBlock s Int32
                 -> MutableMacroBlock s Int32
                 -> Int16
                 -> MutableMacroBlock s Int16
                 -> ST s (Int32, MutableMacroBlock s Int32)
encodeMacroBlock quantTableOfComponent workData finalData prev_dc block = do
 -- the inverse level shift is performed internally by the fastDCT routine
 blk <- fastDctLibJpeg workData block
        >>= zigZagReorderForward finalData
        >>= quantize quantTableOfComponent
 dc <- blk `M.unsafeRead` 0
 (blk `M.unsafeWrite` 0) $ dc - fromIntegral prev_dc
 return (dc, blk)

divUpward :: (Integral a) => a -> a -> a
divUpward n dividor = val + (if rest /= 0 then 1 else 0)
    where (val, rest) = n `divMod` dividor

prepareHuffmanTable :: DctComponent -> Word8 -> HuffmanTable
                    -> (JpgHuffmanTableSpec, HuffmanPackedTree)
prepareHuffmanTable classVal dest tableDef =
   (JpgHuffmanTableSpec { huffmanTableClass = classVal
                        , huffmanTableDest  = dest
                        , huffSizes = sizes
                        , huffCodes = V.fromListN 16
                            [VU.fromListN (fromIntegral $ sizes ! i) lst
                                                | (i, lst) <- zip [0..] tableDef ]
                        }, VS.singleton 0)
      where sizes = VU.fromListN 16 $ map (fromIntegral . length) tableDef

-- | Encode an image in jpeg at a reasonnable quality level.
-- If you want better quality or reduced file size, you should
-- use `encodeJpegAtQuality`
encodeJpeg :: Image PixelYCbCr8 -> L.ByteString
encodeJpeg = encodeJpegAtQuality 50

defaultHuffmanTables :: [(JpgHuffmanTableSpec, HuffmanPackedTree)]
defaultHuffmanTables =
    [ prepareHuffmanTable DcComponent 0 defaultDcLumaHuffmanTable
    , prepareHuffmanTable AcComponent 0 defaultAcLumaHuffmanTable
    , prepareHuffmanTable DcComponent 1 defaultDcChromaHuffmanTable
    , prepareHuffmanTable AcComponent 1 defaultAcChromaHuffmanTable
    ]

-- | Function to call to encode an image to jpeg.
-- The quality factor should be between 0 and 100 (100 being
-- the best quality).
encodeJpegAtQuality :: Word8                -- ^ Quality factor
                    -> Image PixelYCbCr8    -- ^ Image to encode
                    -> L.ByteString         -- ^ Encoded JPEG
encodeJpegAtQuality quality img@(Image { imageWidth = w, imageHeight = h }) = encode finalImage
  where finalImage = JpgImage [ JpgQuantTable quantTables
                              , JpgScans JpgBaselineDCTHuffman hdr
                              , JpgHuffmanTable defaultHuffmanTables
                              , JpgScanBlob scanHeader encodedImage
                              ]

        outputComponentCount = 3

        scanHeader = scanHeader'{ scanLength = fromIntegral $ calculateSize scanHeader' }
        scanHeader' = JpgScanHeader
            { scanLength = 0
            , scanComponentCount = outputComponentCount
            , scans = [ JpgScanSpecification { componentSelector = 1
                                             , dcEntropyCodingTable = 0
                                             , acEntropyCodingTable = 0
                                             }
                      , JpgScanSpecification { componentSelector = 2
                                             , dcEntropyCodingTable = 1
                                             , acEntropyCodingTable = 1
                                             }
                      , JpgScanSpecification { componentSelector = 3
                                             , dcEntropyCodingTable = 1
                                             , acEntropyCodingTable = 1
                                             }
                      ]

            , spectralSelection = (0, 63)
            , successiveApproxHigh = 0
            , successiveApproxLow  = 0
            }

        hdr = hdr' { jpgFrameHeaderLength   = fromIntegral $ calculateSize hdr' }
        hdr' = JpgFrameHeader { jpgFrameHeaderLength   = 0
                              , jpgSamplePrecision     = 8
                              , jpgHeight              = fromIntegral h
                              , jpgWidth               = fromIntegral w
                              , jpgImageComponentCount = outputComponentCount
                              , jpgComponents          = [
                                    JpgComponent { componentIdentifier      = 1
                                                 , horizontalSamplingFactor = 2
                                                 , verticalSamplingFactor   = 2
                                                 , quantizationTableDest    = 0
                                                 }
                                  , JpgComponent { componentIdentifier      = 2
                                                 , horizontalSamplingFactor = 1
                                                 , verticalSamplingFactor   = 1
                                                 , quantizationTableDest    = 1
                                                 }
                                  , JpgComponent { componentIdentifier      = 3
                                                 , horizontalSamplingFactor = 1
                                                 , verticalSamplingFactor   = 1
                                                 , quantizationTableDest    = 1
                                                 }
                                  ]
                              }

        lumaQuant = scaleQuantisationMatrix (fromIntegral quality)
                        defaultLumaQuantizationTable
        chromaQuant = scaleQuantisationMatrix (fromIntegral quality)
                            defaultChromaQuantizationTable

        zigzagedLumaQuant = zigZagReorderForwardv lumaQuant
        zigzagedChromaQuant = zigZagReorderForwardv chromaQuant
        quantTables = [ JpgQuantTableSpec { quantPrecision = 0, quantDestination = 0
                                          , quantTable = zigzagedLumaQuant }
                      , JpgQuantTableSpec { quantPrecision = 0, quantDestination = 1
                                          , quantTable = zigzagedChromaQuant }
                      ]

        encodedImage = runST $ do
            let horizontalMetaBlockCount =
                    w `divUpward` (dctBlockSize * maxSampling)
                verticalMetaBlockCount =
                    h `divUpward` (dctBlockSize * maxSampling)
                maxSampling = 2
                lumaSamplingSize = ( maxSampling, maxSampling, zigzagedLumaQuant
                                   , makeInverseTable defaultDcLumaHuffmanTree
                                   , makeInverseTable defaultAcLumaHuffmanTree)
                chromaSamplingSize = ( maxSampling - 1, maxSampling - 1, zigzagedChromaQuant
                                     , makeInverseTable defaultDcChromaHuffmanTree
                                     , makeInverseTable defaultAcChromaHuffmanTree)
                componentDef = [lumaSamplingSize, chromaSamplingSize, chromaSamplingSize]

                imageComponentCount = length componentDef

            dc_table <- M.replicate 3 0
            block <- createEmptyMutableMacroBlock
            workData <- createEmptyMutableMacroBlock
            zigzaged <- createEmptyMutableMacroBlock
            writeState <- newWriteStateRef

            -- It's ugly, I know, be avoid allocation
            let blockDecoder mx my = component $ zip [0..] componentDef
                  where component [] = return ()
                        component ((comp, (sizeX, sizeY, table, dc, ac)) : comp_rest) =
                           rasterMap sizeX sizeY decoder >> component comp_rest
                          where xSamplingFactor = maxSampling - sizeX + 1
                                ySamplingFactor = maxSampling - sizeY + 1
                                extractor = extractBlock img block xSamplingFactor ySamplingFactor imageComponentCount

                                decoder subX subY = do
                                  let blockY = my * sizeY + subY
                                      blockX = mx * sizeX + subX
                                  prev_dc <- dc_table `M.unsafeRead` comp
                                  (dc_coeff, neo_block) <- extractor comp blockX blockY >>=
                                                          encodeMacroBlock table workData zigzaged prev_dc
                                  (dc_table `M.unsafeWrite` comp) $ fromIntegral dc_coeff
                                  serializeMacroBlock writeState dc ac neo_block

            rasterMap 
                horizontalMetaBlockCount verticalMetaBlockCount
                blockDecoder

            finalizeBoolWriter writeState