{-# LANGUAGE MagicHash #-}
-- | Evaluate an array by dividing it into rectangular blocks and filling
--   each block in parallel.
module Data.Array.Repa.Eval.Cursored
        ( fillBlock2P
        , fillCursoredBlock2P
        , fillCursoredBlock2S )
where
import Data.Array.Repa.Index
import Data.Array.Repa.Shape
import Data.Array.Repa.Eval.Elt
import Data.Array.Repa.Eval.Gang
import GHC.Base


-- Non-cursored interface -----------------------------------------------------
-- | Fill a block in a rank-2 array in parallel.
--
--   * Blockwise filling can be more cache-efficient than linear filling for
--    rank-2 arrays.
--
--   * Coordinates given are of the filled edges of the block.
-- 
--   * We divide the block into columns, and give one column to each thread.
-- 
--   * Each column is filled in row major order from top to bottom.
--
fillBlock2P
        :: Elt a
        => (Int -> a -> IO ())  -- ^ Update function to write into result buffer.
        -> (DIM2 -> a)          -- ^ Function to evaluate the element at an index.
        -> Int#                 -- ^ Width of the whole array.
        -> Int#                 -- ^ x0 lower left corner of block to fill
        -> Int#                 -- ^ y0 
        -> Int#                 -- ^ w0 width of block to fill.
        -> Int#                 -- ^ h0 height of block to fill.
        -> IO ()

{-# INLINE [0] fillBlock2P #-}
fillBlock2P write getElem !imageWidth !x0 !y0 !w0 h0
 = fillCursoredBlock2P
        write id addDim getElem
        imageWidth x0 y0 w0 h0

{-
-- | Fill a block in a rank-2 array sequentially.
--
--   * Blockwise filling can be more cache-efficient than linear filling for
--    rank-2 arrays.
--
--   * Coordinates given are of the filled edges of the block.
-- 
--   * The block is filled in row major order from top to bottom.
--
fillBlock2S
        :: Elt a
        => (Int -> a -> IO ())  -- ^ Update function to write into result buffer.
        -> (DIM2 -> a)          -- ^ Function to evaluate the element at an index.
        -> Int#                 -- ^ Width of the whole array.
        -> Int#                 -- ^ x0 lower left corner of block to fill
        -> Int#                 -- ^ y0
        -> Int#                 -- ^ w0 width of block to fill
        -> Int#                 -- ^ h0 height of block to filll
        -> IO ()

{-# INLINE [0] fillBlock2S #-}
fillBlock2S write getElem !imageWidth !x0 !y0 !w0 !h0
 = fillCursoredBlock2S
        write id addDim getElem 
        imageWidth x0 y0 w0 h0
-}

-- Block filling ----------------------------------------------------------------------------------
-- | Fill a block in a rank-2 array in parallel.
-- 
--   * Blockwise filling can be more cache-efficient than linear filling for rank-2 arrays.
--
--   * Using cursor functions can help to expose inter-element indexing computations to
--     the GHC and LLVM optimisers.
--
--   * Coordinates given are of the filled edges of the block.
-- 
--   * We divide the block into columns, and give one column to each thread.
-- 
--   * Each column is filled in row major order from top to bottom.
--
fillCursoredBlock2P
        :: Elt a
        => (Int -> a -> IO ())          -- ^ Update function to write into result buffer.
        -> (DIM2   -> cursor)           -- ^ Make a cursor to a particular element.
        -> (DIM2   -> cursor -> cursor) -- ^ Shift the cursor by an offset.
        -> (cursor -> a)                -- ^ Function to evaluate the element at an index.
        -> Int#                         -- ^ Width of the whole array.
        -> Int#                         -- ^ x0 lower left corner of block to fill
        -> Int#                         -- ^ y0
        -> Int#                         -- ^ w0 width of block to fill
        -> Int#                         -- ^ h0 height of block to fill
        -> IO ()

{-# INLINE [0] fillCursoredBlock2P #-}
fillCursoredBlock2P
        write
        makeCursorFCB shiftCursorFCB getElemFCB
        !imageWidth !x0 !y0 !w0 !h0
 =      gangIO theGang fillBlock
 where
        !(I# threads)  = gangSize theGang

        -- All columns have at least this many pixels.
        !colChunkLen   = w0 `quotInt#` threads

        -- Extra pixels that we have to divide between some of the threads.
        !colChunkSlack = w0 `remInt#` threads

        -- Get the starting pixel of a column in the image.
        {-# INLINE colIx #-}
        colIx !ix
         | 1# <- ix <# colChunkSlack = x0 +# (ix *# (colChunkLen +# 1#))
         | otherwise                 = x0 +# (ix *# colChunkLen) +# colChunkSlack

        -- Give one column to each thread
        {-# INLINE fillBlock #-}
        fillBlock :: Int -> IO ()
        fillBlock !(I# ix)
         = let  !x0'      = colIx ix
                !w0'      = colIx (ix +# 1#) -# x0'
                !y0'      = y0
                !h0'      = h0
           in   fillCursoredBlock2S
                        write
                        makeCursorFCB shiftCursorFCB getElemFCB
                        imageWidth x0' y0' w0' h0'


-- | Fill a block in a rank-2 array, sequentially.
--
--   * Blockwise filling can be more cache-efficient than linear filling for rank-2 arrays.
--
--   * Using cursor functions can help to expose inter-element indexing computations to
--     the GHC and LLVM optimisers.
--
--   * Coordinates given are of the filled edges of the block.
--
--   * The block is filled in row major order from top to bottom.
--
fillCursoredBlock2S
        :: Elt a
        => (Int -> a -> IO ())          -- ^ Update function to write into result buffer.
        -> (DIM2   -> cursor)           -- ^ Make a cursor to a particular element.
        -> (DIM2   -> cursor -> cursor) -- ^ Shift the cursor by an offset.
        -> (cursor -> a)                -- ^ Function to evaluate an element at the given index.
        -> Int#                         -- ^ Width of the whole array.
        -> Int#                         -- ^ x0 lower left corner of block to fill.
        -> Int#                         -- ^ y0
        -> Int#                         -- ^ w0 width of block to fill
        -> Int#                         -- ^ h0 height of block to fill
        -> IO ()

{-# INLINE [0] fillCursoredBlock2S #-}
fillCursoredBlock2S
        write
        makeCursor shiftCursor getElem
        !imageWidth !x0 !y0 !w0 h0

 = do   fillBlock y0
 where  !x1     = x0 +# w0
        !y1     = y0 +# h0

        {-# INLINE fillBlock #-}
        fillBlock !y
         | 1# <- y >=# y1      = return ()
         | otherwise
         = do   fillLine4 x0
                fillBlock (y +# 1#)

         where  {-# INLINE fillLine4 #-}
                fillLine4 !x
                 | 1# <- x +# 4# >=# x1  = fillLine1 x
                 | otherwise
                 = do   -- Compute each source cursor based on the previous one so that
                        -- the variable live ranges in the generated code are shorter.
                        let srcCur0     = makeCursor  (Z :. (I# y) :. (I# x))
                        let srcCur1     = shiftCursor (Z :. 0 :. 1) srcCur0
                        let srcCur2     = shiftCursor (Z :. 0 :. 1) srcCur1
                        let srcCur3     = shiftCursor (Z :. 0 :. 1) srcCur2

                        -- Get the result value for each cursor.
                        let val0        = getElem srcCur0
                        let val1        = getElem srcCur1
                        let val2        = getElem srcCur2
                        let val3        = getElem srcCur3

                        -- Ensure that we've computed each of the result values before we
                        -- write into the array. If the backend code generator can't tell
                        -- our destination array doesn't alias with the source then writing
                        -- to it can prevent the sharing of intermediate computations.
                        touch val0
                        touch val1
                        touch val2
                        touch val3

                        -- Compute cursor into destination array.
                        let !dstCur0    = x +# (y *# imageWidth)
                        write (I# dstCur0)         val0
                        write (I# (dstCur0 +# 1#)) val1
                        write (I# (dstCur0 +# 2#)) val2
                        write (I# (dstCur0 +# 3#)) val3
                        fillLine4 (x +# 4#)

                {-# INLINE fillLine1 #-}
                fillLine1 !x
                 | 1# <- x >=# x1 = return ()
                 | otherwise
                 = do   let val0  = (getElem $ makeCursor (Z :. (I# y) :. (I# x)))
                        write (I# (x +# (y *# imageWidth))) val0
                        fillLine1 (x +# 1#)