{-# LANGUAGE MagicHash #-}
-- | Evaluate an array in parallel in an interleaved fashion,
--   with each processor computing alternate elements.
module Data.Array.Repa.Eval.Interleaved
        ( fillInterleavedP)
where
import Data.Array.Repa.Eval.Gang
import GHC.Exts
import Prelude          as P


-- | Fill something in parallel, interleaving the work between threads.
--
--   * With a gang of @n@ threads, the worker for thread @t@ computes the
--     elements at indices @t@, @t + n@, @t + 2n@, ... so neighbouring
--     elements are handled by different threads.
--
--   * When the number of elements is not a multiple of @n@, the threads
--     numbered below @len `rem` n@ each compute one extra element.
--
fillInterleavedP
        :: Int                  -- ^ Number of elements.
        -> (Int -> a -> IO ())  -- ^ Update function to write into result buffer.
        -> (Int -> a)           -- ^ Fn to get the value at a given index.
        -> IO ()

{-# INLINE [0] fillInterleavedP #-}
fillInterleavedP !(I# len) write getElem
 =      gangIO theGang worker
 where
        -- Decide how to split the work across the threads: the gang size
        -- is also the stride between the indices visited by one thread.
        !(I# stride)    = gangSize theGang

        -- Every thread computes at least this many elements.
        !perThread      = len `quotInt#` stride

        -- Number of leftover elements; each of the first few threads
        -- takes one of them.
        !extras         = len `remInt#`  stride

        -- Action handed to 'gangIO' (presumably run once per gang thread
        -- with that thread's number): start at the thread's own number
        -- and visit every 'stride'-th index from there.
        worker (I# tid)
         = loop tid (quota tid)
        {-# INLINE worker #-}

        -- How many elements the given thread has to compute.
        quota tid
         = case tid <# extras of
            1# -> perThread +# 1#
            _  -> perThread
        {-# INLINE quota #-}

        -- Write the element at the current index, then advance by
        -- 'stride' until this thread's quota is used up.
        loop !ix !remaining
         = case remaining <=# 0# of
            1# -> return ()
            _  -> do
                write (I# ix) (getElem (I# ix))
                loop (ix +# stride) (remaining -# 1#)