module Futhark.CodeGen.ImpGen.Multicore.SegHist
  ( compileSegHist,
  )
where

import Control.Monad
import Data.List (zip4, zip5)
import qualified Futhark.CodeGen.ImpCode.Multicore as Imp
import Futhark.CodeGen.ImpGen
import Futhark.CodeGen.ImpGen.Multicore.Base
import Futhark.CodeGen.ImpGen.Multicore.SegRed (compileSegRed')
import Futhark.IR.MCMem
import Futhark.MonadFreshNames
import Futhark.Util (chunks, splitFromEnd, takeLast)
import Futhark.Util.IntegralExp (rem)
import Prelude hiding (quot, rem)

compileSegHist ::
  Pattern MCMem ->
  SegSpace ->
  [HistOp MCMem] ->
  KernelBody MCMem ->
  TV Int32 ->
  MulticoreGen Imp.Code
compileSegHist pat space histops kbody nsubtasks
  | [_] <- unSegSpace space =
    nonsegmentedHist pat space histops kbody nsubtasks
  | otherwise =
    segmentedHist pat space histops kbody

-- | Split some list into chunks equal to the number of values
-- returned by each 'HistOp'.
segHistOpChunks :: [HistOp lore] -> [a] -> [[a]]
segHistOpChunks = chunks . map (length . histNeutral)

nonsegmentedHist ::
  Pattern MCMem ->
  SegSpace ->
  [HistOp MCMem] ->
  KernelBody MCMem ->
  TV Int32 ->
  MulticoreGen Imp.Code
nonsegmentedHist pat space histops kbody num_histos = do
  let ns = map snd $ unSegSpace space
      ns_64 = map toInt64Exp ns
      num_histos' = tvExp num_histos
      hist_width = toInt64Exp $ histWidth $ head histops
      use_subhistogram = sExt64 num_histos' * hist_width .<=. product ns_64

  histops' <- renameHistOpLambda histops

  -- Only do something if there is actually input.
  collect $
    sUnless (product ns_64 .==. 0) $ do
      flat_idx <- dPrim "iter" int64
      sIf
        use_subhistogram
        (subHistogram pat flat_idx space histops num_histos kbody)
        (atomicHistogram pat flat_idx space histops' kbody)

-- | Atomic histogram approach.  The implementation has three
-- sub-strategies, depending on the type of the operator:
--
-- 1. If the values are integral scalars, a directly supported atomic
--    update is used.
--
-- 2. If the values fit in a single memory location, e.g. a float,
--    then a CAS operation is used to perform the update, where the
--    float is cast to an integral scalar.
--
-- Strategies 1 and 2 currently only work for 32-bit and 64-bit
-- types, although GCC supports 8-, 16- and 128-bit types as well.
--
-- 3. Otherwise, a locking-based approach is used.
onOpAtomic :: HistOp MCMem -> MulticoreGen ([VName] -> [Imp.TExp Int64] -> MulticoreGen ())
onOpAtomic op = do
  atomics <- hostAtomics <$> askEnv
  let lambda = histOp op
      do_op = atomicUpdateLocking atomics lambda
  case do_op of
    AtomicPrim f -> return f
    AtomicCAS f -> return f
    AtomicLocking f -> do
      -- Allocate a static array of locks, as in the GPU backend.
      let num_locks = 100151 -- This number is taken from the GPU backend.
          dims =
            map toInt64Exp $
              shapeDims (histShape op) ++ [histWidth op]
      locks <-
        sStaticArray "hist_locks" DefaultSpace int32 $
          Imp.ArrayZeros num_locks
      let l' = Locking locks 0 1 0 (pure . (`rem` fromIntegral num_locks) . flattenIndex dims)
      return $ f l'
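-- For intuition, the locking scheme above maps a (flattened) bucket
-- index onto the fixed-size lock array by a simple modulo hash.  A
-- rough standalone sketch (the name 'lockIdx' is hypothetical; the
-- actual code inlines this computation into the 'Locking' value):
--
-- > lockIdx :: [Imp.TExp Int64] -> [Imp.TExp Int64] -> Imp.TExp Int64
-- > lockIdx dims bucket = flattenIndex dims bucket `rem` 100151
--
-- Distinct buckets may hash to the same lock; such collisions only
-- cost contention, never correctness.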
atomicHistogram ::
  Pattern MCMem ->
  TV Int64 ->
  SegSpace ->
  [HistOp MCMem] ->
  KernelBody MCMem ->
  MulticoreGen ()
atomicHistogram pat flat_idx space histops kbody = do
  let (is, ns) = unzip $ unSegSpace space
      ns_64 = map toInt64Exp ns
  let num_red_res = length histops + sum (map (length . histNeutral) histops)
      (all_red_pes, map_pes) = splitAt num_red_res $ patternValueElements pat

  atomicOps <- mapM onOpAtomic histops

  body <- collect $ do
    zipWithM_ dPrimV_ is $ unflattenIndex ns_64 $ tvExp flat_idx
    compileStms mempty (kernelBodyStms kbody) $ do
      let (red_res, map_res) = splitFromEnd (length map_pes) $ kernelBodyResult kbody
          perOp = chunks $ map (length . histDest) histops
          (buckets, vs) = splitAt (length histops) red_res

      let pes_per_op = chunks (map (length . histDest) histops) all_red_pes
      forM_ (zip5 histops (perOp vs) buckets atomicOps pes_per_op) $
        \(HistOp dest_w _ _ _ shape lam, vs', bucket, do_op, dest_res) -> do
          let (_is_params, vs_params) = splitAt (length vs') $ lambdaParams lam
              dest_w' = toInt64Exp dest_w
              bucket' = toInt64Exp $ kernelResultSubExp bucket
              bucket_in_bounds = bucket' .<. dest_w' .&&. 0 .<=. bucket'

          sComment "save map-out results" $
            forM_ (zip map_pes map_res) $ \(pe, res) ->
              copyDWIMFix (patElemName pe) (map Imp.vi64 is) (kernelResultSubExp res) []

          sComment "perform updates" $
            sWhen bucket_in_bounds $ do
              let bucket_is = map Imp.vi64 (init is) ++ [bucket']
              dLParams $ lambdaParams lam
              sLoopNest shape $ \is' -> do
                forM_ (zip vs_params vs') $ \(p, res) ->
                  copyDWIMFix (paramName p) [] (kernelResultSubExp res) is'
                do_op (map patElemName dest_res) (bucket_is ++ is')

  free_params <- freeParams body (segFlat space : [tvVar flat_idx])
  emit $ Imp.Op $ Imp.ParLoop "atomic_seg_hist" (tvVar flat_idx) mempty body mempty free_params $ segFlat space

updateHisto :: HistOp MCMem -> [VName] -> [Imp.TExp Int64] -> MulticoreGen ()
updateHisto op arrs bucket = do
  let acc_params = take (length arrs) $ lambdaParams $ histOp op
      bind_acc_params =
        forM_ (zip acc_params arrs) $ \(acc_p, arr) ->
          copyDWIMFix (paramName acc_p) [] (Var arr) bucket
      op_body = compileBody' [] $ lambdaBody $ histOp op
      writeArray arr val = copyDWIMFix arr bucket val []
      do_hist = zipWithM_ writeArray arrs $ bodyResult $ lambdaBody $ histOp op

  sComment "Start of body" $ do
    dLParams acc_params
    bind_acc_params
    op_body
    do_hist
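-- To see when the subhistogram strategy below is chosen over the
-- atomic one, recall the heuristic from 'nonsegmentedHist':
--
-- > use_subhistogram = sExt64 num_histos' * hist_width .<=. product ns_64
--
-- As an illustrative example (the numbers are made up): with 8
-- subtasks and a histogram of width 1000 over 10^6 input elements,
-- 8 * 1000 <= 10^6 holds, so every task gets a private copy of the
-- histogram; with a width of 10^6 the inequality fails, and the
-- atomic strategy is used instead.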
-- Generates num_histos sub-histograms of the same size as the
-- destination histogram.  Each chunk of the input then updates its
-- own subhistogram, and the subhistograms are finally combined
-- through a segmented reduction across the histogram indices.
-- This is expected to be fast if len(histDest) is small.
subHistogram ::
  Pattern MCMem ->
  TV Int64 ->
  SegSpace ->
  [HistOp MCMem] ->
  TV Int32 ->
  KernelBody MCMem ->
  MulticoreGen ()
subHistogram pat flat_idx space histops num_histos kbody = do
  emit $ Imp.DebugPrint "subHistogram segHist" Nothing

  let (is, ns) = unzip $ unSegSpace space
      ns_64 = map toInt64Exp ns

  let pes = patternElements pat
      num_red_res = length histops + sum (map (length . histNeutral) histops)
      map_pes = drop num_red_res pes
      per_red_pes = segHistOpChunks histops $ patternValueElements pat

  -- Allocate the array of subhistograms in the calling thread.  Each
  -- task will work in its own private allocations (to avoid false
  -- sharing), but this is where they will ultimately copy their
  -- results.
  global_subhistograms <- forM histops $ \histop ->
    forM (histType histop) $ \t -> do
      let shape = Shape [tvSize num_histos] <> arrayShape t
      sAllocArray "subhistogram" (elemType t) shape DefaultSpace

  let tid' = Imp.vi64 $ segFlat space
      flat_idx' = tvExp flat_idx

  (local_subhistograms, prebody) <- collect' $ do
    zipWithM_ dPrimV_ is $ unflattenIndex ns_64 $ sExt64 flat_idx'

    forM (zip per_red_pes histops) $ \(pes', histop) -> do
      op_local_subhistograms <- forM (histType histop) $ \t ->
        sAllocArray "subhistogram" (elemType t) (arrayShape t) DefaultSpace

      forM_ (zip3 pes' op_local_subhistograms (histNeutral histop)) $ \(pe, hist, ne) ->
        -- The first thread initializes its histogram with the
        -- destination values; the others initialize with the neutral
        -- element.
        sIf
          (tid' .==. 0)
          (copyDWIMFix hist [] (Var $ patElemName pe) [])
          ( sFor "i" (toInt64Exp $ histWidth histop) $ \i ->
              sLoopNest (histShape histop) $ \vec_is ->
                copyDWIMFix hist (i : vec_is) ne []
          )

      return op_local_subhistograms

  -- Generate the loop body of the parallel function.
  body <- collect $ do
    zipWithM_ dPrimV_ is $ unflattenIndex ns_64 $ sExt64 flat_idx'
    compileStms mempty (kernelBodyStms kbody) $ do
      let (red_res, map_res) = splitFromEnd (length map_pes) $ kernelBodyResult kbody
          (buckets, vs) = splitAt (length histops) red_res
          perOp = chunks $ map (length . histDest) histops

      sComment "save map-out results" $
        forM_ (zip map_pes map_res) $ \(pe, res) ->
          copyDWIMFix (patElemName pe) (map Imp.vi64 is) (kernelResultSubExp res) []

      forM_ (zip4 histops local_subhistograms buckets (perOp vs)) $
        \( histop@(HistOp dest_w _ _ _ shape lam),
           histop_subhistograms,
           bucket,
           vs'
           ) -> do
            let bucket' = toInt64Exp $ kernelResultSubExp bucket
                dest_w' = toInt64Exp dest_w
                bucket_in_bounds = bucket' .<. dest_w' .&&. 0 .<=. bucket'
                vs_params = takeLast (length vs') $ lambdaParams lam
                bucket_is = [bucket']

            sComment "perform updates" $
              sWhen bucket_in_bounds $ do
                dLParams $ lambdaParams lam
                sLoopNest shape $ \is' -> do
                  forM_ (zip vs_params vs') $ \(p, res) ->
                    copyDWIMFix (paramName p) [] (kernelResultSubExp res) is'
                  updateHisto histop histop_subhistograms (bucket_is ++ is')

  -- Copy the task-local subhistograms to the global subhistograms,
  -- where they will be combined.
  postbody <- collect $
    forM_ (zip (concat global_subhistograms) (concat local_subhistograms)) $
      \(global, local) -> copyDWIMFix global [tid'] (Var local) []

  free_params <- freeParams (prebody <> body <> postbody) (segFlat space : [tvVar flat_idx])
  let (body_allocs, body') = extractAllocations body
  emit $ Imp.Op $ Imp.ParLoop "seghist_stage_1" (tvVar flat_idx) (body_allocs <> prebody) body' postbody free_params $ segFlat space

  -- Perform a segmented reduction over the subhistograms.
  forM_ (zip3 per_red_pes global_subhistograms histops) $ \(red_pes, hists, op) -> do
    bucket_id <- newVName "bucket_id"
    subhistogram_id <- newVName "subhistogram_id"
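    -- Conceptually, the global subhistograms for one operator form an
    -- array of shape [num_histos][histWidth op], and the reduction
    -- combines them pointwise per bucket; in pseudocode:
    --
    -- > result[b] = foldl (histOp op) ne [subhisto[h][b] | h <- [0 .. num_histos - 1]]
    --
    -- The bucket is an outer (parallel) dimension of the SegSpace and
    -- the subhistogram index the innermost one, so the fold over
    -- subhistograms runs sequentially within each bucket.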
    let num_buckets = histWidth op
        segred_space =
          SegSpace (segFlat space) $
            segment_dims
              ++ [(bucket_id, num_buckets)]
              ++ [(subhistogram_id, tvSize num_histos)]
        segred_op = SegBinOp Noncommutative (histOp op) (histNeutral op) (histShape op)

    nsubtasks_red <- dPrim "num_tasks" $ IntType Int32
    red_code <- compileSegRed' (Pattern [] red_pes) segred_space [segred_op] nsubtasks_red $ \red_cont ->
      red_cont $
        flip map hists $ \subhisto ->
          ( Var subhisto,
            map Imp.vi64 $
              map fst segment_dims ++ [subhistogram_id, bucket_id]
          )

    let ns_red = map (toInt64Exp . snd) $ unSegSpace segred_space
        iterations = product $ init ns_red
        -- The segmented reduction is sequential over the innermost
        -- dimension.
        scheduler_info = Imp.SchedulerInfo (tvVar nsubtasks_red) (untyped iterations) Imp.Static
        red_task = Imp.ParallelTask red_code $ segFlat space
    free_params_red <- freeParams red_code [segFlat space, tvVar nsubtasks_red]
    emit $ Imp.Op $ Imp.Segop "seghist_red" free_params_red red_task Nothing mempty scheduler_info
  where
    segment_dims = init $ unSegSpace space

-- This implementation of segmented histograms only parallelises
-- over the segments; each segment is updated sequentially.
segmentedHist ::
  Pattern MCMem ->
  SegSpace ->
  [HistOp MCMem] ->
  KernelBody MCMem ->
  MulticoreGen Imp.Code
segmentedHist pat space histops kbody = do
  emit $ Imp.DebugPrint "Segmented segHist" Nothing

  -- Iteration variable over the segments.
  segments_i <- dPrim "segment_iter" $ IntType Int64
  collect $ do
    par_body <- compileSegHistBody (tvExp segments_i) pat space histops kbody
    free_params <- freeParams par_body [segFlat space, tvVar segments_i]
    let (body_allocs, body') = extractAllocations par_body
    emit $ Imp.Op $ Imp.ParLoop "segmented_hist" (tvVar segments_i) body_allocs body' mempty free_params $ segFlat space

compileSegHistBody ::
  Imp.TExp Int64 ->
  Pattern MCMem ->
  SegSpace ->
  [HistOp MCMem] ->
  KernelBody MCMem ->
  MulticoreGen Imp.Code
compileSegHistBody idx pat space histops kbody = do
  let (is, ns) = unzip $ unSegSpace space
      ns_64 = map toInt64Exp ns

  let num_red_res = length histops + sum (map (length . histNeutral) histops)
      map_pes = drop num_red_res $ patternValueElements pat
      per_red_pes = segHistOpChunks histops $ patternValueElements pat

  collect $ do
    let inner_bound = last ns_64

    sFor "i" inner_bound $ \i -> do
      zipWithM_ dPrimV_ (init is) $ unflattenIndex (init ns_64) idx
      dPrimV_ (last is) i

      compileStms mempty (kernelBodyStms kbody) $ do
        let (red_res, map_res) =
              splitFromEnd (length map_pes) $
                map kernelResultSubExp $ kernelBodyResult kbody
            (buckets, vs) = splitAt (length histops) red_res
            perOp = chunks $ map (length . histDest) histops

        forM_ (zip4 per_red_pes histops (perOp vs) buckets) $
          \(red_pes, HistOp dest_w _ _ _ shape lam, vs', bucket) -> do
            let (is_params, vs_params) = splitAt (length vs') $ lambdaParams lam
                bucket' = toInt64Exp bucket
                dest_w' = toInt64Exp dest_w
                bucket_in_bounds = bucket' .<. dest_w' .&&. 0 .<=. bucket'

            sComment "save map-out results" $
              forM_ (zip map_pes map_res) $ \(pe, res) ->
                copyDWIMFix (patElemName pe) (map Imp.vi64 is) res []

            sComment "perform updates" $
              sWhen bucket_in_bounds $ do
                dLParams $ lambdaParams lam
                sLoopNest shape $ \vec_is -> do
                  -- Read the current value at the bucket index.
                  let buck = toInt64Exp bucket
                  forM_ (zip red_pes is_params) $ \(pe, p) ->
                    copyDWIMFix (paramName p) [] (Var $ patElemName pe) (map Imp.vi64 (init is) ++ [buck] ++ vec_is)
                  -- Bind the incoming value at that index.
                  forM_ (zip vs_params vs') $ \(p, v) ->
                    copyDWIMFix (paramName p) [] v vec_is
                  compileStms mempty (bodyStms $ lambdaBody lam) $
                    forM_ (zip red_pes $ bodyResult $ lambdaBody lam) $
                      \(pe, se) ->
                        copyDWIMFix (patElemName pe) (map Imp.vi64 (init is) ++ [buck] ++ vec_is) se []
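-- As a usage sketch (illustrative only): for a two-dimensional space
-- [(i, n), (j, m)], 'segmentedHist' parallelises over the n outer
-- segments, while 'compileSegHistBody' runs the inner loop over m
-- sequentially within each segment.  In pseudocode:
--
-- > -- for each segment s in [0 .. n-1], in parallel:
-- > --   for j in [0 .. m-1], sequentially:
-- > --     hist[s][bucket (s, j)] <- op (hist[s][bucket (s, j)]) (value (s, j))
--
-- Because no two tasks ever touch the same segment of the histogram,
-- neither atomics nor subhistograms are needed here.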