{-# LANGUAGE TypeFamilies #-}

-- | Extract limited nested parallelism for execution inside
-- individual kernel threadblocks.
module Futhark.Pass.ExtractKernels.Intrablock (intrablockParallelise) where

import Control.Monad
import Control.Monad.RWS
import Control.Monad.Trans.Maybe
import Data.Map.Strict qualified as M
import Data.Set qualified as S
import Futhark.Analysis.PrimExp.Convert
import Futhark.IR.GPU hiding (HistOp)
import Futhark.IR.GPU.Op qualified as GPU
import Futhark.IR.SOACS
import Futhark.MonadFreshNames
import Futhark.Pass.ExtractKernels.BlockedKernel
import Futhark.Pass.ExtractKernels.DistributeNests
import Futhark.Pass.ExtractKernels.Distribution
import Futhark.Pass.ExtractKernels.ToGPU
import Futhark.Tools
import Futhark.Transform.FirstOrderTransform qualified as FOT
import Futhark.Util.Log
import Prelude hiding (log)

-- | Convert the statements inside a map nest to kernel statements,
-- attempting to parallelise any remaining (top-level) parallel
-- statements.  Anything that is not a map, scan or reduction will
-- simply be sequentialised.  This includes sequential loops that
-- contain maps, scans or reduction.  In the future, we could probably
-- do something more clever.  Make sure that the amount of parallelism
-- to be exploited does not exceed the group size.  Further, as a hack
-- we also consider the size of all intermediate arrays as
-- "parallelism to be exploited" to avoid exploding shared memory.
--
-- We distinguish between "minimum group size" and "maximum
-- exploitable parallelism".
intrablockParallelise ::
  (MonadFreshNames m, LocalScope GPU m) =>
  KernelNest ->
  Lambda SOACS ->
  m
    ( Maybe
        ( (SubExp, SubExp),
          SubExp,
          Log,
          Stms GPU,
          Stms GPU
        )
    )
intrablockParallelise knest lam = runMaybeT $ do
  (ispace, inps) <- lift $ flatKernel knest

  (num_tblocks, w_stms) <-
    lift $
      runBuilder $
        letSubExp "intra_num_tblocks"
          =<< foldBinOp (Mul Int64 OverflowUndef) (intConst Int64 1) (map snd ispace)

  let body = lambdaBody lam

  tblock_size <- newVName "computed_tblock_size"
  (wss_min, wss_avail, log, kbody) <-
    lift . localScope (scopeOfLParams $ lambdaParams lam) $
      intrablockParalleliseBody body

  outside_scope <- lift askScope
  -- outside_scope may also contain the inputs, even though those are
  -- not actually available outside the kernel.
  let available v =
        v `M.member` outside_scope
          && v `notElem` map kernelInputName inps
  unless (all available $ namesToList $ freeIn (wss_min ++ wss_avail)) $
    fail "Irregular parallelism"

  ((intra_avail_par, kspace, read_input_stms), prelude_stms) <- lift $
    runBuilder $ do
      let foldBinOp' _ [] = eSubExp $ intConst Int64 1
          foldBinOp' bop (x : xs) = foldBinOp bop x xs
      ws_min <-
        mapM (letSubExp "one_intra_par_min" <=< foldBinOp' (Mul Int64 OverflowUndef)) $
          filter (not . null) wss_min
      ws_avail <-
        mapM (letSubExp "one_intra_par_avail" <=< foldBinOp' (Mul Int64 OverflowUndef)) $
          filter (not . null) wss_avail

      -- The amount of parallelism available *in the worst case* is
      -- equal to the smallest parallel loop, or *at least* 1.
      intra_avail_par <-
        letSubExp "intra_avail_par" =<< foldBinOp' (SMin Int64) ws_avail

      -- The group size is either the maximum of the minimum parallelism
      -- exploited, or the desired parallelism (bounded by the max group
      -- size) in case there is no minimum.
      letBindNames [tblock_size]
        =<< if null ws_min
          then
            eBinOp
              (SMin Int64)
              (eSubExp =<< letSubExp "max_tblock_size" (Op $ SizeOp $ GetSizeMax SizeThreadBlock))
              (eSubExp intra_avail_par)
          else foldBinOp' (SMax Int64) ws_min

      let inputIsUsed input = kernelInputName input `nameIn` freeIn body
          used_inps = filter inputIsUsed inps

      addStms w_stms
      read_input_stms <- runBuilder_ $ mapM readGroupKernelInput used_inps
      space <- SegSpace <$> newVName "phys_tblock_id" <*> pure ispace
      pure (intra_avail_par, space, read_input_stms)

  let kbody' = kbody {kernelBodyStms = read_input_stms <> kernelBodyStms kbody}

  let nested_pat = loopNestingPat first_nest
      rts = map (length ispace `stripArray`) $ patTypes nested_pat
      grid = KernelGrid (Count num_tblocks) (Count $ Var tblock_size)
      lvl = SegBlock SegNoVirt (Just grid)
      kstm =
        Let nested_pat aux $ Op $ SegOp $ SegMap lvl kspace rts kbody'

  let intra_min_par = intra_avail_par
  pure
    ( (intra_min_par, intra_avail_par),
      Var tblock_size,
      log,
      prelude_stms,
      oneStm kstm
    )
  where
    first_nest = fst knest
    aux = loopNestingAux first_nest

readGroupKernelInput ::
  (DistRep (Rep m), MonadBuilder m) =>
  KernelInput ->
  m ()
readGroupKernelInput inp
  | Array {} <- kernelInputType inp = do
      v <- newVName $ baseString $ kernelInputName inp
      readKernelInput inp {kernelInputName = v}
      letBindNames [kernelInputName inp] $ BasicOp $ Replicate mempty $ Var v
  | otherwise =
      readKernelInput inp

data IntraAcc = IntraAcc
  { accMinPar :: S.Set [SubExp],
    accAvailPar :: S.Set [SubExp],
    accLog :: Log
  }

instance Semigroup IntraAcc where
  IntraAcc min_x avail_x log_x <> IntraAcc min_y avail_y log_y =
    IntraAcc (min_x <> min_y) (avail_x <> avail_y) (log_x <> log_y)

instance Monoid IntraAcc where
  mempty = IntraAcc mempty mempty mempty

type IntrablockM =
  BuilderT GPU (RWS () IntraAcc VNameSource)

instance MonadLogger IntrablockM where
  addLog log = tell mempty {accLog = log}

runIntrablockM ::
  (MonadFreshNames m, HasScope GPU m) =>
  IntrablockM () ->
  m (IntraAcc, Stms GPU)
runIntrablockM m = do
  scope <- castScope <$> askScope
  modifyNameSource $ \src ->
    let (((), kstms), src', acc) = runRWS (runBuilderT m scope) () src
     in ((acc, kstms), src')

parallelMin :: [SubExp] -> IntrablockM ()
parallelMin ws =
  tell
    mempty
      { accMinPar = S.singleton ws,
        accAvailPar = S.singleton ws
      }

intrablockBody :: Body SOACS -> IntrablockM (Body GPU)
intrablockBody body = do
  stms <- collectStms_ $ intrablockStms $ bodyStms body
  pure $ mkBody stms $ bodyResult body

intrablockLambda :: Lambda SOACS -> IntrablockM (Lambda GPU)
intrablockLambda lam =
  mkLambda (lambdaParams lam) $
    bodyBind =<< intrablockBody (lambdaBody lam)

intrablockWithAccInput :: WithAccInput SOACS -> IntrablockM (WithAccInput GPU)
intrablockWithAccInput (shape, arrs, Nothing) =
  pure (shape, arrs, Nothing)
intrablockWithAccInput (shape, arrs, Just (lam, nes)) = do
  lam' <- intrablockLambda lam
  pure (shape, arrs, Just (lam', nes))

intrablockStm :: Stm SOACS -> IntrablockM ()
intrablockStm stm@(Let pat aux e) = do
  scope <- askScope
  let lvl = SegThreadInBlock SegNoVirt

  case e of
    Loop merge form loopbody ->
      localScope (scopeOfLoopForm form <> scopeOfFParams (map fst merge)) $ do
        loopbody' <- intrablockBody loopbody
        certifying (stmAuxCerts aux) . letBind pat $
          Loop merge form loopbody'
    Match cond cases defbody ifdec -> do
      cases' <- mapM (traverse intrablockBody) cases
      defbody' <- intrablockBody defbody
      certifying (stmAuxCerts aux) . letBind pat $
        Match cond cases' defbody' ifdec
    WithAcc inputs lam -> do
      inputs' <- mapM intrablockWithAccInput inputs
      lam' <- intrablockLambda lam
      certifying (stmAuxCerts aux) . letBind pat $ WithAcc inputs' lam'
    Op soac
      | "sequential_outer" `inAttrs` stmAuxAttrs aux ->
          intrablockStms . fmap (certify (stmAuxCerts aux))
            =<< runBuilder_ (FOT.transformSOAC pat soac)
    Op (Screma w arrs form)
      | Just lam <- isMapSOAC form -> do
          let loopnest = MapNesting pat aux w $ zip (lambdaParams lam) arrs
              env =
                DistEnv
                  { distNest =
                      singleNesting $ Nesting mempty loopnest,
                    distScope =
                      scopeOfPat pat
                        <> scopeForGPU (scopeOf lam)
                        <> scope,
                    distOnInnerMap =
                      distributeMap,
                    distOnTopLevelStms =
                      liftInner . collectStms_ . intrablockStms,
                    distSegLevel = \minw _ _ -> do
                      lift $ parallelMin minw
                      pure lvl,
                    distOnSOACSStms =
                      pure . oneStm . soacsStmToGPU,
                    distOnSOACSLambda =
                      pure . soacsLambdaToGPU
                  }
              acc =
                DistAcc
                  { distTargets = singleTarget (pat, bodyResult $ lambdaBody lam),
                    distStms = mempty
                  }

          addStms
            =<< runDistNestT env (distributeMapBodyStms acc (bodyStms $ lambdaBody lam))
    Op (Screma w arrs form)
      | Just (scans, mapfun) <- isScanomapSOAC form,
        -- FIXME: Futhark.CodeGen.ImpGen.GPU.Block.compileGroupOp
        -- cannot handle multiple scan operators yet.
        Scan scanfun nes <- singleScan scans -> do
          let scanfun' = soacsLambdaToGPU scanfun
              mapfun' = soacsLambdaToGPU mapfun
          certifying (stmAuxCerts aux) $
            addStms =<< segScan lvl pat mempty w [SegBinOp Noncommutative scanfun' nes mempty] mapfun' arrs [] []
          parallelMin [w]
    Op (Screma w arrs form)
      | Just (reds, map_lam) <- isRedomapSOAC form -> do
          let onReduce (Reduce comm red_lam nes) =
                SegBinOp comm (soacsLambdaToGPU red_lam) nes mempty
              reds' = map onReduce reds
              map_lam' = soacsLambdaToGPU map_lam
          certifying (stmAuxCerts aux) $
            addStms =<< segRed lvl pat mempty w reds' map_lam' arrs [] []
          parallelMin [w]
    Op (Screma w arrs form) ->
      -- This screma is too complicated for us to immediately do
      -- anything, so split it up and try again.
      mapM_ intrablockStm . fmap (certify (stmAuxCerts aux)) . snd
        =<< runBuilderT (dissectScrema pat w form arrs) (scopeForSOACs scope)
    Op (Hist w arrs ops bucket_fun) -> do
      ops' <- forM ops $ \(HistOp num_bins rf dests nes op) -> do
        (op', nes', shape) <- determineReduceOp op nes
        let op'' = soacsLambdaToGPU op'
        pure $ GPU.HistOp num_bins rf dests nes' shape op''

      let bucket_fun' = soacsLambdaToGPU bucket_fun
      certifying (stmAuxCerts aux) $
        addStms =<< segHist lvl pat w [] [] ops' bucket_fun' arrs
      parallelMin [w]
    Op (Stream w arrs accs lam)
      | chunk_size_param : _ <- lambdaParams lam -> do
          types <- asksScope castScope
          ((), stream_stms) <-
            runBuilderT (sequentialStreamWholeArray pat w accs lam arrs) types
          let replace (Var v) | v == paramName chunk_size_param = w
              replace se = se
              replaceSets (IntraAcc x y log) =
                IntraAcc (S.map (map replace) x) (S.map (map replace) y) log
          censor replaceSets $ intrablockStms stream_stms
    Op (Scatter w ivs lam dests) -> do
      write_i <- newVName "write_i"
      space <- mkSegSpace [(write_i, w)]

      let lam' = soacsLambdaToGPU lam
          krets = do
            (_a_w, a, is_vs) <-
              groupScatterResults dests $ bodyResult $ lambdaBody lam'
            let cs =
                  foldMap (foldMap resCerts . fst) is_vs
                    <> foldMap (resCerts . snd) is_vs
                is_vs' = [(Slice $ map (DimFix . resSubExp) is, resSubExp v) | (is, v) <- is_vs]
            pure $ WriteReturns cs a is_vs'
          inputs = do
            (p, p_a) <- zip (lambdaParams lam') ivs
            pure $ KernelInput (paramName p) (paramType p) p_a [Var write_i]

      kstms <- runBuilder_ $
        localScope (scopeOfSegSpace space) $ do
          mapM_ readKernelInput inputs
          addStms $ bodyStms $ lambdaBody lam'

      certifying (stmAuxCerts aux) $ do
        let body = KernelBody () kstms krets
        letBind pat $ Op $ SegOp $ SegMap lvl space (patTypes pat) body

      parallelMin [w]
    _ ->
      addStm $ soacsStmToGPU stm

intrablockStms :: Stms SOACS -> IntrablockM ()
intrablockStms = mapM_ intrablockStm

intrablockParalleliseBody ::
  (MonadFreshNames m, HasScope GPU m) =>
  Body SOACS ->
  m ([[SubExp]], [[SubExp]], Log, KernelBody GPU)
intrablockParalleliseBody body = do
  (IntraAcc min_ws avail_ws log, kstms) <-
    runIntrablockM $ intrablockStms $ bodyStms body
  pure
    ( S.toList min_ws,
      S.toList avail_ws,
      log,
      KernelBody () kstms $ map ret $ bodyResult body
    )
  where
    ret (SubExpRes cs se) = Returns ResultMaySimplify cs se