module Futhark.CodeGen.ImpGen.Multicore.Base
  ( extractAllocations,
    compileThreadResult,
    HostEnv (..),
    AtomicBinOp,
    MulticoreGen,
    decideScheduling,
    decideScheduling',
    groupResultArrays,
    renameSegBinOp,
    freeParams,
    renameHistOpLambda,
    atomicUpdateLocking,
    AtomicUpdate (..),
    Locking (..),
    getSpace,
    getIterationDomain,
    getReturnParams,
    segOpString,
  )
where

import Control.Monad
import Data.Bifunctor
import Data.List (elemIndex, find)
import Data.Maybe
import qualified Futhark.CodeGen.ImpCode.Multicore as Imp
import Futhark.CodeGen.ImpGen
import Futhark.Error
import Futhark.IR.MCMem
import Futhark.Transform.Rename
import Futhark.Util (maybeNth)
import Prelude hiding (quot, rem)

-- | Is there an atomic t'BinOp' corresponding to this t'BinOp'?
type AtomicBinOp =
  BinOp ->
  Maybe (VName -> VName -> Imp.Count Imp.Elements (Imp.TExp Int32) -> Imp.Exp -> Imp.AtomicOp)

newtype HostEnv = HostEnv
  {hostAtomics :: AtomicBinOp}

type MulticoreGen = ImpM MCMem HostEnv Imp.Multicore
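-- A minimal sketch of the 'AtomicBinOp' lookup that a backend might
-- supply when constructing the 'HostEnv'.  The mapping below is
-- illustrative (it handles only 32-bit integer addition), not the
-- table the C backend actually installs:
--
-- > atomics :: AtomicBinOp
-- > atomics (Add Int32 _) = Just $ Imp.AtomicAdd Int32
-- > atomics _ = Nothing
-- >
-- > env = HostEnv {hostAtomics = atomics}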
segOpString :: SegOp () MCMem -> MulticoreGen String
segOpString SegMap {} = return "segmap"
segOpString SegRed {} = return "segred"
segOpString SegScan {} = return "segscan"
segOpString SegHist {} = return "seghist"

toParam :: VName -> TypeBase shape u -> MulticoreGen Imp.Param
toParam name (Prim pt) = return $ Imp.ScalarParam name pt
toParam name (Mem space) = return $ Imp.MemParam name space
toParam name Array {} = do
  name_entry <- lookupVar name
  case name_entry of
    ArrayVar _ (ArrayEntry (MemLocation mem _ _) _) ->
      return $ Imp.MemParam mem DefaultSpace
    _ -> error $ "[toParam] Could not handle array for " ++ show name

getSpace :: SegOp () MCMem -> SegSpace
getSpace (SegHist _ space _ _ _) = space
getSpace (SegRed _ space _ _ _) = space
getSpace (SegScan _ space _ _ _) = space
getSpace (SegMap _ space _ _) = space

getIterationDomain :: SegOp () MCMem -> SegSpace -> MulticoreGen (Imp.TExp Int64)
getIterationDomain SegMap {} space = do
  let ns = map snd $ unSegSpace space
      ns_64 = map toInt64Exp ns
  return $ product ns_64
getIterationDomain _ space = do
  let ns = map snd $ unSegSpace space
      ns_64 = map toInt64Exp ns
  case unSegSpace space of
    [_] -> return $ product ns_64
    -- A segmented SegOp is parallelised over its segments, so we
    -- drop the last dimension, which is executed sequentially.
    _ -> return $ product $ init ns_64

-- When a SegRed's return value is a scalar, we pass it by
-- value-result in the SegOp function.
getReturnParams :: Pattern MCMem -> SegOp () MCMem -> MulticoreGen [Imp.Param]
getReturnParams pat SegRed {} = do
  let retvals = map patElemName $ patternElements pat
  retvals_ts <- mapM lookupType retvals
  zipWithM toParam retvals retvals_ts
getReturnParams _ _ = return mempty

renameSegBinOp :: [SegBinOp MCMem] -> MulticoreGen [SegBinOp MCMem]
renameSegBinOp segbinops =
  forM segbinops $ \(SegBinOp comm lam ne shape) -> do
    lam' <- renameLambda lam
    return $ SegBinOp comm lam' ne shape

compileThreadResult ::
  SegSpace ->
  PatElem MCMem ->
  KernelResult ->
  MulticoreGen ()
compileThreadResult space pe (Returns _ what) = do
  let is = map (Imp.vi64 . fst) $ unSegSpace space
  copyDWIMFix (patElemName pe) is what []
compileThreadResult _ _ ConcatReturns {} =
  compilerBugS "compileThreadResult: ConcatReturns unhandled."
compileThreadResult _ _ WriteReturns {} =
  compilerBugS "compileThreadResult: WriteReturns unhandled."
compileThreadResult _ _ TileReturns {} =
  compilerBugS "compileThreadResult: TileReturns unhandled."
compileThreadResult _ _ RegTileReturns {} =
  compilerBugS "compileThreadResult: RegTileReturns unhandled."

freeVariables :: Imp.Code -> [VName] -> [VName]
freeVariables code names =
  namesToList $ freeIn code `namesSubtract` namesFromList names

freeParams :: Imp.Code -> [VName] -> MulticoreGen [Imp.Param]
freeParams code names = do
  let freeVars = freeVariables code names
  ts <- mapM lookupType freeVars
  zipWithM toParam freeVars ts

-- | Arrays for storing group results shared between threads.
groupResultArrays ::
  String ->
  SubExp ->
  [SegBinOp MCMem] ->
  MulticoreGen [[VName]]
groupResultArrays s num_threads reds =
  forM reds $ \(SegBinOp _ lam _ shape) ->
    forM (lambdaReturnType lam) $ \t -> do
      let pt = elemType t
          full_shape = Shape [num_threads] <> shape <> arrayShape t
      sAllocArray s pt full_shape DefaultSpace

isLoadBalanced :: Imp.Code -> Bool
isLoadBalanced (a Imp.:>>: b) = isLoadBalanced a && isLoadBalanced b
isLoadBalanced (Imp.For _ _ a) = isLoadBalanced a
isLoadBalanced (Imp.If _ a b) = isLoadBalanced a && isLoadBalanced b
isLoadBalanced (Imp.Comment _ a) = isLoadBalanced a
isLoadBalanced Imp.While {} = False
isLoadBalanced (Imp.Op (Imp.ParLoop _ _ _ code _ _ _)) = isLoadBalanced code
isLoadBalanced _ = True

segBinOpComm' :: [SegBinOp lore] -> Commutativity
segBinOpComm' = mconcat . map segBinOpComm

decideScheduling' :: SegOp () lore -> Imp.Code -> Imp.Scheduling
decideScheduling' SegHist {} _ = Imp.Static
decideScheduling' SegScan {} _ = Imp.Static
decideScheduling' (SegRed _ _ reds _ _) code =
  case segBinOpComm' reds of
    Commutative -> decideScheduling code
    Noncommutative -> Imp.Static
decideScheduling' SegMap {} code = decideScheduling code

decideScheduling :: Imp.Code -> Imp.Scheduling
decideScheduling code =
  if isLoadBalanced code
    then Imp.Static
    else Imp.Dynamic

-- | Try to extract invariant allocations.  If we assume that the
-- given 'Imp.Code' is the body of a 'SegOp', then it is always safe
-- to move the immediate allocations to the prebody.
extractAllocations :: Imp.Code -> (Imp.Code, Imp.Code)
extractAllocations segop_code = f segop_code
  where
    declared = Imp.declaredIn segop_code
    f (Imp.DeclareMem name space) =
      -- Hoisting declarations out is always safe.
      (Imp.DeclareMem name space, mempty)
    f (Imp.Allocate name size space)
      | not $ freeIn size `namesIntersect` declared =
        (Imp.Allocate name size space, mempty)
    f (x Imp.:>>: y) = f x <> f y
    f (Imp.While cond body) =
      (mempty, Imp.While cond body)
    f (Imp.For i bound body) =
      (mempty, Imp.For i bound body)
    f (Imp.Comment s code) =
      second (Imp.Comment s) (f code)
    f Imp.Free {} =
      mempty
    f (Imp.If cond tcode fcode) =
      let (ta, tcode') = f tcode
          (fa, fcode') = f fcode
       in (ta <> fa, Imp.If cond tcode' fcode')
    f (Imp.Op (Imp.ParLoop s i prebody body postbody free info)) =
      let (body_allocs, body') = extractAllocations body
          (free_allocs, here_allocs) = f body_allocs
          free' =
            filter
              ( not
                  . (`nameIn` Imp.declaredIn body_allocs)
                  . Imp.paramName
              )
              free
       in ( free_allocs,
            here_allocs
              <> Imp.Op (Imp.ParLoop s i prebody body' postbody free' info)
          )
    f code =
      (mempty, code)
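-- For intuition, a sketch of what 'extractAllocations' does to a
-- hypothetical SegOp body consisting of a memory declaration followed
-- by code that itself contains no allocations: the declaration is
-- hoisted into the first component, and the rest is left behind.
--
-- > extractAllocations (Imp.DeclareMem mem DefaultSpace Imp.:>>: body)
-- >   == (Imp.DeclareMem mem DefaultSpace, body)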
-------------------------------
------- SegHist helpers -------
-------------------------------

renameHistOpLambda :: [HistOp MCMem] -> MulticoreGen [HistOp MCMem]
renameHistOpLambda hist_ops =
  forM hist_ops $ \(HistOp w rf dest neutral shape lam) -> do
    lam' <- renameLambda lam
    return $ HistOp w rf dest neutral shape lam'

-- | Locking strategy used for an atomic update.
data Locking = Locking
  { -- | Array containing the lock.
    lockingArray :: VName,
    -- | Value for us to consider the lock free.
    lockingIsUnlocked :: Imp.TExp Int32,
    -- | What to write when we lock it.
    lockingToLock :: Imp.TExp Int32,
    -- | What to write when we unlock it.
    lockingToUnlock :: Imp.TExp Int32,
    -- | A transformation from the logical lock index to the physical
    -- position in the array.  This can also be used to make the lock
    -- array smaller.
    lockingMapping :: [Imp.TExp Int64] -> [Imp.TExp Int64]
  }

-- | A function for generating code for an atomic update.  Assumes
-- that the bucket is in-bounds.
type DoAtomicUpdate lore r =
  [VName] -> [Imp.TExp Int64] -> MulticoreGen ()

-- | The mechanism that will be used for performing the atomic update.
-- Approximates how efficient it will be.  Ordered from most to least
-- efficient.
data AtomicUpdate lore r
  = AtomicPrim (DoAtomicUpdate lore r)
  | -- | Can be done by efficient swaps.
    AtomicCAS (DoAtomicUpdate lore r)
  | -- | Requires explicit locking.
    AtomicLocking (Locking -> DoAtomicUpdate lore r)
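-- A minimal sketch of a 'Locking' value over a hypothetical lock
-- array 'locks', where 0 means unlocked, 1 means locked, and logical
-- lock indices are used directly as physical positions:
--
-- > locking = Locking
-- >   { lockingArray = locks,
-- >     lockingIsUnlocked = 0,
-- >     lockingToLock = 1,
-- >     lockingToUnlock = 0,
-- >     lockingMapping = id
-- >   }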
atomicUpdateLocking ::
  AtomicBinOp ->
  Lambda MCMem ->
  AtomicUpdate MCMem ()
atomicUpdateLocking atomicBinOp lam
  | Just ops_and_ts <- splitOp lam,
    all (\(_, t, _, _) -> supportedPrims $ primBitSize t) ops_and_ts =
    primOrCas ops_and_ts $ \arrs bucket ->
      -- If the operator is a vectorised binary operator on supported
      -- primitive values, we can use a particularly efficient
      -- implementation.  If the operator has an atomic implementation
      -- we use that; otherwise it is still a binary operator, which
      -- can be implemented with atomic compare-and-swap as long as
      -- the bit width is supported.
      forM_ (zip arrs ops_and_ts) $ \(a, (op, t, x, y)) -> do
        -- Common variables.
        old <- dPrim "old" t
        (arr', _a_space, bucket_offset) <- fullyIndexArray a bucket
        case opHasAtomicSupport (tvVar old) arr' (sExt32 <$> bucket_offset) op of
          Just f -> sOp $ f $ Imp.var y t
          Nothing ->
            atomicUpdateCAS t a (tvVar old) bucket x $
              x <~~ Imp.BinOpExp op (Imp.var x t) (Imp.var y t)
  where
    opHasAtomicSupport old arr' bucket' bop = do
      let atomic f = Imp.Atomic . f old arr' bucket'
      atomic <$> atomicBinOp bop

    primOrCas ops
      | all isPrim ops = AtomicPrim
      | otherwise = AtomicCAS

    isPrim (op, _, _, _) = isJust $ atomicBinOp op
atomicUpdateLocking _ op
  | [Prim t] <- lambdaReturnType op,
    [xp, _] <- lambdaParams op,
    supportedPrims (primBitSize t) = AtomicCAS $ \[arr] bucket -> do
    old <- dPrim "old" t
    atomicUpdateCAS t arr (tvVar old) bucket (paramName xp) $
      compileBody' [xp] $ lambdaBody op
atomicUpdateLocking _ op = AtomicLocking $ \locking arrs bucket -> do
  old <- dPrim "old" int32
  continue <- dPrimVol "continue" int32 (0 :: Imp.TExp Int32)

  -- Correctly index into locks.
  (locks', _locks_space, locks_offset) <-
    fullyIndexArray (lockingArray locking) $ lockingMapping locking bucket

  -- Critical section.
  let try_acquire_lock = do
        old <-- (0 :: Imp.TExp Int32)
        sOp $
          Imp.Atomic $
            Imp.AtomicCmpXchg
              int32
              (tvVar old)
              locks'
              (sExt32 <$> locks_offset)
              (tvVar continue)
              (untyped (lockingToLock locking))
      lock_acquired = tvExp continue
      -- Even the releasing is done with an atomic rather than a
      -- simple write, for memory coherency reasons.
      release_lock = do
        old <-- lockingToLock locking
        sOp $
          Imp.Atomic $
            Imp.AtomicCmpXchg
              int32
              (tvVar old)
              locks'
              (sExt32 <$> locks_offset)
              (tvVar continue)
              (untyped (lockingToUnlock locking))

  -- Preparing parameters.  It is assumed that the caller has already
  -- filled the arr_params.  We copy the current value to the
  -- accumulator parameters.
  let (acc_params, _arr_params) =
        splitAt (length arrs) $ lambdaParams op
      bind_acc_params =
        everythingVolatile $
          sComment "bind lhs" $
            forM_ (zip acc_params arrs) $ \(acc_p, arr) ->
              copyDWIMFix (paramName acc_p) [] (Var arr) bucket

  let op_body =
        sComment "execute operation" $
          compileBody' acc_params $ lambdaBody op

      do_hist =
        everythingVolatile $
          sComment "update global result" $
            zipWithM_ (writeArray bucket) arrs $
              map (Var . paramName) acc_params

  -- While-loop: Try to acquire the lock; if successful, perform the
  -- update and release the lock again.
  sWhile (tvExp continue .==. 0) $ do
    try_acquire_lock
    sUnless (lock_acquired .==. 0) $ do
      dLParams acc_params
      bind_acc_params
      op_body
      do_hist
      release_lock
  where
    writeArray bucket arr val =
      copyDWIMFix arr bucket val []

atomicUpdateCAS ::
  PrimType ->
  VName ->
  VName ->
  [Imp.TExp Int64] ->
  VName ->
  MulticoreGen () ->
  MulticoreGen ()
atomicUpdateCAS t arr old bucket x do_op = do
  -- Code generation target:
  --
  -- old = d_his[idx];
  -- do {
  --   assumed = old;
  --   x = do_op(assumed, y);
  --   old = atomicCAS(&d_his[idx], assumed, x);
  -- } while (assumed != old);
  run_loop <- dPrimV "run_loop" (0 :: Imp.TExp Int32)
  everythingVolatile $ copyDWIMFix old [] (Var arr) bucket

  (arr', _a_space, bucket_offset) <- fullyIndexArray arr bucket

  bytes <- toIntegral $ primBitSize t

  -- Only floats need to be reinterpreted as bits before the
  -- compare-and-swap; integer values are used directly.
  toBits <- case t of
    FloatType _ -> do
      (to, _from) <- getBitConvertFunc $ primBitSize t
      return $ \v -> Imp.FunExp to [v] bytes
    _ -> return id

  -- While-loop: Try to insert your value.
  sWhile (tvExp run_loop .==. 0) $ do
    x <~~ Imp.var old t
    do_op -- Writes result into x.
    sOp $
      Imp.Atomic $
        Imp.AtomicCmpXchg
          bytes
          old
          arr'
          (sExt32 <$> bucket_offset)
          (tvVar run_loop)
          (toBits (Imp.var x t))

-- | Horizontally fission a lambda that models a binary operator.
splitOp :: ASTLore lore => Lambda lore -> Maybe [(BinOp, PrimType, VName, VName)]
splitOp lam = mapM splitStm $ bodyResult $ lambdaBody lam
  where
    n = length $ lambdaReturnType lam
    splitStm (Var res) = do
      Let (Pattern [] [pe]) _ (BasicOp (BinOp op (Var x) (Var y))) <-
        find (([res] ==) . patternNames . stmPattern) $
          stmsToList $ bodyStms $ lambdaBody lam
      i <- Var res `elemIndex` bodyResult (lambdaBody lam)
      xp <- maybeNth i $ lambdaParams lam
      yp <- maybeNth (n + i) $ lambdaParams lam
      guard $ paramName xp == x
      guard $ paramName yp == y
      Prim t <- Just $ patElemType pe
      return (op, t, paramName xp, paramName yp)
    splitStm _ = Nothing

-- TODO: To support 8-bit and 16-bit (and 128-bit) floats, we need
-- functions for converting to and from bits.
getBitConvertFunc :: Int -> MulticoreGen (String, String)
-- getBitConvertFunc 8 = return ("to_bits8", "from_bits8")
-- getBitConvertFunc 16 = return ("to_bits16", "from_bits16")
getBitConvertFunc 32 = return ("to_bits32", "from_bits32")
getBitConvertFunc 64 = return ("to_bits64", "from_bits64")
getBitConvertFunc b =
  error $ "getBitConvertFunc: unsupported bit width " ++ pretty b

supportedPrims :: Int -> Bool
supportedPrims 8 = True
supportedPrims 16 = True
supportedPrims 32 = True
supportedPrims 64 = True
supportedPrims _ = False

-- Bit widths for which GCC (and Clang) provide atomic built-ins.
toIntegral :: Int -> MulticoreGen PrimType
toIntegral 8 = return int8
toIntegral 16 = return int16
toIntegral 32 = return int32
toIntegral 64 = return int64
toIntegral b =
  error $ "toIntegral: unsupported bit width for CAS - " ++ pretty b
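-- A hypothetical sketch of how a caller (e.g. the SegHist compiler)
-- might dispatch on the result of 'atomicUpdateLocking'; 'atomics',
-- 'lam', 'locking', 'arrs' and 'bucket' are assumed to be in scope:
--
-- > case atomicUpdateLocking atomics lam of
-- >   AtomicPrim f -> f arrs bucket
-- >   AtomicCAS f -> f arrs bucket
-- >   AtomicLocking f -> f locking arrs bucket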