module Futhark.CodeGen.ImpGen.Multicore.Base
  ( extractAllocations,
    compileThreadResult,
    Locks (..),
    HostEnv (..),
    AtomicBinOp,
    MulticoreGen,
    decideScheduling,
    decideScheduling',
    renameSegBinOp,
    freeParams,
    renameHistOpLambda,
    atomicUpdateLocking,
    AtomicUpdate (..),
    DoAtomicUpdate,
    Locking (..),
    getSpace,
    getLoopBounds,
    getIterationDomain,
    getReturnParams,
    segOpString,
    ChunkLoopVectorization (..),
    generateChunkLoop,
    generateUniformizeLoop,
    extractVectorLane,
    inISPC,
    toParam,
    sLoopNestVectorized,
  )
where

import Control.Monad
import Data.Bifunctor
import Data.Map qualified as M
import Data.Maybe
import Futhark.CodeGen.ImpCode.Multicore qualified as Imp
import Futhark.CodeGen.ImpGen
import Futhark.Error
import Futhark.IR.MCMem
import Futhark.MonadFreshNames
import Futhark.Transform.Rename
import Prelude hiding (quot, rem)

-- | Is there an atomic t'BinOp' corresponding to this t'BinOp'?
type AtomicBinOp =
  BinOp ->
  Maybe (VName -> VName -> Imp.Count Imp.Elements (Imp.TExp Int32) -> Imp.Exp -> Imp.AtomicOp)

-- | Information about the locks available for accumulators.
data Locks = Locks
  { Locks -> VName
locksArray :: VName,
    Locks -> Int
locksCount :: Int
  }

data HostEnv = HostEnv
  { HostEnv -> AtomicBinOp
hostAtomics :: AtomicBinOp,
    HostEnv -> Map VName Locks
hostLocks :: M.Map VName Locks
  }

type MulticoreGen = ImpM MCMem HostEnv Imp.Multicore

segOpString :: SegOp () MCMem -> MulticoreGen String
segOpString :: SegOp () MCMem -> MulticoreGen [Char]
segOpString SegMap {} = forall (f :: * -> *) a. Applicative f => a -> f a
pure [Char]
"segmap"
segOpString SegRed {} = forall (f :: * -> *) a. Applicative f => a -> f a
pure [Char]
"segred"
segOpString SegScan {} = forall (f :: * -> *) a. Applicative f => a -> f a
pure [Char]
"segscan"
segOpString SegHist {} = forall (f :: * -> *) a. Applicative f => a -> f a
pure [Char]
"seghist"

arrParam :: VName -> MulticoreGen Imp.Param
arrParam :: VName -> MulticoreGen Param
arrParam VName
arr = do
  VarEntry MCMem
name_entry <- forall rep r op. VName -> ImpM rep r op (VarEntry rep)
lookupVar VName
arr
  case VarEntry MCMem
name_entry of
    ArrayVar Maybe (Exp MCMem)
_ (ArrayEntry (MemLoc VName
mem [SubExp]
_ IxFun (TExp Int64)
_) PrimType
_) ->
      forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a b. (a -> b) -> a -> b
$ VName -> Space -> Param
Imp.MemParam VName
mem Space
DefaultSpace
    VarEntry MCMem
_ -> forall a. HasCallStack => [Char] -> a
error forall a b. (a -> b) -> a -> b
$ [Char]
"arrParam: could not handle array " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show VName
arr

toParam :: VName -> TypeBase shape u -> MulticoreGen [Imp.Param]
toParam :: forall shape u. VName -> TypeBase shape u -> MulticoreGen [Param]
toParam VName
name (Prim PrimType
pt) = forall (f :: * -> *) a. Applicative f => a -> f a
pure [VName -> PrimType -> Param
Imp.ScalarParam VName
name PrimType
pt]
toParam VName
name (Mem Space
space) = forall (f :: * -> *) a. Applicative f => a -> f a
pure [VName -> Space -> Param
Imp.MemParam VName
name Space
space]
toParam VName
name Array {} = forall (f :: * -> *) a. Applicative f => a -> f a
pure forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> VName -> MulticoreGen Param
arrParam VName
name
toParam VName
_name Acc {} = forall (f :: * -> *) a. Applicative f => a -> f a
pure [] -- FIXME?  Are we sure this works?

getSpace :: SegOp () MCMem -> SegSpace
getSpace :: SegOp () MCMem -> SegSpace
getSpace (SegHist ()
_ SegSpace
space [HistOp MCMem]
_ [Type]
_ KernelBody MCMem
_) = SegSpace
space
getSpace (SegRed ()
_ SegSpace
space [SegBinOp MCMem]
_ [Type]
_ KernelBody MCMem
_) = SegSpace
space
getSpace (SegScan ()
_ SegSpace
space [SegBinOp MCMem]
_ [Type]
_ KernelBody MCMem
_) = SegSpace
space
getSpace (SegMap ()
_ SegSpace
space [Type]
_ KernelBody MCMem
_) = SegSpace
space

getLoopBounds :: MulticoreGen (Imp.TExp Int64, Imp.TExp Int64)
getLoopBounds :: MulticoreGen (TExp Int64, TExp Int64)
getLoopBounds = do
  TV Int64
start <- forall {k} rep r op (t :: k).
[Char] -> PrimType -> ImpM rep r op (TV t)
dPrim [Char]
"start" PrimType
int64
  TV Int64
end <- forall {k} rep r op (t :: k).
[Char] -> PrimType -> ImpM rep r op (TV t)
dPrim [Char]
"end" PrimType
int64
  forall op rep r. Code op -> ImpM rep r op ()
emit forall a b. (a -> b) -> a -> b
$ forall a. a -> Code a
Imp.Op forall a b. (a -> b) -> a -> b
$ VName -> VName -> Multicore
Imp.GetLoopBounds (forall {k} (t :: k). TV t -> VName
tvVar TV Int64
start) (forall {k} (t :: k). TV t -> VName
tvVar TV Int64
end)
  forall (f :: * -> *) a. Applicative f => a -> f a
pure (forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
start, forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
end)

getIterationDomain :: SegOp () MCMem -> SegSpace -> MulticoreGen (Imp.TExp Int64)
getIterationDomain :: SegOp () MCMem -> SegSpace -> MulticoreGen (TExp Int64)
getIterationDomain SegMap {} SegSpace
space = do
  let ns :: [SubExp]
ns = forall a b. (a -> b) -> [a] -> [b]
map forall a b. (a, b) -> b
snd forall a b. (a -> b) -> a -> b
$ SegSpace -> [(VName, SubExp)]
unSegSpace SegSpace
space
      ns_64 :: [TExp Int64]
ns_64 = forall a b. (a -> b) -> [a] -> [b]
map SubExp -> TExp Int64
pe64 [SubExp]
ns
  forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a b. (a -> b) -> a -> b
$ forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
product [TExp Int64]
ns_64
getIterationDomain SegOp () MCMem
_ SegSpace
space = do
  let ns :: [SubExp]
ns = forall a b. (a -> b) -> [a] -> [b]
map forall a b. (a, b) -> b
snd forall a b. (a -> b) -> a -> b
$ SegSpace -> [(VName, SubExp)]
unSegSpace SegSpace
space
      ns_64 :: [TExp Int64]
ns_64 = forall a b. (a -> b) -> [a] -> [b]
map SubExp -> TExp Int64
pe64 [SubExp]
ns
  case SegSpace -> [(VName, SubExp)]
unSegSpace SegSpace
space of
    [(VName, SubExp)
_] -> forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a b. (a -> b) -> a -> b
$ forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
product [TExp Int64]
ns_64
    -- A segmented SegOp is over the segments
    -- so we drop the last dimension, which is
    -- executed sequentially
    [(VName, SubExp)]
_ -> forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a b. (a -> b) -> a -> b
$ forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
product forall a b. (a -> b) -> a -> b
$ forall a. [a] -> [a]
init [TExp Int64]
ns_64

-- When the SegRed's return value is a scalar
-- we perform a call by value-result in the segop function
getReturnParams :: Pat LetDecMem -> SegOp () MCMem -> MulticoreGen [Imp.Param]
getReturnParams :: Pat LetDecMem -> SegOp () MCMem -> MulticoreGen [Param]
getReturnParams Pat LetDecMem
pat SegRed {} =
  -- It's a good idea to make sure any prim values are initialised, as
  -- we will load them (redundantly) in the task code, and
  -- uninitialised values are UB.
  forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM (forall dec. Pat dec -> [PatElem dec]
patElems Pat LetDecMem
pat) forall a b. (a -> b) -> a -> b
$ \PatElem LetDecMem
pe -> do
    case forall dec. Typed dec => PatElem dec -> Type
patElemType PatElem LetDecMem
pe of
      Prim PrimType
pt -> forall dec. PatElem dec -> VName
patElemName PatElem LetDecMem
pe forall rep r op. VName -> Exp -> ImpM rep r op ()
<~~ forall v. PrimValue -> PrimExp v
ValueExp (PrimType -> PrimValue
blankPrimValue PrimType
pt)
      Type
_ -> forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
    forall shape u. VName -> TypeBase shape u -> MulticoreGen [Param]
toParam (forall dec. PatElem dec -> VName
patElemName PatElem LetDecMem
pe) (forall dec. Typed dec => PatElem dec -> Type
patElemType PatElem LetDecMem
pe)
getReturnParams Pat LetDecMem
_ SegOp () MCMem
_ = forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a. Monoid a => a
mempty

renameSegBinOp :: [SegBinOp MCMem] -> MulticoreGen [SegBinOp MCMem]
renameSegBinOp :: [SegBinOp MCMem] -> MulticoreGen [SegBinOp MCMem]
renameSegBinOp [SegBinOp MCMem]
segbinops =
  forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM [SegBinOp MCMem]
segbinops forall a b. (a -> b) -> a -> b
$ \(SegBinOp Commutativity
comm Lambda MCMem
lam [SubExp]
ne Shape
shape) -> do
    Lambda MCMem
lam' <- forall rep (m :: * -> *).
(Renameable rep, MonadFreshNames m) =>
Lambda rep -> m (Lambda rep)
renameLambda Lambda MCMem
lam
    forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a b. (a -> b) -> a -> b
$ forall rep.
Commutativity -> Lambda rep -> [SubExp] -> Shape -> SegBinOp rep
SegBinOp Commutativity
comm Lambda MCMem
lam' [SubExp]
ne Shape
shape

compileThreadResult ::
  SegSpace ->
  PatElem LetDecMem ->
  KernelResult ->
  MulticoreGen ()
compileThreadResult :: SegSpace
-> PatElem LetDecMem
-> KernelResult
-> ImpM MCMem HostEnv Multicore ()
compileThreadResult SegSpace
space PatElem LetDecMem
pe (Returns ResultManifest
_ Certs
_ SubExp
what) = do
  let is :: [TExp Int64]
is = forall a b. (a -> b) -> [a] -> [b]
map (forall a. a -> TPrimExp Int64 a
Imp.le64 forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a, b) -> a
fst) forall a b. (a -> b) -> a -> b
$ SegSpace -> [(VName, SubExp)]
unSegSpace SegSpace
space
  forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (forall dec. PatElem dec -> VName
patElemName PatElem LetDecMem
pe) [TExp Int64]
is SubExp
what []
compileThreadResult SegSpace
_ PatElem LetDecMem
_ WriteReturns {} =
  forall a. [Char] -> a
compilerBugS [Char]
"compileThreadResult: WriteReturns unhandled."
compileThreadResult SegSpace
_ PatElem LetDecMem
_ TileReturns {} =
  forall a. [Char] -> a
compilerBugS [Char]
"compileThreadResult: TileReturns unhandled."
compileThreadResult SegSpace
_ PatElem LetDecMem
_ RegTileReturns {} =
  forall a. [Char] -> a
compilerBugS [Char]
"compileThreadResult: RegTileReturns unhandled."

freeParams :: FreeIn a => a -> MulticoreGen [Imp.Param]
freeParams :: forall a. FreeIn a => a -> MulticoreGen [Param]
freeParams a
code = do
  let free :: [VName]
free = Names -> [VName]
namesToList forall a b. (a -> b) -> a -> b
$ forall a. FreeIn a => a -> Names
freeIn a
code
  [Type]
ts <- forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM forall rep (m :: * -> *). HasScope rep m => VName -> m Type
lookupType [VName]
free
  forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall (m :: * -> *) a b c.
Applicative m =>
(a -> b -> m c) -> [a] -> [b] -> m [c]
zipWithM forall shape u. VName -> TypeBase shape u -> MulticoreGen [Param]
toParam [VName]
free [Type]
ts

isLoadBalanced :: Imp.MCCode -> Bool
isLoadBalanced :: Code Multicore -> Bool
isLoadBalanced (Code Multicore
a Imp.:>>: Code Multicore
b) = Code Multicore -> Bool
isLoadBalanced Code Multicore
a Bool -> Bool -> Bool
&& Code Multicore -> Bool
isLoadBalanced Code Multicore
b
isLoadBalanced (Imp.For VName
_ Exp
_ Code Multicore
a) = Code Multicore -> Bool
isLoadBalanced Code Multicore
a
isLoadBalanced (Imp.If TExp Bool
_ Code Multicore
a Code Multicore
b) = Code Multicore -> Bool
isLoadBalanced Code Multicore
a Bool -> Bool -> Bool
&& Code Multicore -> Bool
isLoadBalanced Code Multicore
b
isLoadBalanced (Imp.Comment Text
_ Code Multicore
a) = Code Multicore -> Bool
isLoadBalanced Code Multicore
a
isLoadBalanced Imp.While {} = Bool
False
isLoadBalanced (Imp.Op (Imp.ParLoop [Char]
_ Code Multicore
code [Param]
_)) = Code Multicore -> Bool
isLoadBalanced Code Multicore
code
isLoadBalanced (Imp.Op (Imp.ForEachActive VName
_ Code Multicore
a)) = Code Multicore -> Bool
isLoadBalanced Code Multicore
a
isLoadBalanced (Imp.Op (Imp.ForEach VName
_ Exp
_ Exp
_ Code Multicore
a)) = Code Multicore -> Bool
isLoadBalanced Code Multicore
a
isLoadBalanced (Imp.Op (Imp.ISPCKernel Code Multicore
a [Param]
_)) = Code Multicore -> Bool
isLoadBalanced Code Multicore
a
isLoadBalanced Code Multicore
_ = Bool
True

decideScheduling' :: SegOp () rep -> Imp.MCCode -> Imp.Scheduling
decideScheduling' :: forall rep. SegOp () rep -> Code Multicore -> Scheduling
decideScheduling' SegHist {} Code Multicore
_ = Scheduling
Imp.Static
decideScheduling' SegScan {} Code Multicore
_ = Scheduling
Imp.Static
decideScheduling' SegRed {} Code Multicore
_ = Scheduling
Imp.Static
decideScheduling' SegMap {} Code Multicore
code = Code Multicore -> Scheduling
decideScheduling Code Multicore
code

decideScheduling :: Imp.MCCode -> Imp.Scheduling
decideScheduling :: Code Multicore -> Scheduling
decideScheduling Code Multicore
code =
  if Code Multicore -> Bool
isLoadBalanced Code Multicore
code
    then Scheduling
Imp.Static
    else Scheduling
Imp.Dynamic

-- | Try to extract invariant allocations.  If we assume that the
-- given 'Imp.MCCode' is the body of a 'SegOp', then it is always safe
-- to move the immediate allocations to the prebody.
extractAllocations :: Imp.MCCode -> (Imp.MCCode, Imp.MCCode)
extractAllocations :: Code Multicore -> (Code Multicore, Code Multicore)
extractAllocations Code Multicore
segop_code = forall {a}. Code Multicore -> (Code a, Code Multicore)
f Code Multicore
segop_code
  where
    declared :: Names
declared = forall a. Code a -> Names
Imp.declaredIn Code Multicore
segop_code
    f :: Code Multicore -> (Code a, Code Multicore)
f (Imp.DeclareMem VName
name Space
space) =
      -- Hoisting declarations out is always safe.
      (forall a. VName -> Space -> Code a
Imp.DeclareMem VName
name Space
space, forall a. Monoid a => a
mempty)
    f (Imp.Allocate VName
name Count Bytes (TExp Int64)
size Space
space)
      | Bool -> Bool
not forall a b. (a -> b) -> a -> b
$ forall a. FreeIn a => a -> Names
freeIn Count Bytes (TExp Int64)
size Names -> Names -> Bool
`namesIntersect` Names
declared =
          (forall a. VName -> Count Bytes (TExp Int64) -> Space -> Code a
Imp.Allocate VName
name Count Bytes (TExp Int64)
size Space
space, forall a. Monoid a => a
mempty)
    f (Code Multicore
x Imp.:>>: Code Multicore
y) = Code Multicore -> (Code a, Code Multicore)
f Code Multicore
x forall a. Semigroup a => a -> a -> a
<> Code Multicore -> (Code a, Code Multicore)
f Code Multicore
y
    f (Imp.While TExp Bool
cond Code Multicore
body) =
      (forall a. Monoid a => a
mempty, forall a. TExp Bool -> Code a -> Code a
Imp.While TExp Bool
cond Code Multicore
body)
    f (Imp.For VName
i Exp
bound Code Multicore
body) =
      (forall a. Monoid a => a
mempty, forall a. VName -> Exp -> Code a -> Code a
Imp.For VName
i Exp
bound Code Multicore
body)
    f (Imp.Comment Text
s Code Multicore
code) =
      forall (p :: * -> * -> *) b c a.
Bifunctor p =>
(b -> c) -> p a b -> p a c
second (forall a. Text -> Code a -> Code a
Imp.Comment Text
s) (Code Multicore -> (Code a, Code Multicore)
f Code Multicore
code)
    f Imp.Free {} =
      forall a. Monoid a => a
mempty
    f (Imp.If TExp Bool
cond Code Multicore
tcode Code Multicore
fcode) =
      let (Code a
ta, Code Multicore
tcode') = Code Multicore -> (Code a, Code Multicore)
f Code Multicore
tcode
          (Code a
fa, Code Multicore
fcode') = Code Multicore -> (Code a, Code Multicore)
f Code Multicore
fcode
       in (Code a
ta forall a. Semigroup a => a -> a -> a
<> Code a
fa, forall a. TExp Bool -> Code a -> Code a -> Code a
Imp.If TExp Bool
cond Code Multicore
tcode' Code Multicore
fcode')
    f (Imp.Op (Imp.ParLoop [Char]
s Code Multicore
body [Param]
free)) =
      let (Code Multicore
body_allocs, Code Multicore
body') = Code Multicore -> (Code Multicore, Code Multicore)
extractAllocations Code Multicore
body
          (Code a
free_allocs, Code Multicore
here_allocs) = Code Multicore -> (Code a, Code Multicore)
f Code Multicore
body_allocs
          free' :: [Param]
free' =
            forall a. (a -> Bool) -> [a] -> [a]
filter
              ( (VName -> Names -> Bool
`notNameIn` forall a. Code a -> Names
Imp.declaredIn Code Multicore
body_allocs) forall b c a. (b -> c) -> (a -> b) -> a -> c
. Param -> VName
Imp.paramName
              )
              [Param]
free
       in ( Code a
free_allocs,
            Code Multicore
here_allocs forall a. Semigroup a => a -> a -> a
<> forall a. a -> Code a
Imp.Op ([Char] -> Code Multicore -> [Param] -> Multicore
Imp.ParLoop [Char]
s Code Multicore
body' [Param]
free')
          )
    f Code Multicore
code =
      (forall a. Monoid a => a
mempty, Code Multicore
code)

-- | Indicates whether to vectorize a chunk loop or keep it sequential.
-- We use this to allow falling back to sequential chunk loops in cases
-- we don't care about trying to vectorize.
data ChunkLoopVectorization = Vectorized | Scalar

-- | Emit code for the chunk loop, given an action that generates code
-- for a single iteration.
--
-- The action is called with the (symbolic) index of the current
-- iteration.
generateChunkLoop ::
  String ->
  ChunkLoopVectorization ->
  (Imp.TExp Int64 -> MulticoreGen ()) ->
  MulticoreGen ()
generateChunkLoop :: [Char]
-> ChunkLoopVectorization
-> (TExp Int64 -> ImpM MCMem HostEnv Multicore ())
-> ImpM MCMem HostEnv Multicore ()
generateChunkLoop [Char]
desc ChunkLoopVectorization
Scalar TExp Int64 -> ImpM MCMem HostEnv Multicore ()
m = do
  (TExp Int64
start, TExp Int64
end) <- MulticoreGen (TExp Int64, TExp Int64)
getLoopBounds
  TExp Int64
n <- forall {k} (t :: k) rep r op.
[Char] -> TExp t -> ImpM rep r op (TExp t)
dPrimVE [Char]
"n" forall a b. (a -> b) -> a -> b
$ TExp Int64
end forall a. Num a => a -> a -> a
- TExp Int64
start
  VName
i <- forall (m :: * -> *). MonadFreshNames m => [Char] -> m VName
newVName ([Char]
desc forall a. Semigroup a => a -> a -> a
<> [Char]
"_i")
  (Code Multicore
body_allocs, Code Multicore
body) <- forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Code Multicore -> (Code Multicore, Code Multicore)
extractAllocations forall a b. (a -> b) -> a -> b
$
    forall rep r op. ImpM rep r op () -> ImpM rep r op (Code op)
collect forall a b. (a -> b) -> a -> b
$ do
      forall rep r op. VName -> IntType -> ImpM rep r op ()
addLoopVar VName
i IntType
Int64
      TExp Int64 -> ImpM MCMem HostEnv Multicore ()
m forall a b. (a -> b) -> a -> b
$ TExp Int64
start forall a. Num a => a -> a -> a
+ forall a. a -> TPrimExp Int64 a
Imp.le64 VName
i
  forall op rep r. Code op -> ImpM rep r op ()
emit Code Multicore
body_allocs
  -- Emit either foreach or normal for loop
  let bound :: Exp
bound = forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped TExp Int64
n
  forall op rep r. Code op -> ImpM rep r op ()
emit forall a b. (a -> b) -> a -> b
$ forall a. VName -> Exp -> Code a -> Code a
Imp.For VName
i Exp
bound Code Multicore
body
generateChunkLoop [Char]
desc ChunkLoopVectorization
Vectorized TExp Int64 -> ImpM MCMem HostEnv Multicore ()
m = do
  (TExp Int64
start, TExp Int64
end) <- MulticoreGen (TExp Int64, TExp Int64)
getLoopBounds
  TExp Int64
n <- forall {k} (t :: k) rep r op.
[Char] -> TExp t -> ImpM rep r op (TExp t)
dPrimVE [Char]
"n" forall a b. (a -> b) -> a -> b
$ TExp Int64
end forall a. Num a => a -> a -> a
- TExp Int64
start
  VName
i <- forall (m :: * -> *). MonadFreshNames m => [Char] -> m VName
newVName ([Char]
desc forall a. Semigroup a => a -> a -> a
<> [Char]
"_i")
  (Code Multicore
body_allocs, Code Multicore
body) <- forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Code Multicore -> (Code Multicore, Code Multicore)
extractAllocations forall a b. (a -> b) -> a -> b
$
    forall rep r op. ImpM rep r op () -> ImpM rep r op (Code op)
collect forall a b. (a -> b) -> a -> b
$ do
      forall rep r op. VName -> IntType -> ImpM rep r op ()
addLoopVar VName
i IntType
Int64
      TExp Int64 -> ImpM MCMem HostEnv Multicore ()
m forall a b. (a -> b) -> a -> b
$ forall a. a -> TPrimExp Int64 a
Imp.le64 VName
i
  forall op rep r. Code op -> ImpM rep r op ()
emit Code Multicore
body_allocs
  -- Emit either foreach or normal for loop
  let from :: Exp
from = forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped TExp Int64
start
  let bound :: Exp
bound = forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped (TExp Int64
start forall a. Num a => a -> a -> a
+ TExp Int64
n)
  forall op rep r. Code op -> ImpM rep r op ()
emit forall a b. (a -> b) -> a -> b
$ forall a. a -> Code a
Imp.Op forall a b. (a -> b) -> a -> b
$ VName -> Exp -> Exp -> Code Multicore -> Multicore
Imp.ForEach VName
i Exp
from Exp
bound Code Multicore
body

-- | Emit code for a sequential loop over each vector lane, given
-- and action that generates code for a single iteration. The action
-- is called with the symbolic index of the current iteration.
generateUniformizeLoop :: (Imp.TExp Int64 -> MulticoreGen ()) -> MulticoreGen ()
generateUniformizeLoop :: (TExp Int64 -> ImpM MCMem HostEnv Multicore ())
-> ImpM MCMem HostEnv Multicore ()
generateUniformizeLoop TExp Int64 -> ImpM MCMem HostEnv Multicore ()
m = do
  VName
i <- forall (m :: * -> *). MonadFreshNames m => [Char] -> m VName
newVName [Char]
"uni_i"
  Code Multicore
body <- forall rep r op. ImpM rep r op () -> ImpM rep r op (Code op)
collect forall a b. (a -> b) -> a -> b
$ do
    forall rep r op. VName -> IntType -> ImpM rep r op ()
addLoopVar VName
i IntType
Int64
    TExp Int64 -> ImpM MCMem HostEnv Multicore ()
m forall a b. (a -> b) -> a -> b
$ forall a. a -> TPrimExp Int64 a
Imp.le64 VName
i
  forall op rep r. Code op -> ImpM rep r op ()
emit forall a b. (a -> b) -> a -> b
$ forall a. a -> Code a
Imp.Op forall a b. (a -> b) -> a -> b
$ VName -> Code Multicore -> Multicore
Imp.ForEachActive VName
i Code Multicore
body

-- | Given a piece of code, if that code performs an assignment, turn
-- that assignment into an extraction of element from a vector on the
-- right hand side, using a passed index for the extraction. Other code
-- is left as is.
extractVectorLane :: Imp.TExp Int64 -> MulticoreGen Imp.MCCode -> MulticoreGen ()
extractVectorLane :: TExp Int64
-> ImpM MCMem HostEnv Multicore (Code Multicore)
-> ImpM MCMem HostEnv Multicore ()
extractVectorLane TExp Int64
j ImpM MCMem HostEnv Multicore (Code Multicore)
code = do
  let ut_exp :: Exp
ut_exp = forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped TExp Int64
j
  Code Multicore
code' <- ImpM MCMem HostEnv Multicore (Code Multicore)
code
  case Code Multicore
code' of
    Imp.SetScalar VName
vname Exp
e -> do
      Type
typ <- forall rep (m :: * -> *). HasScope rep m => VName -> m Type
lookupType VName
vname
      case Type
typ of
        -- ISPC v1.17 does not support extract on f16 yet..
        -- Thus we do this stupid conversion to f32
        Prim (FloatType FloatType
Float16) -> do
          TV Any
tv <- forall {k} rep r op (t :: k).
[Char] -> PrimType -> ImpM rep r op (TV t)
dPrim [Char]
"hack_extract_f16" (FloatType -> PrimType
FloatType FloatType
Float32)
          forall op rep r. Code op -> ImpM rep r op ()
emit forall a b. (a -> b) -> a -> b
$ forall a. VName -> Exp -> Code a
Imp.SetScalar (forall {k} (t :: k). TV t -> VName
tvVar TV Any
tv) Exp
e
          forall op rep r. Code op -> ImpM rep r op ()
emit forall a b. (a -> b) -> a -> b
$ forall a. a -> Code a
Imp.Op forall a b. (a -> b) -> a -> b
$ VName -> Exp -> Exp -> Multicore
Imp.ExtractLane VName
vname (forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped forall a b. (a -> b) -> a -> b
$ forall {k} (t :: k). TV t -> TExp t
tvExp TV Any
tv) Exp
ut_exp
        Type
_ -> forall op rep r. Code op -> ImpM rep r op ()
emit forall a b. (a -> b) -> a -> b
$ forall a. a -> Code a
Imp.Op forall a b. (a -> b) -> a -> b
$ VName -> Exp -> Exp -> Multicore
Imp.ExtractLane VName
vname Exp
e Exp
ut_exp
    Code Multicore
_ ->
      forall op rep r. Code op -> ImpM rep r op ()
emit Code Multicore
code'

-- | Given an action that may generate some code, put that code
-- into an ISPC kernel.
inISPC :: MulticoreGen () -> MulticoreGen ()
inISPC :: ImpM MCMem HostEnv Multicore () -> ImpM MCMem HostEnv Multicore ()
inISPC ImpM MCMem HostEnv Multicore ()
code = do
  Code Multicore
code' <- forall rep r op. ImpM rep r op () -> ImpM rep r op (Code op)
collect ImpM MCMem HostEnv Multicore ()
code
  [Param]
free <- forall a. FreeIn a => a -> MulticoreGen [Param]
freeParams Code Multicore
code'
  forall op rep r. Code op -> ImpM rep r op ()
emit forall a b. (a -> b) -> a -> b
$ forall a. a -> Code a
Imp.Op forall a b. (a -> b) -> a -> b
$ Code Multicore -> [Param] -> Multicore
Imp.ISPCKernel Code Multicore
code' [Param]
free

-------------------------------
------- SegRed helpers  -------
-------------------------------
sForVectorized' :: VName -> Imp.Exp -> MulticoreGen () -> MulticoreGen ()
sForVectorized' :: VName
-> Exp
-> ImpM MCMem HostEnv Multicore ()
-> ImpM MCMem HostEnv Multicore ()
sForVectorized' VName
i Exp
bound ImpM MCMem HostEnv Multicore ()
body = do
  let it :: IntType
it = case forall v. PrimExp v -> PrimType
primExpType Exp
bound of
        IntType IntType
bound_t -> IntType
bound_t
        PrimType
t -> forall a. HasCallStack => [Char] -> a
error forall a b. (a -> b) -> a -> b
$ [Char]
"sFor': bound " forall a. [a] -> [a] -> [a]
++ forall a. Pretty a => a -> [Char]
prettyString Exp
bound forall a. [a] -> [a] -> [a]
++ [Char]
" is of type " forall a. [a] -> [a] -> [a]
++ forall a. Pretty a => a -> [Char]
prettyString PrimType
t
  forall rep r op. VName -> IntType -> ImpM rep r op ()
addLoopVar VName
i IntType
it
  Code Multicore
body' <- forall rep r op. ImpM rep r op () -> ImpM rep r op (Code op)
collect ImpM MCMem HostEnv Multicore ()
body
  forall op rep r. Code op -> ImpM rep r op ()
emit forall a b. (a -> b) -> a -> b
$ forall a. a -> Code a
Imp.Op forall a b. (a -> b) -> a -> b
$ VName -> Exp -> Exp -> Code Multicore -> Multicore
Imp.ForEach VName
i (forall v. PrimValue -> PrimExp v
Imp.ValueExp forall a b. (a -> b) -> a -> b
$ PrimType -> PrimValue
blankPrimValue forall a b. (a -> b) -> a -> b
$ IntType -> PrimType
Imp.IntType IntType
Imp.Int64) Exp
bound Code Multicore
body'

sForVectorized :: String -> Imp.TExp t -> (Imp.TExp t -> MulticoreGen ()) -> MulticoreGen ()
sForVectorized :: forall {k} (t :: k).
[Char]
-> TExp t
-> (TExp t -> ImpM MCMem HostEnv Multicore ())
-> ImpM MCMem HostEnv Multicore ()
sForVectorized [Char]
i TExp t
bound TExp t -> ImpM MCMem HostEnv Multicore ()
body = do
  VName
i' <- forall (m :: * -> *). MonadFreshNames m => [Char] -> m VName
newVName [Char]
i
  VName
-> Exp
-> ImpM MCMem HostEnv Multicore ()
-> ImpM MCMem HostEnv Multicore ()
sForVectorized' VName
i' (forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped TExp t
bound) forall a b. (a -> b) -> a -> b
$
    TExp t -> ImpM MCMem HostEnv Multicore ()
body forall a b. (a -> b) -> a -> b
$
      forall {k} (t :: k) v. PrimExp v -> TPrimExp t v
TPrimExp forall a b. (a -> b) -> a -> b
$
        VName -> PrimType -> Exp
Imp.var VName
i' forall a b. (a -> b) -> a -> b
$
          forall v. PrimExp v -> PrimType
primExpType forall a b. (a -> b) -> a -> b
$
            forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped TExp t
bound

-- | Like sLoopNest, but puts a vectorized loop at the innermost layer.
sLoopNestVectorized ::
  Shape ->
  ([Imp.TExp Int64] -> MulticoreGen ()) ->
  MulticoreGen ()
sLoopNestVectorized :: Shape
-> ([TExp Int64] -> ImpM MCMem HostEnv Multicore ())
-> ImpM MCMem HostEnv Multicore ()
sLoopNestVectorized = [TExp Int64]
-> [SubExp]
-> ([TExp Int64] -> ImpM MCMem HostEnv Multicore ())
-> ImpM MCMem HostEnv Multicore ()
sLoopNest' [] forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall d. ShapeBase d -> [d]
shapeDims
  where
    sLoopNest' :: [TExp Int64]
-> [SubExp]
-> ([TExp Int64] -> ImpM MCMem HostEnv Multicore ())
-> ImpM MCMem HostEnv Multicore ()
sLoopNest' [TExp Int64]
is [] [TExp Int64] -> ImpM MCMem HostEnv Multicore ()
f = [TExp Int64] -> ImpM MCMem HostEnv Multicore ()
f forall a b. (a -> b) -> a -> b
$ forall a. [a] -> [a]
reverse [TExp Int64]
is
    sLoopNest' [TExp Int64]
is [SubExp
d] [TExp Int64] -> ImpM MCMem HostEnv Multicore ()
f =
      forall {k} (t :: k).
[Char]
-> TExp t
-> (TExp t -> ImpM MCMem HostEnv Multicore ())
-> ImpM MCMem HostEnv Multicore ()
sForVectorized [Char]
"nest_i" (SubExp -> TExp Int64
pe64 SubExp
d) forall a b. (a -> b) -> a -> b
$ \TExp Int64
i -> [TExp Int64]
-> [SubExp]
-> ([TExp Int64] -> ImpM MCMem HostEnv Multicore ())
-> ImpM MCMem HostEnv Multicore ()
sLoopNest' (TExp Int64
i forall a. a -> [a] -> [a]
: [TExp Int64]
is) [] [TExp Int64] -> ImpM MCMem HostEnv Multicore ()
f
    sLoopNest' [TExp Int64]
is (SubExp
d : [SubExp]
ds) [TExp Int64] -> ImpM MCMem HostEnv Multicore ()
f =
      forall {k} (t :: k) rep r op.
[Char]
-> TExp t -> (TExp t -> ImpM rep r op ()) -> ImpM rep r op ()
sFor [Char]
"nest_i" (SubExp -> TExp Int64
pe64 SubExp
d) forall a b. (a -> b) -> a -> b
$ \TExp Int64
i -> [TExp Int64]
-> [SubExp]
-> ([TExp Int64] -> ImpM MCMem HostEnv Multicore ())
-> ImpM MCMem HostEnv Multicore ()
sLoopNest' (TExp Int64
i forall a. a -> [a] -> [a]
: [TExp Int64]
is) [SubExp]
ds [TExp Int64] -> ImpM MCMem HostEnv Multicore ()
f

-------------------------------
------- SegHist helpers -------
-------------------------------
renameHistOpLambda :: [HistOp MCMem] -> MulticoreGen [HistOp MCMem]
renameHistOpLambda :: [HistOp MCMem] -> MulticoreGen [HistOp MCMem]
renameHistOpLambda [HistOp MCMem]
hist_ops =
  forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM [HistOp MCMem]
hist_ops forall a b. (a -> b) -> a -> b
$ \(HistOp Shape
w SubExp
rf [VName]
dest [SubExp]
neutral Shape
shape Lambda MCMem
lam) -> do
    Lambda MCMem
lam' <- forall rep (m :: * -> *).
(Renameable rep, MonadFreshNames m) =>
Lambda rep -> m (Lambda rep)
renameLambda Lambda MCMem
lam
    forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a b. (a -> b) -> a -> b
$ forall rep.
Shape
-> SubExp
-> [VName]
-> [SubExp]
-> Shape
-> Lambda rep
-> HistOp rep
HistOp Shape
w SubExp
rf [VName]
dest [SubExp]
neutral Shape
shape Lambda MCMem
lam'

-- | Locking strategy used for an atomic update.
data Locking = Locking
  { -- | Array containing the lock.
    Locking -> VName
lockingArray :: VName,
    -- | Value for us to consider the lock free.
    Locking -> TExp Int32
lockingIsUnlocked :: Imp.TExp Int32,
    -- | What to write when we lock it.
    Locking -> TExp Int32
lockingToLock :: Imp.TExp Int32,
    -- | What to write when we unlock it.
    Locking -> TExp Int32
lockingToUnlock :: Imp.TExp Int32,
    -- | A transformation from the logical lock index to the
    -- physical position in the array.  This can also be used
    -- to make the lock array smaller.
    Locking -> [TExp Int64] -> [TExp Int64]
lockingMapping :: [Imp.TExp Int64] -> [Imp.TExp Int64]
  }

-- | A function for generating code for an atomic update.  Assumes
-- that the bucket is in-bounds.
type DoAtomicUpdate rep r =
  [VName] -> [Imp.TExp Int64] -> MulticoreGen ()

-- | The mechanism that will be used for performing the atomic update.
-- Approximates how efficient it will be.  Ordered from most to least
-- efficient.
data AtomicUpdate rep r
  = AtomicPrim (DoAtomicUpdate rep r)
  | -- | Can be done by efficient swaps.
    AtomicCAS (DoAtomicUpdate rep r)
  | -- | Requires explicit locking.
    AtomicLocking (Locking -> DoAtomicUpdate rep r)

atomicUpdateLocking ::
  AtomicBinOp ->
  Lambda MCMem ->
  AtomicUpdate MCMem ()
atomicUpdateLocking :: AtomicBinOp -> Lambda MCMem -> AtomicUpdate MCMem ()
atomicUpdateLocking AtomicBinOp
atomicBinOp Lambda MCMem
lam
  | Just [(BinOp, PrimType, VName, VName)]
ops_and_ts <- forall rep.
ASTRep rep =>
Lambda rep -> Maybe [(BinOp, PrimType, VName, VName)]
lamIsBinOp Lambda MCMem
lam,
    forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (\(BinOp
_, PrimType
t, VName
_, VName
_) -> Int -> Bool
supportedPrims forall a b. (a -> b) -> a -> b
$ PrimType -> Int
primBitSize PrimType
t) [(BinOp, PrimType, VName, VName)]
ops_and_ts =
      forall {k} {k} {t :: * -> *} {b} {c} {d} {rep :: k} {r :: k}.
Foldable t =>
t (BinOp, b, c, d) -> DoAtomicUpdate rep r -> AtomicUpdate rep r
primOrCas [(BinOp, PrimType, VName, VName)]
ops_and_ts forall a b. (a -> b) -> a -> b
$ \[VName]
arrs [TExp Int64]
bucket ->
        -- If the operator is a vectorised binary operator on 32-bit values,
        -- we can use a particularly efficient implementation. If the
        -- operator has an atomic implementation we use that, otherwise it
        -- is still a binary operator which can be implemented by atomic
        -- compare-and-swap if 32 bits.
        forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ (forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
arrs [(BinOp, PrimType, VName, VName)]
ops_and_ts) forall a b. (a -> b) -> a -> b
$ \(VName
a, (BinOp
op, PrimType
t, VName
x, VName
y)) -> do
          -- Common variables.
          TV Any
old <- forall {k} rep r op (t :: k).
[Char] -> PrimType -> ImpM rep r op (TV t)
dPrim [Char]
"old" PrimType
t

          (VName
arr', Space
_a_space, Count Elements (TExp Int64)
bucket_offset) <- forall rep r op.
VName
-> [TExp Int64]
-> ImpM rep r op (VName, Space, Count Elements (TExp Int64))
fullyIndexArray VName
a [TExp Int64]
bucket

          case VName
-> VName
-> Count Elements (TExp Int32)
-> BinOp
-> Maybe (Exp -> Multicore)
opHasAtomicSupport (forall {k} (t :: k). TV t -> VName
tvVar TV Any
old) VName
arr' (forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int32 v
sExt32 forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Count Elements (TExp Int64)
bucket_offset) BinOp
op of
            Just Exp -> Multicore
f -> forall op rep r. op -> ImpM rep r op ()
sOp forall a b. (a -> b) -> a -> b
$ Exp -> Multicore
f forall a b. (a -> b) -> a -> b
$ VName -> PrimType -> Exp
Imp.var VName
y PrimType
t
            Maybe (Exp -> Multicore)
Nothing ->
              PrimType
-> VName
-> VName
-> [TExp Int64]
-> VName
-> ImpM MCMem HostEnv Multicore ()
-> ImpM MCMem HostEnv Multicore ()
atomicUpdateCAS PrimType
t VName
a (forall {k} (t :: k). TV t -> VName
tvVar TV Any
old) [TExp Int64]
bucket VName
x forall a b. (a -> b) -> a -> b
$
                VName
x forall rep r op. VName -> Exp -> ImpM rep r op ()
<~~ forall v. BinOp -> PrimExp v -> PrimExp v -> PrimExp v
Imp.BinOpExp BinOp
op (VName -> PrimType -> Exp
Imp.var VName
x PrimType
t) (VName -> PrimType -> Exp
Imp.var VName
y PrimType
t)
  where
    opHasAtomicSupport :: VName
-> VName
-> Count Elements (TExp Int32)
-> BinOp
-> Maybe (Exp -> Multicore)
opHasAtomicSupport VName
old VName
arr' Count Elements (TExp Int32)
bucket' BinOp
bop = do
      let atomic :: (VName -> VName -> Count Elements (TExp Int32) -> a -> AtomicOp)
-> a -> Multicore
atomic VName -> VName -> Count Elements (TExp Int32) -> a -> AtomicOp
f = AtomicOp -> Multicore
Imp.Atomic forall b c a. (b -> c) -> (a -> b) -> a -> c
. VName -> VName -> Count Elements (TExp Int32) -> a -> AtomicOp
f VName
old VName
arr' Count Elements (TExp Int32)
bucket'
      forall {a}.
(VName -> VName -> Count Elements (TExp Int32) -> a -> AtomicOp)
-> a -> Multicore
atomic forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> AtomicBinOp
atomicBinOp BinOp
bop

    primOrCas :: t (BinOp, b, c, d) -> DoAtomicUpdate rep r -> AtomicUpdate rep r
primOrCas t (BinOp, b, c, d)
ops
      | forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all forall {b} {c} {d}. (BinOp, b, c, d) -> Bool
isPrim t (BinOp, b, c, d)
ops = forall {k} {k} (rep :: k) (r :: k).
DoAtomicUpdate rep r -> AtomicUpdate rep r
AtomicPrim
      | Bool
otherwise = forall {k} {k} (rep :: k) (r :: k).
DoAtomicUpdate rep r -> AtomicUpdate rep r
AtomicCAS

    isPrim :: (BinOp, b, c, d) -> Bool
isPrim (BinOp
op, b
_, c
_, d
_) = forall a. Maybe a -> Bool
isJust forall a b. (a -> b) -> a -> b
$ AtomicBinOp
atomicBinOp BinOp
op
atomicUpdateLocking AtomicBinOp
_ Lambda MCMem
op
  | [Prim PrimType
t] <- forall rep. Lambda rep -> [Type]
lambdaReturnType Lambda MCMem
op,
    [LParam MCMem
xp, LParam MCMem
_] <- forall rep. Lambda rep -> [LParam rep]
lambdaParams Lambda MCMem
op,
    Int -> Bool
supportedPrims (PrimType -> Int
primBitSize PrimType
t) = forall {k} {k} (rep :: k) (r :: k).
DoAtomicUpdate rep r -> AtomicUpdate rep r
AtomicCAS forall a b. (a -> b) -> a -> b
$ \[VName
arr] [TExp Int64]
bucket -> do
      TV Any
old <- forall {k} rep r op (t :: k).
[Char] -> PrimType -> ImpM rep r op (TV t)
dPrim [Char]
"old" PrimType
t
      PrimType
-> VName
-> VName
-> [TExp Int64]
-> VName
-> ImpM MCMem HostEnv Multicore ()
-> ImpM MCMem HostEnv Multicore ()
atomicUpdateCAS PrimType
t VName
arr (forall {k} (t :: k). TV t -> VName
tvVar TV Any
old) [TExp Int64]
bucket (forall dec. Param dec -> VName
paramName LParam MCMem
xp) forall a b. (a -> b) -> a -> b
$
        forall dec rep r op. [Param dec] -> Body rep -> ImpM rep r op ()
compileBody' [LParam MCMem
xp] forall a b. (a -> b) -> a -> b
$
          forall rep. Lambda rep -> Body rep
lambdaBody Lambda MCMem
op
atomicUpdateLocking AtomicBinOp
_ Lambda MCMem
op = forall {k} {k} (rep :: k) (r :: k).
(Locking -> DoAtomicUpdate rep r) -> AtomicUpdate rep r
AtomicLocking forall a b. (a -> b) -> a -> b
$ \Locking
locking [VName]
arrs [TExp Int64]
bucket -> do
  TV Int32
old <- forall {k} rep r op (t :: k).
[Char] -> PrimType -> ImpM rep r op (TV t)
dPrim [Char]
"old" PrimType
int32
  TV Int32
continue <- forall {k} (t :: k) rep r op.
[Char] -> PrimType -> TExp t -> ImpM rep r op (TV t)
dPrimVol [Char]
"continue" PrimType
int32 (TExp Int32
0 :: Imp.TExp Int32)

  -- Correctly index into locks.
  (VName
locks', Space
_locks_space, Count Elements (TExp Int64)
locks_offset) <-
    forall rep r op.
VName
-> [TExp Int64]
-> ImpM rep r op (VName, Space, Count Elements (TExp Int64))
fullyIndexArray (Locking -> VName
lockingArray Locking
locking) forall a b. (a -> b) -> a -> b
$ Locking -> [TExp Int64] -> [TExp Int64]
lockingMapping Locking
locking [TExp Int64]
bucket

  -- Critical section
  let try_acquire_lock :: ImpM rep r Multicore ()
try_acquire_lock = do
        TV Int32
old forall {k} (t :: k) rep r op. TV t -> TExp t -> ImpM rep r op ()
<-- (TExp Int32
0 :: Imp.TExp Int32)
        forall op rep r. op -> ImpM rep r op ()
sOp forall b c a. (b -> c) -> (a -> b) -> a -> c
. AtomicOp -> Multicore
Imp.Atomic forall a b. (a -> b) -> a -> b
$
          PrimType
-> VName
-> VName
-> Count Elements (TExp Int32)
-> VName
-> Exp
-> AtomicOp
Imp.AtomicCmpXchg
            PrimType
int32
            (forall {k} (t :: k). TV t -> VName
tvVar TV Int32
old)
            VName
locks'
            (forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int32 v
sExt32 forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Count Elements (TExp Int64)
locks_offset)
            (forall {k} (t :: k). TV t -> VName
tvVar TV Int32
continue)
            (forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped (Locking -> TExp Int32
lockingToLock Locking
locking))
      lock_acquired :: TExp Int32
lock_acquired = forall {k} (t :: k). TV t -> TExp t
tvExp TV Int32
continue
      -- Even the releasing is done with an atomic rather than a
      -- simple write, for memory coherency reasons.
      release_lock :: ImpM rep r Multicore ()
release_lock = do
        TV Int32
old forall {k} (t :: k) rep r op. TV t -> TExp t -> ImpM rep r op ()
<-- Locking -> TExp Int32
lockingToLock Locking
locking
        forall op rep r. op -> ImpM rep r op ()
sOp forall b c a. (b -> c) -> (a -> b) -> a -> c
. AtomicOp -> Multicore
Imp.Atomic forall a b. (a -> b) -> a -> b
$
          PrimType
-> VName
-> VName
-> Count Elements (TExp Int32)
-> VName
-> Exp
-> AtomicOp
Imp.AtomicCmpXchg
            PrimType
int32
            (forall {k} (t :: k). TV t -> VName
tvVar TV Int32
old)
            VName
locks'
            (forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int32 v
sExt32 forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Count Elements (TExp Int64)
locks_offset)
            (forall {k} (t :: k). TV t -> VName
tvVar TV Int32
continue)
            (forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped (Locking -> TExp Int32
lockingToUnlock Locking
locking))

  -- Preparing parameters. It is assumed that the caller has already
  -- filled the arr_params. We copy the current value to the
  -- accumulator parameters.
  let ([Param LetDecMem]
acc_params, [Param LetDecMem]
_arr_params) = forall a. Int -> [a] -> ([a], [a])
splitAt (forall (t :: * -> *) a. Foldable t => t a -> Int
length [VName]
arrs) forall a b. (a -> b) -> a -> b
$ forall rep. Lambda rep -> [LParam rep]
lambdaParams Lambda MCMem
op
      bind_acc_params :: ImpM rep r op ()
bind_acc_params =
        forall rep r op a. ImpM rep r op a -> ImpM rep r op a
everythingVolatile forall a b. (a -> b) -> a -> b
$
          forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"bind lhs" forall a b. (a -> b) -> a -> b
$
            forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ (forall a b. [a] -> [b] -> [(a, b)]
zip [Param LetDecMem]
acc_params [VName]
arrs) forall a b. (a -> b) -> a -> b
$ \(Param LetDecMem
acc_p, VName
arr) ->
              forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (forall dec. Param dec -> VName
paramName Param LetDecMem
acc_p) [] (VName -> SubExp
Var VName
arr) [TExp Int64]
bucket

  let op_body :: ImpM MCMem r op ()
op_body =
        forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"execute operation" forall a b. (a -> b) -> a -> b
$
          forall dec rep r op. [Param dec] -> Body rep -> ImpM rep r op ()
compileBody' [Param LetDecMem]
acc_params forall a b. (a -> b) -> a -> b
$
            forall rep. Lambda rep -> Body rep
lambdaBody Lambda MCMem
op

      do_hist :: ImpM rep r op ()
do_hist =
        forall rep r op a. ImpM rep r op a -> ImpM rep r op a
everythingVolatile forall a b. (a -> b) -> a -> b
$
          forall rep r op. Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"update global result" forall a b. (a -> b) -> a -> b
$
            forall (m :: * -> *) a b c.
Applicative m =>
(a -> b -> m c) -> [a] -> [b] -> m ()
zipWithM_ (forall {rep} {r} {op}.
[TExp Int64] -> VName -> SubExp -> ImpM rep r op ()
writeArray [TExp Int64]
bucket) [VName]
arrs forall a b. (a -> b) -> a -> b
$
              forall a b. (a -> b) -> [a] -> [b]
map (VName -> SubExp
Var forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall dec. Param dec -> VName
paramName) [Param LetDecMem]
acc_params

  -- While-loop: Try to insert your value
  forall rep r op. TExp Bool -> ImpM rep r op () -> ImpM rep r op ()
sWhile (forall {k} (t :: k). TV t -> TExp t
tvExp TV Int32
continue forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TExp Int32
0) forall a b. (a -> b) -> a -> b
$ do
    forall {rep} {r}. ImpM rep r Multicore ()
try_acquire_lock
    forall rep r op. TExp Bool -> ImpM rep r op () -> ImpM rep r op ()
sUnless (TExp Int32
lock_acquired forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TExp Int32
0) forall a b. (a -> b) -> a -> b
$ do
      forall rep (inner :: * -> *) r op.
Mem rep inner =>
[LParam rep] -> ImpM rep r op ()
dLParams [Param LetDecMem]
acc_params
      forall {rep} {r} {op}. ImpM rep r op ()
bind_acc_params
      forall {r} {op}. ImpM MCMem r op ()
op_body
      forall {rep} {r} {op}. ImpM rep r op ()
do_hist
      forall {rep} {r}. ImpM rep r Multicore ()
release_lock
  where
    writeArray :: [TExp Int64] -> VName -> SubExp -> ImpM rep r op ()
writeArray [TExp Int64]
bucket VName
arr SubExp
val = forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
arr [TExp Int64]
bucket SubExp
val []

atomicUpdateCAS ::
  PrimType ->
  VName ->
  VName ->
  [Imp.TExp Int64] ->
  VName ->
  MulticoreGen () ->
  MulticoreGen ()
atomicUpdateCAS :: PrimType
-> VName
-> VName
-> [TExp Int64]
-> VName
-> ImpM MCMem HostEnv Multicore ()
-> ImpM MCMem HostEnv Multicore ()
atomicUpdateCAS PrimType
t VName
arr VName
old [TExp Int64]
bucket VName
x ImpM MCMem HostEnv Multicore ()
do_op = do
  TV Int32
run_loop <- forall {k} (t :: k) rep r op.
[Char] -> TExp t -> ImpM rep r op (TV t)
dPrimV [Char]
"run_loop" (TExp Int32
0 :: Imp.TExp Int32)
  (VName
arr', Space
_a_space, Count Elements (TExp Int64)
bucket_offset) <- forall rep r op.
VName
-> [TExp Int64]
-> ImpM rep r op (VName, Space, Count Elements (TExp Int64))
fullyIndexArray VName
arr [TExp Int64]
bucket

  PrimType
bytes <- Int -> MulticoreGen PrimType
toIntegral forall a b. (a -> b) -> a -> b
$ PrimType -> Int
primBitSize PrimType
t
  let (PrimExp v -> PrimExp v
toBits, PrimExp v -> PrimExp v
fromBits) =
        case PrimType
t of
          FloatType FloatType
Float16 ->
            ( \PrimExp v
v -> forall v. [Char] -> [PrimExp v] -> PrimType -> PrimExp v
Imp.FunExp [Char]
"to_bits16" [PrimExp v
v] PrimType
int16,
              \PrimExp v
v -> forall v. [Char] -> [PrimExp v] -> PrimType -> PrimExp v
Imp.FunExp [Char]
"from_bits16" [PrimExp v
v] PrimType
t
            )
          FloatType FloatType
Float32 ->
            ( \PrimExp v
v -> forall v. [Char] -> [PrimExp v] -> PrimType -> PrimExp v
Imp.FunExp [Char]
"to_bits32" [PrimExp v
v] PrimType
int32,
              \PrimExp v
v -> forall v. [Char] -> [PrimExp v] -> PrimType -> PrimExp v
Imp.FunExp [Char]
"from_bits32" [PrimExp v
v] PrimType
t
            )
          FloatType FloatType
Float64 ->
            ( \PrimExp v
v -> forall v. [Char] -> [PrimExp v] -> PrimType -> PrimExp v
Imp.FunExp [Char]
"to_bits64" [PrimExp v
v] PrimType
int64,
              \PrimExp v
v -> forall v. [Char] -> [PrimExp v] -> PrimType -> PrimExp v
Imp.FunExp [Char]
"from_bits64" [PrimExp v
v] PrimType
t
            )
          PrimType
_ -> (forall a. a -> a
id, forall a. a -> a
id)

      int :: PrimType
int
        | PrimType -> Int
primBitSize PrimType
t forall a. Eq a => a -> a -> Bool
== Int
16 = PrimType
int16
        | PrimType -> Int
primBitSize PrimType
t forall a. Eq a => a -> a -> Bool
== Int
32 = PrimType
int32
        | Bool
otherwise = PrimType
int64

  forall rep r op a. ImpM rep r op a -> ImpM rep r op a
everythingVolatile forall a b. (a -> b) -> a -> b
$ forall rep r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
old [] (VName -> SubExp
Var VName
arr) [TExp Int64]
bucket

  VName
old_bits_v <- forall {k} (t :: k). TV t -> VName
tvVar forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall {k} rep r op (t :: k).
[Char] -> PrimType -> ImpM rep r op (TV t)
dPrim [Char]
"old_bits" PrimType
int
  VName
old_bits_v forall rep r op. VName -> Exp -> ImpM rep r op ()
<~~ forall {v}. PrimExp v -> PrimExp v
toBits (VName -> PrimType -> Exp
Imp.var VName
old PrimType
t)
  let old_bits :: Exp
old_bits = VName -> PrimType -> Exp
Imp.var VName
old_bits_v PrimType
int

  -- While-loop: Try to insert your value
  forall rep r op. TExp Bool -> ImpM rep r op () -> ImpM rep r op ()
sWhile (forall {k} (t :: k). TV t -> TExp t
tvExp TV Int32
run_loop forall {k} v (t :: k).
Eq v =>
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TExp Int32
0) forall a b. (a -> b) -> a -> b
$ do
    VName
x forall rep r op. VName -> Exp -> ImpM rep r op ()
<~~ VName -> PrimType -> Exp
Imp.var VName
old PrimType
t
    ImpM MCMem HostEnv Multicore ()
do_op -- Writes result into x
    forall op rep r. op -> ImpM rep r op ()
sOp forall b c a. (b -> c) -> (a -> b) -> a -> c
. AtomicOp -> Multicore
Imp.Atomic forall a b. (a -> b) -> a -> b
$
      PrimType
-> VName
-> VName
-> Count Elements (TExp Int32)
-> VName
-> Exp
-> AtomicOp
Imp.AtomicCmpXchg
        PrimType
bytes
        VName
old_bits_v
        VName
arr'
        (forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int32 v
sExt32 forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Count Elements (TExp Int64)
bucket_offset)
        (forall {k} (t :: k). TV t -> VName
tvVar TV Int32
run_loop)
        (forall {v}. PrimExp v -> PrimExp v
toBits (VName -> PrimType -> Exp
Imp.var VName
x PrimType
t))
    VName
old forall rep r op. VName -> Exp -> ImpM rep r op ()
<~~ forall {v}. PrimExp v -> PrimExp v
fromBits Exp
old_bits

supportedPrims :: Int -> Bool
supportedPrims :: Int -> Bool
supportedPrims Int
8 = Bool
True
supportedPrims Int
16 = Bool
True
supportedPrims Int
32 = Bool
True
supportedPrims Int
64 = Bool
True
supportedPrims Int
_ = Bool
False

-- Supported bytes lengths by GCC (and clang) compiler
toIntegral :: Int -> MulticoreGen PrimType
toIntegral :: Int -> MulticoreGen PrimType
toIntegral Int
8 = forall (f :: * -> *) a. Applicative f => a -> f a
pure PrimType
int8
toIntegral Int
16 = forall (f :: * -> *) a. Applicative f => a -> f a
pure PrimType
int16
toIntegral Int
32 = forall (f :: * -> *) a. Applicative f => a -> f a
pure PrimType
int32
toIntegral Int
64 = forall (f :: * -> *) a. Applicative f => a -> f a
pure PrimType
int64
toIntegral Int
b = forall a. HasCallStack => [Char] -> a
error forall a b. (a -> b) -> a -> b
$ [Char]
"number of bytes is not supported for CAS - " forall a. [a] -> [a] -> [a]
++ forall a. Pretty a => a -> [Char]
prettyString Int
b