{-# LANGUAGE GADTs               #-}
{-# LANGUAGE RecordWildCards     #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TupleSections       #-}
{-# LANGUAGE TypeApplications    #-}
{-# LANGUAGE TypeOperators       #-}
{-# OPTIONS_HADDOCK hide #-}
-- |
-- Module      : Data.Array.Accelerate.LLVM.CodeGen.Permute
-- Copyright   : [2016..2020] The Accelerate Team
-- License     : BSD3
--
-- Maintainer  : Trevor L. McDonell
-- Stability   : experimental
-- Portability : non-portable (GHC extensions)
--

module Data.Array.Accelerate.LLVM.CodeGen.Permute (

  IRPermuteFun(..),
  llvmOfPermuteFun,

  atomicCAS_rmw,
  atomicCAS_cmp,

) where

import Data.Array.Accelerate.AST
import Data.Array.Accelerate.AST.Idx
import Data.Array.Accelerate.AST.LeftHandSide
import Data.Array.Accelerate.AST.Var
import Data.Array.Accelerate.Debug
import Data.Array.Accelerate.Error
import Data.Array.Accelerate.Representation.Type
import Data.Array.Accelerate.Trafo.Substitution
import Data.Array.Accelerate.Type

import Data.Array.Accelerate.LLVM.CodeGen.Environment
import Data.Array.Accelerate.LLVM.CodeGen.Exp
import Data.Array.Accelerate.LLVM.CodeGen.IR
import Data.Array.Accelerate.LLVM.CodeGen.Monad
import Data.Array.Accelerate.LLVM.CodeGen.Sugar
import Data.Array.Accelerate.LLVM.CodeGen.Type
import Data.Array.Accelerate.LLVM.Foreign

import LLVM.AST.Type.AddrSpace
import LLVM.AST.Type.Instruction
import LLVM.AST.Type.Instruction.Atomic
import LLVM.AST.Type.Instruction.RMW                    as RMW
import LLVM.AST.Type.Instruction.Volatile
import LLVM.AST.Type.Name
import LLVM.AST.Type.Operand
import LLVM.AST.Type.Representation

import Control.Applicative
import Data.Constraint                                  ( withDict )
import System.IO.Unsafe
import Prelude


-- | A forward permutation might be specialised to use atomic instructions to
-- perform the read-modify-write of the output array directly, rather than
-- separately acquiring a lock. The basic operation is always provided in case
-- a backend does not support the atomic operation at that type, or if it is
-- executing sequentially.
--
-- For the atomicRMW case, the function is applied to the new value before
-- feeding to the atomic instruction to combine with the old.
--
data IRPermuteFun arch aenv t where
  IRPermuteFun :: { combine   :: IRFun2 arch aenv (e -> e -> e)
                  , atomicRMW :: Maybe
                      ( RMWOperation
                      , IRFun1 arch aenv (e -> e)
                      )
                  }
               -> IRPermuteFun arch aenv (e -> e -> e)
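
-- As an illustrative sketch (the names @defaults@, @project@, and @source@
-- are hypothetical), a forward permutation that resolves collisions with
-- addition,
--
-- > permute (+) defaults project source
--
-- has the combination function @\new old -> new + old@. The analysis in
-- 'llvmOfPermuteFun' below recognises this shape and fills in
-- @atomicRMW = Just (RMW.Add, f)@, where @f@ simply passes the new value
-- through, so a backend can emit a single atomic add rather than falling
-- back to the lock-based 'combine'.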

-- | Analysis and code generation for forward permutation combination function.
--
-- Specialisation for atomic operations is currently limited to direct
-- applications of the function; that is, we don't dig down underneath
-- let-bindings.
--
llvmOfPermuteFun
    :: forall arch aenv e. Foreign arch
    => Fun aenv (e -> e -> e)
    -> Gamma aenv
    -> IRPermuteFun arch aenv (e -> e -> e)
llvmOfPermuteFun fun aenv = IRPermuteFun{..}
  where
    combine   = llvmOfFun2 fun aenv
    atomicRMW
      -- If the old value is not used (i.e. permute const) then we can just
      -- store the new value directly. Since we do not require the return
      -- value we can do this for any scalar value with a regular Store.
      -- However, as we use an unzipped struct-of-array representation for
      -- product types, the multiple store instructions for the different
      -- fields could come from different threads, so we only allow the
      -- non-atomic version if the flag @-ffast-permute-const@ is set.
      --
      | Lam lhs (Lam (LeftHandSideWildcard tp) (Body body)) <- fun
      , True                                                <- fast tp
      , fun'                                                <- llvmOfFun1 (Lam lhs (Body body)) aenv
      = Just (Exchange, fun')

      -- LLVM natively supports atomic operations on integral types only.
      -- However, different targets may support atomic instructions on other
      -- scalar types (for example the NVPTX target supports atomic add and
      -- subtract on floating point values).
      --
      -- Additionally, it is possible to implement atomic instructions using
      -- atomic compare-and-swap, which is likely to be more performant than
      -- the generic spin-lock based approach.
      --
      | Lam lhs@(LeftHandSideSingle _) (Lam (LeftHandSideSingle _) (Body body)) <- fun
      , Just (rmw, x)                                                           <- rmwOp body
      , Just x'                                                                 <- strengthenE latest x
      , fun'                                                                    <- llvmOfFun1 (Lam lhs (Body x')) aenv
      = Just (rmw, fun')

      | otherwise
      = Nothing

    fast :: TypeR e -> Bool
    fast tp
      | TupRsingle{} <- tp = True
      | otherwise          = unsafePerformIO (getFlag fast_permute_const)

    -- XXX: This doesn't work for newtypes because the coercion gets in the
    -- way. This should be generalised to work for product types (e.g.
    -- complex numbers) and take this factor into account as well.
    -- TLM-2019-09-27
    --
    rmwOp :: OpenExp (((),e),e) aenv e -> Maybe (RMWOperation, OpenExp (((),e),e) aenv e)
    rmwOp (PrimApp f xs)
      | PrimAdd{}  <- f = (RMW.Add,) <$> extract xs
      | PrimSub{}  <- f = (RMW.Sub,) <$> extract xs
      | PrimMin{}  <- f = (RMW.Min,) <$> extract xs
      | PrimMax{}  <- f = (RMW.Max,) <$> extract xs
      | PrimBOr{}  <- f = (RMW.Or,)  <$> extract xs
      | PrimBAnd{} <- f = (RMW.And,) <$> extract xs
      | PrimBXor{} <- f = (RMW.Xor,) <$> extract xs
    rmwOp _             = Nothing

    -- Determine which argument to a binary function was the new value being
    -- combined. This only works when the old value is used unmodified, but
    -- that is sufficient for us because otherwise it would not be suitable
    -- for the atomic update operation.
    --
    -- In the permutation function, the old value is given as the second
    -- argument, corresponding to ZeroIdx.
    --
    extract :: OpenExp (((),e),e) aenv (e,e) -> Maybe (OpenExp (((),e),e) aenv e)
    extract (Pair x y)
      | Evar (Var _ ZeroIdx) <- x = Just y
      | Evar (Var _ ZeroIdx) <- y = Just x
    extract _
      = Nothing

    -- Used with 'strengthenE' to ensure that the expression does not make use
    -- of the old value except in the combination function.
    latest :: (((),e),e) :?> ((),e)
    latest ZeroIdx      = Nothing
    latest (SuccIdx ix) = Just ix
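
-- As an illustration of the analysis above (not exhaustive), both of the
-- following combination functions are recognised and mapped to 'RMW.Add',
-- because the old value (ZeroIdx, the second argument) appears unmodified on
-- one side of the operator:
--
-- > \new old -> new + old
-- > \new old -> old + new
--
-- whereas something like @\new old -> new + old + 1@ is not: the outermost
-- application no longer has the old value as a direct argument, so 'rmwOp'
-- returns 'Nothing' and only the generic 'combine' function is used.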

-- Implementation of atomic RMW operation (e.g. (+), (-)) using atomic
-- compare-and-swap instructions, for targets which do not support the native
-- instruction at this type but do support CAS at this bit width.
--
-- > void casAdd(double *addr, double val)
-- > {
-- >     uint64_t* addr_i = reinterpret_cast<uint64_t*>(addr);
-- >     uint64_t  old    = *addr_i;
-- >
-- >     do {
-- >         uint64_t expected = old;
-- >         uint64_t new      = reinterpret_cast<uint64_t>(val + reinterpret_cast<double>(expected));
-- >
-- >         old = atomicCAS(addr_i, expected, new);
-- >     }
-- >     while (old != expected);
-- > }
--
atomicCAS_rmw
    :: forall arch e. HasCallStack
    => SingleType e
    -> (Operands e -> CodeGen arch (Operands e))
    -> Operand (Ptr e)
    -> CodeGen arch ()
atomicCAS_rmw t update addr =
  case t of
    NumSingleType (FloatingNumType f) -> floating f
    NumSingleType (IntegralNumType i) -> integral i
  where
    floating :: FloatingType t -> CodeGen arch ()
    floating TypeHalf{}   = atomicCAS_rmw' t (integralType :: IntegralType Word16) update addr
    floating TypeFloat{}  = atomicCAS_rmw' t (integralType :: IntegralType Word32) update addr
    floating TypeDouble{} = atomicCAS_rmw' t (integralType :: IntegralType Word64) update addr

    integral :: IntegralType t -> CodeGen arch ()
    integral i            = atomicCAS_rmw' t i update addr

atomicCAS_rmw'
    :: HasCallStack
    => SingleType t
    -> IntegralType i
    -> (Operands t -> CodeGen arch (Operands t))
    -> Operand (Ptr t)
    -> CodeGen arch ()
atomicCAS_rmw' t i update addr = withDict (integralElt i) $ do
  let si = SingleScalarType (NumSingleType (IntegralNumType i))
  --
  spin  <- newBlock "rmw.spin"
  exit  <- newBlock "rmw.exit"

  -- Reinterpret the address as an unsigned integer of the same width and read
  -- the initial value.
  addr' <- instr' $ PtrCast (PtrPrimType (ScalarPrimType si) defaultAddrSpace) addr
  init' <- instr' $ Load si NonVolatile addr'
  old'  <- fresh  $ TupRsingle si
  top   <- br spin

  -- Apply the update function to the current value and attempt to swap it in
  -- with compare-and-swap, retrying until the exchange succeeds.
  setBlock spin
  old   <- instr' $ BitCast (SingleScalarType t) (op i old')
  val   <- update $ ir t old
  val'  <- instr' $ BitCast si (op t val)
  r     <- instr' $ CmpXchg i NonVolatile addr' (op i old') val' (CrossThread, AcquireRelease) Monotonic
  done  <- instr' $ ExtractValue scalarType PairIdxRight r
  next' <- instr' $ ExtractValue si PairIdxLeft r

  -- Since we removed Bool from the set of primitive types Accelerate
  -- supports, we have to do a small hack to have LLVM consider this as its
  -- correct type of a 1-bit integer (rather than the 8-bits it is actually
  -- stored as)
  done' <- case done of
             LocalReference _ (UnName n) -> return $ OP_Bool (LocalReference type' (UnName n))
             _                           -> internalError "expected unnamed local reference"

  bot   <- cbr done' exit spin
  _     <- phi' (TupRsingle si) spin old' [(ir i init',top), (ir i next',bot)]

  setBlock exit
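
-- For example, a target without a native floating-point 'RMW.Add' could fall
-- back to this compare-and-swap loop roughly as follows (a sketch only; it
-- assumes an addition operator @add@ of type
-- @NumType e -> Operands e -> Operands e -> CodeGen arch (Operands e)@, such
-- as the one provided by the code generator's arithmetic primitives, and a
-- new value @x :: Operands Double@ already in scope):
--
-- > atomicAddDouble :: Operand (Ptr Double) -> Operands Double -> CodeGen arch ()
-- > atomicAddDouble addr x =
-- >   atomicCAS_rmw (NumSingleType (FloatingNumType TypeDouble))
-- >                 (\old -> add (FloatingNumType TypeDouble) x old)
-- >                 addr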

-- Implementation of atomic comparison operators (i.e. min, max) using
-- compare-and-swap, for targets which do not support the native instruction
-- at this type but do support CAS at this bit width. The old value is
-- discarded.
--
-- For example, atomicMin is implemented similarly to the following (however
-- the loop condition is more complex):
--
-- > void casMin(double *addr, double val)
-- > {
-- >     double    old    = *addr;
-- >     uint64_t  val_i  = reinterpret_cast<uint64_t>(val);
-- >     uint64_t* addr_i = reinterpret_cast<uint64_t*>(addr);
-- >
-- >     while (val < old) {
-- >         uint64_t assumed_i = reinterpret_cast<uint64_t>(old);
-- >         uint64_t old_i     = atomicCAS(addr_i, assumed_i, val_i);
-- >         old                = reinterpret_cast<double>(old_i);
-- >     }
-- > }
--
-- If the function returns 'True', then the given value should be written to
-- the address.
--
atomicCAS_cmp
    :: forall arch e. HasCallStack
    => SingleType e
    -> (SingleType e -> Operands e -> Operands e -> CodeGen arch (Operands Bool))
    -> Operand (Ptr e)
    -> Operand e
    -> CodeGen arch ()
atomicCAS_cmp t cmp addr val =
  case t of
    NumSingleType (FloatingNumType f) -> floating f
    NumSingleType (IntegralNumType i) -> integral i
  where
    floating :: FloatingType t -> CodeGen arch ()
    floating TypeHalf{}   = atomicCAS_cmp' t (integralType :: IntegralType Word16) cmp addr val
    floating TypeFloat{}  = atomicCAS_cmp' t (integralType :: IntegralType Word32) cmp addr val
    floating TypeDouble{} = atomicCAS_cmp' t (integralType :: IntegralType Word64) cmp addr val

    integral :: IntegralType t -> CodeGen arch ()
    integral i            = atomicCAS_cmp' t i cmp addr val

atomicCAS_cmp'
    :: HasCallStack
    => SingleType t          -- actual type of elements
    -> IntegralType i        -- unsigned integral type of same bit size as 't'
    -> (SingleType t -> Operands t -> Operands t -> CodeGen arch (Operands Bool))
    -> Operand (Ptr t)
    -> Operand t
    -> CodeGen arch ()
atomicCAS_cmp' t i cmp addr val = withDict (singleElt t) $ do
  let si = SingleScalarType (NumSingleType (IntegralNumType i))
  --
  test  <- newBlock "cas.cmp"
  spin  <- newBlock "cas.retry"
  exit  <- newBlock "cas.exit"

  -- The new value and address to swap cast to integral type
  addr' <- instr' $ PtrCast (PtrPrimType (ScalarPrimType si) defaultAddrSpace) addr
  val'  <- instr' $ BitCast si val
  old   <- fresh  $ TupRsingle $ SingleScalarType t

  -- Read the current value at the address
  start <- instr' $ Load (SingleScalarType t) NonVolatile addr
  top   <- br test

  -- Compare the new value with the current contents at that memory slot. If
  -- the comparison fails (e.g. we are computing atomicMin but the new value
  -- is already larger than the current value) then exit.
  setBlock test
  yes   <- cmp t (ir t val) old
  _     <- cbr yes spin exit

  -- Attempt to exchange the memory at this location with the new value. The
  -- CmpXchg instruction returns the old value together with a flag indicating
  -- whether or not the swap occurred. If the swap is successful we are done,
  -- otherwise reapply the comparison value with the newly acquired value.
  setBlock spin
  old'  <- instr' $ BitCast si (op t old)
  r     <- instr' $ CmpXchg i NonVolatile addr' old' val' (CrossThread, AcquireRelease) Monotonic
  done  <- instr' $ ExtractValue scalarType PairIdxRight r
  next  <- instr' $ ExtractValue si PairIdxLeft r
  next' <- instr' $ BitCast (SingleScalarType t) next

  -- As in 'atomicCAS_rmw'', rebuild the success flag at the 1-bit integer
  -- type that LLVM expects for a conditional branch.
  done' <- case done of
             LocalReference _ (UnName n) -> return $ OP_Bool (LocalReference type' (UnName n))
             _                           -> internalError "expected unnamed local reference"

  bot   <- cbr done' exit test
  _     <- phi' (TupRsingle $ SingleScalarType t) test old [(ir t start,top), (ir t next',bot)]

  setBlock exit
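
-- For example, a target without a native atomic minimum on 'Double' could
-- fall back to this routine roughly as follows (a sketch only; it assumes a
-- comparison operator @lt@ of type
-- @SingleType e -> Operands e -> Operands e -> CodeGen arch (Operands Bool)@,
-- such as the one provided by the code generator's arithmetic primitives):
--
-- > atomicMinDouble :: Operand (Ptr Double) -> Operand Double -> CodeGen arch ()
-- > atomicMinDouble addr val =
-- >   atomicCAS_cmp (NumSingleType (FloatingNumType TypeDouble)) lt addr val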