{-# LANGUAGE GADTs #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeApplications #-} {-# LANGUAGE TypeOperators #-} -- | -- Module : Data.Array.Accelerate.LLVM.Native.CodeGen.Fold -- Copyright : [2014..2020] The Accelerate Team -- License : BSD3 -- -- Maintainer : Trevor L. McDonell -- Stability : experimental -- Portability : non-portable (GHC extensions) -- module Data.Array.Accelerate.LLVM.Native.CodeGen.Fold where import Data.Array.Accelerate.Representation.Array import Data.Array.Accelerate.Representation.Shape import Data.Array.Accelerate.Representation.Type import Data.Array.Accelerate.Type import Data.Array.Accelerate.LLVM.CodeGen.Arithmetic as A import Data.Array.Accelerate.LLVM.CodeGen.Array import Data.Array.Accelerate.LLVM.CodeGen.Base import Data.Array.Accelerate.LLVM.CodeGen.Constant import Data.Array.Accelerate.LLVM.CodeGen.Environment import Data.Array.Accelerate.LLVM.CodeGen.Exp import Data.Array.Accelerate.LLVM.CodeGen.IR import Data.Array.Accelerate.LLVM.CodeGen.Monad import Data.Array.Accelerate.LLVM.CodeGen.Sugar import Data.Array.Accelerate.LLVM.Compile.Cache import Data.Array.Accelerate.LLVM.Native.CodeGen.Base import Data.Array.Accelerate.LLVM.Native.CodeGen.Generate import Data.Array.Accelerate.LLVM.Native.CodeGen.Loop import Data.Array.Accelerate.LLVM.Native.Target ( Native ) import Control.Applicative import Prelude as P hiding ( length ) -- Reduce an array along the innermost dimension. The reduction -- function must be associative to allow for an efficient parallel -- implementation. When an initial value is given, the input can be -- empty. The initial element does not need to be a neutral element of -- the operator. When no initial value is given, the array must be -- non-empty -- mkFold :: UID -> Gamma aenv -> ArrayR (Array sh e) -> IRFun2 Native aenv (e -> e -> e) -> Maybe (IRExp Native aenv e) -> MIRDelayed Native aenv (Array (sh, Int) e) -> CodeGen Native (IROpenAcc Native aenv (Array sh e)) mkFold uid aenv aR f z arr = (+++) <$> case aR of ArrayR ShapeRz eR -> mkFoldAll uid aenv eR f z arr _ -> mkFoldDim uid aenv aR f z arr <*> case z of Just z' -> mkFoldFill uid aenv aR z' Nothing -> return (IROpenAcc []) -- Reduce a multidimensional (>1) array along the innermost dimension. -- -- For simplicity, each element of the output (reduction along the entire length -- of an innermost-dimension index) is computed by a single thread. -- mkFoldDim :: UID -> Gamma aenv -> ArrayR (Array sh e) -> IRFun2 Native aenv (e -> e -> e) -> MIRExp Native aenv e -> MIRDelayed Native aenv (Array (sh, Int) e) -> CodeGen Native (IROpenAcc Native aenv (Array sh e)) mkFoldDim uid aenv aR@(ArrayR shR eR) combine mseed mdelayed = let (start, end, paramGang) = gangParam shR (arrOut, paramOut) = mutableArray aR "out" (arrIn, paramIn) = delayedArray "in" mdelayed paramEnv = envParam aenv zero = liftInt 0 in makeOpenAcc uid "fold" (paramGang ++ paramOut ++ paramIn ++ paramEnv) $ do sz <- indexHead <$> delayedExtent arrIn imapNestFromTo shR start end (irArrayShape arrOut) $ \ix i -> do r <- case mseed of Just seed -> do z <- seed reduceFromTo eR zero sz (app2 combine) z (app1 (delayedIndex arrIn) . indexCons ix) Nothing -> reduce1FromTo eR zero sz (app2 combine) (app1 (delayedIndex arrIn) . indexCons ix) writeArray TypeInt arrOut i r return_ -- Reduce an array to single element. -- -- Since reductions consume arrays that have been fused into them, -- a parallel fold requires two passes. At an example, take vector dot -- product: -- -- > dotp xs ys = fold (+) 0 (zipWith (*) xs ys) -- -- 1. The first pass reads in the fused array data, in this case corresponding -- to the function (\i -> (xs!i) * (ys!i)). -- -- 2. The second pass reads in the manifest array data from the first step and -- directly reduces the array. This second step should be small and so is -- usually just done by a single core. -- -- Note that the first step is split into two kernels, the second of which -- reads a carry-in value of that thread's partial reduction, so that -- threads can still participate in work-stealing. These kernels must not -- be invoked over empty ranges. -- -- The final step is sequential reduction of the partial results. If this -- is an exclusive reduction, the seed element is included at this point. -- mkFoldAll :: UID -> Gamma aenv -- ^ array environment -> TypeR e -> IRFun2 Native aenv (e -> e -> e) -- ^ combination function -> MIRExp Native aenv e -- ^ seed element, if this is an exclusive reduction -> MIRDelayed Native aenv (Vector e) -- ^ input data -> CodeGen Native (IROpenAcc Native aenv (Scalar e)) mkFoldAll uid aenv eR combine mseed mdelayed = foldr1 (+++) <$> sequence [ mkFoldAllS uid aenv eR combine mseed mdelayed , mkFoldAllP1 uid aenv eR combine mdelayed , mkFoldAllP2 uid aenv eR combine mseed ] -- Sequential reduction of an entire array to a single element -- mkFoldAllS :: UID -> Gamma aenv -- ^ array environment -> TypeR e -> IRFun2 Native aenv (e -> e -> e) -- ^ combination function -> MIRExp Native aenv e -- ^ seed element, if this is an exclusive reduction -> MIRDelayed Native aenv (Vector e) -- ^ input data -> CodeGen Native (IROpenAcc Native aenv (Scalar e)) mkFoldAllS uid aenv eR combine mseed mdelayed = let (start, end, paramGang) = gangParam dim1 (arrOut, paramOut) = mutableArray (ArrayR dim0 eR) "out" (arrIn, paramIn) = delayedArray "in" mdelayed paramEnv = envParam aenv zero = liftInt 0 in makeOpenAcc uid "foldAllS" (paramGang ++ paramOut ++ paramIn ++ paramEnv) $ do r <- case mseed of Just seed -> do z <- seed reduceFromTo eR (indexHead start) (indexHead end) (app2 combine) z (app1 (delayedLinearIndex arrIn)) Nothing -> reduce1FromTo eR (indexHead start) (indexHead end) (app2 combine) (app1 (delayedLinearIndex arrIn)) writeArray TypeInt arrOut zero r return_ -- Parallel reduction of an entire array to a single element, step 1. -- -- Threads reduce each stripe of the input into a temporary array, incorporating -- any fused functions on the way. -- mkFoldAllP1 :: UID -> Gamma aenv -- ^ array environment -> TypeR e -> IRFun2 Native aenv (e -> e -> e) -- ^ combination function -> MIRDelayed Native aenv (Vector e) -- ^ input data -> CodeGen Native (IROpenAcc Native aenv (Scalar e)) mkFoldAllP1 uid aenv eR combine mdelayed = let (start, end, paramGang) = gangParam dim1 (arrTmp, paramTmp) = mutableArray (ArrayR dim1 eR) "tmp" (arrIn, paramIn) = delayedArray "in" mdelayed piece = local (TupRsingle scalarTypeInt) "ix.piece" paramPiece = parameter (TupRsingle scalarTypeInt) "ix.piece" paramEnv = envParam aenv in makeOpenAcc uid "foldAllP1" (paramGang ++ paramPiece ++ paramTmp ++ paramIn ++ paramEnv) $ do -- A thread reduces a sequential (non-empty) stripe of the input and stores -- that value into a temporary array at a specific index. This method thus -- supports non-commutative operators because the order of operations -- remains left-to-right. -- r <- reduce1FromTo eR (indexHead start) (indexHead end) (app2 combine) (app1 (delayedLinearIndex arrIn)) writeArray TypeInt arrTmp piece r return_ -- Parallel reduction of an entire array to a single element, step 2. -- -- A single thread reduces the temporary array to a single element. -- -- During execution, we choose a stripe size in phase 1 so that the temporary is -- small-ish and thus suitable for sequential reduction. An alternative would be -- to keep the stripe size constant and, for if the partial reductions array is -- large, continuing reducing it in parallel. -- mkFoldAllP2 :: UID -> Gamma aenv -- ^ array environment -> TypeR e -> IRFun2 Native aenv (e -> e -> e) -- ^ combination function -> MIRExp Native aenv e -- ^ seed element, if this is an exclusive reduction -> CodeGen Native (IROpenAcc Native aenv (Scalar e)) mkFoldAllP2 uid aenv eR combine mseed = let (start, end, paramGang) = gangParam dim1 (arrTmp, paramTmp) = mutableArray (ArrayR dim1 eR) "tmp" (arrOut, paramOut) = mutableArray (ArrayR dim0 eR) "out" paramEnv = envParam aenv zero = liftInt 0 in makeOpenAcc uid "foldAllP2" (paramGang ++ paramTmp ++ paramOut ++ paramEnv) $ do r <- case mseed of Just seed -> do z <- seed reduceFromTo eR (indexHead start) (indexHead end) (app2 combine) z (readArray TypeInt arrTmp) Nothing -> reduce1FromTo eR (indexHead start) (indexHead end) (app2 combine) (readArray TypeInt arrTmp) writeArray TypeInt arrOut zero r return_ -- Exclusive reductions over empty arrays (of any dimension) fill the lower -- dimensions with the initial element -- mkFoldFill :: UID -> Gamma aenv -> ArrayR (Array sh e) -> IRExp Native aenv e -> CodeGen Native (IROpenAcc Native aenv (Array sh e)) mkFoldFill uid aenv aR seed = mkGenerate uid aenv aR (IRFun1 (const seed)) -- Reduction loops -- --------------- -- Reduction of a (possibly empty) index space. -- reduceFromTo :: TypeR e -> Operands Int -- ^ starting index -> Operands Int -- ^ final index (exclusive) -> (Operands e -> Operands e -> CodeGen Native (Operands e)) -- ^ combination function -> Operands e -- ^ initial value -> (Operands Int -> CodeGen Native (Operands e)) -- ^ function to retrieve element at index -> CodeGen Native (Operands e) reduceFromTo eR m n f z get = iterFromTo eR m n z $ \i acc -> do x <- get i y <- f acc x return y -- Reduction of an array over a _non-empty_ index space. The array must -- contain at least one element. -- reduce1FromTo :: TypeR e -> Operands Int -- ^ starting index -> Operands Int -- ^ final index -> (Operands e -> Operands e -> CodeGen Native (Operands e)) -- ^ combination function -> (Operands Int -> CodeGen Native (Operands e)) -- ^ function to retrieve element at index -> CodeGen Native (Operands e) reduce1FromTo eR m n f get = do z <- get m m1 <- add numType m (ir numType (num numType 1)) reduceFromTo eR m1 n f z get