{-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE PackageImports #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {- Joel Svensson 2013 -} module Obsidian.CodeGen.CompileIM where import Language.C.Quote hiding (Block) import Language.C.Quote.CUDA as CU import qualified Language.C.Quote.OpenCL as CL import qualified "language-c-quote" Language.C.Syntax as C import Obsidian.Exp (IExp(..),IBinOp(..),IUnOp(..)) import Obsidian.Types import Obsidian.DimSpec import Obsidian.CodeGen.Program import Control.Monad.State import Data.Word import Data.Int {- TODOs: * Pass a target "platform" to code generator. - CUDA - OpenCL - Sequential C - C with OpenMP ? * rewrite some functions here to use a reader monad. * TODO: Make sure tid always has correct Value -} --------------------------------------------------------------------------- -- Platform --------------------------------------------------------------------------- data Platform = PlatformCUDA | PlatformOpenCL | PlatformC data Config = Config { configThreadsPerBlock :: Word32, configSharedMem :: Word32} --------------------------------------------------------------------------- -- compileExp (maybe a bad name) --------------------------------------------------------------------------- compileExp :: IExp -> Exp compileExp (IVar name t) = [cexp| $id:name |] -- TODO: Fix all this! compileExp (IBlockIdx X) = [cexp| $id:("bid")|] -- [cexp| $id:("blockIdx.x") |] compileExp (IBlockIdx Y) = [cexp| $id:("blockIdx.y") |] compileExp (IBlockIdx Z) = [cexp| $id:("blockIdx.z") |] compileExp (IThreadIdx X) = [cexp| $id:("threadIdx.x") |] compileExp (IThreadIdx Y) = [cexp| $id:("threadIdx.y") |] compileExp (IThreadIdx Z) = [cexp| $id:("threadIdx.z") |] compileExp (IBlockDim X) = [cexp| $id:("blockDim.x") |] compileExp (IBlockDim Y) = [cexp| $id:("blockDim.y") |] compileExp (IBlockDim Z) = [cexp| $id:("blockDim.z") |] compileExp (IGridDim X) = [cexp| $id:("GridDim.x") |] compileExp (IGridDim Y) = [cexp| $id:("GridDim.y") |] compileExp (IGridDim Z) = [cexp| $id:("GridDim.z") |] compileExp (IBool True) = [cexp|1|] compileExp (IBool False) = [cexp|0|] compileExp (IInt8 n) = [cexp| $int:(toInteger n) |] compileExp (IInt16 n) = [cexp| $int:(toInteger n) |] compileExp (IInt32 n) = [cexp| $int:(toInteger n) |] compileExp (IInt64 n) = [cexp| $lint:(toInteger n) |] compileExp (IWord8 n) = [cexp| $uint:(toInteger n) |] compileExp (IWord16 n) = [cexp| $uint:(toInteger n) |] compileExp (IWord32 n) = [cexp| $uint:(toInteger n) |] compileExp (IWord64 n) = [cexp| $ulint:(toInteger n) |] compileExp (IFloat n) = [cexp| $float:(toRational n) |] compileExp (IDouble n) = [cexp| $double:(toRational n) |] compileExp (IIndex (i1,[e]) t) = [cexp| $(compileExp i1)[$(compileExp e)] |] compileExp (ICond e1 e2 e3 t) = [cexp| $(compileExp e1) ? $(compileExp e2) : $(compileExp e3) |] compileExp (IBinOp op e1 e2 t) = go op where x = compileExp e1 y = compileExp e2 go IAdd = [cexp| $x + $y |] go ISub = [cexp| $x - $y |] go IMul = [cexp| $x * $y |] go IDiv = [cexp| $x / $y |] go IMod = [cexp| $x % $y |] go IEq = [cexp| $x == $y |] go INotEq = [cexp| $x != $y |] go ILt = [cexp| $x < $y |] go IGt = [cexp| $x > $y |] go IGEq = [cexp| $x >= $y |] go ILEq = [cexp| $x <= $y |] go IAnd = [cexp| $x && $y |] go IOr = [cexp| $x || $y |] go IPow = case t of Float -> [cexp|powf($x,$y) |] Double -> [cexp|pow($x,$y) |] go IBitwiseAnd = [cexp| $x & $y |] go IBitwiseOr = [cexp| $x | $y |] go IBitwiseXor = [cexp| $x ^ $y |] go IShiftL = [cexp| $x << $y |] go IShiftR = [cexp| $x >> $y |] compileExp (IUnOp op e t) = go op where x = compileExp e go IBitwiseNeg = [cexp| ~$x|] go INot = [cexp| !$x|] compileExp (IFunCall name es t) = [cexp| $fc |] where es' = map compileExp es fc = [cexp| $id:(name)($args:(es')) |] compileExp (ICast e t) = [cexp| ($ty:(compileType t)) $e' |] where e' = compileExp e compileType (Int8) = [cty| typename int8_t |] compileType (Int16) = [cty| typename int16_t |] compileType (Int32) = [cty| typename int32_t |] compileType (Int64) = [cty| typename int64_t |] compileType (Word8) = [cty| typename uint8_t |] compileType (Word16) = [cty| typename uint16_t |] compileType (Word32) = [cty| typename uint32_t |] compileType (Word64) = [cty| typename uint64_t |] compileType (Float) = [cty| float |] compileType (Double) = [cty| double |] compileType (Pointer t) = [cty| $ty:(compileType t)* |] compileType (Volatile t) = [cty| volatile $ty:(compileType t)|] --compileType' (Volatile t) = [cty| volatile $ty:(compileType t)|] --compileType' t = compileType t {- Solve the volatile issue. When operating in a warp without syncs Add Volatile to pointers and only there! Need a way to convey this information all the way from the force functions down to the code generator. Also, Change code generation to declare names of pointers into shared memory at different types (that are used in the program) __shared__ uint8_t sbase[X] uint32_t *sbaseU32 = sbase; float *sbaseF = sbase; This changes how offsets are computed within code. May need names for each intermediate array used in the program __shared__ uint8_t sbase[X] uint32_t *arr1 = sbase; uint32_t *arr2 = (sbase + 16U); float *arr3 = sbase; float *arr4 = (sbase + 4U); -} --------------------------------------------------------------------------- -- ** -- Compile IM -- ** --------------------------------------------------------------------------- -- newtype CInfo a = CInfo (State CInfoState a) -- deriving (Monad, MonadState CInfoState) -- data CInfoState = CInfoState { cInfoTid :: (Bool,Exp), -- cInfoWarpID :: (Bool,Exp), -- cInfoWarpIx :: (Bool,Exp) } -- evalCInfo (CInfo s) = evalState s --------------------------------------------------------------------------- -- Statement t to Stm --------------------------------------------------------------------------- compileStm :: Platform -> Config -> Statement t -> [Stm] compileStm p c (SAssign name [] e) = [[cstm| $(compileExp name) = $(compileExp e);|]] compileStm p c (SAssign name [ix] e) = [[cstm| $(compileExp name)[$(compileExp ix)] = $(compileExp e); |]] compileStm p c (SAtomicOp name ix atop) = case atop of AtInc -> [[cstm| atomicInc(&$(compileExp name)[$(compileExp ix)],0xFFFFFFFF); |]] AtAdd e -> [[cstm| atomicAdd(&$(compileExp name)[$(compileExp ix)],$(compileExp e));|]] AtSub e -> [[cstm| atomicSub(&$(compileExp name)[$(compileExp ix)],$(compileExp e));|]] AtExch e -> [[cstm| atomicExch(&$(compileExp name)[$(compileExp ix)],$(compileExp e));|]] compileStm p c (SCond be im) = [[cstm| if ($(compileExp be)) { $stms:body } |]] where body = compileIM p c im -- (compileIM p c im) compileStm p c (SSeqFor loopVar n im) = [[cstm| for (int $id:loopVar = 0; $id:loopVar < $(compileExp n); ++$id:loopVar) { $stms:body } |]] where body = compileIM p c im -- (compileIM p c im) -- Just relay to specific compileFunction compileStm p c a@(SForAll lvl n im) = compileForAll p c a compileStm p c a@(SDistrPar lvl n im) = compileDistr p c a compileStm p c (SSeqWhile b im) = [[cstm| while ($(compileExp b)) { $stms:body}|]] where body = compileIM p c im compileStm p c SSynchronize = case p of PlatformCUDA -> [[cstm| __syncthreads(); |]] PlatformOpenCL -> [[cstm| barrier(CLK_LOCAL_MEM_FENCE); |]] compileStm _ _ (SAllocate _ _ _) = [] compileStm _ _ (SDeclare name t) = [] compileStm _ _ a = error $ "compileStm: missing case " --------------------------------------------------------------------------- -- DistrPar --------------------------------------------------------------------------- compileDistr :: Platform -> Config -> Statement t -> [Stm] compileDistr PlatformCUDA c (SDistrPar Block n im) = codeQ ++ codeR -- New here is BLOCK virtualisation where cim = compileIM PlatformCUDA c im numBlocks = [cexp| $id:("gridDim.x") |] blocksQ = [cexp| $exp:(compileExp n) / $exp:numBlocks|] blocksR = [cexp| $exp:(compileExp n) % $exp:numBlocks|] codeQ = [[cstm| for (int b = 0; b < $exp:blocksQ; ++b) { $stms:bodyQ }|]] bodyQ = [cstm| $id:("bid") = blockIdx.x * $exp:blocksQ + b;|] : cim ++ [[cstm| bid = blockIdx.x;|], [cstm| __syncthreads();|]] -- yes no ? codeR = [[cstm| bid = ($exp:numBlocks * $exp:blocksQ) + blockIdx.x;|], [cstm| if (blockIdx.x < $exp:blocksR) { $stms:cim }|], [cstm| bid = blockIdx.x;|], [cstm| __syncthreads();|]] -- yes no ? -- Can I be absolutely sure that 'n' here is statically known ? -- I must look over the functions that can potentially create this IM. -- Can make a separate case for unknown 'n' but generate worse code. -- (That is true for all levels) compileDistr PlatformCUDA c (SDistrPar Warp (IWord32 n) im) = codeQ ++ codeR -- Here the 'im' should be distributed over 'n'warps. -- 'im' uses a warpID variable to identify what warp it is. -- 'n' may be higher than the actual number of warps we have! -- So GPU warp virtualisation is needed. where cim = compileIM PlatformCUDA c im nWarps = fromIntegral $ configThreadsPerBlock c `div` 32 numWarps = [cexp| $int:nWarps|] warpsQ = [cexp| $int:(n `div` nWarps)|] warpsR = [cexp| $int:(n `mod` nWarps)|] codeQ = [[cstm| for (int w = 0; w < $exp:warpsQ; ++w) { $stms:bodyQ } |]] bodyQ = [cstm| warpID = (threadIdx.x / 32) * $exp:warpsQ + w;|] : cim ++ --[cstm| warpID = w * $exp:warpsQ + (threadIdx.x / 32);|] : cim ++ [[cstm| warpID = threadIdx.x / 32;|]] codeR = case (n `mod` nWarps) of 0 -> [] n -> [[cstm| warpID = ($exp:numWarps * $exp:warpsQ)+ (threadIdx.x / 32);|], [cstm| if (threadIdx.x / 32 < $exp:warpsR) { $stms:cim } |], [cstm| warpID = threadIdx.x / 32; |], [cstm| __syncthreads();|]] --------------------------------------------------------------------------- -- ForAll is compiled differently for different platforms --------------------------------------------------------------------------- compileForAll :: Platform -> Config -> Statement t -> [Stm] compileForAll PlatformCUDA c (SForAll Warp (IWord32 n) im) = codeQ ++ codeR where nt = 32 q = n `div` nt r = n `mod` nt cim = compileIM PlatformCUDA c im codeQ = case q of 0 -> [] 1 -> cim n -> [[cstm| for ( int vw = 0; vw < $int:q; ++vw) { $stms:body } |], [cstm| $id:("warpIx") = threadIdx.x % 32; |]] -- [cstm| __syncthreads();|]] where body = [cstm|$id:("warpIx") = vw*$int:nt + (threadIdx.x % 32); |] : cim --body = [cstm|$id:("warpIx") = (threadIdx.x % 32) * q + vw; |] : cim codeR = case r of 0 -> [] n -> [[cstm| if ((threadIdx.x % 32) < $int:r) { $id:("warpIx") = $int:(q*32) + (threadIdx.x % 32); $stms:cim } |], -- [cstm| __syncthreads();|], [cstm| $id:("warpIx") = threadIdx.x % 32; |]] compileForAll PlatformCUDA c (SForAll Block (IWord32 n) im) = goQ ++ goR where cim = compileIM PlatformCUDA c im nt = configThreadsPerBlock c q = n `quot` nt r = n `rem` nt -- q is the number full "passes" needed to cover the iteration -- space given we have nt threads. goQ = case q of 0 -> [] 1 -> cim -- [cstm|$id:loopVar = threadIdx.x; |]:cim --do -- stm <- updateTid [cexp| threadIdx.x |] -- return $ [cstm| $id:loopVar = threadIdx.x; |] : cim n -> [[cstm| for ( int i = 0; i < $int:q; ++i) { $stms:body } |], [cstm| $id:("tid") = threadIdx.x; |]] where body = [cstm|$id:("tid") = i*$int:nt + threadIdx.x; |] : cim -- r is the number of elements left. -- This generates code for when fewer threads are -- needed than available. (some threads shut down due to the conditional). goR = case (r,q) of (0,_) -> [] --(n,0) -> [[cstm| if (threadIdx.x < $int:n) { -- $stms:cim } |]] (n,m) -> [[cstm| if (threadIdx.x < $int:n) { $id:("tid") = $int:(q*nt) + threadIdx.x; $stms:cim } |], [cstm| $id:("tid") = threadIdx.x; |]] compileForAll PlatformCUDA c (SForAll Grid n im) = cim -- The grid case is special. May need more thought --error "compileForAll: Grid" where cim = compileIM PlatformCUDA c im compileForAll PlatformC c (SForAll lvl (IWord32 n) im) = go where body = compileIM PlatformC c im go = [ [cstm| for (int i = 0; i <$int:n; ++i) { $stms:body } |] ] --------------------------------------------------------------------------- -- compileWarp (needs fixing so that warpIx and warpID are always correct) --------------------------------------------------------------------------- -- compileWarp :: Platform -> Config -> IExp -> IMList t -> [Stm] -- compileWarp PlatformCUDA c (IWord32 warps) im = -- concatMap (go . fst) im -- where -- go (SAllocate nom n t) = [] -- Skip allocations at this point. Been handled earlier. -- go (SForAll Warp (IWord32 n) im) = -- {- -- warps : number of warps that should execute this code -- n : number of threads per each of these warps -- warps may be a larger number than there are actual warps -- n map be a larger number than 32 (the actual number of threads per warp) -- c : is the configuration. It knows the actual number of threads -- we are generating code for. -- -} -- if (wholeRealWarps <= 0) -- then error "compileWarp: Atleast one full warp of real threads needed!" -- else -- case (wholeRealWarps `compare` warps) of -- {- -- 3 potential outcomes. -- #1 we have more real warps than we need -- #2 we have exactly the amount of warps that we need -- #3 we have fewer real warps than requested. -- -} -- GT -> [[cstm| if (threadIdx.x < $int:(warps*32)) { -- $stms:(goQ ++ goR) } |]] -- EQ -> goQR -- LT -> wQ ++ wR -- where -- cim = compileIM PlatformCUDA c im -- nt = configThreadsPerBlock c -- wholeRealWarps = nt `quot` 32 -- threadsPartialWarp = nt `rem` 32 -- Do not use partial warps! -- -- Maybe something else than s for goR -- -- It will depend on how -- goQR = goQ ++ goR -- threadQ = n `quot` 32 -- Set up virtual threads within warp -- threadR = n `rem` 32 -- -- ###################################################################### -- CLEAN THIS MESS UP ! !!! ! ! ! !! ! !!!! ! !! ! ! ! ! ! ! !!! ! ! ! ! -- ###################################################################### -- -- Compile for warps, potentially with virtual threads -- goQ :: [Stm] -- goQ = case threadQ of -- 0 -> [] -- 1 -> cim -- n -> [[cstm| for (int i = 0; i < $int:threadQ; ++i) { -- $stms:body } |], -- [cstm| $id:("warpIx") = threadIdx.x % 32; |]] -- where -- body = [cstm| $id:("warpIx") = i*32 + threadIdx.x % 32; |] : cim -- goR :: [Stm] -- goR = case threadR of -- 0 -> [] -- _ -> [[cstm| if ( threadIdx.x % 32 < $int:threadR) { -- $id:("warpIx") = $int:(threadQ*32) + (threadIdx.x % 32); -- $stms:cim } |], -- [cstm| $id:("warpIx") = threadIdx.x % 32;|]] -- -- Compile for virtual warps -- warpQ = warps `quot` wholeRealWarps -- wholeRealWarps may be zero! -- warpR = warps `rem` wholeRealWarps -- -- each warp will pretend to be warpQ number of warps. -- -- warp jump size -- -- warpQ is the number of virtual warps that EACH real warp -- -- will pretend to be. -- -- warpR is a number of leftover warps (less than wholeRealWarps) -- -- that need to be processed afterwards. -- wQ = case warpQ of -- 0 -> [] -- 1 -> goQR -- n -> [[cstm| for (int vw = 0; vw < $int:warpQ; ++vw) { $stms:body } |], -- [cstm| $id:("warpID") = threadIdx.x / 32; |]] -- where -- --- vw * wholeRealWarps is incorrect. -- --- -- body = [cstm| $id:("warpID") = vw + ((threadIdx.x / 32)*$int:warpQ); |] : goQR -- wR = case warpR of -- 0 -> [] -- -- Skip one division, -- -- Premature optimisation? CUDA compiler will share the threadIdx.x / 32 result. -- n -> [[cstm| if (threadIdx.x < $int:(n * 32)) { -- $id:("warpID") = $int:(warpQ*wholeRealWarps) + threadIdx.x / 32; -- $stms:(goQR) } |], -- cim -- [cstm| $id:("warpID") = threadIdx.x / 32;|]] --------------------------------------------------------------------------- -- CompileIM to list of Stm --------------------------------------------------------------------------- compileIM :: Platform -> Config -> IMList a -> [Stm] compileIM pform conf im = concatMap ((compileStm pform conf) . fst) im --------------------------------------------------------------------------- -- Generate entire Kernel --------------------------------------------------------------------------- type Parameters = [(String,Obsidian.Types.Type)] compile :: Platform -> Config -> String -> (Parameters,IMList a) -> Definition compile pform config kname (params,im) = go pform where stms = compileIM pform config im ps = compileParams pform params go PlatformCUDA = [cedecl| extern "C" __global__ void $id:kname($params:ps) {$items:cudabody} |] go PlatformOpenCL = [CL.cedecl| __kernel void $id:kname($params:ps) {$stms:stms} |] go PlatformC = [cedecl| extern "C" void $id:kname($params:ps) {$items:cbody} |] cudabody = (if (configSharedMem config > 0) -- then [BlockDecl [cdecl| extern volatile __shared__ typename uint8_t sbase[]; |]] then [BlockDecl [cdecl| __shared__ typename uint8_t sbase[$uint:(configSharedMem config)]; |]] else []) ++ --[BlockDecl [cdecl| typename uint32_t tid = threadIdx.x; |]] ++ --[BlockDecl [cdecl| typename uint32_t warpID = threadIdx.x / 32; |], -- BlockDecl [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] ++ -- [BlockDecl [cdecl| typename uint32_t bid = blockIdx.x; |]] ++ (if (usesGid im) then [BlockDecl [cdecl| typename uint32_t gid = blockIdx.x * blockDim.x + threadIdx.x; |]] else []) ++ (if (usesBid im) then [BlockDecl [cdecl| typename uint32_t bid = blockIdx.x; |]] else []) ++ (if (usesTid im) then [BlockDecl [cdecl| typename uint32_t tid = threadIdx.x; |]] else []) ++ (if (usesWarps im) then [BlockDecl [cdecl| typename uint32_t warpID = threadIdx.x / 32; |], BlockDecl [cdecl| typename uint32_t warpIx = threadIdx.x % 32; |]] else []) ++ -- All variables used will be unique and can be declared -- at the top level concatMap declares im ++ -- Not sure if I am using language.C correctly. -- Maybe compileSTM should create BlockStms ? -- TODO: look how Nikola does it. map BlockStm stms cbody = -- add memory allocation map BlockStm stms -- Declare variables. declares (SDeclare name t,_) = [BlockDecl [cdecl| $ty:(compileType t) $id:name;|]] declares (SCond _ im,_) = concatMap declares im declares (SSeqWhile _ im,_) = concatMap declares im declares (SForAll _ _ im,_) = concatMap declares im declares (SDistrPar _ _ im,_) = concatMap declares im -- declares (SForAllBlocks _ im,_) = concatMap declares im -- declares (SNWarps _ im,_) = concatMap declares im -- declares (SWarpForAll _ im,_) = concatMap declares im declares _ = [] --------------------------------------------------------------------------- -- Parameter lists for functions (kernel head) --------------------------------------------------------------------------- compileParams :: Platform -> Parameters -> [Param] compileParams PlatformOpenCL = map go where go (name,Pointer t) = [CL.cparam| global $ty:(compileType t) $id:name |] go (name, t) = [CL.cparam| $ty:(compileType t) $id:name |] -- C or CUDA compileParams _ = map go where go (name,t) = [cparam| $ty:(compileType t) $id:name |]