{-# LANGUAGE TypeOperators,
             ScopedTypeVariables,
             TypeFamilies,
             TypeSynonymInstances,
             FlexibleInstances #-}
{-# LANGUAGE BangPatterns #-} 

{-
    Gah!
    Most of this needs to be improved.

    # Add functionality to pass arrays of tuples in and out
      from kernels. 
-} 


module Obsidian.Run.CUDA.Exec ( mkRandomVec
                              , CUDAVector
                              , getDevices
                              , propsSummary
                              , sharedMemConfig
                              , CUDA
                              , KernelT
                              , (<:>)
                              , (<>)
                              , (<==)
                              , syncAll
                              , exec
                              , withCUDA
                              , withCUDA'
                              , destroyCtx
                              , Context(..)
                              , initialise
                              , capture
                              , captureIO
                              , useVector
                              , allocaVector
                              , allocaFillVector
                              , withVector
                              , withFillVector 
                              , allocaVector_
                              , mallocVector
                              , mallocVectorIO
                              , freeVector
                              , freeVectorIO
                              , fill
                              , peekCUDAVector
                              , copyOut
                              , copyInIO
                              , copyOutIO
                              ) where

---------------------------------------------------------------------------
--
-- Low level interface to CUDA functionality from Obsidian
--
---------------------------------------------------------------------------


import qualified Foreign.CUDA.Driver as CUDA

import Obsidian.CodeGen.Reify
import Obsidian.CodeGen.CUDA

-- import Obsidian.Types -- experimental
import Obsidian.Exp hiding (sizeOf)
import Obsidian.Array
import Obsidian.Program (Grid, GProgram)
import Obsidian.Mutable

-- import Foreign.Marshal.Array
import Foreign.ForeignPtr.Unsafe -- (req GHC 7.6 ?)
import Foreign.ForeignPtr 

import qualified Data.Vector.Storable as V
import Foreign.Storable
import Foreign.Ptr

import Data.Word
import Data.Int
import System.Process
import System.Random.MWC

import Control.Monad.State

debug :: Bool
debug = False

---------------------------------------------------------------------------
--
---------------------------------------------------------------------------
instance V.Storable a => V.Storable (Vector4 a) where
  sizeOf _ = sizeOf (undefined :: a) * 4
  alignment _ = alignment (undefined :: a)
 
  {-# INLINE peek #-}
  peek p = do
             a <- peekElemOff q 0
             b <- peekElemOff q 1
             c <- peekElemOff q 2
             d <- peekElemOff q 3
             return (Vector4 a b c d)
    where
      q = castPtr p
  {-# INLINE poke #-}
  poke p (Vector4 a b c d) = do
             pokeElemOff q 0 a
             pokeElemOff q 1 b
             pokeElemOff q 2 c
             pokeElemOff q 3 d
    where
      q = castPtr p

instance V.Storable a => V.Storable (Vector2 a) where
  sizeOf _ = sizeOf (undefined :: a) * 2
  alignment _ = alignment (undefined :: a)
 
  {-# INLINE peek #-}
  peek p = do
             a <- peekElemOff q 0
             b <- peekElemOff q 1
             return (Vector2 a b )
    where
      q = castPtr p
  {-# INLINE poke #-}
  poke p (Vector2 a b) = do
             pokeElemOff q 0 a
             pokeElemOff q 1 b
    where
      q = castPtr p

  
---------------------------------------------------------------------------
-- Tools 
---------------------------------------------------------------------------
-- | Generate a random vector
mkRandomVec :: forall a.(V.Storable a, Variate a) => Int -> IO (V.Vector a)
mkRandomVec k = withSystemRandom $ \g -> uniformVector g k :: IO (V.Vector a) 

---------------------------------------------------------------------------
-- An array located in GPU memory
---------------------------------------------------------------------------
-- | Represents vectors (arrays) in GPU memory.
data CUDAVector a = CUDAVector {cvPtr :: !(CUDA.DevicePtr a),
                                cvLen :: !Word32} 

---------------------------------------------------------------------------
-- Get a list of devices from the CUDA driver
---------------------------------------------------------------------------
-- | Get a list of CUDA devices available in the system 
getDevices :: IO [(CUDA.Device,CUDA.DeviceProperties)]
getDevices = do
  num <- CUDA.count 
  devs <- mapM CUDA.device [0..num-1]
  props <- mapM CUDA.props devs
  return $ zip devs props

---------------------------------------------------------------------------
-- Print a Summary of a device's properties. 
---------------------------------------------------------------------------
-- | A string showing some of the properties os a CUDA device.
propsSummary :: CUDA.DeviceProperties -> String
propsSummary props = unlines
  ["Device Name: " ++ CUDA.deviceName props,
   "Compute Cap: " ++ show (CUDA.computeCapability props),
   "Global Mem:  " ++ show (CUDA.totalGlobalMem props),
   "Shared Mem/Block: " ++ show (CUDA.sharedMemPerBlock props),
   "Registers/Block: "  ++ show (CUDA.regsPerBlock props),
   "Warp Size: " ++ show (CUDA.warpSize props),
   "Max threads/Block: " ++ show (CUDA.maxThreadsPerBlock props),
   "Max threads/MP: " ++ show (CUDA.maxThreadsPerMultiProcessor props),
   "Clock rate: " ++ show (CUDA.clockRate props),
   "Num MP: " ++ show (CUDA.multiProcessorCount props),
   "Mem bus width: " ++ show (CUDA.memBusWidth props)] 

---------------------------------------------------------------------------
-- Extract shared mem config
---------------------------------------------------------------------------
-- | Extract shared memory configuration. This information is used
--   by the memory layout manager.
sharedMemConfig :: CUDA.DeviceProperties -> SharedMemConfig 
sharedMemConfig props = SharedMemConfig sm_bytes num_banks True
  where
    sm_bytes = fromIntegral $ CUDA.sharedMemPerBlock props
    num_banks = if x < 2
                then 16
                else 32              
    (CUDA.Compute x _) = CUDA.computeCapability props
    
--------------------------------------------------------------------------
-- Environment to run CUDA computations in.
--  # Needs to keep track of generated and loaded functions etc. 
---------------------------------------------------------------------------
-- | An environment to perform CUDA computations within.
data CUDAState = CUDAState { csIdent :: Int,

                             csCtx   :: CUDA.Context,
                             csProps :: CUDA.DeviceProperties}

type CUDA a = StateT CUDAState IO a


-- Change so that the type parameter to KernelT
-- represents the "captured" type, with CUDAVectors instead of Pull, Push vectors.

-- | A representation of a CUDA kernel. This represents the
--   kernel after it has been loaded from an Object file. 
data KernelT a = KernelT {ktFun             :: !CUDA.Fun,
                          ktThreadsPerBlock :: !Word32,
                          ktSharedBytes     :: !Word32,
                          ktInputs          :: ![CUDA.FunParam],
                          ktOutput          :: ![CUDA.FunParam] }

---------------------------------------------------------------------------
-- Kernel Input and Output classes
---------------------------------------------------------------------------
class KernelI a where
  type KInput a 
  addInParam :: KernelT (KInput a -> b) -> a -> KernelT b

class KernelM a where
  type KMutable a
  addMutable :: KernelT (KMutable a -> b) -> a -> KernelT b 
  
class KernelO a where
  type KOutput a 
  addOutParam :: KernelT (KOutput a) -> a -> KernelT ()

instance KernelI Int32 where
  type KInput Int32 = Exp Int32
  addInParam (KernelT f t s i o) a =
    KernelT f t s (i ++ [CUDA.IArg $ fromIntegral a]) o

instance KernelI Word32 where
  type KInput Word32 = Exp Word32
  addInParam (KernelT f t s i o) a =
    KernelT f t s (i ++ [CUDA.IArg $ fromIntegral a]) o 

instance Scalar a => KernelI (CUDAVector a) where
  type KInput (CUDAVector a) = DPull (Exp a) 
  addInParam (KernelT f t s i o) b =
    KernelT f t s (i ++ [CUDA.VArg (cvPtr b),
                         CUDA.IArg $ fromIntegral (cvLen b)]) o

instance Scalar a => KernelM (CUDAVector a) where
  type KMutable (CUDAVector a) = Mutable Global EW32 (Exp a) 
  addMutable (KernelT f t s i o) b =
    KernelT f t s (i ++ [CUDA.VArg (cvPtr b),
                         CUDA.IArg $ fromIntegral (cvLen b)]) o

instance Scalar a => KernelO (CUDAVector a) where
  type KOutput (CUDAVector a) = DPush Grid (Exp a) 
  addOutParam (KernelT f t s i o) b =
    KernelT f t s i (o ++ [CUDA.VArg (cvPtr b)])

---------------------------------------------------------------------------
-- (<>) apply a kernel to an input
---------------------------------------------------------------------------
-- | Apply a kernel to an input 
(<>) :: KernelI a
        => (Word32,KernelT (KInput a -> b)) -> a -> (Word32,KernelT b)
(<>) (blocks,kern) a = (blocks,addInParam kern a)

---------------------------------------------------------------------------
-- Assign a mutable input/output to a kernel
---------------------------------------------------------------------------
-- | apply a kernel to a mutable array 
(<:>) :: KernelM a
         => (Word32, KernelT (KMutable a -> b)) -> a -> (Word32, KernelT b)
(<:>) (blocks,kern) a = (blocks, addMutable kern a)


---------------------------------------------------------------------------
-- Execute a kernel and store output to an array
---------------------------------------------------------------------------
-- | Infix operator taking an allocated output array on LHS and
--   a (Number_Of_Blocks,Kernel) pair on the right. 
(<==) :: KernelO b => b -> (Word32, KernelT (KOutput b)) -> CUDA ()
(<==) o (nb,kern) =
  do
    let k = addOutParam kern o
    lift $ CUDA.launchKernel
      (ktFun k)
      (fromIntegral nb,1,1)
      (fromIntegral (ktThreadsPerBlock k), 1, 1)
      (fromIntegral (ktSharedBytes k))
      Nothing -- stream
      (ktInputs k ++ ktOutput k) -- params
    

-- | A variant that returns kernel timing information as well.
-- TODO: We probably can remove this:
-- (<==!) :: KernelO b => b -> (Word32, KernelT (KOutput b)) -> CUDA Word64
-- (<==!) o (nb,kern) =
--   do
--     let k = addOutParam kern o
--     t1 <- lift rdtsc 
--     lift $ CUDA.launchKernel
--       (ktFun k)
--       (fromIntegral nb,1,1)
--       (fromIntegral (ktThreadsPerBlock k), 1, 1)
--       (fromIntegral (ktSharedBytes k))
--       Nothing -- stream
--       (ktInputs k ++ ktOutput k) -- params
--     lift $ CUDA.sync
--     t2 <- lift rdtsc
--     return (t2 - t1)

-- | Wait for all launched work on the device to finish 
syncAll :: CUDA () 
syncAll = lift $ CUDA.sync

-- Tweak these 
infixl 4 <>
infixl 3 <==
-- infixl 3 <==!

---------------------------------------------------------------------------
-- Execute a kernel that has no Output ( a -> GProgram ()) KernelT (GProgram ()) 
---------------------------------------------------------------------------
-- | Execute a kernel that has no specified output.
--   It may write data into a Global Mutable array. 
exec :: (Word32, KernelT (GProgram ())) -> CUDA ()
exec (nb, k) =
 lift $ CUDA.launchKernel
     (ktFun k)
     (fromIntegral nb,1,1)
     (fromIntegral (ktThreadsPerBlock k), 1, 1)
     0 {- (fromIntegral (ktSharedBytes k)) -}
     Nothing -- stream
     (ktInputs k)
---------------------------------------------------------------------------
-- Get a fresh identifier
---------------------------------------------------------------------------
newIdent :: CUDA Int
newIdent =
  do
    i <- gets csIdent
    modify (\s -> s {csIdent = i+1 }) 
    return i

data Context = Context {context :: !CUDA.Context,
                        props :: !CUDA.DeviceProperties} 

initialise :: IO (Context) 
initialise =
  do CUDA.initialise []
     devs <- getDevices
    
     case devs of
         [] -> error "No CUDA device found!"
         (x:_) ->
           do ctx <- CUDA.create (fst x) [CUDA.SchedAuto]
              return (Context ctx (snd x)) 

destroyCtx :: Context -> IO ()
destroyCtx (Context ctx props) = CUDA.destroy ctx

       
withCUDA' :: Context -> CUDA a -> IO a
withCUDA' (Context ctx props)  p =
  do
    (a,_) <- runStateT p (CUDAState 0 ctx props)
    return a

---------------------------------------------------------------------------
-- Run a CUDA computation
---------------------------------------------------------------------------
withCUDA :: CUDA a -> IO a
withCUDA p =
  do
    CUDA.initialise []
    devs <- getDevices
    case devs of
      [] -> error "No CUDA device found!" 
      (x:_) ->
        do 
          !ctx <- CUDA.create (fst x) [CUDA.SchedAuto] 
          (a,_) <- runStateT p (CUDAState 0 ctx (snd x)) 
          CUDA.destroy ctx
          return a

---------------------------------------------------------------------------
-- Capture without an inputlist! 
---------------------------------------------------------------------------    

-- | Compile a program to a CUDA kernel and then load it into memory.
capture :: ToProgram prg
        => Word32  -- ^ Threads per block
        -> prg     -- ^ Program to capture
        -> CUDA (KernelT prg) 
capture threadsPerBlock f =
  do
    i <- newIdent

    props <- return . csProps =<< get
    
    let kn     = "gen" ++ show i
        fn     = kn ++ ".cu"
        --cub    = fn ++ ".cubin"

        sm_conf = sharedMemConfig props
        
        prgstr = genKernelParams sm_conf threadsPerBlock kn f
        header = "#include <stdint.h>\n" -- more includes ? 
      
    
    when debug $ 
      do 
        lift $ putStrLn $ prgstr

    let arch = archStr props
        
    cub_file <- lift $ storeAndCompile arch fn (header ++ prgstr)
    
    cubin <- liftIO $ CUDA.loadFile cub_file
    fun <- liftIO $ CUDA.getFun cubin kn 

    {- After loading the binary into the running process
       can I delete the .cu and the .cu.cubin ? -} 
           
    return $! KernelT fun threadsPerBlock 0 {-bytesShared-} [] []


-- | Compile a program to a CUDA kernel and then load it into memory.
captureIO :: ToProgram prg
             => String  -- ^ name of Kernel (and files stored to disk)
             -> CUDA.DeviceProperties -- ^ deviceproperties to compile for 
             -> Word32  -- ^ Threads per block
             -> prg     -- ^ Program to capture
             -> IO (KernelT prg) 
captureIO nom props threadsPerBlock f =
  do
    let kn     = nom
        fn     = kn ++ ".cu"

        sm_conf = sharedMemConfig props
        
        prgstr = genKernelParams sm_conf threadsPerBlock kn f
        header = "#include <stdint.h>\n" -- more includes ? 
      
    
    when debug $ 
      do 
        putStrLn $ prgstr

    let arch = archStr props
        
    cub_file <- storeAndCompile arch fn (header ++ prgstr)
    
    cubin <- liftIO $ CUDA.loadFile cub_file
    fun <- liftIO $ CUDA.getFun cubin kn 

    {- After loading the binary into the running process
       can I delete the .cu and the .cu.cubin ? -} 
           
    return $! KernelT fun threadsPerBlock 0 {-bytesShared-} [] []


---------------------------------------------------------------------------
-- useVector: Copies a Data.Vector from "Haskell" onto the GPU Global mem 
--------------------------------------------------------------------------- 
useVector :: V.Storable a =>
             V.Vector a -> (CUDAVector a -> CUDA b) -> CUDA b
useVector v f =
  do
    let (hfptr,n) = V.unsafeToForeignPtr0 v
    
    dptr <- lift $ CUDA.mallocArray n
    let hptr = unsafeForeignPtrToPtr hfptr
    lift $ CUDA.pokeArray n hptr dptr
    let cvector = CUDAVector dptr (fromIntegral (V.length v)) 
    b <- f cvector -- dptr
    lift $ CUDA.free dptr
    return b

-- THESE SHOULD BE RENAMED -- 
---------------------------------------------------------------------------
-- allocaVector: allocates room for a vector in the GPU Global mem
---------------------------------------------------------------------------
{-# DEPRECATED allocaVector "Use withVector instead" #-}    
-- | allocate a vector in GPU Device memory
allocaVector :: V.Storable a => 
                Int -> (CUDAVector a -> CUDA b) -> CUDA b                
allocaVector n f =
  do
    dptr <- lift $ CUDA.mallocArray n
    let cvector = CUDAVector dptr (fromIntegral n)
    b <- f cvector -- dptr
    lift $ CUDA.free dptr
    return b

withVector :: V.Storable a =>
              Int -> (CUDAVector a -> CUDA b) -> CUDA b
withVector n f = allocaVector n f 
              
              
---------------------------------------------------------------------------
-- Low level memory allocation/deallocation
---------------------------------------------------------------------------
{-# DEPRECATED allocaVector_ "Use mallocVector instead" #-}    
-- | allocate a vector in GPU Device memory 
allocaVector_ :: V.Storable a =>
                 Int -> CUDA (CUDAVector a)  
allocaVector_ n =
  do
    dptr <- lift $ CUDA.mallocArray n
    return $ CUDAVector dptr (fromIntegral n)

mallocVector :: V.Storable a =>
                Int -> CUDA (CUDAVector a)
mallocVector = allocaVector_

mallocVectorIO :: V.Storable a =>
                  Int -> IO (CUDAVector a)
mallocVectorIO n =
  do
    dptr <- CUDA.mallocArray n
    return $ CUDAVector dptr (fromIntegral n) 

freeVector :: CUDAVector a -> CUDA ()
freeVector (CUDAVector dptr _) = lift $ CUDA.free dptr

freeVectorIO :: CUDAVector a -> IO ()
freeVectorIO (CUDAVector dptr _) = CUDA.free dptr 

copyInIO :: V.Storable a =>
            V.Vector a -> IO (CUDAVector a) 
copyInIO v =
  do
    let (hfptr,n) = V.unsafeToForeignPtr0 v
    
    dptr <- CUDA.mallocArray n
    let hptr = unsafeForeignPtrToPtr hfptr
    CUDA.pokeArray n hptr dptr
    let cvector = CUDAVector dptr (fromIntegral (V.length v)) 
    -- b <- f cvector -- dptr
    -- lift $ CUDA.free dptr
    return cvector

copyOutIO :: V.Storable a => CUDAVector a -> IO (V.Vector a)
copyOutIO (CUDAVector dptr n) =
  do
    (fptr :: ForeignPtr a) <- mallocForeignPtrArray (fromIntegral n)
    let ptr = unsafeForeignPtrToPtr fptr
    CUDA.peekArray (fromIntegral n) dptr ptr
    return $ V.unsafeFromForeignPtr fptr 0 (fromIntegral n)


---------------------------------------------------------------------------
-- Allocate and fill with default value
---------------------------------------------------------------------------
-- | Allocate and Fill a vector in GPU Device memory
allocaFillVector :: V.Storable a => 
                    Int -> a -> (CUDAVector a -> CUDA b) -> CUDA b                
allocaFillVector n a f =
  do
    dptr <- lift $ CUDA.mallocArray n
    lift $ CUDA.memset dptr n a 
    let cvector = CUDAVector dptr (fromIntegral n)
    b <- f cvector -- dptr
    lift $ CUDA.free dptr
    return b 

withFillVector :: V.Storable a => 
                  Int -> a -> (CUDAVector a -> CUDA b) -> CUDA b
withFillVector = allocaFillVector


---------------------------------------------------------------------------
-- Fill a Vector
---------------------------------------------------------------------------
-- | Fill a vector 
fill :: V.Storable a =>
        CUDAVector a -> 
        a -> CUDA ()
fill (CUDAVector dptr n) a =
  lift $ CUDA.memset dptr (fromIntegral n) a 

---------------------------------------------------------------------------
-- Peek in a CUDAVector (Simple "copy back")
---------------------------------------------------------------------------
-- | Copy a vector from the device to the host
--   as a list. (SLOW!)
peekCUDAVector :: V.Storable a => CUDAVector a -> CUDA [a]
peekCUDAVector (CUDAVector dptr n) = 
    lift $ CUDA.peekListArray (fromIntegral n) dptr

-- | Copy a vector from the device to the host. (FAST!)
copyOut :: V.Storable a => CUDAVector a -> CUDA (V.Vector a)
copyOut (CUDAVector dptr n) =
  do
    (fptr :: ForeignPtr a) <- lift $ mallocForeignPtrArray (fromIntegral n)
    let ptr = unsafeForeignPtrToPtr fptr
    lift $ CUDA.peekArray (fromIntegral n) dptr ptr
    return $ V.unsafeFromForeignPtr fptr 0 (fromIntegral n)

---------------------------------------------------------------------------
-- Get the "architecture" of the present CUDA device
---------------------------------------------------------------------------
  
archStr :: CUDA.DeviceProperties -> String
archStr props = "-arch=sm_" ++ archStr' (CUDA.computeCapability props)
  where
    archStr' (CUDA.Compute h l) = show h ++ show l
    
---------------------------------------------------------------------------
-- Compile to Cubin (interface with nvcc)
---------------------------------------------------------------------------
storeAndCompile :: String -> FilePath -> String -> IO FilePath
storeAndCompile arch fp code =
  do
    writeFile fp code
    
    let nfp = fp ++  ".cubin"

    (_,_,_,pid) <-
      createProcess (shell ("nvcc " ++ arch ++ " -cubin -o " ++ nfp ++ " " ++ fp))
    -- This could fail. Should be error check here (waitForProcess).
    _ <- waitForProcess pid
    return nfp