-- GENERATED by C->Haskell Compiler, version 0.28.2 Switcheroo, 1 April 2016 (Haskell)
-- Edit the ORIGNAL .chs file instead!


{-# LINE 1 "src/Foreign/CUDA/Runtime/Marshal.chs" #-}
{-# LANGUAGE BangPatterns             #-}
{-# LANGUAGE EmptyDataDecls           #-}
{-# LANGUAGE ForeignFunctionInterface #-}
{-# LANGUAGE TemplateHaskell          #-}
--------------------------------------------------------------------------------
-- |
-- Module    : Foreign.CUDA.Runtime.Marshal
-- Copyright : [2009..2017] Trevor L. McDonell
-- License   : BSD
--
-- Memory management for CUDA devices
--
--------------------------------------------------------------------------------

module Foreign.CUDA.Runtime.Marshal (

  -- * Host Allocation
  AllocFlag(..),
  mallocHostArray, freeHost,

  -- * Device Allocation
  mallocArray, allocaArray, free,

  -- * Unified Memory Allocation
  AttachFlag(..),
  mallocManagedArray,

  -- * Marshalling
  peekArray, peekArrayAsync, peekArray2D, peekArray2DAsync, peekListArray,
  pokeArray, pokeArrayAsync, pokeArray2D, pokeArray2DAsync, pokeListArray,
  copyArray, copyArrayAsync, copyArray2D, copyArray2DAsync,

  -- * Combined Allocation and Marshalling
  newListArray,  newListArrayLen,
  withListArray, withListArrayLen,

  -- * Utility
  memset

) where
import qualified Foreign.C.Types as C2HSImp
import qualified Foreign.Ptr as C2HSImp





{-# LINE 43 "src/Foreign/CUDA/Runtime/Marshal.chs" #-}


-- Friends
import Foreign.CUDA.Ptr
import Foreign.CUDA.Runtime.Error
import Foreign.CUDA.Runtime.Stream
import Foreign.CUDA.Internal.C2HS

-- System
import Data.Int
import Data.Maybe
import Control.Exception

import Foreign.C
import Foreign.Ptr
import Foreign.Storable
import qualified Foreign.Marshal as F





--------------------------------------------------------------------------------
-- Host Allocation
--------------------------------------------------------------------------------

-- |
-- Options for host allocation
--
data AllocFlag = Portable
               | DeviceMapped
               | WriteCombined
  deriving (Eq,Show,Bounded)
instance Enum AllocFlag where
  succ Portable = DeviceMapped
  succ DeviceMapped = WriteCombined
  succ WriteCombined = error "AllocFlag.succ: WriteCombined has no successor"

  pred DeviceMapped = Portable
  pred WriteCombined = DeviceMapped
  pred Portable = error "AllocFlag.pred: Portable has no predecessor"

  enumFromTo from to = go from
    where
      end = fromEnum to
      go v = case compare (fromEnum v) end of
                 LT -> v : go (succ v)
                 EQ -> [v]
                 GT -> []

  enumFrom from = enumFromTo from WriteCombined

  fromEnum Portable = 1
  fromEnum DeviceMapped = 2
  fromEnum WriteCombined = 4

  toEnum 1 = Portable
  toEnum 2 = DeviceMapped
  toEnum 4 = WriteCombined
  toEnum unmatched = error ("AllocFlag.toEnum: Cannot match " ++ show unmatched)

{-# LINE 88 "src/Foreign/CUDA/Runtime/Marshal.chs" #-}



-- |
-- Allocate a section of linear memory on the host which is page-locked and
-- directly accessible from the device. The storage is sufficient to hold the
-- given number of elements of a storable type. The runtime system automatically
-- accelerates calls to functions such as 'peekArrayAsync' and 'pokeArrayAsync'
-- that refer to page-locked memory.
--
-- Note that since the amount of pageable memory is thusly reduced, overall
-- system performance may suffer. This is best used sparingly to allocate
-- staging areas for data exchange
--
{-# INLINEABLE mallocHostArray #-}
mallocHostArray :: Storable a => [AllocFlag] -> Int -> IO (HostPtr a)
mallocHostArray !flags = doMalloc undefined
  where
    doMalloc :: Storable a' => a' -> Int -> IO (HostPtr a')
    doMalloc x !n = resultIfOk =<< cudaHostAlloc (fromIntegral n * fromIntegral (sizeOf x)) flags

{-# INLINE cudaHostAlloc #-}
cudaHostAlloc :: (Int64) -> ([AllocFlag]) -> IO ((Status), (HostPtr a))
cudaHostAlloc a2 a3 =
  alloca' $ \a1' -> 
  let {a2' = cIntConv a2} in 
  let {a3' = combineBitMasks a3} in 
  cudaHostAlloc'_ a1' a2' a3' >>= \res ->
  let {res' = cToEnum res} in
  hptr  a1'>>= \a1'' -> 
  return (res', a1'')

{-# LINE 113 "src/Foreign/CUDA/Runtime/Marshal.chs" #-}

  where
    alloca' !f = F.alloca $ \ !p -> poke p nullPtr >> f (castPtr p)
    hptr !p    = (HostPtr . castPtr) `fmap` peek p


-- |
-- Free page-locked host memory previously allocated with 'mallecHost'
--
{-# INLINEABLE freeHost #-}
freeHost :: HostPtr a -> IO ()
freeHost !p = nothingIfOk =<< cudaFreeHost p

{-# INLINE cudaFreeHost #-}
cudaFreeHost :: (HostPtr a) -> IO ((Status))
cudaFreeHost a1 =
  let {a1' = hptr a1} in 
  cudaFreeHost'_ a1' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 128 "src/Foreign/CUDA/Runtime/Marshal.chs" #-}

  where hptr = castPtr . useHostPtr


--------------------------------------------------------------------------------
-- Device Allocation
--------------------------------------------------------------------------------

-- |
-- Allocate a section of linear memory on the device, and return a reference to
-- it. The memory is sufficient to hold the given number of elements of storable
-- type. It is suitable aligned, and not cleared.
--
{-# INLINEABLE mallocArray #-}
mallocArray :: Storable a => Int -> IO (DevicePtr a)
mallocArray = doMalloc undefined
  where
    doMalloc :: Storable a' => a' -> Int -> IO (DevicePtr a')
    doMalloc x !n = resultIfOk =<< cudaMalloc (fromIntegral n * fromIntegral (sizeOf x))

{-# INLINE cudaMalloc #-}
cudaMalloc :: (Int64) -> IO ((Status), (DevicePtr a))
cudaMalloc a2 =
  alloca' $ \a1' -> 
  let {a2' = cIntConv a2} in 
  cudaMalloc'_ a1' a2' >>= \res ->
  let {res' = cToEnum res} in
  dptr  a1'>>= \a1'' -> 
  return (res', a1'')

{-# LINE 151 "src/Foreign/CUDA/Runtime/Marshal.chs" #-}

  where
    -- C-> Haskell doesn't like qualified imports in marshaller specifications
    alloca' !f = F.alloca $ \ !p -> poke p nullPtr >> f (castPtr p)
    dptr !p    = (castDevPtr . DevicePtr) `fmap` peek p


-- |
-- Execute a computation, passing a pointer to a temporarily allocated block of
-- memory sufficient to hold the given number of elements of storable type. The
-- memory is freed when the computation terminates (normally or via an
-- exception), so the pointer must not be used after this.
--
-- Note that kernel launches can be asynchronous, so you may need to add a
-- synchronisation point at the end of the computation.
--
{-# INLINEABLE allocaArray #-}
allocaArray :: Storable a => Int -> (DevicePtr a -> IO b) -> IO b
allocaArray n = bracket (mallocArray n) free


-- |
-- Free previously allocated memory on the device
--
{-# INLINEABLE free #-}
free :: DevicePtr a -> IO ()
free !p = nothingIfOk =<< cudaFree p

{-# INLINE cudaFree #-}
cudaFree :: (DevicePtr a) -> IO ((Status))
cudaFree a1 =
  let {a1' = dptr a1} in 
  cudaFree'_ a1' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 181 "src/Foreign/CUDA/Runtime/Marshal.chs" #-}

  where
    dptr = useDevicePtr . castDevPtr


--------------------------------------------------------------------------------
-- Unified memory allocation
--------------------------------------------------------------------------------

-- |
-- Options for unified memory allocations
--
data AttachFlag = Global
                | Host
                | Single
  deriving (Eq,Show,Bounded)
instance Enum AttachFlag where
  succ Global = Host
  succ Host = Single
  succ Single = error "AttachFlag.succ: Single has no successor"

  pred Host = Global
  pred Single = Host
  pred Global = error "AttachFlag.pred: Global has no predecessor"

  enumFromTo from to = go from
    where
      end = fromEnum to
      go v = case compare (fromEnum v) end of
                 LT -> v : go (succ v)
                 EQ -> [v]
                 GT -> []

  enumFrom from = enumFromTo from Single

  fromEnum Global = 1
  fromEnum Host = 2
  fromEnum Single = 4

  toEnum 1 = Global
  toEnum 2 = Host
  toEnum 4 = Single
  toEnum unmatched = error ("AttachFlag.toEnum: Cannot match " ++ show unmatched)

{-# LINE 199 "src/Foreign/CUDA/Runtime/Marshal.chs" #-}


-- |
-- Allocates memory that will be automatically managed by the Unified Memory
-- system
--
{-# INLINEABLE mallocManagedArray #-}
mallocManagedArray :: Storable a => [AttachFlag] -> Int -> IO (DevicePtr a)
mallocManagedArray !flags = doMalloc undefined
  where
    doMalloc :: Storable a' => a' -> Int -> IO (DevicePtr a')
    doMalloc x !n = resultIfOk =<< cudaMallocManaged (fromIntegral n * fromIntegral (sizeOf x)) flags

{-# INLINE cudaMallocManaged #-}
cudaMallocManaged :: (Int64) -> ([AttachFlag]) -> IO ((Status), (DevicePtr a))
cudaMallocManaged a2 a3 =
  alloca' $ \a1' -> 
  let {a2' = cIntConv a2} in 
  let {a3' = combineBitMasks a3} in 
  cudaMallocManaged'_ a1' a2' a3' >>= \res ->
  let {res' = cToEnum res} in
  dptr  a1'>>= \a1'' -> 
  return (res', a1'')

{-# LINE 219 "src/Foreign/CUDA/Runtime/Marshal.chs" #-}

  where
    alloca' !f = F.alloca $ \ !p -> poke p nullPtr >> f (castPtr p)
    dptr !p    = (castDevPtr . DevicePtr) `fmap` peek p


--------------------------------------------------------------------------------
-- Marshalling
--------------------------------------------------------------------------------

-- |
-- Copy a number of elements from the device to host memory. This is a
-- synchronous operation.
--
{-# INLINEABLE peekArray #-}
peekArray :: Storable a => Int -> DevicePtr a -> Ptr a -> IO ()
peekArray !n !dptr !hptr = memcpy hptr (useDevicePtr dptr) n DeviceToHost


-- |
-- Copy memory from the device asynchronously, possibly associated with a
-- particular stream. The destination memory must be page locked.
--
{-# INLINEABLE peekArrayAsync #-}
peekArrayAsync :: Storable a => Int -> DevicePtr a -> HostPtr a -> Maybe Stream -> IO ()
peekArrayAsync !n !dptr !hptr !mst =
  memcpyAsync (useHostPtr hptr) (useDevicePtr dptr) n DeviceToHost mst


-- |
-- Copy a 2D memory area from the device to the host. This is a synchronous
-- operation.
--
{-# INLINEABLE peekArray2D #-}
peekArray2D
    :: Storable a
    => Int                      -- ^ width to copy (elements)
    -> Int                      -- ^ height to copy (elements)
    -> DevicePtr a              -- ^ source array
    -> Int                      -- ^ source array width
    -> Ptr a                    -- ^ destination array
    -> Int                      -- ^ destination array width
    -> IO ()
peekArray2D !w !h !dptr !dw !hptr !hw =
  memcpy2D hptr hw (useDevicePtr dptr) dw w h DeviceToHost


-- |
-- Copy a 2D memory area from the device to the host asynchronously, possibly
-- associated with a particular stream. The destination array must be page
-- locked.
--
{-# INLINEABLE peekArray2DAsync #-}
peekArray2DAsync
    :: Storable a
    => Int                      -- ^ width to copy (elements)
    -> Int                      -- ^ height to copy (elements)
    -> DevicePtr a              -- ^ source array
    -> Int                      -- ^ source array width
    -> HostPtr a                -- ^ destination array
    -> Int                      -- ^ destination array width
    -> Maybe Stream
    -> IO ()
peekArray2DAsync !w !h !dptr !dw !hptr !hw !mst =
  memcpy2DAsync (useHostPtr hptr) hw (useDevicePtr dptr) dw w h DeviceToHost mst


-- |
-- Copy a number of elements from the device into a new Haskell list. Note that
-- this requires two memory copies: firstly from the device into a heap
-- allocated array, and from there marshalled into a list
--
{-# INLINEABLE peekListArray #-}
peekListArray :: Storable a => Int -> DevicePtr a -> IO [a]
peekListArray !n !dptr =
  F.allocaArray n $ \p -> do
    peekArray   n dptr p
    F.peekArray n p


-- |
-- Copy a number of elements onto the device. This is a synchronous operation.
--
{-# INLINEABLE pokeArray #-}
pokeArray :: Storable a => Int -> Ptr a -> DevicePtr a -> IO ()
pokeArray !n !hptr !dptr = memcpy (useDevicePtr dptr) hptr n HostToDevice


-- |
-- Copy memory onto the device asynchronously, possibly associated with a
-- particular stream. The source memory must be page-locked.
--
{-# INLINEABLE pokeArrayAsync #-}
pokeArrayAsync :: Storable a => Int -> HostPtr a -> DevicePtr a -> Maybe Stream -> IO ()
pokeArrayAsync !n !hptr !dptr !mst =
  memcpyAsync (useDevicePtr dptr) (useHostPtr hptr) n HostToDevice mst


-- |
-- Copy a 2D memory area onto the device. This is a synchronous operation.
--
{-# INLINEABLE pokeArray2D #-}
pokeArray2D
    :: Storable a
    => Int                      -- ^ width to copy (elements)
    -> Int                      -- ^ height to copy (elements)
    -> Ptr a                    -- ^ source array
    -> Int                      -- ^ source array width
    -> DevicePtr a              -- ^ destination array
    -> Int                      -- ^ destination array width
    -> IO ()
pokeArray2D !w !h !hptr !dw !dptr !hw =
  memcpy2D (useDevicePtr dptr) dw hptr hw w h HostToDevice


-- |
-- Copy a 2D memory area onto the device asynchronously, possibly associated
-- with a particular stream. The source array must be page locked.
--
{-# INLINEABLE pokeArray2DAsync #-}
pokeArray2DAsync
    :: Storable a
    => Int                      -- ^ width to copy (elements)
    -> Int                      -- ^ height to copy (elements)
    -> HostPtr a                -- ^ source array
    -> Int                      -- ^ source array width
    -> DevicePtr a              -- ^ destination array
    -> Int                      -- ^ destination array width
    -> Maybe Stream
    -> IO ()
pokeArray2DAsync !w !h !hptr !hw !dptr !dw !mst =
  memcpy2DAsync (useDevicePtr dptr) dw (useHostPtr hptr) hw w h HostToDevice mst

-- |
-- Write a list of storable elements into a device array. The array must be
-- sufficiently large to hold the entire list. This requires two marshalling
-- operations
--
{-# INLINEABLE pokeListArray #-}
pokeListArray :: Storable a => [a] -> DevicePtr a -> IO ()
pokeListArray !xs !dptr = F.withArrayLen xs $ \len p -> pokeArray len p dptr


-- |
-- Copy the given number of elements from the first device array (source) to the
-- second (destination). The copied areas may not overlap. This operation is
-- asynchronous with respect to host, but will not overlap other device
-- operations.
--
{-# INLINEABLE copyArray #-}
copyArray :: Storable a => Int -> DevicePtr a -> DevicePtr a -> IO ()
copyArray !n !src !dst = memcpy (useDevicePtr dst) (useDevicePtr src) n DeviceToDevice


-- |
-- Copy the given number of elements from the first device array (source) to the
-- second (destination). The copied areas may not overlap. This operation is
-- asynchronous with respect to the host, and may be associated with a
-- particular stream.
--
{-# INLINEABLE copyArrayAsync #-}
copyArrayAsync :: Storable a => Int -> DevicePtr a -> DevicePtr a -> Maybe Stream -> IO ()
copyArrayAsync !n !src !dst !mst =
  memcpyAsync (useDevicePtr dst) (useDevicePtr src) n DeviceToDevice mst


-- |
-- Copy a 2D memory area from the first device array (source) to the second
-- (destination). The copied areas may not overlap. This operation is
-- asynchronous with respect to the host, but will not overlap other device
-- operations.
--
{-# INLINEABLE copyArray2D #-}
copyArray2D
    :: Storable a
    => Int                      -- ^ width to copy (elements)
    -> Int                      -- ^ height to copy (elements)
    -> DevicePtr a              -- ^ source array
    -> Int                      -- ^ source array width
    -> DevicePtr a              -- ^ destination array
    -> Int                      -- ^ destination array width
    -> IO ()
copyArray2D !w !h !src !sw !dst !dw =
  memcpy2D (useDevicePtr dst) dw (useDevicePtr src) sw w h DeviceToDevice


-- |
-- Copy a 2D memory area from the first device array (source) to the second
-- device array (destination). The copied areas may not overlay. This operation
-- is asynchronous with respect to the host, and may be associated with a
-- particular stream.
--
{-# INLINEABLE copyArray2DAsync #-}
copyArray2DAsync
    :: Storable a
    => Int                      -- ^ width to copy (elements)
    -> Int                      -- ^ height to copy (elements)
    -> DevicePtr a              -- ^ source array
    -> Int                      -- ^ source array width
    -> DevicePtr a              -- ^ destination array
    -> Int                      -- ^ destination array width
    -> Maybe Stream
    -> IO ()
copyArray2DAsync !w !h !src !sw !dst !dw !mst =
  memcpy2DAsync (useDevicePtr dst) dw (useDevicePtr src) sw w h DeviceToDevice mst


--
-- Memory copy kind
--
data CopyDirection = HostToHost
                   | HostToDevice
                   | DeviceToHost
                   | DeviceToDevice
                   | Default
  deriving (Eq,Show)
instance Enum CopyDirection where
  succ HostToHost = HostToDevice
  succ HostToDevice = DeviceToHost
  succ DeviceToHost = DeviceToDevice
  succ DeviceToDevice = Default
  succ Default = error "CopyDirection.succ: Default has no successor"

  pred HostToDevice = HostToHost
  pred DeviceToHost = HostToDevice
  pred DeviceToDevice = DeviceToHost
  pred Default = DeviceToDevice
  pred HostToHost = error "CopyDirection.pred: HostToHost has no predecessor"

  enumFromTo from to = go from
    where
      end = fromEnum to
      go v = case compare (fromEnum v) end of
                 LT -> v : go (succ v)
                 EQ -> [v]
                 GT -> []

  enumFrom from = enumFromTo from Default

  fromEnum HostToHost = 0
  fromEnum HostToDevice = 1
  fromEnum DeviceToHost = 2
  fromEnum DeviceToDevice = 3
  fromEnum Default = 4

  toEnum 0 = HostToHost
  toEnum 1 = HostToDevice
  toEnum 2 = DeviceToHost
  toEnum 3 = DeviceToDevice
  toEnum 4 = Default
  toEnum unmatched = error ("CopyDirection.toEnum: Cannot match " ++ show unmatched)

{-# LINE 431 "src/Foreign/CUDA/Runtime/Marshal.chs" #-}


-- |
-- Copy data between host and device. This is a synchronous operation.
--
{-# INLINEABLE memcpy #-}
memcpy :: Storable a
       => Ptr a                 -- ^ destination
       -> Ptr a                 -- ^ source
       -> Int                   -- ^ number of elements
       -> CopyDirection
       -> IO ()
memcpy !dst !src !n !dir = doMemcpy undefined dst
  where
    doMemcpy :: Storable a' => a' -> Ptr a' -> IO ()
    doMemcpy x _ =
      nothingIfOk =<< cudaMemcpy dst src (fromIntegral n * fromIntegral (sizeOf x)) dir

{-# INLINE cudaMemcpy #-}
cudaMemcpy :: (Ptr a) -> (Ptr a) -> (Int64) -> (CopyDirection) -> IO ((Status))
cudaMemcpy a1 a2 a3 a4 =
  let {a1' = castPtr a1} in 
  let {a2' = castPtr a2} in 
  let {a3' = cIntConv a3} in 
  let {a4' = cFromEnum a4} in 
  cudaMemcpy'_ a1' a2' a3' a4' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 454 "src/Foreign/CUDA/Runtime/Marshal.chs" #-}



-- |
-- Copy data between the host and device asynchronously, possibly associated
-- with a particular stream. The host-side memory must be page-locked (allocated
-- with 'mallocHostArray').
--
{-# INLINEABLE memcpyAsync #-}
memcpyAsync :: Storable a
            => Ptr a            -- ^ destination
            -> Ptr a            -- ^ source
            -> Int              -- ^ number of elements
            -> CopyDirection
            -> Maybe Stream
            -> IO ()
memcpyAsync !dst !src !n !kind !mst = doMemcpy undefined dst
  where
    doMemcpy :: Storable a' => a' -> Ptr a' -> IO ()
    doMemcpy x _ =
      let bytes = fromIntegral n * fromIntegral (sizeOf x) in
      nothingIfOk =<< cudaMemcpyAsync dst src bytes kind (fromMaybe defaultStream mst)

{-# INLINE cudaMemcpyAsync #-}
cudaMemcpyAsync :: (Ptr a) -> (Ptr a) -> (Int64) -> (CopyDirection) -> (Stream) -> IO ((Status))
cudaMemcpyAsync a1 a2 a3 a4 a5 =
  let {a1' = castPtr a1} in 
  let {a2' = castPtr a2} in 
  let {a3' = cIntConv a3} in 
  let {a4' = cFromEnum a4} in 
  let {a5' = useStream a5} in 
  cudaMemcpyAsync'_ a1' a2' a3' a4' a5' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 483 "src/Foreign/CUDA/Runtime/Marshal.chs" #-}



-- |
-- Copy a 2D memory area between the host and device. This is a synchronous
-- operation.
--
{-# INLINEABLE memcpy2D #-}
memcpy2D :: Storable a
         => Ptr a               -- ^ destination
         -> Int                 -- ^ width of destination array
         -> Ptr a               -- ^ source
         -> Int                 -- ^ width of source array
         -> Int                 -- ^ width to copy
         -> Int                 -- ^ height to copy
         -> CopyDirection
         -> IO ()
memcpy2D !dst !dw !src !sw !w !h !kind = doCopy undefined dst
  where
    doCopy :: Storable a' => a' -> Ptr a' -> IO ()
    doCopy x _ =
      let bytes = fromIntegral (sizeOf x)
          dw'   = fromIntegral dw * bytes
          sw'   = fromIntegral sw * bytes
          w'    = fromIntegral w  * bytes
          h'    = fromIntegral h
      in
      nothingIfOk =<< cudaMemcpy2D dst dw' src sw' w' h' kind

{-# INLINE cudaMemcpy2D #-}
cudaMemcpy2D :: (Ptr a) -> (Int64) -> (Ptr a) -> (Int64) -> (Int64) -> (Int64) -> (CopyDirection) -> IO ((Status))
cudaMemcpy2D a1 a2 a3 a4 a5 a6 a7 =
  let {a1' = castPtr a1} in 
  let {a2' = fromIntegral a2} in 
  let {a3' = castPtr a3} in 
  let {a4' = fromIntegral a4} in 
  let {a5' = fromIntegral a5} in 
  let {a6' = fromIntegral a6} in 
  let {a7' = cFromEnum a7} in 
  cudaMemcpy2D'_ a1' a2' a3' a4' a5' a6' a7' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 522 "src/Foreign/CUDA/Runtime/Marshal.chs" #-}



-- |
-- Copy a 2D memory area between the host and device asynchronously, possibly
-- associated with a particular stream. The host-side memory must be
-- page-locked.
--
{-# INLINEABLE memcpy2DAsync #-}
memcpy2DAsync :: Storable a
              => Ptr a          -- ^ destination
              -> Int            -- ^ width of destination array
              -> Ptr a          -- ^ source
              -> Int            -- ^ width of source array
              -> Int            -- ^ width to copy
              -> Int            -- ^ height to copy
              -> CopyDirection
              -> Maybe Stream
              -> IO ()
memcpy2DAsync !dst !dw !src !sw !w !h !kind !mst = doCopy undefined dst
  where
    doCopy :: Storable a' => a' -> Ptr a' -> IO ()
    doCopy x _ =
      let bytes = fromIntegral (sizeOf x)
          dw'   = fromIntegral dw * bytes
          sw'   = fromIntegral sw * bytes
          w'    = fromIntegral w  * bytes
          h'    = fromIntegral h
          st    = fromMaybe defaultStream mst
      in
      nothingIfOk =<< cudaMemcpy2DAsync dst dw' src sw' w' h' kind st

{-# INLINE cudaMemcpy2DAsync #-}
cudaMemcpy2DAsync :: (Ptr a) -> (Int64) -> (Ptr a) -> (Int64) -> (Int64) -> (Int64) -> (CopyDirection) -> (Stream) -> IO ((Status))
cudaMemcpy2DAsync a1 a2 a3 a4 a5 a6 a7 a8 =
  let {a1' = castPtr a1} in 
  let {a2' = fromIntegral a2} in 
  let {a3' = castPtr a3} in 
  let {a4' = fromIntegral a4} in 
  let {a5' = fromIntegral a5} in 
  let {a6' = fromIntegral a6} in 
  let {a7' = cFromEnum a7} in 
  let {a8' = useStream a8} in 
  cudaMemcpy2DAsync'_ a1' a2' a3' a4' a5' a6' a7' a8' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 565 "src/Foreign/CUDA/Runtime/Marshal.chs" #-}



--------------------------------------------------------------------------------
-- Combined Allocation and Marshalling
--------------------------------------------------------------------------------

-- |
-- Write a list of storable elements into a newly allocated device array,
-- returning the device pointer together with the number of elements that were
-- written. Note that this requires two copy operations: firstly from a Haskell
-- list into a heap-allocated array, and from there into device memory. The
-- array should be 'free'd when no longer required.
--
{-# INLINEABLE newListArrayLen #-}
newListArrayLen :: Storable a => [a] -> IO (DevicePtr a, Int)
newListArrayLen !xs =
  F.withArrayLen xs                     $ \len p ->
  bracketOnError (mallocArray len) free $ \d_xs  -> do
    pokeArray len p d_xs
    return (d_xs, len)


-- |
-- Write a list of storable elements into a newly allocated device array. This
-- is 'newListArrayLen' composed with 'fst'.
--
{-# INLINEABLE newListArray #-}
newListArray :: Storable a => [a] -> IO (DevicePtr a)
newListArray !xs = fst `fmap` newListArrayLen xs


-- |
-- Temporarily store a list of elements into a newly allocated device array. An
-- IO action is applied to the array, the result of which is returned. Similar
-- to 'newListArray', this requires two marshalling operations of the data.
--
-- As with 'allocaArray', the memory is freed once the action completes, so you
-- should not return the pointer from the action, and be sure that any
-- asynchronous operations (such as kernel execution) have completed.
--
{-# INLINEABLE withListArray #-}
withListArray :: Storable a => [a] -> (DevicePtr a -> IO b) -> IO b
withListArray !xs = withListArrayLen xs . const


-- |
-- A variant of 'withListArray' which also supplies the number of elements in
-- the array to the applied function
--
{-# INLINEABLE withListArrayLen #-}
withListArrayLen :: Storable a => [a] -> (Int -> DevicePtr a -> IO b) -> IO b
withListArrayLen !xs !f =
  bracket (newListArrayLen xs) (free . fst) (uncurry . flip $ f)
--
-- XXX: Will this attempt to double-free the device array on error (together
-- with newListArrayLen)?
--


--------------------------------------------------------------------------------
-- Utility
--------------------------------------------------------------------------------

-- |
-- Initialise device memory to a given 8-bit value
--
{-# INLINEABLE memset #-}
memset :: DevicePtr a                   -- ^ The device memory
       -> Int64                         -- ^ Number of bytes
       -> Int8                          -- ^ Value to set for each byte
       -> IO ()
memset !dptr !bytes !symbol = nothingIfOk =<< cudaMemset dptr symbol bytes

{-# INLINE cudaMemset #-}
cudaMemset :: (DevicePtr a) -> (Int8) -> (Int64) -> IO ((Status))
cudaMemset a1 a2 a3 =
  let {a1' = dptr a1} in 
  let {a2' = cIntConv a2} in 
  let {a3' = cIntConv a3} in 
  cudaMemset'_ a1' a2' a3' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 643 "src/Foreign/CUDA/Runtime/Marshal.chs" #-}

  where
    dptr = useDevicePtr . castDevPtr


foreign import ccall unsafe "Foreign/CUDA/Runtime/Marshal.chs.h cudaHostAlloc"
  cudaHostAlloc'_ :: ((C2HSImp.Ptr (C2HSImp.Ptr ())) -> (C2HSImp.CULong -> (C2HSImp.CUInt -> (IO C2HSImp.CInt))))

foreign import ccall unsafe "Foreign/CUDA/Runtime/Marshal.chs.h cudaFreeHost"
  cudaFreeHost'_ :: ((C2HSImp.Ptr ()) -> (IO C2HSImp.CInt))

foreign import ccall unsafe "Foreign/CUDA/Runtime/Marshal.chs.h cudaMalloc"
  cudaMalloc'_ :: ((C2HSImp.Ptr (C2HSImp.Ptr ())) -> (C2HSImp.CULong -> (IO C2HSImp.CInt)))

foreign import ccall unsafe "Foreign/CUDA/Runtime/Marshal.chs.h cudaFree"
  cudaFree'_ :: ((C2HSImp.Ptr ()) -> (IO C2HSImp.CInt))

foreign import ccall unsafe "Foreign/CUDA/Runtime/Marshal.chs.h cudaMallocManaged"
  cudaMallocManaged'_ :: ((C2HSImp.Ptr (C2HSImp.Ptr ())) -> (C2HSImp.CULong -> (C2HSImp.CUInt -> (IO C2HSImp.CInt))))

foreign import ccall safe "Foreign/CUDA/Runtime/Marshal.chs.h cudaMemcpy"
  cudaMemcpy'_ :: ((C2HSImp.Ptr ()) -> ((C2HSImp.Ptr ()) -> (C2HSImp.CULong -> (C2HSImp.CInt -> (IO C2HSImp.CInt)))))

foreign import ccall safe "Foreign/CUDA/Runtime/Marshal.chs.h cudaMemcpyAsync"
  cudaMemcpyAsync'_ :: ((C2HSImp.Ptr ()) -> ((C2HSImp.Ptr ()) -> (C2HSImp.CULong -> (C2HSImp.CInt -> ((C2HSImp.Ptr ()) -> (IO C2HSImp.CInt))))))

foreign import ccall safe "Foreign/CUDA/Runtime/Marshal.chs.h cudaMemcpy2D"
  cudaMemcpy2D'_ :: ((C2HSImp.Ptr ()) -> (C2HSImp.CULong -> ((C2HSImp.Ptr ()) -> (C2HSImp.CULong -> (C2HSImp.CULong -> (C2HSImp.CULong -> (C2HSImp.CInt -> (IO C2HSImp.CInt))))))))

foreign import ccall safe "Foreign/CUDA/Runtime/Marshal.chs.h cudaMemcpy2DAsync"
  cudaMemcpy2DAsync'_ :: ((C2HSImp.Ptr ()) -> (C2HSImp.CULong -> ((C2HSImp.Ptr ()) -> (C2HSImp.CULong -> (C2HSImp.CULong -> (C2HSImp.CULong -> (C2HSImp.CInt -> ((C2HSImp.Ptr ()) -> (IO C2HSImp.CInt)))))))))

foreign import ccall unsafe "Foreign/CUDA/Runtime/Marshal.chs.h cudaMemset"
  cudaMemset'_ :: ((C2HSImp.Ptr ()) -> (C2HSImp.CInt -> (C2HSImp.CULong -> (IO C2HSImp.CInt))))