{-# LANGUAGE CPP #-}
-- |
-- Module      : Data.Array.Accelerate.LLVM.Native.State
-- Copyright   : [2014..2020] The Accelerate Team
-- License     : BSD3
--
-- Maintainer  : Trevor L. McDonell <trevor.mcdonell@gmail.com>
-- Stability   : experimental
-- Portability : non-portable (GHC extensions)
--

module Data.Array.Accelerate.LLVM.Native.State (

  evalNative,
  createTarget, defaultTarget,

) where

-- accelerate
import Data.Array.Accelerate.LLVM.State
import Data.Array.Accelerate.LLVM.Native.Target
import Data.Array.Accelerate.LLVM.Native.Execute.Scheduler
import qualified Data.Array.Accelerate.LLVM.Native.Link.Cache   as LC
import qualified Data.Array.Accelerate.LLVM.Native.Debug        as Debug

-- library
import Data.Maybe
import System.Environment
import System.IO.Unsafe
import Text.Printf
import Text.Read
import Prelude                                                  as P

import GHC.Conc


-- | Execute a computation in the Native backend
--
evalNative :: Native -> LLVM Native a -> IO a
evalNative = evalLLVM

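-- A minimal usage sketch (assuming only the 'Monad' instance of @LLVM Native@
-- provided by "Data.Array.Accelerate.LLVM.State"): the simplest computation
-- that can be run on the default target is a plain 'return'. Real callers
-- execute compiled Accelerate operations inside the 'LLVM' monad instead.
--
-- > evalNative defaultTarget (return ()) :: IO ()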

-- | Create a Native execution target by spawning a worker thread on each of the
-- given capabilities.
--
createTarget
    :: [Int]              -- ^ CPUs to launch worker threads on
    -> IO Native
createTarget cpus = do
  gang    <- hireWorkersOn cpus
  linker  <- LC.new
  return  $! Native linker gang

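-- A usage sketch for custom targets: spawn workers pinned to the first four
-- capabilities and run a (trivial) computation on the resulting target. The
-- capability numbers are illustrative and should stay below the value
-- reported by 'getNumCapabilities'.
--
-- > do target <- createTarget [0..3]
-- >    evalNative target (return ())
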
{--
-- | The strategy for balancing work amongst the available worker threads.
--
type Strategy = Gang -> Executable


-- | Execute an operation sequentially on a single thread
--
sequentialIO :: Strategy
sequentialIO gang =
  Executable $ \name _ppt range fill ->
    timed name $ runSeqIO gang range fill


-- | Execute a computation without load balancing. Each thread computes an
-- equally sized chunk of the input. No work stealing occurs.
--
unbalancedParIO :: Strategy
unbalancedParIO gang =
  Executable $ \name _ppt range fill ->
    timed name $ runParIO Single.mkResource gang range fill


-- | Execute a computation where threads use work stealing (based on lazy
-- splitting of work stealing queues and exponential backoff) in order to
-- automatically balance the workload amongst themselves.
--
balancedParIO
    :: Int                -- ^ number of steal attempts before backing off
    -> Strategy
balancedParIO retries gang =
  Executable $ \name ppt range fill ->
    -- TLM: A suitable PPT should be chosen when invoking the continuation in
    --      order to balance scheduler overhead with fine-grained function calls
    --
    let resource = LBS.mkResource ppt (SMP.mkResource retries <> Backoff.mkResource)
    in  timed name $ runParIO resource gang range fill
--}


-- Top-level mutable state
-- -----------------------
--
-- It is important to keep some information alive for the entire run of the
-- program, not just a single execution. These tokens use 'unsafePerformIO' to
-- ensure they are executed only once, and reused for subsequent invocations.
--

-- | Initialise the gang of threads that will be used to execute computations.
-- This spawns one worker for each available processor, or as specified by the
-- value of the environment variable @ACCELERATE_LLVM_NATIVE_THREADS@.
--
-- This globally shared thread gang is auto-initialised on startup and shared by
-- all computations (unless the user chooses to 'run' with a different gang).
--
-- It does not help to have multiple gangs running at the same time, as then the
-- system as a whole may run slower as the threads contend for cache. The
-- scheduler is able to execute operations from multiple sources concurrently,
-- so multiple gangs should not be necessary.
--
{-# NOINLINE defaultTarget #-}
defaultTarget :: Native
defaultTarget = unsafePerformIO $ do
  nproc <- getNumProcessors
  ncaps <- getNumCapabilities
  menv  <- (readMaybe =<<) <$> lookupEnv "ACCELERATE_LLVM_NATIVE_THREADS"

  let nthreads = fromMaybe nproc menv

  -- Update the number of capabilities, but never set it lower than it already
  -- is. This target will spawn a worker on each processor (as returned by
  -- 'getNumProcessors', which includes SMT (hyperthreading) cores), but the
  -- user may have requested more capabilities than this to handle, for example,
  -- concurrent output.
  --
  setNumCapabilities (max ncaps nthreads)

  Debug.traceIO Debug.dump_gc (printf "gc: initialise native target with %d worker threads" nthreads)
  createTarget [0 .. nthreads-1]

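-- A sketch of the fallback behaviour above: the environment variable is parsed
-- with 'readMaybe', so an unset or non-numeric value leaves the processor
-- count in effect (the literal 8 stands in for 'nproc').
--
-- > fromMaybe 8 (readMaybe "4"   :: Maybe Int)  -- 4
-- > fromMaybe 8 (readMaybe "n/a" :: Maybe Int)  -- 8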

{--
-- Debugging
-- ---------

{-# INLINE timed #-}
timed :: ShortByteString -> IO a -> IO a
timed name f = Debug.timed Debug.dump_exec (elapsed name) f

{-# INLINE elapsed #-}
elapsed :: ShortByteString -> Double -> Double -> String
elapsed name x y = printf "exec: %s %s" (unpack name) (Debug.elapsedP x y)
--}