{- Joel Svensson 2012,2013

   Notes:
   2013-01-08: removed number-of-blocks field from ForAllBlocks

-}

{-# LANGUAGE GADTs,
             FlexibleInstances,
             EmptyDataDecls #-} 


module Obsidian.Program  where 
 
import Data.Word
import Data.Monoid

import Obsidian.Exp
import Obsidian.Types
import Obsidian.Globs
import Obsidian.Atomic
import Obsidian.Names

-- Package value-supply
import Data.Supply
import System.IO.Unsafe

---------------------------------------------------------------------------
-- Thread/Block/Grid 
---------------------------------------------------------------------------
data Thread
data Block
data Grid 

data PT a where
  Thread :: PT Thread
  Block  :: PT Block
  Grid   :: PT Grid 

type Identifier = Int 

---------------------------------------------------------------------------
-- Obsidian
---------------------------------------------------------------------------
data Obsidian a = Obsidian (Program Grid a)
      

---------------------------------------------------------------------------
-- Program datatype
--------------------------------------------------------------------------
data Program t a where
  
  Identifier :: Program t Identifier 
  
  Assign :: Scalar a
            => Name
            -> [Exp Word32]
            -> (Exp a)
            -> Program Thread ()
           
            
  AtomicOp :: Scalar a
              => Name 
              -> Exp Word32
              -> Atomic a
              -> Program Thread (Exp a)

  Cond :: Exp Bool
          -> Program t ()
          -> Program t ()
  
  -- DONE: Code generation for this.
  -- TODO: Generalize this loop! (Replace Thread with t) 
  SeqFor :: Exp Word32 -> (Exp Word32 -> Program t a)
            -> Program t a
 
  ForAll :: (Exp Word32) 
            -> (Exp Word32 -> Program Thread a)
            -> Program Block a 

  {-
     I'm not sure about this constructor.
     As I see it programs from which we generate a kernel
     must be wrapped in one of these ForAllBlocks.
     Programs with sequences of 'ForAllBlocks' are problematic.

     Maybe a (ForAllBlocks n f *>* ForAllBlocks m g) Program
     should be split into two kernels. 
  -} 
  ForAllBlocks :: (Exp Word32) -> (Exp Word32 -> Program Block a) 
                  -> Program Grid a

  ForAllThreads :: (Exp Word32) -> (Exp Word32 -> Program Thread a)
                   -> Program Grid a 

  -- Allocate shared memory in each MP

  -- TODO: Change the Liveness analysis to a two-pass algo
  --       and remove the Allocate constructor. 
  Allocate :: Name -> Word32 -> Type -> Program t () 

  -- Automatic Variables
  Declare :: Name -> Type -> Program t () 
              
  {- About Output (Creates a named output array). 
     This is similar to Allocate but concerning global arrays.

     Since we cannot synchronize writes to a global array inside of an
     kernel, global arrays will only be written as outputs of the kernel
  -} 
  Output   :: Type -> Program Grid Name
  -- (Output may be replaced by AllocateG) 
  
  Sync     :: Program Block ()
  -- Two very experimental threadfence constructs.
  -- should correspond to cuda __threadfence();
  -- and __threadfenceBlock(); 
  ThreadFence :: Program Grid ()
  ThreadFenceBlock :: Program Block () 

  -- Parallel composition of Programs
  -- TODO: Will I use this ? 
  Par :: Program p () ->
         Program p () ->
         Program p () 

  -- Monad
  Return :: a -> Program t a
  Bind   :: Program t a -> (a -> Program t b) -> Program t b

---------------------------------------------------------------------------
-- Helpers 
--------------------------------------------------------------------------- 
uniqueSM = do
  id <- Identifier
  return $ "arr" ++ show id 

uniqueNamed pre = do
  id <- Identifier
  return $ pre ++ show id 

---------------------------------------------------------------------------
-- forAll and forAllN
---------------------------------------------------------------------------
--forAll :: (Exp Word32 -> Program Thread ()) -> Program Block () 
--forAll f = ForAll Nothing f

forAll :: Exp Word32 -> (Exp Word32 -> Program Thread a) -> Program Block a
forAll n f = ForAll n f

(*||*) = Par

---------------------------------------------------------------------------
-- SeqFor
---------------------------------------------------------------------------
seqFor :: Exp Word32 -> (Exp Word32 -> Program t a)
            -> Program t a
seqFor (Literal 1) f = f 0
seqFor n f = SeqFor n f


---------------------------------------------------------------------------
-- forAllT
--------------------------------------------------------------------------- 
-- When we know that all threads are independent and
-- independent of "blocksize".
-- Also any allocation of local storage is impossible.
-- Composition of something using forAllT with something
-- that performs local computations is impossible.
-- Using the hardcoded BlockDim may turn out to be a problem when
-- we want to compute more than one thing per thread (may be fine though). 
forAllT :: (Exp Word32) -> (Exp Word32 -> Program Thread a)
           -> Program Grid a
forAllT n f = ForAllThreads n 
            $ \gtid -> f gtid 


forAllBlocks = ForAllBlocks

---------------------------------------------------------------------------
-- Monad
--------------------------------------------------------------------------
instance Monad (Program t) where
  return = Return
  (>>=) = Bind

---------------------------------------------------------------------------
-- Aliases 
---------------------------------------------------------------------------
type TProgram = Program Thread
type BProgram = Program Block
type GProgram = Program Grid 

---------------------------------------------------------------------------
-- runPrg (fix types here, Integer!)
---------------------------------------------------------------------------
runPrg :: Int -> Program t a -> (a,Int)
runPrg i Identifier = (i,i+1)
runPrg i (Return a) = (a,i)
runPrg i (Bind m f) =
  let (a,i') = runPrg i m
  in runPrg i' (f a) 
runPrg i (Sync) = ((),i)
runPrg i (ForAll n ixf) =
  let (p,i') = runPrg i (ixf (variable "tid")) 
  in  (p,i') 
runPrg i (Allocate id _ _ ) = ((),i)
runPrg i (Assign _ _ a) = ((),i) -- Probaby wrong.. 
runPrg i (AtomicOp _ _ _) = (variable ("new"++show i),i+1)
     
---------------------------------------------------------------------------
-- printPrg
---------------------------------------------------------------------------
printPrg prg = (\(_,x,_) -> x) $ printPrg' 0 prg

printPrg' :: Int -> Program t a -> (a,String,Int)
printPrg' i Identifier = (i,"getId;\n",i+1) 
-- printPrg' i Skip = ((),";\n", i)
printPrg' i (Assign n ix e) =
  ((),n ++ "[" ++ show ix ++ "] = " ++ show e ++ ";\n", i) 
printPrg' i (AtomicOp n ix e) =
  let newname = "r" ++ show i
  in (variable newname,
      newname ++ " = " ++ printAtomic e ++
      "( " ++ n ++ "[" ++ show ix ++ "])\n",i+1)
printPrg' i (Allocate id n t) =
  let newname = id -- "arr" ++ show id
  in ((),newname ++ " = malloc(" ++ show n ++ ");\n",i+1)
printPrg' i (Declare id t) =
  let newname = id -- "arr" ++ show id
  in ((),show t ++ " " ++ newname ++ "\n",i+1)
printPrg' i (Output t) =
  let newname = "globalOut" ++ show i
  in (newname,newname ++ " = new Global output;\n",i+1)
printPrg' i (SeqFor n f) =
  let (a,prg2,i') = printPrg' i (f (variable "i"))
      
  in ( a,  
       "for (i in 0.." ++ show n ++ ")" ++
       "{\n" ++ prg2 ++ "\n}",
       i')
     
printPrg' i (ForAll n f) =
  let (a,prg2,i') = printPrg' i (f (variable "i"))
      
  in ( a,  
       "par (i in 0.." ++ show n ++ ")" ++
       "{\n" ++ prg2 ++ "\n}",
       i')
printPrg' i (ForAllBlocks n f) =
  let (d,prg2,i') = printPrg' i (f (variable "BIX"))
  in (d, 
      "blocks (i)" ++
      "{\n" ++ prg2 ++ "\n}",
      i')
printPrg' i (Return a) = (a,"MonadReturn;\n",i)
printPrg' i (Bind m f) =
  let (a1, str1,i1) = printPrg' i m
      (a2,str2,i2) = printPrg' i1 (f a1)
  in (a2,str1 ++ str2, i2)
printPrg' i Sync = ((),"Sync;\n",i)