{-|
Module      : ETL
Description : Implements ETL operations over RTables.
Copyright   : (c) Nikos Karagiannidis, 2018
                  
License     : BSD3
Maintainer  : nkarag@gmail.com
Stability   : stable
Portability : POSIX

This is an internal module (i.e., not to be imported directly) that implements the core ETL functionality 
that is exposed via the __Julius__ EDSL for ETL/ELT found in the "Etl.Julius" module)
-}

{-# LANGUAGE OverloadedStrings #-}
-- :set -XOverloadedStrings

--{-# LANGUAGE OverloadedRecordFields #-}

--{-# LANGUAGE  DuplicateRecordFields #-}


module Etl.Internal.Core
    (
        -- * Basic Data Types

        RColMapping (..)
        ,ColXForm
        ,createColMapping        
        ,ETLOperation (..)        
        ,ETLMapping (..)
        ,YesNo (..)
        -- * Execution of an ETL Mapping

        ,runCM
        ,etlOpU
        ,etlOpB
        ,etl
        ,etlRes
        -- * Functions for \"Building\" an ETL Mapping

        ,rtabToETLMapping
        ,createLeafETLMapLD
        ,createLeafBinETLMapLD
        ,connectETLMapLD
        -- * Various ETL Operations

        ,
    ) where 

-- Data.RTable

import RTable.Core

-- Text

import Data.Text as T

-- HashMap                          -- https://hackage.haskell.org/package/unordered-containers-0.2.7.2/docs/Data-HashMap-Strict.html

import Data.HashMap.Strict as HM

-- Data.List

import Data.List (notElem, map, zip)

-- Data.Vector

import Data.Vector as V

data YesNo = Yes | No deriving (YesNo -> YesNo -> Bool
(YesNo -> YesNo -> Bool) -> (YesNo -> YesNo -> Bool) -> Eq YesNo
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: YesNo -> YesNo -> Bool
$c/= :: YesNo -> YesNo -> Bool
== :: YesNo -> YesNo -> Bool
$c== :: YesNo -> YesNo -> Bool
Eq, Int -> YesNo -> ShowS
[YesNo] -> ShowS
YesNo -> String
(Int -> YesNo -> ShowS)
-> (YesNo -> String) -> ([YesNo] -> ShowS) -> Show YesNo
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [YesNo] -> ShowS
$cshowList :: [YesNo] -> ShowS
show :: YesNo -> String
$cshow :: YesNo -> String
showsPrec :: Int -> YesNo -> ShowS
$cshowsPrec :: Int -> YesNo -> ShowS
Show)

-- | This is the basic data type to define the column-to-column mapping from a source 'RTable' to a target 'RTable'.

--   Essentially, an 'RColMapping' represents the column-level transformations of an 'RTuple' that will yield a target 'RTuple'. 

--

--   A mapping is simply a triple of the form ( Source-Column(s), Target-Column(s), Transformation, RTuple-Filter), where we define the source columns

--   over which a transformation (i.e. a function) will be applied in order to yield the target columns. Also, an 'RPredicate' (i.e. a filter) might be applied on the source 'RTuple'.

--   Remember that an 'RTuple' is essentially a mapping between a key (the Column Name) and a value (the 'RDataType' value). So the various 'RColMapping'

--   data constructors below simply describe the possible modifications of an 'RTuple' orginating from its own columns.

--

--   So, we can have the following mapping types:

--          a) single-source column to single-target column mapping (1 to 1), 

--                  the source column will be removed or not based on the 'removeSrcCol' flag (dublicate column names are not allowed in an 'RTuple')

--          b) multiple-source columns to single-target column mapping (N to 1),

--                  The N columns will be merged  to the single target column based on the transformation.

--                  The N columns will be removed from the RTuple or not based on the 'removeSrcCol' flag (dublicate column names are not allowed in an 'RTuple')

--          c) single-source column to multiple-target columns mapping  (1 to M)

--                  the source column will be "expanded" to M target columns based ont he transformation.

--                  the source column will be removed or not based on the 'removeSrcCol' flag (dublicate column names are not allowed in an 'RTuple')                  

--          d) multiple-source column to multiple target columns mapping (N to M)

--                  The N source columns will be mapped to M target columns based on the transformation.

--                  The N columns will be removed from the RTuple or not based on the 'removeSrcCol' flag (dublicate column names are not allow in an 'RTuple')

--

--   Some examples of mapping are the following:

--

--   @

--      ("Start_Date", No, "StartDate", \t -> True)  --  copy the source value to target and dont remove the source column, so the target RTuple will have both columns "Start_Date" and "StartDate"

--                                           --  with the exactly the same value)

--

--      (["Amount", "Discount"], Yes, "FinalAmount", (\[a, d] -> a * d) ) -- "FinalAmount" is a derived column based on a function applied to the two source columns. 

--                                                                        --  In the final RTuple we remove the two source columns.

--   @

--

--  An 'RColMapping' can be applied with the 'runCM' (runColMapping) operator

--

data RColMapping = 
        ColMapEmpty
    |   RMap1x1 { RColMapping -> ColumnName
srcCol :: ColumnName,         RColMapping -> YesNo
removeSrcCol :: YesNo,  RColMapping -> ColumnName
trgCol :: ColumnName,       RColMapping -> RDataType -> RDataType
transform1x1 :: RDataType    -> RDataType,   RColMapping -> RPredicate
srcRTupleFilter:: RPredicate   }   -- ^ single-source column to single-target column mapping (1 to 1).

    |   RMapNx1 { RColMapping -> [ColumnName]
srcColGrp :: [ColumnName],    removeSrcCol :: YesNo,  trgCol :: ColumnName,       RColMapping -> [RDataType] -> RDataType
transformNx1 :: [RDataType]  -> RDataType,   srcRTupleFilter:: RPredicate   }      -- ^ multiple-source columns to single-target column mapping (N to 1)

    |   RMap1xN { srcCol :: ColumnName,         removeSrcCol :: YesNo,  RColMapping -> [ColumnName]
trgColGrp :: [ColumnName],  RColMapping -> RDataType -> [RDataType]
transform1xN :: RDataType    -> [RDataType], srcRTupleFilter:: RPredicate }    -- ^ single-source column to multiple-target columns mapping (1 to N)

    |   RMapNxM { srcColGrp :: [ColumnName],    removeSrcCol :: YesNo,  trgColGrp :: [ColumnName],  RColMapping -> [RDataType] -> [RDataType]
transformNxM :: [RDataType]  -> [RDataType], srcRTupleFilter:: RPredicate }    -- ^ multiple-source column to multiple target columns mapping (N to M)                                                 


-- | A Column Transformation function data type.

-- It is used in order to define an arbitrary column-level transformation (i.e., from a list of N input Column-Values we produce a list of M derived (output) Column-Values).

-- A Column value is represented with the 'RDataType'.

type ColXForm = [RDataType]  -> [RDataType]

-- | Constructs an RColMapping.

-- This is the suggested method for creating a column mapping and not by calling the data constructors directly.

createColMapping :: 
       [ColumnName]  -- ^ List of source column names

    -> [ColumnName]  -- ^ List of target column names

    -> ColXForm      -- ^ Column Transformation function

    -> YesNo         -- ^ Remove source column option

    -> RPredicate    -- ^ Filtering predicate

    -> RColMapping   -- ^ Output Column Mapping

createColMapping :: [ColumnName]
-> [ColumnName]
-> ([RDataType] -> [RDataType])
-> YesNo
-> RPredicate
-> RColMapping
createColMapping (ColumnName
src:[]) (ColumnName
trg:[]) [RDataType] -> [RDataType]
xForm YesNo
remove RPredicate
fPred = RMap1x1 :: ColumnName
-> YesNo
-> ColumnName
-> (RDataType -> RDataType)
-> RPredicate
-> RColMapping
RMap1x1 {srcCol :: ColumnName
srcCol = ColumnName
src, removeSrcCol :: YesNo
removeSrcCol = YesNo
remove, trgCol :: ColumnName
trgCol = ColumnName
trg, transform1x1 :: RDataType -> RDataType
transform1x1 = \RDataType
x -> [RDataType] -> RDataType
forall a. [a] -> a
unlist ([RDataType] -> RDataType) -> [RDataType] -> RDataType
forall a b. (a -> b) -> a -> b
$ [RDataType] -> [RDataType]
xForm (RDataType
xRDataType -> [RDataType] -> [RDataType]
forall a. a -> [a] -> [a]
:[]), srcRTupleFilter :: RPredicate
srcRTupleFilter = RPredicate
fPred}
                                                            where unlist :: [a] -> a
                                                                  unlist :: [a] -> a
unlist (a
x:[]) = a
x  -- since this is a 1x1 col mapping, we are sure that xForm will return a single element list

createColMapping [ColumnName]
srcCols (ColumnName
trg:[]) [RDataType] -> [RDataType]
xForm YesNo
remove RPredicate
fPred =  RMapNx1 :: [ColumnName]
-> YesNo
-> ColumnName
-> ([RDataType] -> RDataType)
-> RPredicate
-> RColMapping
RMapNx1 {srcColGrp :: [ColumnName]
srcColGrp = [ColumnName]
srcCols, removeSrcCol :: YesNo
removeSrcCol = YesNo
remove, trgCol :: ColumnName
trgCol = ColumnName
trg, transformNx1 :: [RDataType] -> RDataType
transformNx1 = \[RDataType]
x -> [RDataType] -> RDataType
forall a. [a] -> a
unlist ([RDataType] -> RDataType) -> [RDataType] -> RDataType
forall a b. (a -> b) -> a -> b
$ [RDataType] -> [RDataType]
xForm ([RDataType]
x), srcRTupleFilter :: RPredicate
srcRTupleFilter = RPredicate
fPred}                                                                
                                                            where unlist :: [a] -> a
                                                                  unlist :: [a] -> a
unlist (a
x:[]) = a
x  -- since this is a Nx1 col mapping, we are sure that xForm will return a single element list

createColMapping (ColumnName
src:[]) [ColumnName]
trgCols [RDataType] -> [RDataType]
xForm YesNo
remove RPredicate
fPred =  RMap1xN :: ColumnName
-> YesNo
-> [ColumnName]
-> (RDataType -> [RDataType])
-> RPredicate
-> RColMapping
RMap1xN {srcCol :: ColumnName
srcCol = ColumnName
src, removeSrcCol :: YesNo
removeSrcCol = YesNo
remove, trgColGrp :: [ColumnName]
trgColGrp = [ColumnName]
trgCols, transform1xN :: RDataType -> [RDataType]
transform1xN = \RDataType
x -> [RDataType] -> [RDataType]
xForm (RDataType
xRDataType -> [RDataType] -> [RDataType]
forall a. a -> [a] -> [a]
:[]), srcRTupleFilter :: RPredicate
srcRTupleFilter = RPredicate
fPred}                                                                  
createColMapping [ColumnName]
srcCols [ColumnName]
trgCols [RDataType] -> [RDataType]
xForm YesNo
remove RPredicate
fPred =  RMapNxM :: [ColumnName]
-> YesNo
-> [ColumnName]
-> ([RDataType] -> [RDataType])
-> RPredicate
-> RColMapping
RMapNxM {srcColGrp :: [ColumnName]
srcColGrp = [ColumnName]
srcCols, removeSrcCol :: YesNo
removeSrcCol = YesNo
remove, trgColGrp :: [ColumnName]
trgColGrp = [ColumnName]
trgCols, transformNxM :: [RDataType] -> [RDataType]
transformNxM = [RDataType] -> [RDataType]
xForm, srcRTupleFilter :: RPredicate
srcRTupleFilter = RPredicate
fPred}


-- | runCM operator executes an RColMapping

-- If a target-column has the same name with a source-column and a DontRemoveSrc (i.e., removeSrcCol == No) has been specified, then the (target-column, target-value) key-value pair,

-- overwrites the corresponding (source-column, source-value) key-value pair

runCM :: RColMapping -> RTable -> RTable
runCM = RColMapping -> RTable -> RTable
runColMapping

-- | Apply an RColMapping to a source RTable and produce a new RTable.

-- If a target-column has the same name with a source-column and a DontRemoveSrc (i.e., removeSrcCol == No) has been specified, then the (target-column, target-value) key-value pair,

-- overwrites the corresponding (source-column, source-value) key-value pair.

-- If a filter is embedded in the 'RColMapping', then the returned 'RTable' will include only the 'RTuple's that satisfy the filter predicate.

runColMapping :: RColMapping -> RTable -> RTable
runColMapping :: RColMapping -> RTable -> RTable
runColMapping RColMapping
ColMapEmpty RTable
rtabS = RTable
rtabS
runColMapping RColMapping
rmap RTable
rtabS = 
    if RTable -> Bool
isRTabEmpty RTable
rtabS
        then RTable
emptyRTable
        else 
            case RColMapping
rmap of 
                RMap1x1 {srcCol :: RColMapping -> ColumnName
srcCol = ColumnName
src, trgCol :: RColMapping -> ColumnName
trgCol = ColumnName
trg, removeSrcCol :: RColMapping -> YesNo
removeSrcCol = YesNo
rmvFlag, transform1x1 :: RColMapping -> RDataType -> RDataType
transform1x1 = RDataType -> RDataType
xform, srcRTupleFilter :: RColMapping -> RPredicate
srcRTupleFilter = RPredicate
pred} -> do  -- an RTable is a Monad just like a list is a Monad, representing a non-deterministic value

                        RTuple
srcRtuple <- RPredicate -> RTable -> RTable
f RPredicate
pred RTable
rtabS                                                                        
                        let 
                            -- 1. get original column value 

                            srcValue :: RDataType
srcValue = ColumnName -> RTuple -> RDataType
getRTupColValue ColumnName
src RTuple
srcRtuple
                            -- srcValue = HM.lookupDefault    Null -- return Null if value cannot be found based on column name 

                            --                                src   -- column name to look for (source) - i.e., the key in the HashMap

                            --                                srcRtuple  -- source RTuple (i.e., a HashMap ColumnName RDataType)

                            
                            -- 2. apply transformation to retrieve new column value

                            trgValue :: RDataType
trgValue = RDataType -> RDataType
xform RDataType
srcValue                                         
                            
                            -- 3. remove the original ColumnName, Value mapping from the RTuple

                            rtupleTemp :: RTuple
rtupleTemp = 
                                case YesNo
rmvFlag of
                                    YesNo
Yes -> ColumnName -> RTuple -> RTuple
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> HashMap k v
HM.delete ColumnName
src RTuple
srcRtuple
                                    YesNo
No  -> RTuple
srcRtuple
                            
                            -- 4. insert new (ColumnName, Value) pair and thus create the target RTuple

                            trgRtuple :: RTuple
trgRtuple = ColumnName -> RDataType -> RTuple -> RTuple
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
HM.insert ColumnName
trg RDataType
trgValue RTuple
rtupleTemp
                        
                        -- return new RTable

                        RTuple -> RTable
forall (m :: * -> *) a. Monad m => a -> m a
return RTuple
trgRtuple

                RMapNx1 {srcColGrp :: RColMapping -> [ColumnName]
srcColGrp = [ColumnName]
srcL, trgCol :: RColMapping -> ColumnName
trgCol = ColumnName
trg, removeSrcCol :: RColMapping -> YesNo
removeSrcCol = YesNo
rmvFlag, transformNx1 :: RColMapping -> [RDataType] -> RDataType
transformNx1 = [RDataType] -> RDataType
xform, srcRTupleFilter :: RColMapping -> RPredicate
srcRTupleFilter = RPredicate
pred} -> do  -- an RTable is a Monad just like a list is a Monad, representing a non-deterministic value

                        RTuple
srcRtuple <- RPredicate -> RTable -> RTable
f RPredicate
pred RTable
rtabS                                                                        
                        let 
                            -- 1. get original column value (in this case it is a list of values)

                            srcValueL :: [RDataType]
srcValueL = (ColumnName -> RDataType) -> [ColumnName] -> [RDataType]
forall a b. (a -> b) -> [a] -> [b]
Data.List.map ( \ColumnName
src ->  ColumnName -> RTuple -> RDataType
getRTupColValue ColumnName
src RTuple
srcRtuple

                                            -- \src -> HM.lookupDefault       Null -- return Null if value cannot be found based on column name 

                                            --                                 src   -- column name to look for (source) - i.e., the key in the HashMap

                                            --                                 srcRtuple  -- source RTuple (i.e., a HashMap ColumnName RDataType)

                                            ) [ColumnName]
srcL
                            
                            -- 2. apply transformation to retrieve new column value

                            trgValue :: RDataType
trgValue = [RDataType] -> RDataType
xform [RDataType]
srcValueL                                         
                            
                            -- 3. remove the original (ColumnName, Value) mappings from the RTuple (i.e., remove ColumnNames mentioned in the RColMapping from source RTuple)

                            rtupleTemp :: RTuple
rtupleTemp = 
                                case YesNo
rmvFlag of
                                    YesNo
Yes -> (ColumnName -> RDataType -> Bool) -> RTuple -> RTuple
forall k v. (k -> v -> Bool) -> HashMap k v -> HashMap k v
HM.filterWithKey (\ColumnName
colName RDataType
_ -> ColumnName -> [ColumnName] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
Data.List.notElem ColumnName
colName [ColumnName]
srcL) RTuple
srcRtuple
                                    YesNo
No  -> RTuple
srcRtuple
                            
                            -- 4. insert new ColumnName, Value mapping as the target RTuple must be

                            trgRtuple :: RTuple
trgRtuple = ColumnName -> RDataType -> RTuple -> RTuple
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
HM.insert ColumnName
trg RDataType
trgValue RTuple
rtupleTemp
                        -- return new RTable

                        RTuple -> RTable
forall (m :: * -> *) a. Monad m => a -> m a
return RTuple
trgRtuple

                RMap1xN {srcCol :: RColMapping -> ColumnName
srcCol = ColumnName
src, trgColGrp :: RColMapping -> [ColumnName]
trgColGrp = [ColumnName]
trgL, removeSrcCol :: RColMapping -> YesNo
removeSrcCol = YesNo
rmvFlag, transform1xN :: RColMapping -> RDataType -> [RDataType]
transform1xN = RDataType -> [RDataType]
xform, srcRTupleFilter :: RColMapping -> RPredicate
srcRTupleFilter = RPredicate
pred} -> do  -- an RTable is a Monad just like a list is a Monad, representing a non-deterministic value

                        RTuple
srcRtuple <- RPredicate -> RTable -> RTable
f RPredicate
pred RTable
rtabS                                                                        
                        let 
                            -- 1. get original column value 

                            srcValue :: RDataType
srcValue = ColumnName -> RTuple -> RDataType
getRTupColValue ColumnName
src RTuple
srcRtuple

                            -- srcValue = HM.lookupDefault    Null -- return Null if value cannot be found based on column name 

                            --                                src   -- column name to look for (source) - i.e., the key in the HashMap

                            --                                srcRtuple  -- source RTuple (i.e., a HashMap ColumnName RDataType)


                            -- 2. apply transformation to retrieve new column value list

                            trgValueL :: [RDataType]
trgValueL = RDataType -> [RDataType]
xform RDataType
srcValue                                         

                            -- 3. remove the original ColumnName, Value mapping from the RTuple

                            rtupleTemp :: RTuple
rtupleTemp = 
                                case YesNo
rmvFlag of
                                    YesNo
Yes -> ColumnName -> RTuple -> RTuple
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> HashMap k v
HM.delete ColumnName
src RTuple
srcRtuple
                                    YesNo
No  -> RTuple
srcRtuple

                            -- 4. insert new (ColumnName, Value) pairs to the target RTuple

                            tempL :: [(ColumnName, RDataType)]
tempL = [ColumnName] -> [RDataType] -> [(ColumnName, RDataType)]
forall a b. [a] -> [b] -> [(a, b)]
Data.List.zip [ColumnName]
trgL [RDataType]
trgValueL
                            trgRtuple :: RTuple
trgRtuple = RTuple -> RTuple -> RTuple
forall k v.
(Eq k, Hashable k) =>
HashMap k v -> HashMap k v -> HashMap k v
HM.union ([(ColumnName, RDataType)] -> RTuple
forall k v. (Eq k, Hashable k) => [(k, v)] -> HashMap k v
HM.fromList [(ColumnName, RDataType)]
tempL) RTuple
rtupleTemp  -- implement as a hashmap union between new (columnName,value) pairs and source tuple

                                
                        -- return new RTable

                        RTuple -> RTable
forall (m :: * -> *) a. Monad m => a -> m a
return RTuple
trgRtuple

                RMapNxM {srcColGrp :: RColMapping -> [ColumnName]
srcColGrp = [ColumnName]
srcL, trgColGrp :: RColMapping -> [ColumnName]
trgColGrp = [ColumnName]
trgL, removeSrcCol :: RColMapping -> YesNo
removeSrcCol = YesNo
rmvFlag, transformNxM :: RColMapping -> [RDataType] -> [RDataType]
transformNxM = [RDataType] -> [RDataType]
xform, srcRTupleFilter :: RColMapping -> RPredicate
srcRTupleFilter = RPredicate
pred} -> do  -- an RTable is a Monad just like a list is a Monad, representing a non-deterministic value

                        RTuple
srcRtuple <- RPredicate -> RTable -> RTable
f RPredicate
pred RTable
rtabS                                                                        
                        let 
                            -- 1. get original column value (in this case it is a list of values)

                            srcValueL :: [RDataType]
srcValueL = (ColumnName -> RDataType) -> [ColumnName] -> [RDataType]
forall a b. (a -> b) -> [a] -> [b]
Data.List.map (  \ColumnName
src ->  ColumnName -> RTuple -> RDataType
getRTupColValue ColumnName
src RTuple
srcRtuple

                                            -- \src -> HM.lookupDefault       Null -- return Null if value cannot be found based on column name 

                                            --                                 src   -- column name to look for (source) - i.e., the key in the HashMap

                                            --                                 srcRtuple  -- source RTuple (i.e., a HashMap ColumnName RDataType)

                                            ) [ColumnName]
srcL
                            
                            -- 2. apply transformation to retrieve new column value

                            trgValueL :: [RDataType]
trgValueL = [RDataType] -> [RDataType]
xform [RDataType]
srcValueL                                         

                            -- 3. remove the original ColumnName, Value mapping from the RTuple

                            rtupleTemp :: RTuple
rtupleTemp = 
                                case YesNo
rmvFlag of
                                    YesNo
Yes -> (ColumnName -> RDataType -> Bool) -> RTuple -> RTuple
forall k v. (k -> v -> Bool) -> HashMap k v -> HashMap k v
HM.filterWithKey (\ColumnName
colName RDataType
_ -> ColumnName -> [ColumnName] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
Data.List.notElem ColumnName
colName [ColumnName]
srcL) RTuple
srcRtuple
                                    YesNo
No  -> RTuple
srcRtuple

                            -- 4. insert new (ColumnName, Value) pairs to the target RTuple

                            tempL :: [(ColumnName, RDataType)]
tempL = [ColumnName] -> [RDataType] -> [(ColumnName, RDataType)]
forall a b. [a] -> [b] -> [(a, b)]
Data.List.zip [ColumnName]
trgL [RDataType]
trgValueL
                            trgRtuple :: RTuple
trgRtuple = RTuple -> RTuple -> RTuple
forall k v.
(Eq k, Hashable k) =>
HashMap k v -> HashMap k v -> HashMap k v
HM.union ([(ColumnName, RDataType)] -> RTuple
forall k v. (Eq k, Hashable k) => [(k, v)] -> HashMap k v
HM.fromList [(ColumnName, RDataType)]
tempL) RTuple
rtupleTemp  -- implement as a hashmap union between new (columnName,value) pairs and source tuple

                                
                        -- return new RTable

                        RTuple -> RTable
forall (m :: * -> *) a. Monad m => a -> m a
return RTuple
trgRtuple

-- | An ETL operation applied to an RTable can be either an 'ROperation' (a relational agebra operation like join, filter etc.) defined in "RTable.Core" module,

--   or an 'RColMapping' applied to an 'RTable'

data ETLOperation =  
            ETLrOp { ETLOperation -> ROperation
rop  :: ROperation   } 
        |   ETLcOp { ETLOperation -> RColMapping
cmap :: RColMapping } 
                


-- | executes a Unary ETL Operation

etlOpU :: ETLOperation -> RTable -> RTable
etlOpU = ETLOperation -> RTable -> RTable
runUnaryETLOperation

-- | executes an ETL Operation

runUnaryETLOperation ::
    ETLOperation
    -> RTable  -- ^ input RTable

    -> RTable  -- ^ output RTable

runUnaryETLOperation :: ETLOperation -> RTable -> RTable
runUnaryETLOperation ETLOperation
op RTable
inpRtab = 
    case ETLOperation
op of 
        ETLrOp { rop :: ETLOperation -> ROperation
rop  = ROperation
relOp  } -> ROperation -> RTable -> RTable
ropU ROperation
relOp RTable
inpRtab
        ETLcOp { cmap :: ETLOperation -> RColMapping
cmap = RColMapping
colMap } -> RColMapping -> RTable -> RTable
runCM RColMapping
colMap RTable
inpRtab

-- | executes a Binary ETL Operation

etlOpB :: ETLOperation -> RTable -> RTable -> RTable
etlOpB = ETLOperation -> RTable -> RTable -> RTable
runBinaryETLOperation

-- | executes an ETL Operation

runBinaryETLOperation ::
    ETLOperation
    -> RTable  -- ^ input RTable1

    -> RTable  -- ^ input RTable2    

    -> RTable  -- ^ output RTable

runBinaryETLOperation :: ETLOperation -> RTable -> RTable -> RTable
runBinaryETLOperation ETLrOp {rop :: ETLOperation -> ROperation
rop  = ROperation
relOp} RTable
inpT1 RTable
inpT2 = ROperation -> RTable -> RTable -> RTable
ropB ROperation
relOp RTable
inpT1 RTable
inpT2


-- | ETLmapping : it is the equivalent of a mapping in an ETL tool and consists of a series of ETLOperations that are applied, one-by-one,

--   to some initial input RTable, but if binary ETLOperations are included in the ETLMapping, then there will be more than one input RTables that

--   the ETLOperations of the ETLMapping will be applied to. When we apply (i.e., run) an ETLOperation of the ETLMapping we get a new RTable,

--   which is then inputed to the next ETLOperation, until we finally run all ETLOperations. The purpose of the execution of an ETLMapping is     

--   to produce a single new RTable as the result of the execution of all the ETLOperations of the ETLMapping.

--   In terms of database operations an ETLMapping is the equivalent of an CREATE AS SELECT (CTAS) operation in an RDBMS. This means that

--   anything that can be done in the SELECT part (i.e., column projection, row filtering, grouping and join operations, etc.)

--   in order to produce a new table, can be included in an ETLMapping.

--

--   An ETLMapping is executed with the etl (runETLmapping) operator

--

--   Implementation: 

--   An ETLMapping is implemented as a binary tree where the node represents the ETLOperation to be executed and the left branch is another 

--   ETLMapping, while the right branch is an RTable (that might be empty in the case of a Unary ETLOperation). 

--   Execution proceeds from bottom-left to top-right.

--   This is similar in concept to a left-deep join tree. In a Left-Deep ETLOperation tree the "pipe" of ETLOperations comes from 

--   the left branches always.

--   The leaf node is always an ETLMapping with an ETLMapEmpty in the left branch and an RTable  in the right branch (the initial RTable inputed

--   to the ETLMapping).

--   In this way, the result of the execution of each ETLOperation (which is an RTable) is passed on to the next ETLOperation. Here is an example:

--

-- @

--     A Left-Deep ETLOperation Tree

-- 

--                              final RTable result

--                                    / 

--                                 etlOp3 

--                              /       \ 

--                           etlOp2     rtab2

--                          /      \ 

-- A leaf-node -->    etlOp1    emptyRTab

--                    /       \

--              ETLMapEmpty   rtab1

--

-- @

--

--  You see that always on the left branch we have an ETLMapping data type (i.e., a left-deep ETLOperation tree). 

--  So how do we implement the following case?

--

-- @

--

--                     final RTable result

--                             / 

--  A leaf-node -->         etlOp1 

--                          /       \

--                         rtab1   rtab2

--

-- @

--

-- The answer is that we "model" the left RTable (rtab1 in our example) as an ETLMapping of the form:

--

-- @

--  ETLMapLD { etlOp = ETLcOp{cmap = ColMapEmpty}, tabL = ETLMapEmpty, tabR = rtab1 }

-- @

--

-- So we embed the rtab1 in a ETLMapping, which is a leaf (i.e., it has an empty prevMap), the rtab1 is in 

-- the right branch (tabR) and the ETLOperation is the EmptyColMapping, which returns its input RTable when executed.

-- We can use function 'rtabToETLMapping' for this job. So it becomes

-- @

-- A leaf-node -->    etlOp1    

--                    /     \

--   rtabToETLMapping rtab1  rtab2

-- @

--

-- In this manner, a leaf-node can also be implemented like this:

--

-- @

--                              final RTable result

--                                    / 

--                                 etlOp3 

--                              /       \ 

--                           etlOp2     rtab2

--                          /      \ 

-- A leaf-node -->    etlOp1    emptyRTab

--                    /     \

--   rtabToETLMapping rtab1  emptyRTable

-- @

--

data ETLMapping = 
        ETLMapEmpty -- ^ an empty node

    |   ETLMapLD    
        {           ETLMapping -> ETLOperation
etlOp     :: ETLOperation -- ^ the ETLOperation to be executed    

                    ,ETLMapping -> ETLMapping
tabL     :: ETLMapping   -- ^ the left-branch corresponding to the previous ETLOperation, which is input to this one.

                                              --       

                    ,ETLMapping -> RTable
tabR     :: RTable       -- ^ the right branch corresponds to another RTable (for binary ETL operations). 

                                               --  If this is a Unary ETLOperation then this field must be an empty RTable.

        } -- ^ a Left-Deep node

    |   ETLMapRD    
        {           etlOp     :: ETLOperation -- ^ the ETLOperation to be executed    

                    ,ETLMapping -> RTable
tabLrd     :: RTable       -- ^ the left-branch corresponds to another RTable (for binary ETL operations). 

                                              --   If this is a Unary ETLOperation then this field must be an empty RTable.                                                                                             

                    ,ETLMapping -> ETLMapping
tabRrd     :: ETLMapping   -- ^ the right branch corresponding to the previous ETLOperation, which is input to this one.

        } -- ^ a Right-Deep node

    |   ETLMapBal    
        {           etlOp     :: ETLOperation -- ^ the ETLOperation to be executed    

                    ,ETLMapping -> ETLMapping
tabLbal     :: ETLMapping   -- ^ the left-branch corresponding to the previous ETLOperation, which is input to this one. 

                                              --   If this is a Unary ETLOperation then this field might be an empty ETLMapping.                                                                                             

                    ,ETLMapping -> ETLMapping
tabRbal     :: ETLMapping   -- ^ the right branch corresponding corresponding to the previous ETLOperation, which is input to this one.                                               --   If this is a Unary ETLOperation then this field might be an empty ETLMapping.

        } -- ^ a Balanced node



instance Eq ETLMapping where
    ETLMapping
etlMap1 == :: ETLMapping -> ETLMapping -> Bool
== ETLMapping
etlMap2 = 
            (ETLMapping -> RTable
etl ETLMapping
etlMap1) RTable -> RTable -> Bool
forall a. Eq a => a -> a -> Bool
== (ETLMapping -> RTable
etl ETLMapping
etlMap2)  -- two ETLMappings are equal if the RTables resulting from their execution are equal


-- | Creates a left-deep leaf ETL Mapping, of the following form:

--

-- @

--     A Left-Deep ETLOperation Tree

-- 

--                              final RTable result

--                                    / 

--                                 etlOp3 

--                              /       \ 

--                           etlOp2     rtab2

--                          /      \ 

-- A leaf-node -->    etlOp1    emptyRTab

--                    /       \

--              ETLMapEmpty   rtab1

--

-- @

--

createLeafETLMapLD ::
       ETLOperation -- ^ ETL operation of this ETL mapping

    -> RTable       -- ^ input RTable

    -> ETLMapping   -- ^ output ETLMapping

createLeafETLMapLD :: ETLOperation -> RTable -> ETLMapping
createLeafETLMapLD ETLOperation
etlop RTable
rt = ETLMapLD :: ETLOperation -> ETLMapping -> RTable -> ETLMapping
ETLMapLD { etlOp :: ETLOperation
etlOp = ETLOperation
etlop, tabL :: ETLMapping
tabL = ETLMapping
ETLMapEmpty, tabR :: RTable
tabR = RTable
rt}

-- | creates a Binary operation leaf node of the form:

--

-- @

--

-- A leaf-node -->    etlOp1    

--                    /     \

--   rtabToETLMapping rtab1  rtab2

-- @

-- 

createLeafBinETLMapLD ::
       ETLOperation -- ^ ETL operation of this ETL mapping

    -> RTable       -- ^ input RTable1

    -> RTable       -- ^ input RTable2

    -> ETLMapping   -- ^ output ETLMapping

createLeafBinETLMapLD :: ETLOperation -> RTable -> RTable -> ETLMapping
createLeafBinETLMapLD ETLOperation
etlop RTable
rt1 RTable
rt2 = ETLMapLD :: ETLOperation -> ETLMapping -> RTable -> ETLMapping
ETLMapLD { etlOp :: ETLOperation
etlOp = ETLOperation
etlop, tabL :: ETLMapping
tabL = RTable -> ETLMapping
rtabToETLMapping RTable
rt1, tabR :: RTable
tabR = RTable
rt2}

-- | Connects an ETL Mapping to a left-deep ETL Mapping tree, of the form

--

-- @

--     A Left-Deep ETLOperation Tree

-- 

--                              final RTable result

--                                    / 

--                                 etlOp3 

--                              /       \ 

--                           etlOp2     rtab2

--                          /      \ 

-- A leaf-node -->    etlOp1    emptyRTab

--                    /       \

--              ETLMapEmpty   rtab1

--

-- @

--

-- Example:

--

-- @

--   -- connect a Unary ETL mapping (etlOp2)

--

--                           etlOp2    

--                          /      \ 

--                       etlOp1    emptyRTab

--        

--   => connectETLMapLD etlOp2 emptyRTable prevMap

--

--   -- connect a Binary ETL Mapping (etlOp3)

--

--                                 etlOp3 

--                              /       \ 

--                           etlOp2     rtab2

--

--   => connectETLMapLD etlOp3 rtab2 prevMap

-- @

--

-- Note that the right branch (RTable) appears first in the list of input arguments of this function and 

-- the left branch (ETLMapping) appears second. This is strange, and one could thought that it is a mistake

-- (i.e., the left branch should appear first and the right branch second) since we are reading from left to right.

-- However this was a deliberate choice, so that we leave the left branch (which is the connection point with the

-- previous ETLMapping) as the last argument, and thus we can partially apply the argumenets and get a new function

-- with input parameter only the previous mapping. This is very helpfull in function composition

-- 

connectETLMapLD ::
        ETLOperation  -- ^ ETL operation of this ETL Mapping

     -> RTable        -- ^ Right RTable (right branch) (if this is a Unary ETL mapping this should be an emptyRTable) 

     -> ETLMapping    -- ^ Previous ETL mapping (left branch)        

     -> ETLMapping    -- ^ New ETL Mapping, which has added at the end the new node

connectETLMapLD :: ETLOperation -> RTable -> ETLMapping -> ETLMapping
connectETLMapLD ETLOperation
etlop RTable
rt ETLMapping
prevMap = ETLMapLD :: ETLOperation -> ETLMapping -> RTable -> ETLMapping
ETLMapLD { etlOp :: ETLOperation
etlOp = ETLOperation
etlop, tabL :: ETLMapping
tabL = ETLMapping
prevMap, tabR :: RTable
tabR = RTable
rt}


-- | This operator executes an 'ETLMapping'

etl :: ETLMapping -> RTable
etl = ETLMapping -> RTable
runETLmapping

-- | Executes an 'ETLMapping'

runETLmapping ::
    ETLMapping  -- ^ input ETLMapping

    -> RTable   -- ^ output RTable

-- empty ETL mapping

runETLmapping :: ETLMapping -> RTable
runETLmapping ETLMapping
ETLMapEmpty = RTable
emptyRTable
--  ETL mapping with an empty ETLOperation, which is just modelling an RTable

runETLmapping ETLMapLD { etlOp :: ETLMapping -> ETLOperation
etlOp = ETLcOp{cmap :: ETLOperation -> RColMapping
cmap = RColMapping
ColMapEmpty}, tabL :: ETLMapping -> ETLMapping
tabL = ETLMapping
ETLMapEmpty, tabR :: ETLMapping -> RTable
tabR = RTable
rtab } = RTable
rtab
-- leaf node --> unary ETLOperation on RTable

runETLmapping ETLMapLD { etlOp :: ETLMapping -> ETLOperation
etlOp = ETLOperation
runMe, tabL :: ETLMapping -> ETLMapping
tabL = ETLMapping
ETLMapEmpty, tabR :: ETLMapping -> RTable
tabR = RTable
rtab } = ETLOperation -> RTable -> RTable
etlOpU ETLOperation
runMe RTable
rtab {--   if (isRTabEmpty rtab) 
                                                                                                    then emptyRTable  
                                                                                                    else etlOpU runMe rtab    --}
runETLmapping ETLMapLD { etlOp :: ETLMapping -> ETLOperation
etlOp = ETLOperation
runMe, tabL :: ETLMapping -> ETLMapping
tabL = ETLMapping
prevmap, tabR :: ETLMapping -> RTable
tabR = RTable
rtab } =
        if (RTable -> Bool
isRTabEmpty RTable
rtab)
        then let
                prevRtab :: RTable
prevRtab = ETLMapping -> RTable
runETLmapping ETLMapping
prevmap -- execute previous ETLMapping to get the resulting RTable

             in ETLOperation -> RTable -> RTable
etlOpU ETLOperation
runMe RTable
prevRtab
        else let   
                prevRtab :: RTable
prevRtab = ETLMapping -> RTable
runETLmapping ETLMapping
prevmap -- execute previous ETLMapping to get the resulting RTable

             in ETLOperation -> RTable -> RTable -> RTable
etlOpB ETLOperation
runMe RTable
prevRtab RTable
rtab

-- | This operator executes an 'ETLMapping' and returns the 'RTabResult'  'Writer' Monad

-- that embedds apart from the resulting RTable, also the number of 'RTuple's returned

etlRes ::
       ETLMapping  -- ^ input ETLMapping

    -> RTabResult   -- ^ output RTabResult

etlRes :: ETLMapping -> RTabResult
etlRes ETLMapping
etlm = 
    let resultRtab :: RTable
resultRtab = ETLMapping -> RTable
etl ETLMapping
etlm
        returnedRtups :: RTuplesRet
returnedRtups = Int -> RTuplesRet
rtuplesRet (Int -> RTuplesRet) -> Int -> RTuplesRet
forall a b. (a -> b) -> a -> b
$ RTable -> Int
forall a. Vector a -> Int
V.length RTable
resultRtab
    in (RTable, RTuplesRet) -> RTabResult
rtabResult (RTable
resultRtab, RTuplesRet
returnedRtups)

-- | Model an 'RTable' as an 'ETLMapping' which when executed will return the input 'RTable'

rtabToETLMapping ::
       RTable
    -> ETLMapping 
rtabToETLMapping :: RTable -> ETLMapping
rtabToETLMapping RTable
rt =   if (RTable -> Bool
isRTabEmpty RTable
rt)
                        then ETLMapping
ETLMapEmpty
                        else ETLMapLD :: ETLOperation -> ETLMapping -> RTable -> ETLMapping
ETLMapLD { etlOp :: ETLOperation
etlOp = ETLcOp :: RColMapping -> ETLOperation
ETLcOp {cmap :: RColMapping
cmap = RColMapping
ColMapEmpty}, tabL :: ETLMapping
tabL = ETLMapping
ETLMapEmpty, tabR :: RTable
tabR = RTable
rt }

--

--   An ETLMapping is implemented as a series of ETLOperations conected with the :=> operator which is right associative

--  i.e., ETLOp3 :=> ETLOp2 :=> ETLOp1 RTable is  (ETLOp3 :=> (ETLOp2 :=> (ETLOp1 RTable))

--  infixr 9 :=> 

-- data ETLMapping = EmptyETLop RTable | ETLMapping :=> ETLOperation



{--
-- | Executes an ETL mapping 
--   Note htat the source RTables are "embedded" in the data constructors of the ETLMapping data type.
runETLmapping ::
    ETLMapping   -- ^ input ETLMapping
    -> RTable    -- output RTable
runETLmapping EmptyETLop rtab = rtab
runETLmapping etlMap :=> etlOp = runETLmapping etlMap 
    
    case etlOp of
        ETLrOp {rop = relOp} -> runROperation relOp
--}

-- Example of an ETLMapping((<T> TabColTransformation).(<F> RPredicate).(<T> TabTransformation) rtable1) `(<EJ> RJoinPredicate)` rtable2



-- ##################################################

-- *  Various useful RDataType Transformations 

-- *  and pre-cooked Column Mappings

-- ##################################################





-- | Returns an ETL Operation that adds a surrogate key column to an RTable

-- The first argument is the initial value of the surrogate key. If Nothing is given, then

-- the initial value will be 0.    

-- addSurrogateKey_old :: Integral a =>    

--        Maybe a       -- ^ The initial value of the Surrogate Key will be the value of this parameter + 1

--     -> a            -- ^  Number of rows that the Surrogate Key will be assigned

--     -> ColumnName    -- ^ The name of the surrogate key column

--     -> ETLOperation  -- ^ Output ETL operation which encapsulates the add surrogate key column mapping

-- addSurrogateKey_old init 0 cname = 

--     let initVal = case init of 

--             Just x -> x

--             Nothing -> 0

--         cmap = RMap1x1 {   

--                                 srcCol = "", removeSrcCol = No   -- the source column can be any column in this mapping, even ""

--                                 ,trgCol = cname

--                                 ,transform1x1 = \_ -> RInt (fromIntegral initVal)

--                                 ,srcRTupleFilter = \_ -> True   

--                        }

--     in ETLcOp cmap

-- addSurrogateKey_old init numRows cname = 

--     let initVal = case init of 

--             Just x -> x

--             Nothing -> 0

--         cmap = RMap1x1 {   

--                                 srcCol = "", removeSrcCol = No   -- the source column can be any column in this mapping, even ""

--                                 ,trgCol = cname

--                                 ,transform1x1 = \_ -> RInt (fromIntegral initVal + 1)

--                                 ,srcRTupleFilter = \_ -> True   

--                        }

--     in addSurrogateKey_old (Just (initVal + 1)) (numRows - 1) cname