{-| Module : ETL Description : Implements ETL operations over RTables. Copyright : (c) Nikos Karagiannidis, 2018 License : BSD3 Maintainer : nkarag@gmail.com Stability : stable Portability : POSIX This is an internal module (i.e., not to be imported directly) that implements the core ETL functionality that is exposed via the __Julius__ EDSL for ETL/ELT found in the "Etl.Julius" module) -} {-# LANGUAGE OverloadedStrings #-} -- :set -XOverloadedStrings --{-# LANGUAGE OverloadedRecordFields #-} --{-# LANGUAGE DuplicateRecordFields #-} module Etl.Internal.Core ( -- * Basic Data Types RColMapping (..) ,ColXForm ,createColMapping ,ETLOperation (..) ,ETLMapping (..) ,YesNo (..) -- * Execution of an ETL Mapping ,runCM ,etlOpU ,etlOpB ,etl ,etlRes -- * Functions for \"Building\" an ETL Mapping ,rtabToETLMapping ,createLeafETLMapLD ,createLeafBinETLMapLD ,connectETLMapLD -- * Various ETL Operations , ) where -- Data.RTable import RTable.Core -- Text import Data.Text as T -- HashMap -- https://hackage.haskell.org/package/unordered-containers-0.2.7.2/docs/Data-HashMap-Strict.html import Data.HashMap.Strict as HM -- Data.List import Data.List (notElem, map, zip) -- Data.Vector import Data.Vector as V data YesNo = Yes | No deriving (YesNo -> YesNo -> Bool (YesNo -> YesNo -> Bool) -> (YesNo -> YesNo -> Bool) -> Eq YesNo forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a /= :: YesNo -> YesNo -> Bool $c/= :: YesNo -> YesNo -> Bool == :: YesNo -> YesNo -> Bool $c== :: YesNo -> YesNo -> Bool Eq, Int -> YesNo -> ShowS [YesNo] -> ShowS YesNo -> String (Int -> YesNo -> ShowS) -> (YesNo -> String) -> ([YesNo] -> ShowS) -> Show YesNo forall a. (Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a showList :: [YesNo] -> ShowS $cshowList :: [YesNo] -> ShowS show :: YesNo -> String $cshow :: YesNo -> String showsPrec :: Int -> YesNo -> ShowS $cshowsPrec :: Int -> YesNo -> ShowS Show) -- | This is the basic data type to define the column-to-column mapping from a source 'RTable' to a target 'RTable'. -- Essentially, an 'RColMapping' represents the column-level transformations of an 'RTuple' that will yield a target 'RTuple'. -- -- A mapping is simply a triple of the form ( Source-Column(s), Target-Column(s), Transformation, RTuple-Filter), where we define the source columns -- over which a transformation (i.e. a function) will be applied in order to yield the target columns. Also, an 'RPredicate' (i.e. a filter) might be applied on the source 'RTuple'. -- Remember that an 'RTuple' is essentially a mapping between a key (the Column Name) and a value (the 'RDataType' value). So the various 'RColMapping' -- data constructors below simply describe the possible modifications of an 'RTuple' orginating from its own columns. -- -- So, we can have the following mapping types: -- a) single-source column to single-target column mapping (1 to 1), -- the source column will be removed or not based on the 'removeSrcCol' flag (dublicate column names are not allowed in an 'RTuple') -- b) multiple-source columns to single-target column mapping (N to 1), -- The N columns will be merged to the single target column based on the transformation. -- The N columns will be removed from the RTuple or not based on the 'removeSrcCol' flag (dublicate column names are not allowed in an 'RTuple') -- c) single-source column to multiple-target columns mapping (1 to M) -- the source column will be "expanded" to M target columns based ont he transformation. -- the source column will be removed or not based on the 'removeSrcCol' flag (dublicate column names are not allowed in an 'RTuple') -- d) multiple-source column to multiple target columns mapping (N to M) -- The N source columns will be mapped to M target columns based on the transformation. -- The N columns will be removed from the RTuple or not based on the 'removeSrcCol' flag (dublicate column names are not allow in an 'RTuple') -- -- Some examples of mapping are the following: -- -- @ -- ("Start_Date", No, "StartDate", \t -> True) -- copy the source value to target and dont remove the source column, so the target RTuple will have both columns "Start_Date" and "StartDate" -- -- with the exactly the same value) -- -- (["Amount", "Discount"], Yes, "FinalAmount", (\[a, d] -> a * d) ) -- "FinalAmount" is a derived column based on a function applied to the two source columns. -- -- In the final RTuple we remove the two source columns. -- @ -- -- An 'RColMapping' can be applied with the 'runCM' (runColMapping) operator -- data RColMapping = ColMapEmpty | RMap1x1 { RColMapping -> ColumnName srcCol :: ColumnName, RColMapping -> YesNo removeSrcCol :: YesNo, RColMapping -> ColumnName trgCol :: ColumnName, RColMapping -> RDataType -> RDataType transform1x1 :: RDataType -> RDataType, RColMapping -> RPredicate srcRTupleFilter:: RPredicate } -- ^ single-source column to single-target column mapping (1 to 1). | RMapNx1 { RColMapping -> [ColumnName] srcColGrp :: [ColumnName], removeSrcCol :: YesNo, trgCol :: ColumnName, RColMapping -> [RDataType] -> RDataType transformNx1 :: [RDataType] -> RDataType, srcRTupleFilter:: RPredicate } -- ^ multiple-source columns to single-target column mapping (N to 1) | RMap1xN { srcCol :: ColumnName, removeSrcCol :: YesNo, RColMapping -> [ColumnName] trgColGrp :: [ColumnName], RColMapping -> RDataType -> [RDataType] transform1xN :: RDataType -> [RDataType], srcRTupleFilter:: RPredicate } -- ^ single-source column to multiple-target columns mapping (1 to N) | RMapNxM { srcColGrp :: [ColumnName], removeSrcCol :: YesNo, trgColGrp :: [ColumnName], RColMapping -> [RDataType] -> [RDataType] transformNxM :: [RDataType] -> [RDataType], srcRTupleFilter:: RPredicate } -- ^ multiple-source column to multiple target columns mapping (N to M) -- | A Column Transformation function data type. -- It is used in order to define an arbitrary column-level transformation (i.e., from a list of N input Column-Values we produce a list of M derived (output) Column-Values). -- A Column value is represented with the 'RDataType'. type ColXForm = [RDataType] -> [RDataType] -- | Constructs an RColMapping. -- This is the suggested method for creating a column mapping and not by calling the data constructors directly. createColMapping :: [ColumnName] -- ^ List of source column names -> [ColumnName] -- ^ List of target column names -> ColXForm -- ^ Column Transformation function -> YesNo -- ^ Remove source column option -> RPredicate -- ^ Filtering predicate -> RColMapping -- ^ Output Column Mapping createColMapping :: [ColumnName] -> [ColumnName] -> ([RDataType] -> [RDataType]) -> YesNo -> RPredicate -> RColMapping createColMapping (ColumnName src:[]) (ColumnName trg:[]) [RDataType] -> [RDataType] xForm YesNo remove RPredicate fPred = RMap1x1 :: ColumnName -> YesNo -> ColumnName -> (RDataType -> RDataType) -> RPredicate -> RColMapping RMap1x1 {srcCol :: ColumnName srcCol = ColumnName src, removeSrcCol :: YesNo removeSrcCol = YesNo remove, trgCol :: ColumnName trgCol = ColumnName trg, transform1x1 :: RDataType -> RDataType transform1x1 = \RDataType x -> [RDataType] -> RDataType forall a. [a] -> a unlist ([RDataType] -> RDataType) -> [RDataType] -> RDataType forall a b. (a -> b) -> a -> b $ [RDataType] -> [RDataType] xForm (RDataType xRDataType -> [RDataType] -> [RDataType] forall a. a -> [a] -> [a] :[]), srcRTupleFilter :: RPredicate srcRTupleFilter = RPredicate fPred} where unlist :: [a] -> a unlist :: [a] -> a unlist (a x:[]) = a x -- since this is a 1x1 col mapping, we are sure that xForm will return a single element list createColMapping [ColumnName] srcCols (ColumnName trg:[]) [RDataType] -> [RDataType] xForm YesNo remove RPredicate fPred = RMapNx1 :: [ColumnName] -> YesNo -> ColumnName -> ([RDataType] -> RDataType) -> RPredicate -> RColMapping RMapNx1 {srcColGrp :: [ColumnName] srcColGrp = [ColumnName] srcCols, removeSrcCol :: YesNo removeSrcCol = YesNo remove, trgCol :: ColumnName trgCol = ColumnName trg, transformNx1 :: [RDataType] -> RDataType transformNx1 = \[RDataType] x -> [RDataType] -> RDataType forall a. [a] -> a unlist ([RDataType] -> RDataType) -> [RDataType] -> RDataType forall a b. (a -> b) -> a -> b $ [RDataType] -> [RDataType] xForm ([RDataType] x), srcRTupleFilter :: RPredicate srcRTupleFilter = RPredicate fPred} where unlist :: [a] -> a unlist :: [a] -> a unlist (a x:[]) = a x -- since this is a Nx1 col mapping, we are sure that xForm will return a single element list createColMapping (ColumnName src:[]) [ColumnName] trgCols [RDataType] -> [RDataType] xForm YesNo remove RPredicate fPred = RMap1xN :: ColumnName -> YesNo -> [ColumnName] -> (RDataType -> [RDataType]) -> RPredicate -> RColMapping RMap1xN {srcCol :: ColumnName srcCol = ColumnName src, removeSrcCol :: YesNo removeSrcCol = YesNo remove, trgColGrp :: [ColumnName] trgColGrp = [ColumnName] trgCols, transform1xN :: RDataType -> [RDataType] transform1xN = \RDataType x -> [RDataType] -> [RDataType] xForm (RDataType xRDataType -> [RDataType] -> [RDataType] forall a. a -> [a] -> [a] :[]), srcRTupleFilter :: RPredicate srcRTupleFilter = RPredicate fPred} createColMapping [ColumnName] srcCols [ColumnName] trgCols [RDataType] -> [RDataType] xForm YesNo remove RPredicate fPred = RMapNxM :: [ColumnName] -> YesNo -> [ColumnName] -> ([RDataType] -> [RDataType]) -> RPredicate -> RColMapping RMapNxM {srcColGrp :: [ColumnName] srcColGrp = [ColumnName] srcCols, removeSrcCol :: YesNo removeSrcCol = YesNo remove, trgColGrp :: [ColumnName] trgColGrp = [ColumnName] trgCols, transformNxM :: [RDataType] -> [RDataType] transformNxM = [RDataType] -> [RDataType] xForm, srcRTupleFilter :: RPredicate srcRTupleFilter = RPredicate fPred} -- | runCM operator executes an RColMapping -- If a target-column has the same name with a source-column and a DontRemoveSrc (i.e., removeSrcCol == No) has been specified, then the (target-column, target-value) key-value pair, -- overwrites the corresponding (source-column, source-value) key-value pair runCM :: RColMapping -> RTable -> RTable runCM = RColMapping -> RTable -> RTable runColMapping -- | Apply an RColMapping to a source RTable and produce a new RTable. -- If a target-column has the same name with a source-column and a DontRemoveSrc (i.e., removeSrcCol == No) has been specified, then the (target-column, target-value) key-value pair, -- overwrites the corresponding (source-column, source-value) key-value pair. -- If a filter is embedded in the 'RColMapping', then the returned 'RTable' will include only the 'RTuple's that satisfy the filter predicate. runColMapping :: RColMapping -> RTable -> RTable runColMapping :: RColMapping -> RTable -> RTable runColMapping RColMapping ColMapEmpty RTable rtabS = RTable rtabS runColMapping RColMapping rmap RTable rtabS = if RTable -> Bool isRTabEmpty RTable rtabS then RTable emptyRTable else case RColMapping rmap of RMap1x1 {srcCol :: RColMapping -> ColumnName srcCol = ColumnName src, trgCol :: RColMapping -> ColumnName trgCol = ColumnName trg, removeSrcCol :: RColMapping -> YesNo removeSrcCol = YesNo rmvFlag, transform1x1 :: RColMapping -> RDataType -> RDataType transform1x1 = RDataType -> RDataType xform, srcRTupleFilter :: RColMapping -> RPredicate srcRTupleFilter = RPredicate pred} -> do -- an RTable is a Monad just like a list is a Monad, representing a non-deterministic value RTuple srcRtuple <- RPredicate -> RTable -> RTable f RPredicate pred RTable rtabS let -- 1. get original column value srcValue :: RDataType srcValue = ColumnName -> RTuple -> RDataType getRTupColValue ColumnName src RTuple srcRtuple -- srcValue = HM.lookupDefault Null -- return Null if value cannot be found based on column name -- src -- column name to look for (source) - i.e., the key in the HashMap -- srcRtuple -- source RTuple (i.e., a HashMap ColumnName RDataType) -- 2. apply transformation to retrieve new column value trgValue :: RDataType trgValue = RDataType -> RDataType xform RDataType srcValue -- 3. remove the original ColumnName, Value mapping from the RTuple rtupleTemp :: RTuple rtupleTemp = case YesNo rmvFlag of YesNo Yes -> ColumnName -> RTuple -> RTuple forall k v. (Eq k, Hashable k) => k -> HashMap k v -> HashMap k v HM.delete ColumnName src RTuple srcRtuple YesNo No -> RTuple srcRtuple -- 4. insert new (ColumnName, Value) pair and thus create the target RTuple trgRtuple :: RTuple trgRtuple = ColumnName -> RDataType -> RTuple -> RTuple forall k v. (Eq k, Hashable k) => k -> v -> HashMap k v -> HashMap k v HM.insert ColumnName trg RDataType trgValue RTuple rtupleTemp -- return new RTable RTuple -> RTable forall (m :: * -> *) a. Monad m => a -> m a return RTuple trgRtuple RMapNx1 {srcColGrp :: RColMapping -> [ColumnName] srcColGrp = [ColumnName] srcL, trgCol :: RColMapping -> ColumnName trgCol = ColumnName trg, removeSrcCol :: RColMapping -> YesNo removeSrcCol = YesNo rmvFlag, transformNx1 :: RColMapping -> [RDataType] -> RDataType transformNx1 = [RDataType] -> RDataType xform, srcRTupleFilter :: RColMapping -> RPredicate srcRTupleFilter = RPredicate pred} -> do -- an RTable is a Monad just like a list is a Monad, representing a non-deterministic value RTuple srcRtuple <- RPredicate -> RTable -> RTable f RPredicate pred RTable rtabS let -- 1. get original column value (in this case it is a list of values) srcValueL :: [RDataType] srcValueL = (ColumnName -> RDataType) -> [ColumnName] -> [RDataType] forall a b. (a -> b) -> [a] -> [b] Data.List.map ( \ColumnName src -> ColumnName -> RTuple -> RDataType getRTupColValue ColumnName src RTuple srcRtuple -- \src -> HM.lookupDefault Null -- return Null if value cannot be found based on column name -- src -- column name to look for (source) - i.e., the key in the HashMap -- srcRtuple -- source RTuple (i.e., a HashMap ColumnName RDataType) ) [ColumnName] srcL -- 2. apply transformation to retrieve new column value trgValue :: RDataType trgValue = [RDataType] -> RDataType xform [RDataType] srcValueL -- 3. remove the original (ColumnName, Value) mappings from the RTuple (i.e., remove ColumnNames mentioned in the RColMapping from source RTuple) rtupleTemp :: RTuple rtupleTemp = case YesNo rmvFlag of YesNo Yes -> (ColumnName -> RDataType -> Bool) -> RTuple -> RTuple forall k v. (k -> v -> Bool) -> HashMap k v -> HashMap k v HM.filterWithKey (\ColumnName colName RDataType _ -> ColumnName -> [ColumnName] -> Bool forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool Data.List.notElem ColumnName colName [ColumnName] srcL) RTuple srcRtuple YesNo No -> RTuple srcRtuple -- 4. insert new ColumnName, Value mapping as the target RTuple must be trgRtuple :: RTuple trgRtuple = ColumnName -> RDataType -> RTuple -> RTuple forall k v. (Eq k, Hashable k) => k -> v -> HashMap k v -> HashMap k v HM.insert ColumnName trg RDataType trgValue RTuple rtupleTemp -- return new RTable RTuple -> RTable forall (m :: * -> *) a. Monad m => a -> m a return RTuple trgRtuple RMap1xN {srcCol :: RColMapping -> ColumnName srcCol = ColumnName src, trgColGrp :: RColMapping -> [ColumnName] trgColGrp = [ColumnName] trgL, removeSrcCol :: RColMapping -> YesNo removeSrcCol = YesNo rmvFlag, transform1xN :: RColMapping -> RDataType -> [RDataType] transform1xN = RDataType -> [RDataType] xform, srcRTupleFilter :: RColMapping -> RPredicate srcRTupleFilter = RPredicate pred} -> do -- an RTable is a Monad just like a list is a Monad, representing a non-deterministic value RTuple srcRtuple <- RPredicate -> RTable -> RTable f RPredicate pred RTable rtabS let -- 1. get original column value srcValue :: RDataType srcValue = ColumnName -> RTuple -> RDataType getRTupColValue ColumnName src RTuple srcRtuple -- srcValue = HM.lookupDefault Null -- return Null if value cannot be found based on column name -- src -- column name to look for (source) - i.e., the key in the HashMap -- srcRtuple -- source RTuple (i.e., a HashMap ColumnName RDataType) -- 2. apply transformation to retrieve new column value list trgValueL :: [RDataType] trgValueL = RDataType -> [RDataType] xform RDataType srcValue -- 3. remove the original ColumnName, Value mapping from the RTuple rtupleTemp :: RTuple rtupleTemp = case YesNo rmvFlag of YesNo Yes -> ColumnName -> RTuple -> RTuple forall k v. (Eq k, Hashable k) => k -> HashMap k v -> HashMap k v HM.delete ColumnName src RTuple srcRtuple YesNo No -> RTuple srcRtuple -- 4. insert new (ColumnName, Value) pairs to the target RTuple tempL :: [(ColumnName, RDataType)] tempL = [ColumnName] -> [RDataType] -> [(ColumnName, RDataType)] forall a b. [a] -> [b] -> [(a, b)] Data.List.zip [ColumnName] trgL [RDataType] trgValueL trgRtuple :: RTuple trgRtuple = RTuple -> RTuple -> RTuple forall k v. (Eq k, Hashable k) => HashMap k v -> HashMap k v -> HashMap k v HM.union ([(ColumnName, RDataType)] -> RTuple forall k v. (Eq k, Hashable k) => [(k, v)] -> HashMap k v HM.fromList [(ColumnName, RDataType)] tempL) RTuple rtupleTemp -- implement as a hashmap union between new (columnName,value) pairs and source tuple -- return new RTable RTuple -> RTable forall (m :: * -> *) a. Monad m => a -> m a return RTuple trgRtuple RMapNxM {srcColGrp :: RColMapping -> [ColumnName] srcColGrp = [ColumnName] srcL, trgColGrp :: RColMapping -> [ColumnName] trgColGrp = [ColumnName] trgL, removeSrcCol :: RColMapping -> YesNo removeSrcCol = YesNo rmvFlag, transformNxM :: RColMapping -> [RDataType] -> [RDataType] transformNxM = [RDataType] -> [RDataType] xform, srcRTupleFilter :: RColMapping -> RPredicate srcRTupleFilter = RPredicate pred} -> do -- an RTable is a Monad just like a list is a Monad, representing a non-deterministic value RTuple srcRtuple <- RPredicate -> RTable -> RTable f RPredicate pred RTable rtabS let -- 1. get original column value (in this case it is a list of values) srcValueL :: [RDataType] srcValueL = (ColumnName -> RDataType) -> [ColumnName] -> [RDataType] forall a b. (a -> b) -> [a] -> [b] Data.List.map ( \ColumnName src -> ColumnName -> RTuple -> RDataType getRTupColValue ColumnName src RTuple srcRtuple -- \src -> HM.lookupDefault Null -- return Null if value cannot be found based on column name -- src -- column name to look for (source) - i.e., the key in the HashMap -- srcRtuple -- source RTuple (i.e., a HashMap ColumnName RDataType) ) [ColumnName] srcL -- 2. apply transformation to retrieve new column value trgValueL :: [RDataType] trgValueL = [RDataType] -> [RDataType] xform [RDataType] srcValueL -- 3. remove the original ColumnName, Value mapping from the RTuple rtupleTemp :: RTuple rtupleTemp = case YesNo rmvFlag of YesNo Yes -> (ColumnName -> RDataType -> Bool) -> RTuple -> RTuple forall k v. (k -> v -> Bool) -> HashMap k v -> HashMap k v HM.filterWithKey (\ColumnName colName RDataType _ -> ColumnName -> [ColumnName] -> Bool forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool Data.List.notElem ColumnName colName [ColumnName] srcL) RTuple srcRtuple YesNo No -> RTuple srcRtuple -- 4. insert new (ColumnName, Value) pairs to the target RTuple tempL :: [(ColumnName, RDataType)] tempL = [ColumnName] -> [RDataType] -> [(ColumnName, RDataType)] forall a b. [a] -> [b] -> [(a, b)] Data.List.zip [ColumnName] trgL [RDataType] trgValueL trgRtuple :: RTuple trgRtuple = RTuple -> RTuple -> RTuple forall k v. (Eq k, Hashable k) => HashMap k v -> HashMap k v -> HashMap k v HM.union ([(ColumnName, RDataType)] -> RTuple forall k v. (Eq k, Hashable k) => [(k, v)] -> HashMap k v HM.fromList [(ColumnName, RDataType)] tempL) RTuple rtupleTemp -- implement as a hashmap union between new (columnName,value) pairs and source tuple -- return new RTable RTuple -> RTable forall (m :: * -> *) a. Monad m => a -> m a return RTuple trgRtuple -- | An ETL operation applied to an RTable can be either an 'ROperation' (a relational agebra operation like join, filter etc.) defined in "RTable.Core" module, -- or an 'RColMapping' applied to an 'RTable' data ETLOperation = ETLrOp { ETLOperation -> ROperation rop :: ROperation } | ETLcOp { ETLOperation -> RColMapping cmap :: RColMapping } -- | executes a Unary ETL Operation etlOpU :: ETLOperation -> RTable -> RTable etlOpU = ETLOperation -> RTable -> RTable runUnaryETLOperation -- | executes an ETL Operation runUnaryETLOperation :: ETLOperation -> RTable -- ^ input RTable -> RTable -- ^ output RTable runUnaryETLOperation :: ETLOperation -> RTable -> RTable runUnaryETLOperation ETLOperation op RTable inpRtab = case ETLOperation op of ETLrOp { rop :: ETLOperation -> ROperation rop = ROperation relOp } -> ROperation -> RTable -> RTable ropU ROperation relOp RTable inpRtab ETLcOp { cmap :: ETLOperation -> RColMapping cmap = RColMapping colMap } -> RColMapping -> RTable -> RTable runCM RColMapping colMap RTable inpRtab -- | executes a Binary ETL Operation etlOpB :: ETLOperation -> RTable -> RTable -> RTable etlOpB = ETLOperation -> RTable -> RTable -> RTable runBinaryETLOperation -- | executes an ETL Operation runBinaryETLOperation :: ETLOperation -> RTable -- ^ input RTable1 -> RTable -- ^ input RTable2 -> RTable -- ^ output RTable runBinaryETLOperation :: ETLOperation -> RTable -> RTable -> RTable runBinaryETLOperation ETLrOp {rop :: ETLOperation -> ROperation rop = ROperation relOp} RTable inpT1 RTable inpT2 = ROperation -> RTable -> RTable -> RTable ropB ROperation relOp RTable inpT1 RTable inpT2 -- | ETLmapping : it is the equivalent of a mapping in an ETL tool and consists of a series of ETLOperations that are applied, one-by-one, -- to some initial input RTable, but if binary ETLOperations are included in the ETLMapping, then there will be more than one input RTables that -- the ETLOperations of the ETLMapping will be applied to. When we apply (i.e., run) an ETLOperation of the ETLMapping we get a new RTable, -- which is then inputed to the next ETLOperation, until we finally run all ETLOperations. The purpose of the execution of an ETLMapping is -- to produce a single new RTable as the result of the execution of all the ETLOperations of the ETLMapping. -- In terms of database operations an ETLMapping is the equivalent of an CREATE AS SELECT (CTAS) operation in an RDBMS. This means that -- anything that can be done in the SELECT part (i.e., column projection, row filtering, grouping and join operations, etc.) -- in order to produce a new table, can be included in an ETLMapping. -- -- An ETLMapping is executed with the etl (runETLmapping) operator -- -- Implementation: -- An ETLMapping is implemented as a binary tree where the node represents the ETLOperation to be executed and the left branch is another -- ETLMapping, while the right branch is an RTable (that might be empty in the case of a Unary ETLOperation). -- Execution proceeds from bottom-left to top-right. -- This is similar in concept to a left-deep join tree. In a Left-Deep ETLOperation tree the "pipe" of ETLOperations comes from -- the left branches always. -- The leaf node is always an ETLMapping with an ETLMapEmpty in the left branch and an RTable in the right branch (the initial RTable inputed -- to the ETLMapping). -- In this way, the result of the execution of each ETLOperation (which is an RTable) is passed on to the next ETLOperation. Here is an example: -- -- @ -- A Left-Deep ETLOperation Tree -- -- final RTable result -- / -- etlOp3 -- / \ -- etlOp2 rtab2 -- / \ -- A leaf-node --> etlOp1 emptyRTab -- / \ -- ETLMapEmpty rtab1 -- -- @ -- -- You see that always on the left branch we have an ETLMapping data type (i.e., a left-deep ETLOperation tree). -- So how do we implement the following case? -- -- @ -- -- final RTable result -- / -- A leaf-node --> etlOp1 -- / \ -- rtab1 rtab2 -- -- @ -- -- The answer is that we "model" the left RTable (rtab1 in our example) as an ETLMapping of the form: -- -- @ -- ETLMapLD { etlOp = ETLcOp{cmap = ColMapEmpty}, tabL = ETLMapEmpty, tabR = rtab1 } -- @ -- -- So we embed the rtab1 in a ETLMapping, which is a leaf (i.e., it has an empty prevMap), the rtab1 is in -- the right branch (tabR) and the ETLOperation is the EmptyColMapping, which returns its input RTable when executed. -- We can use function 'rtabToETLMapping' for this job. So it becomes -- @ -- A leaf-node --> etlOp1 -- / \ -- rtabToETLMapping rtab1 rtab2 -- @ -- -- In this manner, a leaf-node can also be implemented like this: -- -- @ -- final RTable result -- / -- etlOp3 -- / \ -- etlOp2 rtab2 -- / \ -- A leaf-node --> etlOp1 emptyRTab -- / \ -- rtabToETLMapping rtab1 emptyRTable -- @ -- data ETLMapping = ETLMapEmpty -- ^ an empty node | ETLMapLD { ETLMapping -> ETLOperation etlOp :: ETLOperation -- ^ the ETLOperation to be executed ,ETLMapping -> ETLMapping tabL :: ETLMapping -- ^ the left-branch corresponding to the previous ETLOperation, which is input to this one. -- ,ETLMapping -> RTable tabR :: RTable -- ^ the right branch corresponds to another RTable (for binary ETL operations). -- If this is a Unary ETLOperation then this field must be an empty RTable. } -- ^ a Left-Deep node | ETLMapRD { etlOp :: ETLOperation -- ^ the ETLOperation to be executed ,ETLMapping -> RTable tabLrd :: RTable -- ^ the left-branch corresponds to another RTable (for binary ETL operations). -- If this is a Unary ETLOperation then this field must be an empty RTable. ,ETLMapping -> ETLMapping tabRrd :: ETLMapping -- ^ the right branch corresponding to the previous ETLOperation, which is input to this one. } -- ^ a Right-Deep node | ETLMapBal { etlOp :: ETLOperation -- ^ the ETLOperation to be executed ,ETLMapping -> ETLMapping tabLbal :: ETLMapping -- ^ the left-branch corresponding to the previous ETLOperation, which is input to this one. -- If this is a Unary ETLOperation then this field might be an empty ETLMapping. ,ETLMapping -> ETLMapping tabRbal :: ETLMapping -- ^ the right branch corresponding corresponding to the previous ETLOperation, which is input to this one. -- If this is a Unary ETLOperation then this field might be an empty ETLMapping. } -- ^ a Balanced node instance Eq ETLMapping where ETLMapping etlMap1 == :: ETLMapping -> ETLMapping -> Bool == ETLMapping etlMap2 = (ETLMapping -> RTable etl ETLMapping etlMap1) RTable -> RTable -> Bool forall a. Eq a => a -> a -> Bool == (ETLMapping -> RTable etl ETLMapping etlMap2) -- two ETLMappings are equal if the RTables resulting from their execution are equal -- | Creates a left-deep leaf ETL Mapping, of the following form: -- -- @ -- A Left-Deep ETLOperation Tree -- -- final RTable result -- / -- etlOp3 -- / \ -- etlOp2 rtab2 -- / \ -- A leaf-node --> etlOp1 emptyRTab -- / \ -- ETLMapEmpty rtab1 -- -- @ -- createLeafETLMapLD :: ETLOperation -- ^ ETL operation of this ETL mapping -> RTable -- ^ input RTable -> ETLMapping -- ^ output ETLMapping createLeafETLMapLD :: ETLOperation -> RTable -> ETLMapping createLeafETLMapLD ETLOperation etlop RTable rt = ETLMapLD :: ETLOperation -> ETLMapping -> RTable -> ETLMapping ETLMapLD { etlOp :: ETLOperation etlOp = ETLOperation etlop, tabL :: ETLMapping tabL = ETLMapping ETLMapEmpty, tabR :: RTable tabR = RTable rt} -- | creates a Binary operation leaf node of the form: -- -- @ -- -- A leaf-node --> etlOp1 -- / \ -- rtabToETLMapping rtab1 rtab2 -- @ -- createLeafBinETLMapLD :: ETLOperation -- ^ ETL operation of this ETL mapping -> RTable -- ^ input RTable1 -> RTable -- ^ input RTable2 -> ETLMapping -- ^ output ETLMapping createLeafBinETLMapLD :: ETLOperation -> RTable -> RTable -> ETLMapping createLeafBinETLMapLD ETLOperation etlop RTable rt1 RTable rt2 = ETLMapLD :: ETLOperation -> ETLMapping -> RTable -> ETLMapping ETLMapLD { etlOp :: ETLOperation etlOp = ETLOperation etlop, tabL :: ETLMapping tabL = RTable -> ETLMapping rtabToETLMapping RTable rt1, tabR :: RTable tabR = RTable rt2} -- | Connects an ETL Mapping to a left-deep ETL Mapping tree, of the form -- -- @ -- A Left-Deep ETLOperation Tree -- -- final RTable result -- / -- etlOp3 -- / \ -- etlOp2 rtab2 -- / \ -- A leaf-node --> etlOp1 emptyRTab -- / \ -- ETLMapEmpty rtab1 -- -- @ -- -- Example: -- -- @ -- -- connect a Unary ETL mapping (etlOp2) -- -- etlOp2 -- / \ -- etlOp1 emptyRTab -- -- => connectETLMapLD etlOp2 emptyRTable prevMap -- -- -- connect a Binary ETL Mapping (etlOp3) -- -- etlOp3 -- / \ -- etlOp2 rtab2 -- -- => connectETLMapLD etlOp3 rtab2 prevMap -- @ -- -- Note that the right branch (RTable) appears first in the list of input arguments of this function and -- the left branch (ETLMapping) appears second. This is strange, and one could thought that it is a mistake -- (i.e., the left branch should appear first and the right branch second) since we are reading from left to right. -- However this was a deliberate choice, so that we leave the left branch (which is the connection point with the -- previous ETLMapping) as the last argument, and thus we can partially apply the argumenets and get a new function -- with input parameter only the previous mapping. This is very helpfull in function composition -- connectETLMapLD :: ETLOperation -- ^ ETL operation of this ETL Mapping -> RTable -- ^ Right RTable (right branch) (if this is a Unary ETL mapping this should be an emptyRTable) -> ETLMapping -- ^ Previous ETL mapping (left branch) -> ETLMapping -- ^ New ETL Mapping, which has added at the end the new node connectETLMapLD :: ETLOperation -> RTable -> ETLMapping -> ETLMapping connectETLMapLD ETLOperation etlop RTable rt ETLMapping prevMap = ETLMapLD :: ETLOperation -> ETLMapping -> RTable -> ETLMapping ETLMapLD { etlOp :: ETLOperation etlOp = ETLOperation etlop, tabL :: ETLMapping tabL = ETLMapping prevMap, tabR :: RTable tabR = RTable rt} -- | This operator executes an 'ETLMapping' etl :: ETLMapping -> RTable etl = ETLMapping -> RTable runETLmapping -- | Executes an 'ETLMapping' runETLmapping :: ETLMapping -- ^ input ETLMapping -> RTable -- ^ output RTable -- empty ETL mapping runETLmapping :: ETLMapping -> RTable runETLmapping ETLMapping ETLMapEmpty = RTable emptyRTable -- ETL mapping with an empty ETLOperation, which is just modelling an RTable runETLmapping ETLMapLD { etlOp :: ETLMapping -> ETLOperation etlOp = ETLcOp{cmap :: ETLOperation -> RColMapping cmap = RColMapping ColMapEmpty}, tabL :: ETLMapping -> ETLMapping tabL = ETLMapping ETLMapEmpty, tabR :: ETLMapping -> RTable tabR = RTable rtab } = RTable rtab -- leaf node --> unary ETLOperation on RTable runETLmapping ETLMapLD { etlOp :: ETLMapping -> ETLOperation etlOp = ETLOperation runMe, tabL :: ETLMapping -> ETLMapping tabL = ETLMapping ETLMapEmpty, tabR :: ETLMapping -> RTable tabR = RTable rtab } = ETLOperation -> RTable -> RTable etlOpU ETLOperation runMe RTable rtab {-- if (isRTabEmpty rtab) then emptyRTable else etlOpU runMe rtab --} runETLmapping ETLMapLD { etlOp :: ETLMapping -> ETLOperation etlOp = ETLOperation runMe, tabL :: ETLMapping -> ETLMapping tabL = ETLMapping prevmap, tabR :: ETLMapping -> RTable tabR = RTable rtab } = if (RTable -> Bool isRTabEmpty RTable rtab) then let prevRtab :: RTable prevRtab = ETLMapping -> RTable runETLmapping ETLMapping prevmap -- execute previous ETLMapping to get the resulting RTable in ETLOperation -> RTable -> RTable etlOpU ETLOperation runMe RTable prevRtab else let prevRtab :: RTable prevRtab = ETLMapping -> RTable runETLmapping ETLMapping prevmap -- execute previous ETLMapping to get the resulting RTable in ETLOperation -> RTable -> RTable -> RTable etlOpB ETLOperation runMe RTable prevRtab RTable rtab -- | This operator executes an 'ETLMapping' and returns the 'RTabResult' 'Writer' Monad -- that embedds apart from the resulting RTable, also the number of 'RTuple's returned etlRes :: ETLMapping -- ^ input ETLMapping -> RTabResult -- ^ output RTabResult etlRes :: ETLMapping -> RTabResult etlRes ETLMapping etlm = let resultRtab :: RTable resultRtab = ETLMapping -> RTable etl ETLMapping etlm returnedRtups :: RTuplesRet returnedRtups = Int -> RTuplesRet rtuplesRet (Int -> RTuplesRet) -> Int -> RTuplesRet forall a b. (a -> b) -> a -> b $ RTable -> Int forall a. Vector a -> Int V.length RTable resultRtab in (RTable, RTuplesRet) -> RTabResult rtabResult (RTable resultRtab, RTuplesRet returnedRtups) -- | Model an 'RTable' as an 'ETLMapping' which when executed will return the input 'RTable' rtabToETLMapping :: RTable -> ETLMapping rtabToETLMapping :: RTable -> ETLMapping rtabToETLMapping RTable rt = if (RTable -> Bool isRTabEmpty RTable rt) then ETLMapping ETLMapEmpty else ETLMapLD :: ETLOperation -> ETLMapping -> RTable -> ETLMapping ETLMapLD { etlOp :: ETLOperation etlOp = ETLcOp :: RColMapping -> ETLOperation ETLcOp {cmap :: RColMapping cmap = RColMapping ColMapEmpty}, tabL :: ETLMapping tabL = ETLMapping ETLMapEmpty, tabR :: RTable tabR = RTable rt } -- -- An ETLMapping is implemented as a series of ETLOperations conected with the :=> operator which is right associative -- i.e., ETLOp3 :=> ETLOp2 :=> ETLOp1 RTable is (ETLOp3 :=> (ETLOp2 :=> (ETLOp1 RTable)) -- infixr 9 :=> -- data ETLMapping = EmptyETLop RTable | ETLMapping :=> ETLOperation {-- -- | Executes an ETL mapping -- Note htat the source RTables are "embedded" in the data constructors of the ETLMapping data type. runETLmapping :: ETLMapping -- ^ input ETLMapping -> RTable -- output RTable runETLmapping EmptyETLop rtab = rtab runETLmapping etlMap :=> etlOp = runETLmapping etlMap case etlOp of ETLrOp {rop = relOp} -> runROperation relOp --} -- Example of an ETLMapping((<T> TabColTransformation).(<F> RPredicate).(<T> TabTransformation) rtable1) `(<EJ> RJoinPredicate)` rtable2 -- ################################################## -- * Various useful RDataType Transformations -- * and pre-cooked Column Mappings -- ################################################## -- | Returns an ETL Operation that adds a surrogate key column to an RTable -- The first argument is the initial value of the surrogate key. If Nothing is given, then -- the initial value will be 0. -- addSurrogateKey_old :: Integral a => -- Maybe a -- ^ The initial value of the Surrogate Key will be the value of this parameter + 1 -- -> a -- ^ Number of rows that the Surrogate Key will be assigned -- -> ColumnName -- ^ The name of the surrogate key column -- -> ETLOperation -- ^ Output ETL operation which encapsulates the add surrogate key column mapping -- addSurrogateKey_old init 0 cname = -- let initVal = case init of -- Just x -> x -- Nothing -> 0 -- cmap = RMap1x1 { -- srcCol = "", removeSrcCol = No -- the source column can be any column in this mapping, even "" -- ,trgCol = cname -- ,transform1x1 = \_ -> RInt (fromIntegral initVal) -- ,srcRTupleFilter = \_ -> True -- } -- in ETLcOp cmap -- addSurrogateKey_old init numRows cname = -- let initVal = case init of -- Just x -> x -- Nothing -> 0 -- cmap = RMap1x1 { -- srcCol = "", removeSrcCol = No -- the source column can be any column in this mapping, even "" -- ,trgCol = cname -- ,transform1x1 = \_ -> RInt (fromIntegral initVal + 1) -- ,srcRTupleFilter = \_ -> True -- } -- in addSurrogateKey_old (Just (initVal + 1)) (numRows - 1) cname