{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE OverloadedStrings #-} {-| The structures of data types in Karps. For a detailed description of the supported types, see http://spark.apache.org/docs/latest/sql-programming-guide.html#data-types At a high-level, Spark DataFrames and Datasets are equivalent to lists of objects whose type can be mapped to the same StructType: Dataset a ~ ArrayType StructType (...) Columns of a dataset are equivalent to lists of object whose type can be mapped to the same DataType (either Strict or Nullable) Local data (or "blobs") are single elements whose type can be mapped to a DataType (either strict or nullable) -} module Spark.Core.Internal.TypesStructures where import Data.Aeson import Data.Vector(Vector) import Control.Monad(guard) import qualified Data.Vector as V import qualified Data.Aeson as A import qualified Data.Text as T import GHC.Generics(Generic) import Test.QuickCheck import Spark.Core.StructuresInternal(FieldName(..)) import Spark.Core.Internal.Utilities -- The core type algebra -- | The data types that are guaranteed to not be null: evaluating them will return a value. data StrictDataType = IntType | DoubleType | StringType | BoolType | Struct !StructType | ArrayType !DataType deriving (Eq) -- | All the data types supported by the Spark engine. -- The data types can either be nullable (they may contain null values) or strict (all the values are present). -- There are a couple of differences with the algebraic data types in Haskell: -- Maybe (Maybe a) ~ Maybe a which implies that arbitrary nesting of values will be flattened to a top-level Nullable -- Similarly, [[]] ~ [] data DataType = StrictType !StrictDataType | NullableType !StrictDataType deriving (Eq) -- | A field in a structure data StructField = StructField { structFieldName :: !FieldName, structFieldType :: !DataType } deriving (Eq) -- | The main structure of a dataframe or a dataset data StructType = StructType { structFields :: !(Vector StructField) } deriving (Eq) -- Convenience types -- | Represents the choice between a strict and a nullable field data Nullable = CanNull | NoNull deriving (Show, Eq) -- | Encodes the type of all the nullable data types data NullableDataType = NullableDataType !StrictDataType deriving (Eq) -- | A tagged datatype that encodes the sql types -- This is the main type information that should be used by users. data SQLType a = SQLType { -- | The underlying data type. unSQLType :: !DataType } deriving (Eq, Generic) instance Show DataType where show (StrictType x) = show x show (NullableType x) = show x ++ "?" instance Show StrictDataType where show StringType = "string" show DoubleType = "double" show IntType = "int" show BoolType = "bool" show (Struct struct) = show struct show (ArrayType at) = "[" ++ show at ++ "]" instance Show StructField where show field = (T.unpack . unFieldName . structFieldName) field ++ ":" ++ s where s = show $ structFieldType field instance Show StructType where show struct = "{" ++ unwords (map show (V.toList . structFields $ struct)) ++ "}" instance Show (SQLType a) where show (SQLType dt) = show dt -- QUICKCHECK INSTANCES instance Arbitrary StructField where arbitrary = do name <- elements ["_1", "a", "b", "abc"] dt <- arbitrary :: Gen DataType return $ StructField (FieldName $ T.pack name) dt instance Arbitrary StructType where arbitrary = do fields <- listOf arbitrary return . StructType . V.fromList $ fields instance Arbitrary StrictDataType where arbitrary = do idx <- elements [1,2] :: Gen Int return $ case idx of 1 -> StringType 2 -> IntType _ -> failure "Arbitrary StrictDataType" instance Arbitrary DataType where arbitrary = do x <- arbitrary u <- arbitrary return $ if x then StrictType u else NullableType u -- AESON INSTANCES -- This follows the same structure as the JSON generated by Spark. instance ToJSON StrictDataType where toJSON IntType = "integer" toJSON DoubleType = "double" toJSON StringType = "string" toJSON BoolType = "bool" toJSON (Struct struct) = toJSON struct toJSON (ArrayType (StrictType dt)) = object [ "type" .= A.String "array" , "elementType" .= toJSON dt , "containsNull" .= A.Bool False ] toJSON (ArrayType (NullableType dt)) = object [ "type" .= A.String "array" , "elementType" .= toJSON dt , "containsNull" .= A.Bool True ] instance ToJSON StructType where toJSON (StructType fields) = let fs = (snd . _fieldToJson) <$> V.toList fields in object [ "type" .= A.String "struct" , "fields" .= fs ] -- Spark drops the info at the highest level. instance ToJSON DataType where toJSON (StrictType dt) = object [ "nullable" .= A.Bool False, "dt" .= toJSON dt] toJSON (NullableType dt) = object [ "nullable" .= A.Bool True, "dt" .= toJSON dt] instance FromJSON DataType where parseJSON = withObject "DataType" $ \o -> do nullable <- o .: "nullable" dt <- o .: "dt" let c = if nullable then NullableType else StrictType return (c dt) instance FromJSON StructField where parseJSON = withObject "StructField" $ \o -> do n <- o .: "name" dt <- o .: "type" nullable <- o .: "nullable" let c = if nullable then NullableType else StrictType return $ StructField (FieldName n) (c dt) instance FromJSON StructType where parseJSON = withObject "StructType" $ \o -> do tp <- o .: "type" guard (tp == T.pack "struct") fs <- o .: "fields" return (StructType fs) instance FromJSON StrictDataType where parseJSON (A.String s) = case s of "integer" -> return IntType "double" -> return DoubleType "string" -> return StringType "bool" -> return BoolType -- TODO: figure out which one is correct "boolean" -> return BoolType _ -> fail ("StrictDataType: unknown type " ++ T.unpack s) parseJSON (Object o) = do tp <- o .: "type" case T.pack tp of "struct" -> Struct <$> parseJSON (Object o) "array" -> do dt <- o .: "elementType" containsNull <- o .: "containsNull" let c = if containsNull then NullableType else StrictType return $ ArrayType (c dt) s -> fail ("StrictDataType: unknown type " ++ T.unpack s) parseJSON x = fail ("StrictDataType: cannot parse " ++ show x) _fieldToJson :: StructField -> (T.Text, A.Value) _fieldToJson (StructField (FieldName n) (StrictType dt)) = (n, object [ "name" .= A.String n , "type" .= toJSON dt , "nullable" .= A.Bool False]) _fieldToJson (StructField (FieldName n) (NullableType dt)) = (n, object [ "name" .= A.String n , "type" .= toJSON dt , "nullable" .= A.Bool True])