{-# LANGUAGE OverloadedStrings #-}

module Spark.Core.Internal.TypesFunctions
  ( isNullable
  , iInnerStrictType
  , columnType
  , unsafeCastType
  , intType
  , arrayType
  , compatibleTypes
  , arrayType'
  , frameTypeFromCol
  , colTypeFromFrame
  , canNull
  , structField
  , structType
  , structTypeFromFields
  , tupleType
  , structName
  , iSingleField
  -- , cellType
  ) where

import qualified Data.Text as T
import Data.List(sort, nub)
import qualified Data.Vector as V
import Data.Text(Text, intercalate)
import Formatting

import Spark.Core.Internal.TypesStructures
import Spark.Core.StructuresInternal
import Spark.Core.Internal.Utilities
import Spark.Core.Try

-- Re-tags a SQL type: the wrapped data type is carried over unchanged and
-- only the type-level tag goes from a to b.
-- No check is performed here that b is compatible with the data type
-- embedded in a; the caller is responsible for that.
-- TODO add more error checking here.
unsafeCastType :: SQLType a -> SQLType b
unsafeCastType (SQLType inner) = SQLType inner

-- Given a sql type tag, returns the equivalent data type for a column or a
-- blob.
-- (internal)
columnType :: SQLType a -> DataType
columnType tagged = case tagged of
  SQLType inner -> inner

-- Tells whether a data type is wrapped as nullable (True) or strict (False).
-- (internal)
isNullable :: DataType -> Bool
isNullable dt = case dt of
  NullableType _ -> True
  StrictType _ -> False

-- *** Creation of data types ***

-- Takes a data type (assumed to be that of a column or cell) and returns the
-- corresponding dataset type.
-- A strict struct is returned as is; any other type is wrapped into a
-- structure with a single field called "value".
-- This should only be used when talking to Spark.
-- All visible operations in Krapsh use Cell types instead.
-- TODO should it use value or _1? Both seem to be used in Spark.
frameTypeFromCol :: DataType -> StructType
frameTypeFromCol colType = case colType of
  StrictType (Struct fields) -> fields
  other -> _structFromUnfields [("value", other)]

-- Given the structural type for a dataframe or a dataset, returns the
-- equivalent column type.
colTypeFromFrame :: StructType -> DataType
-- The as-pattern is written without whitespace around '@': since GHC 9.0 the
-- spaced form `st @ (...)` is no longer accepted as an as-pattern.
colTypeFromFrame st@(StructType fs) = case V.toList fs of
  -- A structure made of one single strict field called "value" is unwrapped
  -- into the type of that field.
  [StructField { structFieldName = fname, structFieldType = StrictType dt }]
    | fname == "value" -> StrictType dt
  -- Everything else (including a single "value" field that is nullable) is
  -- returned as a strict structure.
  _ -> StrictType (Struct st)

-- Checks if two data types are compatible.
-- Both types must have the same nullability (both strict or both nullable)
-- and their underlying strict types must be compatible. A strict type is
-- never compatible with a nullable type.
compatibleTypes :: DataType -> DataType -> Bool
compatibleTypes (StrictType sdt) (StrictType sdt') =
  _compatibleTypesStrict sdt sdt'
compatibleTypes (NullableType sdt) (NullableType sdt') =
  _compatibleTypesStrict sdt sdt'
compatibleTypes _ _ = False

-- Compatibility check on the strict types:
--  - ints are compatible with ints, strings with strings
--  - arrays are compatible when their element types are compatible
--  - structures are compatible when they have the same number of fields and
--    the field types are compatible position by position (the field names
--    are not compared)
-- Any other combination is reported as incompatible.
_compatibleTypesStrict :: StrictDataType -> StrictDataType -> Bool
_compatibleTypesStrict IntType IntType = True
_compatibleTypesStrict StringType StringType = True
_compatibleTypesStrict (ArrayType et) (ArrayType et') = compatibleTypes et et'
_compatibleTypesStrict (Struct (StructType v)) (Struct (StructType v')) =
  -- The length check comes first because zipWith silently truncates to the
  -- shortest vector.
  V.length v == V.length v' &&
    V.and (V.zipWith compatibleTypes (structFieldType <$> v) (structFieldType <$> v'))
_compatibleTypesStrict _ _ = False

-- The type of a pair: a strict structure with two fields called "_1" and
-- "_2", holding the types of the first and second element respectively.
tupleType :: SQLType a -> SQLType b -> SQLType (a, b)
tupleType (SQLType dt1) (SQLType dt2) =
  SQLType $ structType [structField "_1" dt1, structField "_2" dt2]

-- The strict int type
intType :: DataType
intType = StrictType IntType

-- Builds a field of a structure from the name of the field (given as text)
-- and the data type of the field.
structField :: T.Text -> DataType -> StructField
structField txt = StructField (FieldName txt)

-- The strict structure type
-- The fields are kept in the order of the list.
structType :: [StructField] -> DataType
structType = StrictType . Struct . StructType . V.fromList

-- The strict array type
-- The argument is the data type of the elements.
arrayType' :: DataType -> DataType
arrayType' = StrictType . ArrayType

-- Returns the equivalent data type that may be nulled.
-- The input may be strict or already nullable: in both cases, the inner
-- strict type is rewrapped as nullable.
canNull :: DataType -> DataType
canNull = NullableType . iInnerStrictType

-- Given a type, returns the corresponding array type.
-- This is preferred to using directly buildType, as it may encounter some
-- overlapping instances.
arrayType :: SQLType a -> SQLType [a]
arrayType (SQLType elemType) = SQLType (StrictType (ArrayType elemType))

-- Drops the strict/nullable wrapper and returns the strict type inside.
iInnerStrictType :: DataType -> StrictDataType
iInnerStrictType dt = case dt of
  StrictType inner -> inner
  NullableType inner -> inner

-- If the type is a strict structure with exactly one field, returns the type
-- of that field (whatever the name of the field is).
-- Returns Nothing in all the other cases.
iSingleField :: DataType -> Maybe DataType
iSingleField (StrictType (Struct (StructType fields)))
  | [StructField _ fieldType] <- V.toList fields = Just fieldType
iSingleField _ = Nothing

-- A textual name for a structure: the names of the fields, in order and
-- separated by commas, surrounded by "struct(" and ")".
structName :: StructType -> Text
structName (StructType fields) = "struct(" <> joined <> ")"
  where
    fieldNames = unFieldName . structFieldName <$> V.toList fields
    joined = intercalate "," fieldNames

-- Builds a structure from a list of (field name, field type) pairs, keeping
-- the order of the list.
-- This fails if the list is empty, or if the same field name is used more
-- than once.
structTypeFromFields :: [(FieldName, DataType)] -> Try StructType
structTypeFromFields [] = tryError "You cannot build an empty structure"
structTypeFromFields fs
  -- Removing the duplicates must not change the number of names.
  | length names == length (nub names) = return struct
  | otherwise =
      tryError $
        sformat ("Duplicate field names when building the struct: "%sh) (sort names)
  where
    names = fst <$> fs
    struct = StructType (V.map (uncurry StructField) (V.fromList fs))

-- Builds a structure from (name, type) pairs with the names given as raw
-- text. No validation is performed on the names or on the number of fields.
_structFromUnfields :: [(T.Text, DataType)] -> StructType
_structFromUnfields pairs =
  StructType (V.fromList [StructField (FieldName n) t | (n, t) <- pairs])