// Start of backends/cuda.h. // Forward declarations. // Invoked by setup_opencl() after the platform and device has been // found, but before the program is loaded. Its intended use is to // tune constants based on the selected platform and device. static void set_tuning_params(struct futhark_context* ctx); static char* get_failure_msg(int failure_idx, int64_t args[]); #define CUDA_SUCCEED_FATAL(x) cuda_api_succeed_fatal(x, #x, __FILE__, __LINE__) #define CUDA_SUCCEED_NONFATAL(x) cuda_api_succeed_nonfatal(x, #x, __FILE__, __LINE__) #define NVRTC_SUCCEED_FATAL(x) nvrtc_api_succeed_fatal(x, #x, __FILE__, __LINE__) #define NVRTC_SUCCEED_NONFATAL(x) nvrtc_api_succeed_nonfatal(x, #x, __FILE__, __LINE__) // Take care not to override an existing error. #define CUDA_SUCCEED_OR_RETURN(e) { \ char *serror = CUDA_SUCCEED_NONFATAL(e); \ if (serror) { \ if (!ctx->error) { \ ctx->error = serror; \ return bad; \ } else { \ free(serror); \ } \ } \ } // CUDA_SUCCEED_OR_RETURN returns the value of the variable 'bad' in // scope. By default, it will be this one. Create a local variable // of some other type if needed. This is a bit of a hack, but it // saves effort in the code generator. static const int bad = 1; static inline void cuda_api_succeed_fatal(CUresult res, const char *call, const char *file, int line) { if (res != CUDA_SUCCESS) { const char *err_str; cuGetErrorString(res, &err_str); if (err_str == NULL) { err_str = "Unknown"; } futhark_panic(-1, "%s:%d: CUDA call\n %s\nfailed with error code %d (%s)\n", file, line, call, res, err_str); } } static char* cuda_api_succeed_nonfatal(CUresult res, const char *call, const char *file, int line) { if (res != CUDA_SUCCESS) { const char *err_str; cuGetErrorString(res, &err_str); if (err_str == NULL) { err_str = "Unknown"; } return msgprintf("%s:%d: CUDA call\n %s\nfailed with error code %d (%s)\n", file, line, call, res, err_str); } else { return NULL; } } static inline void nvrtc_api_succeed_fatal(nvrtcResult res, const char *call, const char *file, int line) { if (res != NVRTC_SUCCESS) { const char *err_str = nvrtcGetErrorString(res); futhark_panic(-1, "%s:%d: NVRTC call\n %s\nfailed with error code %d (%s)\n", file, line, call, res, err_str); } } static char* nvrtc_api_succeed_nonfatal(nvrtcResult res, const char *call, const char *file, int line) { if (res != NVRTC_SUCCESS) { const char *err_str = nvrtcGetErrorString(res); return msgprintf("%s:%d: NVRTC call\n %s\nfailed with error code %d (%s)\n", file, line, call, res, err_str); } else { return NULL; } } struct futhark_context_config { int in_use; int debugging; int profiling; int logging; char* cache_fname; int num_tuning_params; int64_t *tuning_params; const char** tuning_param_names; const char** tuning_param_vars; const char** tuning_param_classes; // Uniform fields above. char* program; int num_nvrtc_opts; char* *nvrtc_opts; char* preferred_device; int preferred_device_num; int unified_memory; char* dump_ptx_to; char* load_ptx_from; size_t default_block_size; size_t default_grid_size; size_t default_tile_size; size_t default_reg_tile_size; size_t default_threshold; int default_block_size_changed; int default_grid_size_changed; int default_tile_size_changed; }; static void backend_context_config_setup(struct futhark_context_config *cfg) { cfg->num_nvrtc_opts = 0; cfg->nvrtc_opts = (char**) malloc(sizeof(char*)); cfg->nvrtc_opts[0] = NULL; cfg->program = strconcat(gpu_program); cfg->preferred_device_num = 0; cfg->preferred_device = strdup(""); cfg->dump_ptx_to = NULL; cfg->load_ptx_from = NULL; cfg->unified_memory = 2; cfg->default_block_size = 256; cfg->default_grid_size = 0; // Set properly later. cfg->default_tile_size = 32; cfg->default_reg_tile_size = 2; cfg->default_threshold = 32*1024; cfg->default_block_size_changed = 0; cfg->default_grid_size_changed = 0; cfg->default_tile_size_changed = 0; } static void backend_context_config_teardown(struct futhark_context_config* cfg) { for (int i = 0; i < cfg->num_nvrtc_opts; i++) { free(cfg->nvrtc_opts[i]); } free(cfg->nvrtc_opts); free(cfg->dump_ptx_to); free(cfg->load_ptx_from); free(cfg->preferred_device); free(cfg->program); } void futhark_context_config_add_nvrtc_option(struct futhark_context_config *cfg, const char *opt) { cfg->nvrtc_opts[cfg->num_nvrtc_opts] = strdup(opt); cfg->num_nvrtc_opts++; cfg->nvrtc_opts = (char **) realloc(cfg->nvrtc_opts, (cfg->num_nvrtc_opts + 1) * sizeof(char *)); cfg->nvrtc_opts[cfg->num_nvrtc_opts] = NULL; } void futhark_context_config_set_device(struct futhark_context_config *cfg, const char *s) { int x = 0; if (*s == '#') { s++; while (isdigit(*s)) { x = x * 10 + (*s++)-'0'; } // Skip trailing spaces. while (isspace(*s)) { s++; } } free(cfg->preferred_device); cfg->preferred_device = strdup(s); cfg->preferred_device_num = x; } const char* futhark_context_config_get_program(struct futhark_context_config *cfg) { return cfg->program; } void futhark_context_config_set_program(struct futhark_context_config *cfg, const char *s) { free(cfg->program); cfg->program = strdup(s); } void futhark_context_config_dump_ptx_to(struct futhark_context_config *cfg, const char *path) { free(cfg->dump_ptx_to); cfg->dump_ptx_to = strdup(path); } void futhark_context_config_load_ptx_from(struct futhark_context_config *cfg, const char *path) { free(cfg->load_ptx_from); cfg->load_ptx_from = strdup(path); } void futhark_context_config_set_unified_memory(struct futhark_context_config* cfg, int flag) { cfg->unified_memory = flag; } void futhark_context_config_set_default_thread_block_size(struct futhark_context_config *cfg, int size) { cfg->default_block_size = size; cfg->default_block_size_changed = 1; } void futhark_context_config_set_default_grid_size(struct futhark_context_config *cfg, int num) { cfg->default_grid_size = num; cfg->default_grid_size_changed = 1; } void futhark_context_config_set_default_group_size(struct futhark_context_config *cfg, int size) { futhark_context_config_set_default_thread_block_size(cfg, size); } void futhark_context_config_set_default_num_groups(struct futhark_context_config *cfg, int num) { futhark_context_config_set_default_grid_size(cfg, num); } void futhark_context_config_set_default_tile_size(struct futhark_context_config *cfg, int size) { cfg->default_tile_size = size; cfg->default_tile_size_changed = 1; } void futhark_context_config_set_default_reg_tile_size(struct futhark_context_config *cfg, int size) { cfg->default_reg_tile_size = size; } void futhark_context_config_set_default_threshold(struct futhark_context_config *cfg, int size) { cfg->default_threshold = size; } int futhark_context_config_set_tuning_param(struct futhark_context_config *cfg, const char *param_name, size_t new_value) { for (int i = 0; i < cfg->num_tuning_params; i++) { if (strcmp(param_name, cfg->tuning_param_names[i]) == 0) { cfg->tuning_params[i] = new_value; return 0; } } if (strcmp(param_name, "default_thread_block_size") == 0 || strcmp(param_name, "default_group_size") == 0) { cfg->default_block_size = new_value; return 0; } if (strcmp(param_name, "default_num_groups") == 0 || strcmp(param_name, "default_grid_size") == 0) { cfg->default_grid_size = new_value; return 0; } if (strcmp(param_name, "default_threshold") == 0) { cfg->default_threshold = new_value; return 0; } if (strcmp(param_name, "default_tile_size") == 0) { cfg->default_tile_size = new_value; return 0; } if (strcmp(param_name, "default_reg_tile_size") == 0) { cfg->default_reg_tile_size = new_value; return 0; } return 1; } // A record of something that happened. struct profiling_record { cudaEvent_t *events; // Points to two events. const char *name; }; struct futhark_context { struct futhark_context_config* cfg; int detail_memory; int debugging; int profiling; int profiling_paused; int logging; lock_t lock; char *error; lock_t error_lock; FILE *log; struct constants *constants; struct free_list free_list; struct event_list event_list; int64_t peak_mem_usage_default; int64_t cur_mem_usage_default; struct program* program; bool program_initialised; // Uniform fields above. CUdeviceptr global_failure; CUdeviceptr global_failure_args; struct tuning_params tuning_params; // True if a potentially failing kernel has been enqueued. int32_t failure_is_an_option; int total_runs; long int total_runtime; int64_t peak_mem_usage_device; int64_t cur_mem_usage_device; CUdevice dev; CUcontext cu_ctx; CUmodule module; CUstream stream; struct free_list gpu_free_list; size_t max_thread_block_size; size_t max_grid_size; size_t max_tile_size; size_t max_threshold; size_t max_shared_memory; size_t max_bespoke; size_t max_registers; size_t max_cache; size_t lockstep_width; struct builtin_kernels* kernels; }; #define CU_DEV_ATTR(x) (CU_DEVICE_ATTRIBUTE_##x) #define device_query(dev,attrib) _device_query(dev, CU_DEV_ATTR(attrib)) static int _device_query(CUdevice dev, CUdevice_attribute attrib) { int val; CUDA_SUCCEED_FATAL(cuDeviceGetAttribute(&val, attrib, dev)); return val; } #define CU_FUN_ATTR(x) (CU_FUNC_ATTRIBUTE_##x) #define function_query(fn,attrib) _function_query(dev, CU_FUN_ATTR(attrib)) static int _function_query(CUfunction dev, CUfunction_attribute attrib) { int val; CUDA_SUCCEED_FATAL(cuFuncGetAttribute(&val, attrib, dev)); return val; } static int cuda_device_setup(struct futhark_context *ctx) { struct futhark_context_config *cfg = ctx->cfg; char name[256]; int count, chosen = -1, best_cc = -1; int cc_major_best = 0, cc_minor_best = 0; int cc_major = 0, cc_minor = 0; CUdevice dev; CUDA_SUCCEED_FATAL(cuDeviceGetCount(&count)); if (count == 0) { return 1; } int num_device_matches = 0; // XXX: Current device selection policy is to choose the device with the // highest compute capability (if no preferred device is set). // This should maybe be changed, since greater compute capability is not // necessarily an indicator of better performance. for (int i = 0; i < count; i++) { CUDA_SUCCEED_FATAL(cuDeviceGet(&dev, i)); cc_major = device_query(dev, COMPUTE_CAPABILITY_MAJOR); cc_minor = device_query(dev, COMPUTE_CAPABILITY_MINOR); CUDA_SUCCEED_FATAL(cuDeviceGetName(name, sizeof(name) - 1, dev)); name[sizeof(name) - 1] = 0; if (cfg->logging) { fprintf(ctx->log, "Device #%d: name=\"%s\", compute capability=%d.%d\n", i, name, cc_major, cc_minor); } if (device_query(dev, COMPUTE_MODE) == CU_COMPUTEMODE_PROHIBITED) { if (cfg->logging) { fprintf(ctx->log, "Device #%d is compute-prohibited, ignoring\n", i); } continue; } if (best_cc == -1 || cc_major > cc_major_best || (cc_major == cc_major_best && cc_minor > cc_minor_best)) { best_cc = i; cc_major_best = cc_major; cc_minor_best = cc_minor; } if (strstr(name, cfg->preferred_device) != NULL && num_device_matches++ == cfg->preferred_device_num) { chosen = i; break; } } if (chosen == -1) { chosen = best_cc; } if (chosen == -1) { return 1; } if (cfg->logging) { fprintf(ctx->log, "Using device #%d\n", chosen); } CUDA_SUCCEED_FATAL(cuDeviceGet(&ctx->dev, chosen)); return 0; } static const char *cuda_nvrtc_get_arch(CUdevice dev) { static struct { int major; int minor; const char *arch_str; } const x[] = { { 3, 0, "compute_30" }, { 3, 2, "compute_32" }, { 3, 5, "compute_35" }, { 3, 7, "compute_37" }, { 5, 0, "compute_50" }, { 5, 2, "compute_52" }, { 5, 3, "compute_53" }, { 6, 0, "compute_60" }, { 6, 1, "compute_61" }, { 6, 2, "compute_62" }, { 7, 0, "compute_70" }, { 7, 2, "compute_72" }, { 7, 5, "compute_75" }, { 8, 0, "compute_80" }, { 8, 6, "compute_80" }, { 8, 7, "compute_80" } }; int major = device_query(dev, COMPUTE_CAPABILITY_MAJOR); int minor = device_query(dev, COMPUTE_CAPABILITY_MINOR); int chosen = -1; int num_archs = sizeof(x)/sizeof(x[0]); for (int i = 0; i < num_archs; i++) { if (x[i].major < major || (x[i].major == major && x[i].minor <= minor)) { chosen = i; } else { break; } } if (chosen == -1) { futhark_panic(-1, "Unsupported compute capability %d.%d\n", major, minor); } if (x[chosen].major != major || x[chosen].minor != minor) { fprintf(stderr, "Warning: device compute capability is %d.%d, but newest supported by Futhark is %d.%d.\n", major, minor, x[chosen].major, x[chosen].minor); } return x[chosen].arch_str; } static void cuda_nvrtc_mk_build_options(struct futhark_context *ctx, const char *extra_opts[], char*** opts_out, size_t *n_opts) { int arch_set = 0, num_extra_opts; struct futhark_context_config *cfg = ctx->cfg; char** macro_names; int64_t* macro_vals; int num_macros = gpu_macros(ctx, ¯o_names, ¯o_vals); // nvrtc cannot handle multiple -arch options. Hence, if one of the // extra_opts is -arch, we have to be careful not to do our usual // automatic generation. for (num_extra_opts = 0; extra_opts[num_extra_opts] != NULL; num_extra_opts++) { if (strstr(extra_opts[num_extra_opts], "-arch") == extra_opts[num_extra_opts] || strstr(extra_opts[num_extra_opts], "--gpu-architecture") == extra_opts[num_extra_opts]) { arch_set = 1; } } size_t i = 0, n_opts_alloc = 20 + num_macros + num_extra_opts + cfg->num_tuning_params; char **opts = (char**) malloc(n_opts_alloc * sizeof(char *)); if (!arch_set) { opts[i++] = strdup("-arch"); opts[i++] = strdup(cuda_nvrtc_get_arch(ctx->dev)); } opts[i++] = strdup("-default-device"); if (cfg->debugging) { opts[i++] = strdup("-G"); opts[i++] = strdup("-lineinfo"); } else { opts[i++] = strdup("--disable-warnings"); } opts[i++] = msgprintf("-D%s=%d", "max_thread_block_size", (int)ctx->max_thread_block_size); opts[i++] = msgprintf("-D%s=%d", "max_shared_memory", (int)ctx->max_shared_memory); opts[i++] = msgprintf("-D%s=%d", "max_registers", (int)ctx->max_registers); for (int j = 0; j < num_macros; j++) { opts[i++] = msgprintf("-D%s=%zu", macro_names[j], macro_vals[j]); } for (int j = 0; j < cfg->num_tuning_params; j++) { opts[i++] = msgprintf("-D%s=%zu", cfg->tuning_param_vars[j], cfg->tuning_params[j]); } opts[i++] = msgprintf("-DLOCKSTEP_WIDTH=%zu", ctx->lockstep_width); opts[i++] = msgprintf("-DMAX_THREADS_PER_BLOCK=%zu", ctx->max_thread_block_size); // Time for the best lines of the code in the entire compiler. if (getenv("CUDA_HOME") != NULL) { opts[i++] = msgprintf("-I%s/include", getenv("CUDA_HOME")); } if (getenv("CUDA_ROOT") != NULL) { opts[i++] = msgprintf("-I%s/include", getenv("CUDA_ROOT")); } if (getenv("CUDA_PATH") != NULL) { opts[i++] = msgprintf("-I%s/include", getenv("CUDA_PATH")); } opts[i++] = msgprintf("-I/usr/local/cuda/include"); opts[i++] = msgprintf("-I/usr/include"); for (int j = 0; extra_opts[j] != NULL; j++) { opts[i++] = strdup(extra_opts[j]); } opts[i++] = msgprintf("-DTR_BLOCK_DIM=%d", TR_BLOCK_DIM); opts[i++] = msgprintf("-DTR_TILE_DIM=%d", TR_TILE_DIM); opts[i++] = msgprintf("-DTR_ELEMS_PER_THREAD=%d", TR_ELEMS_PER_THREAD); free(macro_names); free(macro_vals); *n_opts = i; *opts_out = opts; } static char* cuda_nvrtc_build(const char *src, const char *opts[], size_t n_opts, char **ptx) { nvrtcProgram prog; char *problem = NULL; problem = NVRTC_SUCCEED_NONFATAL(nvrtcCreateProgram(&prog, src, "futhark-cuda", 0, NULL, NULL)); if (problem) { return problem; } nvrtcResult res = nvrtcCompileProgram(prog, n_opts, opts); if (res != NVRTC_SUCCESS) { size_t log_size; if (nvrtcGetProgramLogSize(prog, &log_size) == NVRTC_SUCCESS) { char *log = (char*) malloc(log_size); if (nvrtcGetProgramLog(prog, log) == NVRTC_SUCCESS) { problem = msgprintf("NVRTC compilation failed.\n\n%s\n", log); } else { problem = msgprintf("Could not retrieve compilation log\n"); } free(log); } return problem; } size_t ptx_size; NVRTC_SUCCEED_FATAL(nvrtcGetPTXSize(prog, &ptx_size)); *ptx = (char*) malloc(ptx_size); NVRTC_SUCCEED_FATAL(nvrtcGetPTX(prog, *ptx)); NVRTC_SUCCEED_FATAL(nvrtcDestroyProgram(&prog)); return NULL; } static void cuda_load_ptx_from_cache(struct futhark_context_config *cfg, const char *src, const char *opts[], size_t n_opts, struct cache_hash *h, const char *cache_fname, char **ptx) { if (cfg->logging) { fprintf(stderr, "Restoring cache from from %s...\n", cache_fname); } cache_hash_init(h); for (size_t i = 0; i < n_opts; i++) { cache_hash(h, opts[i], strlen(opts[i])); } cache_hash(h, src, strlen(src)); size_t ptxsize; errno = 0; if (cache_restore(cache_fname, h, (unsigned char**)ptx, &ptxsize) != 0) { if (cfg->logging) { fprintf(stderr, "Failed to restore cache (errno: %s)\n", strerror(errno)); } } } static void cuda_size_setup(struct futhark_context *ctx) { struct futhark_context_config *cfg = ctx->cfg; if (cfg->default_block_size > ctx->max_thread_block_size) { if (cfg->default_block_size_changed) { fprintf(stderr, "Note: Device limits default block size to %zu (down from %zu).\n", ctx->max_thread_block_size, cfg->default_block_size); } cfg->default_block_size = ctx->max_thread_block_size; } if (cfg->default_grid_size > ctx->max_grid_size) { if (cfg->default_grid_size_changed) { fprintf(stderr, "Note: Device limits default grid size to %zu (down from %zu).\n", ctx->max_grid_size, cfg->default_grid_size); } cfg->default_grid_size = ctx->max_grid_size; } if (cfg->default_tile_size > ctx->max_tile_size) { if (cfg->default_tile_size_changed) { fprintf(stderr, "Note: Device limits default tile size to %zu (down from %zu).\n", ctx->max_tile_size, cfg->default_tile_size); } cfg->default_tile_size = ctx->max_tile_size; } if (!cfg->default_grid_size_changed) { cfg->default_grid_size = (device_query(ctx->dev, MULTIPROCESSOR_COUNT) * device_query(ctx->dev, MAX_THREADS_PER_MULTIPROCESSOR)) / cfg->default_block_size; } for (int i = 0; i < cfg->num_tuning_params; i++) { const char *size_class = cfg->tuning_param_classes[i]; int64_t *size_value = &cfg->tuning_params[i]; const char* size_name = cfg->tuning_param_names[i]; int64_t max_value = 0, default_value = 0; if (strstr(size_class, "thread_block_size") == size_class) { max_value = ctx->max_thread_block_size; default_value = cfg->default_block_size; } else if (strstr(size_class, "grid_size") == size_class) { max_value = ctx->max_grid_size; default_value = cfg->default_grid_size; // XXX: as a quick and dirty hack, use twice as many threads for // histograms by default. We really should just be smarter // about sizes somehow. if (strstr(size_name, ".seghist_") != NULL) { default_value *= 2; } } else if (strstr(size_class, "tile_size") == size_class) { max_value = ctx->max_tile_size; default_value = cfg->default_tile_size; } else if (strstr(size_class, "reg_tile_size") == size_class) { max_value = 0; // No limit. default_value = cfg->default_reg_tile_size; } else if (strstr(size_class, "threshold") == size_class) { // Threshold can be as large as it takes. default_value = cfg->default_threshold; } else { // Bespoke sizes have no limit or default. } if (*size_value == 0) { *size_value = default_value; } else if (max_value > 0 && *size_value > max_value) { fprintf(stderr, "Note: Device limits %s to %zu (down from %zu)\n", size_name, max_value, *size_value); *size_value = max_value; } } } static char* cuda_module_setup(struct futhark_context *ctx, const char *src, const char *extra_opts[], const char* cache_fname) { char *ptx = NULL; struct futhark_context_config *cfg = ctx->cfg; if (cfg->load_ptx_from) { ptx = slurp_file(cfg->load_ptx_from, NULL); } char **opts; size_t n_opts; cuda_nvrtc_mk_build_options(ctx, extra_opts, &opts, &n_opts); if (cfg->logging) { fprintf(stderr, "NVRTC compile options:\n"); for (size_t j = 0; j < n_opts; j++) { fprintf(stderr, "\t%s\n", opts[j]); } fprintf(stderr, "\n"); } struct cache_hash h; int loaded_ptx_from_cache = 0; if (cache_fname != NULL) { cuda_load_ptx_from_cache(cfg, src, (const char**)opts, n_opts, &h, cache_fname, &ptx); if (ptx != NULL) { if (cfg->logging) { fprintf(stderr, "Restored PTX from cache; now loading module...\n"); } if (cuModuleLoadData(&ctx->module, ptx) == CUDA_SUCCESS) { if (cfg->logging) { fprintf(stderr, "Success!\n"); } loaded_ptx_from_cache = 1; } else { if (cfg->logging) { fprintf(stderr, "Failed!\n"); } free(ptx); ptx = NULL; } } } if (ptx == NULL) { char* problem = cuda_nvrtc_build(src, (const char**)opts, n_opts, &ptx); if (problem != NULL) { return problem; } } if (cfg->dump_ptx_to != NULL) { dump_file(cfg->dump_ptx_to, ptx, strlen(ptx)); } if (!loaded_ptx_from_cache) { CUDA_SUCCEED_FATAL(cuModuleLoadData(&ctx->module, ptx)); } if (cache_fname != NULL && !loaded_ptx_from_cache) { if (cfg->logging) { fprintf(stderr, "Caching PTX in %s...\n", cache_fname); } errno = 0; if (cache_store(cache_fname, &h, (const unsigned char*)ptx, strlen(ptx)) != 0) { fprintf(stderr, "Failed to cache PTX: %s\n", strerror(errno)); } } for (size_t i = 0; i < n_opts; i++) { free((char *)opts[i]); } free(opts); free(ptx); return NULL; } struct cuda_event { cudaEvent_t start; cudaEvent_t end; }; static struct cuda_event* cuda_event_new(struct futhark_context* ctx) { if (ctx->profiling && !ctx->profiling_paused) { struct cuda_event* e = malloc(sizeof(struct cuda_event)); cudaEventCreate(&e->start); cudaEventCreate(&e->end); return e; } else { return NULL; } } static int cuda_event_report(struct str_builder* sb, struct cuda_event* e) { float ms; CUresult err; if ((err = cuEventElapsedTime(&ms, e->start, e->end)) != CUDA_SUCCESS) { return err; } // CUDA provides milisecond resolution, but we want microseconds. str_builder(sb, ",\"duration\":%f", ms*1000); if ((err = cuEventDestroy(e->start)) != CUDA_SUCCESS) { return 1; } if ((err = cuEventDestroy(e->end)) != CUDA_SUCCESS) { return 1; } free(e); return 0; } int futhark_context_sync(struct futhark_context* ctx) { CUDA_SUCCEED_OR_RETURN(cuCtxPushCurrent(ctx->cu_ctx)); CUDA_SUCCEED_OR_RETURN(cuCtxSynchronize()); if (ctx->failure_is_an_option) { // Check for any delayed error. int32_t failure_idx; CUDA_SUCCEED_OR_RETURN( cuMemcpyDtoH(&failure_idx, ctx->global_failure, sizeof(int32_t))); ctx->failure_is_an_option = 0; if (failure_idx >= 0) { // We have to clear global_failure so that the next entry point // is not considered a failure from the start. int32_t no_failure = -1; CUDA_SUCCEED_OR_RETURN( cuMemcpyHtoD(ctx->global_failure, &no_failure, sizeof(int32_t))); int64_t args[max_failure_args+1]; CUDA_SUCCEED_OR_RETURN( cuMemcpyDtoH(&args, ctx->global_failure_args, sizeof(args))); ctx->error = get_failure_msg(failure_idx, args); return FUTHARK_PROGRAM_ERROR; } } CUDA_SUCCEED_OR_RETURN(cuCtxPopCurrent(&ctx->cu_ctx)); return 0; } struct builtin_kernels* init_builtin_kernels(struct futhark_context* ctx); void free_builtin_kernels(struct futhark_context* ctx, struct builtin_kernels* kernels); int backend_context_setup(struct futhark_context* ctx) { ctx->failure_is_an_option = 0; ctx->total_runs = 0; ctx->total_runtime = 0; ctx->peak_mem_usage_device = 0; ctx->cur_mem_usage_device = 0; ctx->kernels = NULL; CUDA_SUCCEED_FATAL(cuInit(0)); if (cuda_device_setup(ctx) != 0) { futhark_panic(-1, "No suitable CUDA device found.\n"); } CUDA_SUCCEED_FATAL(cuCtxCreate(&ctx->cu_ctx, 0, ctx->dev)); free_list_init(&ctx->gpu_free_list); if (ctx->cfg->unified_memory == 2) { ctx->cfg->unified_memory = device_query(ctx->dev, MANAGED_MEMORY); } if (ctx->cfg->logging) { if (ctx->cfg->unified_memory) { fprintf(ctx->log, "Using managed memory\n"); } else { fprintf(ctx->log, "Using unmanaged memory\n"); } } // MAX_SHARED_MEMORY_PER_BLOCK gives bogus numbers (48KiB); probably // for backwards compatibility. Add _OPTIN and you seem to get the // right number. ctx->max_shared_memory = device_query(ctx->dev, MAX_SHARED_MEMORY_PER_BLOCK_OPTIN); #if CUDART_VERSION >= 12000 ctx->max_shared_memory -= device_query(ctx->dev, RESERVED_SHARED_MEMORY_PER_BLOCK); #endif ctx->max_thread_block_size = device_query(ctx->dev, MAX_THREADS_PER_BLOCK); ctx->max_grid_size = device_query(ctx->dev, MAX_GRID_DIM_X); ctx->max_tile_size = sqrt(ctx->max_thread_block_size); ctx->max_threshold = 0; ctx->max_bespoke = 0; ctx->max_registers = device_query(ctx->dev, MAX_REGISTERS_PER_BLOCK); ctx->max_cache = device_query(ctx->dev, L2_CACHE_SIZE); ctx->lockstep_width = device_query(ctx->dev, WARP_SIZE); CUDA_SUCCEED_FATAL(cuStreamCreate(&ctx->stream, CU_STREAM_DEFAULT)); cuda_size_setup(ctx); ctx->error = cuda_module_setup(ctx, ctx->cfg->program, (const char**)ctx->cfg->nvrtc_opts, ctx->cfg->cache_fname); if (ctx->error != NULL) { futhark_panic(1, "During CUDA initialisation:\n%s\n", ctx->error); } int32_t no_error = -1; CUDA_SUCCEED_FATAL(cuMemAlloc(&ctx->global_failure, sizeof(no_error))); CUDA_SUCCEED_FATAL(cuMemcpyHtoD(ctx->global_failure, &no_error, sizeof(no_error))); // The +1 is to avoid zero-byte allocations. CUDA_SUCCEED_FATAL(cuMemAlloc(&ctx->global_failure_args, sizeof(int64_t)*(max_failure_args+1))); if ((ctx->kernels = init_builtin_kernels(ctx)) == NULL) { return 1; } return 0; } void backend_context_teardown(struct futhark_context* ctx) { if (ctx->kernels != NULL) { free_builtin_kernels(ctx, ctx->kernels); cuMemFree(ctx->global_failure); cuMemFree(ctx->global_failure_args); CUDA_SUCCEED_FATAL(gpu_free_all(ctx)); CUDA_SUCCEED_FATAL(cuStreamDestroy(ctx->stream)); CUDA_SUCCEED_FATAL(cuModuleUnload(ctx->module)); CUDA_SUCCEED_FATAL(cuCtxDestroy(ctx->cu_ctx)); } free_list_destroy(&ctx->gpu_free_list); } // GPU ABSTRACTION LAYER // Types. typedef CUfunction gpu_kernel; typedef CUdeviceptr gpu_mem; static void gpu_create_kernel(struct futhark_context *ctx, gpu_kernel* kernel, const char* name) { if (ctx->debugging) { fprintf(ctx->log, "Creating kernel %s.\n", name); } CUDA_SUCCEED_FATAL(cuModuleGetFunction(kernel, ctx->module, name)); // Unless the below is set, the kernel is limited to 48KiB of memory. CUDA_SUCCEED_FATAL(cuFuncSetAttribute(*kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, ctx->max_shared_memory)); } static void gpu_free_kernel(struct futhark_context *ctx, gpu_kernel kernel) { (void)ctx; (void)kernel; } static int gpu_scalar_to_device(struct futhark_context* ctx, gpu_mem dst, size_t offset, size_t size, void *src) { struct cuda_event *event = cuda_event_new(ctx); if (event != NULL) { add_event(ctx, "copy_scalar_to_dev", strdup(""), event, (event_report_fn)cuda_event_report); CUDA_SUCCEED_FATAL(cuEventRecord(event->start, ctx->stream)); } CUDA_SUCCEED_OR_RETURN(cuMemcpyHtoD(dst + offset, src, size)); if (event != NULL) { CUDA_SUCCEED_FATAL(cuEventRecord(event->end, ctx->stream)); } return FUTHARK_SUCCESS; } static int gpu_scalar_from_device(struct futhark_context* ctx, void *dst, gpu_mem src, size_t offset, size_t size) { struct cuda_event *event = cuda_event_new(ctx); if (event != NULL) { add_event(ctx, "copy_scalar_from_dev", strdup(""), event, (event_report_fn)cuda_event_report); CUDA_SUCCEED_FATAL(cuEventRecord(event->start, ctx->stream)); } CUDA_SUCCEED_OR_RETURN(cuMemcpyDtoH(dst, src + offset, size)); if (event != NULL) { CUDA_SUCCEED_FATAL(cuEventRecord(event->end, ctx->stream)); } return FUTHARK_SUCCESS; } static int gpu_memcpy(struct futhark_context* ctx, gpu_mem dst, int64_t dst_offset, gpu_mem src, int64_t src_offset, int64_t nbytes) { struct cuda_event *event = cuda_event_new(ctx); if (event != NULL) { add_event(ctx, "copy_dev_to_dev", strdup(""), event, (event_report_fn)cuda_event_report); CUDA_SUCCEED_FATAL(cuEventRecord(event->start, ctx->stream)); } CUDA_SUCCEED_OR_RETURN(cuMemcpy(dst+dst_offset, src+src_offset, nbytes)); if (event != NULL) { CUDA_SUCCEED_FATAL(cuEventRecord(event->end, ctx->stream)); } return FUTHARK_SUCCESS; } static int memcpy_host2gpu(struct futhark_context* ctx, bool sync, gpu_mem dst, int64_t dst_offset, const unsigned char* src, int64_t src_offset, int64_t nbytes) { if (nbytes > 0) { struct cuda_event *event = cuda_event_new(ctx); if (event != NULL) { add_event(ctx, "copy_host_to_dev", strdup(""), event, (event_report_fn)cuda_event_report); CUDA_SUCCEED_FATAL(cuEventRecord(event->start, ctx->stream)); } if (sync) { CUDA_SUCCEED_OR_RETURN (cuMemcpyHtoD(dst + dst_offset, src + src_offset, nbytes)); } else { CUDA_SUCCEED_OR_RETURN (cuMemcpyHtoDAsync(dst + dst_offset, src + src_offset, nbytes, ctx->stream)); } if (event != NULL) { CUDA_SUCCEED_FATAL(cuEventRecord(event->end, ctx->stream)); } } return FUTHARK_SUCCESS; } static int memcpy_gpu2host(struct futhark_context* ctx, bool sync, unsigned char* dst, int64_t dst_offset, gpu_mem src, int64_t src_offset, int64_t nbytes) { if (nbytes > 0) { struct cuda_event *event = cuda_event_new(ctx); if (event != NULL) { add_event(ctx, "copy_dev_to_host", strdup(""), event, (event_report_fn)cuda_event_report); CUDA_SUCCEED_FATAL(cuEventRecord(event->start, ctx->stream)); } if (sync) { CUDA_SUCCEED_OR_RETURN (cuMemcpyDtoH(dst + dst_offset, src + src_offset, nbytes)); } else { CUDA_SUCCEED_OR_RETURN (cuMemcpyDtoHAsync(dst + dst_offset, src + src_offset, nbytes, ctx->stream)); } if (event != NULL) { CUDA_SUCCEED_FATAL(cuEventRecord(event->end, ctx->stream)); } if (sync && ctx->failure_is_an_option && futhark_context_sync(ctx) != 0) { return 1; } } return FUTHARK_SUCCESS; } static int gpu_launch_kernel(struct futhark_context* ctx, gpu_kernel kernel, const char *name, const int32_t grid[3], const int32_t block[3], unsigned int shared_mem_bytes, int num_args, void* args[num_args], size_t args_sizes[num_args]) { (void) args_sizes; int64_t time_start = 0, time_end = 0; if (ctx->debugging) { time_start = get_wall_time(); } struct cuda_event *event = cuda_event_new(ctx); if (event != NULL) { CUDA_SUCCEED_FATAL(cuEventRecord(event->start, ctx->stream)); add_event(ctx, name, msgprintf("Kernel %s with\n" " grid=(%d,%d,%d)\n" " block=(%d,%d,%d)\n" " shared memory=%d", name, grid[0], grid[1], grid[2], block[0], block[1], block[2], shared_mem_bytes), event, (event_report_fn)cuda_event_report); } CUDA_SUCCEED_OR_RETURN (cuLaunchKernel(kernel, grid[0], grid[1], grid[2], block[0], block[1], block[2], shared_mem_bytes, ctx->stream, args, NULL)); if (event != NULL) { CUDA_SUCCEED_FATAL(cuEventRecord(event->end, ctx->stream)); } if (ctx->debugging) { CUDA_SUCCEED_FATAL(cuCtxSynchronize()); time_end = get_wall_time(); long int time_diff = time_end - time_start; fprintf(ctx->log, " runtime: %ldus\n\n", time_diff); } return FUTHARK_SUCCESS; } static int gpu_alloc_actual(struct futhark_context *ctx, size_t size, gpu_mem *mem_out) { CUresult res; if (ctx->cfg->unified_memory) { res = cuMemAllocManaged(mem_out, size, CU_MEM_ATTACH_GLOBAL); } else { res = cuMemAlloc(mem_out, size); } if (res == CUDA_ERROR_OUT_OF_MEMORY) { return FUTHARK_OUT_OF_MEMORY; } CUDA_SUCCEED_OR_RETURN(res); return FUTHARK_SUCCESS; } static int gpu_free_actual(struct futhark_context *ctx, gpu_mem mem) { (void)ctx; CUDA_SUCCEED_OR_RETURN(cuMemFree(mem)); return FUTHARK_SUCCESS; } // End of backends/cuda.h.