// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.

#ifndef _CUDA_UTILS_H
#define _CUDA_UTILS_H

// NOTE(review): the header names of the original include list were not
// recoverable from the reviewed text. The headers below are the ones the
// code in this file visibly needs; restore any additional project includes
// (and confirm this list) before merging.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>

#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

// log2 of the per-block thread budget used by the helpers below.
#define LOG2_TOTAL_THREADS 10
// Per-block thread budget: 2^LOG2_TOTAL_THREADS = 1024.
// (Was `2 << LOG2_TOTAL_THREADS`, i.e. 2048, which let opt_block_config()
// hand out blocks larger than the 1024-thread CUDA limit.)
#define TOTAL_THREADS (1 << LOG2_TOTAL_THREADS)

// Returns the largest power of two that is <= work_size, capped at
// TOTAL_THREADS. A work_size of 0 yields 1 so the result is always a usable
// block dimension.
//
// The exponent is found with integer shifts rather than a floating-point
// log ratio, which avoids rounding surprises and the undefined
// float-to-unsigned conversion of log(0).
inline uint opt_n_threads(uint work_size) {
  const uint max_pow_2 = static_cast<uint>(LOG2_TOTAL_THREADS);
  uint pow_2 = 0;
  // Grow the exponent while 2^(pow_2 + 1) still fits into work_size.
  while (pow_2 < max_pow_2 && (work_size >> (pow_2 + 1)) != 0) {
    ++pow_2;
  }
  return 1u << pow_2;
}

// Builds a 2D block configuration (z is always 1) for an x-by-y workload.
//
// The x dimension gets opt_n_threads(x) threads; the y dimension gets
// opt_n_threads(y) threads, limited so that x_threads * y_threads never
// exceeds TOTAL_THREADS, and never less than 1.
inline dim3 opt_block_config(int x, int y) {
  const int x_threads = static_cast<int>(opt_n_threads(x));
  // Threads left for y once x has taken its share of the budget.
  const int y_budget = TOTAL_THREADS / x_threads;
  const int y_wanted = static_cast<int>(opt_n_threads(y));
  const int y_threads = std::max(std::min(y_wanted, y_budget), 1);

  dim3 block_config(x_threads, y_threads, 1);
  return block_config;
}

// Reads (and clears) the last CUDA runtime error. If one is pending, prints
// it to stderr together with the enclosing function, line and file, then
// terminates the process with exit(-1). The macro does not synchronize, so
// it only reports errors that have already been recorded when it runs.
#define CUDA_CHECK_ERRORS()                                           \
  do {                                                                \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
      fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n",  \
              cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, \
              __FILE__);                                              \
      exit(-1);                                                       \
    }                                                                 \
  } while (0)

// Fixed-size array wrapper whose elements can be indexed from both host and
// device code. N exposes the element count as a compile-time constant.
//
// NOTE(review): the template parameter lists in this section were stripped
// from the reviewed text and are reconstructed from the names used in the
// bodies (T, N_ELEMS, N_HALFS) — confirm against the original.
template <typename T, uint N_ELEMS>
struct vec {
  // Mutable element access; no bounds checking is performed.
  __host__ __device__ T &operator[](uint idx) { return data[idx]; }
  // Read-only element access; returns the element by value.
  __host__ __device__ T operator[](uint idx) const { return data[idx]; }

  T data[N_ELEMS];
  static constexpr uint N = N_ELEMS;
};

// Convenience aliases for common element types.
template <uint N_FLOATS>
using fvec = vec<float, N_FLOATS>;

template <uint N_HALFS>
using hvec = vec<__half, N_HALFS>;

template <uint N_UINTS>
using uvec = vec<uint, N_UINTS>;

// The guard now closes at the end of the file so the templates above are
// protected against multiple inclusion as well.
#endif  // _CUDA_UTILS_H