Why does cublasSgemm use `f16` for `float`?

It uses something called “automatic down-conversion”: when the math mode is set to `CUBLAS_TENSOR_OP_MATH`, the library is allowed to convert your FP32 inputs to FP16 so the GEMM can run on Tensor Cores.

I don’t have a detailed description, but I believe the expectation is that your input data must fit within the representable range of the FP16 type in order for the calculation to produce the expected results.
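For reference, the largest finite FP16 value is 65504, so anything larger overflows to infinity when down-converted. A minimal host-side sketch (assuming the `cuda_fp16.h` conversion helpers, which are host-callable in recent CUDA toolkits) illustrates this:

```cpp
#include <cuda_fp16.h>
#include <limits>
#include <iostream>
int main(){
  // 65504 is the largest finite FP16 value, so it survives the round trip:
  std::cout << __half2float(__float2half(65504.0f)) << std::endl;  // 65504
  // FLT_MAX (~3.4e38) is far outside the FP16 range and overflows to inf:
  std::cout << __half2float(__float2half(std::numeric_limits<float>::max()))
            << std::endl;  // inf
}
```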

Example:

```
$ cat t2208.cu
#include <cublas_v2.h>
#include <limits>
#include <iostream>

int main(){

  cublasHandle_t handle;
  cublasCreate(&handle);
#ifdef USE_TC
  cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
#endif
  float *h_X, *d_A = 0, *d_B = 0, *d_C = 0, alpha = 1.0f;
  const int N = 1024;
  const int n2 = N*N;
  h_X = new float[n2]();
  cudaMalloc(reinterpret_cast<void **>(&d_A), n2 * sizeof(d_A[0]));
  cudaMalloc(reinterpret_cast<void **>(&d_B), n2 * sizeof(d_B[0]));
  cudaMalloc(reinterpret_cast<void **>(&d_C), n2 * sizeof(d_C[0]));
  // A: 0.1 on the diagonal, zero everywhere else
  for (int i = 0; i < n2; i+=N+1) h_X[i] = 0.1f;
  cudaMemcpy(d_A, h_X, n2*sizeof(h_X[0]), cudaMemcpyHostToDevice);
  // B: every element set to FLT_MAX
  for (int i = 0; i < n2; i++) h_X[i] = std::numeric_limits<float>::max();
  cudaMemcpy(d_B, h_X, n2*sizeof(h_X[0]), cudaMemcpyHostToDevice);
  float beta = 0.0f;
  cublasStatus_t s = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                                 &alpha, d_A, N, d_B, N, &beta, d_C, N);
  cudaMemcpy(h_X, d_C, n2*sizeof(h_X[0]), cudaMemcpyDeviceToHost);
  std::cout << std::numeric_limits<float>::max() << std::endl;
  std::cout << "status: " << (int) s << std::endl;
  std::cout << "result[0]: " << h_X[0] << std::endl;
}
$ nvcc -o t2208 t2208.cu -lcublas
$ ./t2208
3.40282e+38
status: 0
result[0]: 3.40282e+37
$ nvcc -o t2208 t2208.cu -lcublas -DUSE_TC
$ ./t2208
3.40282e+38
status: 0
result[0]: nan
$
```
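The `nan` in the Tensor Core case follows directly from the down-conversion: `FLT_MAX` overflows to infinity in FP16, each dot product then multiplies that infinity by the zeros off the diagonal of `A`, and 0 × inf is NaN. In the default math mode the computation stays in FP32, where 0 × `FLT_MAX` is 0 and the result is 0.1 × `FLT_MAX` ≈ 3.4e+37, as expected.

Incidentally, `CUBLAS_TENSOR_OP_MATH` is deprecated in cuBLAS 11 and later in favor of explicit per-call compute types; if my understanding is correct, the same down-converting behavior can be requested via `cublasGemmEx` with `CUBLAS_COMPUTE_32F_FAST_16F`. A sketch, reusing the handle and buffers from the example above:

```cpp
// FP32 data in and out, but the library is allowed to down-convert the
// inputs to FP16 and use Tensor Cores for the multiply:
cublasStatus_t s = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                                &alpha, d_A, CUDA_R_32F, N,
                                        d_B, CUDA_R_32F, N,
                                &beta,  d_C, CUDA_R_32F, N,
                                CUBLAS_COMPUTE_32F_FAST_16F,
                                CUBLAS_GEMM_DEFAULT);
```

With `CUBLAS_COMPUTE_32F` instead, the multiply stays in FP32 and the overflow above does not occur.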

If you’d like to see an improvement in the documentation, please file a bug.