I'm currently trying to use the CUSPARSE library in order to speed up an HPCG implementation. However, it appears I'm making some kind of mistake during device data allocation.
This is the code segment that results in CUSPARSE_STATUS_MAPPING_ERROR:
int HPC_sparsemv( CRS_Matrix *A_crs_d,
FP * x_d, FP * y_d)
{
FP alpha = 1.0f;
FP beta = 0.0f;
FP* vals = A_crs_d->vals;
int* inds = A_crs_d->col_ind;
int* row_ptr = A_crs_d->row_ptr;
/*generate Matrix descriptor for SparseMV computation*/
cusparseMatDescr_t matDescr;
cusparseCreateMatDescr(&matDescr);
cusparseStatus_t status;
/*hand off control to CUSPARSE routine*/
#ifdef DOUBLE
status = cusparseDcsrmv(cuspHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A_crs_d->nrows,
A_crs_d->ncols,A_crs_d->nnz, &alpha, matDescr, vals, row_ptr,
inds, x_d, &beta, y_d);
#else
status = cusparseScsrmv(cuspHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A_crs_d->nrows,
A_crs_d->ncols,A_crs_d->nnz, &alpha, matDescr, vals, row_ptr,
col_ind, x_d, &beta, y_d);
#endif
NOTE: FP is a typedef wrapped by conditional compilation guards, meaning it gets evaluated to be either a float or a double alias at compile-time.
And here is the function handling the data allocation:
int cudaAlloc(FP* r_d, FP* p_d, FP* Ap_d, FP* b_d, const FP* const b, FP * x_d, FP * const x,
struct CRS_Matrix* A_crs_d, int nrows, int ncols, int nnz){
std::cout << "Beginning device allocation..." << std::endl;
int size_r = nrows * sizeof(FP);
int size_c = ncols * sizeof(FP);
int size_nnz = nnz * sizeof(FP);
int allocStatus = 0;
/*device alloc r_d*/
allocStatus |= (int) checkCuda( cudaMalloc((void **) &r_d, size_r) );
/*device alloc p_d*/
allocStatus |= (int) checkCuda( cudaMalloc((void **) &p_d, size_c) );
/*device alloc Ap_d*/
allocStatus |= (int) checkCuda( cudaMalloc((void **) &Ap_d, size_r) );
/*device alloc b_d*/
allocStatus |= (int) checkCuda( cudaMalloc((void **) &b_d, size_r ) );
allocStatus |= (int) checkCuda( cudaMemcpy(b_d, b, size_r, cudaMemcpyHostToDevice));
/*device alloc x_d*/
allocStatus |= (int) checkCuda( cudaMalloc((void **) &x_d, size_r ) );
allocStatus |= (int) checkCuda( cudaMemcpy(x_d, x, size_r, cudaMemcpyHostToDevice));
/*device alloc A_crs_d*/
FP * valtmp;
allocStatus |= (int) checkCuda( cudaMalloc((void **) &valtmp, size_nnz) );
allocStatus |= (int) checkCuda( cudaMemcpy(valtmp, CRS->vals, size_nnz, cudaMemcpyHostToDevice) );
int * indtmp;
allocStatus |= (int) checkCuda( cudaMalloc((void **) &indtmp, nnz* sizeof(int)) );
allocStatus |= (int) checkCuda( cudaMemcpy(indtmp, CRS->col_ind,
nnz * sizeof(int) , cudaMemcpyHostToDevice) );
int * rowtmp;
allocStatus |= (int) checkCuda( cudaMalloc((void **) &rowtmp, (nrows + 1) * sizeof(int)) );
allocStatus |= (int) checkCuda( cudaMemcpy(rowtmp, CRS->row_ptr,
(nrows + 1) * sizeof(int), cudaMemcpyHostToDevice) );
allocStatus |= (int) checkCuda( cudaMallocHost( &A_crs_d, sizeof(CRS_Matrix)) );
A_crs_d->vals = valtmp;
A_crs_d->col_ind = indtmp;
A_crs_d->row_ptr = rowtmp;
A_crs_d->nrows = CRS->nrows;
A_crs_d->ncols = CRS->ncols;
A_crs_d->nnz = CRS->nnz;
std::cout << "Device allocation done." << std::endl;
return allocStatus;
}
During my first stop at StackOverflow I found this solved issue posted by somebody else: Cusparse status mapping error while using cuda constant memory
However, as I'm not using constant memory on the arguments passed to csrmv() that didn't solve my problem. I also checked data integrity and the CRS_Matrix on the device exactly matches the original in host memory.
I'm quite at a loss with this issue and couldn't find anything that would indicate a problem in the CUDA Toolkit Documentation, so any help would be greatly appreciated.
Thanks in advance.
See Question&Answers more detail:os