Parallel implementation for multiple SVDs using CUDA
I'm new to parallel programming on the GPU, so I apologize if the question is broad or vague. I'm aware there is a parallel SVD function in the CULA library, but what should the strategy be if I have a large number of relatively small matrices to factorize? For example, I have n matrices of dimension d, with n large and d small. How can I parallelize this process? Could you give me a hint?
My previous answer is now out-of-date. As of February 2015, CUDA 7 (currently in release candidate version) offers full SVD capabilities in its cuSOLVER library. Below, I'm providing an example of generating the singular value decomposition using CUDA cuSOLVER.
Concerning the specific issue you are raising (calculating the SVD of several matrices of small size), you should adapt the example I'm providing below by using streams. You can associate a stream to each task with cudaStreamCreate() and cusolverDnSetStream(); a rough sketch along these lines is appended after the utilities.cu listing at the end of this answer.
kernel.cu
#include "cuda_runtime.h" #include "device_launch_parameters.h" #include<iostream> #include<iomanip> #include<stdlib.h> #include<stdio.h> #include<assert.h> #include<math.h> #include <cusolverdn.h> #include <cuda_runtime_api.h> #include "utilities.cuh" /********/ /* main */ /********/ int main(){ // --- gesvd supports nrows >= ncols // --- column major memory ordering const int nrows = 7; const int ncols = 5; // --- cusolve input/output parameters/arrays int work_size = 0; int *devinfo; gpuerrchk(cudamalloc(&devinfo, sizeof(int))); // --- cuda solver initialization cusolverdnhandle_t solver_handle; cusolverdncreate(&solver_handle); // --- setting host, nrows x ncols matrix double *h_a = (double *)malloc(nrows * ncols * sizeof(double)); for(int j = 0; j < nrows; j++) for(int = 0; < ncols; i++) h_a[j + i*nrows] = (i + j*j) * sqrt((double)(i + j)); // --- setting device matrix , moving host matrix device double *d_a; gpuerrchk(cudamalloc(&d_a, nrows * ncols * sizeof(double))); gpuerrchk(cudamemcpy(d_a, h_a, nrows * ncols * sizeof(double), cudamemcpyhosttodevice)); // --- host side svd results space double *h_u = (double *)malloc(nrows * nrows * sizeof(double)); double *h_v = (double *)malloc(ncols * ncols * sizeof(double)); double *h_s = (double *)malloc(min(nrows, ncols) * sizeof(double)); // --- device side svd workspace , matrices double *d_u; gpuerrchk(cudamalloc(&d_u, nrows * nrows * sizeof(double))); double *d_v; gpuerrchk(cudamalloc(&d_v, ncols * ncols * sizeof(double))); double *d_s; gpuerrchk(cudamalloc(&d_s, min(nrows, ncols) * sizeof(double))); // --- cuda svd initialization cusolvesafecall(cusolverdndgesvd_buffersize(solver_handle, nrows, ncols, &work_size)); double *work; gpuerrchk(cudamalloc(&work, work_size * sizeof(double))); // --- cuda svd execution cusolvesafecall(cusolverdndgesvd(solver_handle, 'a', 'a', nrows, ncols, d_a, nrows, d_s, d_u, nrows, d_v, ncols, work, work_size, null, devinfo)); int devinfo_h = 0; gpuerrchk(cudamemcpy(&devinfo_h, devinfo, sizeof(int), cudamemcpydevicetohost)); if (devinfo_h != 0) std::cout << "unsuccessful svd execution\n\n"; // --- moving results device host gpuerrchk(cudamemcpy(h_s, d_s, min(nrows, ncols) * sizeof(double), cudamemcpydevicetohost)); gpuerrchk(cudamemcpy(h_u, d_u, nrows * nrows * sizeof(double), cudamemcpydevicetohost)); gpuerrchk(cudamemcpy(h_v, d_v, ncols * ncols * sizeof(double), cudamemcpydevicetohost)); std::cout << "singular values\n"; for(int = 0; < min(nrows, ncols); i++) std::cout << "d_s["<<i<<"] = " << std::setprecision(15) << h_s[i] << std::endl; std::cout << "\nleft singular vectors - y = * x, columns of u span space of y\n"; for(int j = 0; j < nrows; j++) { printf("\n"); for(int = 0; < nrows; i++) printf("u[%i,%i]=%f\n",i,j,h_u[j*nrows + i]); } std::cout << "\nright singular vectors - y = * x, columns of v span space of x\n"; for(int = 0; < ncols; i++) { printf("\n"); for(int j = 0; j < ncols; j++) printf("v[%i,%i]=%f\n",i,j,h_v[j*ncols + i]); } cusolverdndestroy(solver_handle); return 0; }
utilities.cuh
#ifndef UTILITIES_CUH
#define UTILITIES_CUH

#include <cuda_runtime.h>
#include <cusolverDn.h>

extern "C" int  iDivUp(int, int);
extern "C" void gpuErrchk(cudaError_t);
extern "C" void cusolveSafeCall(cusolverStatus_t);

#endif
utilities.cu
#include <stdio.h>
#include <assert.h>

#include "cuda_runtime.h"
#include <cuda.h>
#include <cusolverDn.h>

/*******************/
/* iDivUp FUNCTION */
/*******************/
extern "C" int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}

extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }

/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
static const char *_cudaGetErrorEnum(cusolverStatus_t error)
{
    switch (error)
    {
        case CUSOLVER_STATUS_SUCCESS:
            return "CUSOLVER_SUCCESS";
        case CUSOLVER_STATUS_NOT_INITIALIZED:
            return "CUSOLVER_STATUS_NOT_INITIALIZED";
        case CUSOLVER_STATUS_ALLOC_FAILED:
            return "CUSOLVER_STATUS_ALLOC_FAILED";
        case CUSOLVER_STATUS_INVALID_VALUE:
            return "CUSOLVER_STATUS_INVALID_VALUE";
        case CUSOLVER_STATUS_ARCH_MISMATCH:
            return "CUSOLVER_STATUS_ARCH_MISMATCH";
        case CUSOLVER_STATUS_EXECUTION_FAILED:
            return "CUSOLVER_STATUS_EXECUTION_FAILED";
        case CUSOLVER_STATUS_INTERNAL_ERROR:
            return "CUSOLVER_STATUS_INTERNAL_ERROR";
        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
    }
    return "<unknown>";
}

inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
    if (CUSOLVER_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUSOLVE error in file '%s', line %d, error %d: %s\nterminating!\n",
                file, line, err, _cudaGetErrorEnum(err));
        cudaDeviceReset();
        assert(0);
    }
}

extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }
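As mentioned at the top of the answer, the single-matrix example can be adapted to many small matrices by giving each decomposition its own stream. Below is a minimal, untested sketch of that idea, reusing the gpuErrchk / cusolveSafeCall helpers from utilities.cu above. The number of matrices N, the matrix sizes and the fill pattern are illustrative only; whether the individual gesvd calls actually overlap depends on the cuSOLVER version and the problem sizes.

#include <cstdio>
#include <cmath>

#include <cuda_runtime.h>
#include <cusolverDn.h>

#include "utilities.cuh"

int main() {

    const int N     = 8;   // --- Number of small matrices (illustrative)
    const int Nrows = 7;   // --- gesvd requires Nrows >= Ncols
    const int Ncols = 5;

    cudaStream_t       streams[N];
    cusolverDnHandle_t handles[N];
    double *d_A[N], *d_S[N], *d_U[N], *d_V[N], *d_work[N];
    int    *devInfo[N];
    int     work_size = 0;

    double h_A[Nrows * Ncols];

    for (int k = 0; k < N; k++) {

        gpuErrchk(cudaStreamCreate(&streams[k]));
        cusolveSafeCall(cusolverDnCreate(&handles[k]));
        // --- Bind the k-th stream to the k-th handle
        cusolveSafeCall(cusolverDnSetStream(handles[k], streams[k]));

        // --- Per-matrix device storage; each call gets its own workspace and devInfo
        gpuErrchk(cudaMalloc(&d_A[k], Nrows * Ncols * sizeof(double)));
        gpuErrchk(cudaMalloc(&d_S[k], Ncols         * sizeof(double)));
        gpuErrchk(cudaMalloc(&d_U[k], Nrows * Nrows * sizeof(double)));
        gpuErrchk(cudaMalloc(&d_V[k], Ncols * Ncols * sizeof(double)));
        gpuErrchk(cudaMalloc(&devInfo[k], sizeof(int)));

        // --- Fill the k-th matrix with some test data (column major memory ordering)
        for (int j = 0; j < Nrows; j++)
            for (int i = 0; i < Ncols; i++)
                h_A[j + i * Nrows] = (k + 1) * sqrt((double)(i + j));
        gpuErrchk(cudaMemcpy(d_A[k], h_A, Nrows * Ncols * sizeof(double), cudaMemcpyHostToDevice));

        cusolveSafeCall(cusolverDnDgesvd_bufferSize(handles[k], Nrows, Ncols, &work_size));
        gpuErrchk(cudaMalloc(&d_work[k], work_size * sizeof(double)));
    }

    // --- Queue one SVD per stream; the decompositions can then potentially overlap on the device
    for (int k = 0; k < N; k++)
        cusolveSafeCall(cusolverDnDgesvd(handles[k], 'A', 'A', Nrows, Ncols,
                                         d_A[k], Nrows, d_S[k], d_U[k], Nrows,
                                         d_V[k], Ncols, d_work[k], work_size,
                                         NULL, devInfo[k]));

    gpuErrchk(cudaDeviceSynchronize());

    for (int k = 0; k < N; k++) { cusolverDnDestroy(handles[k]); cudaStreamDestroy(streams[k]); }

    printf("All %d SVDs completed\n", N);

    return 0;
}

Each matrix is given its own workspace and devInfo buffer so that calls queued on different streams do not race on shared scratch memory; the results would be copied back and inspected exactly as in the single-matrix example above.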