Parallel implementation for multiple SVDs using CUDA


I'm new to parallel programming using GPUs, so I apologize if the question is broad or vague. I'm aware there is a parallel SVD function in the CULA library, but what should the strategy be if I have a large number of relatively small matrices to factorize? For example, I have n matrices of dimension d, with n large and d small. How can I parallelize this process? Could you give me a hint?

My previous answer is now out-of-date. As of February 2015, CUDA 7 (currently in release candidate version) offers full SVD capabilities in its cuSOLVER library. Below, I'm providing an example of generating the singular value decomposition using CUDA's cuSOLVER.

Concerning the specific issue you raise (calculating the SVD of several matrices of small size), you should adapt the example I'm providing below by using streams. To associate a stream to each task you can use

cudaStreamCreate() 

and

cusolverDnSetStream() 

kernel.cu

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>
#include <iomanip>
#include <algorithm>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <math.h>

#include <cusolverDn.h>
#include <cuda_runtime_api.h>

#include "Utilities.cuh"

/********/
/* MAIN */
/********/
// Computes the full SVD A = U * S * V^T of a small dense Nrows x Ncols
// double-precision matrix on the GPU with cuSOLVER's Dgesvd.
// Preconditions (from the gesvd interface): Nrows >= Ncols, and the
// matrix is stored in column-major order.
int main() {

    // --- gesvd only supports Nrows >= Ncols
    // --- column-major memory ordering

    const int Nrows = 7;
    const int Ncols = 5;

    // --- cuSOLVER input/output parameters/arrays
    int work_size = 0;
    int *devInfo;           gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));

    // --- Setting the host, Nrows x Ncols matrix (column-major: element
    //     (row j, col i) lives at h_A[j + i * Nrows])
    double *h_A = (double *)malloc(Nrows * Ncols * sizeof(double));
    for (int j = 0; j < Nrows; j++)
        for (int i = 0; i < Ncols; i++)
            h_A[j + i * Nrows] = (i + j * j) * sqrt((double)(i + j));

    // --- Setting the device matrix and moving the host matrix to the device
    double *d_A;            gpuErrchk(cudaMalloc(&d_A, Nrows * Ncols * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(double), cudaMemcpyHostToDevice));

    // --- Host-side SVD results space
    double *h_U = (double *)malloc(Nrows * Nrows            * sizeof(double));
    double *h_V = (double *)malloc(Ncols * Ncols            * sizeof(double));
    double *h_S = (double *)malloc(std::min(Nrows, Ncols)   * sizeof(double));

    // --- Device-side SVD workspace and matrices
    double *d_U;            gpuErrchk(cudaMalloc(&d_U, Nrows * Nrows          * sizeof(double)));
    double *d_V;            gpuErrchk(cudaMalloc(&d_V, Ncols * Ncols          * sizeof(double)));
    double *d_S;            gpuErrchk(cudaMalloc(&d_S, std::min(Nrows, Ncols) * sizeof(double)));

    // --- CUDA SVD initialization: query the required workspace size
    cusolveSafeCall(cusolverDnDgesvd_bufferSize(solver_handle, Nrows, Ncols, &work_size));
    double *work;   gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));

    // --- CUDA SVD execution.
    //     jobu = jobvt = 'A' requests all columns of U and all rows of V^T.
    //     NOTE: gesvd writes V^T (not V) into the d_V buffer.
    cusolveSafeCall(cusolverDnDgesvd(solver_handle, 'A', 'A', Nrows, Ncols, d_A, Nrows,
                                     d_S, d_U, Nrows, d_V, Ncols, work, work_size, NULL, devInfo));
    int devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h != 0) std::cout << "Unsuccessful SVD execution\n\n";

    // --- Moving the results from device to host
    gpuErrchk(cudaMemcpy(h_S, d_S, std::min(Nrows, Ncols) * sizeof(double), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_U, d_U, Nrows * Nrows          * sizeof(double), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_V, d_V, Ncols * Ncols          * sizeof(double), cudaMemcpyDeviceToHost));

    std::cout << "Singular values\n";
    for (int i = 0; i < std::min(Nrows, Ncols); i++)
        std::cout << "d_S[" << i << "] = " << std::setprecision(15) << h_S[i] << std::endl;

    std::cout << "\nLeft singular vectors - For y = A * x, the columns of U span the space of y\n";
    for (int j = 0; j < Nrows; j++) {
        printf("\n");
        for (int i = 0; i < Nrows; i++)
            printf("U[%i,%i]=%f\n", i, j, h_U[j * Nrows + i]);
    }

    std::cout << "\nRight singular vectors - For y = A * x, the columns of V span the space of x\n";
    for (int i = 0; i < Ncols; i++) {
        printf("\n");
        for (int j = 0; j < Ncols; j++)
            printf("V[%i,%i]=%f\n", i, j, h_V[j * Ncols + i]);
    }

    // --- Release device and host resources (missing in the original)
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(d_A));
    gpuErrchk(cudaFree(devInfo));
    free(h_S);
    free(h_V);
    free(h_U);
    free(h_A);

    cusolveSafeCall(cusolverDnDestroy(solver_handle));

    return 0;
}

utilities.cuh

#ifndef UTILITIES_CUH
#define UTILITIES_CUH

// Headers declaring the types used below (missing in the original, which
// also had the type names lowercased into nonexistent identifiers).
#include <cuda_runtime.h>   // cudaError_t
#include <cusolverDn.h>     // cusolverStatus_t

// Integer ceiling division: smallest k such that k * b >= a (for positive a, b).
extern "C" int iDivUp(int, int);
// Aborts with a diagnostic message if the CUDA runtime status is not cudaSuccess.
extern "C" void gpuErrchk(cudaError_t);
// Aborts with a diagnostic message if the cuSOLVER status is not CUSOLVER_STATUS_SUCCESS.
extern "C" void cusolveSafeCall(cusolverStatus_t);

#endif

utilities.cu

#include <stdio.h>
#include <assert.h>

#include "cuda_runtime.h"
#include <cuda.h>

#include <cusolverDn.h>

/*******************/
/* iDivUp FUNCTION */
/*******************/
// Integer ceiling division: smallest k such that k * b >= a (for positive a, b).
extern "C" int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
// Prints the CUDA error string with the given file/line location and, when
// abort is true, terminates the process with the error code.
// (file is const char* — the original's char* would reject __FILE__ literals
// under a conforming C++ compiler.)
static void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}

// NOTE(review): because this is a function rather than a macro, __FILE__ and
// __LINE__ expand here, so the reported location is always this file, not the
// caller's. A macro wrapper would give accurate call-site reporting; kept as a
// function to preserve the extern "C" interface declared in Utilities.cuh.
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert(ans, __FILE__, __LINE__); }

/***************************/
/* CUSOLVER ERROR CHECKING */
/***************************/
// Maps a cusolverStatus_t code to its human-readable enumerator name.
static const char *_cudaGetErrorEnum(cusolverStatus_t error)
{
    switch (error)
    {
        case CUSOLVER_STATUS_SUCCESS:
            return "CUSOLVER_STATUS_SUCCESS";

        case CUSOLVER_STATUS_NOT_INITIALIZED:
            return "CUSOLVER_STATUS_NOT_INITIALIZED";

        case CUSOLVER_STATUS_ALLOC_FAILED:
            return "CUSOLVER_STATUS_ALLOC_FAILED";

        case CUSOLVER_STATUS_INVALID_VALUE:
            return "CUSOLVER_STATUS_INVALID_VALUE";

        case CUSOLVER_STATUS_ARCH_MISMATCH:
            return "CUSOLVER_STATUS_ARCH_MISMATCH";

        case CUSOLVER_STATUS_EXECUTION_FAILED:
            return "CUSOLVER_STATUS_EXECUTION_FAILED";

        case CUSOLVER_STATUS_INTERNAL_ERROR:
            return "CUSOLVER_STATUS_INTERNAL_ERROR";

        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
    }

    return "<unknown>";
}

// Prints a diagnostic and aborts if a cuSOLVER call did not return success.
// Fixed relative to the original: the file/line parameters are now used
// (instead of __FILE__/__LINE__, which reported this file rather than the
// call site), and the fprintf arguments now match the format specifiers
// (%d gets the numeric code, %s gets its name — they were swapped before).
inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
    if (CUSOLVER_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUSOLVE error in file '%s', line %d\nerror %d: %s\nterminating!\n",
                file, line, (int)err, _cudaGetErrorEnum(err));
        cudaDeviceReset();
        assert(0);
    }
}

extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }

Comments

Popular posts from this blog

javascript - AngularJS custom datepicker directive -

javascript - jQuery date picker - Disable dates after the selection from the first date picker -