NICE
Northeastern Interactive Clustering Engine
gpu_operations.h
Go to the documentation of this file.
1 // The MIT License (MIT)
2 //
3 // Copyright (c) 2016 Northeastern University
4 //
5 // Permission is hereby granted, free of charge, to any person obtaining a copy
6 // of this software and associated documentation files (the "Software"), to deal
7 // in the Software without restriction, including without limitation the rights
8 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 // copies of the Software, and to permit persons to whom the Software is
10 // furnished to do so, subject to the following conditions:
11 //
12 // The above copyright notice and this permission notice shall be included in
13 // all copies or substantial portions of the Software.
14 //
15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 // SOFTWARE.
22 
23 #ifndef CPP_INCLUDE_GPU_OPERATIONS_H_
24 #define CPP_INCLUDE_GPU_OPERATIONS_H_
25 #ifdef NEED_CUDA
26 
27 #include <stdlib.h>
28 #include <time.h>
29 #include<cuda_runtime.h>
30 #include<device_launch_parameters.h>
31 #include<cuda_runtime_api.h>
32 #include <cublas_v2.h>
33 #include <cusolverDn.h>
34 #include <unistd.h>
35 #include <stdexcept>
36 #include <ctime>
37 
38 #include <iostream>
39 
40 #include "include/matrix.h"
41 #include "include/vector.h"
42 #include "include/gpu_util.h"
43 #include "include/gpu_svd_solver.h"
44 
45 namespace Nice {
46 
47 // Abstract class of common matrix operation interface
48 template <typename T>
50  public:
51  static Matrix<T> Multiply(const Matrix<T> &a, const T &scalar) {
52  // Alocate and transfer memories
53  int n = a.cols() * a.rows();
54  const T * h_a = &a(0);
55  Matrix<T> h_c(a.rows(), a.cols());
56  T * d_a; gpuErrchk(cudaMalloc(&d_a, n * sizeof(T)));
57  gpuErrchk(cudaMemcpy(d_a, h_a, n * sizeof(T),
58  cudaMemcpyHostToDevice));
59 
60  // Set up and do cublas matrix scalar multiply
61  cublasStatus_t stat;
62  cublasHandle_t handle;
63  cublasCreate(&handle);
64  stat = GpuMatrixScalarMul(handle, n, scalar, d_a);
65 
66  // Error check
67  if (stat != CUBLAS_STATUS_SUCCESS) {
68  std::cerr << "GPU Matrix Scalar Multiply Internal Failure" << std::endl;
69  cudaFree(d_a);
70  cublasDestroy(handle);
71  exit(1);
72  }
73  cudaDeviceSynchronize();
74 
75  // Transfer memories back, clear memories, and return result
76  gpuErrchk(cudaMemcpy(&h_c(0, 0), d_a, n * sizeof(T),
77  cudaMemcpyDeviceToHost));
78  cudaFree(d_a);
79  cublasDestroy(handle);
80  return h_c;
81  }
82 
83  static Matrix<T> Multiply(const Matrix<T> &a, const Matrix<T> &b) {
84  if (a.cols() == b.rows()) { // Check if matricies k vals are equal
85  // Allocate and transfer memories
86  int m = a.rows();
87  int n = b.cols();
88  int k = a.cols();
89 
90  const T * h_a = &a(0);
91  const T * h_b = &b(0);
92  Matrix<T> h_c(m, n);
93 
94  T * d_a; gpuErrchk(cudaMalloc(&d_a, m * k * sizeof(T)));
95  T * d_b; gpuErrchk(cudaMalloc(&d_b, k * n * sizeof(T)));
96  T * d_c; gpuErrchk(cudaMalloc(&d_c, m * n * sizeof(T)));
97 
98  gpuErrchk(cudaMemcpy(d_a, h_a, m * k * sizeof(T),
99  cudaMemcpyHostToDevice));
100  gpuErrchk(cudaMemcpy(d_b, h_b, k * n * sizeof(T),
101  cudaMemcpyHostToDevice));
102 
103  // Set up and do cublas matrix multiply
104  cublasStatus_t stat;
105  cublasHandle_t handle;
106  cublasCreate(&handle);
107  stat = GpuMatrixMatrixMul(handle, m, n, k, d_a, d_b, d_c);
108 
109  // Error check
110  if (stat != CUBLAS_STATUS_SUCCESS) {
111  std::cerr << "GPU Matrix Matrix Multiply Internal Failure" << std::endl;
112  cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
113  cublasDestroy(handle);
114  exit(1);
115  }
116  cudaDeviceSynchronize();
117  // Transfer memories back, clear memrory, and return result
118  gpuErrchk(cudaMemcpy(&h_c(0, 0), d_c, m * n * sizeof(T),
119  cudaMemcpyDeviceToHost));
120  cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
121  cublasDestroy(handle);
122  return h_c;
123  } else {
124  std::cerr << "Matricies in gpu matrix multiply's sizes aren't compatible"
125  << std::endl;
126  exit(1);
127  }
128  }
129 
139  static Matrix<T> Add(const Matrix<T> &a, const T &scalar) {
140  int m = a.rows();
141  int n = a.cols();
142  int lda = m;
143  int ldb = m;
144  int ldc = m;
145 
146  T alpha = 1.0;
147  T beta = 1.0;
148 
149  const T * h_a = &a(0);
150  Matrix<T> b(m, n);
151  b = Matrix<T>::Constant(m, n, scalar);
152  const T * h_b = &b(0);
153  Matrix<T> h_c(m, n);
154 
155  T * d_a; gpuErrchk(cudaMalloc(&d_a, m * n * sizeof(T)));
156  T * d_b; gpuErrchk(cudaMalloc(&d_b, m * n * sizeof(T)));
157  T * d_c; gpuErrchk(cudaMalloc(&d_c, m * n * sizeof(T)));
158 
159  gpuErrchk(cudaMemcpy(d_a, h_a, m * n * sizeof(T),
160  cudaMemcpyHostToDevice));
161  gpuErrchk(cudaMemcpy(d_b, h_b, m * n * sizeof(T),
162  cudaMemcpyHostToDevice));
163 
164  cublasStatus_t stat;
165  cublasHandle_t handle;
166  cublasCreate(&handle);
167  stat = GpuMatrixAdd(handle,
168  m, n,
169  &alpha,
170  d_a, lda,
171  &beta,
172  d_b, ldb,
173  d_c, ldc);
174  if (stat != CUBLAS_STATUS_SUCCESS) {
175  std::cerr << "GPU Matrix Add Internal Failure" << std::endl;
176  cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
177  cublasDestroy(handle);
178  exit(1);
179  }
180  cudaDeviceSynchronize();
181  gpuErrchk(cudaMemcpy(&h_c(0, 0), d_c, m * n * sizeof(T),
182  cudaMemcpyDeviceToHost));
183  cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
184  cublasDestroy(handle);
185  return h_c;
186  }
187 
197  static Matrix<T> Add(const Matrix<T> &a, const Matrix<T> &b) {
198  if (a.rows() == b.rows() && a.cols() == b.cols()) {
199  int m = a.rows();
200  int n = b.cols();
201  int lda = m;
202  int ldb = n;
203  int ldc = m;
204 
205  T alpha = 1.0;
206  T beta = 1.0;
207 
208  const T * h_a = &a(0);
209  const T * h_b = &b(0);
210  Matrix<T> h_c(m, n);
211 
212  T * d_a; gpuErrchk(cudaMalloc(&d_a, m * n * sizeof(T)));
213  T * d_b; gpuErrchk(cudaMalloc(&d_b, m * n * sizeof(T)));
214  T * d_c; gpuErrchk(cudaMalloc(&d_c, m * n * sizeof(T)));
215 
216  gpuErrchk(cudaMemcpy(d_a, h_a, m * n * sizeof(T),
217  cudaMemcpyHostToDevice));
218  gpuErrchk(cudaMemcpy(d_b, h_b, m * n * sizeof(T),
219  cudaMemcpyHostToDevice));
220 
221  cublasStatus_t stat;
222  cublasHandle_t handle;
223  cublasCreate(&handle);
224  stat = GpuMatrixAdd(handle,
225  m, n,
226  &alpha,
227  d_a, lda,
228  &beta,
229  d_b, ldb,
230  d_c, ldc);
231  if (stat != CUBLAS_STATUS_SUCCESS) {
232  std::cerr << "GPU Matrix Add Internal Failure" << std::endl;
233  cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
234  cublasDestroy(handle);
235  exit(1);
236  }
237  cudaDeviceSynchronize();
238  gpuErrchk(cudaMemcpy(&h_c(0, 0), d_c, m * n * sizeof(T),
239  cudaMemcpyDeviceToHost));
240  cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
241  cublasDestroy(handle);
242  return h_c;
243  } else {
244  std::cerr << "Matricies in gpu matrix add's sizes aren't compatible"
245  << std::endl;
246  exit(1);
247  }
248  }
  // Presumably element-wise scalar subtraction (a - scalar) — declaration
  // only; the definition is not visible in this header. TODO confirm.
  static Matrix<T> Subtract(const Matrix<T> &a, const T &scalar);
250  static Matrix<T> Subtract(const Matrix<T> &a, const Matrix<T> &b) {
251  // If the matricies aren't identical sizes then we cannot subtract them.
252  if (a.rows() != b.rows() || a.cols() != b.cols()) {
253  std::cerr << "Matricies are not the same size" << std::endl;
254  exit(1);
255 
256  // If the matricies are empty this function should not run.
257  } else if (a.rows() == 0) {
258  std::cerr << "Matricies are empty" << std::endl;
259  exit(1);
260 
261  // Otherwise, everything should run fine.
262  } else {
263  // Allocate and Transfer Memory
264  int m = a.rows();
265  int n = a.cols();
266  int lda = m;
267  int ldb = n;
268  int ldc = m;
269  T alpha = 1.0;
270  T beta = -1.0;
271 
272  const T * h_a = &a(0);
273  const T * h_b = &b(0);
274  Matrix<T> h_c(m, n);
275 
276  T * d_a; gpuErrchk(cudaMalloc(&d_a, m * n * sizeof(T)));
277  T * d_b; gpuErrchk(cudaMalloc(&d_b, m * n * sizeof(T)));
278  T * d_c; gpuErrchk(cudaMalloc(&d_c, m * n * sizeof(T)));
279 
280  gpuErrchk(cudaMemcpy(d_a, h_a, m * n * sizeof(T),
281  cudaMemcpyHostToDevice));
282  gpuErrchk(cudaMemcpy(d_b, h_b, m * n * sizeof(T),
283  cudaMemcpyHostToDevice));
284 
285  // Set up and do cublas matrix subtract
286  cublasStatus_t stat;
287  cublasHandle_t handle;
288  cublasCreate(&handle);
289  stat = GpuMatrixMatrixSub(handle, m, n, &alpha, d_a, lda,
290  &beta, d_b, ldb, d_c, ldc);
291 
292  // Error Check
293  if (stat != CUBLAS_STATUS_SUCCESS) {
294  std::cerr << "GPU Matrix Subtract Internal Failure" << std::endl;
295  cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
296  cublasDestroy(handle);
297  exit(1);
298  }
299  cudaDeviceSynchronize();
300  // Transfer memory back and clear it
301  gpuErrchk(cudaMemcpy(&h_c(0, 0), d_c, m * n * sizeof(T),
302  cudaMemcpyDeviceToHost));
303  cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
304  cublasDestroy(handle);
305  // Return result
306  return h_c;
307  }
308  }
317  static Matrix<T> Inverse(const Matrix<T> &a) {
318  // Sanity Check
319  if (a.rows() != a.cols()) {
320  std::cerr << "Matrix is singular" << std::endl;
321  exit(1);
322  }
323 
324  // Get the row/column number
325  int n = a.rows();
326 
327  // Create host memory
328  const T *h_a = &a(0);
329 
330  // Create device memory needed
331  T *d_a;
332  int *d_ipiv;
333  int *d_info;
334  gpuErrchk(cudaMalloc(&d_a, n * n * sizeof(T)));
335  gpuErrchk(cudaMalloc(&d_ipiv, n * sizeof(T)));
336  gpuErrchk(cudaMalloc(&d_info, sizeof(T)));
337 
338  // Copy host memory over to device
339  gpuErrchk(cudaMemcpy(d_a, h_a, n * n * sizeof(T),
340  cudaMemcpyHostToDevice));
341 
342  // Setup cusolver parameters
343  cusolverDnHandle_t handle;
344  cusolverDnCreate(&handle);
345  cusolverStatus_t stat;
346  int lda = n;
347  int nrhs = n;
348  int ldb = lda;
349 
350  // Setup workspace for LU decomposition
351  int workspace_size;
352  stat = GpuGetLUDecompWorkspace(handle, n, n, d_a, lda, &workspace_size);
353  if (stat != CUSOLVER_STATUS_SUCCESS) {
354  std::cerr << "LU decomposition: Workspace allocation failed"
355  << std::endl;
356  cudaFree(d_a);
357  cudaFree(d_ipiv);
358  cudaFree(d_info);
359  exit(1);
360  }
361 
362  T *workspace;
363  gpuErrchk(cudaMalloc(&workspace, workspace_size * sizeof(T)));
364 
365  // Do LU docomposition
366  stat = GpuLUDecomposition(handle, n, n, d_a, lda,
367  workspace, d_ipiv, d_info);
368  if (stat != CUSOLVER_STATUS_SUCCESS) {
369  std::cerr << "LU decomposition: decomposition failed"
370  << std::endl;
371  cudaFree(d_a);
372  cudaFree(d_ipiv);
373  cudaFree(d_info);
374  cudaFree(workspace);
375  exit(1);
376  }
377  cudaFree(workspace);
378 
379  // Create an identity matrix
380  Matrix<T> b = Matrix<T>::Identity(n, n);
381 
382  // Create host memory
383  T *h_b = &b(0);
384 
385  // Create device memory needed
386  T *d_b;
387  gpuErrchk(cudaMalloc(&d_b, n * n * sizeof(T)));
388 
389  // Copy host memory over to device
390  gpuErrchk(cudaMemcpy(d_b, h_b, n * n * sizeof(T),
391  cudaMemcpyHostToDevice));
392 
393  // Do lineaer solver
394  stat = GpuLinearSolver(handle, CUBLAS_OP_N, n, nrhs, d_a, lda, d_ipiv, d_b,
395  ldb, d_info);
396  if (stat != CUSOLVER_STATUS_SUCCESS) {
397  std::cerr << "Linear solver failed"
398  << std::endl;
399  cudaFree(d_a);
400  cudaFree(d_ipiv);
401  cudaFree(d_info);
402  cudaFree(d_b);
403  exit(1);
404  }
405 
406  // Copy device result over to host
407  gpuErrchk(cudaMemcpy(h_b, d_b, n * n * sizeof(T),
408  cudaMemcpyDeviceToHost));
409 
410  // Synchonize and clean up
411  cudaDeviceSynchronize();
412  cudaFree(d_a);
413  cudaFree(d_ipiv);
414  cudaFree(d_info);
415  cudaFree(d_b);
416 
417  // Destroy the handle
418  cusolverDnDestroy(handle);
419 
420  // Return the result
421  return b;
422  }
423  static Vector<T> Norm(const Matrix<T> a, const int &p = 2,
424  const int &axis = 0) {
425  int m = a.rows();
426  int n = a.cols();
427  int incx = 1;
428  Vector<T> c(n);
429  const T * h_a = &a(0);
430 
431  // Allocate and transfer memories
432  T * h_c = reinterpret_cast<T *>(malloc(sizeof(T)));
433  T * d_a; gpuErrchk(cudaMalloc(&d_a, m * n * sizeof(T)));
434  T * d_t; gpuErrchk(cudaMalloc(&d_t, m * sizeof(T)));
435  gpuErrchk(cudaMemcpy(d_a, h_a, m * n * sizeof(T), cudaMemcpyHostToDevice));
436 
437  // Setup and do Frobenious Norm
438  cublasHandle_t handle;
439  cublasCreate(&handle);
440  cublasStatus_t stat;
441  int iter = 0;
442  for (int i = 0; i < n; ++i) {
443  gpuErrchk(cudaMemcpy(d_t, d_a + i * m, m * sizeof(T),
444  cudaMemcpyDeviceToDevice));
445  stat = GpuFrobeniusNorm(handle, m, incx, d_t, h_c);
446  // Error Check
447  if (stat != CUBLAS_STATUS_SUCCESS) {
448  std::cerr << "GPU Matrix Norm Internal Failure"
449  << std::endl;
450  cudaFree(d_a); free(h_c);
451  cublasDestroy(handle);
452  exit(1);
453  }
454  cudaDeviceSynchronize();
455  c(iter) = *h_c;
456  iter++;
457  }
458  // Free memories and return answer
459  cudaFree(d_a); free(h_c);
460  cublasDestroy(handle);
461  return c;
462  }
  // Computes the determinant of a on the GPU via LU decomposition:
  // det(A) = (-1)^(number of row swaps) * product of U's diagonal.
  // NOTE(review): assumes a is square (m == n); no explicit check is made.
  static T Determinant(const Matrix<T> &a) {
    int m = a.rows();
    int n = a.cols();
    const T *h_a = &a(0);
    T det;

    // Allocating and transfering memories. The pivot buffer is
    // zero-initialized so unused tail entries read back as 0, which the
    // swap-counting loop below uses as a terminator.
    T *d_a; gpuErrchk(cudaMalloc(&d_a, m * n * sizeof(T)));
    int *devIpiv_h = reinterpret_cast<int *>(malloc(m * n * sizeof(int)));
    int *devIpiv_d; gpuErrchk(cudaMalloc(&devIpiv_d, m * n * sizeof(int)));
    cudaMemset(devIpiv_d, 0, m * n * sizeof(int));
    int devInfo_h = 0;
    int *devInfo_d; gpuErrchk(cudaMalloc(&devInfo_d, sizeof(int)));
    // h_c receives the LU factors back from the device after factorization
    T *h_c = reinterpret_cast<T *>(malloc(m * n *sizeof(T)));
    gpuErrchk(cudaMemcpy(d_a, h_a, m * n * sizeof(T), cudaMemcpyHostToDevice));

    // Query the workspace size needed for the LU decomposition
    int work_size = 0;
    cusolverStatus_t stat;
    cusolverDnHandle_t handle;
    cusolverDnCreate(&handle);
    stat = GpuLuWorkspace(handle, m, n, d_a, &work_size);

    // Abort if the workspace query fails, releasing everything first
    if (stat != CUSOLVER_STATUS_SUCCESS) {
      std::cout << "Initialization of determinant buffer failed." << std::endl;
      cudaFree(d_a); cudaFree(devIpiv_d); free(devIpiv_h); free(h_c);
      cudaFree(devInfo_d);
      cusolverDnDestroy(handle);
      exit(1);
    }

    // Allocate LU decomposition workspace memory and do LU decomposistion
    // (d_a is factorized in place; pivots land in devIpiv_d)
    T *workspace; gpuErrchk(cudaMalloc(&workspace, work_size * sizeof(T)));
    stat = GpuDeterminant(handle, m, n, d_a, workspace, devIpiv_d, devInfo_d);

    // Error check: a nonzero devInfo signals a failed factorization
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo_d, sizeof(int),
        cudaMemcpyDeviceToHost));
    if (stat != CUSOLVER_STATUS_SUCCESS || devInfo_h != 0) {
      std::cerr << "GPU Determinant Internal Failure" << std::endl;
      cudaFree(d_a); cudaFree(devIpiv_d); free(devIpiv_h); free(h_c);
      cudaFree(devInfo_d); cudaFree(workspace);
      cusolverDnDestroy(handle);
      exit(1);
    }
    cudaDeviceSynchronize();

    // Transfer the pivot indices and the LU factors back to the host
    gpuErrchk(cudaMemcpy(devIpiv_h, devIpiv_d, m * n * sizeof(int),
        cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_c, d_a, m * n * sizeof(T),
        cudaMemcpyDeviceToHost));

    // Count number of swaps: pivot entries are 1-based, so entry i differing
    // from i+1 means row i was swapped; a 0 entry marks the end of the
    // valid pivots. If the count is odd the determinant's sign flips.
    int cnt = 0;
    for (int i = 0; i < m*n; ++i) {
      if (*(devIpiv_h+i) == 0) break;
      if (*(devIpiv_h+i) != i+1) cnt++;
    }

    // Determinant is the product of U's diagonal — entries at column-major
    // offset i*m + i — times (-1)^cnt
    det = *(h_c);
    for (int i = 1; i < m; ++i) {
      det = det * *(h_c + (i * m) + i);
    }
    if (cnt % 2 != 0) det = det * -1;  // if odd multiply by -1

    // Free memories and return answer
    cudaFree(d_a); cudaFree(devIpiv_d); free(devIpiv_h); free(h_c);
    cudaFree(devInfo_d); cudaFree(workspace);
    cusolverDnDestroy(handle);
    return det;
  }
537 
546  static int Rank(const Matrix<T> &a) {
547  // Obtain row echelon form through SVD
548  GpuSvdSolver<T> svd;
549  svd.Compute(a);
550 
551  // Obtain computed sigular vector
552  Vector<T> sigular_vector = svd.SingularValues();
553 
554  // Count non zero elements of sigular vector
555  int rank = 0;
556  for (int i = 0; i < sigular_vector.rows(); i++) {
557  if (sigular_vector[i] != 0)
558  rank++;
559  }
560 
561  return rank;
562  }
563  static T FrobeniusNorm(const Matrix<T> &a) {
564  int m = a.rows();
565  int n = a.cols();
566  int incx = 1;
567  const T * h_a = &a(0);
568 
569  //Traceocate and transfer memories
570  T * h_c = reinterpret_cast<T *>(malloc(sizeof(T)));
571  T * d_a; gpuErrchk(cudaMalloc(&d_a, m * n * sizeof(T)));
572  gpuErrchk(cudaMemcpy(d_a, h_a, m * n * sizeof(T), cudaMemcpyHostToDevice));
573 
574  // Setup and do Frobenious Norm
575  cublasHandle_t handle;
576  cublasCreate(&handle);
577  cublasStatus_t stat;
578  stat = GpuFrobeniusNorm(handle, n * m, incx, d_a, h_c);
579 
580  // Error Check
581  if (stat != CUBLAS_STATUS_SUCCESS) {
582  std::cerr << "GPU Matrix Frobenius Norm Internal Failure"
583  << std::endl;
584  cudaFree(d_a); free(h_c);
585  cublasDestroy(handle);
586  exit(1);
587  }
588  cudaDeviceSynchronize();
589 
590  // Free memories and return answer
591  cudaFree(d_a);
592  cublasDestroy(handle);
593  return *h_c;
594  }
595 
  // Computes the trace of a (the sum of its diagonal entries) on the GPU by
  // multiplying the diagonal vector against a vector of ones with gemv.
  static T Trace(const Matrix<T> &a) {
    // Get the diagonal vector
    Vector<T> diagonal_vector = a.diagonal();

    // Get the number of elements in diagonal vector
    int m = diagonal_vector.rows();

    // Create host memory: the diagonal data and an all-ones multiplier
    const T *h_a = &diagonal_vector(0);
    T *h_multiplier = new T[m];
    for (int i = 0; i < m; i++)
      h_multiplier[i] = 1.0;
    T h_result;

    // Create device memory from host memory
    T *d_a;
    T *d_multiplier;
    T *d_result;
    gpuErrchk(cudaMalloc(&d_a, m * sizeof(T)));
    gpuErrchk(cudaMalloc(&d_multiplier, m * sizeof(T)));
    gpuErrchk(cudaMalloc(&d_result, sizeof(T)));

    // Copy host memory over to device
    gpuErrchk(cudaMemcpy(d_a, h_a, m * sizeof(T),
        cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_multiplier, h_multiplier, m * sizeof(T),
        cudaMemcpyHostToDevice));

    // Create parameters for cublas wraper function. The diagonal is treated
    // as an m x 1 matrix and transposed (CUBLAS_OP_T), so gemv computes
    // (1 x m) * (m x 1) = sum of the diagonal entries.
    cublasStatus_t stat;
    cublasHandle_t handle;
    cublasCreate(&handle);
    cublasOperation_t trans = CUBLAS_OP_T;
    int n = 1;
    T alpha = 1.0;
    T beta = 0.0;
    int lda = m;
    int incx = 1;
    int incy = 1;

    // Do vector summation to obtain trace
    stat = GpuMatrixVectorMul(handle, trans, m, n, &alpha,
        d_a, lda, d_multiplier, incx, &beta, d_result, incy);

    // Error check: clean up and abort on any cuBLAS failure
    if (stat != CUBLAS_STATUS_SUCCESS) {
      std::cerr << "GPU Trace Internal Failure" << std::endl;
      cudaFree(d_a);
      cudaFree(d_multiplier);
      cudaFree(d_result);
      cublasDestroy(handle);
      exit(1);
    }

    // Copy device result over to host
    gpuErrchk(cudaMemcpy(&h_result, d_result, sizeof(T),
        cudaMemcpyDeviceToHost));

    // Synchonize and clean up
    cudaDeviceSynchronize();
    cudaFree(d_a);
    cudaFree(d_multiplier);
    cudaFree(d_result);
    delete []h_multiplier;

    // Destroy the handle
    cublasDestroy(handle);

    // Return the result
    return h_result;
  }
675  static T DotProduct(const Vector<T> &a, const Vector<T> &b) {
676  int n = a.rows();
677 
678  // Allocate and transfer memories
679  const T * h_a = &a(0);
680  const T * h_b = &b(0);
681  T * h_c = reinterpret_cast<T *>(malloc(sizeof(T)));
682 
683  T * d_a; gpuErrchk(cudaMalloc(&d_a, n * sizeof(T)));
684  T * d_b; gpuErrchk(cudaMalloc(&d_b, n * sizeof(T)));
685 
686  gpuErrchk(cudaMemcpy(d_a, h_a, n * sizeof(T), cudaMemcpyHostToDevice));
687  gpuErrchk(cudaMemcpy(d_b, h_b, n * sizeof(T), cudaMemcpyHostToDevice));
688 
689  // Setup and do dot product
690  cublasHandle_t handle;
691  cublasCreate(&handle);
692  cublasStatus_t stat;
693  stat = GpuVectorVectorDot(handle, n, d_a, d_b, h_c);
694 
695  // Error Check
696  if (stat != CUBLAS_STATUS_SUCCESS) {
697  std::cerr << "GPU Vector Vector Dot Product Internal Failure"
698  << std::endl;
699  cudaFree(d_a); cudaFree(d_b);
700  cublasDestroy(handle);
701  }
702  cudaDeviceSynchronize();
703 
704  // Free memories and return result
705  cudaFree(d_a); cudaFree(d_b);
706  cublasDestroy(handle);
707  return *h_c;
708  }
  // Computes the outer product of vectors a and b on the GPU, returning an
  // a.rows() x b.rows() matrix. Implemented as a GEMM with inner dimension
  // k = 1, so entry (i, j) is a(i) * b(j).
  static Matrix<T> OuterProduct(const Vector<T> &a, const Vector<T> &b) {
    // NOTE(review): for column vectors cols() is always 1, so this check is
    // trivially true — confirm whether a different compatibility check was
    // intended.
    if (a.cols() == b.cols()) {
      int m = a.rows();
      int n = b.rows();
      int k = 1;  // inner dimension of the (m x 1) * (1 x n) product

      // Allocate and transfer memories; b's contiguous storage doubles as
      // the 1 x n right-hand operand
      const T * h_a = &a(0);
      const T * h_b = &b(0);
      Matrix<T> h_c(m, n);

      T * d_a; gpuErrchk(cudaMalloc(&d_a, m * k * sizeof(T)));
      T * d_b; gpuErrchk(cudaMalloc(&d_b, k * n * sizeof(T)));
      T * d_c; gpuErrchk(cudaMalloc(&d_c, m * n * sizeof(T)));

      gpuErrchk(cudaMemcpy(d_a, h_a, m * k * sizeof(T),
          cudaMemcpyHostToDevice));
      gpuErrchk(cudaMemcpy(d_b, h_b, k * n * sizeof(T),
          cudaMemcpyHostToDevice));

      // Setup and do outer product multiply
      cublasStatus_t stat;
      cublasHandle_t handle;
      cublasCreate(&handle);
      stat = GpuMatrixMatrixMul(handle, m, n, k, d_a, d_b, d_c);

      // Error check: clean up and abort on any cuBLAS failure
      if (stat != CUBLAS_STATUS_SUCCESS) {
        std::cerr << "GPU Outer Product Internal Failure" << std::endl;
        cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
        cublasDestroy(handle);
        exit(1);
      }
      cudaDeviceSynchronize();

      // Transfer results back, clear memories, return answer
      gpuErrchk(cudaMemcpy(&h_c(0, 0), d_c, m * n * sizeof(T),
          cudaMemcpyDeviceToHost));
      cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
      cublasDestroy(handle);
      return h_c;
    } else {
      std::cerr << "Vectors in gpu outer product's sizes aren't compatible"
          << std::endl;
      exit(1);
    }
  }
756 };
757 } // namespace Nice
758 #endif // NEED_CUDA
759 #endif // CPP_INCLUDE_GPU_OPERATIONS_H_
760 
cublasStatus_t GpuMatrixScalarMul(cublasHandle_t handle, int n, const float &scalar, float *a)
static Matrix< T > OuterProduct(const Vector< T > &a, const Vector< T > &b)
Definition: gpu_operations.h:709
void gpuErrchk(cudaError_t)
static T DotProduct(const Vector< T > &a, const Vector< T > &b)
Definition: gpu_operations.h:675
static T FrobeniusNorm(const Matrix< T > &a)
Definition: gpu_operations.h:563
static Matrix< T > Multiply(const Matrix< T > &a, const Matrix< T > &b)
Definition: gpu_operations.h:83
cublasStatus_t GpuFrobeniusNorm(cublasHandle_t handle, int n, int incx, float *a, float *c)
static Matrix< T > Add(const Matrix< T > &a, const T &scalar)
Definition: gpu_operations.h:139
Definition: cpu_operations.h:36
static Matrix< T > Subtract(const Matrix< T > &a, const T &scalar)
cusolverStatus_t GpuLuWorkspace(cusolverDnHandle_t handle, int m, int n, float *a, int *worksize)
cublasStatus_t GpuVectorVectorDot(cublasHandle_t handle, int n, float *a, float *b, float *c)
Vector< T > SingularValues() const
Definition: gpu_svd_solver.h:117
static Vector< T > Norm(const Matrix< T > a, const int &p=2, const int &axis=0)
Definition: gpu_operations.h:423
cusolverStatus_t GpuDeterminant(cusolverDnHandle_t handle, int m, int n, float *a, float *workspace, int *devIpiv, int *devInfo)
cusolverStatus_t GpuLinearSolver(cusolverDnHandle_t handle, cublasOperation_t trans, int n, int nrhs, const float *A, int lda, const int *devIpiv, float *B, int ldb, int *devInfo)
cublasStatus_t GpuMatrixMatrixMul(cublasHandle_t handle, int m, int n, int k, float *a, float *b, float *c)
Definition: gpu_svd_solver.h:42
static T Determinant(const Matrix< T > &a)
Definition: gpu_operations.h:463
cublasStatus_t GpuMatrixAdd(cublasHandle_t handle, int m, int n, const float *alpha, const float *A, int lda, const float *beta, const float *B, int ldb, float *C, int ldc)
static Matrix< T > Subtract(const Matrix< T > &a, const Matrix< T > &b)
Definition: gpu_operations.h:250
Eigen::Matrix< T, Eigen::Dynamic, 1 > Vector
Definition: vector.h:31
cusolverStatus_t GpuGetLUDecompWorkspace(cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork)
cublasStatus_t GpuMatrixMatrixSub(cublasHandle_t handle, int m, int n, const float *alpha, float *a, int lda, const float *beta, float *b, int ldb, float *c, int ldc)
static Matrix< T > Multiply(const Matrix< T > &a, const T &scalar)
Definition: gpu_operations.h:51
Definition: gpu_operations.h:49
static Matrix< T > Inverse(const Matrix< T > &a)
Definition: gpu_operations.h:317
void Compute(const Matrix< T > &A)
Definition: gpu_svd_solver.h:51
Eigen::Matrix< T, Eigen::Dynamic, Eigen::Dynamic > Matrix
Definition: matrix.h:31
cusolverStatus_t GpuLUDecomposition(cusolverDnHandle_t handle, int m, int n, float *A, int lda, float *Workspace, int *devIpiv, int *devInfo)
static T Trace(const Matrix< T > &a)
Definition: gpu_operations.h:604
static Matrix< T > Add(const Matrix< T > &a, const Matrix< T > &b)
Definition: gpu_operations.h:197
static int Rank(const Matrix< T > &a)
Definition: gpu_operations.h:546
cublasStatus_t GpuMatrixVectorMul(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float *alpha, const float *A, int lda, const float *x, int incx, const float *beta, float *y, int incy)