23 #ifndef CPP_INCLUDE_GPU_OPERATIONS_H_ 24 #define CPP_INCLUDE_GPU_OPERATIONS_H_ 29 #include<cuda_runtime.h> 30 #include<device_launch_parameters.h> 31 #include<cuda_runtime_api.h> 32 #include <cublas_v2.h> 33 #include <cusolverDn.h> 53 int n = a.cols() * a.rows();
54 const T * h_a = &a(0);
56 T * d_a;
gpuErrchk(cudaMalloc(&d_a, n *
sizeof(T)));
57 gpuErrchk(cudaMemcpy(d_a, h_a, n *
sizeof(T),
58 cudaMemcpyHostToDevice));
62 cublasHandle_t handle;
63 cublasCreate(&handle);
67 if (stat != CUBLAS_STATUS_SUCCESS) {
68 std::cerr <<
"GPU Matrix Scalar Multiply Internal Failure" << std::endl;
70 cublasDestroy(handle);
73 cudaDeviceSynchronize();
76 gpuErrchk(cudaMemcpy(&h_c(0, 0), d_a, n *
sizeof(T),
77 cudaMemcpyDeviceToHost));
79 cublasDestroy(handle);
84 if (a.cols() == b.rows()) {
90 const T * h_a = &a(0);
91 const T * h_b = &b(0);
94 T * d_a;
gpuErrchk(cudaMalloc(&d_a, m * k *
sizeof(T)));
95 T * d_b;
gpuErrchk(cudaMalloc(&d_b, k * n *
sizeof(T)));
96 T * d_c;
gpuErrchk(cudaMalloc(&d_c, m * n *
sizeof(T)));
98 gpuErrchk(cudaMemcpy(d_a, h_a, m * k *
sizeof(T),
99 cudaMemcpyHostToDevice));
100 gpuErrchk(cudaMemcpy(d_b, h_b, k * n *
sizeof(T),
101 cudaMemcpyHostToDevice));
105 cublasHandle_t handle;
106 cublasCreate(&handle);
110 if (stat != CUBLAS_STATUS_SUCCESS) {
111 std::cerr <<
"GPU Matrix Matrix Multiply Internal Failure" << std::endl;
112 cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
113 cublasDestroy(handle);
116 cudaDeviceSynchronize();
118 gpuErrchk(cudaMemcpy(&h_c(0, 0), d_c, m * n *
sizeof(T),
119 cudaMemcpyDeviceToHost));
120 cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
121 cublasDestroy(handle);
124 std::cerr <<
"Matricies in gpu matrix multiply's sizes aren't compatible" 149 const T * h_a = &a(0);
152 const T * h_b = &b(0);
155 T * d_a;
gpuErrchk(cudaMalloc(&d_a, m * n *
sizeof(T)));
156 T * d_b;
gpuErrchk(cudaMalloc(&d_b, m * n *
sizeof(T)));
157 T * d_c;
gpuErrchk(cudaMalloc(&d_c, m * n *
sizeof(T)));
159 gpuErrchk(cudaMemcpy(d_a, h_a, m * n *
sizeof(T),
160 cudaMemcpyHostToDevice));
161 gpuErrchk(cudaMemcpy(d_b, h_b, m * n *
sizeof(T),
162 cudaMemcpyHostToDevice));
165 cublasHandle_t handle;
166 cublasCreate(&handle);
174 if (stat != CUBLAS_STATUS_SUCCESS) {
175 std::cerr <<
"GPU Matrix Add Internal Failure" << std::endl;
176 cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
177 cublasDestroy(handle);
180 cudaDeviceSynchronize();
181 gpuErrchk(cudaMemcpy(&h_c(0, 0), d_c, m * n *
sizeof(T),
182 cudaMemcpyDeviceToHost));
183 cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
184 cublasDestroy(handle);
198 if (a.rows() == b.rows() && a.cols() == b.cols()) {
208 const T * h_a = &a(0);
209 const T * h_b = &b(0);
212 T * d_a;
gpuErrchk(cudaMalloc(&d_a, m * n *
sizeof(T)));
213 T * d_b;
gpuErrchk(cudaMalloc(&d_b, m * n *
sizeof(T)));
214 T * d_c;
gpuErrchk(cudaMalloc(&d_c, m * n *
sizeof(T)));
216 gpuErrchk(cudaMemcpy(d_a, h_a, m * n *
sizeof(T),
217 cudaMemcpyHostToDevice));
218 gpuErrchk(cudaMemcpy(d_b, h_b, m * n *
sizeof(T),
219 cudaMemcpyHostToDevice));
222 cublasHandle_t handle;
223 cublasCreate(&handle);
231 if (stat != CUBLAS_STATUS_SUCCESS) {
232 std::cerr <<
"GPU Matrix Add Internal Failure" << std::endl;
233 cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
234 cublasDestroy(handle);
237 cudaDeviceSynchronize();
238 gpuErrchk(cudaMemcpy(&h_c(0, 0), d_c, m * n *
sizeof(T),
239 cudaMemcpyDeviceToHost));
240 cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
241 cublasDestroy(handle);
244 std::cerr <<
"Matricies in gpu matrix add's sizes aren't compatible" 252 if (a.rows() != b.rows() || a.cols() != b.cols()) {
253 std::cerr <<
"Matricies are not the same size" << std::endl;
257 }
else if (a.rows() == 0) {
258 std::cerr <<
"Matricies are empty" << std::endl;
272 const T * h_a = &a(0);
273 const T * h_b = &b(0);
276 T * d_a;
gpuErrchk(cudaMalloc(&d_a, m * n *
sizeof(T)));
277 T * d_b;
gpuErrchk(cudaMalloc(&d_b, m * n *
sizeof(T)));
278 T * d_c;
gpuErrchk(cudaMalloc(&d_c, m * n *
sizeof(T)));
280 gpuErrchk(cudaMemcpy(d_a, h_a, m * n *
sizeof(T),
281 cudaMemcpyHostToDevice));
282 gpuErrchk(cudaMemcpy(d_b, h_b, m * n *
sizeof(T),
283 cudaMemcpyHostToDevice));
287 cublasHandle_t handle;
288 cublasCreate(&handle);
290 &beta, d_b, ldb, d_c, ldc);
293 if (stat != CUBLAS_STATUS_SUCCESS) {
294 std::cerr <<
"GPU Matrix Subtract Internal Failure" << std::endl;
295 cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
296 cublasDestroy(handle);
299 cudaDeviceSynchronize();
301 gpuErrchk(cudaMemcpy(&h_c(0, 0), d_c, m * n *
sizeof(T),
302 cudaMemcpyDeviceToHost));
303 cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
304 cublasDestroy(handle);
319 if (a.rows() != a.cols()) {
320 std::cerr <<
"Matrix is singular" << std::endl;
328 const T *h_a = &a(0);
334 gpuErrchk(cudaMalloc(&d_a, n * n *
sizeof(T)));
335 gpuErrchk(cudaMalloc(&d_ipiv, n *
sizeof(T)));
336 gpuErrchk(cudaMalloc(&d_info,
sizeof(T)));
339 gpuErrchk(cudaMemcpy(d_a, h_a, n * n *
sizeof(T),
340 cudaMemcpyHostToDevice));
343 cusolverDnHandle_t handle;
344 cusolverDnCreate(&handle);
345 cusolverStatus_t stat;
353 if (stat != CUSOLVER_STATUS_SUCCESS) {
354 std::cerr <<
"LU decomposition: Workspace allocation failed" 363 gpuErrchk(cudaMalloc(&workspace, workspace_size *
sizeof(T)));
367 workspace, d_ipiv, d_info);
368 if (stat != CUSOLVER_STATUS_SUCCESS) {
369 std::cerr <<
"LU decomposition: decomposition failed" 387 gpuErrchk(cudaMalloc(&d_b, n * n *
sizeof(T)));
390 gpuErrchk(cudaMemcpy(d_b, h_b, n * n *
sizeof(T),
391 cudaMemcpyHostToDevice));
394 stat =
GpuLinearSolver(handle, CUBLAS_OP_N, n, nrhs, d_a, lda, d_ipiv, d_b,
396 if (stat != CUSOLVER_STATUS_SUCCESS) {
397 std::cerr <<
"Linear solver failed" 407 gpuErrchk(cudaMemcpy(h_b, d_b, n * n *
sizeof(T),
408 cudaMemcpyDeviceToHost));
411 cudaDeviceSynchronize();
418 cusolverDnDestroy(handle);
424 const int &axis = 0) {
429 const T * h_a = &a(0);
432 T * h_c =
reinterpret_cast<T *
>(malloc(
sizeof(T)));
433 T * d_a;
gpuErrchk(cudaMalloc(&d_a, m * n *
sizeof(T)));
434 T * d_t;
gpuErrchk(cudaMalloc(&d_t, m *
sizeof(T)));
435 gpuErrchk(cudaMemcpy(d_a, h_a, m * n *
sizeof(T), cudaMemcpyHostToDevice));
438 cublasHandle_t handle;
439 cublasCreate(&handle);
442 for (
int i = 0; i < n; ++i) {
443 gpuErrchk(cudaMemcpy(d_t, d_a + i * m, m *
sizeof(T),
444 cudaMemcpyDeviceToDevice));
447 if (stat != CUBLAS_STATUS_SUCCESS) {
448 std::cerr <<
"GPU Matrix Norm Internal Failure" 450 cudaFree(d_a); free(h_c);
451 cublasDestroy(handle);
454 cudaDeviceSynchronize();
459 cudaFree(d_a); free(h_c);
460 cublasDestroy(handle);
466 const T *h_a = &a(0);
470 T *d_a;
gpuErrchk(cudaMalloc(&d_a, m * n *
sizeof(T)));
471 int *devIpiv_h =
reinterpret_cast<int *
>(malloc(m * n *
sizeof(
int)));
472 int *devIpiv_d;
gpuErrchk(cudaMalloc(&devIpiv_d, m * n *
sizeof(
int)));
473 cudaMemset(devIpiv_d, 0, m * n *
sizeof(
int));
475 int *devInfo_d;
gpuErrchk(cudaMalloc(&devInfo_d,
sizeof(
int)));
476 T *h_c =
reinterpret_cast<T *
>(malloc(m * n *
sizeof(T)));
477 gpuErrchk(cudaMemcpy(d_a, h_a, m * n *
sizeof(T), cudaMemcpyHostToDevice));
481 cusolverStatus_t stat;
482 cusolverDnHandle_t handle;
483 cusolverDnCreate(&handle);
487 if (stat != CUSOLVER_STATUS_SUCCESS) {
488 std::cout <<
"Initialization of determinant buffer failed." << std::endl;
489 cudaFree(d_a); cudaFree(devIpiv_d); free(devIpiv_h); free(h_c);
491 cusolverDnDestroy(handle);
496 T *workspace;
gpuErrchk(cudaMalloc(&workspace, work_size *
sizeof(T)));
497 stat =
GpuDeterminant(handle, m, n, d_a, workspace, devIpiv_d, devInfo_d);
500 gpuErrchk(cudaMemcpy(&devInfo_h, devInfo_d,
sizeof(
int),
501 cudaMemcpyDeviceToHost));
502 if (stat != CUSOLVER_STATUS_SUCCESS || devInfo_h != 0) {
503 std::cerr <<
"GPU Determinant Internal Failure" << std::endl;
504 cudaFree(d_a); cudaFree(devIpiv_d); free(devIpiv_h); free(h_c);
505 cudaFree(devInfo_d); cudaFree(workspace);
506 cusolverDnDestroy(handle);
509 cudaDeviceSynchronize();
512 gpuErrchk(cudaMemcpy(devIpiv_h, devIpiv_d, m * n *
sizeof(
int),
513 cudaMemcpyDeviceToHost));
514 gpuErrchk(cudaMemcpy(h_c, d_a, m * n *
sizeof(T),
515 cudaMemcpyDeviceToHost));
519 for (
int i = 0; i < m*n; ++i) {
520 if (*(devIpiv_h+i) == 0)
break;
521 if (*(devIpiv_h+i) != i+1) cnt++;
526 for (
int i = 1; i < m; ++i) {
527 det = det * *(h_c + (i * m) + i);
529 if (cnt % 2 != 0) det = det * -1;
532 cudaFree(d_a); cudaFree(devIpiv_d); free(devIpiv_h); free(h_c);
533 cudaFree(devInfo_d); cudaFree(workspace);
534 cusolverDnDestroy(handle);
556 for (
int i = 0; i < sigular_vector.rows(); i++) {
557 if (sigular_vector[i] != 0)
567 const T * h_a = &a(0);
570 T * h_c =
reinterpret_cast<T *
>(malloc(
sizeof(T)));
571 T * d_a;
gpuErrchk(cudaMalloc(&d_a, m * n *
sizeof(T)));
572 gpuErrchk(cudaMemcpy(d_a, h_a, m * n *
sizeof(T), cudaMemcpyHostToDevice));
575 cublasHandle_t handle;
576 cublasCreate(&handle);
581 if (stat != CUBLAS_STATUS_SUCCESS) {
582 std::cerr <<
"GPU Matrix Frobenius Norm Internal Failure" 584 cudaFree(d_a); free(h_c);
585 cublasDestroy(handle);
588 cudaDeviceSynchronize();
592 cublasDestroy(handle);
606 Vector<T> diagonal_vector = a.diagonal();
609 int m = diagonal_vector.rows();
612 const T *h_a = &diagonal_vector(0);
613 T *h_multiplier =
new T[m];
614 for (
int i = 0; i < m; i++)
615 h_multiplier[i] = 1.0;
622 gpuErrchk(cudaMalloc(&d_a, m *
sizeof(T)));
623 gpuErrchk(cudaMalloc(&d_multiplier, m *
sizeof(T)));
624 gpuErrchk(cudaMalloc(&d_result,
sizeof(T)));
627 gpuErrchk(cudaMemcpy(d_a, h_a, m *
sizeof(T),
628 cudaMemcpyHostToDevice));
629 gpuErrchk(cudaMemcpy(d_multiplier, h_multiplier, m *
sizeof(T),
630 cudaMemcpyHostToDevice));
634 cublasHandle_t handle;
635 cublasCreate(&handle);
636 cublasOperation_t trans = CUBLAS_OP_T;
646 d_a, lda, d_multiplier, incx, &beta, d_result, incy);
649 if (stat != CUBLAS_STATUS_SUCCESS) {
650 std::cerr <<
"GPU Trace Internal Failure" << std::endl;
652 cudaFree(d_multiplier);
654 cublasDestroy(handle);
659 gpuErrchk(cudaMemcpy(&h_result, d_result,
sizeof(T),
660 cudaMemcpyDeviceToHost));
663 cudaDeviceSynchronize();
665 cudaFree(d_multiplier);
667 delete []h_multiplier;
670 cublasDestroy(handle);
679 const T * h_a = &a(0);
680 const T * h_b = &b(0);
681 T * h_c =
reinterpret_cast<T *
>(malloc(
sizeof(T)));
683 T * d_a;
gpuErrchk(cudaMalloc(&d_a, n *
sizeof(T)));
684 T * d_b;
gpuErrchk(cudaMalloc(&d_b, n *
sizeof(T)));
686 gpuErrchk(cudaMemcpy(d_a, h_a, n *
sizeof(T), cudaMemcpyHostToDevice));
687 gpuErrchk(cudaMemcpy(d_b, h_b, n *
sizeof(T), cudaMemcpyHostToDevice));
690 cublasHandle_t handle;
691 cublasCreate(&handle);
696 if (stat != CUBLAS_STATUS_SUCCESS) {
697 std::cerr <<
"GPU Vector Vector Dot Product Internal Failure" 699 cudaFree(d_a); cudaFree(d_b);
700 cublasDestroy(handle);
702 cudaDeviceSynchronize();
705 cudaFree(d_a); cudaFree(d_b);
706 cublasDestroy(handle);
710 if (a.cols() == b.cols()) {
716 const T * h_a = &a(0);
717 const T * h_b = &b(0);
720 T * d_a;
gpuErrchk(cudaMalloc(&d_a, m * k *
sizeof(T)));
721 T * d_b;
gpuErrchk(cudaMalloc(&d_b, k * n *
sizeof(T)));
722 T * d_c;
gpuErrchk(cudaMalloc(&d_c, m * n *
sizeof(T)));
724 gpuErrchk(cudaMemcpy(d_a, h_a, m * k *
sizeof(T),
725 cudaMemcpyHostToDevice));
726 gpuErrchk(cudaMemcpy(d_b, h_b, k * n *
sizeof(T),
727 cudaMemcpyHostToDevice));
731 cublasHandle_t handle;
732 cublasCreate(&handle);
736 if (stat != CUBLAS_STATUS_SUCCESS) {
737 std::cerr <<
"GPU Outer Product Internal Failure" << std::endl;
738 cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
739 cublasDestroy(handle);
742 cudaDeviceSynchronize();
745 gpuErrchk(cudaMemcpy(&h_c(0, 0), d_c, m * n *
sizeof(T),
746 cudaMemcpyDeviceToHost));
747 cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
748 cublasDestroy(handle);
751 std::cerr <<
"Vectors in gpu outer product's sizes aren't compatible" 759 #endif // CPP_INCLUDE_GPU_OPERATIONS_H_ cublasStatus_t GpuMatrixScalarMul(cublasHandle_t handle, int n, const float &scalar, float *a)
static Matrix< T > OuterProduct(const Vector< T > &a, const Vector< T > &b)
Definition: gpu_operations.h:709
void gpuErrchk(cudaError_t)
static T DotProduct(const Vector< T > &a, const Vector< T > &b)
Definition: gpu_operations.h:675
static T FrobeniusNorm(const Matrix< T > &a)
Definition: gpu_operations.h:563
static Matrix< T > Multiply(const Matrix< T > &a, const Matrix< T > &b)
Definition: gpu_operations.h:83
cublasStatus_t GpuFrobeniusNorm(cublasHandle_t handle, int n, int incx, float *a, float *c)
static Matrix< T > Add(const Matrix< T > &a, const T &scalar)
Definition: gpu_operations.h:139
Definition: cpu_operations.h:36
static Matrix< T > Subtract(const Matrix< T > &a, const T &scalar)
cusolverStatus_t GpuLuWorkspace(cusolverDnHandle_t handle, int m, int n, float *a, int *worksize)
cublasStatus_t GpuVectorVectorDot(cublasHandle_t handle, int n, float *a, float *b, float *c)
Vector< T > SingularValues() const
Definition: gpu_svd_solver.h:117
static Vector< T > Norm(const Matrix< T > a, const int &p=2, const int &axis=0)
Definition: gpu_operations.h:423
cusolverStatus_t GpuDeterminant(cusolverDnHandle_t handle, int m, int n, float *a, float *workspace, int *devIpiv, int *devInfo)
cusolverStatus_t GpuLinearSolver(cusolverDnHandle_t handle, cublasOperation_t trans, int n, int nrhs, const float *A, int lda, const int *devIpiv, float *B, int ldb, int *devInfo)
cublasStatus_t GpuMatrixMatrixMul(cublasHandle_t handle, int m, int n, int k, float *a, float *b, float *c)
Definition: gpu_svd_solver.h:42
static T Determinant(const Matrix< T > &a)
Definition: gpu_operations.h:463
cublasStatus_t GpuMatrixAdd(cublasHandle_t handle, int m, int n, const float *alpha, const float *A, int lda, const float *beta, const float *B, int ldb, float *C, int ldc)
static Matrix< T > Subtract(const Matrix< T > &a, const Matrix< T > &b)
Definition: gpu_operations.h:250
Eigen::Matrix< T, Eigen::Dynamic, 1 > Vector
Definition: vector.h:31
cusolverStatus_t GpuGetLUDecompWorkspace(cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork)
cublasStatus_t GpuMatrixMatrixSub(cublasHandle_t handle, int m, int n, const float *alpha, float *a, int lda, const float *beta, float *b, int ldb, float *c, int ldc)
static Matrix< T > Multiply(const Matrix< T > &a, const T &scalar)
Definition: gpu_operations.h:51
Definition: gpu_operations.h:49
static Matrix< T > Inverse(const Matrix< T > &a)
Definition: gpu_operations.h:317
void Compute(const Matrix< T > &A)
Definition: gpu_svd_solver.h:51
Eigen::Matrix< T, Eigen::Dynamic, Eigen::Dynamic > Matrix
Definition: matrix.h:31
cusolverStatus_t GpuLUDecomposition(cusolverDnHandle_t handle, int m, int n, float *A, int lda, float *Workspace, int *devIpiv, int *devInfo)
static T Trace(const Matrix< T > &a)
Definition: gpu_operations.h:604
static Matrix< T > Add(const Matrix< T > &a, const Matrix< T > &b)
Definition: gpu_operations.h:197
static int Rank(const Matrix< T > &a)
Definition: gpu_operations.h:546
cublasStatus_t GpuMatrixVectorMul(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float *alpha, const float *A, int lda, const float *x, int incx, const float *beta, float *y, int incy)