cuda - Cusolver SVD does not give correct U and VT outputs for complex inputs

Question

Welcome To Ask or Share your Answers For Others

cuda - Cusolver SVD does not give correct U and VT outputs for complex inputs

asked Jan 25, 2021 in Technique[技术] by 深蓝 (71.8m points)

cuda - Cusolver SVD does not give correct U and VT outputs for complex inputs

I am trying to find the SVD of a complex matrix in Cuda using CuSolver library. The singular values are correct and matching the python output, but the U and VT matrices are incorrect.

Utilities.cu

#include <stdio.h>
#include <assert.h>

#include "cuda_runtime.h"
#include <cuda.h>

#include <cusolverDn.h>

/*******************/
/* iDivUp FUNCTION */
/*******************/
extern "C" int iDivUp(int a, int b) {
    return ((a % b) != 0) ? (a / b + 1) : (a / b);
}

/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d
", cudaGetErrorString(code), file, line);
        if (abort) {
            exit(code);
        }
    }
}

extern "C" void gpuErrchk(cudaError_t ans) {
    gpuAssert((ans), __FILE__, __LINE__);
}

TimingGPU.cu

/**************/
/* TIMING GPU */
/**************/

#include "TimingGPU.cuh"

#include <cuda.h>
#include <cuda_runtime.h>

struct PrivateTimingGPU {
    cudaEvent_t     start;
    cudaEvent_t     stop;
};

// default constructor
TimingGPU::TimingGPU() {
    privateTimingGPU = new PrivateTimingGPU;
}

// default destructor
TimingGPU::~TimingGPU() { }

void TimingGPU::StartCounter()
{
    cudaEventCreate(&((*privateTimingGPU).start));
    cudaEventCreate(&((*privateTimingGPU).stop));
    cudaEventRecord((*privateTimingGPU).start, 0);
}

void TimingGPU::StartCounterFlags()
{
    int eventflags = cudaEventBlockingSync;

    cudaEventCreateWithFlags(&((*privateTimingGPU).start), eventflags);
    cudaEventCreateWithFlags(&((*privateTimingGPU).stop), eventflags);
    cudaEventRecord((*privateTimingGPU).start, 0);
}

// Gets the counter in ms
float TimingGPU::GetCounter()
{
    float   time;
    cudaEventRecord((*privateTimingGPU).stop, 0);
    cudaEventSynchronize((*privateTimingGPU).stop);
    cudaEventElapsedTime(&time, (*privateTimingGPU).start, (*privateTimingGPU).stop);
    return time;
}

TimingGPU.cuh

#ifndef __TIMING_CUH__
#define __TIMING_CUH__

/**************/
/* TIMING GPU */
/**************/

// Events are a part of CUDA API and provide a system independent way to measure execution times on CUDA devices with approximately 0.5
// microsecond precision.

struct PrivateTimingGPU;

class TimingGPU
{
private:
    PrivateTimingGPU *privateTimingGPU;

public:

    TimingGPU();

    ~TimingGPU();

    void StartCounter();
    void StartCounterFlags();

    float GetCounter();

}; // TimingCPU class

#endif

kernel.cu

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

#define FULLSVD
#define PRINTRESULTS

/********/
/* MAIN */
/********/
int main() {

    const int           M = 3;
    const int           N = 3;
    const int           lda = M;
    const int         numMatrices = 1;
    //const int           numMatrices = 256;
    cublasHandle_t cublasHandle = NULL;

    TimingGPU timerGPU;
    cudaEvent_t start, stop;
    cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
    cudaError_t cudaStat1 = cudaSuccess;

    cublasCreate(&cublasHandle);
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // --- Setting the host matrix
    cuComplex *h_A = (cuComplex *)malloc(lda * N * numMatrices * sizeof(double));
    for (unsigned int k = 0; k < numMatrices; k++)
        for (unsigned int i = 0; i < M; i++) {
            for (unsigned int j = 0; j < N; j++) {
                h_A[k * M * N + j * M + i] = make_float2((1. / (k + 1)) * (i + j * j) * (i + j), (1. / (k + 1)) * (i + j * j) * (i + j));
                //printf("[%d, %d] %f
", i, j, h_A[j*M + i]);
                printf("%f %f ", h_A[j*M + i].x, h_A[j * M + i].y);
            }
            printf("
");
        }

    // --- Setting the device matrix and moving the host matrix to the device
    cuComplex *d_A;         gpuErrchk(cudaMalloc(&d_A, M * N * numMatrices * sizeof(cuComplex)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * numMatrices * sizeof(cuComplex), cudaMemcpyHostToDevice));

    // --- host side SVD results space
    float *h_S = (float *)malloc(N * numMatrices * sizeof(float));
    cuComplex *h_SI = (cuComplex *)malloc(N * N * numMatrices * sizeof(cuComplex));
    cuComplex *h_x = (cuComplex *)malloc(N * numMatrices * sizeof(cuComplex));

    cuComplex *h_U = NULL;
    cuComplex *h_V = NULL;
#ifdef FULLSVD
    h_U = (cuComplex *)malloc(M * M * numMatrices * sizeof(cuComplex));
    h_V = (cuComplex *)malloc(N * N * numMatrices * sizeof(cuComplex));
#endif

    // --- device side SVD workspace and matrices
    int work_size = 0;

    int *devInfo;       gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    float *d_S;         gpuErrchk(cudaMalloc(&d_S, N * numMatrices * sizeof(float)));
    cuComplex *d_SI;    gpuErrchk(cudaMalloc(&d_SI, N * N * numMatrices * sizeof(cuComplex)));

    cuComplex *d_x;     gpuErrchk(cudaMalloc(&d_x, N * numMatrices * sizeof(cuComplex)));
    cuComplex *d_U = NULL;
    cuComplex *d_V = NULL;
#ifdef FULLSVD
    gpuErrchk(cudaMalloc(&d_U, M * M * numMatrices * sizeof(cuComplex)));
    gpuErrchk(cudaMalloc(&d_V, N * N * numMatrices * sizeof(cuComplex)));
#endif

    cuComplex *d_work = NULL; /* devie workspace for gesvdj */
    int devInfo_h = 0; /* host copy of error devInfo_h */

    // --- Parameters configuration of Jacobi-based SVD
    const double            tol = 1.e-7;
    const int               maxSweeps = 15;
    cusolverEigMode_t jobz;                                   // --- CUSOLVER_EIG_MODE_VECTOR - Compute eigenvectors; CUSOLVER_EIG_MODE_NOVECTOR - Compute singular values only
#ifdef FULLSVD
    jobz = CUSOLVER_EIG_MODE_VECTOR;
#else
    jobz = CUSOLVER_EIG_MODE_NOVECTOR;
#endif

    const int               econ = 0;                            // --- econ = 1 for economy size 

    // --- Numerical result parameters of gesvdj 
    double                  residual = 0;
    int                     executedSweeps = 0;

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle = NULL;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));

    // --- Configuration of gesvdj
    gesvdjInfo_t gesvdj_params = NULL;
    cusolveSafeCall(cusolverDnCreateGesvdjInfo(&gesvdj_params));

    // --- Set the computation tolerance, since the default tolerance is machine precision
    cusolveSafeCall(cusolverDnXgesvdjSetTolerance(gesvdj_params, tol));

    // --- Set the maximum number of sweeps, since the default value of max. sweeps is 100
    cusolveSafeCall(cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, maxSweeps));

    // --- Query the SVD workspace 
    cusolveSafeCall(cusolverDnCgesvdjBatched_bufferSize(
        solver_handle,
        jobz,                                       // --- Compute the singular vectors or not
        M,                                          // --- Number of rows of A, 0 <= M
        N,                                          // --- Number of columns of A, 0 <= N 
        d_A,                                        // --- M x N
        lda,                                        // --- Leading dimension of A
        d_S,                                        // --- Square matrix of size min(M, N) x min(M, N)
        d_U,                                        // --- M x M if econ = 0, M x min(M, N) if econ = 1
        lda,                                        // --- Leading dimension of U, ldu >= max(1, M)
        d_V,                                        // --- N x N if econ = 0, N x min(M,N) if econ = 1
        lda,                                        // --- Leading dimension of V, ldv >= max(1, N)
        &work_size,
        gesvdj_params,
        numMatrices));

    gpuErrchk(cudaMalloc(&d_work, sizeof(cuComplex) * work_size));

    // --- Compute SVD
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnCgesvdjBatched(
        solver_handle,
        jobz,                                       // --- Compute the singular vectors or not
        M,                                          // --- Number of rows of A, 0 <= M
        N,                                          // --- Number of columns of A, 0 <= N 
        d_A,                                        // --- M x N
        lda,                                        // --- Leading dimension of A
        d_S,                                        // --- Square matrix of size min(M, N) x min(M, N)
        d_U,                                        // --- M x M if econ = 0, M x min(M, N) if econ = 1
        lda,                                        // --- Leading dimension of U, ldu >= max(1, M)
        d_V,                                        // --- N x N if econ = 0, N x min(M, N) if econ = 1
        N,                                          // --- Leading dimension of V, ldv >= max(1, N)
        d_work,
        work_size,
        devInfo,
        gesvdj_params,
        numMatrices));

    cudaStat1 = cudaDeviceSynchronize();
    assert(CUSOLVER_STATUS_SUCCESS == status);
    assert(cudaSuccess == cudaStat1);
    printf("Calculation of the singular values only: %f ms

", timerGPU.GetCounter());

    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_S, d_S, sizeof(float) * N * numMatrices, cudaMemcpyDeviceToHost));
#ifdef FULLSVD
    gpuErrchk(cudaMemcpy(h_U, d_U, sizeof(cuComplex) * lda * M * numMatrices, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_V, d_V, sizeof(cuComplex) * N * N * numMatrices, cudaMemcpyDeviceToHost));
#endif

#ifdef PRINTRESULTS
    printf("SINGULAR VALUES 
");
    printf("_______________ 
");
    for (int k = 0; k < numMatrices; k++) {
        for (int p = 0; p < N; p++)
            printf("Matrix nr. %d; SV nr. %d; Value = %f
", k, p, h_S[k * N + p]);
        printf("
");
    }
#ifdef FULLSVD
    printf("SINGULAR VECTORS U 
");
    printf("__________________ 
");
    for (int k = 0; k < numMatrices; k++) {
        for (int q = 0; q < M; q++)
            for (int p = 0; p < M; p++)
                printf("Matrix nr. %d; U nr. %d, %d; Value = %e, %e
", k, q, p, h_U[q * M + p].x, h_U[q * M + p].y);
        printf("
");
    }

    printf("SINGULAR VECTORS V 
");
    printf("__________________ 
");
    for (int k = 0; k < numMatrices; k++) {
        for (int q = 0; q < N; q++)
            for (int p = 0; p < N; p++)
                printf("Matrix nr. %d; V nr. %d; Value = %e, %e
", k, p, h_V[N * q + p].x, h_V[N * q + p].y);
        printf("
");
    }
#endif
#endif

    if (0 == devInfo_h) {
        printf("gesvdj converges 
");
    }
    else if (0 > devInfo_h) {
        printf("%d-th par

与恶龙缠斗过久,自身亦成为恶龙；凝视深渊过久,深渊将回以凝视…

Categories

cuda - Cusolver SVD does not give correct U and VT outputs for complex inputs

cuda - Cusolver SVD does not give correct U and VT outputs for complex inputs

Please log in or register to add a comment.

Please log in or register to answer this question.

1 Answer

Please log in or register to add a comment.

Just Browsing Browsing

Most popular tags