Actual source code: aijcusparse.cu
petsc-3.14.0 2020-09-29
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we copy them in
   0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  return(0);
}

PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  return(0);
}

PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  return(0);
}

PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  *type = MATSOLVERCUSPARSE;
  return(0);
}
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
  on a single GPU, of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. The currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm; however, the performance can be quite poor, so these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
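/* A minimal usage sketch (illustrative, not part of this source file): selecting this solver type
   for an ILU preconditioner from application code. The ksp object and matrix A are assumed to be
   created by the caller, with A of type MATSEQAIJCUSPARSE. Equivalently, one can use the command
   line options -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse. */
#if 0
static PetscErrorCode ExampleUseCusparseILU(KSP ksp,Mat A)
{
  PC             pc;
  PetscErrorCode ierr;

  ierr = KSPSetOperators(ksp,A,A);CHKERRQ(ierr);
  ierr = KSPGetPC(ksp,&pc);CHKERRQ(ierr);
  ierr = PCSetType(pc,PCILU);CHKERRQ(ierr);                            /* ILU(k) factorization ... */
  ierr = PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);CHKERRQ(ierr); /* ... performed by this solver type */
  return(0);
}
#endif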
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt n = A->rmap->n;

  MatCreate(PetscObjectComm((PetscObject)A),B);
  MatSetSizes(*B,n,n,n,n);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  MatSetType(*B,MATSEQAIJCUSPARSE);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    MatSetBlockSizesFromMats(*B,A,A);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
  PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
  return(0);
}
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  return(0);
}
/*@
  MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
  operation. Only the MatMult operation can use different GPU storage formats
  for MPIAIJCUSPARSE matrices.

  Not Collective

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
. op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
- format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, or MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)

  Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  return(0);
}
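/* A usage sketch (illustrative only): store a SEQAIJCUSPARSE matrix in ELL format for all
   supported operations. The matrix A and the error-checking variable ierr are assumed to
   exist in the caller; the equivalent command line option is -mat_cusparse_storage_format ell. */
#if 0
  ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_ELL);CHKERRQ(ierr);
#endif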
/*@
  MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

  Collective on mat

  Input Parameters:
+ A - Matrix of type SEQAIJCUSPARSE
- transgen - the boolean flag

  Level: intermediate

.seealso: MATSEQAIJCUSPARSE
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscBool flg;

  PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);
  if (flg) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) { /* destroy any cached transpose, so no stale copy is used if transgen is set to PETSC_TRUE later */
      MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);
    }
  }
  return(0);
}
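/* A usage sketch (illustrative only): ask a SEQAIJCUSPARSE matrix A to build and cache an explicit
   transpose so subsequent MatMultTranspose() calls can reuse it. A, x, y, and ierr are assumed to
   exist in the caller; the same switch is available on the command line as -mat_cusparse_transgen. */
#if 0
  ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  ierr = MatMultTranspose(A,x,y);CHKERRQ(ierr); /* uses the cached transpose */
#endif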
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);
    if (flg) {MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);}

    PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
    if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);}

    PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                     "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
    if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                     "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);
    /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                     "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                     "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsTail();
  return(0);
}
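/* For reference, the options parsed above can be given on the command line, e.g.
     -mat_cusparse_transgen            <bool>          : build an explicit transpose for MatMultTranspose()
     -mat_cusparse_mult_storage_format <csr|ell|hyb>   : storage format for SpMV only
     -mat_cusparse_storage_format      <csr|ell|hyb>   : storage format for SpMV and TriSolve
   and, with CUDA 11 or later,
     -mat_cusparse_spmv_alg, -mat_cusparse_spmm_alg, -mat_cusparse_csr2csc_alg
   to select the corresponding cuSPARSE algorithms. */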
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  return(0);
}

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  return(0);
}

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  return(0);
}

static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  return(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  if (!n) return(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        PetscArraycpy(&(AjLo[offset]), vi, nz);
        PetscArraycpy(&(AALo[offset]), v, nz);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                               );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  return(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  if (!n) return(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        PetscArraycpy(&(AjUp[offset+1]), vi, nz);
        PetscArraycpy(&(AAUp[offset+1]), v, nz);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                               );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  return(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
  MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ISGetIndices(isrow,&r);
  ISIdentity(isrow,&row_identity);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ISRestoreIndices(isrow,&r);

  /* upper triangular indices */
  ISGetIndices(iscol,&c);
  ISIdentity(iscol,&col_identity);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  if (!row_identity && !col_identity) {
    PetscLogCpuToGpu(2*n*sizeof(PetscInt));
  } else if (!row_identity) {
    PetscLogCpuToGpu(n*sizeof(PetscInt));
  } else if (!col_identity) {
    PetscLogCpuToGpu(n*sizeof(PetscInt));
  }

  ISRestoreIndices(iscol,&c);
  return(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  if (!n) return(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          PetscArraycpy(&(AjUp[offset]), vj, nz);
          PetscArraycpy(&(AAUp[offset]), v, nz);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                               );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
      /* the lower triangle reuses the stored upper-triangle data and is solved with the
         transpose operation below, hence FILL_MODE_UPPER here */
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                               );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  return(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
  MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ISGetIndices(ip,&rip);
  ISIdentity(ip,&perm_identity);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    ISInvertPermutation(ip,PETSC_DECIDE,&iip);
    ISGetIndices(iip,&irip);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ISRestoreIndices(iip,&irip);
    ISDestroy(&iip);
    PetscLogCpuToGpu(2*n*sizeof(PetscInt));
  }
  ISRestoreIndices(ip,&rip);
  return(0);
}
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
  IS         isrow = b->row,iscol = b->col;
  PetscBool  row_identity,col_identity;

  MatLUFactorNumeric_SeqAIJ(B,A,info);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ISIdentity(isrow,&row_identity);
  ISIdentity(iscol,&col_identity);
  if (row_identity && col_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
  return(0);
}

static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
  IS         ip = b->row;
  PetscBool  perm_identity;

  MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ISIdentity(ip,&perm_identity);
  if (perm_identity) {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve          = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
  return(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor  = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cudaError_t cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
#else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
               CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
#else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                          );CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                           );CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  return(0);
}
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  if (!cusparsestruct->transgen || cusparsestruct->matTranspose) return(0);
  PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
  PetscLogGpuTimeBegin();
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta */
  err = cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                         A->cmap->n, matrix->num_entries,
                                         matrix->values->data().get(),
                                         cusparsestruct->rowoffsets_gpu->data().get(),
                                         matrix->column_indices->data().get(),
                                         matrixT->values->data().get(),
                                         matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                         CUSPARSE_ACTION_NUMERIC,indexBase,
                                         cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
#endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                            );CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
                                       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#endif
  }
  err = WaitForCUDA();CHKERRCUDA(err);
  PetscLogGpuTimeEnd();
  PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  return(0);
}
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  VecCUDAGetArrayWrite(xx,&xarray);
  VecCUDAGetArrayRead(bb,&barray);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscLogGpuTimeBegin();
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  VecCUDARestoreArrayRead(bb,&barray);
  VecCUDARestoreArrayWrite(xx,&xarray);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
  return(0);
}
1325: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1326: {
1327: const PetscScalar *barray;
1328: PetscScalar *xarray;
1329: cusparseStatus_t stat;
1330: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1331: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1332: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1333: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1334: PetscErrorCode ierr;
1335: cudaError_t cerr;
1338: /* Analyze the matrix and create the transpose ... on the fly */
1339: if (!loTriFactorT && !upTriFactorT) {
1340: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1341: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1342: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1343: }
1345: /* Get the GPU pointers */
1346: VecCUDAGetArrayWrite(xx,&xarray);
1347: VecCUDAGetArrayRead(bb,&barray);
1349: PetscLogGpuTimeBegin();
1350: /* First, solve U */
1351: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1352: upTriFactorT->csrMat->num_rows,
1353: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1354: upTriFactorT->csrMat->num_entries,
1355: #endif
1356: &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1357: upTriFactorT->csrMat->values->data().get(),
1358: upTriFactorT->csrMat->row_offsets->data().get(),
1359: upTriFactorT->csrMat->column_indices->data().get(),
1360: upTriFactorT->solveInfo,
1361: barray, tempGPU->data().get()
1362: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1363: ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1364: #endif
1365: );CHKERRCUSPARSE(stat);
1367: /* Then, solve L */
1368: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1369: loTriFactorT->csrMat->num_rows,
1370: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1371: loTriFactorT->csrMat->num_entries,
1372: #endif
1373: &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1374: loTriFactorT->csrMat->values->data().get(),
1375: loTriFactorT->csrMat->row_offsets->data().get(),
1376: loTriFactorT->csrMat->column_indices->data().get(),
1377: loTriFactorT->solveInfo,
1378: tempGPU->data().get(), xarray
1379: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1380: ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1381: #endif
1382: );CHKERRCUSPARSE(stat);
1384: /* restore */
1385: VecCUDARestoreArrayRead(bb,&barray);
1386: VecCUDARestoreArrayWrite(xx,&xarray);
1387: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1388: PetscLogGpuTimeEnd();
1389: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1390: return(0);
1391: }
1393: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1394: {
1395: const PetscScalar *barray;
1396: PetscScalar *xarray;
1397: thrust::device_ptr<const PetscScalar> bGPU;
1398: thrust::device_ptr<PetscScalar> xGPU;
1399: cusparseStatus_t stat;
1400: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1401: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1402: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1403: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1404: PetscErrorCode ierr;
1405: cudaError_t cerr;
1409: /* Get the GPU pointers */
1410: VecCUDAGetArrayWrite(xx,&xarray);
1411: VecCUDAGetArrayRead(bb,&barray);
1412: xGPU = thrust::device_pointer_cast(xarray);
1413: bGPU = thrust::device_pointer_cast(barray);
1415: PetscLogGpuTimeBegin();
1416: /* First, reorder with the row permutation */
1417: thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1418: thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1419: tempGPU->begin());
1421: /* Next, solve L */
1422: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1423: loTriFactor->csrMat->num_rows,
1424: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1425: loTriFactor->csrMat->num_entries,
1426: #endif
1427: &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1428: loTriFactor->csrMat->values->data().get(),
1429: loTriFactor->csrMat->row_offsets->data().get(),
1430: loTriFactor->csrMat->column_indices->data().get(),
1431: loTriFactor->solveInfo,
1432: tempGPU->data().get(), xarray
1433: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1434: ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1435: #endif
1436: );CHKERRCUSPARSE(stat);
1438: /* Then, solve U */
1439: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1440: upTriFactor->csrMat->num_rows,
1441: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1442: upTriFactor->csrMat->num_entries,
1443: #endif
1444: &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1445: upTriFactor->csrMat->values->data().get(),
1446: upTriFactor->csrMat->row_offsets->data().get(),
1447: upTriFactor->csrMat->column_indices->data().get(),
1448: upTriFactor->solveInfo,
1449: xarray, tempGPU->data().get()
1450: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1451: ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1452: #endif
1453: );CHKERRCUSPARSE(stat);
1455: /* Last, reorder with the column permutation */
1456: thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1457: thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1458: xGPU);
1460: VecCUDARestoreArrayRead(bb,&barray);
1461: VecCUDARestoreArrayWrite(xx,&xarray);
1462: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1463: PetscLogGpuTimeEnd();
1464: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1465: return(0);
1466: }
1468: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1469: {
1470: const PetscScalar *barray;
1471: PetscScalar *xarray;
1472: cusparseStatus_t stat;
1473: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1474: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1475: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1476: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1477: PetscErrorCode ierr;
1478: cudaError_t cerr;
1481: /* Get the GPU pointers */
1482: VecCUDAGetArrayWrite(xx,&xarray);
1483: VecCUDAGetArrayRead(bb,&barray);
1485: PetscLogGpuTimeBegin();
1486: /* First, solve L */
1487: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1488: loTriFactor->csrMat->num_rows,
1489: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1490: loTriFactor->csrMat->num_entries,
1491: #endif
1492: &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1493: loTriFactor->csrMat->values->data().get(),
1494: loTriFactor->csrMat->row_offsets->data().get(),
1495: loTriFactor->csrMat->column_indices->data().get(),
1496: loTriFactor->solveInfo,
1497: barray, tempGPU->data().get()
1498: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1499: ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1500: #endif
1501: );CHKERRCUSPARSE(stat);
1503: /* Next, solve U */
1504: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1505: upTriFactor->csrMat->num_rows,
1506: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1507: upTriFactor->csrMat->num_entries,
1508: #endif
1509: &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1510: upTriFactor->csrMat->values->data().get(),
1511: upTriFactor->csrMat->row_offsets->data().get(),
1512: upTriFactor->csrMat->column_indices->data().get(),
1513: upTriFactor->solveInfo,
1514: tempGPU->data().get(), xarray
1515: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1516: ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1517: #endif
1518: );CHKERRCUSPARSE(stat);
1520: VecCUDARestoreArrayRead(bb,&barray);
1521: VecCUDARestoreArrayWrite(xx,&xarray);
1522: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1523: PetscLogGpuTimeEnd();
1524: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1525: return(0);
1526: }
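/* A note on the flop count logged by the solve routines above: forward plus
   backward substitution does one multiply and one add per stored nonzero of
   the combined factors, except that each of the n diagonal entries costs a
   single divide instead, for roughly 2*nnz - n flops in total. */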
1528: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1529: {
1530: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1531: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1532: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1533: PetscInt m = A->rmap->n,*ii,*ridx,tmp;
1534: PetscErrorCode ierr;
1535: cusparseStatus_t stat;
1536: cudaError_t err;
1539: if (A->boundtocpu) return(0);
1540: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1541: if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1542: /* Copy values only */
1543: CsrMatrix *matrix,*matrixT;
1544: matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1546: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1547: matrix->values->assign(a->a, a->a+a->nz);
1548: err = WaitForCUDA();CHKERRCUDA(err);
1549: PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1550: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1552: /* Update matT when it was built before */
1553: if (cusparsestruct->matTranspose) {
1554: cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1555: matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1556: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1557: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1558: A->cmap->n, matrix->num_entries,
1559: matrix->values->data().get(),
1560: cusparsestruct->rowoffsets_gpu->data().get(),
1561: matrix->column_indices->data().get(),
1562: matrixT->values->data().get(),
1563: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1564: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1565: CUSPARSE_ACTION_NUMERIC,indexBase,
1566: cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1567: #else
1568: matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1569: CUSPARSE_ACTION_NUMERIC, indexBase
1570: #endif
1571: );CHKERRCUSPARSE(stat);
1572: err = WaitForCUDA();CHKERRCUDA(err);
1573: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1574: }
1575: } else {
1576: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1577: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1578: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);
1579: delete cusparsestruct->workVector;
1580: delete cusparsestruct->rowoffsets_gpu;
1581: try {
1582: if (a->compressedrow.use) {
1583: m = a->compressedrow.nrows;
1584: ii = a->compressedrow.i;
1585: ridx = a->compressedrow.rindex;
1586: } else {
1587: m = A->rmap->n;
1588: ii = a->i;
1589: ridx = NULL;
1590: }
1591: cusparsestruct->nrows = m;
1593: /* create cusparse matrix */
1594: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1595: stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1596: stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1597: stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1599: err = cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar));CHKERRCUDA(err);
1600: err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1601: err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1602: err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1603: err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1604: err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1605: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1607: /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1608: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1609: /* set the matrix */
1610: CsrMatrix *mat= new CsrMatrix;
1611: mat->num_rows = m;
1612: mat->num_cols = A->cmap->n;
1613: mat->num_entries = a->nz;
1614: mat->row_offsets = new THRUSTINTARRAY32(m+1);
1615: mat->row_offsets->assign(ii, ii + m+1);
1617: mat->column_indices = new THRUSTINTARRAY32(a->nz);
1618: mat->column_indices->assign(a->j, a->j+a->nz);
1620: mat->values = new THRUSTARRAY(a->nz);
1621: mat->values->assign(a->a, a->a+a->nz);
1623: /* assign the pointer */
1624: matstruct->mat = mat;
1625: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1626: if (mat->num_rows) { /* cusparse errors on empty matrices! */
1627: stat = cusparseCreateCsr(&matstruct->matDescr,
1628: mat->num_rows, mat->num_cols, mat->num_entries,
1629: mat->row_offsets->data().get(), mat->column_indices->data().get(),
1630: mat->values->data().get(),
1631: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1632: CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1633: }
1634: #endif
1635: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1636: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1637: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1638: #else
1639: CsrMatrix *mat= new CsrMatrix;
1640: mat->num_rows = m;
1641: mat->num_cols = A->cmap->n;
1642: mat->num_entries = a->nz;
1643: mat->row_offsets = new THRUSTINTARRAY32(m+1);
1644: mat->row_offsets->assign(ii, ii + m+1);
1646: mat->column_indices = new THRUSTINTARRAY32(a->nz);
1647: mat->column_indices->assign(a->j, a->j+a->nz);
1649: mat->values = new THRUSTARRAY(a->nz);
1650: mat->values->assign(a->a, a->a+a->nz);
1652: cusparseHybMat_t hybMat;
1653: stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1654: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1655: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1656: stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1657: matstruct->descr, mat->values->data().get(),
1658: mat->row_offsets->data().get(),
1659: mat->column_indices->data().get(),
1660: hybMat, 0, partition);CHKERRCUSPARSE(stat);
1661: /* assign the pointer */
1662: matstruct->mat = hybMat;
1664: if (mat) {
1665: if (mat->values) delete (THRUSTARRAY*)mat->values;
1666: if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1667: if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1668: delete (CsrMatrix*)mat;
1669: }
1670: #endif
1671: }
1673: /* assign the compressed row indices */
1674: if (a->compressedrow.use) {
1675: cusparsestruct->workVector = new THRUSTARRAY(m);
1676: matstruct->cprowIndices = new THRUSTINTARRAY(m);
1677: matstruct->cprowIndices->assign(ridx,ridx+m);
1678: tmp = m;
1679: } else {
1680: cusparsestruct->workVector = NULL;
1681: matstruct->cprowIndices = NULL;
1682: tmp = 0;
1683: }
1684: PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));
1686: /* assign the pointer */
1687: cusparsestruct->mat = matstruct;
1688: } catch(char *ex) {
1689: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1690: }
1691: err = WaitForCUDA();CHKERRCUDA(err);
1692: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1693: cusparsestruct->nonzerostate = A->nonzerostate;
1694: }
1695: A->offloadmask = PETSC_OFFLOAD_BOTH;
1696: }
1697: return(0);
1698: }
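/* A minimal sketch of how the two copy paths above are exercised (standard
   PETSc calls; A is assumed already created as MATSEQAIJCUSPARSE):

     MatSetValues(A,1,&i,1,&j,&v,INSERT_VALUES);  // modify entries on the host
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);        // ends up in MatSeqAIJCUSPARSECopyToGPU()
     MatMult(A,x,y);                              // also copies first if A is still host-only

   If the nonzero pattern is unchanged (A->nonzerostate matches the stored
   nonzerostate), only the values are re-uploaded; otherwise the cusparse
   structures are destroyed and rebuilt from scratch. */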
1700: struct VecCUDAPlusEquals
1701: {
1702: template <typename Tuple>
1703: __host__ __device__
1704: void operator()(Tuple t)
1705: {
1706: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1707: }
1708: };
1710: struct VecCUDAEqualsReverse
1711: {
1712: template <typename Tuple>
1713: __host__ __device__
1714: void operator()(Tuple t)
1715: {
1716: thrust::get<0>(t) = thrust::get<1>(t);
1717: }
1718: };
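/* These functors are applied with thrust::for_each over a zip of the compact
   work vector and a permutation of the full-length vector: VecCUDAPlusEquals
   scatter-adds the work vector into the full vector, and VecCUDAEqualsReverse
   gathers the other way. A self-contained sketch of the scatter-add pattern
   (the host-side vectors below are illustrative assumptions):

     #include <thrust/device_vector.h>
     #include <thrust/iterator/zip_iterator.h>
     #include <thrust/iterator/permutation_iterator.h>
     #include <thrust/for_each.h>

     thrust::device_vector<double> work(2), full(4,0.0); // work = A*x on the kept rows only
     thrust::device_vector<int>    rows(2);              // rows[k] = full index of kept row k
     work[0] = 10.; work[1] = 20.; rows[0] = 1; rows[1] = 3;
     auto zb = thrust::make_zip_iterator(thrust::make_tuple(work.begin(),
                 thrust::make_permutation_iterator(full.begin(), rows.begin())));
     thrust::for_each(zb, zb + rows.size(), VecCUDAPlusEquals());
     // full is now {0, 10, 0, 20}
*/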
1720: struct MatMatCusparse {
1721: PetscBool cisdense;
1722: PetscScalar *Bt;
1723: Mat X;
1724: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1725: PetscBool initialized; /* C = alpha op(A) op(B) + beta C */
1726: cusparseDnMatDescr_t matBDescr;
1727: cusparseDnMatDescr_t matCDescr;
1728: size_t spmmBufferSize;
1729: void *spmmBuffer;
1730: PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
1731: #endif
1732: };
1734: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1735: {
1737: MatMatCusparse *mmdata = (MatMatCusparse *)data;
1738: cudaError_t cerr;
1741: cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1742: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1743: cusparseStatus_t stat;
1744: if (mmdata->matBDescr) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);}
1745: if (mmdata->matCDescr) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);}
1746: if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1747: #endif
1748: MatDestroy(&mmdata->X);
1749: PetscFree(data);
1750: return(0);
1751: }
1753: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1755: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1756: {
1757: Mat_Product *product = C->product;
1758: Mat A,B;
1759: PetscInt m,n,blda,clda;
1760: PetscBool flg,biscuda;
1761: Mat_SeqAIJCUSPARSE *cusp;
1762: cusparseStatus_t stat;
1763: cusparseOperation_t opA;
1764: const PetscScalar *barray;
1765: PetscScalar *carray;
1766: PetscErrorCode ierr;
1767: MatMatCusparse *mmdata;
1768: Mat_SeqAIJCUSPARSEMultStruct *mat;
1769: CsrMatrix *csrmat;
1770: cudaError_t cerr;
1773: MatCheckProduct(C,1);
1774: if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1775: mmdata = (MatMatCusparse*)product->data;
1776: A = product->A;
1777: B = product->B;
1778: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1779: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1780: /* currently CopyToGpu does not copy if the matrix is bound to CPU.
1781: Instead of silently accepting the wrong answer, I prefer to raise the error */
1782: if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1783: MatSeqAIJCUSPARSECopyToGPU(A);
1784: cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1785: switch (product->type) {
1786: case MATPRODUCT_AB:
1787: case MATPRODUCT_PtAP:
1788: mat = cusp->mat;
1789: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1790: m = A->rmap->n;
1791: n = B->cmap->n;
1792: break;
1793: case MATPRODUCT_AtB:
1794: if (!cusp->transgen) {
1795: mat = cusp->mat;
1796: opA = CUSPARSE_OPERATION_TRANSPOSE;
1797: } else {
1798: MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1799: mat = cusp->matTranspose;
1800: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1801: }
1802: m = A->cmap->n;
1803: n = B->cmap->n;
1804: break;
1805: case MATPRODUCT_ABt:
1806: case MATPRODUCT_RARt:
1807: mat = cusp->mat;
1808: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1809: m = A->rmap->n;
1810: n = B->rmap->n;
1811: break;
1812: default:
1813: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1814: }
1815: if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1816: csrmat = (CsrMatrix*)mat->mat;
1817: /* if the user passed a CPU matrix, copy the data to the GPU */
1818: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
1819: if (!biscuda) {MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);}
1820: MatDenseCUDAGetArrayRead(B,&barray);
1822: MatDenseGetLDA(B,&blda);
1823: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1824: MatDenseCUDAGetArrayWrite(mmdata->X,&carray);
1825: MatDenseGetLDA(mmdata->X,&clda);
1826: } else {
1827: MatDenseCUDAGetArrayWrite(C,&carray);
1828: MatDenseGetLDA(C,&clda);
1829: }
1831: PetscLogGpuTimeBegin();
1832: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1833: cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
1834: /* (re)allocate spmmBuffer if not initialized or LDAs are different */
1835: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
1836: if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
1837: if (!mmdata->matBDescr) {
1838: stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1839: mmdata->Blda = blda;
1840: }
1842: if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
1843: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
1844: stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1845: mmdata->Clda = clda;
1846: }
1848: if (!mat->matDescr) {
1849: stat = cusparseCreateCsr(&mat->matDescr,
1850: csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
1851: csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
1852: csrmat->values->data().get(),
1853: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1854: CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1855: }
1856: stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
1857: mat->matDescr,mmdata->matBDescr,mat->beta_zero,
1858: mmdata->matCDescr,cusparse_scalartype,
1859: cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
1860: if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1861: cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
1862: mmdata->initialized = PETSC_TRUE;
1863: } else {
1864: /* to be safe, always update pointers of the mats */
1865: stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
1866: stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
1867: stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
1868: }
1870: /* do cusparseSpMM, which supports transpose on B */
1871: stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
1872: mat->matDescr,mmdata->matBDescr,mat->beta_zero,
1873: mmdata->matCDescr,cusparse_scalartype,
1874: cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
1875: #else
1876: PetscInt k;
1877: /* cusparseXcsrmm does not support transpose on B */
1878: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1879: cublasHandle_t cublasv2handle;
1880: cublasStatus_t cerr;
1882: PetscCUBLASGetHandle(&cublasv2handle);
1883: cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
1884: B->cmap->n,B->rmap->n,
1885: &PETSC_CUSPARSE_ONE ,barray,blda,
1886: &PETSC_CUSPARSE_ZERO,barray,blda,
1887: mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
1888: blda = B->cmap->n;
1889: k = B->cmap->n;
1890: } else {
1891: k = B->rmap->n;
1892: }
1894: /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
1895: stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
1896: csrmat->num_entries,mat->alpha_one,mat->descr,
1897: csrmat->values->data().get(),
1898: csrmat->row_offsets->data().get(),
1899: csrmat->column_indices->data().get(),
1900: mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
1901: carray,clda);CHKERRCUSPARSE(stat);
1902: #endif
1903: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1904: PetscLogGpuTimeEnd();
1905: PetscLogGpuFlops(n*2.0*csrmat->num_entries);
1906: MatDenseCUDARestoreArrayRead(B,&barray);
1907: if (product->type == MATPRODUCT_RARt) {
1908: MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
1909: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);
1910: } else if (product->type == MATPRODUCT_PtAP) {
1911: MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
1912: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);
1913: } else {
1914: MatDenseCUDARestoreArrayWrite(C,&carray);
1915: }
1916: if (mmdata->cisdense) {
1917: MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);
1918: }
1919: if (!biscuda) {
1920: MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
1921: }
1922: return(0);
1923: }
1925: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1926: {
1927: Mat_Product *product = C->product;
1928: Mat A,B;
1929: PetscInt m,n;
1930: PetscBool cisdense,flg;
1931: PetscErrorCode ierr;
1932: MatMatCusparse *mmdata;
1933: Mat_SeqAIJCUSPARSE *cusp;
1936: MatCheckProduct(C,1);
1937: if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1938: A = product->A;
1939: B = product->B;
1940: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1941: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1942: cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1943: if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
1944: switch (product->type) {
1945: case MATPRODUCT_AB:
1946: m = A->rmap->n;
1947: n = B->cmap->n;
1948: break;
1949: case MATPRODUCT_AtB:
1950: m = A->cmap->n;
1951: n = B->cmap->n;
1952: break;
1953: case MATPRODUCT_ABt:
1954: m = A->rmap->n;
1955: n = B->rmap->n;
1956: break;
1957: case MATPRODUCT_PtAP:
1958: m = B->cmap->n;
1959: n = B->cmap->n;
1960: break;
1961: case MATPRODUCT_RARt:
1962: m = B->rmap->n;
1963: n = B->rmap->n;
1964: break;
1965: default:
1966: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1967: }
1968: MatSetSizes(C,m,n,m,n);
1969: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
1970: PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);
1971: MatSetType(C,MATSEQDENSECUDA);
1973: /* product data */
1974: PetscNew(&mmdata);
1975: mmdata->cisdense = cisdense;
1976: #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
1977: /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
1978: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1979: cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
1980: }
1981: #endif
1982: /* for these products we need intermediate storage */
1983: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1984: MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);
1985: MatSetType(mmdata->X,MATSEQDENSECUDA);
1986: if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
1987: MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);
1988: } else {
1989: MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);
1990: }
1991: }
1992: C->product->data = mmdata;
1993: C->product->destroy = MatDestroy_MatMatCusparse;
1995: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
1996: return(0);
1997: }
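/* A minimal sketch of driving the symbolic/numeric split above through the
   public MatProduct interface (standard PETSc calls; A is assumed to be of
   type MATSEQAIJCUSPARSE and B of type MATSEQDENSECUDA):

     Mat C;
     MatProductCreate(A,B,NULL,&C);        // C = op(A) op(B)
     MatProductSetType(C,MATPRODUCT_AB);   // or AtB, ABt, PtAP, RARt
     MatProductSetFromOptions(C);          // selects the routines in this file
     MatProductSymbolic(C);                // sizes C, allocates the product data
     MatProductNumeric(C);                 // runs the cusparseSpMM/csrmm path

   The numeric phase may be repeated after the values (but not the nonzero
   structure) of A or B change. */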
1999: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2001: /* handles dense B */
2002: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
2003: {
2004: Mat_Product *product = C->product;
2008: MatCheckProduct(C,1);
2009: if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
2010: if (product->A->boundtocpu) {
2011: MatProductSetFromOptions_SeqAIJ_SeqDense(C);
2012: return(0);
2013: }
2014: switch (product->type) {
2015: case MATPRODUCT_AB:
2016: case MATPRODUCT_AtB:
2017: case MATPRODUCT_ABt:
2018: case MATPRODUCT_PtAP:
2019: case MATPRODUCT_RARt:
2020: C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2021: default:
2022: break;
2023: }
2024: return(0);
2025: }
2027: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2028: {
2032: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
2033: return(0);
2034: }
2036: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2037: {
2041: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
2042: return(0);
2043: }
2045: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2046: {
2050: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
2051: return(0);
2052: }
2054: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2055: {
2059: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
2060: return(0);
2061: }
2063: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2064: {
2068: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
2069: return(0);
2070: }
2072: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2073: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2074: {
2075: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2076: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2077: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2078: PetscScalar *xarray,*zarray,*dptr,*beta,*xptr;
2079: PetscErrorCode ierr;
2080: cudaError_t cerr;
2081: cusparseStatus_t stat;
2082: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2083: PetscBool compressed;
2084: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2085: PetscInt nx,ny;
2086: #endif
2089: if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2090: if (!a->nonzerorowcnt) {
2091: if (!yy) {VecSet_SeqCUDA(zz,0);}
2092: return(0);
2093: }
2094: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2095: MatSeqAIJCUSPARSECopyToGPU(A);
2096: if (!trans) {
2097: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2098: } else {
2099: if (herm || !cusparsestruct->transgen) {
2100: opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2101: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2102: } else {
2103: if (!cusparsestruct->matTranspose) {MatSeqAIJCUSPARSEGenerateTransposeForMult(A);}
2104: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2105: }
2106: }
2107: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2108: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2110: try {
2111: VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);
2112: if (yy == zz) {VecCUDAGetArray(zz,&zarray);} /* read & write zz, so need to get uptodate zarray on GPU */
2113: else {VecCUDAGetArrayWrite(zz,&zarray);} /* write zz, so no need to init zarray on GPU */
2115: PetscLogGpuTimeBegin();
2116: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2117: /* z = A x + beta y.
2118: If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2119: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2120: */
2121: xptr = xarray;
2122: dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2123: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2124: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2125: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2126: allocated to accommodate different uses. So we get the length info directly from mat.
2127: */
2128: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2129: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2130: nx = mat->num_cols;
2131: ny = mat->num_rows;
2132: }
2133: #endif
2134: } else {
2135: /* z = A^T x + beta y
2136: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2137: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2138: */
2139: xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2140: dptr = zarray;
2141: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2142: if (compressed) { /* Scatter x to work vector */
2143: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2144: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2145: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2146: VecCUDAEqualsReverse());
2147: }
2148: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2149: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2150: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2151: nx = mat->num_rows;
2152: ny = mat->num_cols;
2153: }
2154: #endif
2155: }
2157: /* csr_spmv does y = alpha op(A) x + beta y */
2158: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2159: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2160: if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2161: if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2162: stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2163: stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2164: stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2165: matstruct->matDescr,
2166: matstruct->cuSpMV[opA].vecXDescr, beta,
2167: matstruct->cuSpMV[opA].vecYDescr,
2168: cusparse_scalartype,
2169: cusparsestruct->spmvAlg,
2170: &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2171: cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2173: matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2174: } else {
2175: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2176: stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2177: stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2178: }
2180: stat = cusparseSpMV(cusparsestruct->handle, opA,
2181: matstruct->alpha_one,
2182: matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2183: matstruct->cuSpMV[opA].vecXDescr,
2184: beta,
2185: matstruct->cuSpMV[opA].vecYDescr,
2186: cusparse_scalartype,
2187: cusparsestruct->spmvAlg,
2188: matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2189: #else
2190: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2191: stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2192: mat->num_rows, mat->num_cols,
2193: mat->num_entries, matstruct->alpha_one, matstruct->descr,
2194: mat->values->data().get(), mat->row_offsets->data().get(),
2195: mat->column_indices->data().get(), xptr, beta,
2196: dptr);CHKERRCUSPARSE(stat);
2197: #endif
2198: } else {
2199: if (cusparsestruct->nrows) {
2200: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2201: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2202: #else
2203: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2204: stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2205: matstruct->alpha_one, matstruct->descr, hybMat,
2206: xptr, beta,
2207: dptr);CHKERRCUSPARSE(stat);
2208: #endif
2209: }
2210: }
2211: cerr = WaitForCUDA();CHKERRCUDA(cerr);
2212: PetscLogGpuTimeEnd();
2214: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2215: if (yy) { /* MatMultAdd: zz = A*xx + yy */
2216: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2217: VecCopy_SeqCUDA(yy,zz); /* zz = yy */
2218: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2219: VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
2220: }
2221: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2222: VecSet_SeqCUDA(zz,0);
2223: }
2225: /* ScatterAdd the result from work vector into the full vector when A is compressed */
2226: if (compressed) {
2227: thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2229: PetscLogGpuTimeBegin();
2230: thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2231: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2232: VecCUDAPlusEquals());
2233: cerr = WaitForCUDA();CHKERRCUDA(cerr);
2234: PetscLogGpuTimeEnd();
2235: }
2236: } else {
2237: if (yy && yy != zz) {
2238: VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
2239: }
2240: }
2241: VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);
2242: if (yy == zz) {VecCUDARestoreArray(zz,&zarray);}
2243: else {VecCUDARestoreArrayWrite(zz,&zarray);}
2244: } catch(char *ex) {
2245: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2246: }
2247: if (yy) {
2248: PetscLogGpuFlops(2.0*a->nz);
2249: } else {
2250: PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);
2251: }
2252: return(0);
2253: }
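/* A small worked example of the compressed-row layout handled above: for a
   4x4 matrix whose rows 1 and 2 are entirely zero, only rows 0 and 3 are
   stored, so

     a->compressedrow.nrows  = 2
     a->compressedrow.rindex = {0, 3}   // mirrored in cprowIndices on the GPU

   y = A*x then lands in a length-2 work vector that must be scattered (via
   VecCUDAPlusEquals) into entries 0 and 3 of the full-length z, which is why
   z is zeroed first (MatMult) or preloaded with y (MatMultAdd). */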
2255: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2256: {
2260: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
2261: return(0);
2262: }
2264: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
2265: {
2266: PetscErrorCode ierr;
2267: PetscSplitCSRDataStructure *d_mat = NULL, h_mat;
2268: PetscBool is_seq = PETSC_TRUE;
2269: PetscInt nnz_state = A->nonzerostate;
2271: if (A->factortype == MAT_FACTOR_NONE) {
2272: d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2273: }
2274: if (d_mat) {
2275: cudaError_t err;
2276: PetscInfo(A,"Assemble device matrix\n");
2277: err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2278: nnz_state = h_mat.nonzerostate;
2279: is_seq = h_mat.seq;
2280: }
2281: MatAssemblyEnd_SeqAIJ(A,mode); // this does very little if assembled on GPU - call it?
2282: if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) return(0);
2283: if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU even though equipped for GPU
2284: MatSeqAIJCUSPARSECopyToGPU(A);
2285: } else if (nnz_state > A->nonzerostate) {
2286: A->offloadmask = PETSC_OFFLOAD_GPU;
2287: }
2289: return(0);
2290: }
2292: /* --------------------------------------------------------------------------------*/
2293: /*@
2294: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2295: (the default parallel PETSc format). This matrix will ultimately be pushed down
2296: to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
2297: assembly performance the user should preallocate the matrix storage by setting
2298: the parameter nz (or the array nnz). By setting these parameters accurately,
2299: performance during matrix assembly can be increased by more than a factor of 50.
2301: Collective
2303: Input Parameters:
2304: + comm - MPI communicator, set to PETSC_COMM_SELF
2305: . m - number of rows
2306: . n - number of columns
2307: . nz - number of nonzeros per row (same for all rows)
2308: - nnz - array containing the number of nonzeros in the various rows
2309: (possibly different for each row) or NULL
2311: Output Parameter:
2312: . A - the matrix
2314: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2315: MatXXXXSetPreallocation() paradigm instead of this routine directly.
2316: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2318: Notes:
2319: If nnz is given then nz is ignored
2321: The AIJ format (also called the Yale sparse matrix format or
2322: compressed row storage), is fully compatible with standard Fortran 77
2323: storage. That is, the stored row and column indices can begin at
2324: either one (as in Fortran) or zero. See the users' manual for details.
2326: Specify the preallocated storage with either nz or nnz (not both).
2327: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2328: allocation. For large problems you MUST preallocate memory or you
2329: will get TERRIBLE performance, see the users' manual chapter on matrices.
2331: By default, this format uses inodes (identical nodes) when possible, to
2332: improve numerical efficiency of matrix-vector products and solves. We
2333: search for consecutive rows with the same nonzero structure, thereby
2334: reusing matrix information to achieve increased efficiency.
2336: Level: intermediate
2338: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2339: @*/
2340: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
2341: {
2345: MatCreate(comm,A);
2346: MatSetSizes(*A,m,n,m,n);
2347: MatSetType(*A,MATSEQAIJCUSPARSE);
2348: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
2349: return(0);
2350: }
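/* A minimal sketch of the recommended creation paradigm mentioned above
   (standard PETSc calls; error checking omitted):

     Mat A;
     MatCreate(PETSC_COMM_SELF,&A);
     MatSetSizes(A,m,n,m,n);
     MatSetType(A,MATSEQAIJCUSPARSE);      // or -mat_type aijcusparse via MatSetFromOptions()
     MatSeqAIJSetPreallocation(A,nz,NULL); // nz nonzeros per row, or pass an nnz[] array
     // ... MatSetValues(), MatAssemblyBegin()/MatAssemblyEnd(), MatMult() as usual ...
*/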
2352: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
2353: {
2354: PetscErrorCode ierr;
2355: PetscSplitCSRDataStructure *d_mat = NULL;
2358: if (A->factortype == MAT_FACTOR_NONE) {
2359: d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2360: ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
2361: MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
2362: } else {
2363: MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
2364: }
2365: if (d_mat) {
2366: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2367: cudaError_t err;
2368: PetscSplitCSRDataStructure h_mat;
2369: PetscInfo(A,"Have device matrix\n");
2370: err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2371: if (h_mat.seq) {
2372: if (a->compressedrow.use) {
2373: err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
2374: }
2375: err = cudaFree(d_mat);CHKERRCUDA(err);
2376: }
2377: }
2378: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
2379: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
2380: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
2381: PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
2382: MatDestroy_SeqAIJ(A);
2383: return(0);
2384: }
2386: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
2387: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
2388: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
2389: {
2393: MatDuplicate_SeqAIJ(A,cpvalues,B);
2394: MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
2395: return(0);
2396: }
2398: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
2399: {
2403: if (A->factortype != MAT_FACTOR_NONE) return(0);
2404: /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
2405: If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
2406: Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
2407: TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
2408: can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
2409: if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
2410: /* TODO: add support for this? */
2411: if (flg) {
2412: A->ops->mult = MatMult_SeqAIJ;
2413: A->ops->multadd = MatMultAdd_SeqAIJ;
2414: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
2415: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
2416: A->ops->multhermitiantranspose = NULL;
2417: A->ops->multhermitiantransposeadd = NULL;
2418: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
2419: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
2420: } else {
2421: A->ops->mult = MatMult_SeqAIJCUSPARSE;
2422: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
2423: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
2424: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
2425: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
2426: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
2427: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
2428: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
2429: }
2430: A->boundtocpu = flg;
2431: return(0);
2432: }
2434: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
2435: {
2436: PetscSplitCSRDataStructure *d_mat = NULL;
2437: PetscErrorCode ierr;
2439: if (A->factortype == MAT_FACTOR_NONE) {
2440: Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
2441: d_mat = spptr->deviceMat;
2442: }
2443: if (d_mat) {
2444: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2445: PetscInt n = A->rmap->n, nnz = a->i[n];
2446: cudaError_t err;
2447: PetscScalar *vals;
2448: PetscInfo(A,"Zero device matrix\n");
2449: err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2450: err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err);
2451: }
2452: MatZeroEntries_SeqAIJ(A);
2454: A->offloadmask = PETSC_OFFLOAD_BOTH;
2456: return(0);
2457: }
2459: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
2460: {
2461: PetscErrorCode ierr;
2462: cusparseStatus_t stat;
2463: Mat B;
2466: if (reuse == MAT_INITIAL_MATRIX) {
2467: MatDuplicate(A,MAT_COPY_VALUES,newmat);
2468: } else if (reuse == MAT_REUSE_MATRIX) {
2469: MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
2470: }
2471: B = *newmat;
2473: PetscFree(B->defaultvectype);
2474: PetscStrallocpy(VECCUDA,&B->defaultvectype);
2476: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
2477: if (B->factortype == MAT_FACTOR_NONE) {
2478: Mat_SeqAIJCUSPARSE *spptr;
2480: PetscNew(&spptr);
2481: spptr->format = MAT_CUSPARSE_CSR;
2482: stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2483: B->spptr = spptr;
2484: spptr->deviceMat = NULL;
2485: } else {
2486: Mat_SeqAIJCUSPARSETriFactors *spptr;
2488: PetscNew(&spptr);
2489: stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2490: B->spptr = spptr;
2491: }
2492: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2493: }
2494: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
2495: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
2496: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
2497: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
2498: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
2499: B->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
2501: MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);
2502: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
2503: PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);
2504: return(0);
2505: }
2507: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
2508: {
2512: PetscCUDAInitializeCheck();
2513: MatCreate_SeqAIJ(B);
2514: MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
2515: PetscObjectOptionsBegin((PetscObject)B);
2516: MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);
2517: PetscOptionsEnd();
2518: return(0);
2519: }
2521: /*MC
2522: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2524: A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
2525: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2526: All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
2528: Options Database Keys:
2529: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2530: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2531: - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2533: Level: beginner
2535: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2536: M*/
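/* For example, an existing AIJ code can be switched to this type purely from
   the options database (option names as documented above; ex19 is only an
   illustrative executable name):

     ./ex19 -mat_type aijcusparse -vec_type cuda -mat_cusparse_storage_format csr
*/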
2538: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2541: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
2542: {
2546: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
2547: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
2548: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
2549: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
2550: return(0);
2551: }
2553: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
2554: {
2555: PetscErrorCode ierr;
2556: cusparseStatus_t stat;
2557: cusparseHandle_t handle;
2560: if (*cusparsestruct) {
2561: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
2562: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
2563: delete (*cusparsestruct)->workVector;
2564: delete (*cusparsestruct)->rowoffsets_gpu;
2565: if ((handle = (*cusparsestruct)->handle)) {stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);}
2566: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2567: cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
2568: #endif
2569: PetscFree(*cusparsestruct);
2570: }
2571: return(0);
2572: }
2574: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
2575: {
2577: if (*mat) {
2578: delete (*mat)->values;
2579: delete (*mat)->column_indices;
2580: delete (*mat)->row_offsets;
2581: delete *mat;
2582: *mat = 0;
2583: }
2584: return(0);
2585: }
2587: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
2588: {
2589: cusparseStatus_t stat;
2590: PetscErrorCode ierr;
2593: if (*trifactor) {
2594: if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
2595: if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
2596: CsrMatrix_Destroy(&(*trifactor)->csrMat);
2597: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2598: cudaError_t cerr;
2599: if ((*trifactor)->solveBuffer) {cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
2600: if ((*trifactor)->csr2cscBuffer) {cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
2601: #endif
2602: delete *trifactor;
2603: *trifactor = 0;
2604: }
2605: return(0);
2606: }
2608: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
2609: {
2610: CsrMatrix *mat;
2611: cusparseStatus_t stat;
2612: cudaError_t err;
2615: if (*matstruct) {
2616: if ((*matstruct)->mat) {
2617: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
2618: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2619: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2620: #else
2621: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
2622: stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
2623: #endif
2624: } else {
2625: mat = (CsrMatrix*)(*matstruct)->mat;
2626: CsrMatrix_Destroy(&mat);
2627: }
2628: }
2629: if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
2630: delete (*matstruct)->cprowIndices;
2631: if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
2632: if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
2633: if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
2635: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2636: Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
2637: if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
2638: for (int i=0; i<3; i++) {
2639: if (mdata->cuSpMV[i].initialized) {
2640: err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
2641: stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
2642: stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
2643: }
2644: }
2645: #endif
2646: delete *matstruct;
2647: *matstruct = 0;
2648: }
2649: return(0);
2650: }
2652: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2653: {
2657: if (*trifactors) {
2658: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
2659: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
2660: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
2661: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
2662: delete (*trifactors)->rpermIndices;
2663: delete (*trifactors)->cpermIndices;
2664: delete (*trifactors)->workVector;
2665: (*trifactors)->rpermIndices = 0;
2666: (*trifactors)->cpermIndices = 0;
2667: (*trifactors)->workVector = 0;
2668: }
2669: return(0);
2670: }
2672: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2673: {
2674: PetscErrorCode ierr;
2675: cusparseHandle_t handle;
2676: cusparseStatus_t stat;
2679: if (*trifactors) {
2680: MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
2681: if ((handle = (*trifactors)->handle)) {
2682: stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
2683: }
2684: PetscFree(*trifactors);
2685: }
2686: return(0);
2687: }