Actual source code: aijcusparse.cu

petsc-3.14.0 2020-09-29
  1: /*
  2:   Defines the basic matrix operations for the AIJ (compressed row)
  3:   matrix storage format using the CUSPARSE library.
  4: */
  5: #define PETSC_SKIP_SPINLOCK
  6: #define PETSC_SKIP_CXX_COMPLEX_FIX
  7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

  9: #include <petscconf.h>
 10: #include <../src/mat/impls/aij/seq/aij.h>
 11: #include <../src/mat/impls/sbaij/seq/sbaij.h>
 12: #include <../src/vec/vec/impls/dvecimpl.h>
 13: #include <petsc/private/vecimpl.h>
 14: #undef VecType
 15: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>

 17: const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
 18: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
 19:   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we list them in
 20:     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command-line options for them.

 22:   typedef enum {
 23:       CUSPARSE_MV_ALG_DEFAULT = 0,
 24:       CUSPARSE_COOMV_ALG      = 1,
 25:       CUSPARSE_CSRMV_ALG1     = 2,
 26:       CUSPARSE_CSRMV_ALG2     = 3
 27:   } cusparseSpMVAlg_t;

 29:   typedef enum {
 30:       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
 31:       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
 32:       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
 33:       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
 34:       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
 35:       CUSPARSE_SPMM_ALG_DEFAULT = 0,
 36:       CUSPARSE_SPMM_COO_ALG1    = 1,
 37:       CUSPARSE_SPMM_COO_ALG2    = 2,
 38:       CUSPARSE_SPMM_COO_ALG3    = 3,
 39:       CUSPARSE_SPMM_COO_ALG4    = 5,
 40:       CUSPARSE_SPMM_CSR_ALG1    = 4,
 41:       CUSPARSE_SPMM_CSR_ALG2    = 6,
 42:   } cusparseSpMMAlg_t;

 44:   typedef enum {
 45:       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
 46:       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
 47:   } cusparseCsr2CscAlg_t;
 48:   */
 49:   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
 50:   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
 51:   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
 52: #endif

 54: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 55: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
 56: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 58: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 59: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
 60: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

 62: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
 63: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 64: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 65: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
 66: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
 67: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
 68: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 69: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 70: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 71: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
 72: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
 73: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

 75: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
 76: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
 77: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
 78: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
 79: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
 80: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

 82: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
 83: {
 84:   cusparseStatus_t   stat;
 85:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 88:   cusparsestruct->stream = stream;
 89:   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
 90:   return(0);
 91: }
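
An illustrative usage sketch (not part of this file; it assumes A is an assembled MATSEQAIJCUSPARSE matrix): create a CUDA stream and attach it so the matrix's cuSPARSE operations are queued on that stream.

   /* sketch: run this matrix's cuSPARSE work on a user-created stream */
   cudaStream_t stream;
   cudaError_t  cerr;
   cerr = cudaStreamCreate(&stream);CHKERRCUDA(cerr);
   MatCUSPARSESetStream(A,stream);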

 93: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
 94: {
 95:   cusparseStatus_t   stat;
 96:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

 99:   if (cusparsestruct->handle != handle) {
100:     if (cusparsestruct->handle) {
101:       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
102:     }
103:     cusparsestruct->handle = handle;
104:   }
105:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
106:   return(0);
107: }

109: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
110: {
111:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

114:   if (cusparsestruct->handle) cusparsestruct->handle = 0;
115:   return(0);
116: }
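
A companion sketch for the handle calls (assumption: the caller owns the handle's lifetime). MatCUSPARSESetHandle() above destroys any handle the matrix previously held and adopts the new one; MatCUSPARSEClearHandle() only forgets the pointer, so a caller-owned handle is not destroyed by the matrix.

   /* sketch: share a caller-owned cuSPARSE handle with the matrix */
   cusparseHandle_t handle;
   cusparseStatus_t stat;
   stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
   MatCUSPARSESetHandle(A,handle);
   /* ... use A ... */
   MatCUSPARSEClearHandle(A);                       /* detach before destroying the handle yourself */
   stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);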

118: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
119: {
121:   *type = MATSOLVERCUSPARSE;
122:   return(0);
123: }

125: /*MC
126:   MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
127:   on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. The currently supported
128:   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
129:   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
130:   CUSPARSE triangular solve algorithm. However, the performance can be quite poor, so these
131:   algorithms are not recommended. This class does NOT support direct solver operations.

133:   Level: beginner

135: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
136: M*/
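
A hedged usage sketch (ksp is assumed to exist elsewhere; error checking omitted): select this solver type for an ILU preconditioner.

   /* sketch: use cusparse triangular solves inside an ILU preconditioner */
   PC pc;
   KSPGetPC(ksp,&pc);
   PCSetType(pc,PCILU);
   PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);
   /* equivalently, on the command line: -pc_type ilu -pc_factor_mat_solver_type cusparse */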

138: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
139: {
141:   PetscInt       n = A->rmap->n;

144:   MatCreate(PetscObjectComm((PetscObject)A),B);
145:   MatSetSizes(*B,n,n,n,n);
146:   (*B)->factortype = ftype;
147:   (*B)->useordering = PETSC_TRUE;
148:   MatSetType(*B,MATSEQAIJCUSPARSE);

150:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
151:     MatSetBlockSizesFromMats(*B,A,A);
152:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
153:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
154:   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
155:     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
156:     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
157:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

159:   MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
160:   PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
161:   return(0);
162: }

164: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
165: {
166:   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

169:   switch (op) {
170:   case MAT_CUSPARSE_MULT:
171:     cusparsestruct->format = format;
172:     break;
173:   case MAT_CUSPARSE_ALL:
174:     cusparsestruct->format = format;
175:     break;
176:   default:
177:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
178:   }
179:   return(0);
180: }

182: /*@
183:    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
184:    operation. Only the MatMult operation can use different GPU storage formats
185:    for MPIAIJCUSPARSE matrices.
186:    Not Collective

188:    Input Parameters:
189: +  A - Matrix of type SEQAIJCUSPARSE
190: .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
191: -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, or MAT_CUSPARSE_HYB; the latter two require CUDA 4.2 or later)

195:    Level: intermediate

197: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
198: @*/
199: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
200: {

205:   PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
206:   return(0);
207: }
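
A one-line usage sketch (A is assumed to be of type MATSEQAIJCUSPARSE): keep the TriSolve data in CSR but store the SpMV data in ELL format.

   MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);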

209: /*@
210:    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

212:    Collective on mat

214:    Input Parameters:
215: +  A - Matrix of type SEQAIJCUSPARSE
216: -  transgen - the boolean flag

218:    Level: intermediate

220: .seealso: MATSEQAIJCUSPARSE
221: @*/
222: PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
223: {
225:   PetscBool      flg;

229:   PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);
230:   if (flg) {
231:     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

233:     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
234:     cusp->transgen = transgen;
235:     if (!transgen) { /* destroy the transpose matrix if present, to prevent logic errors if transgen is set to true later */
236:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);
237:     }
238:   }
239:   return(0);
240: }
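
A usage sketch (assumes A is assembled and not factored): enable explicit transpose generation before repeated transpose products, trading extra GPU memory for faster MatMultTranspose().

   MatSeqAIJCUSPARSESetGenerateTranspose(A,PETSC_TRUE);
   MatMultTranspose(A,x,y);   /* reuses the cached explicit transpose */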

242: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
243: {
244:   PetscErrorCode           ierr;
245:   MatCUSPARSEStorageFormat format;
246:   PetscBool                flg;
247:   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

250:   PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
251:   if (A->factortype == MAT_FACTOR_NONE) {
252:     PetscBool transgen = cusparsestruct->transgen;

254:     PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);
255:     if (flg) {MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);}

257:     PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
258:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
259:     if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);}

261:     PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
262:                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
263:     if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);}
264:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
265:     cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
266:     PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
267:                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);
268:     /* If the user set this option, check its consistency with cuSPARSE, since PetscOptionsEnum() assigns enum values by position in MatCUSPARSESpMVAlgorithms[] */
269:     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

271:     cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default; we only support a column-major dense matrix B */
272:     PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
273:                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);
274:     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

276:     cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
277:     PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
278:                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);
279:     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
280:    #endif
281:   }
282:   PetscOptionsTail();
283:   return(0);
284: }
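
The same options can be preset programmatically before MatSetFromOptions() runs; a sketch with illustrative values:

   PetscOptionsSetValue(NULL,"-mat_cusparse_transgen","true");
   PetscOptionsSetValue(NULL,"-mat_cusparse_storage_format","ell");
   MatSetFromOptions(A);   /* the handlers above pick these values up */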

286: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
287: {

291:   MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
292:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
293:   return(0);
294: }

296: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
297: {

301:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
302:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
303:   return(0);
304: }

306: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
307: {

311:   MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
312:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
313:   return(0);
314: }

316: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
317: {

321:   MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
322:   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
323:   return(0);
324: }

326: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
327: {
328:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
329:   PetscInt                          n = A->rmap->n;
330:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
331:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
332:   cusparseStatus_t                  stat;
333:   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
334:   const MatScalar                   *aa = a->a,*v;
335:   PetscInt                          *AiLo, *AjLo;
336:   PetscScalar                       *AALo;
337:   PetscInt                          i,nz, nzLower, offset, rowOffset;
338:   PetscErrorCode                    ierr;
339:   cudaError_t                       cerr;

342:   if (!n) return(0);
343:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
344:     try {
345:       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
346:       nzLower=n+ai[n]-ai[1];

348:       /* Allocate Space for the lower triangular matrix */
349:       cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
350:       cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
351:       cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

353:       /* Fill the lower triangular matrix */
354:       AiLo[0]  = (PetscInt) 0;
355:       AiLo[n]  = nzLower;
356:       AjLo[0]  = (PetscInt) 0;
357:       AALo[0]  = (MatScalar) 1.0;
358:       v        = aa;
359:       vi       = aj;
360:       offset   = 1;
361:       rowOffset= 1;
362:       for (i=1; i<n; i++) {
363:         nz = ai[i+1] - ai[i];
364:         /* additional 1 for the term on the diagonal */
365:         AiLo[i]    = rowOffset;
366:         rowOffset += nz+1;

368:         PetscArraycpy(&(AjLo[offset]), vi, nz);
369:         PetscArraycpy(&(AALo[offset]), v, nz);

371:         offset      += nz;
372:         AjLo[offset] = (PetscInt) i;
373:         AALo[offset] = (MatScalar) 1.0;
374:         offset      += 1;

376:         v  += nz;
377:         vi += nz;
378:       }

380:       /* allocate space for the triangular factor information */
381:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

383:       /* Create the matrix description */
384:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
385:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
386:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
387:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
388:      #else
389:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
390:      #endif
391:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
392:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

394:       /* set the operation */
395:       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

397:       /* set the matrix */
398:       loTriFactor->csrMat = new CsrMatrix;
399:       loTriFactor->csrMat->num_rows = n;
400:       loTriFactor->csrMat->num_cols = n;
401:       loTriFactor->csrMat->num_entries = nzLower;

403:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
404:       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

406:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
407:       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

409:       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
410:       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

412:       /* Create the solve analysis information */
413:       stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
414:     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
415:       stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
416:                                      loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
417:                                      loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
418:                                      loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
419:                                      &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
420:       cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
421:     #endif

423:       /* perform the solve analysis */
424:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
425:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
426:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
427:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
428:                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
429:                                ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
430:                              #endif
431:                               );CHKERRCUSPARSE(stat);

433:       /* assign the pointer. Is this really necessary? */
434:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

436:       cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
437:       cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
438:       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
439:       PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
440:     } catch(char *ex) {
441:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
442:     }
443:   }
444:   return(0);
445: }
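
To make the index arithmetic concrete, a hand-worked (hypothetical) case: n = 3 with strictly lower factor entries l10 in row 1 and l20, l21 in row 2, so ai = {0, 0, 1, 3}. Then nzLower = n + ai[n] - ai[1] = 6, and the loop assembles the unit-diagonal lower factor in CSR form:

   /* hypothetical 3x3 example of the host arrays built above:
      AiLo = {0, 1, 3, 6}
      AjLo = {0,   0, 1,   0, 1, 2}
      AALo = {1.0, l10, 1.0, l20, l21, 1.0}   (a unit diagonal is appended to each row) */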

447: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
448: {
449:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
450:   PetscInt                          n = A->rmap->n;
451:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
452:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
453:   cusparseStatus_t                  stat;
454:   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
455:   const MatScalar                   *aa = a->a,*v;
456:   PetscInt                          *AiUp, *AjUp;
457:   PetscScalar                       *AAUp;
458:   PetscInt                          i,nz, nzUpper, offset;
459:   PetscErrorCode                    ierr;
460:   cudaError_t                       cerr;

463:   if (!n) return(0);
464:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
465:     try {
466:       /* first, figure out the number of nonzeros in the upper triangular matrix. */
467:       nzUpper = adiag[0]-adiag[n];

469:       /* Allocate Space for the upper triangular matrix */
470:       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
471:       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
472:       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

474:       /* Fill the upper triangular matrix */
475:       AiUp[0]=(PetscInt) 0;
476:       AiUp[n]=nzUpper;
477:       offset = nzUpper;
478:       for (i=n-1; i>=0; i--) {
479:         v  = aa + adiag[i+1] + 1;
480:         vi = aj + adiag[i+1] + 1;

482:         /* number of elements NOT on the diagonal */
483:         nz = adiag[i] - adiag[i+1]-1;

485:         /* decrement the offset */
486:         offset -= (nz+1);

488:         /* first, set the diagonal elements */
489:         AjUp[offset] = (PetscInt) i;
490:         AAUp[offset] = (MatScalar)1./v[nz];
491:         AiUp[i]      = AiUp[i+1] - (nz+1);

493:         PetscArraycpy(&(AjUp[offset+1]), vi, nz);
494:         PetscArraycpy(&(AAUp[offset+1]), v, nz);
495:       }

497:       /* allocate space for the triangular factor information */
498:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

500:       /* Create the matrix description */
501:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
502:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
503:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
504:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
505:      #else
506:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
507:      #endif
508:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
509:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

511:       /* set the operation */
512:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

514:       /* set the matrix */
515:       upTriFactor->csrMat = new CsrMatrix;
516:       upTriFactor->csrMat->num_rows = n;
517:       upTriFactor->csrMat->num_cols = n;
518:       upTriFactor->csrMat->num_entries = nzUpper;

520:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
521:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

523:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
524:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

526:       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
527:       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

529:       /* Create the solve analysis information */
530:       stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
531:     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
532:       stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
533:                                    upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
534:                                    upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
535:                                    upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
536:                                    &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
537:       cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
538:     #endif

540:       /* perform the solve analysis */
541:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
542:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
543:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
544:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
545:                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
546:                                ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
547:                              #endif
548:                               );CHKERRCUSPARSE(stat);

550:       /* assign the pointer. Is this really necessary? */
551:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

553:       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
554:       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
555:       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
556:       PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
557:     } catch(char *ex) {
558:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
559:     }
560:   }
561:   return(0);
562: }

564: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
565: {
566:   PetscErrorCode               ierr;
567:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
568:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
569:   IS                           isrow = a->row,iscol = a->icol;
570:   PetscBool                    row_identity,col_identity;
571:   const PetscInt               *r,*c;
572:   PetscInt                     n = A->rmap->n;

575:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
576:   MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
577:   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);

579:   cusparseTriFactors->workVector = new THRUSTARRAY(n);
580:   cusparseTriFactors->nnz=a->nz;

582:   A->offloadmask = PETSC_OFFLOAD_BOTH;
583:   /* lower triangular indices */
584:   ISGetIndices(isrow,&r);
585:   ISIdentity(isrow,&row_identity);
586:   if (!row_identity) {
587:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
588:     cusparseTriFactors->rpermIndices->assign(r, r+n);
589:   }
590:   ISRestoreIndices(isrow,&r);

592:   /* upper triangular indices */
593:   ISGetIndices(iscol,&c);
594:   ISIdentity(iscol,&col_identity);
595:   if (!col_identity) {
596:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
597:     cusparseTriFactors->cpermIndices->assign(c, c+n);
598:   }

600:   if (!row_identity && !col_identity) {
601:     PetscLogCpuToGpu(2*n*sizeof(PetscInt));
602:   } else if (!row_identity) {
603:     PetscLogCpuToGpu(n*sizeof(PetscInt));
604:   } else if (!col_identity) {
605:     PetscLogCpuToGpu(n*sizeof(PetscInt));
606:   }

608:   ISRestoreIndices(iscol,&c);
609:   return(0);
610: }

612: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
613: {
614:   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
615:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
616:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
617:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
618:   cusparseStatus_t                  stat;
619:   PetscErrorCode                    ierr;
620:   cudaError_t                       cerr;
621:   PetscInt                          *AiUp, *AjUp;
622:   PetscScalar                       *AAUp;
623:   PetscScalar                       *AALo;
624:   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
625:   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
626:   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
627:   const MatScalar                   *aa = b->a,*v;

630:   if (!n) return(0);
631:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
632:     try {
633:       /* Allocate Space for the upper triangular matrix */
634:       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
635:       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
636:       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
637:       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

639:       /* Fill the upper triangular matrix */
640:       AiUp[0]=(PetscInt) 0;
641:       AiUp[n]=nzUpper;
642:       offset = 0;
643:       for (i=0; i<n; i++) {
644:         /* set the pointers */
645:         v  = aa + ai[i];
646:         vj = aj + ai[i];
647:         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

649:         /* first, set the diagonal elements */
650:         AjUp[offset] = (PetscInt) i;
651:         AAUp[offset] = (MatScalar)1.0/v[nz];
652:         AiUp[i]      = offset;
653:         AALo[offset] = (MatScalar)1.0/v[nz];

655:         offset+=1;
656:         if (nz>0) {
657:           PetscArraycpy(&(AjUp[offset]), vj, nz);
658:           PetscArraycpy(&(AAUp[offset]), v, nz);
659:           for (j=offset; j<offset+nz; j++) {
660:             AAUp[j] = -AAUp[j];
661:             AALo[j] = AAUp[j]/v[nz];
662:           }
663:           offset+=nz;
664:         }
665:       }

667:       /* allocate space for the triangular factor information */
668:       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

670:       /* Create the matrix description */
671:       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
672:       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
673:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
674:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
675:      #else
676:       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
677:      #endif
678:       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
679:       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

681:       /* set the matrix */
682:       upTriFactor->csrMat = new CsrMatrix;
683:       upTriFactor->csrMat->num_rows = A->rmap->n;
684:       upTriFactor->csrMat->num_cols = A->cmap->n;
685:       upTriFactor->csrMat->num_entries = a->nz;

687:       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
688:       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

690:       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
691:       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

693:       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
694:       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

696:       /* set the operation */
697:       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

699:       /* Create the solve analysis information */
700:       stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
701:     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
702:       stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
703:                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
704:                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
705:                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
706:                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
707:       cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
708:     #endif

710:       /* perform the solve analysis */
711:       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
712:                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
713:                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
714:                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
715:                               #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
716:                                ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
717:                               #endif
718:                               );CHKERRCUSPARSE(stat);

720:       /* assign the pointer. Is this really necessary? */
721:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

723:       /* allocate space for the triangular factor information */
724:       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

726:       /* Create the matrix description */
727:       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
728:       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
729:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
730:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
731:      #else
732:       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
733:      #endif
734:       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); /* the factor data is stored upper; the transpose solveOp below realizes the lower solve */
735:       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

737:       /* set the operation */
738:       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

740:       /* set the matrix */
741:       loTriFactor->csrMat = new CsrMatrix;
742:       loTriFactor->csrMat->num_rows = A->rmap->n;
743:       loTriFactor->csrMat->num_cols = A->cmap->n;
744:       loTriFactor->csrMat->num_entries = a->nz;

746:       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
747:       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

749:       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
750:       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

752:       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
753:       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
754:       PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));

756:       /* Create the solve analysis information */
757:       stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
758:     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
759:       stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
760:                                      loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
761:                                      loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
762:                                      loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
763:                                      &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
764:       cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
765:     #endif

767:       /* perform the solve analysis */
768:       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
769:                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
770:                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
771:                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
772:                               #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
773:                                ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
774:                               #endif
775:                               );CHKERRCUSPARSE(stat);

777:       /* assign the pointer. Is this really necessary? */
778:       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

780:       A->offloadmask = PETSC_OFFLOAD_BOTH;
781:       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
782:       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
783:       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
784:       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
785:     } catch(char *ex) {
786:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
787:     }
788:   }
789:   return(0);
790: }

792: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
793: {
794:   PetscErrorCode               ierr;
795:   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
796:   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
797:   IS                           ip = a->row;
798:   const PetscInt               *rip;
799:   PetscBool                    perm_identity;
800:   PetscInt                     n = A->rmap->n;

803:   MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
804:   MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
805:   cusparseTriFactors->workVector = new THRUSTARRAY(n);
806:   cusparseTriFactors->nnz=(a->nz-n)*2 + n;

808:   /* lower triangular indices */
809:   ISGetIndices(ip,&rip);
810:   ISIdentity(ip,&perm_identity);
811:   if (!perm_identity) {
812:     IS             iip;
813:     const PetscInt *irip;

815:     ISInvertPermutation(ip,PETSC_DECIDE,&iip);
816:     ISGetIndices(iip,&irip);
817:     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
818:     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
819:     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
820:     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
821:     ISRestoreIndices(iip,&irip);
822:     ISDestroy(&iip);
823:     PetscLogCpuToGpu(2*n*sizeof(PetscInt));
824:   }
825:   ISRestoreIndices(ip,&rip);
826:   return(0);
827: }

829: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
830: {
831:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
832:   IS             isrow = b->row,iscol = b->col;
833:   PetscBool      row_identity,col_identity;

837:   MatLUFactorNumeric_SeqAIJ(B,A,info);
838:   B->offloadmask = PETSC_OFFLOAD_CPU;
839:   /* determine which version of MatSolve needs to be used. */
840:   ISIdentity(isrow,&row_identity);
841:   ISIdentity(iscol,&col_identity);
842:   if (row_identity && col_identity) {
843:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
844:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
845:     B->ops->matsolve = NULL;
846:     B->ops->matsolvetranspose = NULL;
847:   } else {
848:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
849:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
850:     B->ops->matsolve = NULL;
851:     B->ops->matsolvetranspose = NULL;
852:   }

854:   /* get the triangular factors */
855:   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
856:   return(0);
857: }

859: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
860: {
861:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
862:   IS             ip = b->row;
863:   PetscBool      perm_identity;

867:   MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
868:   B->offloadmask = PETSC_OFFLOAD_CPU;
869:   /* determine which version of MatSolve needs to be used. */
870:   ISIdentity(ip,&perm_identity);
871:   if (perm_identity) {
872:     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
873:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
874:     B->ops->matsolve = NULL;
875:     B->ops->matsolvetranspose = NULL;
876:   } else {
877:     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
878:     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
879:     B->ops->matsolve = NULL;
880:     B->ops->matsolvetranspose = NULL;
881:   }

883:   /* get the triangular factors */
884:   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
885:   return(0);
886: }
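
Both numeric paths are normally reached through a preconditioner; a hedged sketch (ksp assumed to exist) requesting ICC with cusparse solves, keeping the natural ordering so the faster MatSolve_SeqAIJCUSPARSE_NaturalOrdering dispatch above is taken:

   PC pc;
   KSPGetPC(ksp,&pc);
   PCSetType(pc,PCICC);
   PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE);
   PCFactorSetMatOrderingType(pc,MATORDERINGNATURAL);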

888: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
889: {
890:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
891:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
892:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
893:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
894:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
895:   cusparseStatus_t                  stat;
896:   cusparseIndexBase_t               indexBase;
897:   cusparseMatrixType_t              matrixType;
898:   cusparseFillMode_t                fillMode;
899:   cusparseDiagType_t                diagType;


903:   /*********************************************/
904:   /* Now the Transpose of the Lower Tri Factor */
905:   /*********************************************/

907:   /* allocate space for the transpose of the lower triangular factor */
908:   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

910:   /* set the matrix descriptors of the lower triangular factor */
911:   matrixType = cusparseGetMatType(loTriFactor->descr);
912:   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
913:   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
914:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
915:   diagType = cusparseGetMatDiagType(loTriFactor->descr);

917:   /* Create the matrix description */
918:   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
919:   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
920:   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
921:   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
922:   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

924:   /* set the operation */
925:   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

927:   /* allocate GPU space for the CSC of the lower triangular factor */
928:   loTriFactorT->csrMat = new CsrMatrix;
929:   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
930:   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
931:   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
932:   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
933:   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
934:   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

936:   /* compute the transpose of the lower triangular factor, i.e. the CSC */
937: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
938:   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
939:                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
940:                                        loTriFactor->csrMat->values->data().get(),
941:                                        loTriFactor->csrMat->row_offsets->data().get(),
942:                                        loTriFactor->csrMat->column_indices->data().get(),
943:                                        loTriFactorT->csrMat->values->data().get(),
944:                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
945:                                        CUSPARSE_ACTION_NUMERIC,indexBase,
946:                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
947:   cudaError_t cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
948: #endif

950:   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
951:                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
952:                           loTriFactor->csrMat->values->data().get(),
953:                           loTriFactor->csrMat->row_offsets->data().get(),
954:                           loTriFactor->csrMat->column_indices->data().get(),
955:                           loTriFactorT->csrMat->values->data().get(),
956:                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
957:                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
958:                           CUSPARSE_ACTION_NUMERIC, indexBase,
959:                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
960:                         #else
961:                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
962:                           CUSPARSE_ACTION_NUMERIC, indexBase
963:                         #endif
964:                         );CHKERRCUSPARSE(stat);

966:   /* Create the solve analysis information */
967:   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
968: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
969:   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
970:                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
971:                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
972:                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
973:                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
974:   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
975: #endif

977:   /* perform the solve analysis */
978:   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
979:                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
980:                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
981:                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
982:                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
983:                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
984:                           #endif
985:                           );CHKERRCUSPARSE(stat);

987:   /* assign the pointer. Is this really necessary? */
988:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

990:   /*********************************************/
991:   /* Now the Transpose of the Upper Tri Factor */
992:   /*********************************************/

994:   /* allocate space for the transpose of the upper triangular factor */
995:   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

997:   /* set the matrix descriptors of the upper triangular factor */
998:   matrixType = cusparseGetMatType(upTriFactor->descr);
999:   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1000:   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1001:     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1002:   diagType = cusparseGetMatDiagType(upTriFactor->descr);

1004:   /* Create the matrix description */
1005:   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1006:   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1007:   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1008:   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1009:   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

1011:   /* set the operation */
1012:   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

1014:   /* allocate GPU space for the CSC of the upper triangular factor */
1015:   upTriFactorT->csrMat = new CsrMatrix;
1016:   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1017:   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1018:   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1019:   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1020:   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1021:   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

1023:   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1024: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1025:   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1026:                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1027:                                 upTriFactor->csrMat->values->data().get(),
1028:                                 upTriFactor->csrMat->row_offsets->data().get(),
1029:                                 upTriFactor->csrMat->column_indices->data().get(),
1030:                                 upTriFactorT->csrMat->values->data().get(),
1031:                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1032:                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1033:                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1034:   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1035: #endif

1037:   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1038:                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1039:                           upTriFactor->csrMat->values->data().get(),
1040:                           upTriFactor->csrMat->row_offsets->data().get(),
1041:                           upTriFactor->csrMat->column_indices->data().get(),
1042:                           upTriFactorT->csrMat->values->data().get(),
1043:                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1044:                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1045:                           CUSPARSE_ACTION_NUMERIC, indexBase,
1046:                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1047:                         #else
1048:                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1049:                           CUSPARSE_ACTION_NUMERIC, indexBase
1050:                         #endif
1051:                         );CHKERRCUSPARSE(stat);

1053:   /* Create the solve analysis information */
1054:   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1055:   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1056:   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1057:                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1058:                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1059:                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1060:                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1061:   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1062:   #endif

1064:   /* perform the solve analysis */
1065:   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1066:                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1067:                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1068:                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
1069:                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1070:                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1071:                           #endif
1072:                           );CHKERRCUSPARSE(stat);

1074:   /* assign the pointer. Is this really necessary? */
1075:   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1076:   return(0);
1077: }

1079: static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1080: {
1081:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1082:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1083:   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1084:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1085:   cusparseStatus_t             stat;
1086:   cusparseIndexBase_t          indexBase;
1087:   cudaError_t                  err;
1088:   PetscErrorCode               ierr;

1091:   if (!cusparsestruct->transgen || cusparsestruct->matTranspose) return(0);
1092:   PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1093:   PetscLogGpuTimeBegin();
1094:   /* create cusparse matrix */
1095:   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1096:   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1097:   indexBase = cusparseGetMatIndexBase(matstruct->descr);
1098:   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1099:   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

1101:   /* set alpha and beta */
1102:   err = cudaMalloc((void **)&(matstructT->alpha_one),    sizeof(PetscScalar));CHKERRCUDA(err);
1103:   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1104:   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1105:   err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1106:   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1107:   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1108:   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

1110:   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1111:     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1112:     CsrMatrix *matrixT= new CsrMatrix;
1113:     matrixT->num_rows = A->cmap->n;
1114:     matrixT->num_cols = A->rmap->n;
1115:     matrixT->num_entries = a->nz;
1116:     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1117:     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1118:     matrixT->values = new THRUSTARRAY(a->nz);

1120:     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
1121:     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

1123:     /* compute the transpose, i.e. the CSC */
1124:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1125:     stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1126:                                   A->cmap->n, matrix->num_entries,
1127:                                   matrix->values->data().get(),
1128:                                   cusparsestruct->rowoffsets_gpu->data().get(),
1129:                                   matrix->column_indices->data().get(),
1130:                                   matrixT->values->data().get(),
1131:                                   matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1132:                                   CUSPARSE_ACTION_NUMERIC,indexBase,
1133:                                   cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1134:     err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
1135:    #endif

1137:     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1138:                             A->cmap->n, matrix->num_entries,
1139:                             matrix->values->data().get(),
1140:                             cusparsestruct->rowoffsets_gpu->data().get(),
1141:                             matrix->column_indices->data().get(),
1142:                             matrixT->values->data().get(),
1143:                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1144:                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1145:                             CUSPARSE_ACTION_NUMERIC,indexBase,
1146:                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1147:                           #else
1148:                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1149:                             CUSPARSE_ACTION_NUMERIC, indexBase
1150:                           #endif
1151:                            );CHKERRCUSPARSE(stat);
1152:     matstructT->mat = matrixT;

1154:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1155:     stat = cusparseCreateCsr(&matstructT->matDescr,
1156:                              matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1157:                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1158:                              matrixT->values->data().get(),
1159:                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1160:                              indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1161:    #endif
1162:   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1163:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1164:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1165:    #else
1166:     CsrMatrix *temp  = new CsrMatrix;
1167:     CsrMatrix *tempT = new CsrMatrix;
1168:     /* First convert HYB to CSR */
1169:     temp->num_rows = A->rmap->n;
1170:     temp->num_cols = A->cmap->n;
1171:     temp->num_entries = a->nz;
1172:     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1173:     temp->column_indices = new THRUSTINTARRAY32(a->nz);
1174:     temp->values = new THRUSTARRAY(a->nz);

1176:     stat = cusparse_hyb2csr(cusparsestruct->handle,
1177:                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1178:                             temp->values->data().get(),
1179:                             temp->row_offsets->data().get(),
1180:                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);

1182:     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1183:     tempT->num_rows = A->rmap->n;
1184:     tempT->num_cols = A->cmap->n;
1185:     tempT->num_entries = a->nz;
1186:     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1187:     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1188:     tempT->values = new THRUSTARRAY(a->nz);

1190:     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1191:                             temp->num_cols, temp->num_entries,
1192:                             temp->values->data().get(),
1193:                             temp->row_offsets->data().get(),
1194:                             temp->column_indices->data().get(),
1195:                             tempT->values->data().get(),
1196:                             tempT->column_indices->data().get(),
1197:                             tempT->row_offsets->data().get(),
1198:                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

1200:     /* Last, convert CSC to HYB */
1201:     cusparseHybMat_t hybMat;
1202:     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1203:     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1204:       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1205:     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1206:                             matstructT->descr, tempT->values->data().get(),
1207:                             tempT->row_offsets->data().get(),
1208:                             tempT->column_indices->data().get(),
1209:                             hybMat, 0, partition);CHKERRCUSPARSE(stat);

1211:     /* assign the pointer */
1212:     matstructT->mat = hybMat;
1213:     /* delete temporaries */
1214:     if (tempT) {
1215:       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1216:       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1217:       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1218:       delete (CsrMatrix*) tempT;
1219:     }
1220:     if (temp) {
1221:       if (temp->values) delete (THRUSTARRAY*) temp->values;
1222:       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1223:       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1224:       delete (CsrMatrix*) temp;
1225:     }
1226:    #endif
1227:   }
1228:   err  = WaitForCUDA();CHKERRCUDA(err);
1229:   PetscLogGpuTimeEnd();
1230:   PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1231:   /* the compressed row indices are not used for matTranspose */
1232:   matstructT->cprowIndices = NULL;
1233:   /* assign the pointer */
1234:   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1235:   return(0);
1236: }

1238: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
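/* A plausible answer, not stated in this file: cuSPARSE's triangular-solve analysis info is
   tied to a specific op(A), so a transpose solve would need its own analysis pass anyway, and
   solving with CUSPARSE_OPERATION_TRANSPOSE on a CSR-stored factor is typically much slower
   than a non-transpose solve on an explicitly stored transpose. Hence the transposed factors
   are built once and cached in loTriFactorPtrTranspose/upTriFactorPtrTranspose. */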
1239: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1240: {
1241:   PetscInt                              n = xx->map->n;
1242:   const PetscScalar                     *barray;
1243:   PetscScalar                           *xarray;
1244:   thrust::device_ptr<const PetscScalar> bGPU;
1245:   thrust::device_ptr<PetscScalar>       xGPU;
1246:   cusparseStatus_t                      stat;
1247:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1248:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1249:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1250:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1251:   PetscErrorCode                        ierr;
1252:   cudaError_t                           cerr;

1255:   /* Analyze the matrix and create the transpose ... on the fly */
1256:   if (!loTriFactorT && !upTriFactorT) {
1257:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1258:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1259:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1260:   }

1262:   /* Get the GPU pointers */
1263:   VecCUDAGetArrayWrite(xx,&xarray);
1264:   VecCUDAGetArrayRead(bb,&barray);
1265:   xGPU = thrust::device_pointer_cast(xarray);
1266:   bGPU = thrust::device_pointer_cast(barray);

1268:   PetscLogGpuTimeBegin();
1269:   /* First, reorder with the row permutation */
1270:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1271:                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1272:                xGPU);

1274:   /* Next, solve U */
1275:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1276:                         upTriFactorT->csrMat->num_rows,
1277:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1278:                         upTriFactorT->csrMat->num_entries,
1279:                       #endif
1280:                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1281:                         upTriFactorT->csrMat->values->data().get(),
1282:                         upTriFactorT->csrMat->row_offsets->data().get(),
1283:                         upTriFactorT->csrMat->column_indices->data().get(),
1284:                         upTriFactorT->solveInfo,
1285:                         xarray, tempGPU->data().get()
1286:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1287:                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1288:                       #endif
1289:                         );CHKERRCUSPARSE(stat);

1291:   /* Then, solve L */
1292:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1293:                         loTriFactorT->csrMat->num_rows,
1294:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1295:                         loTriFactorT->csrMat->num_entries,
1296:                       #endif
1297:                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1298:                         loTriFactorT->csrMat->values->data().get(),
1299:                         loTriFactorT->csrMat->row_offsets->data().get(),
1300:                         loTriFactorT->csrMat->column_indices->data().get(),
1301:                         loTriFactorT->solveInfo,
1302:                         tempGPU->data().get(), xarray
1303:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1304:                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1305:                       #endif
1306:                         );CHKERRCUSPARSE(stat);

1308:   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1309:   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1310:                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1311:                tempGPU->begin());

1313:   /* Copy the temporary to the full solution. */
1314:   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

1316:   /* restore */
1317:   VecCUDARestoreArrayRead(bb,&barray);
1318:   VecCUDARestoreArrayWrite(xx,&xarray);
1319:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1320:   PetscLogGpuTimeEnd();
1321:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1322:   return(0);
1323: }

1325: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1326: {
1327:   const PetscScalar                 *barray;
1328:   PetscScalar                       *xarray;
1329:   cusparseStatus_t                  stat;
1330:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1331:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1332:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1333:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1334:   PetscErrorCode                    ierr;
1335:   cudaError_t                       cerr;

1338:   /* Analyze the matrix and create the transpose ... on the fly */
1339:   if (!loTriFactorT && !upTriFactorT) {
1340:     MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1341:     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1342:     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1343:   }

1345:   /* Get the GPU pointers */
1346:   VecCUDAGetArrayWrite(xx,&xarray);
1347:   VecCUDAGetArrayRead(bb,&barray);

1349:   PetscLogGpuTimeBegin();
1350:   /* First, solve U */
1351:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1352:                         upTriFactorT->csrMat->num_rows,
1353:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1354:                         upTriFactorT->csrMat->num_entries,
1355:                       #endif
1356:                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1357:                         upTriFactorT->csrMat->values->data().get(),
1358:                         upTriFactorT->csrMat->row_offsets->data().get(),
1359:                         upTriFactorT->csrMat->column_indices->data().get(),
1360:                         upTriFactorT->solveInfo,
1361:                         barray, tempGPU->data().get()
1362:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1363:                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1364:                       #endif
1365:                         );CHKERRCUSPARSE(stat);

1367:   /* Then, solve L */
1368:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1369:                         loTriFactorT->csrMat->num_rows,
1370:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1371:                         loTriFactorT->csrMat->num_entries,
1372:                       #endif
1373:                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1374:                         loTriFactorT->csrMat->values->data().get(),
1375:                         loTriFactorT->csrMat->row_offsets->data().get(),
1376:                         loTriFactorT->csrMat->column_indices->data().get(),
1377:                         loTriFactorT->solveInfo,
1378:                         tempGPU->data().get(), xarray
1379:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1380:                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1381:                       #endif
1382:                         );CHKERRCUSPARSE(stat);

1384:   /* restore */
1385:   VecCUDARestoreArrayRead(bb,&barray);
1386:   VecCUDARestoreArrayWrite(xx,&xarray);
1387:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1388:   PetscLogGpuTimeEnd();
1389:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1390:   return(0);
1391: }

1393: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1394: {
1395:   const PetscScalar                     *barray;
1396:   PetscScalar                           *xarray;
1397:   thrust::device_ptr<const PetscScalar> bGPU;
1398:   thrust::device_ptr<PetscScalar>       xGPU;
1399:   cusparseStatus_t                      stat;
1400:   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1401:   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1402:   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1403:   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1404:   PetscErrorCode                        ierr;
1405:   cudaError_t                           cerr;


1409:   /* Get the GPU pointers */
1410:   VecCUDAGetArrayWrite(xx,&xarray);
1411:   VecCUDAGetArrayRead(bb,&barray);
1412:   xGPU = thrust::device_pointer_cast(xarray);
1413:   bGPU = thrust::device_pointer_cast(barray);

1415:   PetscLogGpuTimeBegin();
1416:   /* First, reorder with the row permutation */
1417:   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1418:                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1419:                tempGPU->begin());

1421:   /* Next, solve L */
1422:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1423:                         loTriFactor->csrMat->num_rows,
1424:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1425:                         loTriFactor->csrMat->num_entries,
1426:                       #endif
1427:                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1428:                         loTriFactor->csrMat->values->data().get(),
1429:                         loTriFactor->csrMat->row_offsets->data().get(),
1430:                         loTriFactor->csrMat->column_indices->data().get(),
1431:                         loTriFactor->solveInfo,
1432:                         tempGPU->data().get(), xarray
1433:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1434:                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1435:                       #endif
1436:                         );CHKERRCUSPARSE(stat);

1438:   /* Then, solve U */
1439:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1440:                         upTriFactor->csrMat->num_rows,
1441:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1442:                         upTriFactor->csrMat->num_entries,
1443:                       #endif
1444:                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1445:                         upTriFactor->csrMat->values->data().get(),
1446:                         upTriFactor->csrMat->row_offsets->data().get(),
1447:                         upTriFactor->csrMat->column_indices->data().get(),
1448:                         upTriFactor->solveInfo,
1449:                         xarray, tempGPU->data().get()
1450:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1451:                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1452:                       #endif
1453:                         );CHKERRCUSPARSE(stat);

1455:   /* Last, reorder with the column permutation */
1456:   thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1457:                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1458:                xGPU);

1460:   VecCUDARestoreArrayRead(bb,&barray);
1461:   VecCUDARestoreArrayWrite(xx,&xarray);
1462:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1463:   PetscLogGpuTimeEnd();
1464:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1465:   return(0);
1466: }

1468: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1469: {
1470:   const PetscScalar                 *barray;
1471:   PetscScalar                       *xarray;
1472:   cusparseStatus_t                  stat;
1473:   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1474:   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1475:   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1476:   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1477:   PetscErrorCode                    ierr;
1478:   cudaError_t                       cerr;

1481:   /* Get the GPU pointers */
1482:   VecCUDAGetArrayWrite(xx,&xarray);
1483:   VecCUDAGetArrayRead(bb,&barray);

1485:   PetscLogGpuTimeBegin();
1486:   /* First, solve L */
1487:   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1488:                         loTriFactor->csrMat->num_rows,
1489:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1490:                         loTriFactor->csrMat->num_entries,
1491:                       #endif
1492:                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1493:                         loTriFactor->csrMat->values->data().get(),
1494:                         loTriFactor->csrMat->row_offsets->data().get(),
1495:                         loTriFactor->csrMat->column_indices->data().get(),
1496:                         loTriFactor->solveInfo,
1497:                         barray, tempGPU->data().get()
1498:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1499:                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1500:                       #endif
1501:                         );CHKERRCUSPARSE(stat);

1503:   /* Next, solve U */
1504:   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1505:                         upTriFactor->csrMat->num_rows,
1506:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1507:                         upTriFactor->csrMat->num_entries,
1508:                       #endif
1509:                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1510:                         upTriFactor->csrMat->values->data().get(),
1511:                         upTriFactor->csrMat->row_offsets->data().get(),
1512:                         upTriFactor->csrMat->column_indices->data().get(),
1513:                         upTriFactor->solveInfo,
1514:                         tempGPU->data().get(), xarray
1515:                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1516:                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1517:                       #endif
1518:                         );CHKERRCUSPARSE(stat);

1520:   VecCUDARestoreArrayRead(bb,&barray);
1521:   VecCUDARestoreArrayWrite(xx,&xarray);
1522:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1523:   PetscLogGpuTimeEnd();
1524:   PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1525:   return(0);
1526: }
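/* A hedged usage sketch (not from this file): the typical PETSc call chain that reaches the
   MatSolve_* routines above, assuming A is a MATSEQAIJCUSPARSE matrix and b, x are CUDA
   vectors. With the natural ordering the *_NaturalOrdering variants are dispatched and the
   permutation scatters are skipped. */
#if 0
  Mat           F;
  IS            isrow,iscol;
  MatFactorInfo info;
  MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_LU,&F);
  MatGetOrdering(A,MATORDERINGNATURAL,&isrow,&iscol);
  MatFactorInfoInitialize(&info);
  MatLUFactorSymbolic(F,A,isrow,iscol,&info);
  MatLUFactorNumeric(F,A,&info);
  MatSolve(F,b,x);          /* dispatches to MatSolve_SeqAIJCUSPARSE{_NaturalOrdering} */
  MatSolveTranspose(F,b,x); /* dispatches to MatSolveTranspose_SeqAIJCUSPARSE{_NaturalOrdering} */
#endif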

1528: static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1529: {
1530:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1531:   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1532:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1533:   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1534:   PetscErrorCode               ierr;
1535:   cusparseStatus_t             stat;
1536:   cudaError_t                  err;

1539:   if (A->boundtocpu) return(0);
1540:   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1541:     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1542:       /* Copy values only */
1543:       CsrMatrix *matrix,*matrixT;
1544:       matrix = (CsrMatrix*)cusparsestruct->mat->mat;

1546:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1547:       matrix->values->assign(a->a, a->a+a->nz);
1548:       err  = WaitForCUDA();CHKERRCUDA(err);
1549:       PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1550:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);

1552:       /* Update the transpose matT if it has already been built */
1553:       if (cusparsestruct->matTranspose) {
1554:         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1555:         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1556:         PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1557:         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1558:                             A->cmap->n, matrix->num_entries,
1559:                             matrix->values->data().get(),
1560:                             cusparsestruct->rowoffsets_gpu->data().get(),
1561:                             matrix->column_indices->data().get(),
1562:                             matrixT->values->data().get(),
1563:                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1564:                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1565:                             CUSPARSE_ACTION_NUMERIC,indexBase,
1566:                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1567:                           #else
1568:                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1569:                             CUSPARSE_ACTION_NUMERIC, indexBase
1570:                           #endif
1571:                            );CHKERRCUSPARSE(stat);
1572:         err  = WaitForCUDA();CHKERRCUDA(err);
1573:         PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1574:       }
1575:     } else {
1576:       PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1577:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1578:       MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);
1579:       delete cusparsestruct->workVector;
1580:       delete cusparsestruct->rowoffsets_gpu;
1581:       try {
1582:         if (a->compressedrow.use) {
1583:           m    = a->compressedrow.nrows;
1584:           ii   = a->compressedrow.i;
1585:           ridx = a->compressedrow.rindex;
1586:         } else {
1587:           m    = A->rmap->n;
1588:           ii   = a->i;
1589:           ridx = NULL;
1590:         }
1591:         cusparsestruct->nrows = m;

1593:         /* create cusparse matrix */
1594:         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1595:         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1596:         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1597:         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

1599:         err = cudaMalloc((void **)&(matstruct->alpha_one),    sizeof(PetscScalar));CHKERRCUDA(err);
1600:         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1601:         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1602:         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1603:         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1604:         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1605:         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

1607:         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1608:         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1609:           /* set the matrix */
1610:           CsrMatrix *mat= new CsrMatrix;
1611:           mat->num_rows = m;
1612:           mat->num_cols = A->cmap->n;
1613:           mat->num_entries = a->nz;
1614:           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1615:           mat->row_offsets->assign(ii, ii + m+1);

1617:           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1618:           mat->column_indices->assign(a->j, a->j+a->nz);

1620:           mat->values = new THRUSTARRAY(a->nz);
1621:           mat->values->assign(a->a, a->a+a->nz);

1623:           /* assign the pointer */
1624:           matstruct->mat = mat;
1625:          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1626:           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1627:             stat = cusparseCreateCsr(&matstruct->matDescr,
1628:                                     mat->num_rows, mat->num_cols, mat->num_entries,
1629:                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1630:                                     mat->values->data().get(),
1631:                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1632:                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1633:           }
1634:          #endif
1635:         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1636:          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1637:           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1638:          #else
1639:           CsrMatrix *mat= new CsrMatrix;
1640:           mat->num_rows = m;
1641:           mat->num_cols = A->cmap->n;
1642:           mat->num_entries = a->nz;
1643:           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1644:           mat->row_offsets->assign(ii, ii + m+1);

1646:           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1647:           mat->column_indices->assign(a->j, a->j+a->nz);

1649:           mat->values = new THRUSTARRAY(a->nz);
1650:           mat->values->assign(a->a, a->a+a->nz);

1652:           cusparseHybMat_t hybMat;
1653:           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1654:           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1655:             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1656:           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1657:               matstruct->descr, mat->values->data().get(),
1658:               mat->row_offsets->data().get(),
1659:               mat->column_indices->data().get(),
1660:               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1661:           /* assign the pointer */
1662:           matstruct->mat = hybMat;

1664:           if (mat) {
1665:             if (mat->values) delete (THRUSTARRAY*)mat->values;
1666:             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1667:             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1668:             delete (CsrMatrix*)mat;
1669:           }
1670:          #endif
1671:         }

1673:         /* assign the compressed row indices */
1674:         if (a->compressedrow.use) {
1675:           cusparsestruct->workVector = new THRUSTARRAY(m);
1676:           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1677:           matstruct->cprowIndices->assign(ridx,ridx+m);
1678:           tmp = m;
1679:         } else {
1680:           cusparsestruct->workVector = NULL;
1681:           matstruct->cprowIndices    = NULL;
1682:           tmp = 0;
1683:         }
1684:         PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));

1686:         /* assign the pointer */
1687:         cusparsestruct->mat = matstruct;
1688:       } catch(char *ex) {
1689:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1690:       }
1691:       err  = WaitForCUDA();CHKERRCUDA(err);
1692:       PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1693:       cusparsestruct->nonzerostate = A->nonzerostate;
1694:     }
1695:     A->offloadmask = PETSC_OFFLOAD_BOTH;
1696:   }
1697:   return(0);
1698: }
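/* CSR layout recap (illustrative only) for the device arrays populated above. For the 3x3 matrix
     A = [10  0 20]
         [ 0 30  0]
         [40  0 50]
   the arrays are
     row_offsets    = [0, 2, 3, 5]
     column_indices = [0, 2, 1, 0, 2]
     values         = [10, 20, 30, 40, 50]
   When a->compressedrow.use is set, only the nonzero rows are stored and
   matstruct->cprowIndices maps each compressed row back to its full row index. */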

1700: struct VecCUDAPlusEquals
1701: {
1702:   template <typename Tuple>
1703:   __host__ __device__
1704:   void operator()(Tuple t)
1705:   {
1706:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1707:   }
1708: };

1710: struct VecCUDAEqualsReverse
1711: {
1712:   template <typename Tuple>
1713:   __host__ __device__
1714:   void operator()(Tuple t)
1715:   {
1716:     thrust::get<0>(t) = thrust::get<1>(t);
1717:   }
1718: };
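/* A minimal sketch (illustrative, not part of PETSc) of how the two functors above are used
   with thrust zip/permutation iterators further below: VecCUDAEqualsReverse gathers the full
   vector x into the short work vector w (w[i] = x[cprow[i]]), and VecCUDAPlusEquals
   scatter-adds w back into the full vector z (z[cprow[i]] += w[i]). The names w, z, and cprow
   are placeholders. */
#if 0
static void CompressedRowScatterAddSketch(THRUSTARRAY &w, thrust::device_ptr<PetscScalar> z, THRUSTINTARRAY &cprow)
{
  thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(w.begin(), thrust::make_permutation_iterator(z, cprow.begin()))),
                   thrust::make_zip_iterator(thrust::make_tuple(w.begin(), thrust::make_permutation_iterator(z, cprow.begin()))) + cprow.size(),
                   VecCUDAPlusEquals()); /* z[cprow[i]] += w[i] for each compressed row i */
}
#endif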

1720: struct MatMatCusparse {
1721:   PetscBool            cisdense;
1722:   PetscScalar          *Bt;
1723:   Mat                  X;
1724: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1725:   PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
1726:   cusparseDnMatDescr_t matBDescr;
1727:   cusparseDnMatDescr_t matCDescr;
1728:   size_t               spmmBufferSize;
1729:   void                 *spmmBuffer;
1730:   PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
1731: #endif
1732: };

1734: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1735: {
1737:   MatMatCusparse *mmdata = (MatMatCusparse *)data;
1738:   cudaError_t    cerr;

1741:   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1742:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1743:   cusparseStatus_t stat;
1744:   if (mmdata->matBDescr)  {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);}
1745:   if (mmdata->matCDescr)  {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);}
1746:   if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1747:  #endif
1748:   MatDestroy(&mmdata->X);
1749:   PetscFree(data);
1750:   return(0);
1751: }

1753: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

1755: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1756: {
1757:   Mat_Product                  *product = C->product;
1758:   Mat                          A,B;
1759:   PetscInt                     m,n,blda,clda;
1760:   PetscBool                    flg,biscuda;
1761:   Mat_SeqAIJCUSPARSE           *cusp;
1762:   cusparseStatus_t             stat;
1763:   cusparseOperation_t          opA;
1764:   const PetscScalar            *barray;
1765:   PetscScalar                  *carray;
1766:   PetscErrorCode               ierr;
1767:   MatMatCusparse               *mmdata;
1768:   Mat_SeqAIJCUSPARSEMultStruct *mat;
1769:   CsrMatrix                    *csrmat;
1770:   cudaError_t                  cerr;

1773:   MatCheckProduct(C,1);
1774:   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1775:   mmdata = (MatMatCusparse*)product->data;
1776:   A    = product->A;
1777:   B    = product->B;
1778:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1779:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1780:   /* currently CopyToGpu does not copy if the matrix is bound to the CPU.
1781:      Instead of silently accepting a wrong answer, we prefer to raise an error */
1782:   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1783:   MatSeqAIJCUSPARSECopyToGPU(A);
1784:   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1785:   switch (product->type) {
1786:   case MATPRODUCT_AB:
1787:   case MATPRODUCT_PtAP:
1788:     mat = cusp->mat;
1789:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1790:     m   = A->rmap->n;
1791:     n   = B->cmap->n;
1792:     break;
1793:   case MATPRODUCT_AtB:
1794:     if (!cusp->transgen) {
1795:       mat = cusp->mat;
1796:       opA = CUSPARSE_OPERATION_TRANSPOSE;
1797:     } else {
1798:       MatSeqAIJCUSPARSEGenerateTransposeForMult(A);
1799:       mat  = cusp->matTranspose;
1800:       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1801:     }
1802:     m = A->cmap->n;
1803:     n = B->cmap->n;
1804:     break;
1805:   case MATPRODUCT_ABt:
1806:   case MATPRODUCT_RARt:
1807:     mat = cusp->mat;
1808:     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1809:     m   = A->rmap->n;
1810:     n   = B->rmap->n;
1811:     break;
1812:   default:
1813:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1814:   }
1815:   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1816:   csrmat = (CsrMatrix*)mat->mat;
1817:   /* if the user passed a CPU matrix, copy the data to the GPU */
1818:   PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
1819:   if (!biscuda) {MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);}
1820:   MatDenseCUDAGetArrayRead(B,&barray);

1822:   MatDenseGetLDA(B,&blda);
1823:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1824:     MatDenseCUDAGetArrayWrite(mmdata->X,&carray);
1825:     MatDenseGetLDA(mmdata->X,&clda);
1826:   } else {
1827:     MatDenseCUDAGetArrayWrite(C,&carray);
1828:     MatDenseGetLDA(C,&clda);
1829:   }

1831:   PetscLogGpuTimeBegin();
1832:  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1833:   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
1834:   /* (re)allocate spmmBuffer if not initialized or LDAs are different */
1835:   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
1836:     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
1837:     if (!mmdata->matBDescr) {
1838:       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1839:       mmdata->Blda = blda;
1840:     }

1842:     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
1843:     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
1844:       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1845:       mmdata->Clda = clda;
1846:     }

1848:     if (!mat->matDescr) {
1849:       stat = cusparseCreateCsr(&mat->matDescr,
1850:                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
1851:                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
1852:                               csrmat->values->data().get(),
1853:                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1854:                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1855:     }
1856:     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
1857:                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
1858:                                    mmdata->matCDescr,cusparse_scalartype,
1859:                                    cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
1860:     if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1861:     cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
1862:     mmdata->initialized = PETSC_TRUE;
1863:   } else {
1864:     /* to be safe, always update pointers of the mats */
1865:     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
1866:     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
1867:     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
1868:   }

1870:   /* do cusparseSpMM, which supports transpose on B */
1871:   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
1872:                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
1873:                       mmdata->matCDescr,cusparse_scalartype,
1874:                       cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
1875:  #else
1876:   PetscInt k;
1877:   /* cusparseXcsrmm does not support transpose on B */
1878:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1879:     cublasHandle_t cublasv2handle;
1880:     cublasStatus_t cerr;

1882:     PetscCUBLASGetHandle(&cublasv2handle);
1883:     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
1884:                        B->cmap->n,B->rmap->n,
1885:                        &PETSC_CUSPARSE_ONE ,barray,blda,
1886:                        &PETSC_CUSPARSE_ZERO,barray,blda,
1887:                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
1888:     blda = B->cmap->n;
1889:     k    = B->cmap->n;
1890:   } else {
1891:     k    = B->rmap->n;
1892:   }

1894:   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
1895:   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
1896:                            csrmat->num_entries,mat->alpha_one,mat->descr,
1897:                            csrmat->values->data().get(),
1898:                            csrmat->row_offsets->data().get(),
1899:                            csrmat->column_indices->data().get(),
1900:                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
1901:                            carray,clda);CHKERRCUSPARSE(stat);
1902:  #endif
1903:   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1904:   PetscLogGpuTimeEnd();
1905:   PetscLogGpuFlops(n*2.0*csrmat->num_entries);
1906:   MatDenseCUDARestoreArrayRead(B,&barray);
1907:   if (product->type == MATPRODUCT_RARt) {
1908:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
1909:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);
1910:   } else if (product->type == MATPRODUCT_PtAP) {
1911:     MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
1912:     MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);
1913:   } else {
1914:     MatDenseCUDARestoreArrayWrite(C,&carray);
1915:   }
1916:   if (mmdata->cisdense) {
1917:     MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);
1918:   }
1919:   if (!biscuda) {
1920:     MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
1921:   }
1922:   return(0);
1923: }

1925: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1926: {
1927:   Mat_Product        *product = C->product;
1928:   Mat                A,B;
1929:   PetscInt           m,n;
1930:   PetscBool          cisdense,flg;
1931:   PetscErrorCode     ierr;
1932:   MatMatCusparse     *mmdata;
1933:   Mat_SeqAIJCUSPARSE *cusp;

1936:   MatCheckProduct(C,1);
1937:   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1938:   A    = product->A;
1939:   B    = product->B;
1940:   PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
1941:   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1942:   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1943:   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
1944:   switch (product->type) {
1945:   case MATPRODUCT_AB:
1946:     m = A->rmap->n;
1947:     n = B->cmap->n;
1948:     break;
1949:   case MATPRODUCT_AtB:
1950:     m = A->cmap->n;
1951:     n = B->cmap->n;
1952:     break;
1953:   case MATPRODUCT_ABt:
1954:     m = A->rmap->n;
1955:     n = B->rmap->n;
1956:     break;
1957:   case MATPRODUCT_PtAP:
1958:     m = B->cmap->n;
1959:     n = B->cmap->n;
1960:     break;
1961:   case MATPRODUCT_RARt:
1962:     m = B->rmap->n;
1963:     n = B->rmap->n;
1964:     break;
1965:   default:
1966:     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1967:   }
1968:   MatSetSizes(C,m,n,m,n);
1969:   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
1970:   PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);
1971:   MatSetType(C,MATSEQDENSECUDA);

1973:   /* product data */
1974:   PetscNew(&mmdata);
1975:   mmdata->cisdense = cisdense;
1976:  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
1977:   /* cusparseXcsrmm does not support transpose on B, so we allocate a buffer to store B^T */
1978:   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1979:     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
1980:   }
1981:  #endif
1982:   /* for these products we need intermediate storage */
1983:   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1984:     MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);
1985:     MatSetType(mmdata->X,MATSEQDENSECUDA);
1986:     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
1987:       MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);
1988:     } else {
1989:       MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);
1990:     }
1991:   }
1992:   C->product->data    = mmdata;
1993:   C->product->destroy = MatDestroy_MatMatCusparse;

1995:   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
1996:   return(0);
1997: }

1999: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

2001: /* handles dense B */
2002: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
2003: {
2004:   Mat_Product    *product = C->product;

2008:   MatCheckProduct(C,1);
2009:   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
2010:   if (product->A->boundtocpu) {
2011:     MatProductSetFromOptions_SeqAIJ_SeqDense(C);
2012:     return(0);
2013:   }
2014:   switch (product->type) {
2015:   case MATPRODUCT_AB:
2016:   case MATPRODUCT_AtB:
2017:   case MATPRODUCT_ABt:
2018:   case MATPRODUCT_PtAP:
2019:   case MATPRODUCT_RARt:
2020:     C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2021:   default:
2022:     break;
2023:   }
2024:   return(0);
2025: }
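/* A hedged usage sketch (not from this file): the MatProduct sequence that exercises the
   symbolic/numeric routines above, assuming A is MATSEQAIJCUSPARSE and B is dense. */
#if 0
  Mat C;
  MatProductCreate(A,B,NULL,&C);
  MatProductSetType(C,MATPRODUCT_AB);
  MatProductSetFromOptions(C); /* -> MatProductSetFromOptions_SeqAIJCUSPARSE */
  MatProductSymbolic(C);       /* -> MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA */
  MatProductNumeric(C);        /* -> MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA */
  MatDestroy(&C);
#endif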

2027: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2028: {

2032:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
2033:   return(0);
2034: }

2036: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2037: {

2041:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
2042:   return(0);
2043: }

2045: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2046: {

2050:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
2051:   return(0);
2052: }

2054: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2055: {

2059:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
2060:   return(0);
2061: }

2063: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2064: {

2068:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
2069:   return(0);
2070: }

2072: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2073: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2074: {
2075:   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2076:   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2077:   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2078:   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2079:   PetscErrorCode               ierr;
2080:   cudaError_t                  cerr;
2081:   cusparseStatus_t             stat;
2082:   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2083:   PetscBool                    compressed;
2084: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2085:   PetscInt                     nx,ny;
2086: #endif

2089:   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2090:   if (!a->nonzerorowcnt) {
2091:     if (!yy) {VecSet_SeqCUDA(zz,0);}
2092:     return(0);
2093:   }
2094:   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2095:   MatSeqAIJCUSPARSECopyToGPU(A);
2096:   if (!trans) {
2097:     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2098:   } else {
2099:     if (herm || !cusparsestruct->transgen) {
2100:       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2101:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2102:     } else {
2103:       if (!cusparsestruct->matTranspose) {MatSeqAIJCUSPARSEGenerateTransposeForMult(A);}
2104:       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2105:     }
2106:   }
2107:   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2108:   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

2110:   try {
2111:     VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);
2112:     if (yy == zz) {VecCUDAGetArray(zz,&zarray);} /* read & write zz, so need to get uptodate zarray on GPU */
2113:     else {VecCUDAGetArrayWrite(zz,&zarray);} /* write zz, so no need to init zarray on GPU */

2115:     PetscLogGpuTimeBegin();
2116:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2117:       /* z = A x + beta y.
2118:          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2119:          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2120:       */
2121:       xptr = xarray;
2122:       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2123:       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2124:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2125:       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2126:           allocated to accommodate different uses. So we get the length info directly from mat.
2127:        */
2128:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2129:         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2130:         nx = mat->num_cols;
2131:         ny = mat->num_rows;
2132:       }
2133:      #endif
2134:     } else {
2135:       /* z = A^T x + beta y
2136:          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2137:          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2138:        */
2139:       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2140:       dptr = zarray;
2141:       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2142:       if (compressed) { /* Scatter x to work vector */
2143:         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2144:         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2145:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2146:                          VecCUDAEqualsReverse());
2147:       }
2148:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2149:       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2150:         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2151:         nx = mat->num_rows;
2152:         ny = mat->num_cols;
2153:       }
2154:      #endif
2155:     }

2157:     /* csr_spmv does y = alpha op(A) x + beta y */
2158:     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2159:      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2160:       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2161:       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2162:         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2163:         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2164:         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2165:                                 matstruct->matDescr,
2166:                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2167:                                 matstruct->cuSpMV[opA].vecYDescr,
2168:                                 cusparse_scalartype,
2169:                                 cusparsestruct->spmvAlg,
2170:                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2171:         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

2173:         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2174:       } else {
2175:         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2176:         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2177:         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2178:       }

2180:       stat = cusparseSpMV(cusparsestruct->handle, opA,
2181:                                matstruct->alpha_one,
2182:                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2183:                                matstruct->cuSpMV[opA].vecXDescr,
2184:                                beta,
2185:                                matstruct->cuSpMV[opA].vecYDescr,
2186:                                cusparse_scalartype,
2187:                                cusparsestruct->spmvAlg,
2188:                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2189:      #else
2190:       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2191:       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2192:                                mat->num_rows, mat->num_cols,
2193:                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2194:                                mat->values->data().get(), mat->row_offsets->data().get(),
2195:                                mat->column_indices->data().get(), xptr, beta,
2196:                                dptr);CHKERRCUSPARSE(stat);
2197:      #endif
2198:     } else {
2199:       if (cusparsestruct->nrows) {
2200:        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2201:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2202:        #else
2203:         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2204:         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2205:                                  matstruct->alpha_one, matstruct->descr, hybMat,
2206:                                  xptr, beta,
2207:                                  dptr);CHKERRCUSPARSE(stat);
2208:        #endif
2209:       }
2210:     }
2211:     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2212:     PetscLogGpuTimeEnd();

2214:     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2215:       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2216:         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2217:           VecCopy_SeqCUDA(yy,zz); /* zz = yy */
2218:         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2219:           VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
2220:         }
2221:       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2222:         VecSet_SeqCUDA(zz,0);
2223:       }

2225:       /* ScatterAdd the result from the work vector into the full vector when A is compressed */
2226:       if (compressed) {
2227:         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

2229:         PetscLogGpuTimeBegin();
2230:         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2231:                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2232:                          VecCUDAPlusEquals());
2233:         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2234:         PetscLogGpuTimeEnd();
2235:       }
2236:     } else {
2237:       if (yy && yy != zz) {
2238:         VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
2239:       }
2240:     }
2241:     VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);
2242:     if (yy == zz) {VecCUDARestoreArray(zz,&zarray);}
2243:     else {VecCUDARestoreArrayWrite(zz,&zarray);}
2244:   } catch(char *ex) {
2245:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2246:   }
2247:   if (yy) {
2248:     PetscLogGpuFlops(2.0*a->nz);
2249:   } else {
2250:     PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);
2251:   }
2252:   return(0);
2253: }
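/* For reference, a standalone sketch of the cuSPARSE generic SpMV pattern used in
   MatMultAddKernel_SeqAIJCUSPARSE() above: the dense vector descriptors and the work
   buffer are built once on demand, and only the data pointers are refreshed on later
   calls. This is an illustrative sketch under stated assumptions (CUDA >= 11, double
   precision, a CSR matrix already wrapped in matA, host pointer mode, error checking
   omitted); it is not part of this file. */
#if 0
#include <cuda_runtime.h>
#include <cusparse.h>

static void spmv_cached(cusparseHandle_t handle, cusparseSpMatDescr_t matA,
                        int64_t nrows, int64_t ncols, double *d_x, double *d_y)
{
  cusparseDnVecDescr_t vecX, vecY;
  size_t               bufferSize = 0;
  void                 *buffer = NULL;
  double               alpha = 1.0, beta = 0.0;

  cusparseCreateDnVec(&vecX, ncols, d_x, CUDA_R_64F); /* x spans op(A)'s columns */
  cusparseCreateDnVec(&vecY, nrows, d_y, CUDA_R_64F); /* y spans op(A)'s rows    */
  cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX,
                          &beta, vecY, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize);
  cudaMalloc(&buffer, bufferSize); /* cached across calls in the code above */
  cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX,
               &beta, vecY, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, buffer);
  /* On a later call with the same shapes, only the pointers need refreshing:
     cusparseDnVecSetValues(vecX, d_x); cusparseDnVecSetValues(vecY, d_y); then cusparseSpMV() again */
  cudaFree(buffer);
  cusparseDestroyDnVec(vecX);
  cusparseDestroyDnVec(vecY);
}
#endif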

2255: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2256: {

2260:   MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
2261:   return(0);
2262: }
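/* Similarly, a standalone sketch of the Thrust scatter-add idiom used above to merge the
   compressed work vector into the full result: for each k, z[rows[k]] += w[k], written as
   a for_each over a zip of (w[k], z[rows[k]]). Illustrative only (assumes double in place
   of PetscScalar); VecCUDAPlusEquals in the code above plays the role of PlusEquals here. */
#if 0
#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/tuple.h>

struct PlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) { thrust::get<1>(t) += thrust::get<0>(t); }
};

static void scatter_add(const thrust::device_vector<double> &w,    /* compressed results */
                        const thrust::device_vector<int>    &rows, /* target indices     */
                        thrust::device_vector<double>       &z)    /* full output vector */
{
  auto first = thrust::make_zip_iterator(thrust::make_tuple(
                 w.begin(), thrust::make_permutation_iterator(z.begin(), rows.begin())));
  thrust::for_each(first, first + rows.size(), PlusEquals());
}
#endif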

2264: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
2265: {
2266:   PetscErrorCode              ierr;
2267:   PetscSplitCSRDataStructure  *d_mat = NULL, h_mat;
2268:   PetscBool                   is_seq = PETSC_TRUE;
2269:   PetscInt                    nnz_state = A->nonzerostate;
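  /* Editorial note: default to the host's nonzerostate; if a device-side matrix exists, the
     state fetched from it below tells us whether the GPU copy has changed since last assembly */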
2271:   if (A->factortype == MAT_FACTOR_NONE) {
2272:     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2273:   }
2274:   if (d_mat) {
2275:     cudaError_t err;
2276:     PetscInfo(A,"Assemble device matrix\n");
2277:     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2278:     nnz_state = h_mat.nonzerostate;
2279:     is_seq = h_mat.seq;
2280:   }
2281:   MatAssemblyEnd_SeqAIJ(A,mode); // does very little if the matrix was assembled on the GPU, but we still call it to finalize the host-side data
2282:   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) return(0);
2283:   if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU even though equipped for GPU
2284:     MatSeqAIJCUSPARSECopyToGPU(A);
2285:   } else if (nnz_state > A->nonzerostate) {
2286:     A->offloadmask = PETSC_OFFLOAD_GPU;
2287:   }

2289:   return(0);
2290: }

2292: /* --------------------------------------------------------------------------------*/
2293: /*@
2294:    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2295:    (the default parallel PETSc format). This matrix will ultimately be pushed down
2296:    to NVIDIA GPUs and will use the CUSPARSE library for calculations. For good matrix
2297:    assembly performance the user should preallocate the matrix storage by setting
2298:    the parameter nz (or the array nnz).  By setting these parameters accurately,
2299:    performance during matrix assembly can be increased by more than a factor of 50.

2301:    Collective

2303:    Input Parameters:
2304: +  comm - MPI communicator, set to PETSC_COMM_SELF
2305: .  m - number of rows
2306: .  n - number of columns
2307: .  nz - number of nonzeros per row (same for all rows)
2308: -  nnz - array containing the number of nonzeros in the various rows
2309:          (possibly different for each row) or NULL

2311:    Output Parameter:
2312: .  A - the matrix

2314:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2315:    MatXXXXSetPreallocation() paradigm instead of this routine directly.
2316:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

2318:    Notes:
2319:    If nnz is given then nz is ignored

2321:    The AIJ format (also called the Yale sparse matrix format or
2322:    compressed row storage), is fully compatible with standard Fortran 77
2323:    storage.  That is, the stored row and column indices can begin at
2324:    either one (as in Fortran) or zero.  See the users' manual for details.

2326:    Specify the preallocated storage with either nz or nnz (not both).
2327:    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2328:    allocation.  For large problems you MUST preallocate memory or you
2329:    will get TERRIBLE performance, see the users' manual chapter on matrices.

2331:    By default, this format uses inodes (identical nodes) when possible, to
2332:    improve numerical efficiency of matrix-vector products and solves. We
2333:    search for consecutive rows with the same nonzero structure, thereby
2334:    reusing matrix information to achieve increased efficiency.

2336:    Level: intermediate

2338: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2339: @*/
2340: PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
2341: {

2345:   MatCreate(comm,A);
2346:   MatSetSizes(*A,m,n,m,n);
2347:   MatSetType(*A,MATSEQAIJCUSPARSE);
2348:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
2349:   return(0);
2350: }
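/* A minimal usage sketch of the recommended paradigm (illustrative; m, n, and nz are
   assumed to be defined by the caller):

     Mat A;
     MatCreate(PETSC_COMM_SELF,&A);
     MatSetSizes(A,m,n,m,n);
     MatSetType(A,MATSEQAIJCUSPARSE);
     MatSeqAIJSetPreallocation(A,nz,NULL);   // or pass an nnz[] array, in which case nz is ignored
     // ... MatSetValues(), MatAssemblyBegin(), MatAssemblyEnd() ...
     MatDestroy(&A);
*/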

2352: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
2353: {
2354:   PetscErrorCode              ierr;
2355:   PetscSplitCSRDataStructure  *d_mat = NULL;

2358:   if (A->factortype == MAT_FACTOR_NONE) {
2359:     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2360:     ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
2361:     MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
2362:   } else {
2363:     MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
2364:   }
2365:   if (d_mat) {
2366:     Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
2367:     cudaError_t                err;
2368:     PetscSplitCSRDataStructure h_mat;
2369:     PetscInfo(A,"Have device matrix\n");
2370:     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2371:     if (h_mat.seq) {
2372:       if (a->compressedrow.use) {
2373:          err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
2374:       }
2375:       err = cudaFree(d_mat);CHKERRCUDA(err);
2376:     }
2377:   }
2378:   PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
2379:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
2380:   PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
2381:   PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
2382:   MatDestroy_SeqAIJ(A);
2383:   return(0);
2384: }

2386: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
2387: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
2388: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
2389: {

2393:   MatDuplicate_SeqAIJ(A,cpvalues,B);
2394:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
2395:   return(0);
2396: }

2398: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
2399: {

2403:   if (A->factortype != MAT_FACTOR_NONE) return(0);
2404:   /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
2405:      If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
2406:      Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
2407:      TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
2408:            can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
2409:   if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
2410:   /* TODO: add support for this? */
2411:   if (flg) {
2412:     A->ops->mult                      = MatMult_SeqAIJ;
2413:     A->ops->multadd                   = MatMultAdd_SeqAIJ;
2414:     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
2415:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
2416:     A->ops->multhermitiantranspose    = NULL;
2417:     A->ops->multhermitiantransposeadd = NULL;
2418:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
2419:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
2420:   } else {
2421:     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
2422:     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
2423:     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
2424:     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
2425:     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
2426:     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
2427:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
2428:     PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
2429:   }
2430:   A->boundtocpu = flg;
2431:   return(0);
2432: }
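/* Usage sketch (illustrative): callers can force the host kernels with
     MatBindToCPU(A,PETSC_TRUE);
   and return to the CUSPARSE kernels with
     MatBindToCPU(A,PETSC_FALSE);
   subject to the PETSC_OFFLOAD_GPU restriction noted above. */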

2434: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
2435: {
2436:   PetscSplitCSRDataStructure  *d_mat = NULL;
2437:   PetscErrorCode               ierr;
2439:   if (A->factortype == MAT_FACTOR_NONE) {
2440:     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
2441:     d_mat = spptr->deviceMat;
2442:   }
2443:   if (d_mat) {
2444:     Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
2445:     PetscInt     n = A->rmap->n, nnz = a->i[n];
2446:     cudaError_t  err;
2447:     PetscScalar  *vals;
2448:     PetscInfo(A,"Zero device matrix\n");
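    /* d_mat->diag.a holds a device pointer to the values array; copy that pointer value
       to the host, then zero all nnz entries directly on the device */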
2449:     err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
2450:     err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err);
2451:   }
2452:   MatZeroEntries_SeqAIJ(A);

2454:   A->offloadmask = PETSC_OFFLOAD_BOTH;

2456:   return(0);
2457: }

2459: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
2460: {
2461:   PetscErrorCode   ierr;
2462:   cusparseStatus_t stat;
2463:   Mat              B;

2466:   if (reuse == MAT_INITIAL_MATRIX) {
2467:     MatDuplicate(A,MAT_COPY_VALUES,newmat);
2468:   } else if (reuse == MAT_REUSE_MATRIX) {
2469:     MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
2470:   }
2471:   B = *newmat;

2473:   PetscFree(B->defaultvectype);
2474:   PetscStrallocpy(VECCUDA,&B->defaultvectype);

2476:   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
2477:     if (B->factortype == MAT_FACTOR_NONE) {
2478:       Mat_SeqAIJCUSPARSE *spptr;

2480:       PetscNew(&spptr);
2481:       spptr->format = MAT_CUSPARSE_CSR;
2482:       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2483:       B->spptr = spptr;
2484:       spptr->deviceMat = NULL;
2485:     } else {
2486:       Mat_SeqAIJCUSPARSETriFactors *spptr;

2488:       PetscNew(&spptr);
2489:       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2490:       B->spptr = spptr;
2491:     }
2492:     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2493:   }
2494:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
2495:   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
2496:   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
2497:   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
2498:   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
2499:   B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;

2501:   MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);
2502:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
2503:   PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);
2504:   return(0);
2505: }
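/* Usage sketch (illustrative): an existing MATSEQAIJ matrix can be promoted in place with
     MatConvert(A,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);
*/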

2507: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
2508: {

2512:   PetscCUDAInitializeCheck();
2513:   MatCreate_SeqAIJ(B);
2514:   MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
2515:   PetscObjectOptionsBegin((PetscObject)B);
2516:   MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);
2517:   PetscOptionsEnd();
2518:   return(0);
2519: }

2521: /*MC
2522:    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

2524:    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
2525:    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later; they are no longer supported as of CUDA 11.0.
2526:    All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

2528:    Options Database Keys:
2529: +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2530: .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2531: -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

2533:   Level: beginner

2535: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2536: M*/
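/* For example, a code that calls MatSetFromOptions() can select this type and its storage
   format from the command line (hypothetical executable name):

     ./ex1 -mat_type aijcusparse -mat_cusparse_storage_format csr
*/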

2538: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


2541: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
2542: {

2546:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
2547:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
2548:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
2549:   MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
2550:   return(0);
2551: }
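/* With these registrations, the CUSPARSE factorizations can be selected at run time,
   e.g. (hypothetical executable name, assuming the solver is configured from options):

     ./ex1 -pc_type ilu -pc_factor_mat_solver_type cusparse
*/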

2553: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
2554: {
2555:   PetscErrorCode   ierr;
2556:   cusparseStatus_t stat;
2557:   cusparseHandle_t handle;

2560:   if (*cusparsestruct) {
2561:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
2562:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
2563:     delete (*cusparsestruct)->workVector;
2564:     delete (*cusparsestruct)->rowoffsets_gpu;
2565:     if ((handle = (*cusparsestruct)->handle)) {stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);}
2566:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2567:     cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
2568:    #endif
2569:     PetscFree(*cusparsestruct);
2570:   }
2571:   return(0);
2572: }

2574: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
2575: {
2577:   if (*mat) {
2578:     delete (*mat)->values;
2579:     delete (*mat)->column_indices;
2580:     delete (*mat)->row_offsets;
2581:     delete *mat;
2582:     *mat = 0;
2583:   }
2584:   return(0);
2585: }

2587: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
2588: {
2589:   cusparseStatus_t stat;
2590:   PetscErrorCode   ierr;

2593:   if (*trifactor) {
2594:     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
2595:     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
2596:     CsrMatrix_Destroy(&(*trifactor)->csrMat);
2597:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2598:     cudaError_t cerr;
2599:     if ((*trifactor)->solveBuffer)   {cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
2600:     if ((*trifactor)->csr2cscBuffer) {cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
2601:    #endif
2602:     delete *trifactor;
2603:     *trifactor = 0;
2604:   }
2605:   return(0);
2606: }

2608: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
2609: {
2610:   CsrMatrix        *mat;
2611:   cusparseStatus_t stat;
2612:   cudaError_t      err;

2615:   if (*matstruct) {
2616:     if ((*matstruct)->mat) {
2617:       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
2618:        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2619:         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2620:        #else
2621:         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
2622:         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
2623:        #endif
2624:       } else {
2625:         mat = (CsrMatrix*)(*matstruct)->mat;
2626:         CsrMatrix_Destroy(&mat);
2627:       }
2628:     }
2629:     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
2630:     delete (*matstruct)->cprowIndices;
2631:     if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
2632:     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
2633:     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

2635:    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2636:     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
2637:     if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
2638:     for (int i=0; i<3; i++) {
2639:       if (mdata->cuSpMV[i].initialized) {
2640:         err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
2641:         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
2642:         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
2643:       }
2644:     }
2645:    #endif
2646:     delete *matstruct;
2647:     *matstruct = 0;
2648:   }
2649:   return(0);
2650: }

2652: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2653: {

2657:   if (*trifactors) {
2658:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
2659:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
2660:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
2661:     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
2662:     delete (*trifactors)->rpermIndices;
2663:     delete (*trifactors)->cpermIndices;
2664:     delete (*trifactors)->workVector;
2665:     (*trifactors)->rpermIndices = 0;
2666:     (*trifactors)->cpermIndices = 0;
2667:     (*trifactors)->workVector = 0;
2668:   }
2669:   return(0);
2670: }

2672: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2673: {
2674:   PetscErrorCode   ierr;
2675:   cusparseHandle_t handle;
2676:   cusparseStatus_t stat;

2679:   if (*trifactors) {
2680:     MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
2681:     if ((handle = (*trifactors)->handle)) {
2682:       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
2683:     }
2684:     PetscFree(*trifactors);
2685:   }
2686:   return(0);
2687: }