Actual source code: mpiaijcusparse.cu
petsc-3.14.0 2020-09-29
1: #define PETSC_SKIP_SPINLOCK
2: #define PETSC_SKIP_CXX_COMPLEX_FIX
3: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
5: #include <petscconf.h>
6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
7: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
8: #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
10: PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
11: {
12: Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
13: Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
14: PetscErrorCode ierr;
15: PetscInt i;
18: PetscLayoutSetUp(B->rmap);
19: PetscLayoutSetUp(B->cmap);
20: if (d_nnz) {
21: for (i=0; i<B->rmap->n; i++) {
22: if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
23: }
24: }
25: if (o_nnz) {
26: for (i=0; i<B->rmap->n; i++) {
27: if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
28: }
29: }
30: if (!B->preallocated) {
31: /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
32: MatCreate(PETSC_COMM_SELF,&b->A);
33: MatBindToCPU(b->A,B->boundtocpu);
34: MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);
35: MatSetType(b->A,MATSEQAIJCUSPARSE);
36: PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);
37: MatCreate(PETSC_COMM_SELF,&b->B);
38: MatBindToCPU(b->B,B->boundtocpu);
39: MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);
40: MatSetType(b->B,MATSEQAIJCUSPARSE);
41: PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);
42: }
43: MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);
44: MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);
45: MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);
46: MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);
47: MatCUSPARSESetHandle(b->A,cusparseStruct->handle);
48: MatCUSPARSESetHandle(b->B,cusparseStruct->handle);
49: MatCUSPARSESetStream(b->A,cusparseStruct->stream);
50: MatCUSPARSESetStream(b->B,cusparseStruct->stream);
52: B->preallocated = PETSC_TRUE;
53: return(0);
54: }
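/* A minimal usage sketch (not part of this source file; sizes and nonzero estimates are
   illustrative). The routine above is reached through the public MatMPIAIJSetPreallocation()
   interface, which is composed onto the matrix in MatConvert_MPIAIJ_MPIAIJCUSPARSE() below.

     Mat A;
     MatCreate(PETSC_COMM_WORLD,&A);
     MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,100,100);
     MatSetType(A,MATMPIAIJCUSPARSE);
     MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);   // ~5 nonzeros/row in the diagonal block, ~2 in the off-diagonal block
*/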
56: PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
57: {
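  /* y = A_d*x_local + A_o*x_ghost: start gathering the ghost entries of xx into a->lvec,
     overlap that communication with the diagonal-block SpMV, then add the off-diagonal
     block's contribution once the scatter has completed. */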
58: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
60: PetscInt nt;
63: VecGetLocalSize(xx,&nt);
64: if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
65: VecScatterInitializeForGPU(a->Mvctx,xx);
66: VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
67: (*a->A->ops->mult)(a->A,xx,yy);
68: VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
69: (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);
70: VecScatterFinalizeForGPU(a->Mvctx);
71: return(0);
72: }
74: PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
75: {
76: Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data;
80: if (A->factortype == MAT_FACTOR_NONE) {
81: Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
82: PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
83: if (d_mat) {
84: Mat_SeqAIJ *a = (Mat_SeqAIJ*)l->A->data;
85: Mat_SeqAIJ *b = (Mat_SeqAIJ*)l->B->data;
86: PetscInt n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
87: cudaError_t err;
88: PetscScalar *vals;
89: PetscInfo(A,"Zero device matrix diag and offfdiag\n");
90: err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
91: err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
92: err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
93: err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
94: }
95: }
96: MatZeroEntries(l->A);
97: MatZeroEntries(l->B);
99: return(0);
100: }
101: PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
102: {
103: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
105: PetscInt nt;
108: VecGetLocalSize(xx,&nt);
109: if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
110: VecScatterInitializeForGPU(a->Mvctx,xx);
111: VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
112: (*a->A->ops->multadd)(a->A,xx,yy,zz);
113: VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);
114: (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);
115: VecScatterFinalizeForGPU(a->Mvctx);
116: return(0);
117: }
119: PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
120: {
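  /* Transpose product: a->B^T*xx is formed in a->lvec and a->A^T*xx in yy using only local
     data; the reverse scatter then adds the off-diagonal contributions into yy on the
     owning processes. */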
121: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
123: PetscInt nt;
126: VecGetLocalSize(xx,&nt);
127: if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
128: VecScatterInitializeForGPU(a->Mvctx,a->lvec);
129: (*a->B->ops->multtranspose)(a->B,xx,a->lvec);
130: (*a->A->ops->multtranspose)(a->A,xx,yy);
131: VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
132: VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);
133: VecScatterFinalizeForGPU(a->Mvctx);
134: return(0);
135: }
137: PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
138: {
139: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
140: Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
143: switch (op) {
144: case MAT_CUSPARSE_MULT_DIAG:
145: cusparseStruct->diagGPUMatFormat = format;
146: break;
147: case MAT_CUSPARSE_MULT_OFFDIAG:
148: cusparseStruct->offdiagGPUMatFormat = format;
149: break;
150: case MAT_CUSPARSE_ALL:
151: cusparseStruct->diagGPUMatFormat = format;
152: cusparseStruct->offdiagGPUMatFormat = format;
153: break;
154: default:
155: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
156: }
157: return(0);
158: }
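/* A brief usage sketch (illustrative, not part of this source file): the routine above is
   reached through the public MatCUSPARSESetFormat() interface composed onto the matrix below.

     MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_ELL);   // diagonal block only
     MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_HYB);         // both blocks
*/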
160: PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
161: {
162: MatCUSPARSEStorageFormat format;
163: PetscErrorCode ierr;
164: PetscBool flg;
165: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
166: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
169: PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");
170: if (A->factortype==MAT_FACTOR_NONE) {
171: PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
172: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
173: if (flg) {
174: MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);
175: }
176: PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
177: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);
178: if (flg) {
179: MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);
180: }
181: PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
182: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);
183: if (flg) {
184: MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);
185: }
186: }
187: PetscOptionsTail();
188: return(0);
189: }
191: PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
192: {
193: PetscErrorCode ierr;
194: Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data;
195: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
196: PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
197: PetscInt nnz_state = A->nonzerostate;
199: if (d_mat) {
200: cudaError_t err;
201: err = cudaMemcpy( &nnz_state, &d_mat->nonzerostate, sizeof(PetscInt), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
202: }
203: MatAssemblyEnd_MPIAIJ(A,mode);
204: if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
205: VecSetType(mpiaij->lvec,VECSEQCUDA);
206: }
207: if (nnz_state > A->nonzerostate) {
208: A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
209: }
211: return(0);
212: }
214: PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
215: {
216: PetscErrorCode ierr;
217: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
218: Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
219: cudaError_t err;
220: cusparseStatus_t stat;
223: if (cusparseStruct->deviceMat) {
224: Mat_SeqAIJ *jaca = (Mat_SeqAIJ*)aij->A->data;
225: Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data;
226: PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
227: PetscInfo(A,"Have device matrix\n");
228: err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
229: if (jaca->compressedrow.use) {
230: err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
231: }
232: if (jacb->compressedrow.use) {
233: err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
234: }
235: err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
236: err = cudaFree(d_mat);CHKERRCUDA(err);
237: }
238: try {
239: if (aij->A) { MatCUSPARSEClearHandle(aij->A); }
240: if (aij->B) { MatCUSPARSEClearHandle(aij->B); }
241: stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
242: if (cusparseStruct->stream) {
243: err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
244: }
245: delete cusparseStruct;
246: } catch(char *ex) {
247: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
248: }
249: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);
250: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
251: MatDestroy_MPIAIJ(A);
252: return(0);
253: }
255: PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
256: {
257: PetscErrorCode ierr;
258: Mat_MPIAIJ *a;
259: Mat_MPIAIJCUSPARSE *cusparseStruct;
260: cusparseStatus_t stat;
261: Mat A;
264: if (reuse == MAT_INITIAL_MATRIX) {
265: MatDuplicate(B,MAT_COPY_VALUES,newmat);
266: } else if (reuse == MAT_REUSE_MATRIX) {
267: MatCopy(B,*newmat,SAME_NONZERO_PATTERN);
268: }
269: A = *newmat;
271: PetscFree(A->defaultvectype);
272: PetscStrallocpy(VECCUDA,&A->defaultvectype);
274: a = (Mat_MPIAIJ*)A->data;
275: if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
276: a->spptr = new Mat_MPIAIJCUSPARSE;
278: cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
279: cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR;
280: cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
281: cusparseStruct->stream = 0;
282: stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
283: cusparseStruct->deviceMat = NULL;
284: }
286: A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE;
287: A->ops->mult = MatMult_MPIAIJCUSPARSE;
288: A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE;
289: A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE;
290: A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
291: A->ops->destroy = MatDestroy_MPIAIJCUSPARSE;
292: A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE;
294: PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);
295: PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);
296: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);
297: return(0);
298: }
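/* A brief usage sketch (illustrative): this conversion routine is what MatConvert() dispatches
   to for the MPIAIJ -> MPIAIJCUSPARSE pair; it can also be invoked in place, as done by
   MatCreate_MPIAIJCUSPARSE() below.

     Mat C;
     MatConvert(B,MATMPIAIJCUSPARSE,MAT_INITIAL_MATRIX,&C);   // GPU-enabled copy of an assembled MPIAIJ matrix B
     MatConvert(B,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);   // or convert B itself in place
*/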
300: PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
301: {
305: PetscCUDAInitializeCheck();
306: MatCreate_MPIAIJ(A);
307: MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);
308: return(0);
309: }
311: /*@
312: MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
313: (the default parallel PETSc format). This matrix will ultimately be pushed down
314: to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
315: assembly performance the user should preallocate the matrix storage by setting
316: the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these parameters accurately,
317: performance during matrix assembly can be increased by more than a factor of 50.
319: Collective
321: Input Parameters:
322: + comm - MPI communicator
323: . m,n - number of local rows and columns (or PETSC_DECIDE)
324: . M,N - number of global rows and columns (or PETSC_DETERMINE)
325: . d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
326: . d_nnz - array containing the number of nonzeros per row in the diagonal portion of the local submatrix, or NULL
327: . o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
328: - o_nnz - array containing the number of nonzeros per row in the off-diagonal portion of the local submatrix, or NULL
329: Output Parameter:
330: . A - the matrix
332: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
333: MatXXXXSetPreallocation() paradigm instead of this routine directly.
334: [MatXXXXSetPreallocation() is, for example, MatMPIAIJSetPreallocation()]
336: Notes:
337: If d_nnz (respectively o_nnz) is given then d_nz (respectively o_nz) is ignored
339: The AIJ format (also called the Yale sparse matrix format or
340: compressed row storage) is fully compatible with standard Fortran 77
341: storage. That is, the stored row and column indices can begin at
342: either one (as in Fortran) or zero. See the users' manual for details.
344: Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
345: Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
346: allocation. For large problems you MUST preallocate memory or you
347: will get TERRIBLE performance; see the users' manual chapter on matrices.
349: By default, this format uses inodes (identical nodes) when possible, to
350: improve numerical efficiency of matrix-vector products and solves. We
351: search for consecutive rows with the same nonzero structure, thereby
352: reusing matrix information to achieve increased efficiency.
354: Level: intermediate
356: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
357: @*/
358: PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
359: {
361: PetscMPIInt size;
364: MatCreate(comm,A);
365: MatSetSizes(*A,m,n,M,N);
366: MPI_Comm_size(comm,&size);
367: if (size > 1) {
368: MatSetType(*A,MATMPIAIJCUSPARSE);
369: MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);
370: } else {
371: MatSetType(*A,MATSEQAIJCUSPARSE);
372: MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);
373: }
374: return(0);
375: }
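/* A minimal creation sketch (illustrative global size and preallocation values):

     Mat A;
     MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);
     // ... MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd(), then MatMult() etc. run on the GPU ...
     MatDestroy(&A);
*/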
377: /*MC
378: MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
380: A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
381: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
382: All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
384: This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
385: and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
386: MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
387: for communicators controlling multiple processes. It is recommended that you call both of
388: the above preallocation routines for simplicity.
390: Options Database Keys:
391: + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
392: . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
393: . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
394: - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
396: Level: beginner
398: .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
400: M*/
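/* A short sketch (illustrative) of selecting this type and its storage format at runtime via the
   options database keys listed above; the executable name is hypothetical.

     MatCreate(PETSC_COMM_WORLD,&A);
     MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,100,100);
     MatSetFromOptions(A);   // honors -mat_type and the -mat_cusparse_* options
     // run as, e.g.:  ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb
*/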
402: // get GPU pointer to the stripped-down Mat; works for both Seq and MPI Mat.
403: PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
404: {
405: PetscSplitCSRDataStructure **p_d_mat;
406: PetscMPIInt size,rank;
407: MPI_Comm comm;
408: PetscErrorCode ierr;
409: int *ai,*bi,*aj,*bj;
410: PetscScalar *aa,*ba;
413: PetscObjectGetComm((PetscObject)A,&comm);
414: MPI_Comm_size(comm,&size);
415: MPI_Comm_rank(comm,&rank);
416: if (A->factortype == MAT_FACTOR_NONE) {
417: CsrMatrix *matrixA,*matrixB=NULL;
418: if (size == 1) {
419: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
420: p_d_mat = &cusparsestruct->deviceMat;
421: Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
422: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
423: matrixA = (CsrMatrix*)matstruct->mat;
424: bi = bj = NULL; ba = NULL;
425: } else {
426: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
427: }
428: } else {
429: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
430: Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
431: p_d_mat = &spptr->deviceMat;
432: Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
433: Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
434: Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
435: Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
436: if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
437: if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
438: matrixA = (CsrMatrix*)matstructA->mat;
439: matrixB = (CsrMatrix*)matstructB->mat;
440: bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
441: bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
442: ba = thrust::raw_pointer_cast(matrixB->values->data());
443: if (rank==-1) {
444: for(unsigned int i = 0; i < matrixB->row_offsets->size(); i++)
445: std::cout << "\trow_offsets[" << i << "] = " << (*matrixB->row_offsets)[i] << std::endl;
446: for(unsigned int i = 0; i < matrixB->column_indices->size(); i++)
447: std::cout << "\tcolumn_indices[" << i << "] = " << (*matrixB->column_indices)[i] << std::endl;
448: for(unsigned int i = 0; i < matrixB->values->size(); i++)
449: std::cout << "\tvalues[" << i << "] = " << (*matrixB->values)[i] << std::endl;
450: }
451: } else {
452: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
453: }
454: }
455: ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
456: aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
457: aa = thrust::raw_pointer_cast(matrixA->values->data());
458: } else {
459: *B = NULL;
460: return(0);
461: }
462: // do the bookkeeping MatSetValues would do, since it is not called on the host
463: if (A->assembled) {
464: if (A->was_assembled) {
465: PetscInfo(A,"Assemble more than once already\n");
466: }
467: A->was_assembled = PETSC_TRUE; // this is done (lazily) in MatAssemblyBegin/End, which we no longer call; it is also done in the AIJ AssemblyEnd -- still needed here?
468: } else {
469: SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
470: }
471: if (!*p_d_mat) {
472: cudaError_t err;
473: PetscSplitCSRDataStructure *d_mat, h_mat;
474: Mat_SeqAIJ *jaca;
475: PetscInt n = A->rmap->n, nnz;
476: // create and copy
477: PetscInfo(A,"Create device matrix\n");
478: err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
479: err = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
480: *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
481: if (size == 1) {
482: jaca = (Mat_SeqAIJ*)A->data;
483: h_mat.nonzerostate = A->nonzerostate;
484: h_mat.rstart = 0; h_mat.rend = A->rmap->n;
485: h_mat.cstart = 0; h_mat.cend = A->cmap->n;
486: h_mat.offdiag.i = h_mat.offdiag.j = NULL;
487: h_mat.offdiag.a = NULL;
488: h_mat.seq = PETSC_TRUE;
489: } else {
490: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
491: Mat_SeqAIJ *jacb;
492: h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE
493: jaca = (Mat_SeqAIJ*)aij->A->data;
494: jacb = (Mat_SeqAIJ*)aij->B->data;
495: h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state?
496: if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
497: if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
498: // create colmap - this is usually done (lazily) in MatSetValues
499: aij->donotstash = PETSC_TRUE;
500: aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
501: jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
502: #if defined(PETSC_USE_CTABLE)
503: SETERRQ(comm,PETSC_ERR_SUP,"Devioce metadata does not support ctable (--with-ctable=0)");
504: #else
505: PetscCalloc1(A->cmap->N+1,&aij->colmap);
506: aij->colmap[A->cmap->N] = -9;
507: PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));
508: {
509: PetscInt ii;
510: for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
511: }
512: if(aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
513: #endif
514: // allocate B copy data
515: h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
516: h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
517: nnz = jacb->i[n];
519: if (jacb->compressedrow.use) {
520: err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
521: err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
522: } else {
523: h_mat.offdiag.i = bi;
524: }
525: h_mat.offdiag.j = bj;
526: h_mat.offdiag.a = ba;
528: err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
529: err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
530: h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
531: h_mat.offdiag.n = n;
532: }
533: // allocate A copy data
534: nnz = jaca->i[n];
535: h_mat.diag.n = n;
536: h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
537: MPI_Comm_rank(comm,&h_mat.rank);
538: if (jaca->compressedrow.use) {
539: err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
540: err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
541: } else {
542: h_mat.diag.i = ai;
543: }
544: h_mat.diag.j = aj;
545: h_mat.diag.a = aa;
546: // copy pointers and metadata to device
547: err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
548: PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);
549: } else {
550: *B = *p_d_mat;
551: }
552: A->assembled = PETSC_FALSE; // ready to write with MatSetValues; this is done (lazily) in normal MatSetValues
553: return(0);
554: }
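/* A hedged sketch (hypothetical kernel, not part of PETSc) of consuming the device pointer
   returned above: each thread rescales one row of the diagonal block, touching only fields this
   routine fills in (diag.n, diag.i, diag.a). Launch parameters are illustrative.

     __global__ void ScaleDiagBlock(PetscSplitCSRDataStructure *d_mat, PetscScalar alpha)
     {
       int row = blockIdx.x*blockDim.x + threadIdx.x;
       if (row < d_mat->diag.n) {
         for (int k = d_mat->diag.i[row]; k < d_mat->diag.i[row+1]; k++) d_mat->diag.a[k] *= alpha;
       }
     }

     // host side, after MatAssemblyEnd():
     PetscSplitCSRDataStructure *d_mat;
     PetscInt                   nloc;
     MatCUSPARSEGetDeviceMatWrite(A,&d_mat);
     MatGetLocalSize(A,&nloc,NULL);
     ScaleDiagBlock<<<(nloc+255)/256,256>>>(d_mat,(PetscScalar)2.0);
     // the routine marks A unassembled, so call MatAssemblyBegin/End(A,MAT_FINAL_ASSEMBLY) again before using A
*/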