Actual source code: landaucu.cu
petsc-3.14.0 2020-09-29
1: /*
2: Implements the Landau kernel
3: */
4: #include <petscconf.h>
5: #include <petsc/private/dmpleximpl.h>
6: #include <petsclandau.h>
7: #include <../src/mat/impls/aij/seq/aij.h>
8: #include <petsc/private/kernels/petscaxpy.h>
10: #define PETSC_THREAD_SYNC __syncthreads()
11: #define PETSC_DEVICE_FUNC_DECL __device__
12: #include "../land_kernel.h"
14: // Macro to catch CUDA errors in CUDA runtime calls
15: #define CUDA_SAFE_CALL(call) \
16: do { \
17: cudaError_t err = call; \
18: if (cudaSuccess != err) { \
19: fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
20: __FILE__, __LINE__, cudaGetErrorString(err) ); \
21: exit(EXIT_FAILURE); \
22: } \
23: } while (0)
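/* Typical use: CUDA_SAFE_CALL(cudaMalloc(...)); on failure the macro prints the file, line,
   and CUDA error string to stderr and aborts the process. */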
24: // Macro to catch CUDA errors in kernel launches
25: #define CHECK_LAUNCH_ERROR() \
26: do { \
27: /* Check synchronous errors, i.e. pre-launch */ \
28: cudaError_t err = cudaGetLastError(); \
29: if (cudaSuccess != err) { \
30: fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
31: __FILE__, __LINE__, cudaGetErrorString(err) ); \
32: exit(EXIT_FAILURE); \
33: } \
34: /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
35: err = cudaDeviceSynchronize(); \
36: if (cudaSuccess != err) { \
37: fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
38: __FILE__, __LINE__, cudaGetErrorString( err) ); \
39: exit(EXIT_FAILURE); \
40: } \
41: } while (0)
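/* CHECK_LAUNCH_ERROR() is intended to follow a <<<...>>> kernel launch: cudaGetLastError()
   catches synchronous launch/configuration errors, and the cudaDeviceSynchronize() that
   follows surfaces asynchronous failures from the kernel itself. */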
43: #define LANDAU_USE_SHARED_GPU_MEM
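/* With LANDAU_USE_SHARED_GPU_MEM defined, the per-quadrature-point g2/g3 intermediates live
   in dynamically sized shared memory (the third <<<...>>> launch argument below); without it
   they are kept in a global-memory scratch buffer (d_g2g3) sized for all elements and passed
   to the kernel as the extra g2arr/g3arr arguments. */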
44: //
45: // The GPU Landau kernel
46: //
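/* One thread block per element (blockIdx.x) with blockDim = (Nq quadrature points) x
   (num_sub_blocks).  BB and DD are the tabulated basis values and derivatives, IPDataGlobal
   the field data at the integration points, wiGlobal the quadrature weights, and invJj the
   inverse Jacobians; elemMats_out receives one dense totDim x totDim matrix per element. */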
47: __global__
48: void landau_kernel(const PetscInt nip, const PetscInt dim, const PetscInt totDim, const PetscInt Nf, const PetscInt Nb, const PetscReal invJj[],
49: const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[],
50: const PetscReal * const BB, const PetscReal * const DD, const PetscReal * const IPDataGlobal, const PetscReal wiGlobal[],
51: #if !defined(LANDAU_USE_SHARED_GPU_MEM)
52: PetscReal *g2arr, PetscReal *g3arr,
53: #endif
54: PetscBool quarter3DDomain, PetscScalar elemMats_out[])
55: {
56: const PetscInt Nq = blockDim.x, myelem = blockIdx.x;
57: #if defined(LANDAU_USE_SHARED_GPU_MEM)
58: extern __shared__ PetscReal g2_g3_qi[]; // Nq * { [NSubBlocks][Nf][dim] ; [NSubBlocks][Nf][dim][dim] }
59: PetscReal (*g2)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM] = (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM]) &g2_g3_qi[0];
60: PetscReal (*g3)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM][LANDAU_DIM] = (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM][LANDAU_DIM]) &g2_g3_qi[LANDAU_MAX_SUB_THREAD_BLOCKS*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM];
61: #else
62: PetscReal (*g2)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM] = (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM]) &g2arr[myelem*LANDAU_MAX_SUB_THREAD_BLOCKS*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM ];
63: PetscReal (*g3)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM][LANDAU_DIM] = (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_SUB_THREAD_BLOCKS][LANDAU_MAX_SPECIES][LANDAU_DIM][LANDAU_DIM]) &g3arr[myelem*LANDAU_MAX_SUB_THREAD_BLOCKS*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM*LANDAU_DIM];
64: #endif
65: const PetscInt myQi = threadIdx.x, mySubBlk = threadIdx.y, nSubBlks = blockDim.y;
66: const PetscInt jpidx = myQi + myelem * Nq;
67: const PetscInt subblocksz = nip/nSubBlks + !!(nip%nSubBlks), ip_start = mySubBlk*subblocksz, ip_end = (mySubBlk+1)*subblocksz > nip ? nip : (mySubBlk+1)*subblocksz; /* this could be wrong with very few global IPs */
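/* Each (myQi, mySubBlk) thread handles quadrature point myQi of this element and a contiguous
   slice [ip_start, ip_end) of the nip global integration points; subblocksz is
   ceil(nip/nSubBlks), so the last sub-block may get a short (or empty) slice. */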
68: PetscScalar *elemMat = &elemMats_out[myelem*totDim*totDim]; /* my output */
70: if (threadIdx.x==0 && threadIdx.y==0) {
71: memset(elemMat, 0, totDim*totDim*sizeof(PetscScalar));
72: }
73: __syncthreads();
74: landau_inner_integral(myQi, Nq, mySubBlk, nSubBlks, ip_start, ip_end, 1, jpidx, Nf, dim, IPDataGlobal, wiGlobal, &invJj[jpidx*dim*dim], nu_alpha, nu_beta, invMass, Eq_m, quarter3DDomain, Nq, Nb, 0, Nq, BB, DD, elemMat, *g2, *g3, myelem); /* compact */
75: // landau_inner_integral(myQi, Nq, mySubBlk, nSubBlks, mySubBlk, nip, nSubBlks, jpidx, Nf, dim, IPDataGlobal, wiGlobal, &invJj[jpidx*dim*dim], nu_alpha, nu_beta, invMass, Eq_m, quarter3DDomain, Nq, Nb, 0, Nq, BB, DD, elemMat, *g2, *g3, myelem); /* spread */
76: }
77: static PetscErrorCode LandauAssembleCuda(PetscInt cStart, PetscInt cEnd, PetscInt totDim, DM plex, PetscSection section, PetscSection globalSection, Mat JacP, PetscScalar elemMats[], PetscContainer container, const PetscLogEvent events[]);
78: __global__ void assemble_kernel(const PetscInt nidx_arr[], PetscInt *idx_arr[], PetscScalar *el_mats[], const ISColoringValue colors[], Mat_SeqAIJ mats[]);
79: PetscErrorCode LandauCUDAJacobian( DM plex, const PetscInt Nq, const PetscReal nu_alpha[],const PetscReal nu_beta[],
80: const PetscReal invMass[], const PetscReal Eq_m[], const PetscReal * const IPDataGlobal,
81: const PetscReal wiGlobal[], const PetscReal invJj[], const PetscInt num_sub_blocks, const PetscLogEvent events[], PetscBool quarter3DDomain,
82: Mat JacP)
83: {
84: PetscErrorCode ierr;
85: PetscInt ii,ej,*Nbf,Nb,nip_dim2,cStart,cEnd,Nf,dim,numGCells,totDim,nip,szf=sizeof(PetscReal);
86: PetscReal *d_BB,*d_DD,*d_invJj,*d_wiGlobal,*d_nu_alpha,*d_nu_beta,*d_invMass,*d_Eq_m;
87: PetscScalar *elemMats,*d_elemMats;
88: PetscLogDouble flops;
89: PetscTabulation *Tf;
90: PetscDS prob;
91: PetscSection section, globalSection;
92: PetscReal *d_IPDataGlobal;
93: PetscBool cuda_assemble = PETSC_FALSE;
96: PetscLogEventBegin(events[3],0,0,0,0);
97: DMGetDimension(plex, &dim);
98: if (dim!=LANDAU_DIM) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "LANDAU_DIM != dim");
99: DMPlexGetHeightStratum(plex,0,&cStart,&cEnd);
100: numGCells = cEnd - cStart;
101: nip = numGCells*Nq; /* length of inner global iteration */
102: DMGetDS(plex, &prob);
103: PetscDSGetNumFields(prob, &Nf);
104: PetscDSGetDimensions(prob, &Nbf); Nb = Nbf[0];
105: if (Nq != Nb) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Nq != Nb. %D %D",Nq,Nb);
106: PetscDSGetTotalDimension(prob, &totDim);
107: PetscDSGetTabulation(prob, &Tf);
108: DMGetLocalSection(plex, &section);
109: DMGetGlobalSection(plex, &globalSection);
110: // create data
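/* IPDataGlobal packs nip*(dim + Nf*(dim+1)) reals: for each integration point, its dim
   coordinates plus (dim+1) reals per species field (presumably the value and its gradient),
   matching the allocation and copy sizes below. */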
111: CUDA_SAFE_CALL(cudaMalloc((void **)&d_IPDataGlobal, nip*(dim + Nf*(dim+1))*szf )); // kernel input
112: CUDA_SAFE_CALL(cudaMalloc((void **)&d_nu_alpha, Nf*szf)); // kernel input
113: CUDA_SAFE_CALL(cudaMalloc((void **)&d_nu_beta, Nf*szf)); // kernel input
114: CUDA_SAFE_CALL(cudaMalloc((void **)&d_invMass, Nf*szf)); // kernel input
115: CUDA_SAFE_CALL(cudaMalloc((void **)&d_Eq_m, Nf*szf)); // kernel input
116: CUDA_SAFE_CALL(cudaMemcpy(d_IPDataGlobal, IPDataGlobal, nip*(dim + Nf*(dim+1))*szf, cudaMemcpyHostToDevice));
117: CUDA_SAFE_CALL(cudaMemcpy(d_nu_alpha, nu_alpha, Nf*szf, cudaMemcpyHostToDevice));
118: CUDA_SAFE_CALL(cudaMemcpy(d_nu_beta, nu_beta, Nf*szf, cudaMemcpyHostToDevice));
119: CUDA_SAFE_CALL(cudaMemcpy(d_invMass, invMass, Nf*szf, cudaMemcpyHostToDevice));
120: CUDA_SAFE_CALL(cudaMemcpy(d_Eq_m, Eq_m, Nf*szf, cudaMemcpyHostToDevice));
121: CUDA_SAFE_CALL(cudaMalloc((void **)&d_BB, Nq*Nb*szf)); // kernel input
122: CUDA_SAFE_CALL(cudaMemcpy( d_BB, Tf[0]->T[0], Nq*Nb*szf, cudaMemcpyHostToDevice));
123: CUDA_SAFE_CALL(cudaMalloc((void **)&d_DD, Nq*Nb*dim*szf)); // kernel input
124: CUDA_SAFE_CALL(cudaMemcpy( d_DD, Tf[0]->T[1], Nq*Nb*dim*szf, cudaMemcpyHostToDevice));
125: CUDA_SAFE_CALL(cudaMalloc((void **)&d_wiGlobal, Nq*numGCells*szf)); // kernel input
126: CUDA_SAFE_CALL(cudaMemcpy( d_wiGlobal, wiGlobal, Nq*numGCells*szf, cudaMemcpyHostToDevice));
127: // collect geometry
128: flops = (PetscLogDouble)numGCells*(PetscLogDouble)Nq*(PetscLogDouble)(5.*dim*dim*Nf*Nf + 165.);
129: nip_dim2 = Nq*numGCells*dim*dim;
130: CUDA_SAFE_CALL(cudaMalloc((void **)&d_invJj, nip_dim2*szf)); // kernel input
131: CUDA_SAFE_CALL(cudaMemcpy(d_invJj, invJj, nip_dim2*szf, cudaMemcpyHostToDevice));
132: PetscLogEventEnd(events[3],0,0,0,0);
134: PetscLogEventBegin(events[4],0,0,0,0);
135: PetscLogGpuFlops(flops*nip);
136: {
137: dim3 dimBlock(Nq,num_sub_blocks);
138: CUDA_SAFE_CALL(cudaMalloc((void **)&d_elemMats, totDim*totDim*numGCells*sizeof(PetscScalar))); // kernel output
139: ii = LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM*(1+LANDAU_DIM)*LANDAU_MAX_SUB_THREAD_BLOCKS;
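/* ii is the number of PetscReal entries in the g2 (LANDAU_DIM) and g3 (LANDAU_DIM^2) scratch
   arrays for all quadrature points, species, and sub-thread blocks of one element; it is used
   as the dynamic shared-memory size (ii*szf bytes) below, or scaled by numGCells for the
   global-memory buffer in the #else branch. */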
140: #if defined(LANDAU_USE_SHARED_GPU_MEM)
141: // PetscPrintf(PETSC_COMM_SELF,"Call land_kernel with %D kB shared memory\n",ii*8/1024);
142: landau_kernel<<<numGCells,dimBlock,ii*szf>>>( nip,dim,totDim,Nf,Nb,d_invJj,d_nu_alpha,d_nu_beta,d_invMass,d_Eq_m,
143: d_BB, d_DD, d_IPDataGlobal, d_wiGlobal, quarter3DDomain, d_elemMats);
144: CHECK_LAUNCH_ERROR();
145: #else
146: PetscReal *d_g2g3;
147: CUDA_SAFE_CALL(cudaMalloc((void **)&d_g2g3, ii*szf*numGCells)); // kernel input
148: PetscReal *g2 = &d_g2g3[0];
149: PetscReal *g3 = &d_g2g3[LANDAU_MAX_SUB_THREAD_BLOCKS*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM*numGCells];
150: landau_kernel<<<numGCells,dimBlock>>>(nip,dim,totDim,Nf,Nb,d_invJj,d_nu_alpha,d_nu_beta,d_invMass,d_Eq_m,
151: d_BB, d_DD, d_IPDataGlobal, d_wiGlobal, g2, g3, quarter3DDomain, d_elemMats);
152: CHECK_LAUNCH_ERROR();
153: CUDA_SAFE_CALL (cudaDeviceSynchronize());
154: CUDA_SAFE_CALL(cudaFree(d_g2g3));
155: #endif
156: }
157: PetscLogEventEnd(events[4],0,0,0,0);
158: // delete device data
159: PetscLogEventBegin(events[5],0,0,0,0);
160: CUDA_SAFE_CALL(cudaFree(d_IPDataGlobal));
161: CUDA_SAFE_CALL(cudaFree(d_invJj));
162: CUDA_SAFE_CALL(cudaFree(d_wiGlobal));
163: CUDA_SAFE_CALL(cudaFree(d_nu_alpha));
164: CUDA_SAFE_CALL(cudaFree(d_nu_beta));
165: CUDA_SAFE_CALL(cudaFree(d_invMass));
166: CUDA_SAFE_CALL(cudaFree(d_Eq_m));
167: CUDA_SAFE_CALL(cudaFree(d_BB));
168: CUDA_SAFE_CALL(cudaFree(d_DD));
169: PetscMalloc1(totDim*totDim*numGCells,&elemMats);
170: CUDA_SAFE_CALL(cudaMemcpy(elemMats, d_elemMats, totDim*totDim*numGCells*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
171: CUDA_SAFE_CALL(cudaFree(d_elemMats));
172: PetscLogEventEnd(events[5],0,0,0,0);
174: PetscLogEventBegin(events[6],0,0,0,0);
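/* Two assembly paths: the default (cuda_assemble == PETSC_FALSE) assembles on the host with
   DMPlexMatSetClosure per element; the alternative assembles on the GPU using an element
   coloring (LandauAssembleCuda below). */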
175: if (!cuda_assemble) {
176: PetscScalar *elMat;
177: for (ej = cStart, elMat = elemMats ; ej < cEnd; ++ej, elMat += totDim*totDim) {
178: DMPlexMatSetClosure(plex, section, globalSection, JacP, ej, elMat, ADD_VALUES);
179: if (ej==-1) {
180: int d,f;
181: printf("GPU Element matrix\n");
182: for (d = 0; d < totDim; ++d){
183: for (f = 0; f < totDim; ++f) printf(" %17.10e", PetscRealPart(elMat[d*totDim + f]));
184: printf("\n");
185: }
186: exit(12);
187: }
188: }
189: } else {
190: PetscContainer container = NULL;
191: PetscObjectQuery((PetscObject)JacP,"coloring",(PetscObject*)&container);
192: if (!container) {
193: PetscLogEventBegin(events[8],0,0,0,0);
194: LandauCreateColoring(JacP, plex, &container);
195: PetscLogEventEnd(events[8],0,0,0,0);
196: }
197: LandauAssembleCuda(cStart, cEnd, totDim, plex, section, globalSection, JacP, elemMats, container, events);
198: }
199: PetscFree(elemMats);
200: PetscLogEventEnd(events[6],0,0,0,0);
202: return(0);
203: }
205: __global__
206: void assemble_kernel(const PetscInt nidx_arr[], PetscInt *idx_arr[], PetscScalar *el_mats[], const ISColoringValue colors[], Mat_SeqAIJ mats[])
207: {
208: const PetscInt myelem = (gridDim.x==1) ? threadIdx.x : blockIdx.x;
209: Mat_SeqAIJ a = mats[colors[myelem]]; /* copy to GPU */
210: const PetscScalar *v = el_mats[myelem];
211: const PetscInt *in = idx_arr[myelem], *im = idx_arr[myelem], n = nidx_arr[myelem], m = nidx_arr[myelem];
212: /* mat set values */
213: PetscInt *rp,k,low,high,t,row,nrow,i,col,l;
214: PetscInt *ai = a.i,*ailen = a.ilen;
215: PetscInt *aj = a.j,lastcol = -1;
216: MatScalar *ap=NULL,value=0.0,*aa = a.a;
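/* The loop below mirrors the host MatSetValues_SeqAIJ search: for each (row, col) entry of
   the element matrix it locates the column in the preallocated CSR row (binary search
   narrowed to a short linear scan) and adds the value; a column that is not preallocated is
   reported as an error rather than inserted. */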
217: for (k=0; k<m; k++) { /* loop over added rows */
218: row = im[k];
219: if (row < 0) continue;
220: rp = aj + ai[row];
221: ap = aa + ai[row];
222: nrow = ailen[row];
223: low = 0;
224: high = nrow;
225: for (l=0; l<n; l++) { /* loop over added columns */
226: /* if (in[l] < 0) { */
227: /* printf("\t\tin[l] < 0 ?????\n"); */
228: /* continue; */
229: /* } */
230: while (l<n && (value = v[l + k*n], PetscAbsScalar(value)==0.0)) l++;
231: if (l==n) break;
232: col = in[l];
233: if (col <= lastcol) low = 0;
234: else high = nrow;
235: lastcol = col;
236: while (high-low > 5) {
237: t = (low+high)/2;
238: if (rp[t] > col) high = t;
239: else low = t;
240: }
241: for (i=low; i<high; i++) {
242: // if (rp[i] > col) break;
243: if (rp[i] == col) {
244: ap[i] += value;
245: low = i + 1;
246: goto noinsert;
247: }
248: }
249: printf("\t\t\t ERROR in assemble_kernel\n");
250: noinsert:;
251: }
252: }
253: }
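/* GPU assembly driver.  The elements are grouped by an ISColoring, presumably so that
   elements of the same color share no matrix rows; each color gets its own device copy of
   the CSR value array, assemble_kernel runs one block per element, and the per-color value
   arrays are summed back into JacP on the host afterwards. */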
255: static PetscErrorCode LandauAssembleCuda(PetscInt cStart, PetscInt cEnd, PetscInt totDim, DM plex, PetscSection section, PetscSection globalSection, Mat JacP, PetscScalar elemMats[], PetscContainer container, const PetscLogEvent events[])
256: {
257: PetscErrorCode ierr;
258: #define LANDAU_MAX_COLORS 16
259: #define LANDAU_MAX_ELEMS 512
260: Mat_SeqAIJ h_mats[LANDAU_MAX_COLORS], *jaca = (Mat_SeqAIJ *)JacP->data, *d_mats;
261: const PetscInt nelems = cEnd - cStart, nnz = jaca->i[JacP->rmap->n], N = JacP->rmap->n; /* serial */
262: const ISColoringValue *colors;
263: ISColoringValue *d_colors,colour;
264: PetscInt *h_idx_arr[LANDAU_MAX_ELEMS], h_nidx_arr[LANDAU_MAX_ELEMS], *d_nidx_arr, **d_idx_arr,nc,ej,j,cell;
265: PetscScalar *h_new_el_mats[LANDAU_MAX_ELEMS], *val_buf, **d_new_el_mats;
266: ISColoring iscoloring;
267: PetscContainerGetPointer(container,(void**)&iscoloring);
268: /* get colors */
269: ISColoringGetColors(iscoloring, &j, &nc, &colors);
270: if (nelems>LANDAU_MAX_ELEMS) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "too many elements. %D > %D",nelems,LANDAU_MAX_ELEMS);
271: if (nc>LANDAU_MAX_COLORS) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "too many colors. %D > %D",nc,LANDAU_MAX_COLORS);
272: /* colors for kernel */
273: CUDA_SAFE_CALL(cudaMalloc((void **)&d_colors, nelems*sizeof(ISColoringValue))); // kernel input
274: CUDA_SAFE_CALL(cudaMemcpy( d_colors, colors, nelems*sizeof(ISColoringValue), cudaMemcpyHostToDevice));
275: /* get indices and element matrices */
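/* For each cell, DMPlexGetClosureIndices returns the global dof indices of its closure and
   may hand back a transformed copy of the element matrix; both are copied to the device.
   The valuesOrig check releases the work array when such a copy was made. */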
276: for (cell = cStart, ej = 0 ; cell < cEnd; ++cell, ++ej) {
277: PetscInt numindices,*indices;
278: PetscScalar *elMat = &elemMats[ej*totDim*totDim];
279: PetscScalar *valuesOrig = elMat;
280: DMPlexGetClosureIndices(plex, section, globalSection, cell, PETSC_TRUE, &numindices, &indices, NULL, (PetscScalar **) &elMat);
281: h_nidx_arr[ej] = numindices;
282: CUDA_SAFE_CALL(cudaMalloc((void **)&h_idx_arr[ej], numindices*sizeof(PetscInt))); // kernel input
283: CUDA_SAFE_CALL(cudaMemcpy( h_idx_arr[ej], indices, numindices*sizeof(PetscInt), cudaMemcpyHostToDevice));
284: CUDA_SAFE_CALL(cudaMalloc((void **)&h_new_el_mats[ej], numindices*numindices*sizeof(PetscScalar))); // kernel input
285: CUDA_SAFE_CALL(cudaMemcpy( h_new_el_mats[ej], elMat, numindices*numindices*sizeof(PetscScalar), cudaMemcpyHostToDevice));
286: DMPlexRestoreClosureIndices(plex, section, globalSection, cell, PETSC_TRUE, &numindices, &indices, NULL, (PetscScalar **) &elMat);
287: if (elMat != valuesOrig) {DMRestoreWorkArray(plex, numindices*numindices, MPIU_SCALAR, &elMat);}
288: }
289: CUDA_SAFE_CALL(cudaMalloc((void **)&d_nidx_arr, nelems*sizeof(PetscInt))); // kernel input
290: CUDA_SAFE_CALL(cudaMemcpy( d_nidx_arr, h_nidx_arr, nelems*sizeof(PetscInt), cudaMemcpyHostToDevice));
291: CUDA_SAFE_CALL(cudaMalloc((void **)&d_idx_arr, nelems*sizeof(PetscInt*))); // kernel input
292: CUDA_SAFE_CALL(cudaMemcpy( d_idx_arr, h_idx_arr, nelems*sizeof(PetscInt*), cudaMemcpyHostToDevice));
293: CUDA_SAFE_CALL(cudaMalloc((void **)&d_new_el_mats, nelems*sizeof(PetscScalar*))); // kernel input
294: CUDA_SAFE_CALL(cudaMemcpy( d_new_el_mats, h_new_el_mats,nelems*sizeof(PetscScalar*), cudaMemcpyHostToDevice));
295: /* make matrix buffers */
296: for (colour=0; colour<nc; colour++) {
297: Mat_SeqAIJ *a = &h_mats[colour];
298: /* create on GPU and copy to GPU */
299: CUDA_SAFE_CALL(cudaMalloc((void **)&a->i, (N+1)*sizeof(PetscInt))); // kernel input
300: CUDA_SAFE_CALL(cudaMemcpy( a->i, jaca->i, (N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice));
301: CUDA_SAFE_CALL(cudaMalloc((void **)&a->ilen, (N)*sizeof(PetscInt))); // kernel input
302: CUDA_SAFE_CALL(cudaMemcpy( a->ilen, jaca->ilen,(N)*sizeof(PetscInt), cudaMemcpyHostToDevice));
303: CUDA_SAFE_CALL(cudaMalloc((void **)&a->j, (nnz)*sizeof(PetscInt))); // kernel input
304: CUDA_SAFE_CALL(cudaMemcpy( a->j, jaca->j, (nnz)*sizeof(PetscInt), cudaMemcpyHostToDevice));
305: CUDA_SAFE_CALL(cudaMalloc((void **)&a->a, (nnz)*sizeof(PetscScalar))); // kernel output
306: CUDA_SAFE_CALL(cudaMemset( a->a, 0, (nnz)*sizeof(PetscScalar)));
307: }
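/* d_mats is an array of nc shallow Mat_SeqAIJ structs whose i/ilen/j/a members point at the
   device buffers created above; assemble_kernel indexes it by each element's color. */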
308: CUDA_SAFE_CALL(cudaMalloc(&d_mats, nc*sizeof(Mat_SeqAIJ))); // kernel input
309: CUDA_SAFE_CALL(cudaMemcpy( d_mats, h_mats, nc*sizeof(Mat_SeqAIJ), cudaMemcpyHostToDevice));
310: /* do it */
311: assemble_kernel<<<nelems,1>>>(d_nidx_arr, d_idx_arr, d_new_el_mats, d_colors, d_mats);
312: CHECK_LAUNCH_ERROR();
313: /* cleanup */
314: CUDA_SAFE_CALL(cudaFree(d_colors));
315: CUDA_SAFE_CALL(cudaFree(d_nidx_arr));
316: for (ej = 0 ; ej < nelems; ++ej) {
317: CUDA_SAFE_CALL(cudaFree(h_idx_arr[ej]));
318: CUDA_SAFE_CALL(cudaFree(h_new_el_mats[ej]));
319: }
320: CUDA_SAFE_CALL(cudaFree(d_idx_arr));
321: CUDA_SAFE_CALL(cudaFree(d_new_el_mats));
322: /* copy & add Mat data back to CPU to JacP */
324: PetscLogEventBegin(events[2],0,0,0,0);
325: PetscMalloc1(nnz,&val_buf);
326: PetscMemzero(jaca->a,nnz*sizeof(PetscScalar));
327: for (colour=0; colour<nc; colour++) {
328: Mat_SeqAIJ *a = &h_mats[colour];
329: CUDA_SAFE_CALL(cudaMemcpy(val_buf, a->a, (nnz)*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
330: PetscKernelAXPY(jaca->a,1.0,val_buf,nnz);
331: }
332: PetscFree(val_buf);
333: PetscLogEventEnd(events[2],0,0,0,0);
335: for (colour=0; colour<nc; colour++) {
336: Mat_SeqAIJ *a = &h_mats[colour];
337: /* destroy mat */
338: CUDA_SAFE_CALL(cudaFree(a->i));
339: CUDA_SAFE_CALL(cudaFree(a->ilen));
340: CUDA_SAFE_CALL(cudaFree(a->j));
341: CUDA_SAFE_CALL(cudaFree(a->a));
342: }
343: CUDA_SAFE_CALL(cudaFree(d_mats));
344: return(0);
345: }