Actual source code: vecscattercusp.cu

petsc-3.7.3 2016-07-24
/*
   Implements the various scatter operations on CUSP vectors
*/

#define PETSC_SKIP_COMPLEX
#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <petsc/private/vecimpl.h>          /*I "petscvec.h" I*/
#include <../src/vec/vec/impls/dvecimpl.h>
#include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>

#include <cuda_runtime.h>

PetscErrorCode VecScatterCUSPIndicesCreate_StoS(PetscInt n,PetscInt toFirst,PetscInt fromFirst,PetscInt toStep,PetscInt fromStep,PetscInt *tslots,PetscInt *fslots,PetscCUSPIndices *ci)
{
  PetscCUSPIndices           cci;
  VecScatterCUSPIndices_StoS stos_scatter;
  cudaError_t                err = cudaSuccess;
  cudaStream_t               stream;
  PetscInt                   *intVecGPU;
  int                        device;
  cudaDeviceProp             props;

  cci = new struct _p_PetscCUSPIndices;
  stos_scatter = new struct _p_VecScatterCUSPIndices_StoS;

  /* create the "from" indices */
  stos_scatter->fslots = 0;
  stos_scatter->fromFirst = 0;
  stos_scatter->fromStep = 0;
  if (n) {
    if (fslots) {
      /* allocate GPU memory for the from-slots */
      err = cudaMalloc((void**)&intVecGPU,n*sizeof(PetscInt));CHKERRCUSP((int)err);
      err = cudaMemcpy(intVecGPU,fslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUSP((int)err);

      /* assign the pointer to the struct */
      stos_scatter->fslots = intVecGPU;
      stos_scatter->fromMode = VEC_SCATTER_CUSP_GENERAL;
    } else if (fromStep) {
      stos_scatter->fromFirst = fromFirst;
      stos_scatter->fromStep = fromStep;
      stos_scatter->fromMode = VEC_SCATTER_CUSP_STRIDED;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide fslots or fromStep.");
  }

  /* create the "to" indices */
  stos_scatter->tslots = 0;
  stos_scatter->toFirst = 0;
  stos_scatter->toStep = 0;
  if (n) {
    if (tslots) {
      /* allocate GPU memory for the to-slots */
      err = cudaMalloc((void**)&intVecGPU,n*sizeof(PetscInt));CHKERRCUSP((int)err);
      err = cudaMemcpy(intVecGPU,tslots,n*sizeof(PetscInt),cudaMemcpyHostToDevice);CHKERRCUSP((int)err);

      /* assign the pointer to the struct */
      stos_scatter->tslots = intVecGPU;
      stos_scatter->toMode = VEC_SCATTER_CUSP_GENERAL;
    } else if (toStep) {
      stos_scatter->toFirst = toFirst;
      stos_scatter->toStep = toStep;
      stos_scatter->toMode = VEC_SCATTER_CUSP_STRIDED;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must provide tslots or toStep.");
  }

  /* create the stream on which the scatter kernels will be launched */
  err = cudaStreamCreate(&stream);CHKERRCUSP((int)err);
  stos_scatter->stream = stream;

  /* the number of indices */
  stos_scatter->n = n;

  /* query the device for launch limits: the maximum number of co-resident
     threads per multiprocessor and a bound on co-resident thread blocks */
  err = cudaGetDevice(&device);CHKERRCUSP((int)err);
  err = cudaGetDeviceProperties(&props,device);CHKERRCUSP((int)err);
  stos_scatter->MAX_CORESIDENT_THREADS = props.maxThreadsPerMultiProcessor;
  if (props.major>=3) {
    stos_scatter->MAX_BLOCKS = 16*props.multiProcessorCount;
  } else {
    stos_scatter->MAX_BLOCKS = 8*props.multiProcessorCount;
  }

  /* assign the indices */
  cci->scatter = (VecScatterCUSPIndices_StoS)stos_scatter;
  cci->scatterType = VEC_SCATTER_CUSP_STOS;
  *ci = cci;
  return(0);
}
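
/* Example (an illustration, not part of the original source): with n = 3 and
   fslots = {4,7,9}, the "from" side is general and the source entries are
   x[4], x[7], x[9]; with fslots = NULL, fromFirst = 2 and fromStep = 3 it is
   strided and reads x[2], x[5], x[8]. The "to" side behaves the same way. */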

PetscErrorCode VecScatterCUSPIndicesCreate_PtoP(PetscInt ns,PetscInt *sendIndices,PetscInt nr,PetscInt *recvIndices,PetscCUSPIndices *ci)
{
  PetscCUSPIndices           cci;
  VecScatterCUSPIndices_PtoP ptop_scatter;

  cci = new struct _p_PetscCUSPIndices;
  ptop_scatter = new struct _p_VecScatterCUSPIndices_PtoP;

  /* this calculation assumes the input indices are sorted in increasing order;
     ns and nr become the sizes of the contiguous index windows spanned by the
     send and receive indices, not the number of indices */
  ptop_scatter->ns = sendIndices[ns-1]-sendIndices[0]+1;
  ptop_scatter->sendLowestIndex = sendIndices[0];
  ptop_scatter->nr = recvIndices[nr-1]-recvIndices[0]+1;
  ptop_scatter->recvLowestIndex = recvIndices[0];

  /* assign indices */
  cci->scatter = (VecScatterCUSPIndices_PtoP)ptop_scatter;
  cci->scatterType = VEC_SCATTER_CUSP_PTOP;

  *ci = cci;
  return(0);
}
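
/* Example (illustrative values): for ns = 3 and sendIndices = {5,6,9},
   sendLowestIndex = 5 and the stored window size is ptop_scatter->ns =
   9-5+1 = 5; an unsorted input such as {9,6,5} would give 5-9+1 = -3, a
   negative window, which is why the indices must be sorted. */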

PetscErrorCode VecScatterCUSPIndicesDestroy(PetscCUSPIndices *ci)
{
  if (!ci || !*ci) return(0);
  try {
    if ((*ci)->scatterType == VEC_SCATTER_CUSP_PTOP) {
      delete (VecScatterCUSPIndices_PtoP)(*ci)->scatter;
      (*ci)->scatter = 0;
    } else {
      cudaError_t err = cudaSuccess;
      VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)(*ci)->scatter;

      /* free the GPU memory for the from-slots */
      if (stos_scatter->fslots) {
        err = cudaFree(stos_scatter->fslots);CHKERRCUSP((int)err);
        stos_scatter->fslots = 0;
      }

      /* free the GPU memory for the to-slots */
      if (stos_scatter->tslots) {
        err = cudaFree(stos_scatter->tslots);CHKERRCUSP((int)err);
        stos_scatter->tslots = 0;
      }

      /* destroy the stream */
      if (stos_scatter->stream) {
        err = cudaStreamDestroy(stos_scatter->stream);CHKERRCUSP((int)err);
        stos_scatter->stream = 0;
      }
      delete stos_scatter;
      (*ci)->scatter = 0;
    }
    delete *ci;
    *ci = 0;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  return(0);
}

/* Insert operator: keeps the incoming value and ignores the existing one */
class Insert {
 public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
    return a;
  }
};

/* Add operator */
class Add {
 public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
    return a+b;
  }
};

/* Max operator (only meaningful for real scalars) */
class Max {
 public:
  __device__ PetscScalar operator() (PetscScalar a,PetscScalar b) const {
#if !defined(PETSC_USE_COMPLEX)
    return PetscMax(a,b);
#endif
  }
};
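
/* Note (added for reference): these functors are applied on the device by the
   scatter kernels below; e.g. with Add() a scatter performs, per index,
   y[to[i]] = x[from[i]] + y[to[i]]. */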

/* Sequential general to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SGtoSG_kernel(PetscInt n,PetscInt *xind,PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[xind[i]],y[yind[i]]);
  }
}

/* Sequential general to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SGtoSS_kernel(PetscInt n,PetscInt *xind,PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[xind[i]],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential strided GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SStoSS_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,PetscScalar *x,PetscInt toFirst,PetscInt toStep,PetscScalar *y,OPERATOR OP) {
  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[toFirst+i*toStep] = OP(x[fromFirst+i*fromStep],y[toFirst+i*toStep]);
  }
}

/* Sequential strided to sequential general GPU kernel */
template<class OPERATOR>
__global__ void VecScatterCUSP_SStoSG_kernel(PetscInt n,PetscInt fromFirst,PetscInt fromStep,PetscScalar *x,PetscInt *yind,PetscScalar *y,OPERATOR OP) {
  const int tidx = blockIdx.x*blockDim.x + threadIdx.x;
  const int grid_size = gridDim.x * blockDim.x;
  for (int i = tidx; i < n; i += grid_size) {
    y[yind[i]] = OP(x[fromFirst+i*fromStep],y[yind[i]]);
  }
}
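
/* For reference (added note, not in the original file): each kernel above uses
   a grid-stride loop, so thread tidx handles indices tidx, tidx+grid_size,
   tidx+2*grid_size, ... and any grid size covers all n entries. On the host,
   SGtoSG is equivalent to
       for (i=0; i<n; i++) y[yind[i]] = OP(x[xind[i]],y[yind[i]]);
*/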

template<class OPERATOR>
void VecScatterCUSP_StoS_Dispatcher(CUSPARRAY *xarray,CUSPARRAY *yarray,PetscCUSPIndices ci,ScatterMode mode,OPERATOR OP)
{
  PetscInt                   nBlocks=0,nThreads=128;
  VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)ci->scatter;

  /* pick a launch configuration: enough blocks to cover n, capped at the
     number of nThreads-sized blocks that can be co-resident on a multiprocessor */
  nBlocks=(int)ceil(((float)stos_scatter->n)/((float)nThreads))+1;
  if (nBlocks>stos_scatter->MAX_CORESIDENT_THREADS/nThreads) {
    nBlocks = stos_scatter->MAX_CORESIDENT_THREADS/nThreads;
  }
  dim3 block(nThreads,1,1);
  dim3 grid(nBlocks,1,1);

  if (mode == SCATTER_FORWARD) {
    if (stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray->data().get(),stos_scatter->tslots,yarray->data().get(),OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fslots,xarray->data().get(),stos_scatter->toFirst,stos_scatter->toStep,yarray->data().get(),OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray->data().get(),stos_scatter->toFirst,stos_scatter->toStep,yarray->data().get(),OP);
    } else if (stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->fromFirst,stos_scatter->fromStep,xarray->data().get(),stos_scatter->tslots,yarray->data().get(),OP);
    }
  } else {
    /* SCATTER_REVERSE: the roles of the "to" and "from" descriptions are swapped */
    if (stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SGtoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray->data().get(),stos_scatter->fslots,yarray->data().get(),OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_GENERAL && stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SGtoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->tslots,xarray->data().get(),stos_scatter->fromFirst,stos_scatter->fromStep,yarray->data().get(),OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUSP_STRIDED) {
      VecScatterCUSP_SStoSS_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray->data().get(),stos_scatter->fromFirst,stos_scatter->fromStep,yarray->data().get(),OP);
    } else if (stos_scatter->toMode == VEC_SCATTER_CUSP_STRIDED && stos_scatter->fromMode == VEC_SCATTER_CUSP_GENERAL) {
      VecScatterCUSP_SStoSG_kernel<<<grid,block,0,stos_scatter->stream>>>(stos_scatter->n,stos_scatter->toFirst,stos_scatter->toStep,xarray->data().get(),stos_scatter->fslots,yarray->data().get(),OP);
    }
  }
}
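
/* Worked example (illustrative numbers, not from the original source): with
   n = 1000 and nThreads = 128 the dispatcher computes nBlocks =
   ceil(1000/128)+1 = 9; on a device with MAX_CORESIDENT_THREADS = 2048 the
   cap is 2048/128 = 16 blocks, so a 9-block grid is launched and the
   grid-stride loops cover all 1000 entries. */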

PetscErrorCode VecScatterCUSP_StoS(Vec x,Vec y,PetscCUSPIndices ci,InsertMode addv,ScatterMode mode)
{
  PetscErrorCode             ierr;
  CUSPARRAY                  *xarray,*yarray;
  VecScatterCUSPIndices_StoS stos_scatter = (VecScatterCUSPIndices_StoS)ci->scatter;
  cudaError_t                err = cudaSuccess;

  ierr = VecCUSPAllocateCheck(x);CHKERRQ(ierr);
  ierr = VecCUSPAllocateCheck(y);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayReadWrite(y,&yarray);CHKERRQ(ierr);
  if (stos_scatter->n) {
    if (addv == INSERT_VALUES)
      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Insert());
    else if (addv == ADD_VALUES)
      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Add());
#if !defined(PETSC_USE_COMPLEX)
    else if (addv == MAX_VALUES)
      VecScatterCUSP_StoS_Dispatcher(xarray,yarray,ci,mode,Max());
#endif
    else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Wrong insert option");
    err = cudaGetLastError();CHKERRCUSP((int)err);
    err = cudaStreamSynchronize(stos_scatter->stream);CHKERRCUSP((int)err);
  }
  ierr = VecCUSPRestoreArrayRead(x,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayReadWrite(y,&yarray);CHKERRQ(ierr);
  return(0);
}
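
/* Illustrative usage (a sketch under assumed values, not part of the original
   file): scatter the first n entries of a SEQCUSP vector x into every other
   entry of y with INSERT_VALUES. Both slot arrays are NULL, so the strided
   branches of VecScatterCUSPIndicesCreate_StoS are taken; the helper name
   ExampleStridedScatter is hypothetical. */
PetscErrorCode ExampleStridedScatter(Vec x,Vec y,PetscInt n)
{
  PetscErrorCode   ierr;
  PetscCUSPIndices ci;

  /* to: first=0, step=2; from: first=0, step=1; tslots=fslots=NULL */
  ierr = VecScatterCUSPIndicesCreate_StoS(n,0,0,2,1,NULL,NULL,&ci);CHKERRQ(ierr);
  ierr = VecScatterCUSP_StoS(x,y,ci,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterCUSPIndicesDestroy(&ci);CHKERRQ(ierr);
  return(0);
}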