Actual source code: bvblas.c

slepc-3.7.3 2016-09-29
Report Typos and Errors
  1: /*
  2:    BV private kernels that use the BLAS.

  4:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  5:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  6:    Copyright (c) 2002-2016, Universitat Politecnica de Valencia, Spain

  8:    This file is part of SLEPc.

 10:    SLEPc is free software: you can redistribute it and/or modify it under  the
 11:    terms of version 3 of the GNU Lesser General Public License as published by
 12:    the Free Software Foundation.

 14:    SLEPc  is  distributed in the hope that it will be useful, but WITHOUT  ANY
 15:    WARRANTY;  without even the implied warranty of MERCHANTABILITY or  FITNESS
 16:    FOR  A  PARTICULAR PURPOSE. See the GNU Lesser General Public  License  for
 17:    more details.

 19:    You  should have received a copy of the GNU Lesser General  Public  License
 20:    along with SLEPc. If not, see <http://www.gnu.org/licenses/>.
 21:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 22: */

 24: #include <slepc/private/bvimpl.h>
 25: #include <slepcblaslapack.h>

 27: #define BLOCKSIZE 64

 31: /*
 32:     C := alpha*A*B + beta*C

 34:     A is mxk (ld=m), B is kxn (ld=ldb), C is mxn (ld=m)
 35: */
 36: PetscErrorCode BVMult_BLAS_Private(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,PetscInt ldb_,PetscScalar alpha,const PetscScalar *A,const PetscScalar *B,PetscScalar beta,PetscScalar *C)
 37: {
 39:   PetscBLASInt   m,n,k,ldb;
 40: #if defined(PETSC_HAVE_FBLASLAPACK) || defined(PETSC_HAVE_F2CBLASLAPACK)
 41:   PetscBLASInt   l,bs=BLOCKSIZE;
 42: #endif

 45:   PetscBLASIntCast(m_,&m);
 46:   PetscBLASIntCast(n_,&n);
 47:   PetscBLASIntCast(k_,&k);
 48:   PetscBLASIntCast(ldb_,&ldb);
 49: #if defined(PETSC_HAVE_FBLASLAPACK) || defined(PETSC_HAVE_F2CBLASLAPACK)
 50:   l = m % bs;
 51:   if (l) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&k,&alpha,(PetscScalar*)A,&m,(PetscScalar*)B,&ldb,&beta,C,&m));
 52:   for (;l<m;l+=bs) {
 53:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bs,&n,&k,&alpha,(PetscScalar*)A+l,&m,(PetscScalar*)B,&ldb,&beta,C+l,&m));
 54:   }
 55: #else
 56:   if (m) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&m,&n,&k,&alpha,(PetscScalar*)A,&m,(PetscScalar*)B,&ldb,&beta,C,&m));
 57: #endif
 58:   PetscLogFlops(2.0*m*n*k);
 59:   return(0);
 60: }

 64: /*
 65:     y := alpha*A*x + beta*y

 67:     A is nxk (ld=n)
 68: */
 69: PetscErrorCode BVMultVec_BLAS_Private(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *A,const PetscScalar *x,PetscScalar beta,PetscScalar *y)
 70: {
 72:   PetscBLASInt   n,k,one=1;

 75:   PetscBLASIntCast(n_,&n);
 76:   PetscBLASIntCast(k_,&k);
 77:   if (n) PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&n,&k,&alpha,A,&n,x,&one,&beta,y,&one));
 78:   PetscLogFlops(2.0*n*k);
 79:   return(0);
 80: }

 84: /*
 85:     A(:,s:e-1) := A*B(:,s:e-1)

 87:     A is mxk (ld=m), B is kxn (ld=ldb)  n=e-s
 88: */
 89: PetscErrorCode BVMultInPlace_BLAS_Private(BV bv,PetscInt m_,PetscInt k_,PetscInt ldb_,PetscInt s,PetscInt e,PetscScalar *A,const PetscScalar *B,PetscBool btrans)
 90: {
 92:   PetscScalar    *pb,zero=0.0,one=1.0;
 93:   PetscBLASInt   m,n,k,l,ldb,bs=BLOCKSIZE;
 94:   PetscInt       j,n_=e-s;
 95:   const char     *bt;

 98:   PetscBLASIntCast(m_,&m);
 99:   PetscBLASIntCast(n_,&n);
100:   PetscBLASIntCast(k_,&k);
101:   PetscBLASIntCast(ldb_,&ldb);
102:   BVAllocateWork_Private(bv,BLOCKSIZE*n_);
103:   if (btrans) {
104:     pb = (PetscScalar*)B+s;
105:     bt = "C";
106:   } else {
107:     pb = (PetscScalar*)B+s*ldb;
108:     bt = "N";
109:   }
110:   l = m % bs;
111:   if (l) {
112:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&l,&n,&k,&one,A,&m,pb,&ldb,&zero,bv->work,&l));
113:     for (j=0;j<n;j++) {
114:       PetscMemcpy(A+(s+j)*m,bv->work+j*l,l*sizeof(PetscScalar));
115:     }
116:   }
117:   for (;l<m;l+=bs) {
118:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&bs,&n,&k,&one,A+l,&m,pb,&ldb,&zero,bv->work,&bs));
119:     for (j=0;j<n;j++) {
120:       PetscMemcpy(A+(s+j)*m+l,bv->work+j*bs,bs*sizeof(PetscScalar));
121:     }
122:   }
123:   PetscLogFlops(2.0*m*n*k);
124:   return(0);
125: }

129: /*
130:     V := V*B

132:     V is mxn (ld=m), B is nxn (ld=k)
133: */
134: PetscErrorCode BVMultInPlace_Vecs_Private(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,Vec *V,const PetscScalar *B,PetscBool btrans)
135: {
136:   PetscErrorCode    ierr;
137:   PetscScalar       zero=0.0,one=1.0,*out,*pout;
138:   const PetscScalar *pin;
139:   PetscBLASInt      m,n,k,l,bs=BLOCKSIZE;
140:   PetscInt          j;
141:   const char        *bt;

144:   PetscBLASIntCast(m_,&m);
145:   PetscBLASIntCast(n_,&n);
146:   PetscBLASIntCast(k_,&k);
147:   BVAllocateWork_Private(bv,2*BLOCKSIZE*n_);
148:   out = bv->work+BLOCKSIZE*n_;
149:   if (btrans) bt = "C";
150:   else bt = "N";
151:   l = m % bs;
152:   if (l) {
153:     for (j=0;j<n;j++) {
154:       VecGetArrayRead(V[j],&pin);
155:       PetscMemcpy(bv->work+j*l,pin,l*sizeof(PetscScalar));
156:       VecRestoreArrayRead(V[j],&pin);
157:     }
158:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&l,&n,&n,&one,bv->work,&l,(PetscScalar*)B,&k,&zero,out,&l));
159:     for (j=0;j<n;j++) {
160:       VecGetArray(V[j],&pout);
161:       PetscMemcpy(pout,out+j*l,l*sizeof(PetscScalar));
162:       VecRestoreArray(V[j],&pout);
163:     }
164:   }
165:   for (;l<m;l+=bs) {
166:     for (j=0;j<n;j++) {
167:       VecGetArrayRead(V[j],&pin);
168:       PetscMemcpy(bv->work+j*bs,pin+l,bs*sizeof(PetscScalar));
169:       VecRestoreArrayRead(V[j],&pin);
170:     }
171:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&bs,&n,&n,&one,bv->work,&bs,(PetscScalar*)B,&k,&zero,out,&bs));
172:     for (j=0;j<n;j++) {
173:       VecGetArray(V[j],&pout);
174:       PetscMemcpy(pout+l,out+j*bs,bs*sizeof(PetscScalar));
175:       VecRestoreArray(V[j],&pout);
176:     }
177:   }
178:   PetscLogFlops(2.0*n*n*k);
179:   return(0);
180: }

184: /*
185:     B := alpha*A + beta*B

187:     A,B are nxk (ld=n)
188: */
189: PetscErrorCode BVAXPY_BLAS_Private(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *A,PetscScalar beta,PetscScalar *B)
190: {
192:   PetscBLASInt   m,one=1;

195:   PetscBLASIntCast(n_*k_,&m);
196:   if (beta!=(PetscScalar)1.0) {
197:     PetscStackCallBLAS("BLASscal",BLASscal_(&m,&beta,B,&one));
198:     PetscLogFlops(m);
199:   }
200:   PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&m,&alpha,A,&one,B,&one));
201:   PetscLogFlops(2.0*m);
202:   return(0);
203: }

207: /*
208:     C := A'*B

210:     A' is mxk (ld=k), B is kxn (ld=k), C is mxn (ld=ldc)
211: */
212: PetscErrorCode BVDot_BLAS_Private(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,PetscInt ldc_,const PetscScalar *A,const PetscScalar *B,PetscScalar *C,PetscBool mpi)
213: {
215:   PetscScalar    zero=0.0,one=1.0,*CC;
216:   PetscBLASInt   m,n,k,ldc,j,len;

219:   PetscBLASIntCast(m_,&m);
220:   PetscBLASIntCast(n_,&n);
221:   PetscBLASIntCast(k_,&k);
222:   PetscBLASIntCast(ldc_,&ldc);
223:   if (mpi) {
224:     if (ldc==m) {
225:       BVAllocateWork_Private(bv,m*n);
226:       if (k) PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&m,&n,&k,&one,(PetscScalar*)A,&k,(PetscScalar*)B,&k,&zero,bv->work,&ldc));
227:       else { PetscMemzero(bv->work,m*n*sizeof(PetscScalar)); }
228:       PetscMPIIntCast(m*n,&len);
229:       MPI_Allreduce(bv->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv));
230:     } else {
231:       BVAllocateWork_Private(bv,2*m*n);
232:       CC = bv->work+m*n;
233:       if (k) PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&m,&n,&k,&one,(PetscScalar*)A,&k,(PetscScalar*)B,&k,&zero,bv->work,&m));
234:       else { PetscMemzero(bv->work,m*n*sizeof(PetscScalar)); }
235:       PetscMPIIntCast(m*n,&len);
236:       MPI_Allreduce(bv->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv));
237:       for (j=0;j<n;j++) {
238:         PetscMemcpy(C+j*ldc,CC+j*m,m*sizeof(PetscScalar));
239:       }
240:     }
241:   } else {
242:     if (k) PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&m,&n,&k,&one,(PetscScalar*)A,&k,(PetscScalar*)B,&k,&zero,C,&ldc));
243:   }
244:   PetscLogFlops(2.0*m*n*k);
245:   return(0);
246: }

250: /*
251:     y := A'*x

253:     A is nxk (ld=n)
254: */
255: PetscErrorCode BVDotVec_BLAS_Private(BV bv,PetscInt n_,PetscInt k_,const PetscScalar *A,const PetscScalar *x,PetscScalar *y,PetscBool mpi)
256: {
258:   PetscScalar    zero=0.0,done=1.0;
259:   PetscBLASInt   n,k,one=1,len;

262:   PetscBLASIntCast(n_,&n);
263:   PetscBLASIntCast(k_,&k);
264:   if (mpi) {
265:     BVAllocateWork_Private(bv,k);
266:     if (n) {
267:       PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&k,&done,A,&n,x,&one,&zero,bv->work,&one));
268:     } else {
269:       PetscMemzero(bv->work,k*sizeof(PetscScalar));
270:     }
271:     PetscMPIIntCast(k,&len);
272:     MPI_Allreduce(bv->work,y,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv));
273:   } else {
274:     if (n) PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&k,&done,A,&n,x,&one,&zero,y,&one));
275:   }
276:   PetscLogFlops(2.0*n*k);
277:   return(0);
278: }

282: /*
283:     Scale n scalars
284: */
285: PetscErrorCode BVScale_BLAS_Private(BV bv,PetscInt n_,PetscScalar *A,PetscScalar alpha)
286: {
288:   PetscBLASInt   n,one=1;

291:   if (alpha == (PetscScalar)0.0) {
292:     PetscMemzero(A,n_*sizeof(PetscScalar));
293:   } else if (alpha!=(PetscScalar)1.0) {
294:     PetscBLASIntCast(n_,&n);
295:     PetscStackCallBLAS("BLASscal",BLASscal_(&n,&alpha,A,&one));
296:     PetscLogFlops(n);
297:   }
298:   return(0);
299: }

303: /*
304:     Compute ||A|| for an mxn matrix
305: */
306: PetscErrorCode BVNorm_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,NormType type,PetscReal *nrm,PetscBool mpi)
307: {
309:   PetscBLASInt   m,n,i,j;
310:   PetscMPIInt    len;
311:   PetscReal      lnrm,*rwork=NULL,*rwork2=NULL;

314:   PetscFPTrapPush(PETSC_FP_TRAP_OFF);
315:   PetscBLASIntCast(m_,&m);
316:   PetscBLASIntCast(n_,&n);
317:   if (type==NORM_FROBENIUS || type==NORM_2) {
318:     lnrm = LAPACKlange_("F",&m,&n,(PetscScalar*)A,&m,rwork);
319:     if (mpi) {
320:       lnrm = lnrm*lnrm;
321:       MPI_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv));
322:       *nrm = PetscSqrtReal(*nrm);
323:     } else *nrm = lnrm;
324:     PetscLogFlops(2.0*m*n);
325:   } else if (type==NORM_1) {
326:     if (mpi) {
327:       BVAllocateWork_Private(bv,2*n_);
328:       rwork = (PetscReal*)bv->work;
329:       rwork2 = rwork+n_;
330:       PetscMemzero(rwork,n_*sizeof(PetscReal));
331:       PetscMemzero(rwork2,n_*sizeof(PetscReal));
332:       for (j=0;j<n_;j++) {
333:         for (i=0;i<m_;i++) {
334:           rwork[j] += PetscAbsScalar(A[i+j*m_]);
335:         }
336:       }
337:       PetscMPIIntCast(n_,&len);
338:       MPI_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv));
339:       *nrm = 0.0;
340:       for (j=0;j<n_;j++) if (rwork2[j] > *nrm) *nrm = rwork2[j];
341:     } else {
342:       *nrm = LAPACKlange_("O",&m,&n,(PetscScalar*)A,&m,rwork);
343:     }
344:     PetscLogFlops(1.0*m*n);
345:   } else if (type==NORM_INFINITY) {
346:     BVAllocateWork_Private(bv,m_);
347:     rwork = (PetscReal*)bv->work;
348:     lnrm = LAPACKlange_("I",&m,&n,(PetscScalar*)A,&m,rwork);
349:     if (mpi) {
350:       MPI_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)bv));
351:     } else *nrm = lnrm;
352:     PetscLogFlops(1.0*m*n);
353:   }
354:   PetscFPTrapPop();
355:   return(0);
356: }

360: /*
361:     QR factorization of an mxn matrix
362: */
363: PetscErrorCode BVOrthogonalize_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscScalar *R,PetscBool mpi)
364: {
365: #if defined(PETSC_MISSING_LAPACK_GEQRF) || defined(PETSC_MISSING_LAPACK_ORGQR)
367:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GEQRF/ORGQR - Lapack routines are unavailable");
368: #else
370:   PetscBLASInt   m,n,i,j,k,l,nb,lwork,info;
371:   PetscScalar    *tau,*work,*Rl=NULL,*A=NULL,*C=NULL,one=1.0,zero=0.0;
372:   PetscMPIInt    rank,size,len;

375:   PetscFPTrapPush(PETSC_FP_TRAP_OFF);
376:   PetscBLASIntCast(m_,&m);
377:   PetscBLASIntCast(n_,&n);
378:   k = PetscMin(m,n);
379:   nb = 16;
380:   if (mpi) {
381:     MPI_Comm_rank(PetscObjectComm((PetscObject)bv),&rank);
382:     MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size);
383:     BVAllocateWork_Private(bv,k+n*nb+n*n+n*n*size+m*n);
384:   } else {
385:     BVAllocateWork_Private(bv,k+n*nb);
386:    }
387:   tau = bv->work;
388:   work = bv->work+k;
389:   PetscBLASIntCast(n*nb,&lwork);
390:   if (mpi) {
391:     Rl = bv->work+k+n*nb;
392:     A  = bv->work+k+n*nb+n*n;
393:     C  = bv->work+k+n*nb+n*n+n*n*size;
394:   }

396:   /* Compute QR */
397:   PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,Q,&m,tau,work,&lwork,&info));
398:   if (info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Error in Lapack xGEQRF %d",info);

400:   /* Extract R */
401:   if (R || mpi) {
402:     PetscMemzero(mpi? Rl: R,n*n*sizeof(PetscScalar));
403:     for (j=0;j<n;j++) {
404:       for (i=0;i<=j;i++) {
405:         if (mpi) Rl[i+j*n] = Q[i+j*m];
406:         else R[i+j*n] = Q[i+j*m];
407:       }
408:     }
409:   }

411:   /* Compute orthogonal matrix in Q */
412:   PetscStackCallBLAS("LAPACKungqr",LAPACKungqr_(&m,&n,&k,Q,&m,tau,work,&lwork,&info));
413:   if (info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Error in Lapack xORGQR %d",info);

415:   if (mpi) {

417:     /* Stack triangular matrices */
418:     PetscBLASIntCast(n*size,&l);
419:     PetscMPIIntCast(n,&len);
420:     for (j=0;j<n;j++) {
421:       MPI_Allgather(Rl+j*n,len,MPIU_SCALAR,A+j*l,len,MPIU_SCALAR,PetscObjectComm((PetscObject)bv));
422:     }

424:     /* Compute QR */
425:     PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&l,&n,A,&l,tau,work,&lwork,&info));
426:     if (info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Error in Lapack xGEQRF %d",info);

428:     /* Extract R */
429:     if (R) {
430:       PetscMemzero(R,n*n*sizeof(PetscScalar));
431:       for (j=0;j<n;j++)
432:         for (i=0;i<=j;i++)
433:           R[i+j*n] = A[i+j*l];
434:     }

436:     /* Accumulate orthogonal matrix */
437:     PetscStackCallBLAS("LAPACKungqr",LAPACKungqr_(&l,&n,&n,A,&l,tau,work,&lwork,&info));
438:     if (info) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Error in Lapack xORGQR %d",info);
439:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&m,&n,&n,&one,Q,&m,A+rank*n,&l,&zero,C,&m));
440:     PetscMemcpy(Q,C,m*n*sizeof(PetscScalar));
441:   }

443:   PetscLogFlops(3.0*m*n*n);
444:   PetscFPTrapPop();
445:   return(0);
446: #endif
447: }