Actual source code: agmresorthog.c

#define PETSCKSP_DLL

#include <../src/ksp/ksp/impls/gmres/agmres/agmresimpl.h>
/*
  This file implements the RODDEC algorithm: its purpose is to orthogonalize a set of vectors distributed across several processes.
  These processes are organized in a virtual ring.

  References: [1] Sidje, Roger B. Alternatives for parallel Krylov subspace basis computation. Numer. Linear Algebra Appl. 4 (1997), no. 4, 305-331.

  initial author: R. B. SIDJE,
  modified: G.-A Atenekeng-Kahou, 2008
  modified: D. NUENTSA WAKAM, 2011

*/

/*
 * Take the processes that own the vectors and organize them on a virtual ring.
 */
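/*
 * For example, with 4 processes the ring is 0 -> 1 -> 2 -> 3 -> 0: rank 0 (= First) gets
 * Ileft = 3 (= Last) and Iright = 1, rank 3 (= Last) gets Ileft = 2 and Iright = 0 (= First),
 * and the intermediate ranks get Ileft = rank - 1 and Iright = rank + 1.
 */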
static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal *, PetscReal *, PetscReal *, PetscInt);

PetscErrorCode KSPAGMRESRoddecInitNeighboor(KSP ksp)
{
  MPI_Comm    comm;
  KSP_AGMRES *agmres = (KSP_AGMRES *)ksp->data;
  PetscMPIInt First, Last, rank, size;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)agmres->vecs[0], &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPIU_Allreduce(&rank, &First, 1, MPI_INT, MPI_MIN, comm));
  PetscCallMPI(MPIU_Allreduce(&rank, &Last, 1, MPI_INT, MPI_MAX, comm));

  if ((rank != Last) && (rank == 0)) {
    agmres->Ileft  = Last;
    agmres->Iright = rank + 1;
  } else {
    if (rank == Last) {
      agmres->Ileft  = rank - 1;
      agmres->Iright = First;
    } else {
      agmres->Ileft  = rank - 1;
      agmres->Iright = rank + 1;
    }
  }
  agmres->rank  = rank;
  agmres->size  = size;
  agmres->First = First;
  agmres->Last  = Last;
  PetscFunctionReturn(PETSC_SUCCESS);
}

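/*
 * Compute and compactly store a Givens rotation. When make_r == 1, (*c, *s) contains on entry the
 * pair (a, b) to be rotated; the rotation (c, s) that annihilates b is computed and a single scalar
 * encoding of it is stored in *r (r == 1 encodes (c, s) = (0, 1), |r| < 1 encodes s = 2*r, otherwise
 * c = 2/r). In all cases (*c, *s) is then rebuilt from *r, so a call with make_r == 0 simply
 * recovers a rotation saved earlier.
 */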
static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal *c, PetscReal *s, PetscReal *r, PetscInt make_r)
{
  PetscReal a, b, t;

  PetscFunctionBegin;
  if (make_r == 1) {
    a = *c;
    b = *s;
    if (b == 0.e0) {
      *c = 1.e0;
      *s = 0.e0;
    } else {
      if (PetscAbsReal(b) > PetscAbsReal(a)) {
        t  = -a / b;
        *s = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *c = (*s) * t;
      } else {
        t  = -b / a;
        *c = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *s = (*c) * t;
      }
    }
    if (*c == 0.e0) {
      *r = 1.e0;
    } else {
      if (PetscAbsReal(*s) < PetscAbsReal(*c)) {
        *r = PetscSign(*c) * (*s) / 2.e0;
      } else {
        *r = PetscSign(*s) * 2.e0 / (*c);
      }
    }
  }

  if (*r == 1.e0) {
    *c = 0.e0;
    *s = 1.e0;
  } else {
    if (PetscAbsReal(*r) < 1.e0) {
      *s = 2.e0 * (*r);
      *c = PetscSqrtReal(1.e0 - (*s) * (*s));
    } else {
      *c = 2.e0 / (*r);
      *s = PetscSqrtReal(1.e0 - (*c) * (*c));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Compute the QR factorization of the Krylov basis vectors
  Input:
   - the vectors are available in agmres->vecs (alias VEC_V)
   - nvec : the number of vectors
  Output:
   - agmres->Qloc : product of elementary reflectors for the QR of the (local part) of the vectors.
   - agmres->sgn : sign of the rotations
   - agmres->tloc : scalar factors of the elementary reflectors.

*/
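/*
  The factorization proceeds in two stages: each process first reduces its local block of the basis
  with Householder reflectors (stored in Qloc and tloc), then the stacked local triangular factors
  are merged along the virtual ring with Givens rotations (their compact encodings overwrite the
  corresponding entries of Qloc), so that the global R factor is assembled on the Last process
  before being broadcast in agmres->Rloc.
*/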
PetscErrorCode KSPAGMRESRoddec(KSP ksp, PetscInt nvec)
{
  KSP_AGMRES  *agmres = (KSP_AGMRES *)ksp->data;
  MPI_Comm     comm;
  PetscScalar *Qloc    = agmres->Qloc;
  PetscScalar *sgn     = agmres->sgn;
  PetscScalar *tloc    = agmres->tloc;
  PetscReal   *wbufptr = agmres->wbufptr;
  PetscMPIInt  rank    = agmres->rank;
  PetscMPIInt  First   = agmres->First;
  PetscMPIInt  Last    = agmres->Last;
  PetscBLASInt pas, len, bnloc, bpos;
  PetscInt     nloc, d, i, j, k;
  PetscInt     pos;
  PetscReal    c, s, rho, Ajj, val, tt, old;
  PetscScalar *col;
  MPI_Status   status;
  PetscBLASInt N;

  PetscFunctionBegin;
  PetscCall(PetscBLASIntCast(MAXKSPSIZE + 1, &N));
  PetscCall(PetscObjectGetComm((PetscObject)ksp, &comm));
  PetscCall(PetscLogEventBegin(KSP_AGMRESRoddec, ksp, 0, 0, 0));
  PetscCall(PetscArrayzero(agmres->Rloc, N * N));
  /* check input arguments */
  PetscCheck(nvec >= 1, PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_OUTOFRANGE, "The number of input vectors should be positive");
  PetscCall(VecGetLocalSize(VEC_V(0), &nloc));
  PetscCall(PetscBLASIntCast(nloc, &bnloc));
  PetscCheck(nvec <= nloc, PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_WRONG, "In QR factorization, the number of local rows should be greater than or equal to the number of columns");
  pas = 1;
  /* Copy the vectors of the basis */
  for (j = 0; j < nvec; j++) {
    PetscCall(VecGetArray(VEC_V(j), &col));
    PetscCallBLAS("BLAScopy", BLAScopy_(&bnloc, col, &pas, &Qloc[j * nloc], &pas));
    PetscCall(VecRestoreArray(VEC_V(j), &col));
  }
  /* Each process performs a local QR on its own block */
  for (j = 0; j < nvec; j++) {
    PetscCall(PetscBLASIntCast(nloc - j, &len));
    Ajj = Qloc[j * nloc + j];
    PetscCallBLAS("BLASnrm2", rho = -PetscSign(Ajj) * BLASnrm2_(&len, &Qloc[j * nloc + j], &pas));
    if (rho == 0.0) tloc[j] = 0.0;
    else {
      tloc[j] = (Ajj - rho) / rho;
      len     = len - 1;
      val     = 1.0 / (Ajj - rho);
      PetscCallBLAS("BLASscal", BLASscal_(&len, &val, &Qloc[j * nloc + j + 1], &pas));
      Qloc[j * nloc + j] = 1.0;
      len                = len + 1;
      for (k = j + 1; k < nvec; k++) {
        PetscCallBLAS("BLASdot", tt = tloc[j] * BLASdot_(&len, &Qloc[j * nloc + j], &pas, &Qloc[k * nloc + j], &pas));
        PetscCallBLAS("BLASaxpy", BLASaxpy_(&len, &tt, &Qloc[j * nloc + j], &pas, &Qloc[k * nloc + j], &pas));
      }
      Qloc[j * nloc + j] = rho;
    }
  }
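  /* At this point each process holds the Householder vectors of its local QR below the diagonal of
     Qloc (with scalar factors in tloc) and its local triangular factor on and above the diagonal */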
  /* annihilate undesirable Rloc, diagonal by diagonal */
  for (d = 0; d < nvec; d++) {
    PetscCall(PetscBLASIntCast(nvec - d, &len));
    if (rank == First) {
      PetscCallBLAS("BLAScopy", BLAScopy_(&len, &Qloc[d * nloc + d], &bnloc, &wbufptr[d], &pas));
      PetscCallMPI(MPI_Send(&wbufptr[d], len, MPIU_SCALAR, rank + 1, agmres->tag, comm));
    } else {
      PetscCallMPI(MPI_Recv(&wbufptr[d], len, MPIU_SCALAR, rank - 1, agmres->tag, comm, &status));
      /* Elimination of Rloc(1,d) */
      c = wbufptr[d];
      s = Qloc[d * nloc];
      PetscCall(KSPAGMRESRoddecGivens(&c, &s, &rho, 1));
      /* Apply Givens Rotation */
      for (k = d; k < nvec; k++) {
        old            = wbufptr[k];
        wbufptr[k]     = c * old - s * Qloc[k * nloc];
        Qloc[k * nloc] = s * old + c * Qloc[k * nloc];
      }
      Qloc[d * nloc] = rho;
      if (rank != Last) PetscCallMPI(MPI_Send(&wbufptr[d], len, MPIU_SCALAR, rank + 1, agmres->tag, comm));
      /* zero-out the d-th diagonal of Rloc ... */
      for (j = d + 1; j < nvec; j++) {
        /* elimination of Rloc[i][j] */
        i = j - d;
        c = Qloc[j * nloc + i - 1];
        s = Qloc[j * nloc + i];
        PetscCall(KSPAGMRESRoddecGivens(&c, &s, &rho, 1));
        for (k = j; k < nvec; k++) {
          old                    = Qloc[k * nloc + i - 1];
          Qloc[k * nloc + i - 1] = c * old - s * Qloc[k * nloc + i];
          Qloc[k * nloc + i]     = s * old + c * Qloc[k * nloc + i];
        }
        Qloc[j * nloc + i] = rho;
      }
      if (rank == Last) {
        PetscCallBLAS("BLAScopy", BLAScopy_(&len, &wbufptr[d], &pas, RLOC(d, d), &N));
        for (k = d + 1; k < nvec; k++) *RLOC(k, d) = 0.0;
      }
    }
  }

  if (rank == Last) {
    for (d = 0; d < nvec; d++) {
      pos = nvec - d;
      PetscCall(PetscBLASIntCast(pos, &bpos));
      sgn[d] = PetscSign(*RLOC(d, d));
      PetscCallBLAS("BLASscal", BLASscal_(&bpos, &sgn[d], RLOC(d, d), &N));
    }
  }
  /* Broadcast Rloc to all other processes
   * NWD: should not be needed
   */
  PetscCallMPI(MPI_Bcast(agmres->Rloc, N * N, MPIU_SCALAR, Last, comm));
  PetscCall(PetscLogEventEnd(KSP_AGMRESRoddec, ksp, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
   Computes Out <-- Q * In where Q is the orthogonal matrix from AGMRESRoddec
   Input:
    - Qloc, sgn, tloc, nvec (see AGMRESRoddec above)
    - In : input vector (size nvec)
   Output:
    - Out : PETSc vector (distributed as the basis vectors)
*/
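/*
  The product is applied in two steps that mirror KSPAGMRESRoddec: the Givens rotations of the ring
  stage are rebuilt from their compact encodings stored in Qloc (make_r == 0) and applied while the
  intermediate scalars travel back along the ring, then each process applies its local Householder
  reflectors (Qloc, tloc) to its part of Out.
*/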
PetscErrorCode KSPAGMRESRodvec(KSP ksp, PetscInt nvec, PetscScalar *In, Vec Out)
{
  KSP_AGMRES  *agmres = (KSP_AGMRES *)ksp->data;
  MPI_Comm     comm;
  PetscScalar *Qloc  = agmres->Qloc;
  PetscScalar *sgn   = agmres->sgn;
  PetscScalar *tloc  = agmres->tloc;
  PetscMPIInt  rank  = agmres->rank;
  PetscMPIInt  First = agmres->First, Last = agmres->Last;
  PetscMPIInt  Iright = agmres->Iright, Ileft = agmres->Ileft;
  PetscScalar *y, *zloc;
  PetscInt     nloc, d, len, i, j;
  PetscBLASInt bnvec, pas, blen;
  PetscInt     dpt;
  PetscReal    c, s, rho, zp, zq, yd = 0.0, tt;
  MPI_Status   status;

  PetscFunctionBegin;
  PetscCall(PetscBLASIntCast(nvec, &bnvec));
  PetscCall(PetscObjectGetComm((PetscObject)ksp, &comm));
  pas = 1;
  PetscCall(VecGetLocalSize(VEC_V(0), &nloc));
  PetscCall(PetscMalloc1(nvec, &y));
  PetscCall(PetscArraycpy(y, In, nvec));
  PetscCall(VecGetArray(Out, &zloc));

  if (rank == Last) {
    for (i = 0; i < nvec; i++) y[i] = sgn[i] * y[i];
  }
  for (i = 0; i < nloc; i++) zloc[i] = 0.0;
  if (agmres->size == 1) PetscCallBLAS("BLAScopy", BLAScopy_(&bnvec, y, &pas, &zloc[0], &pas));
  else {
    for (d = nvec - 1; d >= 0; d--) {
      if (rank == First) {
        PetscCallMPI(MPI_Recv(&zloc[d], 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status));
      } else {
        for (j = nvec - 1; j >= d + 1; j--) {
          i = j - d;
          PetscCall(KSPAGMRESRoddecGivens(&c, &s, &Qloc[j * nloc + i], 0));
          zp          = zloc[i - 1];
          zq          = zloc[i];
          zloc[i - 1] = c * zp + s * zq;
          zloc[i]     = -s * zp + c * zq;
        }
        PetscCall(KSPAGMRESRoddecGivens(&c, &s, &Qloc[d * nloc], 0));
        if (rank == Last) {
          zp      = y[d];
          zq      = zloc[0];
          y[d]    = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          PetscCallMPI(MPI_Send(&y[d], 1, MPIU_SCALAR, Ileft, agmres->tag, comm));
        } else {
          PetscCallMPI(MPI_Recv(&yd, 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status));
          zp      = yd;
          zq      = zloc[0];
          yd      = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          PetscCallMPI(MPI_Send(&yd, 1, MPIU_SCALAR, Ileft, agmres->tag, comm));
        }
      }
    }
  }
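  /* Apply the local elementary reflectors stored in Qloc (Householder vectors) and tloc (scalar
     factors), in reverse order, to the local part of Out */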
  for (j = nvec - 1; j >= 0; j--) {
    dpt = j * nloc + j;
    if (tloc[j] != 0.0) {
      len = nloc - j;
      PetscCall(PetscBLASIntCast(len, &blen));
      rho       = Qloc[dpt];
      Qloc[dpt] = 1.0;
      tt        = tloc[j] * (BLASdot_(&blen, &Qloc[dpt], &pas, &zloc[j], &pas));
      PetscCallBLAS("BLASaxpy", BLASaxpy_(&blen, &tt, &Qloc[dpt], &pas, &zloc[j], &pas));
      Qloc[dpt] = rho;
    }
  }
  PetscCall(VecRestoreArray(Out, &zloc));
  PetscCall(PetscFree(y));
  PetscFunctionReturn(PETSC_SUCCESS);
}