Actual source code: agmresorthog.c
#define PETSCKSP_DLL

#include <../src/ksp/ksp/impls/gmres/agmres/agmresimpl.h>
/*
  This file implements the RODDEC algorithm: its purpose is to orthogonalize a set of vectors distributed across several processes.
  These processes are organized in a virtual ring.

  Reference: [1] Sidje, Roger B. Alternatives for parallel Krylov subspace basis computation. Numer. Linear Algebra Appl. 4 (1997), no. 4, 305-331.

  Initial author: R. B. SIDJE
  Modified: G.-A. Atenekeng-Kahou, 2008
  Modified: D. NUENTSA WAKAM, 2011
*/
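
/*
  Overview (as implemented below, following [1]): the orthogonalization proceeds in two stages. Each process
  first computes a QR factorization of its local block of the basis with Householder reflectors (first loop of
  KSPAGMRESRoddec). The per-process triangular factors are then merged by Givens rotations that travel along
  the virtual ring, so that the global factor R ends up on the last process of the ring (and is broadcast from
  there). The reflectors and rotations are kept in compact form so that the orthogonal factor Q can later be
  applied with KSPAGMRESRodvec.
*/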

/*
 * Take the processes that own the vectors and organize them on a virtual ring.
 */
static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal *, PetscReal *, PetscReal *, PetscInt);
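
/*
  For example, assuming the owning processes have contiguous ranks 0..size-1: with 4 processes the ring is
  0 -> 1 -> 2 -> 3 -> 0, so rank 0 gets Ileft = 3 and Iright = 1, ranks 1 and 2 get Ileft = rank - 1 and
  Iright = rank + 1, and rank 3 (= Last) gets Ileft = 2 and Iright = 0 (= First).
*/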
PetscErrorCode KSPAGMRESRoddecInitNeighboor(KSP ksp)
{
  MPI_Comm    comm;
  KSP_AGMRES *agmres = (KSP_AGMRES *)ksp->data;
  PetscMPIInt First, Last, rank, size;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)agmres->vecs[0], &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPIU_Allreduce(&rank, &First, 1, MPI_INT, MPI_MIN, comm));
  PetscCallMPI(MPIU_Allreduce(&rank, &Last, 1, MPI_INT, MPI_MAX, comm));

  if ((rank != Last) && (rank == 0)) {
    /* First process of the ring: its left neighbor wraps around to the last process */
    agmres->Ileft  = Last;
    agmres->Iright = rank + 1;
  } else {
    if (rank == Last) {
      /* Last process of the ring: its right neighbor wraps around to the first process */
      agmres->Ileft  = rank - 1;
      agmres->Iright = First;
    } else {
      /* Interior process: its neighbors are the adjacent ranks */
      agmres->Ileft  = rank - 1;
      agmres->Iright = rank + 1;
    }
  }
  agmres->rank  = rank;
  agmres->size  = size;
  agmres->First = First;
  agmres->Last  = Last;
  PetscFunctionReturn(PETSC_SUCCESS);
}
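
/*
  Compute or decode a Givens rotation in compact form.
  With make_r == 1, the input pair (a, b) is passed in (*c, *s); the routine computes the rotation (c, s)
  that annihilates b and encodes it into the single scalar *r (a packing similar in spirit to the classical
  LINPACK drotg convention). In all cases, (c, s) is then reconstructed from *r; calling the routine with
  make_r == 0 therefore simply decodes a rotation previously stored in *r, which is how KSPAGMRESRodvec
  replays the rotations saved in Qloc by KSPAGMRESRoddec.
*/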
static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal *c, PetscReal *s, PetscReal *r, PetscInt make_r)
{
  PetscReal a, b, t;

  PetscFunctionBegin;
  if (make_r == 1) {
    a = *c;
    b = *s;
    if (b == 0.e0) {
      *c = 1.e0;
      *s = 0.e0;
    } else {
      if (PetscAbsReal(b) > PetscAbsReal(a)) {
        t  = -a / b;
        *s = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *c = (*s) * t;
      } else {
        t  = -b / a;
        *c = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *s = (*c) * t;
      }
    }
    if (*c == 0.e0) {
      *r = 1.e0;
    } else {
      if (PetscAbsReal(*s) < PetscAbsReal(*c)) {
        *r = PetscSign(*c) * (*s) / 2.e0;
      } else {
        *r = PetscSign(*s) * 2.e0 / (*c);
      }
    }
  }

  if (*r == 1.e0) {
    *c = 0.e0;
    *s = 1.e0;
  } else {
    if (PetscAbsReal(*r) < 1.e0) {
      *s = 2.e0 * (*r);
      *c = PetscSqrtReal(1.e0 - (*s) * (*s));
    } else {
      *c = 2.e0 / (*r);
      *s = PetscSqrtReal(1.e0 - (*c) * (*c));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Compute the QR factorization of the Krylov basis vectors
  Input:
   - the vectors are available in agmres->vecs (alias VEC_V)
   - nvec : the number of vectors
  Output:
   - agmres->Qloc : product of elementary reflectors for the QR of the (local part of the) vectors
   - agmres->sgn  : sign of the rotations
   - agmres->tloc : scalar factors of the elementary reflectors
*/
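/*
  Roughly, the storage used below works as follows: Qloc is an nloc x nvec array in column-major order with
  leading dimension nloc. After the local QR, column j holds the Householder vector below the diagonal
  (normalized to an implicit leading 1) and the triangular factor of the local block on and above the
  diagonal. During the ring sweep, each entry that is annihilated is overwritten by the compact encoding of
  the Givens rotation that eliminated it (see KSPAGMRESRoddecGivens), so that KSPAGMRESRodvec can later
  replay all transformations. The assembled R is accumulated in agmres->Rloc on the last process of the ring
  and then broadcast.
*/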
PetscErrorCode KSPAGMRESRoddec(KSP ksp, PetscInt nvec)
{
  KSP_AGMRES  *agmres = (KSP_AGMRES *)ksp->data;
  MPI_Comm     comm;
  PetscScalar *Qloc    = agmres->Qloc;
  PetscScalar *sgn     = agmres->sgn;
  PetscScalar *tloc    = agmres->tloc;
  PetscReal   *wbufptr = agmres->wbufptr;
  PetscMPIInt  rank    = agmres->rank;
  PetscMPIInt  First   = agmres->First;
  PetscMPIInt  Last    = agmres->Last;
  PetscBLASInt pas, len, bnloc, bpos;
  PetscInt     nloc, d, i, j, k;
  PetscInt     pos;
  PetscReal    c, s, rho, Ajj, val, tt, old;
  PetscScalar *col;
  MPI_Status   status;
  PetscBLASInt N;

  PetscFunctionBegin;
  PetscCall(PetscBLASIntCast(MAXKSPSIZE + 1, &N));
  PetscCall(PetscObjectGetComm((PetscObject)ksp, &comm));
  PetscCall(PetscLogEventBegin(KSP_AGMRESRoddec, ksp, 0, 0, 0));
  PetscCall(PetscArrayzero(agmres->Rloc, N * N));
  /* check input arguments */
  PetscCheck(nvec >= 1, PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_OUTOFRANGE, "The number of input vectors should be positive");
  PetscCall(VecGetLocalSize(VEC_V(0), &nloc));
  PetscCall(PetscBLASIntCast(nloc, &bnloc));
  PetscCheck(nvec <= nloc, PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_WRONG, "In QR factorization, the number of local rows should be greater than or equal to the number of columns");
  pas = 1;
  /* Copy the vectors of the basis */
  for (j = 0; j < nvec; j++) {
    PetscCall(VecGetArray(VEC_V(j), &col));
    PetscCallBLAS("BLAScopy", BLAScopy_(&bnloc, col, &pas, &Qloc[j * nloc], &pas));
    PetscCall(VecRestoreArray(VEC_V(j), &col));
  }
  /* Each process performs a local QR on its own block */
  for (j = 0; j < nvec; j++) {
    PetscCall(PetscBLASIntCast(nloc - j, &len));
    Ajj = Qloc[j * nloc + j];
    PetscCallBLAS("BLASnrm2", rho = -PetscSign(Ajj) * BLASnrm2_(&len, &Qloc[j * nloc + j], &pas));
    if (rho == 0.0) tloc[j] = 0.0;
    else {
      tloc[j] = (Ajj - rho) / rho;
      len     = len - 1;
      val     = 1.0 / (Ajj - rho);
      PetscCallBLAS("BLASscal", BLASscal_(&len, &val, &Qloc[j * nloc + j + 1], &pas));
      Qloc[j * nloc + j] = 1.0;
      len                = len + 1;
      for (k = j + 1; k < nvec; k++) {
        PetscCallBLAS("BLASdot", tt = tloc[j] * BLASdot_(&len, &Qloc[j * nloc + j], &pas, &Qloc[k * nloc + j], &pas));
        PetscCallBLAS("BLASaxpy", BLASaxpy_(&len, &tt, &Qloc[j * nloc + j], &pas, &Qloc[k * nloc + j], &pas));
      }
      Qloc[j * nloc + j] = rho;
    }
  }
  /* annihilate undesirable Rloc entries, diagonal by diagonal */
  for (d = 0; d < nvec; d++) {
    PetscCall(PetscBLASIntCast(nvec - d, &len));
    if (rank == First) {
      PetscCallBLAS("BLAScopy", BLAScopy_(&len, &Qloc[d * nloc + d], &bnloc, &wbufptr[d], &pas));
      PetscCallMPI(MPI_Send(&wbufptr[d], len, MPIU_SCALAR, rank + 1, agmres->tag, comm));
    } else {
      PetscCallMPI(MPI_Recv(&wbufptr[d], len, MPIU_SCALAR, rank - 1, agmres->tag, comm, &status));
      /* Elimination of Rloc(1,d) */
      c = wbufptr[d];
      s = Qloc[d * nloc];
      PetscCall(KSPAGMRESRoddecGivens(&c, &s, &rho, 1));
      /* Apply Givens rotation */
      for (k = d; k < nvec; k++) {
        old            = wbufptr[k];
        wbufptr[k]     = c * old - s * Qloc[k * nloc];
        Qloc[k * nloc] = s * old + c * Qloc[k * nloc];
      }
      Qloc[d * nloc] = rho;
      if (rank != Last) PetscCallMPI(MPI_Send(&wbufptr[d], len, MPIU_SCALAR, rank + 1, agmres->tag, comm));
      /* zero-out the d-th diagonal of Rloc ... */
      for (j = d + 1; j < nvec; j++) {
        /* elimination of Rloc[i][j] */
        i = j - d;
        c = Qloc[j * nloc + i - 1];
        s = Qloc[j * nloc + i];
        PetscCall(KSPAGMRESRoddecGivens(&c, &s, &rho, 1));
        for (k = j; k < nvec; k++) {
          old                    = Qloc[k * nloc + i - 1];
          Qloc[k * nloc + i - 1] = c * old - s * Qloc[k * nloc + i];
          Qloc[k * nloc + i]     = s * old + c * Qloc[k * nloc + i];
        }
        Qloc[j * nloc + i] = rho;
      }
      if (rank == Last) {
        PetscCallBLAS("BLAScopy", BLAScopy_(&len, &wbufptr[d], &pas, RLOC(d, d), &N));
        for (k = d + 1; k < nvec; k++) *RLOC(k, d) = 0.0;
      }
    }
  }

  if (rank == Last) {
    for (d = 0; d < nvec; d++) {
      pos = nvec - d;
      PetscCall(PetscBLASIntCast(pos, &bpos));
      sgn[d] = PetscSign(*RLOC(d, d));
      PetscCallBLAS("BLASscal", BLASscal_(&bpos, &sgn[d], RLOC(d, d), &N));
    }
  }
  /* Broadcast Rloc to all other processes
   * NWD: this should not be needed
   */
  PetscCallMPI(MPI_Bcast(agmres->Rloc, N * N, MPIU_SCALAR, Last, comm));
  PetscCall(PetscLogEventEnd(KSP_AGMRESRoddec, ksp, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Computes Out <-- Q * In where Q is the orthogonal matrix from KSPAGMRESRoddec
  Input:
   - Qloc, sgn, tloc, nvec (see KSPAGMRESRoddec above)
   - In : input vector (size nvec)
  Output:
   - Out : PETSc vector (distributed as the basis vectors)
*/
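/*
  Sketch of how Q is applied (mirroring KSPAGMRESRoddec in reverse): the Givens rotations of the ring sweep
  are decoded from Qloc (make_r == 0) and replayed in reverse order while the rotated entries travel back
  along the ring toward the first process, and the local Householder reflectors stored in Qloc/tloc are then
  applied to the local part of the result.
*/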
PetscErrorCode KSPAGMRESRodvec(KSP ksp, PetscInt nvec, PetscScalar *In, Vec Out)
{
  KSP_AGMRES  *agmres = (KSP_AGMRES *)ksp->data;
  MPI_Comm     comm;
  PetscScalar *Qloc   = agmres->Qloc;
  PetscScalar *sgn    = agmres->sgn;
  PetscScalar *tloc   = agmres->tloc;
  PetscMPIInt  rank   = agmres->rank;
  PetscMPIInt  First  = agmres->First, Last = agmres->Last;
  PetscMPIInt  Iright = agmres->Iright, Ileft = agmres->Ileft;
  PetscScalar *y, *zloc;
  PetscInt     nloc, d, len, i, j;
  PetscBLASInt bnvec, pas, blen;
  PetscInt     dpt;
  PetscReal    c, s, rho, zp, zq, yd = 0.0, tt;
  MPI_Status   status;

  PetscFunctionBegin;
  PetscCall(PetscBLASIntCast(nvec, &bnvec));
  PetscCall(PetscObjectGetComm((PetscObject)ksp, &comm));
  pas = 1;
  PetscCall(VecGetLocalSize(VEC_V(0), &nloc));
  PetscCall(PetscMalloc1(nvec, &y));
  PetscCall(PetscArraycpy(y, In, nvec));
  PetscCall(VecGetArray(Out, &zloc));

  if (rank == Last) {
    for (i = 0; i < nvec; i++) y[i] = sgn[i] * y[i];
  }
  for (i = 0; i < nloc; i++) zloc[i] = 0.0;
  if (agmres->size == 1) PetscCallBLAS("BLAScopy", BLAScopy_(&bnvec, y, &pas, &zloc[0], &pas));
  else {
    for (d = nvec - 1; d >= 0; d--) {
      if (rank == First) {
        PetscCallMPI(MPI_Recv(&zloc[d], 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status));
      } else {
        for (j = nvec - 1; j >= d + 1; j--) {
          i = j - d;
          PetscCall(KSPAGMRESRoddecGivens(&c, &s, &Qloc[j * nloc + i], 0));
          zp          = zloc[i - 1];
          zq          = zloc[i];
          zloc[i - 1] = c * zp + s * zq;
          zloc[i]     = -s * zp + c * zq;
        }
        PetscCall(KSPAGMRESRoddecGivens(&c, &s, &Qloc[d * nloc], 0));
        if (rank == Last) {
          zp      = y[d];
          zq      = zloc[0];
          y[d]    = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          PetscCallMPI(MPI_Send(&y[d], 1, MPIU_SCALAR, Ileft, agmres->tag, comm));
        } else {
          PetscCallMPI(MPI_Recv(&yd, 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status));
          zp      = yd;
          zq      = zloc[0];
          yd      = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          PetscCallMPI(MPI_Send(&yd, 1, MPIU_SCALAR, Ileft, agmres->tag, comm));
        }
      }
    }
  }
  for (j = nvec - 1; j >= 0; j--) {
    dpt = j * nloc + j;
    if (tloc[j] != 0.0) {
      len = nloc - j;
      PetscCall(PetscBLASIntCast(len, &blen));
      rho       = Qloc[dpt];
      Qloc[dpt] = 1.0;
      tt        = tloc[j] * (BLASdot_(&blen, &Qloc[dpt], &pas, &zloc[j], &pas));
      PetscCallBLAS("BLASaxpy", BLASaxpy_(&blen, &tt, &Qloc[dpt], &pas, &zloc[j], &pas));
      Qloc[dpt] = rho;
    }
  }
  PetscCall(VecRestoreArray(Out, &zloc));
  PetscCall(PetscFree(y));
  PetscFunctionReturn(PETSC_SUCCESS);
}