Actual source code: sfmpi.c

  1: #include <../src/vec/is/sf/impls/basic/sfpack.h>

  3: // Though there is no default mechanism to start a communication, we have a
  4: // default way to finish a communication, which is just waiting on the requests.
  5: // It should work for both non-blocking and persistent send/recvs and collectives.
  6: static PetscErrorCode PetscSFLinkFinishCommunication_Default(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
  7: {
  8:   PetscSF_Basic     *bas           = (PetscSF_Basic *)sf->data;
  9:   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
 10:   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

 12:   PetscFunctionBegin;
 13:   if (sf->monitor) {
 14:     PetscMPIInt rank;
 15:     const char *rootaction = (direction == PETSCSF_ROOT2LEAF) ? "sending to  " : "recving from";
 16:     const char *leafaction = (direction == PETSCSF_ROOT2LEAF) ? "recving from" : "sending to  ";
 17:     const char *sfaction   = (direction == PETSCSF_ROOT2LEAF) ? "PetscSFBcast" : "PetscSFReduce";

 19:     PetscCall(PetscPrintf(PETSC_COMM_SELF, "---------------- Begin %s communication -------------------\n", sfaction));
 20:     PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)sf), &rank));

 22:     for (PetscMPIInt i = 0; i < bas->nrootreqs; i++) {
 23:       size_t size = (bas->ioffset[i + bas->ndiranks + 1] - bas->ioffset[i + bas->ndiranks]) * link->unitbytes;
 24:       PetscCall(PetscPrintf(PETSC_COMM_SELF, "Rank %6d %s Rank %6d (%16zu bytes) with MPI tag %10d ... ", rank, rootaction, bas->iranks[i + bas->ndiranks], size, link->tag));
 25:       PetscCallMPI(MPI_Wait(link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + i, MPI_STATUS_IGNORE));
 26:       PetscCall(PetscPrintf(PETSC_COMM_SELF, "DONE\n"));
 27:     }
 28:     for (PetscMPIInt i = 0; i < sf->nleafreqs; i++) {
 29:       size_t size = (sf->roffset[i + sf->ndranks + 1] - sf->roffset[i + sf->ndranks]) * link->unitbytes;
 30:       PetscCall(PetscPrintf(PETSC_COMM_SELF, "Rank %6d %s Rank %6d (%16zu bytes) with MPI tag %10d ... ", rank, leafaction, sf->ranks[i + sf->ndranks], size, link->tag));
 31:       PetscCallMPI(MPI_Wait(link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + i, MPI_STATUS_IGNORE));
 32:       PetscCall(PetscPrintf(PETSC_COMM_SELF, "DONE\n"));
 33:     }
 34:     PetscCall(PetscPrintf(PETSC_COMM_SELF, "---------------- End   %s communication -------------------\n\n", sfaction));
 35:   } else {
 36:     if (bas->nrootreqs) PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
 37:     if (sf->nleafreqs) PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
 38:   }

 40:   if (direction == PETSCSF_ROOT2LEAF) {
 41:     PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after recving */));
 42:   } else {
 43:     PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE));
 44:   }
 45:   PetscFunctionReturn(PETSC_SUCCESS);
 46: }

 48: /*
 49:    This routine creates a communication link for the given operation. It first looks in its link cache. If
 50:    there is a free and suitable link, it reuses it. Otherwise it creates a new one.

 52:    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
 53:    root/leafdata to/from these buffers. Buffers are allocated at our discretion: when we find root/leafdata
 54:    can be passed directly to MPI, we do not allocate them. Even when we allocate buffers, we only allocate
 55:    those needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

 57:    The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.

 59:    In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

 61:    The routine is shared by SFBasic and SFNeighbor, since both deal with sparse graphs and
 62:    need to pack/unpack data.
 63: */
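     /*
        Illustrative usage sketch (hypothetical, not code from this file; the real callers live in the
        SFBasic/SFNeighbor sources and may differ in detail): an implementation's BcastBegin would roughly do

          PetscSFLink link;
          PetscCall(PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, MPI_REPLACE, PETSCSF_BCAST, &link));
          // pack rootdata into link->rootbuf (unless rootdirect), start the sends/recvs, and later, in
          // BcastEnd, call link->FinishCommunication(sf, link, PETSCSF_ROOT2LEAF) before unpacking into leafdata

        normally going through a generic link-creation wrapper rather than calling this routine directly.
     */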
 64: PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
 65: {
 66:   PetscSF_Basic   *bas = (PetscSF_Basic *)sf->data;
 67:   PetscInt         i, j, k, nrootreqs, nleafreqs, nreqs;
 68:   PetscSFLink     *p, link;
 69:   PetscSFDirection direction;
 70:   MPI_Request     *reqs = NULL;
 71:   PetscBool        match, rootdirect[2], leafdirect[2];
 72:   PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 since we will use it as an array index */
 73:   PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
 74:   PetscMemType     rootmtype_mpi, leafmtype_mpi;   /* mtypes seen by MPI */
 75:   PetscInt         rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI */

 77:   PetscFunctionBegin;
 78:   /* Can we use root/leafdata directly (i.e., set root/leafdirect[]) with the given sf, sfop and op? */
 79:   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
 80:     if (sfop == PETSCSF_BCAST) {
 81:       rootdirect[i] = bas->rootcontig[i];                                                  /* Pack roots */
 82:       leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
 83:     } else if (sfop == PETSCSF_REDUCE) {
 84:       leafdirect[i] = sf->leafcontig[i];                                                    /* Pack leaves */
 85:       rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
 86:     } else {                                                                                /* PETSCSF_FETCH */
 87:       rootdirect[i] = PETSC_FALSE;                                                          /* FETCH always needs a separate rootbuf */
 88:       leafdirect[i] = PETSC_FALSE;                                                          /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share MPI requests */
 89:     }
 90:   }
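       /* Example (illustrative): for a PetscSFBcast with op = MPI_SUM, leaves cannot be used directly even when
          contiguous, because received values must be combined with existing leafdata during unpack; only
          op = MPI_REPLACE lets MPI write straight into leafdata. */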

 92:   // NEVER use root/leafdirect[] for persistent collectives. Otherwise, suppose that on the first call all ranks build
 93:   // a persistent MPI request in a collective call. Then in a second call to PetscSFBcast, one rank uses root/leafdirect
 94:   // but with new rootdata/leafdata pointers. Other ranks keep using the same rootdata/leafdata pointers as last time.
 95:   // Only that rank will try to rebuild the request with a collective call, resulting in a hang. We could call
 96:   // MPI_Allreduce() every time to detect changes in root/leafdata, but that is too expensive for sparse communication.
 97:   // So we always set root/leafdirect[] to false and allocate additional root/leaf buffers for persistent collectives.
 98:   if (sf->persistent && sf->collective) {
 99:     rootdirect[PETSCSF_REMOTE] = PETSC_FALSE;
100:     leafdirect[PETSCSF_REMOTE] = PETSC_FALSE;
101:   }
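       // Concrete illustration of the hang described above (not from this file): with a persistent collective,
       // suppose rank 0 passes a new rootdata pointer while rank 1 reuses the old one. Rank 0 alone enters the
       // collective request-creation call, while rank 1 merely starts its existing request, so rank 0 waits forever.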

103:   if (sf->use_gpu_aware_mpi) {
104:     rootmtype_mpi = rootmtype;
105:     leafmtype_mpi = leafmtype;
106:   } else {
107:     rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
108:   }
109:   /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
110:   rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
111:   leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;
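       /* Example (illustrative): rootdata on the GPU with use_gpu_aware_mpi = false gives rootmtype = DEVICE but
          rootmtype_mpi = HOST, hence rootdirect_mpi = 0 and the data is staged through a host buffer even if the
          remote roots are contiguous. */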

113:   direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
114:   nrootreqs = bas->nrootreqs;
115:   nleafreqs = sf->nleafreqs;

117:   /* Look for free links in cache */
118:   for (p = &bas->avail; (link = *p); p = &link->next) {
119:     if (!link->use_nvshmem) { /* Only check with MPI links */
120:       PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
121:       if (match) {
122:         /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests matches the current data.
123:            If not, free the old requests. New requests will not be init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests() with the same tag.
124:         */
125:         if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
126:           reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
127:           for (i = 0; i < nrootreqs; i++) {
128:             if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
129:           }
130:           link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
131:         }
132:         if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
133:           reqs = link->leafreqs[direction][leafmtype][1];
134:           for (i = 0; i < nleafreqs; i++) {
135:             if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
136:           }
137:           link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
138:         }
139:         *p = link->next; /* Remove from available list */
140:         goto found;
141:       }
142:     }
143:   }
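       /* Note (descriptive): the loop above walks bas->avail through a pointer-to-pointer p so that a matching
          link can be unlinked in place with *p = link->next; falling through here means no cached link matched. */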

145:   PetscCall(PetscNew(&link));
146:   PetscCall(PetscSFLinkSetUp_Host(sf, link, unit));
147:   PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */

149:   nreqs = (nrootreqs + nleafreqs) * 8;
150:   PetscCall(PetscMalloc1(nreqs, &link->reqs));
151:   for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialize to MPI_REQUEST_NULL so that we know which ones need to be freed in Destroy */

153:   if (nreqs)
154:     for (i = 0; i < 2; i++) {     /* Two communication directions */
155:       for (j = 0; j < 2; j++) {   /* Two memory types */
156:         for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
157:           link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
158:           link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
159:         }
160:       }
161:     }
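       /* Layout of link->reqs (descriptive): the first 8*nrootreqs entries hold root requests and the remaining
          8*nleafreqs entries hold leaf requests; within each half, the groups are indexed by
          [direction][memtype (host/device)][direct (0/1)], each group being a contiguous block of requests. */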

163:   link->FinishCommunication = PetscSFLinkFinishCommunication_Default;
164:   // Each SF type can customize its communication by setting function pointers in the link.
165:   // Currently only BASIC and NEIGHBOR use this abstraction.
166:   PetscTryTypeMethod(sf, SetCommunicationOps, link);
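       // For example (illustrative), a type's SetCommunicationOps may install a different link->FinishCommunication,
       // e.g. one that waits on a single persistent neighborhood-collective request instead of per-rank
       // point-to-point requests.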

168: found:

170: #if defined(PETSC_HAVE_DEVICE)
171:   if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
172:   #if defined(PETSC_HAVE_CUDA)
173:     if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Setup streams etc */
174:   #endif
175:   #if defined(PETSC_HAVE_HIP)
176:     if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Setup streams etc */
177:   #endif
178:   #if defined(PETSC_HAVE_KOKKOS)
179:     if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit));
180:   #endif
181:   }
182: #endif

184:   /* Allocate buffers for root/leafdata */
185:   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
186:     /* For local communication, buffers are only needed when roots and leaves have different mtypes */
187:     if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
188:     if (bas->rootbuflen[i]) {
189:       if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
190:         link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
191:       } else { /* Have to have a separate rootbuf */
192:         if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]));
193:         link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
194:       }
195:     }

197:     if (sf->leafbuflen[i]) {
198:       if (leafdirect[i]) {
199:         link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
200:       } else {
201:         if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]));
202:         link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
203:       }
204:     }
205:   }

207: #if defined(PETSC_HAVE_DEVICE)
208:   /* Allocate buffers on the host for staging device data when not using gpu-aware MPI */
209:   if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
210:     if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
211:     link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
212:   }
213:   if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
214:     if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
215:     link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
216:   }
217: #endif
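       /* With the staging buffers above, MPI only ever touches host memory when gpu-aware MPI is not used; the
          device<->host copies happen in the PetscSFLinkCopy{Root,Leaf}BufferInCaseNotUseGpuAwareMPI calls, e.g.
          the one in PetscSFLinkFinishCommunication_Default above. */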

219:   /* Set the `current` state of the link. It may change between SF invocations that reuse the same link */
220:   if (sf->persistent) { /* If data is directly passed to MPI and used to init MPI requests, record it for comparison on future invocations */
221:     if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
222:     if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
223:   }

225:   link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
226:   link->leafdata = leafdata;
227:   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
228:     link->rootdirect[i] = rootdirect[i];
229:     link->leafdirect[i] = leafdirect[i];
230:   }
231:   link->rootdirect_mpi = rootdirect_mpi;
232:   link->leafdirect_mpi = leafdirect_mpi;
233:   link->rootmtype      = rootmtype;
234:   link->leafmtype      = leafmtype;
235:   link->rootmtype_mpi  = rootmtype_mpi;
236:   link->leafmtype_mpi  = leafmtype_mpi;

238:   link->next = bas->inuse;
239:   bas->inuse = link;
240:   *mylink    = link;
241:   PetscFunctionReturn(PETSC_SUCCESS);
242: }