TridiagLU  1.0
Scalable, parallel solver for tridiagonal systems of equations
blocktridiagLU.c
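This file implements blocktridiagLU(): a parallel direct solver for ns independent block tridiagonal systems, each with n block rows of bs-by-bs blocks on every MPI rank. The solve proceeds in four individually timed stages: elimination of the local sub-diagonal blocks, elimination of each rank's first block row's coupling to the previous rank, an iterative (Jacobi) solve of the resulting reduced block tridiagonal system, and parallel back-substitution.
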
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#include <string.h>
#ifndef serial
#include <mpi.h>
#endif
#include <tridiagLU.h>
#include <matops.h>

int blocktridiagLU(
                    double  *a, /*!< Array of sub-diagonal blocks */
                    double  *b, /*!< Array of diagonal blocks */
                    double  *c, /*!< Array of super-diagonal blocks */
                    double  *x, /*!< Right-hand side; contains the solution on exit */
                    int     n,  /*!< Local size of the system (number of block rows) on this process */
                    int     ns, /*!< Number of systems to solve */
                    int     bs, /*!< Block size */
                    void    *r, /*!< Object of type TridiagLU (solver parameters) */
                    void    *m  /*!< MPI communicator (MPI_Comm*), or NULL for a serial solve */
                  )
{
  TridiagLU      *params = (TridiagLU*) r;
  int            d,i,j,istart,iend,size;
  int            rank,nproc,bs2=bs*bs,nsbs=ns*bs;
  struct timeval start,stage1,stage2,stage3,stage4;

#ifdef serial
  rank  = 0;
  nproc = 1;
#else
  MPI_Comm  *comm = (MPI_Comm*) m;
  const int nvar  = 4; /* number of arrays (a,b,c,x) communicated in stage 2 */
  int       ierr  = 0;

  if (comm) {
    MPI_Comm_size(*comm,&nproc);
    MPI_Comm_rank(*comm,&rank);
  } else {
    rank  = 0;
    nproc = 1;
  }
#endif

  if (!params) {
    fprintf(stderr,"Error in blocktridiagLU(): NULL pointer passed for parameters.\n");
    return(1);
  }

  /* start */
  gettimeofday(&start,NULL);

  if ((ns == 0) || (n == 0) || (bs == 0)) return(0);
  /* xs1: first block-row solution of this rank; xp1: that of the next rank (needed in stage 4) */
  double *xs1, *xp1;
  xs1 = (double*) calloc (nsbs, sizeof(double));
  xp1 = (double*) calloc (nsbs, sizeof(double));
  for (i=0; i<nsbs; i++) xs1[i] = xp1[i] = 0;

  /* Stage 1 - Parallel elimination of subdiagonal entries */
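  /*
     Each block row i is reduced using the block row above it:
         factor = A_i inv(B_{i-1})
         B_i <- B_i - factor C_{i-1},   A_i <- -factor A_{i-1},
         x_i <- x_i - factor x_{i-1}
     On ranks other than the first, block row 0 is not eliminated; instead,
     its super-diagonal fill-in C_0 is pushed down the block column in the
     same sweep, so that after the sweep row 0 couples only to the previous
     rank's last block row (through A_0) and to this rank's last block row
     (through C_0).
  */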
  istart = (rank == 0 ? 1 : 2);
  iend   = n;
  for (i = istart; i < iend; i++) {
    double binv[bs2], factor[bs2];
    for (d = 0; d < ns; d++) {
      _MatrixInvert_           (b+((i-1)*ns+d)*bs2,binv,bs);
      _MatrixMultiply_         (a+(i*ns+d)*bs2,binv,factor,bs);
      _MatrixMultiplySubtract_ (b+(i*ns+d)*bs2,factor,c+((i-1)*ns+d)*bs2,bs);
      _MatrixZero_             (a+(i*ns+d)*bs2,bs);
      _MatrixMultiplySubtract_ (a+(i*ns+d)*bs2,factor,a+((i-1)*ns+d)*bs2,bs);
      _MatVecMultiplySubtract_ (x+(i*ns+d)*bs ,factor,x+((i-1)*ns+d)*bs ,bs);
      if (rank) {
        _MatrixMultiply_         (c+d*bs2,binv,factor,bs);
        _MatrixZero_             (c+d*bs2,bs);
        _MatrixMultiplySubtract_ (c+d*bs2,factor,c+((i-1)*ns+d)*bs2,bs);
        _MatrixMultiplySubtract_ (b+d*bs2,factor,a+((i-1)*ns+d)*bs2,bs);
        _MatVecMultiplySubtract_ (x+d*bs ,factor,x+((i-1)*ns+d)*bs ,bs);
      }
    }
  }

  /* end of stage 1 */
  gettimeofday(&stage1,NULL);

  /* Stage 2 - Eliminate the first sub- and super-diagonal entries */
  /* This needs the last (a,b,c,x) from the previous process       */
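  /*
     Each rank packs its last block row as
         sendbuf = [ A_{n-1} | B_{n-1} | C_{n-1} | x_{n-1} ]
     (three blocks of ns*bs2 doubles followed by ns*bs doubles) and sends it
     to rank+1. Every rank except the first then eliminates its first block
     row's coupling to the previous rank's last block row and to its own
     last block row; what remains of row 0 couples only to the first block
     rows of the two neighboring ranks.
  */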
#ifndef serial
  double *sendbuf, *recvbuf;
  size    = ns*bs2*(nvar-1)+nsbs;
  sendbuf = (double*) calloc (size, sizeof(double));
  recvbuf = (double*) calloc (size, sizeof(double));
  for (d=0; d<ns; d++) {
    for (i=0; i<bs2; i++) {
      sendbuf[(0*ns+d)*bs2+i] = a[((n-1)*ns+d)*bs2+i];
      sendbuf[(1*ns+d)*bs2+i] = b[((n-1)*ns+d)*bs2+i];
      sendbuf[(2*ns+d)*bs2+i] = c[((n-1)*ns+d)*bs2+i];
    }
    for (i=0; i<bs; i++) sendbuf[3*ns*bs2+d*bs+i] = x[((n-1)*ns+d)*bs+i];
  }
  if (nproc > 1) {
    MPI_Request req[2] = {MPI_REQUEST_NULL,MPI_REQUEST_NULL};
    if (rank)            MPI_Irecv(recvbuf,size,MPI_DOUBLE,rank-1,1436,*comm,&req[0]);
    if (rank != nproc-1) MPI_Isend(sendbuf,size,MPI_DOUBLE,rank+1,1436,*comm,&req[1]);
    MPI_Waitall(2,req,MPI_STATUSES_IGNORE);
  }
  /* The first process sits this one out */
  if (rank) {
    for (d = 0; d < ns; d++) {
      double am1[bs2], bm1[bs2], cm1[bs2], xm1[bs];
      for (i=0; i<bs2; i++) {
        am1[i] = recvbuf[(0*ns+d)*bs2+i];
        bm1[i] = recvbuf[(1*ns+d)*bs2+i];
        cm1[i] = recvbuf[(2*ns+d)*bs2+i];
      }
      for (i=0; i<bs; i++) xm1[i] = recvbuf[3*ns*bs2+d*bs+i];
      double factor[bs2], binv[bs2];
      _MatrixInvert_           (bm1,binv,bs);
      _MatrixMultiply_         (a+d*bs2,binv,factor,bs);
      _MatrixMultiplySubtract_ (b+d*bs2,factor,cm1,bs);
      _MatrixZero_             (a+d*bs2,bs);
      _MatrixMultiplySubtract_ (a+d*bs2,factor,am1,bs);
      _MatVecMultiplySubtract_ (x+d*bs ,factor,xm1,bs);

      _MatrixInvert_           (b+((n-1)*ns+d)*bs2,binv,bs);
      _MatrixMultiply_         (c+d*bs2,binv,factor,bs);
      _MatrixMultiplySubtract_ (b+d*bs2,factor,a+((n-1)*ns+d)*bs2,bs);
      _MatrixZero_             (c+d*bs2,bs);
      _MatrixMultiplySubtract_ (c+d*bs2,factor,c+((n-1)*ns+d)*bs2,bs);
      _MatVecMultiplySubtract_ (x+d*bs ,factor,x+((n-1)*ns+d)*bs ,bs);
    }
  }
  free(sendbuf);
  free(recvbuf);
#endif

  /* end of stage 2 */
  gettimeofday(&stage2,NULL);

  /* Stage 3 - Solve the reduced (nproc-1) x (nproc-1) block tridiagonal system */
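  /*
     The first block rows of ranks 1,...,nproc-1 now form a block
     tridiagonal system with nproc-1 block rows, solved iteratively below.
     Rank 0 participates with the trivial rows I x = 0 so that the
     collective communication inside the iterative solver stays matched
     across all ranks.
  */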
#ifndef serial
  if (nproc > 1) {
    double *zero, *eye;
    zero = (double*) calloc (ns*bs2, sizeof(double));
    eye  = (double*) calloc (ns*bs2, sizeof(double));
    for (d=0; d<ns*bs2; d++) zero[d] = eye[d] = 0.0;
    for (d=0; d<ns; d++) {
      for (i=0; i<bs; i++) eye[d*bs2+(i*bs+i)] = 1.0;
    }

    if (!strcmp(params->reducedsolvetype,_TRIDIAG_GS_)) {
      /* gather-and-solve is not supported for block tridiagonal systems */
      fprintf(stderr,"Error in blocktridiagLU(): Gather-and-solve for reduced system not available.\n");
      free(zero); free(eye); free(xs1); free(xp1);
      return(1);
    } else if (!strcmp(params->reducedsolvetype,_TRIDIAG_JACOBI_)) {
      /* Solving the reduced system iteratively with the Jacobi method */
      if (rank) ierr = blocktridiagIterJacobi(a,b,c,x,1,ns,bs,params,comm);
      else      ierr = blocktridiagIterJacobi(zero,eye,zero,zero,1,ns,bs,params,comm);
    }
    free(zero);
    free(eye);
    if (ierr) { free(xs1); free(xp1); return(ierr); }

    /* Each process gets the first x of the next process */
    MPI_Request req[2] = {MPI_REQUEST_NULL,MPI_REQUEST_NULL};
    for (d=0; d<nsbs; d++) xs1[d] = x[d];
    if (rank+1 < nproc) MPI_Irecv(xp1,nsbs,MPI_DOUBLE,rank+1,1323,*comm,&req[0]);
    if (rank)           MPI_Isend(xs1,nsbs,MPI_DOUBLE,rank-1,1323,*comm,&req[1]);
    MPI_Waitall(2,req,MPI_STATUSES_IGNORE);
  }
#else
  if (nproc > 1) {
    fprintf(stderr,"Error: nproc > 1 for a serial run!\n");
    return(1);
  }
#endif /* if not serial */

  /* end of stage 3 */
  gettimeofday(&stage3,NULL);

  /* Stage 4 - Parallel back-substitution to get the solution */
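  /*
     On ranks other than the first, block row 0 already holds its solution
     from the reduced system, and xp1 holds the next rank's block row 0
     solution. Sweeping from the last local block row toward the first:
         x_i <- inv(B_i) ( x_i - A_i x_0 - C_i x_{i+1} )
     with xp1 standing in for x_{i+1} on the last local block row; rank 0
     sweeps all the way through its first block row.
  */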
  istart = n-1;
  iend   = (rank == 0 ? 0 : 1);

  for (d = 0; d < ns; d++) {
    double binv[bs2], xt[bs];
    _MatrixInvert_           (b+(istart*ns+d)*bs2,binv,bs);
    _MatVecMultiplySubtract_ (x+(istart*ns+d)*bs ,a+(istart*ns+d)*bs2,x  +d*bs,bs);
    _MatVecMultiplySubtract_ (x+(istart*ns+d)*bs ,c+(istart*ns+d)*bs2,xp1+d*bs,bs);
    _MatVecMultiply_         (binv,x+(istart*ns+d)*bs,xt,bs);
    for (j=0; j<bs; j++) x[(istart*ns+d)*bs+j] = xt[j];
  }
  for (i = istart-1; i > iend-1; i--) {
    for (d = 0; d < ns; d++) {
      double binv[bs2], xt[bs];
      _MatrixInvert_           (b+(i*ns+d)*bs2,binv,bs);
      _MatVecMultiplySubtract_ (x+(i*ns+d)*bs ,c+(i*ns+d)*bs2,x+((i+1)*ns+d)*bs,bs);
      _MatVecMultiplySubtract_ (x+(i*ns+d)*bs ,a+(i*ns+d)*bs2,x+d*bs,bs);
      _MatVecMultiply_         (binv,x+(i*ns+d)*bs,xt,bs);
      for (j=0; j<bs; j++) x[(i*ns+d)*bs+j] = xt[j];
    }
  }

  /* end of stage 4 */
  gettimeofday(&stage4,NULL);

  /* Done - now x contains the solution */
  free(xs1);
  free(xp1);

  /* save runtimes if needed */
  long long walltime;
  walltime = ((stage1.tv_sec * 1000000 + stage1.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
  params->stage1_time = (double) walltime / 1000000.0;
  walltime = ((stage2.tv_sec * 1000000 + stage2.tv_usec) - (stage1.tv_sec * 1000000 + stage1.tv_usec));
  params->stage2_time = (double) walltime / 1000000.0;
  walltime = ((stage3.tv_sec * 1000000 + stage3.tv_usec) - (stage2.tv_sec * 1000000 + stage2.tv_usec));
  params->stage3_time = (double) walltime / 1000000.0;
  walltime = ((stage4.tv_sec * 1000000 + stage4.tv_usec) - (stage3.tv_sec * 1000000 + stage3.tv_usec));
  params->stage4_time = (double) walltime / 1000000.0;
  walltime = ((stage4.tv_sec * 1000000 + stage4.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
  params->total_time = (double) walltime / 1000000.0;
  return(0);
}
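For reference, a minimal calling sketch follows (not part of this file). The sizes, the identity-block coefficients, and the right-hand side are hypothetical; the sketch assumes the solver context is initialized with tridiagLUInit() as declared in tridiagLU.h, and it explicitly selects the Jacobi reduced-system solver, since the gather-and-solve option is not available for block systems.

#include <stdlib.h>
#include <string.h>
#include <mpi.h>
#include <tridiagLU.h>

int main(int argc, char **argv)
{
  MPI_Init(&argc,&argv);
  MPI_Comm comm = MPI_COMM_WORLD;

  /* hypothetical sizes: 10 block rows per rank, 1 system, 2x2 blocks */
  int n = 10, ns = 1, bs = 2, i, j;

  double *a = (double*) calloc (n*ns*bs*bs, sizeof(double)); /* sub-diagonal   */
  double *b = (double*) calloc (n*ns*bs*bs, sizeof(double)); /* diagonal       */
  double *c = (double*) calloc (n*ns*bs*bs, sizeof(double)); /* super-diagonal */
  double *x = (double*) calloc (n*ns*bs   , sizeof(double)); /* RHS / solution */

  /* trivial test system: identity diagonal blocks, zero off-diagonal
     blocks, right-hand side of ones, so the solution is x = 1 */
  for (i = 0; i < n*ns; i++)
    for (j = 0; j < bs; j++) b[i*bs*bs+j*bs+j] = 1.0;
  for (i = 0; i < n*ns*bs; i++) x[i] = 1.0;

  TridiagLU context;
  tridiagLUInit(&context,&comm);                      /* set default parameters */
  strcpy(context.reducedsolvetype,_TRIDIAG_JACOBI_);  /* reduced system: Jacobi */
  int ierr = blocktridiagLU(a,b,c,x,n,ns,bs,&context,&comm);

  free(a); free(b); free(c); free(x);
  MPI_Finalize();
  return(ierr);
}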