TridiagLU  1.0
Scalable, parallel solver for tridiagonal system of equations
tridiagLU.c File Reference

Solve tridiagonal systems of equation using parallel LU decomposition. More...

#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#include <string.h>
#include <mpi.h>
#include <tridiagLU.h>

Go to the source code of this file.

Functions

int tridiagLU (double *a, double *b, double *c, double *x, int n, int ns, void *r, void *m)
 

Detailed Description

Solve tridiagonal systems of equation using parallel LU decomposition.

Author
Debojyoti Ghosh

Definition in file tridiagLU.c.

Function Documentation

int tridiagLU ( double *  a,
double *  b,
double *  c,
double *  x,
int  n,
int  ns,
void *  r,
void *  m 
)

Solve tridiagonal (non-periodic) systems of equations using parallel LU decomposition: This function can solve multiple independent systems with one call. The systems need not share the same left- or right-hand-sides. The iterative substructuring method is used in this function that can be briefly described through the following 4 stages:

  • Stage 1: Parallel elimination of the tridiagonal blocks on each processor comprising all points of the subdomain except the 1st point (unless its the 1st global point, i.e., a physical boundary)
  • Stage 2: Elimination of the 1st row on each processor (except the 1st processor) using the last row of the previous processor.
  • Stage 3: Solution of the reduced tridiagonal system that represents the coupling of the system across the processors, using blocktridiagIterJacobi() in this implementation.
  • Stage 4: Backward-solve to obtain the final solution

Specific details of the method implemented here are available in:

More references on this class of parallel tridiagonal solvers:

  • E. Polizzi and A. H. Sameh, "A parallel hybrid banded system solver: The SPIKE algorithm", Parallel Comput., 32 (2006), pp. 177–194.
  • E. Polizzi and A. H. Sameh, "SPIKE: A parallel environment for solving banded linear systems", Comput. & Fluids, 36 (2007), pp. 113–120.

Array layout: The arguments a, b, c, and x are local 1D arrays (containing this processor's part of the subdiagonal, diagonal, superdiagonal, and right-hand-side) of size (n X ns), where n is the local size of the system, and ns is the number of independent systems to solve. The ordering of the elements in these arrays is as follows:

  • Elements of the same row for each of the independent systems are stored adjacent to each other.

For example, consider the following systems:

\begin{equation} \left[\begin{array}{ccccc} b_0^k & c_0^k & & & \\ a_1^k & b_1^k & c_1^k & & \\ & a_2^k & b_2^k & c_2^k & & \\ & & a_3^k & b_3^k & c_3^k & \\ & & & a_4^k & b_4^k & c_4^k \\ \end{array}\right] \left[\begin{array}{c} x_0^k \\ x_1^k \\ x_2^k \\ x_3^k \\ x_4^k \end{array}\right] = \left[\begin{array}{c} r_0^k \\ r_1^k \\ r_2^k \\ r_3^k \\ r_4^k \end{array}\right]; \ \ k= 1,\cdots,ns \end{equation}

and let \( ns = 3\). Note that in the code, \(x\) and \(r\) are the same array x.

Then, the array b must be a 1D array with the following layout of elements:
[
b_0^0, b_0^1, b_0^2, (diagonal element of the first row in each system)
b_1^0, b_1^1, b_1^2, (diagonal element of the second row in each system)
...,
b_{n-1}^0, b_{n-1}^1, b_{n-1}^2 (diagonal element of the last row in each system)
]
The arrays a, c, and x are stored similarly.

Notes:

  • This function does not preserve the sub-diagonal, diagonal, super-diagonal elements and the right-hand-sides.
  • The input array x contains the right-hand-side on entering the function, and the solution on exiting it.
Parameters
aArray containing the sub-diagonal elements
bArray containing the diagonal elements
cArray containing the super-diagonal elements
xRight-hand side; will contain the solution on exit
nLocal size of the system on this processor
nsNumber of systems to solve
rObject of type TridiagLU
mMPI communicator

Definition at line 83 of file tridiagLU.c.

93 {
94  TridiagLU *params = (TridiagLU*) r;
95  int d,i,istart,iend;
96  int rank,nproc;
97  struct timeval start,stage1,stage2,stage3,stage4;
98 
99 #ifdef serial
100  rank = 0;
101  nproc = 1;
102 #else
103  MPI_Comm *comm = (MPI_Comm*) m;
104  int ierr = 0;
105  const int nvar = 4;
106 
107  if (comm) {
108  MPI_Comm_size(*comm,&nproc);
109  MPI_Comm_rank(*comm,&rank);
110  } else {
111  rank = 0;
112  nproc = 1;
113  }
114 #endif
115 
116  if (!params) {
117  fprintf(stderr,"Error in tridiagLU(): NULL pointer passed for parameters.\n");
118  return(1);
119  }
120 
121  /* start */
122  gettimeofday(&start,NULL);
123 
124  if ((ns == 0) || (n == 0)) return(0);
125  double *xs1, *xp1;
126  xs1 = (double*) calloc (ns, sizeof(double));
127  xp1 = (double*) calloc (ns, sizeof(double));
128  for (i=0; i<ns; i++) xs1[i] = xp1[i] = 0;
129 
130  /* Stage 1 - Parallel elimination of subdiagonal entries */
131  istart = (rank == 0 ? 1 : 2);
132  iend = n;
133  for (i = istart; i < iend; i++) {
134  for (d = 0; d < ns; d++) {
135  if (b[(i-1)*ns+d] == 0) return(-1);
136  double factor = a[i*ns+d] / b[(i-1)*ns+d];
137  b[i*ns+d] -= factor * c[(i-1)*ns+d];
138  a[i*ns+d] = -factor * a[(i-1)*ns+d];
139  x[i*ns+d] -= factor * x[(i-1)*ns+d];
140  if (rank) {
141  double factor = c[d] / b[(i-1)*ns+d];
142  c[d] = -factor * c[(i-1)*ns+d];
143  b[d] -= factor * a[(i-1)*ns+d];
144  x[d] -= factor * x[(i-1)*ns+d];
145  }
146  }
147  }
148 
149  /* end of stage 1 */
150  gettimeofday(&stage1,NULL);
151 
152  /* Stage 2 - Eliminate the first sub- & super-diagonal entries */
153  /* This needs the last (a,b,c,x) from the previous process */
154 #ifndef serial
155  double *sendbuf, *recvbuf;
156  sendbuf = (double*) calloc (ns*nvar, sizeof(double));
157  recvbuf = (double*) calloc (ns*nvar, sizeof(double));
158  for (d=0; d<ns; d++) {
159  sendbuf[d*nvar+0] = a[(n-1)*ns+d];
160  sendbuf[d*nvar+1] = b[(n-1)*ns+d];
161  sendbuf[d*nvar+2] = c[(n-1)*ns+d];
162  sendbuf[d*nvar+3] = x[(n-1)*ns+d];
163  }
164  if (nproc > 1) {
165  MPI_Request req[2] = {MPI_REQUEST_NULL,MPI_REQUEST_NULL};
166  if (rank) MPI_Irecv(recvbuf,nvar*ns,MPI_DOUBLE,rank-1,1436,*comm,&req[0]);
167  if (rank != nproc-1) MPI_Isend(sendbuf,nvar*ns,MPI_DOUBLE,rank+1,1436,*comm,&req[1]);
168  MPI_Waitall(2,&req[0],MPI_STATUS_IGNORE);
169  }
170  /* The first process sits this one out */
171  if (rank) {
172  for (d = 0; d < ns; d++) {
173  double am1, bm1, cm1, xm1;
174  am1 = recvbuf[d*nvar+0];
175  bm1 = recvbuf[d*nvar+1];
176  cm1 = recvbuf[d*nvar+2];
177  xm1 = recvbuf[d*nvar+3];
178  double factor;
179  if (bm1 == 0) return(-1);
180  factor = a[d] / bm1;
181  b[d] -= factor * cm1;
182  a[d] = -factor * am1;
183  x[d] -= factor * xm1;
184  if (b[(n-1)*ns+d] == 0) return(-1);
185  factor = c[d] / b[(n-1)*ns+d];
186  b[d] -= factor * a[(n-1)*ns+d];
187  c[d] = -factor * c[(n-1)*ns+d];
188  x[d] -= factor * x[(n-1)*ns+d];
189  }
190  }
191  free(sendbuf);
192  free(recvbuf);
193 #endif
194 
195  /* end of stage 2 */
196  gettimeofday(&stage2,NULL);
197 
198  /* Stage 3 - Solve the reduced (nproc-1) X (nproc-1) tridiagonal system */
199 #ifndef serial
200  if (nproc > 1) {
201  double *zero, *one;
202  zero = (double*) calloc (ns, sizeof(double));
203  one = (double*) calloc (ns, sizeof(double));
204  for (d=0; d<ns; d++) {
205  zero[d] = 0.0;
206  one [d] = 1.0;
207  }
208  if (!strcmp(params->reducedsolvetype,_TRIDIAG_GS_)) {
209  /* Solving the reduced system by gather-and-solve algorithm */
210  if (rank) ierr = tridiagLUGS(a,b,c,x,1,ns,params,comm);
211  else ierr = tridiagLUGS(zero,one,zero,zero,1,ns,params,comm);
212  if (ierr) return(ierr);
213  } else if (!strcmp(params->reducedsolvetype,_TRIDIAG_JACOBI_)) {
214  /* Solving the reduced system iteratively with the Jacobi method */
215  if (rank) ierr = tridiagIterJacobi(a,b,c,x,1,ns,params,comm);
216  else ierr = tridiagIterJacobi(zero,one,zero,zero,1,ns,params,comm);
217  }
218  free(zero);
219  free(one);
220 
221  /* Each process, get the first x of the next process */
222  MPI_Request req[2] = {MPI_REQUEST_NULL,MPI_REQUEST_NULL};
223  for (d=0; d<ns; d++) xs1[d] = x[d];
224  if (rank+1 < nproc) MPI_Irecv(xp1,ns,MPI_DOUBLE,rank+1,1323,*comm,&req[0]);
225  if (rank) MPI_Isend(xs1,ns,MPI_DOUBLE,rank-1,1323,*comm,&req[1]);
226  MPI_Waitall(2,&req[0],MPI_STATUS_IGNORE);
227  }
228 #else
229  if (nproc > 1) {
230  fprintf(stderr,"Error: nproc > 1 for a serial run!\n");
231  return(1);
232  }
233 #endif /* if not serial */
234  /* end of stage 3 */
235  gettimeofday(&stage3,NULL);
236 
237  /* Stage 4 - Parallel back-substitution to get the solution */
238  istart = n-1;
239  iend = (rank == 0 ? 0 : 1);
240 
241  for (d = 0; d < ns; d++) {
242  if (b[istart*ns+d] == 0) return(-1);
243  x[istart*ns+d] = (x[istart*ns+d]-a[istart*ns+d]*x[d]-c[istart*ns+d]*xp1[d]) / b[istart*ns+d];
244  }
245  for (i = istart-1; i > iend-1; i--) {
246  for (d = 0; d < ns; d++) {
247  if (b[i*ns+d] == 0) return(-1);
248  x[i*ns+d] = (x[i*ns+d]-c[i*ns+d]*x[(i+1)*ns+d]-a[i*ns+d]*x[d]) / b[i*ns+d];
249  }
250  }
251 
252  /* end of stage 4 */
253  gettimeofday(&stage4,NULL);
254 
255  /* Done - now x contains the solution */
256  free(xs1);
257  free(xp1);
258 
259  /* save runtimes if needed */
260  long long walltime;
261  walltime = ((stage1.tv_sec * 1000000 + stage1.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
262  params->stage1_time = (double) walltime / 1000000.0;
263  walltime = ((stage2.tv_sec * 1000000 + stage2.tv_usec) - (stage1.tv_sec * 1000000 + stage1.tv_usec));
264  params->stage2_time = (double) walltime / 1000000.0;
265  walltime = ((stage3.tv_sec * 1000000 + stage3.tv_usec) - (stage2.tv_sec * 1000000 + stage2.tv_usec));
266  params->stage3_time = (double) walltime / 1000000.0;
267  walltime = ((stage4.tv_sec * 1000000 + stage4.tv_usec) - (stage3.tv_sec * 1000000 + stage3.tv_usec));
268  params->stage4_time = (double) walltime / 1000000.0;
269  walltime = ((stage4.tv_sec * 1000000 + stage4.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec));
270  params->total_time = (double) walltime / 1000000.0;
271  return(0);
272 }
double stage3_time
Definition: tridiagLU.h:101
int tridiagLUGS(double *, double *, double *, double *, int, int, void *, void *)
Definition: tridiagLUGS.c:64
double stage2_time
Definition: tridiagLU.h:100
double stage1_time
Definition: tridiagLU.h:99
#define _TRIDIAG_JACOBI_
Definition: tridiagLU.h:71
double stage4_time
Definition: tridiagLU.h:102
char reducedsolvetype[50]
Definition: tridiagLU.h:88
#define _TRIDIAG_GS_
Definition: tridiagLU.h:73
int tridiagIterJacobi(double *, double *, double *, double *, int, int, void *, void *)
double total_time
Definition: tridiagLU.h:98