-
Notifications
You must be signed in to change notification settings - Fork 0
/
nbody_omp.cu
398 lines (345 loc) · 15.6 KB
/
nbody_omp.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
/* nbody.c: sequential 2-d nbody simulation
Author: Nick Mancuso and Jiaman Wang
Link this with a translation unit that defines the extern
variables, and anim.o, to make a complete program.
*/
#include <assert.h>
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
//#include "anim_dev.h"
//#include "anim.h"
#include "cudaanim.h"
#include "nbody.h"
/* Global variables */
extern const double x_min; /* coord of left edge of window (meters) */
extern const double x_max; /* coord of right edge of window (meters) */
extern const double y_min; /* coord of bottom edge of window (meters) */
extern const double y_max; /* coord of top edge of window (meters) */
extern const int nx; /* width of movie window (pixels) */
extern const int nbodies; /* number of bodies */
extern const double delta_t; /* time between discrete time steps (seconds) */
extern const int nstep; /* number of time steps */
extern const int wstep; /* number of times steps beween movie frames */
extern const int ncolors; /* number of colors to use for the bodies */
extern const int colors[][3]; /* colors we will use for the bodies */
extern const Body bodies[]; /* list of bodies with initial data */
__device__ const double G = 6.674e-11; /* universal gravitational constant */
int ny; /* height of movie window (pixels) */
State* states; /* host copy of state array */
ANIM_File af; /* output anim file */
double* posbuf; /* to send data to anim, 2*nbodies doubles */
double start_time; /* time simulation starts */
int num_gpu_devices; /* number of gpus */
int cpu_nbodies; /* number of bodies that the cpu is updating */
int gpu_nbodies; /* number of bodies that the gpu is updating */
// Below, we declare all as thread private so that each thread maintains these
// pointers across all parallel regions for the life of the thread.
State* states_dev;
#pragma omp threadprivate(states_dev)
Body* bodies_dev;
#pragma omp threadprivate(bodies_dev)
int err;
#pragma omp threadprivate(err)
State* states_new_dev;
#pragma omp threadprivate(states_new_dev)
State* states_new;
#pragma omp threadprivate(states_new)
int num_owned;
#pragma omp threadprivate(num_owned)
int first;
#pragma omp threadprivate(first)
int cpu_thread_id;
#pragma omp threadprivate(cpu_thread_id)
int num_cpu_threads;
#pragma omp threadprivate(num_cpu_threads)
int nblocks;
#pragma omp threadprivate(nblocks)
// Set up distribution here, these are more complicated than usual
// due to division of labor between GPU and CPU
#define threadsPerBlock 1024
#define FIRST(device, num_gpu_devices_or_threads, nbodies) \
((((ulong)(device)) * ((ulong)nbodies)) / num_gpu_devices_or_threads)
#define NUM_OWNED(device, num_gpu_devices_or_threads, nbodies) \
(FIRST(device + 1, num_gpu_devices_or_threads, nbodies) - \
FIRST(device, num_gpu_devices_or_threads, nbodies))
/* Handles setup */
static void init(char* filename, float gpu_body_prop) {
start_time = ANIM_time();
assert(x_max > x_min && y_max > y_min);
ny = ceil(nx * (y_max - y_min) / (x_max - x_min));
printf("nbody: nbodies=%d nx=%d ny=%d nstep=%d wstep=%d\n", nbodies, nx, ny, nstep,
wstep);
const int nframes = wstep == 0 ? 0 : 1 + nstep / wstep;
printf("nbody: creating ANIM file %s with %d frames, %zu bytes.\n", filename, nframes,
ANIM_Nbody_file_size(2, nbodies, ncolors, nframes));
fflush(stdout);
assert(gpu_body_prop >= 0.0 && gpu_body_prop <= 1.0);
assert(nx >= 10 && ny >= 10);
assert(nstep >= 1 && wstep >= 0 && nbodies > 0);
assert(ncolors >= 1 && ncolors <= ANIM_MAXCOLORS);
states = (State*)malloc(nbodies * sizeof(State));
assert(states);
posbuf = (double*)malloc(2 * nbodies * sizeof(double));
assert(posbuf);
// Init device memory for state and body arrays
cudaGetDeviceCount(&num_gpu_devices);
// This is where we divide up the work between the cpu and gpu
gpu_nbodies = nbodies * gpu_body_prop;
cpu_nbodies = nbodies - gpu_nbodies;
// Here we set the number of threads to be the same as the number of GPUs,
// if the cpu isn't updating any bodies. This provides the best performance.
if (cpu_nbodies == 0) {
omp_set_num_threads(num_gpu_devices);
}
printf(
"Running nbody with %d GPU devices. The gpu is updating %d nbodies and the cpu "
"is updating %d nbodies (gpu_body_prop = %.2f).\n",
num_gpu_devices, gpu_nbodies, cpu_nbodies, gpu_body_prop);
// Set up animation
assert(states);
int radii[nbodies], bcolors[nbodies];
ANIM_color_t acolors[ncolors]; // RGB colors converted to ANIM colors
// Setup arrays to pass to ANIM_Create_nbody
#pragma omp parallel for shared(bodies, states, radii, bcolors)
for (int i = 0; i < nbodies; i++) {
assert(bodies[i].mass > 0);
assert(bodies[i].color >= 0 && bodies[i].color < ncolors);
assert(bodies[i].radius > 0);
states[i] = bodies[i].state;
radii[i] = bodies[i].radius;
bcolors[i] = bodies[i].color;
}
// These declarations must be explicit in CUDA
int dims[] = {nx, ny};
ANIM_range_t ranges[] = {{x_min, x_max}, {y_min, y_max}};
// Translate color codes to ANIM colors
#pragma omp parallel for shared(acolors, colors)
for (int i = 0; i < ncolors; i++)
acolors[i] = ANIM_Make_color(colors[i][0], colors[i][1], colors[i][2]);
// Initialize anim file
af = ANIM_Create_nbody(2, dims, ranges, nbodies, radii, ncolors, acolors, bcolors,
filename);
}
/* Writes a frame to the position buffer */
static inline void write_frame() {
#pragma omp parallel for shared(states, posbuf)
for (int i = 0; i < nbodies; i++) {
posbuf[2 * i] = states[i].x;
posbuf[2 * i + 1] = states[i].y;
}
ANIM_Write_frame(af, posbuf);
}
/* Move forward one time step. This is the "integration step". For
each body b, compute the total force acting on that body. If you
divide this by the mass of b, you get b's acceleration. So you
actually just calculate b's acceleration directly, since this is
what you want to know. Once you have the acceleration, update the
velocity, then update the position.
int first - the index of this GPU's first state to update
int num_owned - the total number of states that this GPu is updating
double delta_t - the change in time
State* states_dev - the main states array, in total
State* states_new_dev - this GPU's new state array (size of num_owned)
Body* bodies - the intial Body array, in total
int device_id - the gpu id of this gpu
int nblocks - the number of blocks declared for this GPU
int nbodies - the total number of nbodies
*/
__global__ static void update(int first, int num_owned, double delta_t, State* states_dev,
State* states_new_dev, Body* bodies, int device_id,
int nblocks, int nbodies) {
// Get id of this CUDA thread
const int local_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int global_idx = local_idx + first;
// Set up variables for calculation
double x = states_dev[global_idx].x, y = states_dev[global_idx].y;
double vx = states_dev[global_idx].vx, vy = states_dev[global_idx].vy;
// ax times delta t, ay times delta t...
double ax_delta_t = 0.0, ay_delta_t = 0.0;
if (global_idx >= first && global_idx < first + num_owned) {
#ifdef DEBUG
printf("global_idx (%d) from device (%d) in update!\n", global_idx, device_id);
printf("x = %.2f, y = %.2f, vx = %.2f, vy = %.2f\n", x, y, vx, vy);
#endif
for (int j = 0; j < nbodies; j++) {
if (j == global_idx) continue; // should be global
const double dx = states_dev[j].x - x, dy = states_dev[j].y - y;
const double mass = bodies[j].mass;
const double r_squared = dx * dx + dy * dy;
if (r_squared != 0) {
const double r = sqrt(r_squared);
if (r != 0) {
const double acceleration = G * mass / r_squared;
const double atOverr = acceleration * delta_t / r;
ax_delta_t += dx * atOverr;
ay_delta_t += dy * atOverr;
}
}
}
vx += ax_delta_t;
vy += ay_delta_t;
x += delta_t * vx;
y += delta_t * vy;
assert(!isnan(x) && !isnan(y) && !isnan(vx) && !isnan(vy));
states_new_dev[local_idx] = (State){x, y, vx, vy};
}
}
/* Move forward one time step. This is the "integration step". For
each body b, compute the total force acting on that body. If you
divide this by the mass of b, you get b's acceleration. So you
actually just calculate b's acceleration directly, since this is
what you want to know. Once you have the acceleration, update the
velocity, then update the position.
int first - the index of this GPU's first state to update
int my_nbodies - the number of bodies that this thread is updating
*/
static inline void cpu_update(int first, int my_nbodies) {
int states_new_index = 0;
for (int i = first; i < first + my_nbodies; i++) {
double x = states[i].x, y = states[i].y;
double vx = states[i].vx, vy = states[i].vy;
// ax times delta t, ay times delta t...
double ax_delta_t = 0.0, ay_delta_t = 0.0;
for (int j = 0; j < nbodies; j++) {
if (j == i) continue;
const double dx = states[j].x - x, dy = states[j].y - y;
const double mass = bodies[j].mass;
const double r_squared = dx * dx + dy * dy;
if (r_squared != 0) {
const double r = sqrt(r_squared);
if (r != 0) {
const double acceleration = G * mass / r_squared;
const double atOverr = acceleration * delta_t / r;
ax_delta_t += dx * atOverr;
ay_delta_t += dy * atOverr;
}
}
}
vx += ax_delta_t;
vy += ay_delta_t;
x += delta_t * vx;
y += delta_t * vy;
assert(!isnan(x) && !isnan(y) && !isnan(vx) && !isnan(vy));
states_new[states_new_index++] = (State){x, y, vx, vy};
}
}
/* Close GIF file, free all allocated data structures */
static void wrapup() {
ANIM_Close(af);
free(posbuf);
free(states);
#pragma omp parallel
{
free(states_new);
if (cpu_thread_id < num_gpu_devices) {
int gpu_id = -1;
cudaSetDevice(cpu_thread_id % num_gpu_devices);
cudaGetDevice(&gpu_id);
cudaFree(states_dev);
cudaFree(bodies_dev);
cudaFree(states_new_dev);
}
}
printf("\nnbody: finished. Time = %lf\n", ANIM_time() - start_time);
}
/* One argument: the name of the output file */
int main(int argc, char* argv[]) {
int statbar = 0; // used for printing status updates
assert(argc == 3);
float gpu_body_prop = atof(argv[2]);
init(argv[1], gpu_body_prop);
// Initialize memory for all threads and GPUs
#pragma omp parallel shared(num_gpu_devices, gpu_nbodies, cpu_nbodies)
{
// Set up individual thread variables
cpu_thread_id = omp_get_thread_num();
num_cpu_threads = omp_get_num_threads();
// If cpu is in charge of GPU updates
if (cpu_thread_id < num_gpu_devices && num_gpu_devices > 0) {
num_owned = NUM_OWNED(cpu_thread_id, num_gpu_devices, gpu_nbodies);
first = FIRST(cpu_thread_id, num_gpu_devices, gpu_nbodies);
// Assign and check the GPU device for each thread
int gpu_id = -1;
cudaSetDevice(cpu_thread_id);
cudaGetDevice(&gpu_id);
// Set number of blocks in grid for this GPU
nblocks = num_owned / (threadsPerBlock * num_cpu_threads) +
(0 != (num_owned % (threadsPerBlock * num_cpu_threads)));
#ifdef DEBUG
printf("gpu thread %d, num owned = %d, first = %d\n", cpu_thread_id,
num_owned, first);
#endif
// Allocate memory for GPU
if (num_owned > 0) {
err = cudaMalloc((void**)&states_dev, nbodies * sizeof(State));
assert(err == cudaSuccess);
err = cudaMalloc((void**)&bodies_dev, nbodies * sizeof(Body));
assert(err == cudaSuccess);
err = cudaMalloc((void**)&states_new_dev, num_owned * sizeof(State));
assert(err == cudaSuccess);
// Only do this once, since this is never updated
err = cudaMemcpy(bodies_dev, bodies, nbodies * sizeof(Body),
cudaMemcpyHostToDevice);
assert(err == cudaSuccess);
// Host copies for new state array for each thread
states_new = (State*)malloc(num_owned * sizeof(State));
}
}
// If cpu does updates itself
else if (cpu_nbodies > 0) {
int offset_cpu_id = cpu_thread_id - num_gpu_devices;
int offset_num_gpus = num_cpu_threads - num_gpu_devices;
num_owned = NUM_OWNED(offset_cpu_id, offset_num_gpus, cpu_nbodies);
// Below we are adding the offset from the bodies that the
// gpu is updating
first = FIRST(offset_cpu_id, offset_num_gpus, cpu_nbodies) + gpu_nbodies;
#ifdef DEBUG
printf("cpu thread %d, num owned = %d, first = %d\n", cpu_thread_id,
num_owned, first);
#endif
if (num_owned > 0) states_new = (State*)malloc(num_owned * sizeof(State));
}
}
if (wstep != 0) write_frame();
for (int i = 1; i <= nstep; i++) {
// Call device kernel from each thread
#pragma omp parallel shared(num_gpu_devices, states)
{
// Call update for GPUs
if (cpu_thread_id < num_gpu_devices && num_gpu_devices > 0) {
// Assign and check the GPU device for each thread
int gpu_id = -1;
cudaSetDevice(cpu_thread_id);
cudaGetDevice(&gpu_id);
if (num_owned > 0) {
err = cudaMemcpy(states_dev, states, nbodies * sizeof(State),
cudaMemcpyHostToDevice);
assert(err == cudaSuccess);
update<<<nblocks, threadsPerBlock>>>(
first, num_owned, delta_t, states_dev, states_new_dev, bodies_dev,
gpu_id, nblocks, nbodies);
// Copy back this gpu's part of states_new_dev back to host's states
// into correct location
err = cudaMemcpy(states_new, states_new_dev,
num_owned * sizeof(State), cudaMemcpyDeviceToHost);
assert(err == cudaSuccess);
}
}
// Here we call the cpu to update any bodies that it owns
else if (cpu_nbodies > 0) {
if (num_owned > 0) cpu_update(first, num_owned);
}
#pragma omp barrier
// If this thread owns nbodies, then update states
if (num_owned > 0)
memcpy(&states[first], states_new, num_owned * sizeof(State));
} // Implicit barrier for all threads
ANIM_Status_update(stdout, nstep, i, &statbar);
if (wstep != 0 && i % wstep == 0) {
write_frame();
}
}
wrapup();
}