Skip to content

Commit

Permalink
1. Add GPU potential calculation support for fast initialization; 2. …
Browse files Browse the repository at this point in the history
…Add uniform binary ratio distribution option for massive stars; 3. Add NBODY6++GPU support
  • Loading branch information
lwang-astro committed Nov 16, 2015
1 parent d92cd46 commit 5bd01cb
Show file tree
Hide file tree
Showing 4 changed files with 581 additions and 91 deletions.
14 changes: 12 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@ FC = gfortran -O2 #-ffast-math

LFLAGS = const_bse.h zdata.h

CC = gcc -O2 -fopenmp #-ffast-math
CC = gcc -O2 -fopenmp -Wall #-ffast-math
#use "-D NOOMP" and remove "-fopenmp" for compilation without OpenMP

#CFLAGS = -L/opt/local/lib/gcc44/ -lgfortran
CFLAGS = -L/usr/lib/ -lgfortran
CUFLAGS= -O3 -D WITH_CUDA5
CUDA_PATH = /usr/local/cuda
SDK_PATH=$(CUDA_PATH)/samples

.f.o:
$(FC) -c $<
Expand All @@ -26,8 +29,15 @@ mcluster_sse: $(OBJECTS) $(LFLAGS)
$(CC) -c main.c -D SSE -lm
$(CC) $(CFLAGS) $(OBJECTS) main.o -o mcluster_sse -lm

mcluster_gpu: $(OBJECTS) $(LFLAGS) gpupot.gpu.o main.c
$(CC) -c main.c -D SSE -D GPU -lm -I$(CUDA_PATH)/include
$(CC) $(CFLAGS) $(OBJECTS) main.o gpupot.gpu.o -L$(CUDA_PATH)/lib64 -lcudart -lstdc++ -o mcluster_gpu -lm

mcluster:
$(CC) -o mcluster main.c -lm

gpupot.gpu.o: gpupot.gpu.cu cuda_pointer.h
nvcc -c $(CUFLAGS) -Xcompiler "-fPIC -O3 -Wall" -I$(SDK_PATH)/common/inc -I. gpupot.gpu.cu

clean:
rm *.o mcluster_sse mcluster
rm -f *.o mcluster_sse mcluster mcluster_gpu
52 changes: 52 additions & 0 deletions cuda_pointer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#include <assert.h>
// #include <cutil.h>

template <typename T>
struct cudaPointer{
T *dev_pointer;
T *host_pointer;
int size;
cudaPointer(){
dev_pointer = NULL;
host_pointer = NULL;
size = 0;
}
// ~cudaPointer(){
// free();
// }
void allocate(int _size){
size = _size;
void *p;
CUDA_SAFE_CALL(cudaMalloc(&p, size * sizeof(T)));
assert(p);
dev_pointer = (T*)p;
CUDA_SAFE_CALL(cudaMallocHost(&p, size * sizeof(T)));
assert(p);
host_pointer = (T*)p;
}
void free(){
CUDA_SAFE_CALL(cudaFree(dev_pointer));
CUDA_SAFE_CALL(cudaFreeHost(host_pointer));
dev_pointer = NULL;
host_pointer = NULL;
size = 0;
}
void htod(int count){
CUDA_SAFE_CALL(cudaMemcpy(dev_pointer, host_pointer, count * sizeof(T), cudaMemcpyHostToDevice));
}
void htod(){
this->htod(size);
}
void dtoh(int count){
CUDA_SAFE_CALL(cudaMemcpy(host_pointer, dev_pointer, count * sizeof(T), cudaMemcpyDeviceToHost));
}
void dtoh(){
this->dtoh(size);
}
T &operator [] (int i){
return host_pointer[i];
}
operator T* (){
return dev_pointer;
}
};
169 changes: 169 additions & 0 deletions gpupot.gpu.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
//#include "gpupot.h"
// #include <iostream>
#include <cstdio>
// #include <cutil.h>
#ifdef WITH_CUDA5
# include <helper_cuda.h>
# define CUDA_SAFE_CALL checkCudaErrors
#else
# include <cutil.h>
#endif
#include "cuda_pointer.h"
#define NTHREAD 128

#define PROFILE
#ifdef PROFILE
#include <sys/time.h>
static double get_wtime(){
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec + 1.e-6 * tv.tv_usec;
}
#else
static double get_wtime(){
return 0.0;
}
#endif

static float2 float2_split(double x){
const int shift = 20;
float2 ret;
x *= (1<<shift);
double xi = (int)x;
double xf = x - xi;
ret.x = xi * (1./(1<<shift));
ret.y = xf * (1./(1<<shift));
return ret;
}
__device__ float2 float2_accum(float2 acc, float x){
float tmp = acc.x + x;
acc.y -= (tmp - acc.x) - x;
acc.x = tmp;
return acc;
}

__device__ float2 float2_regularize(float2 acc){
float tmp = acc.x + acc.y;
acc.y = acc.y -(tmp - acc.x);
acc.x = tmp;
return acc;
}

__device__ float2 float2_add(float2 a, float2 b){
float tmp = a.x + b.x;
a.y -= (tmp - a.x) - b.x - b.y;
a.x = tmp;
// a.x = a.x + b.x;
// a.y = a.y + b.y;
return a;
}

struct Particle{
float2 pos[3];
float mass;
float pad;

Particle(double x[3], double m){
pos[0] = float2_split(x[0]);
pos[1] = float2_split(x[1]);
pos[2] = float2_split(x[2]);
mass = (float)m;
}
Particle(int){
pos[0].x = pos[0].y = pos[1].x = pos[1].y = pos[2].x = pos[2].y = mass = pad = 0.f;
}
__device__ Particle() {}
};

__global__ void pot_kernel(int n, int nblock, Particle *ptcl, float2 *phi){
__shared__ Particle jpbuf[NTHREAD];
int i = NTHREAD * blockIdx.x + threadIdx.x;
Particle ip = ptcl[i];
float2 phii = make_float2(0.f, 0.f);
for(int j=0; j<n; j+= NTHREAD){
__syncthreads();
jpbuf[threadIdx.x] = ptcl[j + threadIdx.x];
__syncthreads();
#pragma unroll 4
for(int jj=0; jj<NTHREAD; jj++){
// if(j+jj == i) continue;
Particle &jp = jpbuf[jj];
float dx = (jp.pos[0].x - ip.pos[0].x) + (jp.pos[0].y - ip.pos[0].y);
float dy = (jp.pos[1].x - ip.pos[1].x) + (jp.pos[1].y - ip.pos[1].y);
float dz = (jp.pos[2].x - ip.pos[2].x) + (jp.pos[2].y - ip.pos[2].y);
float r2 = dx*dx + dy*dy + dz*dz;
// if(r2==0.f) continue;
float pij = jp.mass * rsqrtf(r2);
// phii = float2_accum(phii, pij);
if(r2 > 0.f) phii = float2_accum(phii, pij);
}
phii = float2_regularize(phii);
}
phi[i] = phii;

// for(int j = nblock/2 + nblock%2; j>1; j = j/2 + j%2) {
// int offset = j%2;
// if(blockIdx.x<j-offset) phi[i] = float2_add(phi[i],phi[i + j*NTHREAD]);
// }

// for(int j = NTHREAD/2; j>1; j/= 2) {
// if(threadIdx.x<j) phi[i] = float2_add(phi[i],phi[i + j]);
// }
}

extern "C" void gpupot(int n, double **star, double *pot) {

int numGPU=0;
cudaGetDeviceCount(&numGPU);
assert(numGPU>0);
cudaSetDevice(0);

double t0 = get_wtime();

cudaPointer <float2> phi;
cudaPointer <Particle> ptcl;

int ng = NTHREAD * (n/NTHREAD + (n%NTHREAD ? 1 : 0));

phi.allocate(ng);
ptcl.allocate(ng);

// std::cout << n << " " << ng << std::endl;
for(int i=0; i<n; i++){
// ptcl_h[i] = Particle(x[i], m[i]);
ptcl[i] = Particle(&star[i][1], star[i][0]);
}
for(int i=n; i<ng; i++){
// ptcl_h[i] = Particle(0);
ptcl[i] = Particle(0);
}

// cudaMemcpy(ptcl_d, ptcl_h, ng * sizeof(Particle), cudaMemcpyHostToDevice);
ptcl.htod(ng);

dim3 grid(ng/NTHREAD, 1, 1);
dim3 threads(NTHREAD, 1, 1);
int sharedMemSize = NTHREAD * sizeof(Particle);
// pot_kernel <<<grid, threads, sharedMemSize >>> (n, ptcl_d, phi_d);
pot_kernel <<<grid, threads, sharedMemSize >>> (n, ng/NTHREAD, ptcl, phi);

// phi.dtoh(1);
// double pot = (double)phi[0].x + (double)phi[0].y;
// cudaMemcpy(phi_h, phi_d, n * sizeof(float2), cudaMemcpyDeviceToHost);
phi.dtoh(n);
*pot = 0;
for(int i=0; i<n; i++){
// pot[i] = (double)phi_h[i].x + (double)phi_h[i].y;
*pot -= star[i][0]*((double)phi[i].x + (double)phi[i].y);
}
*pot /=2.;

phi.free();
ptcl.free();

double t1 = get_wtime();
#ifdef PROFILE
fprintf(stderr, "gpupot: %f sec\n", t1 - t0);
#endif
}

Loading

0 comments on commit 5bd01cb

Please sign in to comment.