1. Add GPU potential calculation support for fast initialization; 2. …

…Add uniform binary ratio distribution option for massive stars; 3. Add NBODY6++GPU support
ahwkuepper · Nov 16, 2015 · 5bd01cb · 5bd01cb
1 parent d92cd46
commit 5bd01cb
Show file tree

Hide file tree

Showing 4 changed files with 581 additions and 91 deletions.
diff --git a/Makefile b/Makefile
@@ -5,11 +5,14 @@ FC = gfortran -O2 #-ffast-math
 
 LFLAGS = const_bse.h zdata.h 
 
-CC = gcc -O2 -fopenmp #-ffast-math 
+CC = gcc -O2 -fopenmp -Wall #-ffast-math 
 #use "-D NOOMP" and remove "-fopenmp" for compilation without OpenMP
 
 #CFLAGS = -L/opt/local/lib/gcc44/ -lgfortran
 CFLAGS = -L/usr/lib/ -lgfortran
+CUFLAGS= -O3 -D WITH_CUDA5
+CUDA_PATH = /usr/local/cuda
+SDK_PATH=$(CUDA_PATH)/samples
 
 .f.o:
 	$(FC) -c $<
@@ -26,8 +29,15 @@ mcluster_sse: $(OBJECTS) $(LFLAGS)
 	$(CC) -c main.c -D SSE -lm
 	$(CC) $(CFLAGS) $(OBJECTS) main.o -o mcluster_sse -lm 
 
+mcluster_gpu: $(OBJECTS) $(LFLAGS) gpupot.gpu.o main.c
+	$(CC) -c main.c -D SSE -D GPU -lm -I$(CUDA_PATH)/include
+	$(CC) $(CFLAGS) $(OBJECTS) main.o gpupot.gpu.o -L$(CUDA_PATH)/lib64 -lcudart -lstdc++ -o mcluster_gpu -lm
+
 mcluster: 
 	$(CC) -o mcluster main.c -lm
 
+gpupot.gpu.o: gpupot.gpu.cu cuda_pointer.h
+	nvcc -c $(CUFLAGS) -Xcompiler "-fPIC -O3 -Wall" -I$(SDK_PATH)/common/inc -I. gpupot.gpu.cu 
+
 clean:
-	rm *.o mcluster_sse mcluster
+	rm -f *.o mcluster_sse mcluster mcluster_gpu
diff --git a/cuda_pointer.h b/cuda_pointer.h
@@ -0,0 +1,52 @@
+#include <assert.h>
+// #include <cutil.h>
+
+template <typename T>
+struct cudaPointer{
+  T *dev_pointer;
+  T *host_pointer;
+  int size;
+  cudaPointer(){
+    dev_pointer = NULL;
+    host_pointer = NULL;
+    size = 0;
+  }
+  //  ~cudaPointer(){
+  // free();
+  //  }
+  void allocate(int _size){
+    size = _size;
+    void *p;
+    CUDA_SAFE_CALL(cudaMalloc(&p, size * sizeof(T)));
+    assert(p);
+    dev_pointer = (T*)p;
+    CUDA_SAFE_CALL(cudaMallocHost(&p, size * sizeof(T)));
+    assert(p);
+    host_pointer = (T*)p;
+  }
+  void free(){
+    CUDA_SAFE_CALL(cudaFree(dev_pointer));
+    CUDA_SAFE_CALL(cudaFreeHost(host_pointer));
+    dev_pointer = NULL;
+    host_pointer = NULL;
+    size = 0;
+  }
+  void htod(int count){
+    CUDA_SAFE_CALL(cudaMemcpy(dev_pointer, host_pointer, count * sizeof(T), cudaMemcpyHostToDevice));
+  }
+  void htod(){
+    this->htod(size);
+  }
+  void dtoh(int count){
+    CUDA_SAFE_CALL(cudaMemcpy(host_pointer, dev_pointer, count * sizeof(T), cudaMemcpyDeviceToHost));
+  }
+  void dtoh(){
+    this->dtoh(size);
+  }
+  T &operator [] (int i){
+    return host_pointer[i];
+  }
+  operator T* (){
+    return dev_pointer;
+  }
+};
diff --git a/gpupot.gpu.cu b/gpupot.gpu.cu
@@ -0,0 +1,169 @@
+//#include "gpupot.h"
+// #include <iostream>
+#include <cstdio>
+// #include <cutil.h>
+#ifdef WITH_CUDA5
+#  include <helper_cuda.h>
+#  define CUDA_SAFE_CALL checkCudaErrors
+#else
+#  include <cutil.h>
+#endif
+#include "cuda_pointer.h"
+#define NTHREAD 128
+
+#define PROFILE
+#ifdef PROFILE
+#include <sys/time.h>
+static double get_wtime(){
+	struct timeval tv;
+	gettimeofday(&tv, NULL);
+	return tv.tv_sec + 1.e-6 * tv.tv_usec;
+}
+#else
+static double get_wtime(){
+	return 0.0;
+}
+#endif
+
+static float2 float2_split(double x){
+	const int shift = 20;
+	float2 ret;
+	x *= (1<<shift);
+	double xi = (int)x;
+	double xf = x - xi;
+	ret.x = xi * (1./(1<<shift));
+	ret.y = xf * (1./(1<<shift));
+	return ret;
+}
+__device__ float2 float2_accum(float2 acc, float x){
+	float tmp = acc.x + x;
+	acc.y -= (tmp - acc.x) - x;
+	acc.x = tmp;
+	return acc;
+}
+
+__device__ float2 float2_regularize(float2 acc){
+	float tmp = acc.x + acc.y;
+	acc.y = acc.y -(tmp - acc.x);
+	acc.x = tmp;
+	return acc;
+}
+
+__device__ float2 float2_add(float2 a, float2 b){
+  float tmp = a.x + b.x;
+  a.y -= (tmp - a.x) - b.x - b.y;
+  a.x = tmp;
+  // a.x = a.x + b.x;
+  // a.y = a.y + b.y;
+  return a;
+}
+
+struct Particle{
+	float2 pos[3];
+	float mass;
+	float pad;
+
+	Particle(double x[3], double m){
+		pos[0] = float2_split(x[0]);
+		pos[1] = float2_split(x[1]);
+		pos[2] = float2_split(x[2]);
+		mass = (float)m;
+	}
+	Particle(int){
+		pos[0].x = pos[0].y = pos[1].x = pos[1].y = pos[2].x = pos[2].y = mass = pad = 0.f;
+	}
+	__device__ Particle() {}
+};
+
+__global__ void pot_kernel(int n, int nblock, Particle *ptcl, float2 *phi){
+	__shared__ Particle jpbuf[NTHREAD];
+	int i = NTHREAD * blockIdx.x + threadIdx.x;
+	Particle ip = ptcl[i];
+	float2 phii = make_float2(0.f, 0.f);
+	for(int j=0; j<n; j+= NTHREAD){
+		__syncthreads();
+		jpbuf[threadIdx.x] = ptcl[j + threadIdx.x];
+		__syncthreads();
+#pragma unroll 4
+		for(int jj=0; jj<NTHREAD; jj++){
+			// if(j+jj == i) continue;
+			Particle &jp = jpbuf[jj];
+			float dx = (jp.pos[0].x - ip.pos[0].x) + (jp.pos[0].y - ip.pos[0].y);
+			float dy = (jp.pos[1].x - ip.pos[1].x) + (jp.pos[1].y - ip.pos[1].y);
+			float dz = (jp.pos[2].x - ip.pos[2].x) + (jp.pos[2].y - ip.pos[2].y);
+			float r2 = dx*dx + dy*dy + dz*dz;
+			// if(r2==0.f) continue;
+			float pij = jp.mass * rsqrtf(r2);
+			// phii = float2_accum(phii, pij);
+			if(r2 > 0.f) phii = float2_accum(phii, pij);
+		}
+		phii = float2_regularize(phii);
+	}
+	phi[i] = phii;
+
+    // for(int j = nblock/2 + nblock%2; j>1; j = j/2 + j%2) {
+    //   int offset = j%2;
+    //   if(blockIdx.x<j-offset) phi[i] = float2_add(phi[i],phi[i + j*NTHREAD]);
+    // }
+
+    // for(int j = NTHREAD/2; j>1; j/= 2) {
+    //   if(threadIdx.x<j) phi[i] = float2_add(phi[i],phi[i + j]);
+    // }
+}
+
+extern "C" void gpupot(int n, double **star, double *pot) {
+
+    int numGPU=0;
+    cudaGetDeviceCount(&numGPU);
+    assert(numGPU>0);
+    cudaSetDevice(0);
+
+	double t0 = get_wtime();
+
+	cudaPointer <float2> phi;
+	cudaPointer <Particle> ptcl;
+
+	int ng = NTHREAD * (n/NTHREAD + (n%NTHREAD ? 1 : 0));
+
+	phi.allocate(ng);
+	ptcl.allocate(ng);
+
+	// std::cout << n << " " << ng << std::endl;
+	for(int i=0; i<n; i++){
+		// ptcl_h[i] = Particle(x[i], m[i]);
+		ptcl[i] = Particle(&star[i][1], star[i][0]);
+	}
+	for(int i=n; i<ng; i++){
+		// ptcl_h[i] = Particle(0);
+		ptcl[i] = Particle(0);
+	}
+
+	// cudaMemcpy(ptcl_d, ptcl_h, ng * sizeof(Particle), cudaMemcpyHostToDevice);
+	ptcl.htod(ng);
+
+	dim3 grid(ng/NTHREAD, 1, 1);
+	dim3 threads(NTHREAD, 1, 1);
+	int sharedMemSize = NTHREAD * sizeof(Particle);
+	// pot_kernel <<<grid, threads, sharedMemSize >>> (n, ptcl_d, phi_d);
+    pot_kernel <<<grid, threads, sharedMemSize >>> (n, ng/NTHREAD, ptcl, phi);
+
+    // phi.dtoh(1);
+    // double pot = (double)phi[0].x + (double)phi[0].y;
+	// cudaMemcpy(phi_h, phi_d, n * sizeof(float2), cudaMemcpyDeviceToHost);
+    phi.dtoh(n);
+    *pot = 0;
+    for(int i=0; i<n; i++){
+		// pot[i] = (double)phi_h[i].x + (double)phi_h[i].y;
+      *pot -= star[i][0]*((double)phi[i].x + (double)phi[i].y);
+    }
+    *pot /=2.;
+
+	phi.free();
+	ptcl.free();
+
+	double t1 = get_wtime();
+#ifdef PROFILE
+	fprintf(stderr, "gpupot: %f sec\n", t1 - t0);
+#endif
+}
+