-
Notifications
You must be signed in to change notification settings - Fork 0
/
example-app.cu
56 lines (51 loc) · 1.2 KB
/
example-app.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#include <iostream>
#include <math.h>
#include <stdio.h>
// Kernel function to add the elements of two arrays
__global__ void sum(float *x)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
float mul_temp[32];
float result = 0;
// #pragma unroll
// for (int j = 0; j < 32; ++j)
// {
// mul_temp[j] = x[j];
// }
// #pragma unroll
// for (int j = 0; j < 32; ++j)
// {
// result += mul_temp[j];
// }
float xi = x[tid];
// for (int offset = 1; offset < 32; offset *= 2)
// {
// result += __shfl_xor_sync(-1, xi, offset);
// }
printf("%i %f\n", tid, xi);
for (int offset = 32/2; offset > 0; offset /= 2){
xi+=__shfl_down_sync(0xFFFFFFFF, xi, offset,32);
}
x[0] = xi;
}
int main(void)
{
printf("test\n");
int N = 32;
float *x; // Allocate Unified Memory – accessible from CPU or GPU
cudaMallocManaged(&x, N * sizeof(float));
float s = 0;
for (int i = 0; i < N; i++)
{
x[i] = i;
s += x[i];
}
// Run kernel on 1M elements on the GPU
sum<<<1, 32>>>(x);
// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize();
printf("sum result: %f %f \n", x[0], s);
// Free memory
cudaFree(x);
return 0;
}