-
Notifications
You must be signed in to change notification settings - Fork 7
/
softmax.cu
102 lines (82 loc) · 2.36 KB
/
softmax.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#include <stdio.h>
#include <stdlib.h>
__global__ void softmax(float *d_in, float *d_out, float *expArr, float *redArr, int N) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (col < N) {
float local_exp = expf(d_in[col]);
expArr[col] = local_exp;
redArr[col] = expArr[col];
// amount of padding for parallel reduction
int padding = 0;
for (int e = 0; (float)N/(float)(1 << e)>= 1; ++e) {
padding = e + 1;
}
for (int i = 0; i < (1 << padding) - N; ++i) {
expArr[N + i] = 0;
redArr[N + i] = 0;
}
__syncthreads();
// parallel reduction to compute sum
for (int stride = 1 << padding; stride >= 1; stride /= 2) {
if (col < stride) {
redArr[col] += redArr[col + stride];
}
}
__syncthreads();
}
// calculate e^x / sum(e^x) = softmax for each element
if (col < N) {
float sum = redArr[0];
d_out[col] = expArr[col] / sum;
}
}
int main() {
// var declaration
int N = 6;
float h_in[N];
float h_out[N];
float *d_in, *d_out;
float *expArr;
float *redArr;
// memory allocation
cudaMalloc((void**)&d_in, N * sizeof(float));
cudaMalloc((void**)&d_out, N * sizeof(float));
cudaMalloc((void**)&expArr, 2 * N * sizeof(float));
cudaMalloc((void**)&redArr, 2 * N * sizeof(float));
// data initialization
for (int i = 0; i < N; ++i) {
h_in[i] = (float)(rand() % 5 + 1);
}
cudaMemcpy(d_in, h_in, N * sizeof(float), cudaMemcpyHostToDevice);
// launch softmax kernel
int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
softmax<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, expArr, redArr, N);
// copy result to host
cudaMemcpy(h_out, d_out, N * sizeof(float), cudaMemcpyDeviceToHost);
// print result
printf("Softmax input:\n");
for (int i = 0; i < N; ++i) {
printf("%f ", h_in[i]);
}
printf("\n----------\n");
printf("Softmax output:\n");
for (int i = 0; i < N; ++i) {
printf("%f ", h_out[i]);
}
printf("\n----------\n");
// compare with CPU implementation
float sum = 0.0f;
for (int i = 0; i < N; ++i) {
sum += exp(h_in[i]);
}
printf("Expected output:\n");
for (int i = 0; i < N; ++i) {
printf("%f ", exp(h_in[i]) / sum);
}
printf("\n");
// clean device memory
cudaFree(d_in);
cudaFree(d_out);
return 0;
}