forked from digitalbrain79/NNPACK-darknet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
relu-output.c
115 lines (97 loc) · 3.41 KB
/
relu-output.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#include <string.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#include <assert.h>
#include <nnpack.h>
#include <nnpack/relu.h>
#include <nnpack/utils.h>
#include <nnpack/validation.h>
/* Per-call arguments for the in-place ReLU, shared by all pthreadpool workers. */
struct NNP_CACHE_ALIGN inplace_relu_context {
	nnp_inplace_relu_function relu_function; /* SIMD kernel selected from nnp_hwinfo */
	float* data;                             /* buffer transformed in place */
	float negative_slope;                    /* multiplier for negative inputs (0 = plain ReLU) */
};
/*
 * pthreadpool callback: runs the in-place ReLU kernel over one tile
 * [block_start, block_start + block_size) of context->data.
 */
static void compute_inplace_relu_output(
	const struct inplace_relu_context context[restrict static 1],
	size_t block_start, size_t block_size)
{
	context->relu_function(
		context->data + block_start,
		block_size,
		context->negative_slope);
}
/* Per-call arguments for the out-of-place ReLU, shared by all pthreadpool workers. */
struct NNP_CACHE_ALIGN outplace_relu_context {
	nnp_outplace_relu_function relu_function; /* SIMD kernel selected from nnp_hwinfo */
	const float* input;                       /* source buffer (read-only) */
	float* output;                            /* destination buffer */
	float negative_slope;                     /* multiplier for negative inputs (0 = plain ReLU) */
};
/*
 * pthreadpool callback: runs the out-of-place ReLU kernel over one tile,
 * reading input[block_start .. block_start + block_size) and writing the
 * corresponding slice of output.
 */
static void compute_outplace_relu_output(
	const struct outplace_relu_context context[restrict static 1],
	size_t block_start, size_t block_size)
{
	context->relu_function(
		context->input + block_start,
		context->output + block_start,
		block_size,
		context->negative_slope);
}
/*
 * Scalar (leaky) ReLU used by the prologue/epilogue loops:
 * positive values pass through unchanged, non-positive values are
 * scaled by negative_slope.
 */
static inline float relu(float data, float negative_slope) {
	if (data > 0.0f) {
		return data;
	}
	return data * negative_slope;
}
/*
 * Applies (leaky) ReLU to a batch_size x channels array of floats.
 * Works either out-of-place (output != input) or in-place (output == input).
 *
 * Strategy: a scalar prologue aligns the output pointer to the hardware
 * SIMD width, a scalar epilogue handles the sub-SIMD-width tail, and the
 * aligned middle region is processed in L1-sized tiles on the thread pool.
 *
 * Returns nnp_status_success, or the error from validate_relu_arguments.
 */
enum nnp_status nnp_relu_output(
	size_t batch_size,
	size_t channels,
	const float input[],
	float output[],
	float negative_slope,
	pthreadpool_t threadpool)
{
	enum nnp_status status = validate_relu_arguments(batch_size, channels);
	if (status != nnp_status_success) {
		return status;
	}
	size_t elements = batch_size * channels;
	const size_t simd_width = nnp_hwinfo.simd_width;
	/* Scalar loops below require at least element (float) alignment. */
	assert(((uintptr_t) input) % sizeof(float) == 0);
	assert(((uintptr_t) output) % sizeof(float) == 0);
	/*
	 * Leading elements processed scalar so that the output pointer handed to
	 * the SIMD kernels is simd_width-element aligned; clamped to the total
	 * element count.  NOTE(review): alignment is derived from output only —
	 * presumably the kernels tolerate an unaligned input pointer; confirm
	 * against the nnp_hwinfo.activations kernel contracts.
	 */
	const size_t prologue_elements = min((size_t) (-(((uintptr_t) output) / sizeof(float)) % simd_width), elements);
	for (size_t i = 0; i < prologue_elements; i++) {
		output[i] = relu(input[i], negative_slope);
	}
	/* Advance both pointers past the prologue; the remainder starts aligned. */
	elements -= prologue_elements;
	input += prologue_elements;
	output += prologue_elements;
	/* Trailing elements that do not fill a whole SIMD vector, done scalar. */
	const size_t epilogue_elements = elements % simd_width;
	for (size_t i = 0; i < epilogue_elements; i++) {
		output[elements - epilogue_elements + i] =
			relu(input[elements - epilogue_elements + i], negative_slope);
	}
	/* elements is now an exact multiple of simd_width. */
	elements -= epilogue_elements;
	if (input == output) {
		/* In-place transformation */
		struct inplace_relu_context inplace_relu_context = {
			.relu_function = nnp_hwinfo.activations.inplace_relu,
			.data = output,
			.negative_slope = negative_slope,
		};
		/* Tile size: floats fitting the L1 block, rounded down to SIMD width. */
		pthreadpool_compute_1d_tiled(threadpool,
			(pthreadpool_function_1d_tiled_t) compute_inplace_relu_output,
			&inplace_relu_context,
			elements, round_down(nnp_hwinfo.blocking.l1 / sizeof(float), simd_width));
	} else {
		/* Out-of-place transformation */
		struct outplace_relu_context outplace_relu_context = {
			.relu_function = nnp_hwinfo.activations.outplace_relu,
			.input = input,
			.output = output,
			.negative_slope = negative_slope,
		};
		pthreadpool_compute_1d_tiled(threadpool,
			(pthreadpool_function_1d_tiled_t) compute_outplace_relu_output,
			&outplace_relu_context,
			elements, round_down(nnp_hwinfo.blocking.l1 / sizeof(float), simd_width));
	}
	return nnp_status_success;
}