grouped convolutions 🐍 🐍 🐍
pjreddie committed Oct 4, 2017
1 parent 62b781a commit fbd48ab
Showing 9 changed files with 166 additions and 140 deletions.
4 changes: 4 additions & 0 deletions cfg/darknet.cfg

@@ -1,6 +1,10 @@
 [net]
+# Train
 batch=128
 subdivisions=1
+# Test
+# batch=1
+# subdivisions=1
 height=224
 width=224
 channels=3
4 changes: 4 additions & 0 deletions cfg/tiny.cfg

@@ -1,6 +1,10 @@
 [net]
+# Train
 batch=128
 subdivisions=1
+# Test
+# batch=1
+# subdivisions=1
 height=224
 width=224
 channels=3
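Both cfg diffs follow the same darknet convention: the uncommented pair is the training setting, and the commented pair records the values for inference. To run single-image tests you would presumably flip the comments by hand:

    [net]
    # Train
    # batch=128
    # subdivisions=1
    # Test
    batch=1
    subdivisions=1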
2 changes: 1 addition & 1 deletion examples/darknet.c

@@ -90,7 +90,7 @@ long numops(network net)
     for(i = 0; i < net.n; ++i){
         layer l = net.layers[i];
         if(l.type == CONVOLUTIONAL){
-            ops += 2l * l.n * l.size*l.size*l.c * l.out_h*l.out_w;
+            ops += 2l * l.n * l.size*l.size*l.c/l.groups * l.out_h*l.out_w;
         } else if(l.type == CONNECTED){
             ops += 2l * l.inputs * l.outputs;
         } else if (l.type == RNN){
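A quick worked example of the new count (sizes chosen for illustration, not taken from this commit): a 3x3 layer with l.c = 256 input channels, l.n = 256 filters, and a 13x13 output map costs 2 * 256 * (3*3*256) * (13*13) ≈ 199 MFLOPs at l.groups = 1; with l.groups = 32 each filter spans only l.c/l.groups = 8 channels, so the same layer drops 32x to about 6.2 MFLOPs. The factor of 2 counts a multiply-accumulate as two ops, and the `2l` long literal keeps the product from overflowing a 32-bit int.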
118 changes: 63 additions & 55 deletions src/convolutional_kernels.cu

@@ -74,12 +74,12 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network net)
 {
     fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
     if(l.binary){
-        binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu);
+        binarize_weights_gpu(l.weights_gpu, l.n, l.c/l.groups*l.size*l.size, l.binary_weights_gpu);
         swap_binary(&l);
     }

     if(l.xnor){
-        binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu);
+        binarize_weights_gpu(l.weights_gpu, l.n, l.c/l.groups*l.size*l.size, l.binary_weights_gpu);
         swap_binary(&l);
         binarize_gpu(net.input_gpu, l.c*l.h*l.w*l.batch, l.binary_input_gpu);
         net.input_gpu = l.binary_input_gpu;
@@ -102,16 +102,20 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network net)
             l.output_gpu);

 #else
-    int i;
-    int m = l.n;
-    int k = l.size*l.size*l.c;
+    int i, j;
+    int m = l.n/l.groups;
+    int k = l.size*l.size*l.c/l.groups;
     int n = l.out_w*l.out_h;
     for(i = 0; i < l.batch; ++i){
-        im2col_gpu(net.input_gpu + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace);
-        float * a = l.weights_gpu;
-        float * b = net.workspace;
-        float * c = l.output_gpu;
-        gemm_gpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n);
+        for(j = 0; j < l.groups; ++j){
+            float *a = l.weights_gpu + j*l.nweights/l.groups;
+            float *b = net.workspace;
+            float *c = l.output_gpu + (i*l.groups + j)*n*m;
+
+            im2col_gpu(net.input_gpu + (i*l.groups + j)*l.c/l.groups*l.h*l.w,
+                l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
+            gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
+        }
     }
 #endif
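In the non-cuDNN path each (batch i, group j) pair becomes one GEMM: A is the (l.n/l.groups) x (l.size*l.size*l.c/l.groups) weight block for group j, B is the im2col matrix built from that group's l.c/l.groups input channels, and C is the matching slab of the output. For reference, a minimal CPU sketch of the same partitioning, stride 1 and no padding, using naive loops instead of darknet's im2col+gemm (illustrative only, not part of this commit):

    /* Naive grouped convolution, NCHW layout, stride 1, no padding.
     * Filter f lives in group g = f / (n/groups) and only sees input
     * channels [g*c/groups, (g+1)*c/groups). */
    void grouped_conv_ref(const float *in, const float *weights, float *out,
                          int c, int h, int w, int n, int size, int groups)
    {
        int out_h = h - size + 1, out_w = w - size + 1;
        int cg = c / groups;                  /* input channels per group */
        int ng = n / groups;                  /* filters per group */
        for (int f = 0; f < n; ++f) {
            int g = f / ng;
            for (int y = 0; y < out_h; ++y) {
                for (int x = 0; x < out_w; ++x) {
                    float sum = 0;
                    for (int k = 0; k < cg; ++k) {
                        int ch = g*cg + k;    /* absolute input channel */
                        for (int dy = 0; dy < size; ++dy) {
                            for (int dx = 0; dx < size; ++dx) {
                                sum += in[(ch*h + y+dy)*w + (x+dx)]
                                     * weights[((f*cg + k)*size + dy)*size + dx];
                            }
                        }
                    }
                    out[(f*out_h + y)*out_w + x] = sum;
                }
            }
        }
    }

With groups == 1 this reduces to an ordinary convolution; with groups == c and n == c it becomes a depthwise convolution.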

@@ -221,60 +225,66 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network net)
     }

 #else
-    int m = l.n;
-    int n = l.size*l.size*l.c;
+    int m = l.n/l.groups;
+    int n = l.size*l.size*l.c/l.groups;
     int k = l.out_w*l.out_h;

-    int i;
+    int i, j;
     for(i = 0; i < l.batch; ++i){
-        float * a = l.delta_gpu;
-        float * b = net.workspace;
-        float * c = l.weight_updates_gpu;
-
-        im2col_gpu(net.input_gpu + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace);
-        gemm_gpu(0,1,m,n,k,1,a + i*m*k,k,b,k,1,c,n);
-
-        if(net.delta_gpu){
-            if(l.binary || l.xnor) swap_binary(&l);
-            float * a = l.weights_gpu;
-            float * b = l.delta_gpu;
-            float * c = net.workspace;
-
-            gemm_gpu(1,0,n,k,m,1,a,n,b + i*k*m,k,0,c,k);
-
-            col2im_gpu(net.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, net.delta_gpu + i*l.c*l.h*l.w);
-            if(l.binary || l.xnor) {
-                swap_binary(&l);
+        for(j = 0; j < l.groups; ++j){
+            float *a = l.delta_gpu + (i*l.groups + j)*m*k;
+            float *b = net.workspace;
+            float *c = l.weight_updates_gpu + j*l.nweights/l.groups;
+
+            float *im = net.input+(i*l.groups + j)*l.c/l.groups*l.h*l.w;
+
+            im2col_gpu(im, l.c/l.groups, l.h, l.w,
+                    l.size, l.stride, l.pad, b);
+            gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
+
+            if(net.delta_gpu){
+                if(l.binary || l.xnor) swap_binary(&l);
+                a = l.weights_gpu + j*l.nweights/l.groups;
+                b = l.delta_gpu + (i*l.groups + j)*m*k;
+                c = net.workspace;
+
+                gemm_gpu(1,0,n,k,m,1,a,n,b,k,0,c,k);
+
+                col2im_gpu(net.workspace, l.c/l.groups, l.h, l.w, l.size, l.stride,
+                        l.pad, net.delta_gpu + (i*l.groups + j)*l.c/l.groups*l.h*l.w);
+                if(l.binary || l.xnor) {
+                    swap_binary(&l);
+                }
             }
         }
         if(l.xnor) gradient_array_gpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, net.delta_gpu + i*l.c*l.h*l.w);
     }
 #endif
 }
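Shape check for the two backward GEMMs, per (i, j) pair, with m = l.n/l.groups, n = l.size*l.size*l.c/l.groups, and k = l.out_w*l.out_h: gemm_gpu(0,1,...) accumulates the weight gradient, dW[m x n] += dY[m x k] * col(X)^T [k x n], and gemm_gpu(1,0,...) computes the column form of the input gradient, col(dX)[n x k] = W^T [n x m] * dY[m x k], which col2im_gpu then scatters back into that group's l.c/l.groups channels of net.delta_gpu.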

-void pull_convolutional_layer(convolutional_layer layer)
+void pull_convolutional_layer(layer l)
 {
-    cuda_pull_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
-    cuda_pull_array(layer.biases_gpu, layer.biases, layer.n);
-    cuda_pull_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
-    cuda_pull_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
-    if (layer.batch_normalize){
-        cuda_pull_array(layer.scales_gpu, layer.scales, layer.n);
-        cuda_pull_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n);
-        cuda_pull_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n);
+    cuda_pull_array(l.weights_gpu, l.weights, l.nweights);
+    cuda_pull_array(l.biases_gpu, l.biases, l.n);
+    cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
+    cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
+    if (l.batch_normalize){
+        cuda_pull_array(l.scales_gpu, l.scales, l.n);
+        cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
+        cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
     }
 }

-void push_convolutional_layer(convolutional_layer layer)
+void push_convolutional_layer(layer l)
 {
-    cuda_push_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
-    cuda_push_array(layer.biases_gpu, layer.biases, layer.n);
-    cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
-    cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
-    if (layer.batch_normalize){
-        cuda_push_array(layer.scales_gpu, layer.scales, layer.n);
-        cuda_push_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n);
-        cuda_push_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n);
+    cuda_push_array(l.weights_gpu, l.weights, l.nweights);
+    cuda_push_array(l.biases_gpu, l.biases, l.n);
+    cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
+    cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
+    if (l.batch_normalize){
+        cuda_push_array(l.scales_gpu, l.scales, l.n);
+        cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
+        cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
     }
 }
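Both transfer sizes collapse to l.nweights, which is presumably set where the layer is constructed to count (l.c/l.groups)*l.n*l.size*l.size floats; with groups = 1 this reduces to the old layer.c*layer.n*layer.size*layer.size product. As an illustrative sanity check (not code from this commit):

    assert(l.nweights == l.c/l.groups * l.n * l.size*l.size);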

@@ -285,18 +295,16 @@ void update_convolutional_layer_gpu(layer l, update_args a)
     float decay = a.decay;
     int batch = a.batch;

-    int size = l.size*l.size*l.c*l.n;
-
     if(a.adam){
-        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, size, batch, a.t);
+        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t);
         adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
         if(l.scales_gpu){
             adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
         }
     }else{
-        axpy_gpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
-        axpy_gpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
-        scal_gpu(size, momentum, l.weight_updates_gpu, 1);
+        axpy_gpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
+        axpy_gpu(l.nweights, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
+        scal_gpu(l.nweights, momentum, l.weight_updates_gpu, 1);

         axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
         scal_gpu(l.n, momentum, l.bias_updates_gpu, 1);
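Unrolled, the three weight calls in the else branch are plain SGD with L2 weight decay and momentum; the commit only changes the vector length from the recomputed size to l.nweights:

    weight_updates += -decay*batch * weights                   // axpy_gpu
    weights        += (learning_rate/batch) * weight_updates   // axpy_gpu
    weight_updates *= momentum                                 // scal_gpu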