Skip to content
This repository has been archived by the owner on Oct 1, 2020. It is now read-only.

Commit

Permalink
5x5 sse2 kernel (#27)
Browse files Browse the repository at this point in the history
  • Loading branch information
harouwu committed Nov 10, 2018
1 parent c7b8e98 commit 7426638
Show file tree
Hide file tree
Showing 7 changed files with 983 additions and 1 deletion.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ SET(QNNPACK_X86_SSE2_UKERNELS
src/q8gemm/2x4c8-sse2.c
src/q8gemm/4x4c2-sse2.c
src/q8conv/4x4c2-sse2.c
src/q8mpdw/25c8-sse2.c
src/q8updw/9c8-sse2.c)

SET(QNNPACK_UKERNELS ${QNNPACK_PSIMD_UKERNELS})
Expand Down
1 change: 1 addition & 0 deletions configure.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ def main(args):
build.cc("q8gemm/2x4c8-sse2.c"),
build.cc("q8gemm/4x4c2-sse2.c"),
build.cc("q8conv/4x4c2-sse2.c"),
build.cc("q8mpdw/25c8-sse2.c"),
build.cc("q8updw/9c8-sse2.c"),
]
build.static_library("qnnpack", qnnpack_objects)
Expand Down
2 changes: 1 addition & 1 deletion src/convolution.c
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ enum qnnp_status qnnp_create_convolution2d_nhwc_q8(
const size_t kernel_size = kernel_height * kernel_width;

uint32_t flags = 0;
if (kernel_size == 9 && group_input_channels == 1 && group_output_channels == 1 && groups > 1) {
if ((kernel_size == 9 || kernel_size == 25) && group_input_channels == 1 && group_output_channels == 1 && groups > 1) {
flags |= QNNP_CONVOLUTION_FLAG_DW;
} else if (kernel_size == 1 && subsampling_height == 1 && subsampling_width == 1) {
if (group_input_channels >= qnnp_params.q8conv_xzp.kthreshold) {
Expand Down
8 changes: 8 additions & 0 deletions src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ static void init(void) {
.updw = q8updw_ukernel_9c8__neon,
.cr = 8,
};
qnnp_params.q8dw25 = (struct q8mpdw_parameters) {
.mpdw = q8mpdw_ukernel_25c8__neon,
.cr = 8,
};
#elif CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
if (!cpuinfo_has_x86_sse2()) {
qnnp_log_error("QNNPACK initialization failed: SSE2 is not supported");
Expand All @@ -107,6 +111,10 @@ static void init(void) {
.updw = q8updw_ukernel_9c8__sse2,
.cr = 8,
};
qnnp_params.q8dw25 = (struct q8mpdw_parameters) {
.mpdw = q8mpdw_ukernel_25c8__sse2,
.cr = 8,
};
#else
#error "Unsupported architecture"
#endif
Expand Down
742 changes: 742 additions & 0 deletions src/q8mpdw/25c8-sse2.c

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/qnnpack/q8dw.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ DECLARE_Q8UPDW_FUNCTION(q8updw_ukernel_9c8__sse2)
const union qnnp_conv_quantization_params* quantization_params);

DECLARE_Q8MPDW_FUNCTION(q8mpdw_ukernel_25c8__neon)
DECLARE_Q8MPDW_FUNCTION(q8mpdw_ukernel_25c8__sse2)

#ifdef __cplusplus
} /* extern "C" */
Expand Down
229 changes: 229 additions & 0 deletions test/q8mpdw.cc
Original file line number Diff line number Diff line change
Expand Up @@ -223,3 +223,232 @@
}
}
#endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */

#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64
/*
 * SSE2 25c8 depthwise microkernel, 5x5 kernel, channels == cr == 8, width 1.
 * No other tester settings are overridden.
 */
TEST(Q8DW_25c8_SSE2, single_output_channels_eq_8) {
  auto tester = DepthwiseMicrokernelTester();
  tester.kernelHeight(5);
  tester.kernelWidth(5);
  tester.cr(8);
  tester.channels(8);
  tester.width(1);
  tester.test(q8mpdw_ukernel_25c8__sse2);
}

/*
 * SSE2 25c8 depthwise microkernel, 5x5 kernel, channels == cr == 8, width 1,
 * with the tester's qmin set to 128.
 */
TEST(Q8DW_25c8_SSE2, single_output_channels_eq_8_with_qmin) {
  auto tester = DepthwiseMicrokernelTester();
  tester.kernelHeight(5);
  tester.kernelWidth(5);
  tester.cr(8);
  tester.channels(8);
  tester.width(1);
  tester.qmin(128);
  tester.test(q8mpdw_ukernel_25c8__sse2);
}

/*
 * SSE2 25c8 depthwise microkernel, 5x5 kernel, channels == cr == 8, width 1,
 * with the tester's qmax set to 128.
 */
TEST(Q8DW_25c8_SSE2, single_output_channels_eq_8_with_qmax) {
  auto tester = DepthwiseMicrokernelTester();
  tester.kernelHeight(5);
  tester.kernelWidth(5);
  tester.cr(8);
  tester.channels(8);
  tester.width(1);
  tester.qmax(128);
  tester.test(q8mpdw_ukernel_25c8__sse2);
}

/*
 * SSE2 25c8 depthwise microkernel, 5x5 kernel, channels == cr == 8, width 1,
 * with input zero point 255 and kernel zero point 0.
 */
TEST(Q8DW_25c8_SSE2, single_output_channels_eq_8_with_input_zero_point_only) {
  auto tester = DepthwiseMicrokernelTester();
  tester.kernelHeight(5);
  tester.kernelWidth(5);
  tester.cr(8);
  tester.channels(8);
  tester.width(1);
  tester.inputZeroPoint(255);
  tester.kernelZeroPoint(0);
  tester.test(q8mpdw_ukernel_25c8__sse2);
}

/*
 * SSE2 25c8 depthwise microkernel, 5x5 kernel, channels == cr == 8, width 1,
 * with input zero point 0 and kernel zero point 255.
 */
TEST(Q8DW_25c8_SSE2, single_output_channels_eq_8_with_kernel_zero_point_only) {
  auto tester = DepthwiseMicrokernelTester();
  tester.kernelHeight(5);
  tester.kernelWidth(5);
  tester.cr(8);
  tester.channels(8);
  tester.width(1);
  tester.inputZeroPoint(0);
  tester.kernelZeroPoint(255);
  tester.test(q8mpdw_ukernel_25c8__sse2);
}

/*
 * SSE2 25c8 depthwise microkernel, 5x5 kernel, channels == cr == 8, width 5.
 * No other tester settings are overridden.
 */
TEST(Q8DW_25c8_SSE2, multi_output_channels_eq_8) {
  auto tester = DepthwiseMicrokernelTester();
  tester.kernelHeight(5);
  tester.kernelWidth(5);
  tester.cr(8);
  tester.channels(8);
  tester.width(5);
  tester.test(q8mpdw_ukernel_25c8__sse2);
}

/*
 * SSE2 25c8 depthwise microkernel, 5x5 kernel, channels == cr == 8, width 5,
 * with the tester's subsampling set to 2.
 */
TEST(Q8DW_25c8_SSE2, multi_output_channels_eq_8_with_subsampling) {
  auto tester = DepthwiseMicrokernelTester();
  tester.kernelHeight(5);
  tester.kernelWidth(5);
  tester.subsampling(2);
  tester.cr(8);
  tester.channels(8);
  tester.width(5);
  tester.test(q8mpdw_ukernel_25c8__sse2);
}

/*
 * SSE2 25c8 depthwise microkernel, 5x5 kernel, channels == cr == 8, width 5,
 * with an input stride of 17 (larger than the channel count).
 */
TEST(Q8DW_25c8_SSE2, multi_output_channels_eq_8_with_input_stride) {
  auto tester = DepthwiseMicrokernelTester();
  tester.kernelHeight(5);
  tester.kernelWidth(5);
  tester.cr(8);
  tester.channels(8);
  tester.width(5);
  tester.inputStride(17);
  tester.test(q8mpdw_ukernel_25c8__sse2);
}

/*
 * SSE2 25c8 depthwise microkernel, 5x5 kernel, channels == cr == 8, width 5,
 * with an output stride of 19 (larger than the channel count).
 */
TEST(Q8DW_25c8_SSE2, multi_output_channels_eq_8_with_output_stride) {
  auto tester = DepthwiseMicrokernelTester();
  tester.kernelHeight(5);
  tester.kernelWidth(5);
  tester.cr(8);
  tester.channels(8);
  tester.width(5);
  tester.outputStride(19);
  tester.test(q8mpdw_ukernel_25c8__sse2);
}

/*
 * SSE2 25c8 depthwise microkernel, 5x5 kernel, width 1, swept over channel
 * counts 16, 40, 64, 88 and 112 (all multiples of cr == 8).
 */
TEST(Q8DW_25c8_SSE2, single_output_channels_div_8) {
  for (uint32_t channelCount = 16; channelCount < 128; channelCount += 24) {
    /* A fresh tester per iteration, so no state carries over between runs. */
    auto tester = DepthwiseMicrokernelTester();
    tester.kernelHeight(5);
    tester.kernelWidth(5);
    tester.cr(8);
    tester.channels(channelCount);
    tester.width(1);
    tester.test(q8mpdw_ukernel_25c8__sse2);
  }
}

/*
 * SSE2 25c8 depthwise microkernel, 5x5 kernel, width 5, swept over channel
 * counts 16, 40, 64, 88 and 112 (all multiples of cr == 8).
 */
TEST(Q8DW_25c8_SSE2, multi_output_channels_div_8) {
  for (uint32_t channelCount = 16; channelCount < 128; channelCount += 24) {
    /* A fresh tester per iteration, so no state carries over between runs. */
    auto tester = DepthwiseMicrokernelTester();
    tester.kernelHeight(5);
    tester.kernelWidth(5);
    tester.cr(8);
    tester.channels(channelCount);
    tester.width(5);
    tester.test(q8mpdw_ukernel_25c8__sse2);
  }
}

/*
 * SSE2 25c8 depthwise microkernel, 5x5 kernel, width 5, swept over channel
 * counts 16, 40, 64, 88 and 112, with an output stride of 171 (larger than
 * every channel count in the sweep).
 */
TEST(Q8DW_25c8_SSE2, multi_output_channels_div_8_with_output_stride) {
  for (uint32_t channelCount = 16; channelCount < 128; channelCount += 24) {
    /* A fresh tester per iteration, so no state carries over between runs. */
    auto tester = DepthwiseMicrokernelTester();
    tester.kernelHeight(5);
    tester.kernelWidth(5);
    tester.cr(8);
    tester.channels(channelCount);
    tester.width(5);
    tester.outputStride(171);
    tester.test(q8mpdw_ukernel_25c8__sse2);
  }
}

/*
 * SSE2 25c8 depthwise microkernel, 5x5 kernel, width 1, swept over every
 * channel count from 9 through 15 (above cr == 8, not a multiple of it).
 */
TEST(Q8DW_25c8_SSE2, single_output_channels_gt_8) {
  for (uint32_t channelCount = 9; channelCount < 16; ++channelCount) {
    /* A fresh tester per iteration, so no state carries over between runs. */
    auto tester = DepthwiseMicrokernelTester();
    tester.kernelHeight(5);
    tester.kernelWidth(5);
    tester.cr(8);
    tester.channels(channelCount);
    tester.width(1);
    tester.test(q8mpdw_ukernel_25c8__sse2);
  }
}

/*
 * SSE2 25c8 depthwise microkernel, 5x5 kernel, width 1, channel counts 9
 * through 15, with the tester's qmin set to 128.
 */
TEST(Q8DW_25c8_SSE2, single_output_channels_gt_8_with_qmin) {
  for (uint32_t channelCount = 9; channelCount < 16; ++channelCount) {
    /* A fresh tester per iteration, so no state carries over between runs. */
    auto tester = DepthwiseMicrokernelTester();
    tester.kernelHeight(5);
    tester.kernelWidth(5);
    tester.cr(8);
    tester.channels(channelCount);
    tester.width(1);
    tester.qmin(128);
    tester.test(q8mpdw_ukernel_25c8__sse2);
  }
}

/*
 * SSE2 25c8 depthwise microkernel, 5x5 kernel, width 1, channel counts 9
 * through 15, with the tester's qmax set to 128.
 */
TEST(Q8DW_25c8_SSE2, single_output_channels_gt_8_with_qmax) {
  for (uint32_t channelCount = 9; channelCount < 16; ++channelCount) {
    /* A fresh tester per iteration, so no state carries over between runs. */
    auto tester = DepthwiseMicrokernelTester();
    tester.kernelHeight(5);
    tester.kernelWidth(5);
    tester.cr(8);
    tester.channels(channelCount);
    tester.width(1);
    tester.qmax(128);
    tester.test(q8mpdw_ukernel_25c8__sse2);
  }
}

/*
 * SSE2 25c8 depthwise microkernel, 5x5 kernel, width 1, channel counts 9
 * through 15, with input zero point 255 and kernel zero point 0.
 */
TEST(Q8DW_25c8_SSE2, single_output_channels_gt_8_with_input_zero_point_only) {
  for (uint32_t channelCount = 9; channelCount < 16; ++channelCount) {
    /* A fresh tester per iteration, so no state carries over between runs. */
    auto tester = DepthwiseMicrokernelTester();
    tester.kernelHeight(5);
    tester.kernelWidth(5);
    tester.cr(8);
    tester.channels(channelCount);
    tester.width(1);
    tester.inputZeroPoint(255);
    tester.kernelZeroPoint(0);
    tester.test(q8mpdw_ukernel_25c8__sse2);
  }
}

/*
 * SSE2 25c8 depthwise microkernel, 5x5 kernel, width 1, channel counts 9
 * through 15, with input zero point 0 and kernel zero point 255.
 */
TEST(Q8DW_25c8_SSE2, single_output_channels_gt_8_with_kernel_zero_point_only) {
  for (uint32_t channelCount = 9; channelCount < 16; ++channelCount) {
    /* A fresh tester per iteration, so no state carries over between runs. */
    auto tester = DepthwiseMicrokernelTester();
    tester.kernelHeight(5);
    tester.kernelWidth(5);
    tester.cr(8);
    tester.channels(channelCount);
    tester.width(1);
    tester.inputZeroPoint(0);
    tester.kernelZeroPoint(255);
    tester.test(q8mpdw_ukernel_25c8__sse2);
  }
}

/*
 * SSE2 25c8 depthwise microkernel, 5x5 kernel, width 5, swept over every
 * channel count from 9 through 15 (above cr == 8, not a multiple of it).
 */
TEST(Q8DW_25c8_SSE2, multi_output_channels_gt_8) {
  for (uint32_t channelCount = 9; channelCount < 16; ++channelCount) {
    /* A fresh tester per iteration, so no state carries over between runs. */
    auto tester = DepthwiseMicrokernelTester();
    tester.kernelHeight(5);
    tester.kernelWidth(5);
    tester.cr(8);
    tester.channels(channelCount);
    tester.width(5);
    tester.test(q8mpdw_ukernel_25c8__sse2);
  }
}

/*
 * SSE2 25c8 depthwise microkernel, 5x5 kernel, width 5, channel counts 9
 * through 15, with an output stride of 17 (larger than every channel count
 * in the sweep).
 */
TEST(Q8DW_25c8_SSE2, multi_output_channels_gt_8_with_output_stride) {
  for (uint32_t channelCount = 9; channelCount < 16; ++channelCount) {
    /* A fresh tester per iteration, so no state carries over between runs. */
    auto tester = DepthwiseMicrokernelTester();
    tester.kernelHeight(5);
    tester.kernelWidth(5);
    tester.cr(8);
    tester.channels(channelCount);
    tester.width(5);
    tester.outputStride(17);
    tester.test(q8mpdw_ukernel_25c8__sse2);
  }
}
#endif /* CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 */

0 comments on commit 7426638

Please sign in to comment.