Skip to content

Commit

Permalink
Add AVX512 decoder for short inputs (no spaces)
Browse files Browse the repository at this point in the history
  • Loading branch information
WojciechMula committed Feb 6, 2019
1 parent 2bf0936 commit 74fa5c9
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 2 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ perf.data*

# executables
unit
unit_tail
unit_despace
benchmark
benchmark_despace
Expand Down
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# we target only AVX512VBMI Cannon Lake CPU

FLAGS=$(CFLAGS) -O3 -ftree-vectorize -funroll-loops -std=c99 -Wall -Wextra -pedantic -I./include
FLAGS+=-march=native
#FLAGS+=-march=cannonlake
#FLAGS+=-march=native
FLAGS+=-march=cannonlake

BASE64=obj/chromiumbase64.o\
obj/fastavxbase64.o\
Expand Down Expand Up @@ -46,6 +46,9 @@ obj/decode_base64_avx512vbmi_despace.o: src/base64/decode_base64_avx512vbmi_desp
unit: src/unit.c $(BASE64)
$(CC) $(FLAGS) $< $(BASE64) -o $@

unit_tail: src/unit_tail.c src/base64/decode_base64_tail_avx512vbmi.c obj/chromiumbase64.o
$(CC) $(FLAGS) $< obj/chromiumbase64.o -o $@

unit_despace: src/unit_despace.c $(BASE64)
$(CC) $(FLAGS) $< $(BASE64) -o $@

Expand Down
79 changes: 79 additions & 0 deletions src/base64/decode_base64_tail_avx512vbmi.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// This file is meant to be included into another .c file in
// order to allow inlining.
//
// Procedure decodes tail of whitespace-free base64-encoded
// message, and deals with trailing '='.
#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>

int decode_base64_tail_avx512vbmi(uint8_t* dst, const uint8_t* src, size_t size) {

const char BASE64_PAD = '=';

assert(size <= 64);
if (size == 0) {
return 0;
}

if (size % 4 != 0) {
return -1;
}

uint64_t input_mask = ((uint64_t)-1) >> (64 - size);
int output_size = (size / 4) * 3;
if (src[size - 1] == BASE64_PAD) {
output_size -= 1;
input_mask >>= 1;
if (src[size - 2] == BASE64_PAD) {
output_size -= 1;
input_mask >>= 1;
}
}

const uint64_t output_mask = (0x0000ffffffffffffllu) >> (48 - output_size);

const __m512i lookup_0 = _mm512_setr_epi32(
0x80808080, 0x80808080, 0x80808080, 0x80808080,
0x80808080, 0x80808080, 0x80808080, 0x80808080,
0x80808080, 0x80808080, 0x3e808080, 0x3f808080,
0x37363534, 0x3b3a3938, 0x80803d3c, 0x80808080);
const __m512i lookup_1 = _mm512_setr_epi32(
0x02010080, 0x06050403, 0x0a090807, 0x0e0d0c0b,
0x1211100f, 0x16151413, 0x80191817, 0x80808080,
0x1c1b1a80, 0x201f1e1d, 0x24232221, 0x28272625,
0x2c2b2a29, 0x302f2e2d, 0x80333231, 0x80808080);

// 1. load input, fill past-end bytes with a valid base64 character
__m512i input = _mm512_mask_loadu_epi8(_mm512_set1_epi8('a'), input_mask, src);

// 2. translate from ASCII into 6-bit values
__m512i translated = _mm512_permutex2var_epi8(lookup_0, input, lookup_1);

// 2a. check for errors --- convert MSBs to a mask
const uint64_t mask = _mm512_test_epi8_mask(translated | input, _mm512_set1_epi8((int8_t)0x80));
if (mask != 0)
return -1;

// 3. pack four 6-bit values into 24-bit words (all within 32-bit lanes)
// Note: exactly the same procedure as we have in AVX2 version
// input: packed_dword([00dddddd|00cccccc|00bbbbbb|00aaaaaa] x 4)
// merged: packed_dword([00000000|aaaaabbb|bbbbcccc|ccdddddd] x 4)
const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(translated,
_mm512_set1_epi32(0x01400140));

__m512i merged = _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));


// 4. pack 24-bit values into continous array of 48 bytes
const __m512i pack = _mm512_setr_epi32(
0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112,
0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425,
0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38,
0x00000000, 0x00000000, 0x00000000, 0x00000000);
const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged);

_mm512_mask_storeu_epi8((__m512*)dst, output_mask, shuffled);

return output_size;
}
39 changes: 39 additions & 0 deletions src/unit_tail.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#include "chromiumbase64.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>

#include "base64/decode_base64_tail_avx512vbmi.c"

static const char* input_string = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789#%";

int main() {
char input[256];
char encoded[256];
char decoded[256];

for (int input_size=1; /**/; input_size++) {
memcpy(input, input_string, strlen(input_string));
input[input_size] = '\0';

const int encoded_size = chromium_base64_encode(encoded, input, input_size);
if (encoded_size > 64)
break;

puts("");
printf("input = '%s', len = %d\n", input, input_size);
printf("encoded = '%s', len = %d\n", encoded, encoded_size);

const int decoded_size = decode_base64_tail_avx512vbmi((uint8_t*)decoded,
(const uint8_t*)encoded,
encoded_size);
assert(decoded_size >= 0);

decoded[decoded_size] = '\0';
printf("decoded = '%s', len = %d\n", decoded, decoded_size);

assert(decoded_size == input_size);
assert(memcmp(decoded, input_string, input_size) == 0);
}
}

0 comments on commit 74fa5c9

Please sign in to comment.