-
Notifications
You must be signed in to change notification settings - Fork 7
/
ssebmx.src
34 lines (28 loc) · 1.12 KB
/
ssebmx.src
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
// Block-wise bit shuffling kernel; from the INP(row + i, col) -> OUT(row, col + i)
// access pattern this looks like a bit-matrix transpose (TODO confirm against
// the including file).
// This is a code fragment, not a standalone function: nrows, ncols, XMM,
// xm_shl64() and the INP(row, col) / OUT(row, col) accessors are not declared
// here and must be supplied by the surrounding code. The bare `return;`
// statements leave whatever function the fragment is placed in.
//
// II is defined as either (i) or (i ^ 7)
// (by the surrounding code). Because II refers to the local `i`, the loop
// index used below must keep that name.
int rr, cc, i, h;
// 16 bytes viewed either as one XMM value or as individual bytes.
union { XMM x; uint8_t b[16]; } tmp;
// Do the main body in [16 x 8] blocks:
// rr advances 16 rows at a time, cc advances 8 columns at a time.
for (rr = 0; rr <= nrows - 16; rr += 16)
for (cc = 0; cc < ncols; cc += 8) {
// Gather one input byte from each of the 16 rows of this block.
for (i = 0; i < 16; ++i)
tmp.b[i] = INP(rr + II, cc);
// i runs 7..0. Each pass takes the top bit of all 16 bytes
// (_mm_movemask_epi8) as a 16-bit mask and stores it at OUT(rr, cc + II);
// the loop's increment clause then applies xm_shl64(tmp.x, 1), presumably a
// 1-bit left shift of each 64-bit half, exposing the next bit.
// NOTE(review): the store goes through a uint16_t pointer formed from the
// address of OUT(...); this assumes that address may legally be accessed as
// uint16_t (alignment / aliasing) -- confirm.
for (i = 8; i--; tmp.x = xm_shl64(tmp.x, 1))
*(uint16_t*)&OUT(rr, cc + II) = _mm_movemask_epi8(tmp.x);
}
// All rows were covered by 16-row blocks: nothing left to do.
if (rr == nrows) return;
// The remainder is a single band of rows starting at rr, made of zero or more
// [8 x 16] blocks followed by at most one [8 x 8] block.
// NOTE(review): the code below handles exactly 8 rows, so it presumably relies
// on nrows being a multiple of 8 -- confirm with the caller.
// Do the [8 x 16] blocks:
for (cc = 0; cc <= ncols - 16; cc += 16) {
// For each of the 8 rows, read 16 bits at once and split them: the low byte
// goes to tmp.b[i], the high byte to tmp.b[i + 8]. Both stores form a single
// comma expression, so both belong to the unbraced loop body.
// NOTE(review): the uint16_t read assumes a little-endian layout (low byte is
// the first byte in memory) and that the address is suitable for a uint16_t
// access -- confirm.
for (i = 8; i--;)
tmp.b[i] = h = *(uint16_t const*)&INP(rr + II, cc),
tmp.b[i + 8] = h >> 8;
// i runs 7..0. Each pass splits the 16-bit mask: its low byte is written to
// OUT(rr, cc + II) and its high byte to OUT(rr, cc + II + 8); tmp.x is then
// shifted as in the main loop. Again one comma expression forms the loop body.
for (i = 8; i--; tmp.x = xm_shl64(tmp.x, 1))
OUT(rr, cc + II) = h = _mm_movemask_epi8(tmp.x),
OUT(rr, cc + II + 8) = h >> 8;
}
// All columns were covered by 16-column steps: done.
if (cc == ncols) return;
// Do the remaining [8 x 8] block:
// Only tmp.b[0..7] are filled here; tmp.b[8..15] keep whatever they held
// before (never written if no earlier loop ran).
// NOTE(review): this is harmless only if OUT(...) is a byte-sized lvalue, so
// that the upper 8 mask bits are dropped on assignment -- confirm.
for (i = 8; i--;)
tmp.b[i] = INP(rr + II, cc);
for (i = 8; i--; tmp.x = xm_shl64(tmp.x, 1))
OUT(rr, cc + II) = _mm_movemask_epi8(tmp.x);