Skip to content
This repository has been archived by the owner on Apr 19, 2024. It is now read-only.

Commit

Permalink
Rejig the bottom-level loops in internal_mul_* to use pointers instead
of array indices. You'd hope that compilers could automatically turn
the one representation into the other if it was faster to do so, but
apparently not: even on gcc -O3, this source transformation gains over
15% performance.

[originally from svn r9105]
  • Loading branch information
sgtatham committed Feb 22, 2011
1 parent 9d4005e commit 7957ca1
Showing 1 changed file with 45 additions and 38 deletions.
83 changes: 45 additions & 38 deletions sshbn.c
Original file line number Diff line number Diff line change
Expand Up @@ -221,10 +221,8 @@ static int mul_compute_scratch(int len)
static void internal_mul(const BignumInt *a, const BignumInt *b,
BignumInt *c, int len, BignumInt *scratch)
{
int i, j;
BignumDblInt t;

if (len > KARATSUBA_THRESHOLD) {
int i;

/*
* Karatsuba divide-and-conquer algorithm. Cut each input in
Expand Down Expand Up @@ -311,9 +309,9 @@ static void internal_mul(const BignumInt *a, const BignumInt *b,
* copied over. */
scratch[0] = scratch[1] = scratch[midlen] = scratch[midlen+1] = 0;

for (j = 0; j < toplen; j++) {
scratch[midlen - toplen + j] = a[j]; /* a_1 */
scratch[2*midlen - toplen + j] = b[j]; /* b_1 */
for (i = 0; i < toplen; i++) {
scratch[midlen - toplen + i] = a[i]; /* a_1 */
scratch[2*midlen - toplen + i] = b[i]; /* b_1 */
}

/* compute a_1 + a_0 */
Expand Down Expand Up @@ -355,8 +353,8 @@ static void internal_mul(const BignumInt *a, const BignumInt *b,
* product to obtain the middle one.
*/
scratch[0] = scratch[1] = scratch[2] = scratch[3] = 0;
for (j = 0; j < 2*toplen; j++)
scratch[2*midlen - 2*toplen + j] = c[j];
for (i = 0; i < 2*toplen; i++)
scratch[2*midlen - 2*toplen + i] = c[i];
scratch[1] = internal_add(scratch+2, c + 2*toplen,
scratch+2, 2*botlen);
#ifdef KARA_DEBUG
Expand Down Expand Up @@ -386,13 +384,13 @@ static void internal_mul(const BignumInt *a, const BignumInt *b,
carry = internal_add(c + 2*len - botlen - 2*midlen,
scratch + 2*midlen,
c + 2*len - botlen - 2*midlen, 2*midlen);
j = 2*len - botlen - 2*midlen - 1;
i = 2*len - botlen - 2*midlen - 1;
while (carry) {
assert(j >= 0);
carry += c[j];
c[j] = (BignumInt)carry;
assert(i >= 0);
carry += c[i];
c[i] = (BignumInt)carry;
carry >>= BIGNUM_INT_BITS;
j--;
i--;
}
#ifdef KARA_DEBUG
printf("ab = 0x");
Expand All @@ -403,23 +401,27 @@ static void internal_mul(const BignumInt *a, const BignumInt *b,
#endif

} else {
int i;
BignumInt carry;
BignumDblInt t;
const BignumInt *ap, *bp;
BignumInt *cp, *cps;

/*
* Multiply in the ordinary O(N^2) way.
*/

for (j = 0; j < 2 * len; j++)
c[j] = 0;
for (i = 0; i < 2 * len; i++)
c[i] = 0;

for (i = len - 1; i >= 0; i--) {
t = 0;
for (j = len - 1; j >= 0; j--) {
t += MUL_WORD(a[i], (BignumDblInt) b[j]);
t += (BignumDblInt) c[i + j + 1];
c[i + j + 1] = (BignumInt) t;
t = t >> BIGNUM_INT_BITS;
for (cps = c + 2*len, ap = a + len; ap-- > a; cps--) {
carry = 0;
for (cp = cps, bp = b + len; cp--, bp-- > b ;) {
t = (MUL_WORD(*ap, *bp) + carry) + *cp;
*cp = (BignumInt) t;
carry = t >> BIGNUM_INT_BITS;
}
c[i] = (BignumInt) t;
*cp = carry;
}
}
}
Expand All @@ -432,10 +434,8 @@ static void internal_mul(const BignumInt *a, const BignumInt *b,
static void internal_mul_low(const BignumInt *a, const BignumInt *b,
BignumInt *c, int len, BignumInt *scratch)
{
int i, j;
BignumDblInt t;

if (len > KARATSUBA_THRESHOLD) {
int i;

/*
* Karatsuba-aware version of internal_mul_low. As before, we
Expand Down Expand Up @@ -492,8 +492,8 @@ static void internal_mul_low(const BignumInt *a, const BignumInt *b,
scratch + 2*len);

/* Copy the bottom half of the big coefficient into place */
for (j = 0; j < botlen; j++)
c[toplen + j] = scratch[2*toplen + botlen + j];
for (i = 0; i < botlen; i++)
c[toplen + i] = scratch[2*toplen + botlen + i];

/* Add the two small coefficients, throwing away the returned carry */
internal_add(scratch, scratch + toplen, scratch, toplen);
Expand All @@ -503,20 +503,27 @@ static void internal_mul_low(const BignumInt *a, const BignumInt *b,
c, toplen);

} else {
int i;
BignumInt carry;
BignumDblInt t;
const BignumInt *ap, *bp;
BignumInt *cp, *cps;

for (j = 0; j < len; j++)
c[j] = 0;
/*
* Multiply in the ordinary O(N^2) way.
*/

for (i = len - 1; i >= 0; i--) {
t = 0;
for (j = len - 1; j >= len - i - 1; j--) {
t += MUL_WORD(a[i], (BignumDblInt) b[j]);
t += (BignumDblInt) c[i + j + 1 - len];
c[i + j + 1 - len] = (BignumInt) t;
t = t >> BIGNUM_INT_BITS;
for (i = 0; i < len; i++)
c[i] = 0;

for (cps = c + len, ap = a + len; ap-- > a; cps--) {
carry = 0;
for (cp = cps, bp = b + len; bp--, cp-- > c ;) {
t = (MUL_WORD(*ap, *bp) + carry) + *cp;
*cp = (BignumInt) t;
carry = t >> BIGNUM_INT_BITS;
}
}

}
}

Expand Down

0 comments on commit 7957ca1

Please sign in to comment.