diff --git a/crypto/fipsmodule/ec/ec_nistp.c b/crypto/fipsmodule/ec/ec_nistp.c
index 13120c6ee8..6105353e14 100644
--- a/crypto/fipsmodule/ec/ec_nistp.c
+++ b/crypto/fipsmodule/ec/ec_nistp.c
@@ -18,7 +18,7 @@
 // | 1. |   x   |   x   |   x*  |
 // | 2. |   x   |   x   |   x*  |
 // | 3. |       |       |       |
-// | 4. |       |       |       |
+// | 4. |   x   |   x   |   x*  |
 // | 5. |       |       |       |
 //  * For P-256, only the Fiat-crypto implementation in p256.c is replaced. 
 
@@ -30,11 +30,11 @@
 // for the moment, this will be fixed when we migrate the whole P-521
 // implementation to ec_nistp.c.
 #if defined(EC_NISTP_USE_64BIT_LIMB)
-#define NISTP_FELEM_MAX_NUM_OF_LIMBS (9)
+#define FELEM_MAX_NUM_OF_LIMBS (9)
 #else
-#define NISTP_FELEM_MAX_NUM_OF_LIMBS (19)
+#define FELEM_MAX_NUM_OF_LIMBS (19)
 #endif
-typedef ec_nistp_felem_limb ec_nistp_felem[NISTP_FELEM_MAX_NUM_OF_LIMBS];
+typedef ec_nistp_felem_limb ec_nistp_felem[FELEM_MAX_NUM_OF_LIMBS];
 
 // Conditional copy in constant-time (out = t == 0 ? z : nz).
 static void cmovznz(ec_nistp_felem_limb *out,
@@ -280,8 +280,8 @@ static int16_t get_bit(const EC_SCALAR *in, size_t i) {
 // It forces an odd scalar and outputs digits in
 // {\pm 1, \pm 3, \pm 5, \pm 7, \pm 9, ...}
 // i.e. signed odd digits with _no zeroes_ -- that makes it "regular".
-void scalar_rwnaf(int16_t *out, size_t window_size,
-                  const EC_SCALAR *scalar, size_t scalar_bit_size) {
+static void scalar_rwnaf(int16_t *out, size_t window_size,
+                         const EC_SCALAR *scalar, size_t scalar_bit_size) {
   assert(window_size < 14);
 
   // The assert above ensures this works correctly.
@@ -304,13 +304,30 @@ void scalar_rwnaf(int16_t *out, size_t window_size,
   out[num_windows - 1] = window;
 }
 
+// The window size for scalar multiplication is hard coded for now.
+#define SCALAR_MUL_WINDOW_SIZE (5)
+#define SCALAR_MUL_TABLE_NUM_POINTS (1 << (SCALAR_MUL_WINDOW_SIZE - 1))
+
+// To avoid dynamic allocation and freeing of memory in functions below
+// we define maximum values of certain variables.
+//
+// The maximum number of limbs the table in |ec_nistp_scalar_mul| can have.
+// Each point in the table has 3 coordinates that are field elements,
+// and each field element has a defined maximum number of limbs.
+#define SCALAR_MUL_TABLE_MAX_NUM_FELEM_LIMBS \
+                (SCALAR_MUL_TABLE_NUM_POINTS * 3 * FELEM_MAX_NUM_OF_LIMBS)
+
+// Maximum number of windows (digits) for a scalar encoding which is
+// determined by the maximum scalar bit size -- 521 bits in our case.
+#define SCALAR_MUL_MAX_NUM_WINDOWS DIV_AND_CEIL(521, SCALAR_MUL_WINDOW_SIZE)
+
 // Generate table of multiples of the input point P = (x_in, y_in, z_in):
 //  table <-- [2i + 1]P for i in [0, SCALAR_MUL_TABLE_NUM_POINTS - 1].
-void generate_table(const ec_nistp_meth *ctx,
-                    ec_nistp_felem_limb *table,
-                    ec_nistp_felem_limb *x_in,
-                    ec_nistp_felem_limb *y_in,
-                    ec_nistp_felem_limb *z_in)
+static void generate_table(const ec_nistp_meth *ctx,
+                           ec_nistp_felem_limb *table,
+                           const ec_nistp_felem_limb *x_in,
+                           const ec_nistp_felem_limb *y_in,
+                           const ec_nistp_felem_limb *z_in)
 {
   const size_t felem_num_limbs = ctx->felem_num_limbs;
   const size_t felem_num_bytes = felem_num_limbs * sizeof(ec_nistp_felem_limb);
@@ -343,3 +360,130 @@ void generate_table(const ec_nistp_meth *ctx,
   }
 }
 
+// Constant-time lookup: copies the idx-th point of |table| into |xyz_out|.
+static void select_point_from_table(const ec_nistp_meth *ctx,
+                                    ec_nistp_felem_limb *xyz_out,
+                                    const ec_nistp_felem_limb *table,
+                                    const size_t idx) {
+  // A table entry is one whole point: three field elements, sized in bytes.
+  const size_t felem_num_bytes =
+      ctx->felem_num_limbs * sizeof(ec_nistp_felem_limb);
+  constant_time_select_entry_from_table_8((uint8_t *)xyz_out,
+      (uint8_t *)table, idx, SCALAR_MUL_TABLE_NUM_POINTS, 3 * felem_num_bytes);
+}
+
+// Multiplication of an arbitrary point P = (x_in, y_in, z_in) by a scalar:
+// (x_out, y_out, z_out) = [scalar]P. The product is computed with a small
+// table generated on-the-fly and the scalar recoded in regular-wNAF form.
+//
+// The precomputed (on-the-fly) table |table| holds odd multiples of P:
+//     [2i + 1]P for i in [0, SCALAR_MUL_TABLE_NUM_POINTS - 1].
+// Computing the negation of a point P = (x, y, z) is relatively easy:
+//     -P = (x, -y, z),
+// so we may assume that for each point we have its negative as well.
+//
+// The scalar is recoded (regular-wNAF encoding) into signed digits as explained
+// in the |scalar_rwnaf| function. Namely, for a window size |w| we have:
+//     scalar' = s_0 + s_1*2^w + s_2*2^(2*w) + ... + s_{m-1}*2^((m-1)*w),
+// where digits s_i are in {\pm 1, \pm 3, ..., \pm (2^w-1)} and
+// m = ceil(scalar_bit_size / w). Note that for an odd scalar we have that
+// scalar = scalar', while in the case of an even scalar we have that
+// scalar = scalar' - 1.
+//
+// The required product, [scalar]P, is computed by the following algorithm.
+//     1. Initialize the accumulator with the point from |table|
+//        corresponding to the most significant digit s_{m-1} of the scalar.
+//     2. For digits s_i starting from s_{m-2} down to s_0:
+//     3.   Double the accumulator w times. (note that doubling a point [a]P
+//          w times results in [2^w*a]P).
+//     4.   Read from |table| the point corresponding to abs(s_i),
+//          negate it if s_i is negative, and add it to the accumulator.
+//     5. Subtract P from the result if the scalar is even.
+//
+// Note: this function is constant-time.
+void ec_nistp_scalar_mul(const ec_nistp_meth *ctx,
+                         ec_nistp_felem_limb *x_out,
+                         ec_nistp_felem_limb *y_out,
+                         ec_nistp_felem_limb *z_out,
+                         const ec_nistp_felem_limb *x_in,
+                         const ec_nistp_felem_limb *y_in,
+                         const ec_nistp_felem_limb *z_in,
+                         const EC_SCALAR *scalar) {
+  // Make sure the fixed-size |table| below is large enough for this curve.
+  assert(SCALAR_MUL_TABLE_MAX_NUM_FELEM_LIMBS >=
+         SCALAR_MUL_TABLE_NUM_POINTS * ctx->felem_num_limbs * 3);
+
+  // Table of odd multiples of P = (x_in, y_in, z_in).
+  ec_nistp_felem_limb table[SCALAR_MUL_TABLE_MAX_NUM_FELEM_LIMBS];
+  generate_table(ctx, table, x_in, y_in, z_in);
+
+  // Regular-wNAF encoding of the scalar (field bit size used as scalar size).
+  int16_t rwnaf[SCALAR_MUL_MAX_NUM_WINDOWS];
+  scalar_rwnaf(rwnaf, SCALAR_MUL_WINDOW_SIZE, scalar, ctx->felem_num_bits);
+
+  // We need two point accumulators, so we define them of maximum size
+  // to avoid allocation, and just take pointers to individual coordinates.
+  // (This cruft will disappear when we refactor point_add/dbl to work with
+  // whole points instead of individual coordinates).
+  ec_nistp_felem_limb res[3 * FELEM_MAX_NUM_OF_LIMBS];
+  ec_nistp_felem_limb tmp[3 * FELEM_MAX_NUM_OF_LIMBS];
+  ec_nistp_felem_limb *x_res = &res[0];
+  ec_nistp_felem_limb *y_res = &res[ctx->felem_num_limbs];
+  ec_nistp_felem_limb *z_res = &res[ctx->felem_num_limbs * 2];
+  ec_nistp_felem_limb *x_tmp = &tmp[0];
+  ec_nistp_felem_limb *y_tmp = &tmp[ctx->felem_num_limbs];
+  ec_nistp_felem_limb *z_tmp = &tmp[ctx->felem_num_limbs * 2];
+
+  // The actual number of windows (digits) of the scalar (denoted by m in the
+  // description above the function).
+  const size_t num_windows = DIV_AND_CEIL(ctx->felem_num_bits, SCALAR_MUL_WINDOW_SIZE);
+
+  // Step 1. Initialize the accumulator (res) with the input point multiplied by
+  // the most significant digit of the scalar s_{m-1} (note that this digit
+  // can't be negative).
+  int16_t idx = rwnaf[num_windows - 1];
+  idx >>= 1; // table[i] holds [2i + 1]P, so odd digit d maps to index d >> 1
+  select_point_from_table(ctx, res, table, idx);
+
+  // Step 2. Process the remaining digits of the scalar (s_{m-2} to s_0).
+  for (int i = num_windows - 2; i >= 0; i--) {
+    // Step 3. Double the accumulator w times.
+    for (size_t j = 0; j < SCALAR_MUL_WINDOW_SIZE; j++) {
+      ctx->point_dbl(x_res, y_res, z_res, x_res, y_res, z_res);
+    }
+
+    // Step 4a. Compute abs(s_i) without branching on the digit.
+    int16_t d = rwnaf[i];
+    int16_t is_neg = (d >> 15) & 1; // is_neg = (d < 0) ? 1 : 0
+    d = (d ^ -is_neg) + is_neg;     // d = abs(d)
+
+    // Step 4b. Select from table the point corresponding to abs(s_i).
+    idx = d >> 1;
+    select_point_from_table(ctx, tmp, table, idx);
+
+    // Step 4c. Negate the point if s_i < 0 (-y is always computed).
+    ec_nistp_felem ftmp;
+    ctx->felem_neg(ftmp, y_tmp);
+
+    cmovznz(y_tmp, ctx->felem_num_limbs, is_neg, y_tmp, ftmp);
+
+    // Step 4d. Add the point to the accumulator.
+    ctx->point_add(x_res, y_res, z_res, x_res, y_res, z_res, 0, x_tmp, y_tmp, z_tmp);
+  }
+
+  // Step 5a. Negate the input point P (we negate it in-place since we already
+  // have it stored as the first entry in the table).
+  ec_nistp_felem_limb *x_mp = &table[0];
+  ec_nistp_felem_limb *y_mp = &table[ctx->felem_num_limbs];
+  ec_nistp_felem_limb *z_mp = &table[ctx->felem_num_limbs * 2];
+  ctx->felem_neg(y_mp, y_mp); // table[0] is now -P; no more table lookups follow
+
+  // Step 5b. Subtract P from the accumulator (tmp = res - P).
+  ctx->point_add(x_tmp, y_tmp, z_tmp, x_res, y_res, z_res, 0, x_mp, y_mp, z_mp);
+
+  // Step 5c. Select |res| (odd scalar) or |res - P| (even scalar).
+  ec_nistp_felem_limb t = scalar->words[0] & 1;
+  cmovznz(x_out, ctx->felem_num_limbs, t, x_tmp, x_res);
+  cmovznz(y_out, ctx->felem_num_limbs, t, y_tmp, y_res);
+  cmovznz(z_out, ctx->felem_num_limbs, t, z_tmp, z_res);
+}
diff --git a/crypto/fipsmodule/ec/ec_nistp.h b/crypto/fipsmodule/ec/ec_nistp.h
index 18730ab056..d567dc18ee 100644
--- a/crypto/fipsmodule/ec/ec_nistp.h
+++ b/crypto/fipsmodule/ec/ec_nistp.h
@@ -47,10 +47,12 @@ typedef uint32_t ec_nistp_felem_limb;
 // providing an appropriate methods object.
 typedef struct {
   size_t felem_num_limbs;
+  size_t felem_num_bits;
   void (*felem_add)(ec_nistp_felem_limb *c, const ec_nistp_felem_limb *a, const ec_nistp_felem_limb *b);
   void (*felem_sub)(ec_nistp_felem_limb *c, const ec_nistp_felem_limb *a, const ec_nistp_felem_limb *b);
   void (*felem_mul)(ec_nistp_felem_limb *c, const ec_nistp_felem_limb *a, const ec_nistp_felem_limb *b);
   void (*felem_sqr)(ec_nistp_felem_limb *c, const ec_nistp_felem_limb *a);
+  void (*felem_neg)(ec_nistp_felem_limb *c, const ec_nistp_felem_limb *a);
   ec_nistp_felem_limb (*felem_nz)(const ec_nistp_felem_limb *a);
 
   void (*point_dbl)(ec_nistp_felem_limb *x_out,
@@ -96,20 +98,13 @@ void ec_nistp_point_add(const ec_nistp_meth *ctx,
                         const ec_nistp_felem_limb *y2,
                         const ec_nistp_felem_limb *z2);
 
-// These two functions and two macros are temporarily defined here.
-// They will be moved to ec_nistp.c as static function
-// once all the scalar multiplications are implemented.
-void scalar_rwnaf(int16_t *out, size_t window_size,
-                  const EC_SCALAR *scalar, size_t scalar_bit_size);
-void generate_table(const ec_nistp_meth *ctx,
-                    ec_nistp_felem_limb *table,
-                    ec_nistp_felem_limb *x_in,
-                    ec_nistp_felem_limb *y_in,
-                    ec_nistp_felem_limb *z_in);
-
-// The window size for scalar multiplication is hard coded for now.
-#define SCALAR_MUL_WINDOW_SIZE (5)
-#define SCALAR_MUL_TABLE_NUM_POINTS (1 << (SCALAR_MUL_WINDOW_SIZE - 1))
-
+void ec_nistp_scalar_mul(const ec_nistp_meth *ctx,
+                         ec_nistp_felem_limb *x_out,
+                         ec_nistp_felem_limb *y_out,
+                         ec_nistp_felem_limb *z_out,
+                         const ec_nistp_felem_limb *x_in,
+                         const ec_nistp_felem_limb *y_in,
+                         const ec_nistp_felem_limb *z_in,
+                         const EC_SCALAR *scalar);
 #endif // EC_NISTP_H
 
diff --git a/crypto/fipsmodule/ec/p256.c b/crypto/fipsmodule/ec/p256.c
index b0d94e69eb..f8789ced96 100644
--- a/crypto/fipsmodule/ec/p256.c
+++ b/crypto/fipsmodule/ec/p256.c
@@ -188,10 +188,12 @@ static void fiat_p256_point_add(fiat_p256_felem x3, fiat_p256_felem y3,
 
 DEFINE_METHOD_FUNCTION(ec_nistp_meth, p256_methods) {
     out->felem_num_limbs = FIAT_P256_NLIMBS;
+    out->felem_num_bits = 256;
     out->felem_add = fiat_p256_add;
     out->felem_sub = fiat_p256_sub;
     out->felem_mul = fiat_p256_mul;
     out->felem_sqr = fiat_p256_square;
+    out->felem_neg = fiat_p256_opp;
     out->felem_nz  = fiat_p256_nz;
     out->point_dbl = fiat_p256_point_double;
     out->point_add = fiat_p256_point_add;
@@ -214,20 +216,6 @@ static void fiat_p256_select_point_affine(
   fiat_p256_cmovznz(out[2], idx, out[2], fiat_p256_one);
 }
 
-// fiat_p256_select_point selects the |idx|th point from a precomputation table
-// and copies it to out.
-static void fiat_p256_select_point(const fiat_p256_limb_t idx, size_t size,
-                                   const fiat_p256_felem pre_comp[/*size*/][3],
-                                   fiat_p256_felem out[3]) {
-  OPENSSL_memset(out, 0, sizeof(fiat_p256_felem) * 3);
-  for (size_t i = 0; i < size; i++) {
-    fiat_p256_limb_t mismatch = i ^ idx;
-    fiat_p256_cmovznz(out[0], mismatch, pre_comp[i][0], out[0]);
-    fiat_p256_cmovznz(out[1], mismatch, pre_comp[i][1], out[1]);
-    fiat_p256_cmovznz(out[2], mismatch, pre_comp[i][2], out[2]);
-  }
-}
-
 // fiat_p256_get_bit returns the |i|th bit in |in|.
 static crypto_word_t fiat_p256_get_bit(const EC_SCALAR *in, int i) {
   if (i < 0 || i >= 256) {
@@ -309,68 +297,16 @@ static void ec_GFp_nistp256_dbl(const EC_GROUP *group, EC_JACOBIAN *r,
 static void ec_GFp_nistp256_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
                                       const EC_JACOBIAN *p,
                                       const EC_SCALAR *scalar) {
-  fiat_p256_felem p_pre_comp[17][3];
-  OPENSSL_memset(&p_pre_comp, 0, sizeof(p_pre_comp));
-  // Precompute multiples.
-  fiat_p256_from_generic(p_pre_comp[1][0], &p->X);
-  fiat_p256_from_generic(p_pre_comp[1][1], &p->Y);
-  fiat_p256_from_generic(p_pre_comp[1][2], &p->Z);
-  for (size_t j = 2; j <= 16; ++j) {
-    if (j & 1) {
-      fiat_p256_point_add(p_pre_comp[j][0], p_pre_comp[j][1], p_pre_comp[j][2],
-                          p_pre_comp[1][0], p_pre_comp[1][1], p_pre_comp[1][2],
-                          0, p_pre_comp[j - 1][0], p_pre_comp[j - 1][1],
-                          p_pre_comp[j - 1][2]);
-    } else {
-      fiat_p256_point_double(p_pre_comp[j][0], p_pre_comp[j][1],
-                             p_pre_comp[j][2], p_pre_comp[j / 2][0],
-                             p_pre_comp[j / 2][1], p_pre_comp[j / 2][2]);
-    }
-  }
+  fiat_p256_felem res[3], tmp[3];
+  fiat_p256_from_generic(tmp[0], &p->X);
+  fiat_p256_from_generic(tmp[1], &p->Y);
+  fiat_p256_from_generic(tmp[2], &p->Z);
 
-  // Set nq to the point at infinity.
-  fiat_p256_felem nq[3] = {{0}, {0}, {0}}, ftmp, tmp[3];
+  ec_nistp_scalar_mul(p256_methods(), res[0], res[1], res[2], tmp[0], tmp[1], tmp[2], scalar);
 
-  // Loop over |scalar| msb-to-lsb, incorporating |p_pre_comp| every 5th round.
-  int skip = 1;  // Save two point operations in the first round.
-  for (size_t i = 255; i < 256; i--) {
-    // double
-    if (!skip) {
-      fiat_p256_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
-    }
-
-    // do other additions every 5 doublings
-    if (i % 5 == 0) {
-      crypto_word_t bits = fiat_p256_get_bit(scalar, i + 4) << 5;
-      bits |= fiat_p256_get_bit(scalar, i + 3) << 4;
-      bits |= fiat_p256_get_bit(scalar, i + 2) << 3;
-      bits |= fiat_p256_get_bit(scalar, i + 1) << 2;
-      bits |= fiat_p256_get_bit(scalar, i) << 1;
-      bits |= fiat_p256_get_bit(scalar, i - 1);
-      crypto_word_t sign, digit;
-      ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
-
-      // select the point to add or subtract, in constant time.
-      fiat_p256_select_point((fiat_p256_limb_t)digit, 17,
-                             (const fiat_p256_felem(*)[3])p_pre_comp, tmp);
-      fiat_p256_opp(ftmp, tmp[1]);  // (X, -Y, Z) is the negative point.
-      fiat_p256_cmovznz(tmp[1], (fiat_p256_limb_t)sign, tmp[1], ftmp);
-
-      if (!skip) {
-        fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2],
-                            0 /* mixed */, tmp[0], tmp[1], tmp[2]);
-      } else {
-        fiat_p256_copy(nq[0], tmp[0]);
-        fiat_p256_copy(nq[1], tmp[1]);
-        fiat_p256_copy(nq[2], tmp[2]);
-        skip = 0;
-      }
-    }
-  }
-
-  fiat_p256_to_generic(&r->X, nq[0]);
-  fiat_p256_to_generic(&r->Y, nq[1]);
-  fiat_p256_to_generic(&r->Z, nq[2]);
+  fiat_p256_to_generic(&r->X, res[0]);
+  fiat_p256_to_generic(&r->Y, res[1]);
+  fiat_p256_to_generic(&r->Z, res[2]);
 }
 
 static void ec_GFp_nistp256_point_mul_base(const EC_GROUP *group,
diff --git a/crypto/fipsmodule/ec/p384.c b/crypto/fipsmodule/ec/p384.c
index bcd3c5ba4e..f11bee777b 100644
--- a/crypto/fipsmodule/ec/p384.c
+++ b/crypto/fipsmodule/ec/p384.c
@@ -273,10 +273,12 @@ static void p384_point_add(p384_felem x3, p384_felem y3, p384_felem z3,
 #if defined(EC_NISTP_USE_S2N_BIGNUM)
 DEFINE_METHOD_FUNCTION(ec_nistp_meth, p384_methods) {
     out->felem_num_limbs = P384_NLIMBS;
+    out->felem_num_bits = 384;
     out->felem_add = bignum_add_p384;
     out->felem_sub = bignum_sub_p384;
     out->felem_mul = bignum_montmul_p384_selector;
     out->felem_sqr = bignum_montsqr_p384_selector;
+    out->felem_neg = bignum_neg_p384;
     out->felem_nz  = p384_felem_nz;
     out->point_dbl = p384_point_double;
     out->point_add = p384_point_add;
@@ -284,10 +286,12 @@ DEFINE_METHOD_FUNCTION(ec_nistp_meth, p384_methods) {
 #else
 DEFINE_METHOD_FUNCTION(ec_nistp_meth, p384_methods) {
     out->felem_num_limbs = P384_NLIMBS;
+    out->felem_num_bits = 384;
     out->felem_add = fiat_p384_add;
     out->felem_sub = fiat_p384_sub;
     out->felem_mul = fiat_p384_mul;
     out->felem_sqr = fiat_p384_square;
+    out->felem_neg = fiat_p384_opp;
     out->felem_nz  = p384_felem_nz;
     out->point_dbl = p384_point_double;
     out->point_add = p384_point_add;
@@ -490,24 +494,6 @@ OPENSSL_STATIC_ASSERT(P384_MUL_WSIZE == 5,
 #define P384_MUL_TABLE_SIZE     (P384_MUL_TWO_TO_WSIZE >> 1)
 #define P384_MUL_PUB_TABLE_SIZE (1 << (P384_MUL_PUB_WSIZE - 1))
 
-OPENSSL_STATIC_ASSERT(P384_MUL_TABLE_SIZE <= SCALAR_MUL_TABLE_NUM_POINTS,
-        p384_table_size_larger_than_ec_nistp_supports)
-
-// p384_select_point selects the |idx|-th projective point from the given
-// precomputed table and copies it to |out| in constant time.
-static void p384_select_point(p384_felem out[3],
-                              size_t idx,
-                              p384_felem table[][3],
-                              size_t table_size) {
-  OPENSSL_memset(out, 0, sizeof(p384_felem) * 3);
-  for (size_t i = 0; i < table_size; i++) {
-    p384_limb_t mismatch = i ^ idx;
-    p384_felem_cmovznz(out[0], mismatch, table[i][0], out[0]);
-    p384_felem_cmovznz(out[1], mismatch, table[i][1], out[1]);
-    p384_felem_cmovznz(out[2], mismatch, table[i][2], out[2]);
-  }
-}
-
 // p384_select_point_affine selects the |idx|-th affine point from
 // the given precomputed table and copies it to |out| in constant-time.
 static void p384_select_point_affine(p384_felem out[2],
@@ -522,105 +508,19 @@ static void p384_select_point_affine(p384_felem out[2],
   }
 }
 
-// Multiplication of a point by a scalar, r = [scalar]P.
-// The product is computed with the use of a small table generated on-the-fly
-// and the scalar recoded in the regular-wNAF representation.
-//
-// The precomputed (on-the-fly) table |p_pre_comp| holds 16 odd multiples of P:
-//     [2i + 1]P for i in [0, 15].
-// Computing the negation of a point P = (x, y) is relatively easy:
-//     -P = (x, -y).
-// So we may assume that instead of the above-mentioned 64, we have 128 points:
-//     [\pm 1]P, [\pm 3]P, [\pm 5]P, ..., [\pm 31]P.
-//
-// The 384-bit scalar is recoded (regular-wNAF encoding) into 77 signed digits
-// each of length 5 bits, as explained in the |p384_felem_mul_scalar_rwnaf|
-// function. Namely,
-//     scalar' = s_0 + s_1*2^5 + s_2*2^10 + ... + s_76*2^380,
-// where digits s_i are in [\pm 1, \pm 3, ..., \pm 31]. Note that for an odd
-// scalar we have that scalar = scalar', while in the case of an even
-// scalar we have that scalar = scalar' - 1.
-//
-// The required product, [scalar]P, is computed by the following algorithm.
-//     1. Initialize the accumulator with the point from |p_pre_comp|
-//        corresponding to the most significant digit s_76 of the scalar.
-//     2. For digits s_i starting from s_75 down to s_0:
-//     3.   Double the accumulator 5 times. (note that doubling a point [a]P
-//          seven times results in [2^5*a]P).
-//     4.   Read from |p_pre_comp| the point corresponding to abs(s_i),
-//          negate it if s_i is negative, and add it to the accumulator.
-//
-// Note: this function is constant-time.
+// Multiplication of an arbitrary point by a scalar, r = [scalar]P.
 static void ec_GFp_nistp384_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
                                       const EC_JACOBIAN *p,
                                       const EC_SCALAR *scalar) {
 
-  p384_felem res[3] = {{0}, {0}, {0}}, tmp[3] = {{0}, {0}, {0}}, ftmp;
+  p384_felem res[3] = {{0}, {0}, {0}}, tmp[3] = {{0}, {0}, {0}};
 
-  // Table of multiples of P:  [2i + 1]P for i in [0, 15].
-  p384_felem p_pre_comp[P384_MUL_TABLE_SIZE][3];
-
-  // Set the first point in the table to P.
   p384_from_generic(tmp[0], &p->X);
   p384_from_generic(tmp[1], &p->Y);
   p384_from_generic(tmp[2], &p->Z);
 
-  assert(sizeof(p_pre_comp) == (P384_MUL_TABLE_SIZE * 3 * sizeof(p384_felem)));
-  generate_table(p384_methods(), (ec_nistp_felem_limb*)p_pre_comp, tmp[0], tmp[1], tmp[2]);
+  ec_nistp_scalar_mul(p384_methods(), res[0], res[1], res[2], tmp[0], tmp[1], tmp[2], scalar);
 
-  // Recode the scalar.
-  int16_t rnaf[P384_MUL_NWINDOWS] = {0};
-  scalar_rwnaf(rnaf, P384_MUL_WSIZE, scalar, 384);
-
-  // Initialize the accumulator |res| with the table entry corresponding to
-  // the most significant digit of the recoded scalar (note that this digit
-  // can't be negative).
-  int16_t idx = rnaf[P384_MUL_NWINDOWS - 1] >> 1;
-  p384_select_point(res, idx, p_pre_comp, P384_MUL_TABLE_SIZE);
-
-  // Process the remaining digits of the scalar.
-  for (int i = P384_MUL_NWINDOWS - 2; i >= 0; i--) {
-    // Double |res| 5 times in each iteration.
-    for (size_t j = 0; j < P384_MUL_WSIZE; j++) {
-      p384_point_double(res[0], res[1], res[2], res[0], res[1], res[2]);
-    }
-
-    int16_t d = rnaf[i];
-    // is_neg = (d < 0) ? 1 : 0
-    int16_t is_neg = (d >> 15) & 1;
-    // d = abs(d)
-    d = (d ^ -is_neg) + is_neg;
-
-    idx = d >> 1;
-
-    // Select the point to add, in constant time.
-    p384_select_point(tmp, idx, p_pre_comp, P384_MUL_TABLE_SIZE);
-
-    // Negate y coordinate of the point tmp = (x, y); ftmp = -y.
-    p384_felem_opp(ftmp, tmp[1]);
-    // Conditionally select y or -y depending on the sign of the digit |d|.
-    p384_felem_cmovznz(tmp[1], is_neg, tmp[1], ftmp);
-
-    // Add the point to the accumulator |res|.
-    p384_point_add(res[0], res[1], res[2], res[0], res[1], res[2],
-                   0 /* both Jacobian */, tmp[0], tmp[1], tmp[2]);
-
-  }
-
-  // Conditionally subtract P if the scalar is even, in constant-time.
-  // First, compute |tmp| = |res| + (-P).
-  p384_felem_copy(tmp[0], p_pre_comp[0][0]);
-  p384_felem_opp(tmp[1], p_pre_comp[0][1]);
-  p384_felem_copy(tmp[2], p_pre_comp[0][2]);
-  p384_point_add(tmp[0], tmp[1], tmp[2], res[0], res[1], res[2],
-                 0 /* both Jacobian */, tmp[0], tmp[1], tmp[2]);
-
-  // Select |res| or |tmp| based on the |scalar| parity, in constant-time.
-  p384_felem_cmovznz(res[0], scalar->words[0] & 1, tmp[0], res[0]);
-  p384_felem_cmovznz(res[1], scalar->words[0] & 1, tmp[1], res[1]);
-  p384_felem_cmovznz(res[2], scalar->words[0] & 1, tmp[2], res[2]);
-
-  // Copy the result to the output.
   p384_to_generic(&r->X, res[0]);
   p384_to_generic(&r->Y, res[1]);
   p384_to_generic(&r->Z, res[2]);
diff --git a/crypto/fipsmodule/ec/p521.c b/crypto/fipsmodule/ec/p521.c
index 3833e04954..b1ed65dc7b 100644
--- a/crypto/fipsmodule/ec/p521.c
+++ b/crypto/fipsmodule/ec/p521.c
@@ -291,10 +291,12 @@ static void p521_point_add(p521_felem x3, p521_felem y3, p521_felem z3,
 #if defined(EC_NISTP_USE_S2N_BIGNUM)
 DEFINE_METHOD_FUNCTION(ec_nistp_meth, p521_methods) {
     out->felem_num_limbs = P521_NLIMBS;
+    out->felem_num_bits = 521;
     out->felem_add = bignum_add_p521;
     out->felem_sub = bignum_sub_p521;
     out->felem_mul = bignum_mul_p521_selector;
     out->felem_sqr = bignum_sqr_p521_selector;
+    out->felem_neg = bignum_neg_p521;
     out->felem_nz  = p521_felem_nz;
     out->point_dbl = p521_point_double;
     out->point_add = p521_point_add;
@@ -302,10 +304,12 @@ DEFINE_METHOD_FUNCTION(ec_nistp_meth, p521_methods) {
 #else
 DEFINE_METHOD_FUNCTION(ec_nistp_meth, p521_methods) {
     out->felem_num_limbs = P521_NLIMBS;
+    out->felem_num_bits = 521;
     out->felem_add = fiat_secp521r1_carry_add;
     out->felem_sub = fiat_secp521r1_carry_sub;
     out->felem_mul = fiat_secp521r1_carry_mul;
     out->felem_sqr = fiat_secp521r1_carry_square;
+    out->felem_neg = fiat_secp521r1_carry_opp;
     out->felem_nz  = p521_felem_nz;
     out->point_dbl = p521_point_double;
     out->point_add = p521_point_add;
@@ -427,24 +431,6 @@ OPENSSL_STATIC_ASSERT(P521_MUL_WSIZE == 5,
 #define P521_MUL_TABLE_SIZE     (P521_MUL_TWO_TO_WSIZE >> 1)
 #define P521_MUL_PUB_TABLE_SIZE (1 << (P521_MUL_PUB_WSIZE - 1))
 
-OPENSSL_STATIC_ASSERT(P521_MUL_TABLE_SIZE <= SCALAR_MUL_TABLE_NUM_POINTS,
-        p521_table_size_larger_than_ec_nistp_supports)
-
-// p521_select_point selects the |idx|-th projective point from the given
-// precomputed table and copies it to |out| in constant time.
-static void p521_select_point(p521_felem out[3],
-                              size_t idx,
-                              p521_felem table[][3],
-                              size_t table_size) {
-  OPENSSL_memset(out, 0, sizeof(p521_felem) * 3);
-  for (size_t i = 0; i < table_size; i++) {
-    p521_limb_t mismatch = i ^ idx;
-    p521_felem_cmovznz(out[0], mismatch, table[i][0], out[0]);
-    p521_felem_cmovznz(out[1], mismatch, table[i][1], out[1]);
-    p521_felem_cmovznz(out[2], mismatch, table[i][2], out[2]);
-  }
-}
-
 // p521_select_point_affine selects the |idx|-th affine point from
 // the given precomputed table and copies it to |out| in constant-time.
 static void p521_select_point_affine(p521_felem out[2],
@@ -459,105 +445,19 @@ static void p521_select_point_affine(p521_felem out[2],
   }
 }
 
-// Multiplication of a point by a scalar, r = [scalar]P.
-// The product is computed with the use of a small table generated on-the-fly
-// and the scalar recoded in the regular-wNAF representation.
-//
-// The precomputed (on-the-fly) table |p_pre_comp| holds 16 odd multiples of P:
-//     [2i + 1]P for i in [0, 15].
-// Computing the negation of a point P = (x, y) is relatively easy:
-//     -P = (x, -y).
-// So we may assume that instead of the above-mentioned 16, we have 32 points:
-//     [\pm 1]P, [\pm 3]P, [\pm 5]P, ..., [\pm 31]P.
-//
-// The 521-bit scalar is recoded (regular-wNAF encoding) into 105 signed digits
-// each of length 5 bits, as explained in the |p521_felem_mul_scalar_rwnaf|
-// function. Namely,
-//     scalar' = s_0 + s_1*2^5 + s_2*2^10 + ... + s_104*2^520,
-// where digits s_i are in [\pm 1, \pm 3, ..., \pm 31]. Note that for an odd
-// scalar we have that scalar = scalar', while in the case of an even
-// scalar we have that scalar = scalar' - 1.
-//
-// The required product, [scalar]P, is computed by the following algorithm.
-//     1. Initialize the accumulator with the point from |p_pre_comp|
-//        corresponding to the most significant digit s_104 of the scalar.
-//     2. For digits s_i starting from s_104 down to s_0:
-//     3.   Double the accumulator 5 times. (note that doubling a point [a]P
-//          seven times results in [2^5*a]P).
-//     4.   Read from |p_pre_comp| the point corresponding to abs(s_i),
-//          negate it if s_i is negative, and add it to the accumulator.
-//
-// Note: this function is constant-time.
+// Multiplication of an arbitrary point by a scalar, r = [scalar]P.
 static void ec_GFp_nistp521_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
                                       const EC_JACOBIAN *p,
                                       const EC_SCALAR *scalar) {
 
-  p521_felem res[3] = {{0}, {0}, {0}}, tmp[3] = {{0}, {0}, {0}}, ftmp;
+  p521_felem res[3] = {{0}, {0}, {0}}, tmp[3] = {{0}, {0}, {0}};
 
-  // Table of multiples of P:  [2i + 1]P for i in [0, 15].
-  p521_felem p_pre_comp[P521_MUL_TABLE_SIZE][3];
-
-  // Set the first point in the table to P.
   p521_from_generic(tmp[0], &p->X);
   p521_from_generic(tmp[1], &p->Y);
   p521_from_generic(tmp[2], &p->Z);
 
-  assert(sizeof(p_pre_comp) == (P521_MUL_TABLE_SIZE * 3 * sizeof(p521_felem)));
-  generate_table(p521_methods(), (ec_nistp_felem_limb*)p_pre_comp, tmp[0], tmp[1], tmp[2]);
+  ec_nistp_scalar_mul(p521_methods(), res[0], res[1], res[2], tmp[0], tmp[1], tmp[2], scalar);
 
-  // Recode the scalar.
-  int16_t rnaf[P521_MUL_NWINDOWS] = {0};
-  scalar_rwnaf(rnaf, P521_MUL_WSIZE, scalar, 521);
-
-  // Initialize the accumulator |res| with the table entry corresponding to
-  // the most significant digit of the recoded scalar (note that this digit
-  // can't be negative).
-  int16_t idx = rnaf[P521_MUL_NWINDOWS - 1] >> 1;
-  p521_select_point(res, idx, p_pre_comp, P521_MUL_TABLE_SIZE);
-
-  // Process the remaining digits of the scalar.
-  for (int i = P521_MUL_NWINDOWS - 2; i >= 0; i--) {
-    // Double |res| 7 times in each iteration.
-    for (size_t j = 0; j < P521_MUL_WSIZE; j++) {
-      p521_point_double(res[0], res[1], res[2], res[0], res[1], res[2]);
-    }
-
-    int16_t d = rnaf[i];
-    // is_neg = (d < 0) ? 1 : 0
-    int16_t is_neg = (d >> 15) & 1;
-    // d = abs(d)
-    d = (d ^ -is_neg) + is_neg;
-
-    idx = d >> 1;
-
-    // Select the point to add, in constant time.
-    p521_select_point(tmp, idx, p_pre_comp, P521_MUL_TABLE_SIZE);
-
-    // Negate y coordinate of the point tmp = (x, y); ftmp = -y.
-    p521_felem_opp(ftmp, tmp[1]);
-    // Conditionally select y or -y depending on the sign of the digit |d|.
-    p521_felem_cmovznz(tmp[1], is_neg, tmp[1], ftmp);
-
-    // Add the point to the accumulator |res|.
-    p521_point_add(res[0], res[1], res[2], res[0], res[1], res[2],
-                   0 /* both Jacobian */, tmp[0], tmp[1], tmp[2]);
-
-  }
-
-  // Conditionally subtract P if the scalar is even, in constant-time.
-  // First, compute |tmp| = |res| + (-P).
-  p521_felem_copy(tmp[0], p_pre_comp[0][0]);
-  p521_felem_opp(tmp[1], p_pre_comp[0][1]);
-  p521_felem_copy(tmp[2], p_pre_comp[0][2]);
-  p521_point_add(tmp[0], tmp[1], tmp[2], res[0], res[1], res[2],
-                 0 /* both Jacobian */, tmp[0], tmp[1], tmp[2]);
-
-  // Select |res| or |tmp| based on the |scalar| parity, in constant-time.
-  p521_felem_cmovznz(res[0], scalar->words[0] & 1, tmp[0], res[0]);
-  p521_felem_cmovznz(res[1], scalar->words[0] & 1, tmp[1], res[1]);
-  p521_felem_cmovznz(res[2], scalar->words[0] & 1, tmp[2], res[2]);
-
-  // Copy the result to the output.
   p521_to_generic(&r->X, res[0]);
   p521_to_generic(&r->Y, res[1]);
   p521_to_generic(&r->Z, res[2]);
diff --git a/crypto/internal.h b/crypto/internal.h
index ba08e232f5..1546549714 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -466,17 +466,32 @@ static inline void constant_time_select_array_w(
   }
 }
 
+static inline void constant_time_select_array_8(uint8_t *c, const uint8_t *a,
+        const uint8_t *b, uint8_t mask, size_t len) {  // |a|, |b| are read-only
+  for (size_t i = 0; i < len; i++) {  // |c| may alias |a| or |b| exactly
+    c[i] = constant_time_select_8(mask, a[i], b[i]);  // byte-wise select by mask
+  }
+}
+
 // constant_time_select_entry_from_table_w selects the idx-th entry from table.
 static inline void constant_time_select_entry_from_table_w(
         crypto_word_t *out, crypto_word_t *table,
-        size_t idx, size_t num_entries, size_t entry_size)
-{
+        size_t idx, size_t num_entries, size_t entry_size) {
   for (size_t i = 0; i < num_entries; i++) {
     crypto_word_t mask = constant_time_eq_w(i, idx);
     constant_time_select_array_w(out, &table[i * entry_size], out, mask, entry_size);
   }
 }
 
+// Byte-wise counterpart of constant_time_select_entry_from_table_w.
+static inline void constant_time_select_entry_from_table_8(
+    uint8_t *out, uint8_t *table, size_t idx, size_t num_entries, size_t entry_size) {
+  for (size_t k = 0; k < num_entries; k++) {
+    const uint8_t hit = (uint8_t)(constant_time_eq_w(k, idx));
+    constant_time_select_array_8(out, table + k * entry_size, out, hit, entry_size);
+  }
+}
+
 #if defined(BORINGSSL_CONSTANT_TIME_VALIDATION)
 
 // CONSTTIME_SECRET takes a pointer and a number of bytes and marks that region