Quantco · tbenthompson · Jun 4, 2020 · Jun 3, 2020 · Jun 4, 2020
diff --git a/requirements/conda-requirements.txt b/requirements/conda-requirements.txt
@@ -1,10 +1,11 @@
 black
-compilers 
+compilers
 cython
 flake8
 git_root
 ipdb
 ipython
+jemalloc
 jupyterlab
 libblas=*=*mkl
 line_profiler

diff --git a/setup.py b/setup.py
@@ -53,6 +53,7 @@
     Extension(
         name="glm_benchmarks.matrix.sandwich.sandwich",
         sources=["src/glm_benchmarks/matrix/sandwich/sandwich.pyx"],
+        libraries=["jemalloc"],
         **extension_args,
     ),
     Extension(

diff --git a/src/glm_benchmarks/matrix/sandwich/dense-tmpl.cpp b/src/glm_benchmarks/matrix/sandwich/dense-tmpl.cpp
@@ -20,12 +20,29 @@
 //
 //
 // Also included is a csr_dense_sandwich function for computing the
-// off-diagonal blocks when using a dense-sparse split matrix. 
-#include "xsimd/xsimd.hpp"
+// off-diagonal blocks when using a dense-sparse split matrix.
+
+#define JEMALLOC_NO_DEMANGLE
+#if __APPLE__
+#define JEMALLOC_NO_RENAME
+#endif
+#include <jemalloc/jemalloc.h>
+#include <xsimd/xsimd.hpp>
 #include <iostream>
 #include <omp.h>
+
 namespace xs = xsimd;
 
+std::size_t round_to_align(std::size_t size, std::size_t alignment) {
+  std::size_t remainder = size % alignment;
+
+  if (remainder == 0) {
+    return size;
+  } else {
+    return size + alignment - remainder;
+  }
+}
+
 <%def name="middle_j(kparallel, IBLOCK, JBLOCK, KBLOCKS)">
     int jmaxblock = jmin + ((jmaxinner - jmin) / ${JBLOCK}) * ${JBLOCK};
     for (; j < jmaxblock; j += ${JBLOCK}) {
@@ -237,15 +254,17 @@ void _dense${order}_sandwich(F* X, F* d, F* out,
         int m, int n, int thresh1d, int kratio, int innerblock) 
 {
     constexpr std::size_t simd_size = xsimd::simd_type<F>::size;
-    constexpr auto alignment = std::align_val_t{simd_size*sizeof(F)};
+    constexpr auto alignment = simd_size * sizeof(F);
 
     bool kparallel = (n / (kratio*thresh1d)) > (m / thresh1d);
     size_t Rsize = thresh1d*thresh1d*kratio*kratio;
     if (kparallel) {
         Rsize *= omp_get_max_threads();
     }
-    auto Rglobal = new (alignment) F[Rsize];
-    auto Lglobal = new (alignment) F[omp_get_max_threads()*thresh1d*thresh1d*kratio];
+    std::size_t Rglobal_size = round_to_align(Rsize * sizeof(F), alignment);
+    F* Rglobal = static_cast<F*>(je_aligned_alloc(alignment, Rglobal_size));
+    std::size_t Lglobal_size = round_to_align(omp_get_max_threads() * thresh1d * thresh1d * kratio * sizeof(F), alignment);
+    F* Lglobal = static_cast<F*>(je_aligned_alloc(alignment, Lglobal_size));
     for (int j = 0; j < m; j+=kratio*thresh1d) {
         int jmax2 = j + kratio*thresh1d; 
         if (jmax2 > m) {
@@ -257,8 +276,8 @@ void _dense${order}_sandwich(F* X, F* d, F* out,
             ${k_loop(False, order)}
         }
     }
-    ::operator delete(Lglobal, alignment);
-    ::operator delete(Rglobal, alignment);
+    je_sdallocx(Lglobal, Lglobal_size, 0);
+    je_sdallocx(Rglobal, Rglobal_size, 0);
 
     #pragma omp parallel if(m > 100)
     for (int i = 0; i < m; i++) {
@@ -281,16 +300,18 @@ void _csr_dense${order}_sandwich(
     int m, int n, int r) 
 {
     constexpr int simd_size = xsimd::simd_type<F>::size;
-    constexpr auto alignment = std::align_val_t{simd_size*sizeof(F)};
+    constexpr auto alignment = simd_size*sizeof(F);
 
     int kblock = 128;
     int jblock = 128;
-    F* Rglobal = new (alignment) F[omp_get_max_threads()*kblock*jblock];
+    std::size_t Rglobal_size = round_to_align(omp_get_max_threads() * kblock * jblock * sizeof(F), alignment);
+    F* Rglobal = static_cast<F*>(je_aligned_alloc(alignment, Rglobal_size));
 
     #pragma omp parallel
     {
         int r2 = ceil(((float)r) / ((float)simd_size)) * simd_size;
-        F* outtemp = new (alignment) F[m*r2];
+        std::size_t outtemp_size = round_to_align(m * r2 * sizeof(F), alignment);
+        F* outtemp = static_cast<F*>(je_aligned_alloc(alignment, outtemp_size));
         for (int i = 0; i < m; i++) {
             for (int j = 0; j < r; j++) {
                 outtemp[i*r2+j] = 0.0;
@@ -350,8 +371,10 @@ void _csr_dense${order}_sandwich(
                 out[i*r+j] += outtemp[i*r2+j];
             }
         }
-        ::operator delete(outtemp, alignment);
+        je_sdallocx(outtemp, outtemp_size, 0);
     }
+
+    je_sdallocx(Rglobal, Rglobal_size, 0);
 }
 </%def>