Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use jemalloc for aligned allocation #165

Merged
merged 2 commits into from
Jun 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion requirements/conda-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
black
compilers
compilers
cython
flake8
git_root
ipdb
ipython
jemalloc
jupyterlab
libblas=*=*mkl
line_profiler
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
Extension(
name="glm_benchmarks.matrix.sandwich.sandwich",
sources=["src/glm_benchmarks/matrix/sandwich/sandwich.pyx"],
libraries=["jemalloc"],
**extension_args,
),
Extension(
Expand Down
45 changes: 34 additions & 11 deletions src/glm_benchmarks/matrix/sandwich/dense-tmpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,29 @@
//
//
// Also included is a csr_dense_sandwich function for computing the
// off-diagonal blocks when using a dense-sparse split matrix.
#include "xsimd/xsimd.hpp"
// off-diagonal blocks when using a dense-sparse split matrix.

#define JEMALLOC_NO_DEMANGLE
#if __APPLE__
#define JEMALLOC_NO_RENAME
#endif
#include <jemalloc/jemalloc.h>
#include <xsimd/xsimd.hpp>
#include <iostream>
#include <omp.h>

namespace xs = xsimd;

std::size_t round_to_align(std::size_t size, std::size_t alignment) {
std::size_t remainder = size % alignment;

if (remainder == 0) {
return size;
} else {
return size + alignment - remainder;
}
}

<%def name="middle_j(kparallel, IBLOCK, JBLOCK, KBLOCKS)">
int jmaxblock = jmin + ((jmaxinner - jmin) / ${JBLOCK}) * ${JBLOCK};
for (; j < jmaxblock; j += ${JBLOCK}) {
Expand Down Expand Up @@ -237,15 +254,17 @@ void _dense${order}_sandwich(F* X, F* d, F* out,
int m, int n, int thresh1d, int kratio, int innerblock)
{
constexpr std::size_t simd_size = xsimd::simd_type<F>::size;
constexpr auto alignment = std::align_val_t{simd_size*sizeof(F)};
constexpr auto alignment = simd_size * sizeof(F);

bool kparallel = (n / (kratio*thresh1d)) > (m / thresh1d);
size_t Rsize = thresh1d*thresh1d*kratio*kratio;
if (kparallel) {
Rsize *= omp_get_max_threads();
}
auto Rglobal = new (alignment) F[Rsize];
auto Lglobal = new (alignment) F[omp_get_max_threads()*thresh1d*thresh1d*kratio];
std::size_t Rglobal_size = round_to_align(Rsize * sizeof(F), alignment);
F* Rglobal = static_cast<F*>(je_aligned_alloc(alignment, Rglobal_size));
std::size_t Lglobal_size = round_to_align(omp_get_max_threads() * thresh1d * thresh1d * kratio * sizeof(F), alignment);
F* Lglobal = static_cast<F*>(je_aligned_alloc(alignment, Lglobal_size));
for (int j = 0; j < m; j+=kratio*thresh1d) {
int jmax2 = j + kratio*thresh1d;
if (jmax2 > m) {
Expand All @@ -257,8 +276,8 @@ void _dense${order}_sandwich(F* X, F* d, F* out,
${k_loop(False, order)}
}
}
::operator delete(Lglobal, alignment);
::operator delete(Rglobal, alignment);
je_sdallocx(Lglobal, Lglobal_size, 0);
je_sdallocx(Rglobal, Rglobal_size, 0);

#pragma omp parallel if(m > 100)
for (int i = 0; i < m; i++) {
Expand All @@ -281,16 +300,18 @@ void _csr_dense${order}_sandwich(
int m, int n, int r)
{
constexpr int simd_size = xsimd::simd_type<F>::size;
constexpr auto alignment = std::align_val_t{simd_size*sizeof(F)};
constexpr auto alignment = simd_size*sizeof(F);

int kblock = 128;
int jblock = 128;
F* Rglobal = new (alignment) F[omp_get_max_threads()*kblock*jblock];
std::size_t Rglobal_size = round_to_align(omp_get_max_threads() * kblock * jblock * sizeof(F), alignment);
F* Rglobal = static_cast<F*>(je_aligned_alloc(alignment, Rglobal_size));

#pragma omp parallel
{
int r2 = ceil(((float)r) / ((float)simd_size)) * simd_size;
F* outtemp = new (alignment) F[m*r2];
std::size_t outtemp_size = round_to_align(m * r2 * sizeof(F), alignment);
F* outtemp = static_cast<F*>(je_aligned_alloc(alignment, outtemp_size));
for (int i = 0; i < m; i++) {
for (int j = 0; j < r; j++) {
outtemp[i*r2+j] = 0.0;
Expand Down Expand Up @@ -350,8 +371,10 @@ void _csr_dense${order}_sandwich(
out[i*r+j] += outtemp[i*r2+j];
}
}
::operator delete(outtemp, alignment);
je_sdallocx(outtemp, outtemp_size, 0);
}

je_sdallocx(Rglobal, Rglobal_size, 0);
}
</%def>

Expand Down