From be706cc02d0f38c020029d4b9cd00d9d03433dbf Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Mon, 18 Dec 2023 10:24:23 +0100
Subject: [PATCH 01/32] Fix misspelled "strobealign"

---
 src/cmdline.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/cmdline.cpp b/src/cmdline.cpp
index 48ce9e06..c00c5479 100644
--- a/src/cmdline.cpp
+++ b/src/cmdline.cpp
@@ -8,7 +8,7 @@ class Version {};
 
 CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
 
-    args::ArgumentParser parser("strobelign " + version_string());
+    args::ArgumentParser parser("strobealign " + version_string());
     parser.helpParams.showTerminator = false;
     parser.helpParams.helpindent = 20;
     parser.helpParams.width = 90;

From 32fb1486a1c2f4c92122e219d386397c96f04827 Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Fri, 19 Jan 2024 13:58:53 +0100
Subject: [PATCH 02/32] Fix lost mapping-only (PAF) accuracy

Introduced by buggy refactor in 1fe3e341b829cb6c9697d88dffec29c8bcb3a6ec.

NamPair.score is not actually the sum of the scores of its constituent NAMs,
but the sum of the n_hits.
---
 src/aln.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/aln.cpp b/src/aln.cpp
index 082e4285..bdb70929 100644
--- a/src/aln.cpp
+++ b/src/aln.cpp
@@ -883,7 +883,7 @@ inline void get_best_map_location(
     Nam n1_joint_max, n2_joint_max;
     for (auto &[score, nam1, nam2] : nam_pairs) { // already sorted by descending score
         if (nam1.ref_start >= 0 && nam2.ref_start >=0) { // Valid pair
-            score_joint = score;
+            score_joint = nam1.score + nam2.score;
             n1_joint_max = nam1;
             n2_joint_max = nam2;
             break;

From b20253a6668e2095fc3a0b2b160ad8b4f9a09712 Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Fri, 19 Jan 2024 14:01:41 +0100
Subject: [PATCH 03/32] Rename score attribute of NamPair to n_hits

To make it clearer what it represents.
---
 src/aln.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/aln.cpp b/src/aln.cpp
index bdb70929..4322a1d5 100644
--- a/src/aln.cpp
+++ b/src/aln.cpp
@@ -15,7 +15,7 @@ using namespace klibpp;
 namespace {
 
 struct NamPair {
-    int score;
+    int n_hits;
     Nam nam1;
     Nam nam2;
 };
@@ -415,7 +415,7 @@ inline std::vector<NamPair> get_best_scoring_nam_pairs(
     std::sort(
         nam_pairs.begin(),
         nam_pairs.end(),
-        [](const NamPair& a, const NamPair& b) -> bool { return a.score > b.score; }
+        [](const NamPair& a, const NamPair& b) -> bool { return a.n_hits > b.n_hits; }
     ); // Sort by highest score first
 
     return nam_pairs;
@@ -778,7 +778,7 @@ std::vector<ScoredAlignmentPair> align_paired(
 
     // Turn pairs of high-scoring NAMs into pairs of alignments
     std::vector<ScoredAlignmentPair> high_scores;
-    auto max_score = nam_pairs[0].score;
+    auto max_score = nam_pairs[0].n_hits;
     for (auto &[score_, n1, n2] : nam_pairs) {
         float score_dropoff = (float) score_ / max_score;
 

From df612a078a713b518812592c64fad9a4a1a9d444 Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Fri, 19 Jan 2024 16:24:53 +0100
Subject: [PATCH 04/32] Fix wrong insert size computation in is_proper_nam_pair

---
 src/aln.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/aln.cpp b/src/aln.cpp
index 4322a1d5..4dec133a 100644
--- a/src/aln.cpp
+++ b/src/aln.cpp
@@ -333,7 +333,7 @@ bool is_proper_nam_pair(const Nam nam1, const Nam nam2, float mu, float sigma) {
     if (nam1.ref_id != nam2.ref_id || nam1.is_rc == nam2.is_rc) {
         return false;
     }
-    int a = std::max(0, nam1.ref_start - nam2.query_start);
+    int a = std::max(0, nam1.ref_start - nam1.query_start);
     int b = std::max(0, nam2.ref_start - nam2.query_start);
 
     // r1 ---> <---- r2

From d8a064c0cd3aacbb2fb6d2cb79c649024632b98f Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Fri, 19 Jan 2024 16:46:37 +0100
Subject: [PATCH 05/32] Update baseline commit

---
 tests/baseline-commit.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/baseline-commit.txt b/tests/baseline-commit.txt
index b2a2c68f..94995525 100644
--- a/tests/baseline-commit.txt
+++ b/tests/baseline-commit.txt
@@ -1 +1 @@
-baseline_commit=54f9fe4266ae0ab7843ee7fb70ddfbc2d95dc729
+baseline_commit=df612a078a713b518812592c64fad9a4a1a9d444

From 0ced9903276834e6b9bfe095a255952f0616d330 Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Fri, 19 Jan 2024 16:52:52 +0100
Subject: [PATCH 06/32] Add Nam::projected_ref_start()

---
 src/aln.cpp | 14 +++++++-------
 src/nam.hpp |  4 ++++
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/aln.cpp b/src/aln.cpp
index 4dec133a..b5105bb1 100644
--- a/src/aln.cpp
+++ b/src/aln.cpp
@@ -215,7 +215,7 @@ inline Alignment extend_seed(
     const std::string query = nam.is_rc ? read.rc : read.seq;
     const std::string& ref = references.sequences[nam.ref_id];
 
-    const auto projected_ref_start = std::max(0, nam.ref_start - nam.query_start);
+    const auto projected_ref_start = nam.projected_ref_start();
     const auto projected_ref_end = std::min(nam.ref_end + query.size() - nam.query_end, ref.size());
 
     AlignmentInfo info;
@@ -333,14 +333,14 @@ bool is_proper_nam_pair(const Nam nam1, const Nam nam2, float mu, float sigma) {
     if (nam1.ref_id != nam2.ref_id || nam1.is_rc == nam2.is_rc) {
         return false;
     }
-    int a = std::max(0, nam1.ref_start - nam1.query_start);
-    int b = std::max(0, nam2.ref_start - nam2.query_start);
+    int r1_ref_start = nam1.projected_ref_start();
+    int r2_ref_start = nam2.projected_ref_start();
 
     // r1 ---> <---- r2
-    bool r1_r2 = nam2.is_rc && (a <= b) && (b - a < mu + 10*sigma);
+    bool r1_r2 = nam2.is_rc && (r1_ref_start <= r2_ref_start) && (r2_ref_start - r1_ref_start < mu + 10*sigma);
 
      // r2 ---> <---- r1
-    bool r2_r1 = nam1.is_rc && (b <= a) && (a - b < mu + 10*sigma);
+    bool r2_r1 = nam1.is_rc && (r2_ref_start <= r1_ref_start) && (r1_ref_start - r2_ref_start < mu + 10*sigma);
 
     return r1_r2 || r2_r1;
 }
@@ -442,8 +442,8 @@ inline Alignment rescue_align(
 
     if (mate_nam.is_rc) {
         r_tmp = read.seq;
-        a = mate_nam.ref_start - mate_nam.query_start - (mu+5*sigma);
-        b = mate_nam.ref_start - mate_nam.query_start + read_len/2; // at most half read overlap
+        a = mate_nam.projected_ref_start() - (mu+5*sigma);
+        b = mate_nam.projected_ref_start() + read_len/2; // at most half read overlap
     } else {
         r_tmp = read.rc; // mate is rc since fr orientation
         a = mate_nam.ref_end + (read_len - mate_nam.query_end) - read_len/2; // at most half read overlap
diff --git a/src/nam.hpp b/src/nam.hpp
index 6fc807fa..b052b1e3 100644
--- a/src/nam.hpp
+++ b/src/nam.hpp
@@ -29,6 +29,10 @@ struct Nam {
     int query_span() const {
         return query_end - query_start;
     }
+
+    int projected_ref_start() const {
+        return std::max(0, ref_start - query_start);
+    }
 };
 
 std::pair<float, std::vector<Nam>> find_nams(

From 781b17a6cbdad7ad8182680261439060be55723e Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Mon, 22 Jan 2024 13:04:29 +0100
Subject: [PATCH 07/32] Update baseline commit

---
 .github/workflows/ci.yml  | 2 +-
 tests/baseline-commit.txt | 2 +-
 tests/compare-baseline.sh | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5c34eb67..0cdc4ce4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -68,7 +68,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
-      - run: cat tests/baseline-commit.txt >> $GITHUB_ENV
+      - run: "echo baseline_commit=$(< tests/baseline-commit.txt) >> $GITHUB_ENV"
       - uses: actions/checkout@v3
         with:
           ref: ${{ env.baseline_commit }}
diff --git a/tests/baseline-commit.txt b/tests/baseline-commit.txt
index 94995525..9c1e9b61 100644
--- a/tests/baseline-commit.txt
+++ b/tests/baseline-commit.txt
@@ -1 +1 @@
-baseline_commit=df612a078a713b518812592c64fad9a4a1a9d444
+0ced9903276834e6b9bfe095a255952f0616d330
diff --git a/tests/compare-baseline.sh b/tests/compare-baseline.sh
index 5f13ba2f..a5176503 100755
--- a/tests/compare-baseline.sh
+++ b/tests/compare-baseline.sh
@@ -33,7 +33,7 @@ fi
 # Ensure test data is available
 tests/download.sh
 
-source tests/baseline-commit.txt
+baseline_commit=$(< tests/baseline-commit.txt)
 
 baseline_bam=baseline/baseline-${baseline_commit}.${ends}.bam
 baseline_binary=baseline/strobealign-${baseline_commit}

From f749f21e43c4acce6cc27b1fbddb84cabdc9cfc1 Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Mon, 22 Jan 2024 13:30:19 +0100
Subject: [PATCH 08/32] Simplify baseline comparison on CI

Do not let GitHub Actions do the work, but rely on the baseline comparison
script. Only ensure caches are filled before starting it.

This reduces code duplication.
---
 .github/workflows/ci.yml  | 40 ++++++++++-----------------------------
 tests/compare-baseline.sh |  6 +++---
 2 files changed, 13 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0cdc4ce4..d6f1d976 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -14,7 +14,7 @@ jobs:
       github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Check for tab characters
         run: "! grep -P -R '\\t' src/ tests/*.{cpp,py}"
 
@@ -27,7 +27,7 @@ jobs:
       matrix:
         os: [ubuntu-latest, macos-latest]
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Build
         run: |
           cmake -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo
@@ -49,7 +49,7 @@ jobs:
       github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
@@ -67,48 +67,28 @@ jobs:
       github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
-      - run: "echo baseline_commit=$(< tests/baseline-commit.txt) >> $GITHUB_ENV"
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
-          ref: ${{ env.baseline_commit }}
-          path: baseline
+          fetch-depth: 0  # Baseline comparison needs older commits
       - name: Install Linux dependencies
         if: runner.os == 'Linux'
         run: sudo apt-get install samtools python3-pysam picard-tools
       - name: Install macOS dependencies
         if: runner.os == 'macOS'
         run: brew install samtools pysam picard-tools
-
       - name: Cache test dataset
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           key: test-data-${{ hashFiles('tests/download.sh') }}
           path: tests/drosophila/
-      - name: Download test dataset
-        run: tests/download.sh
-
       - name: Cache baseline BAM
         id: cache-baseline-bam
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           key: baseline-bam-${{ hashFiles('tests/baseline-commit.txt') }}
-          path: baseline.bam
-      - name: Generate baseline BAM
-        if: ${{ steps.cache-baseline-bam.outputs.cache-hit != 'true' }}
-        run: |
-          ( cd baseline && cmake -B build )
-          make -j3 -C baseline/build
-          baseline/build/strobealign tests/drosophila/ref.fasta tests/drosophila/reads.1.fastq.gz tests/drosophila/reads.2.fastq.gz | samtools view -o baseline.bam
-
-      - name: Build HEAD version
-        run: |
-          cmake -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo
-          make -j3 -C build
-      - name: Generate HEAD BAM
-        run: build/strobealign tests/drosophila/ref.fasta tests/drosophila/reads.1.fastq.gz tests/drosophila/reads.2.fastq.gz | samtools view -o head.bam
-      - name: Compare
-        run: python3 tests/samdiff.py baseline.bam head.bam
+          path: baseline/bam/
+      - name: Compare to baseline
+        run: tests/compare-baseline.sh
       - name: Validate with Picard
         run: |
           PicardCommandLine ValidateSamFile IGNORE=RECORD_MISSING_READ_GROUP IGNORE=MISSING_READ_GROUP I=head.bam
diff --git a/tests/compare-baseline.sh b/tests/compare-baseline.sh
index a5176503..57f9faac 100755
--- a/tests/compare-baseline.sh
+++ b/tests/compare-baseline.sh
@@ -35,18 +35,18 @@ tests/download.sh
 
 baseline_commit=$(< tests/baseline-commit.txt)
 
-baseline_bam=baseline/baseline-${baseline_commit}.${ends}.bam
+baseline_bam=baseline/bam/${baseline_commit}.${ends}.bam
 baseline_binary=baseline/strobealign-${baseline_commit}
 cmake_options=-DCMAKE_BUILD_TYPE=RelWithDebInfo
 strobealign_options="-t 4"
 
 # Generate the baseline BAM if necessary
-mkdir -p baseline
+mkdir -p baseline/bam
 if ! test -f ${baseline_bam}; then
   if ! test -f ${baseline_binary}; then
     srcdir=$(mktemp -p . -d compile.XXXXXXX)
     git clone . ${srcdir}
-    ( cd ${srcdir} && git checkout ${baseline_commit} )
+    ( cd ${srcdir} && git checkout -d ${baseline_commit} )
     cmake ${srcdir} -B ${srcdir}/build ${cmake_options}
     if ! make -j 4 -C ${srcdir}/build strobealign; then
       exit 1

From 40c9c09e3fc31d19e8a5c6df5bda26c1f90e6d3c Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Mon, 22 Jan 2024 15:08:14 +0100
Subject: [PATCH 09/32] Vendor zstr instead of fetching it from Git at build
 time

This way, the strobealign sources are more self-contained and it should not
be necessary to have internet access at build time.
---
 CHANGES.md                      |   3 +
 CMakeLists.txt                  |   6 +-
 ext/README.md                   |   7 +
 ext/zstr/CMakeLists.txt         |  46 +++
 ext/zstr/LICENSE                |  21 ++
 ext/zstr/README.org             | 103 +++++++
 ext/zstr/src/strict_fstream.hpp | 237 +++++++++++++++
 ext/zstr/src/zstr.hpp           | 502 ++++++++++++++++++++++++++++++++
 8 files changed, 920 insertions(+), 5 deletions(-)
 create mode 100644 ext/zstr/CMakeLists.txt
 create mode 100644 ext/zstr/LICENSE
 create mode 100644 ext/zstr/README.org
 create mode 100644 ext/zstr/src/strict_fstream.hpp
 create mode 100644 ext/zstr/src/zstr.hpp

diff --git a/CHANGES.md b/CHANGES.md
index 5e653588..266b5832 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -10,6 +10,9 @@
 * #378: Added `-C` option for appending the FASTA or FASTQ comment to SAM
   output. (Idea and name of the option taken from BWA-MEM.)
 * #371: Added `--no-PG` option for not outputting the PG SAM header
+* Include [ZStr](https://github.com/mateidavid/zstr/) in our own repository
+  instead of downloading it at build time. This should make it possible to
+  build strobealign without internet access.
 
 ## v0.12.0 (2023-11-23)
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 19ace7a2..988b1235 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,11 +25,7 @@ endif()
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 add_compile_options(-Wall -Wextra -Werror=maybe-uninitialized)
 
-FetchContent_Declare(ZStrGitRepo
-  GIT_REPOSITORY    "https://github.com/mateidavid/zstr"
-  GIT_TAG           "755da7890ea22478a702e3139092e6c964fab1f5"
-)
-FetchContent_MakeAvailable(ZStrGitRepo)
+add_subdirectory(ext/zstr)
 
 # Obtain version from Git or fall back to PROJECT_VERSION if not building
 # from a Git repository
diff --git a/ext/README.md b/ext/README.md
index e473fdf5..e8316d26 100644
--- a/ext/README.md
+++ b/ext/README.md
@@ -47,3 +47,10 @@ License: See ssw/README.md
 Homepage: https://www.xxhash.com
 Version: ?
 License: See xxhash.c
+
+
+## zstr
+
+Homepage: https://github.com/mateidavid/zstr
+Commit used: 755da7890ea22478a702e3139092e6c964fab1f5
+License: See zstr/LICENSE
diff --git a/ext/zstr/CMakeLists.txt b/ext/zstr/CMakeLists.txt
new file mode 100644
index 00000000..8a015618
--- /dev/null
+++ b/ext/zstr/CMakeLists.txt
@@ -0,0 +1,46 @@
+cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
+
+project(zstr LANGUAGES CXX)
+
+if (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12)
+  cmake_policy(SET CMP0074 NEW) # find_package uses <PackageName>_ROOT variables
+endif()
+
+if(${CMAKE_VERSION} VERSION_LESS 3.13)
+  message(WARNING
+    "Interface library targets are not well supported before cmake 3.13  .... "
+    "You may need to add \${ZSTR_INCLUDE_DIRS} to your include directories\n"
+    "target_include_directories(YourTarget PRIVATE \${ZSTR_INCLUDE_DIRS}) "
+  )
+endif()
+
+# -- locate zlib
+
+find_package(ZLIB 1.2.3 REQUIRED) # defines imported target ZLIB::ZLIB
+message(STATUS "zstr - found ZLIB (version: ${ZLIB_VERSION_STRING})")
+
+# -- add target
+
+add_library(zstr INTERFACE)
+add_library(zstr::zstr ALIAS zstr)
+
+# -- set target properties
+
+target_include_directories(zstr INTERFACE "${PROJECT_SOURCE_DIR}/src")
+target_link_libraries(zstr INTERFACE ZLIB::ZLIB)
+target_compile_features(zstr INTERFACE cxx_std_11) # require c++11 flag
+
+# -- set cache variables
+
+# NOTE: these vars are mostly useful to people using cmake < 3.13
+set(ZSTR_INCLUDE_DIRS "${PROJECT_SOURCE_DIR}/src;${ZLIB_INCLUDE_DIRS}" CACHE PATH "" FORCE)
+set(ZSTR_LIBRARIES "${ZLIB_LIBRARIES}" CACHE PATH "" FORCE)
+
+# -- print target summary
+
+message(STATUS
+  "zstr - added INTERFACE target 'zstr::zstr'
+          includes : ${ZSTR_INCLUDE_DIRS}
+          libraries: ZLIB::ZLIB
+          features : cxx_std_11"
+)
diff --git a/ext/zstr/LICENSE b/ext/zstr/LICENSE
new file mode 100644
index 00000000..841c7214
--- /dev/null
+++ b/ext/zstr/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Matei David, Ontario Institute for Cancer Research
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE. 
diff --git a/ext/zstr/README.org b/ext/zstr/README.org
new file mode 100644
index 00000000..dea53589
--- /dev/null
+++ b/ext/zstr/README.org
@@ -0,0 +1,103 @@
+# -*- mode:org; mode:visual-line; coding:utf-8; -*-
+
+** A C++ ZLib wrapper
+
+[[http://travis-ci.org/mateidavid/zstr][http://travis-ci.org/mateidavid/zstr.svg?branch=master]] [[https://tldrlegal.com/license/mit-license][http://img.shields.io/:license-mit-blue.svg]]
+
+This C++ header-only library enables the use of C++ standard iostreams to access ZLib-compressed streams.
+
+For input access (decompression), the compression format is auto-detected, and multiple concatenated compressed streams are decompressed seamlessly.
+
+For output access (compression), the only parameter exposed by this API is the compression level.
+
+Alternatives to this library include:
+
+- The original [[http://www.zlib.net/][ZLib]], through its [[http://www.zlib.net/manual.html][C API]]. This does not interact nicely with C++ iostreams.
+
+- The [[http://www.cs.unc.edu/Research/compgeom/gzstream/][GZStream]] library. This library does not auto-detect input compression, and it cannot wrap streams (only files).
+
+- The [[http://www.boost.org/doc/libs/release/libs/iostreams/][Boost IOStreams]] library. The library does not auto-detect input compression (by default, though that can be easily implemented with filters), and more importantly, it is not a header-only Boost library.
+
+- The [[https://github.com/tmaklin/bxzstr][bxzstr]] library, if you want support for BZ2 and/or LZMA as well.
+
+For an example usage, see [[examples/ztxtpipe.cpp]] and [[examples/zc.cpp]].
+
+It is compatible with [[https://github.com/richgel999/miniz][miniz]] in case you don't want to get frustrated with zlib, e.g., on Windows.
+
+**** Input Auto-detection
+
+For input access, the library seamlessly auto-detects whether the source stream is compressed or not. The following compressed streams are detected:
+
+- GZip header, when stream starts with =1F 8B=. See [[http://en.wikipedia.org/wiki/Gzip][GZip format]].
+
+- ZLib header, when stream starts with =78 01=, =78 9C=, and =78 DA=. See [[http://stackoverflow.com/a/17176881][answer here]].
+
+If none of these formats are detected, the library assumes the input is not compressed, and it produces a plain copy of the source stream.
+
+**** Classes
+
+The package provides 6 classes for accessing ZLib streams:
+
+- =zstr::istreambuf= is the core decompression class. This is constructed from an existing =std::streambuf= that contains source data. The =zstr::istreambuf= constructor accepts explicit settings for the internal buffer size (default: 1 MB) and the auto-detection option (default: on). ZLib errors cause exceptions to be thrown.
+
+- =zstr::ostreambuf= is the core compression class. This is constructed from an existing =std::streambuf= that contains sink data. The =zstr::ostreambuf= constructor accepts explicit settings for the internal buffer size (default: 1 MB) and the compression option (default: ZLib default). ZLib errors cause exceptions to be thrown.
+
+- =zstr::istream= is a wrapper for a =zstr::istreambuf= that accesses an /external/ =std::streambuf=. It can be constructed from an existing =std::istream= (such as =std::cin=) or =std::streambuf=.
+
+- =zstr::ostream= is a wrapper for a =zstr::ostreambuf= that accesses an /external/ =std::streambuf=. It can be constructed from an existing =std::ostream= (such as =std::cout=) or =std::streambuf=.
+
+- =zstr::ifstream= is a wrapper for a =zstr::istreambuf= that accesses an /internal/ =std::ifstream=. This can be used to open a file and read decompressed data from it.
+
+- =zstr::ofstream= is a wrapper for a =zstr::ostreambuf= that accesses an /internal/ =std::ofstream=. This can be used to open a file and write compressed data to it.
+
+For all stream objects, the =badbit= of their exception mask is turned on in order to propagate exceptions.
+
+**** CMake
+
+There are three simple ways to add zstr to a CMake project.
+
+Method 1. Add zstr as a subdirectory and link to the =zstr::zstr= target
+
+  #+BEGIN_SRC cmake
+    add_subdirectory(zstr) # defines INTERFACE target 'zstr::zstr'
+
+    add_executable(YourTarget main.cpp)
+    target_link_libraries(YourTarget PRIVATE zstr::zstr)
+    # if using cmake < 3.13 you may also need the following line
+    # target_include_directories(YourTarget PRIVATE ${ZSTR_INCLUDE_DIRS})
+  #+END_SRC
+
+Method 2. Fetch a copy of zstr from an external repository and link to the =zstr::zstr= target
+
+  /NOTE: The FetchContent functions shown here were introduced in CMake 3.14/
+
+  #+BEGIN_SRC cmake
+    include(FetchContent)
+    FetchContent_Declare(ZStrGitRepo
+      GIT_REPOSITORY    "https://github.com/mateidavid/zstr" # can also be a local filesystem path!
+      GIT_TAG           "master"
+    )
+    FetchContent_MakeAvailable(ZStrGitRepo) # defines INTERFACE target 'zstr::zstr'
+
+    add_executable(YourTarget main.cpp)
+    target_link_libraries(YourTarget PRIVATE zstr::zstr)
+  #+END_SRC
+
+Method 3. Add path containing 'zstr.hpp' to your target's include directories
+
+  /NOTE: With this method you're responsible for finding and linking to ZLIB !/
+
+  #+BEGIN_SRC cmake
+    find_package(ZLIB REQUIRED)
+    add_executable(YourTarget main.cpp)
+    target_link_libraries(YourTarget PRIVATE ZLIB::ZLIB)
+    target_include_directories(YourTarget PRIVATE /path/to/zstr/src)
+  #+END_SRC
+
+**** Requisites
+
+If you use GCC and want to use the =fs.open()= function, you need at least GCC version 5.1.
+
+**** License
+
+Released under the [[file:LICENSE][MIT license]].
diff --git a/ext/zstr/src/strict_fstream.hpp b/ext/zstr/src/strict_fstream.hpp
new file mode 100644
index 00000000..7d03ea66
--- /dev/null
+++ b/ext/zstr/src/strict_fstream.hpp
@@ -0,0 +1,237 @@
+#pragma once
+
+#include <cassert>
+#include <fstream>
+#include <cstring>
+#include <string>
+#include <vector>
+
+/**
+ * This namespace defines wrappers for std::ifstream, std::ofstream, and
+ * std::fstream objects. The wrappers perform the following steps:
+ * - check the open modes make sense
+ * - check that the call to open() is successful
+ * - (for input streams) check that the opened file is peek-able
+ * - turn on the badbit in the exception mask
+ */
+namespace strict_fstream
+{
+
+// Help people out a bit, it seems like this is a common recommendation since
+// musl breaks all over the place.
+#if defined(__NEED_size_t) && !defined(__MUSL__)
+#warning "It seems to be recommended to patch in a define for __MUSL__ if you use musl globally: https://www.openwall.com/lists/musl/2013/02/10/5"
+#define __MUSL__
+#endif
+
+// Workaround for broken musl implementation
+// Since musl insists that they are perfectly compatible, ironically enough,
+// they don't officially have a __musl__ or similar. But __NEED_size_t is defined in their
+// relevant header (and not in working implementations), so we can use that.
+#ifdef __MUSL__
+#warning "Working around broken strerror_r() implementation in musl, remove when musl is fixed"
+#endif
+
+// Non-gnu variants of strerror_* don't necessarily null-terminate if
+// truncating, so we have to do things manually.
+inline std::string trim_to_null(const std::vector<char> &buff)
+{
+    std::string ret(buff.begin(), buff.end());
+
+    const std::string::size_type pos = ret.find('\0');
+    if (pos == std::string::npos) {
+        ret += " [...]"; // it has been truncated
+    } else {
+        ret.resize(pos);
+    }
+    return ret;
+}
+
+/// Overload of error-reporting function, to enable use with VS and non-GNU
+/// POSIX libc's
+/// Ref:
+///   - http://stackoverflow.com/a/901316/717706
+static std::string strerror()
+{
+    // Can't use std::string since we're pre-C++17
+    std::vector<char> buff(256, '\0');
+
+#ifdef _WIN32
+    // strerror_s might set errno itself, so save it first. It returns 0 on success.
+    const int err_num = errno;
+    if (strerror_s(buff.data(), buff.size(), err_num) == 0) {
+        return trim_to_null(buff);
+    } else {
+        return "Unknown error (" + std::to_string(err_num) + ")";
+    }
+#elif ((_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || defined(__APPLE__) || defined(__FreeBSD__)) && ! _GNU_SOURCE) || defined(__MUSL__)
+// XSI-compliant strerror_r()
+    const int err_num = errno; // See above
+    if (strerror_r(err_num, buff.data(), buff.size()) == 0) {
+        return trim_to_null(buff);
+    } else {
+        return "Unknown error (" + std::to_string(err_num) + ")";
+    }
+#else
+// GNU-specific strerror_r()
+    char * p = strerror_r(errno, &buff[0], buff.size());
+    return std::string(p, std::strlen(p));
+#endif
+}
+
+/// Exception class thrown by failed operations.
+class Exception
+    : public std::exception
+{
+public:
+    Exception(const std::string& msg) : _msg(msg) {}
+    const char * what() const noexcept { return _msg.c_str(); }
+private:
+    std::string _msg;
+}; // class Exception
+
+namespace detail
+{
+
+struct static_method_holder
+{
+    static std::string mode_to_string(std::ios_base::openmode mode)
+    {
+        static const int n_modes = 6;
+        static const std::ios_base::openmode mode_val_v[n_modes] =
+            {
+                std::ios_base::in,
+                std::ios_base::out,
+                std::ios_base::app,
+                std::ios_base::ate,
+                std::ios_base::trunc,
+                std::ios_base::binary
+            };
+
+        static const char * mode_name_v[n_modes] =
+            {
+                "in",
+                "out",
+                "app",
+                "ate",
+                "trunc",
+                "binary"
+            };
+        std::string res;
+        for (int i = 0; i < n_modes; ++i)
+        {
+            if (mode & mode_val_v[i])
+            {
+                res += (! res.empty()? "|" : "");
+                res += mode_name_v[i];
+            }
+        }
+        if (res.empty()) res = "none";
+        return res;
+    }
+    static void check_mode(const std::string& filename, std::ios_base::openmode mode)
+    {
+        if ((mode & std::ios_base::trunc) && ! (mode & std::ios_base::out))
+        {
+            throw Exception(std::string("strict_fstream: open('") + filename + "'): mode error: trunc and not out");
+        }
+        else if ((mode & std::ios_base::app) && ! (mode & std::ios_base::out))
+        {
+            throw Exception(std::string("strict_fstream: open('") + filename + "'): mode error: app and not out");
+        }
+        else if ((mode & std::ios_base::trunc) && (mode & std::ios_base::app))
+        {
+            throw Exception(std::string("strict_fstream: open('") + filename + "'): mode error: trunc and app");
+        }
+     }
+    static void check_open(std::ios * s_p, const std::string& filename, std::ios_base::openmode mode)
+    {
+        if (s_p->fail())
+        {
+            throw Exception(std::string("strict_fstream: open('")
+                            + filename + "'," + mode_to_string(mode) + "): open failed: "
+                            + strerror());
+        }
+    }
+    static void check_peek(std::istream * is_p, const std::string& filename, std::ios_base::openmode mode)
+    {
+        bool peek_failed = true;
+        try
+        {
+            is_p->peek();
+            peek_failed = is_p->fail();
+        }
+        catch (const std::ios_base::failure &) {}
+        if (peek_failed)
+        {
+            throw Exception(std::string("strict_fstream: open('")
+                            + filename + "'," + mode_to_string(mode) + "): peek failed: "
+                            + strerror());
+        }
+        is_p->clear();
+    }
+}; // struct static_method_holder
+
+} // namespace detail
+
+class ifstream
+    : public std::ifstream
+{
+public:
+    ifstream() = default;
+    ifstream(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in)
+    {
+        open(filename, mode);
+    }
+    void open(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in)
+    {
+        mode |= std::ios_base::in;
+        exceptions(std::ios_base::badbit);
+        detail::static_method_holder::check_mode(filename, mode);
+        std::ifstream::open(filename, mode);
+        detail::static_method_holder::check_open(this, filename, mode);
+        detail::static_method_holder::check_peek(this, filename, mode);
+    }
+}; // class ifstream
+
+class ofstream
+    : public std::ofstream
+{
+public:
+    ofstream() = default;
+    ofstream(const std::string& filename, std::ios_base::openmode mode = std::ios_base::out)
+    {
+        open(filename, mode);
+    }
+    void open(const std::string& filename, std::ios_base::openmode mode = std::ios_base::out)
+    {
+        mode |= std::ios_base::out;
+        exceptions(std::ios_base::badbit);
+        detail::static_method_holder::check_mode(filename, mode);
+        std::ofstream::open(filename, mode);
+        detail::static_method_holder::check_open(this, filename, mode);
+    }
+}; // class ofstream
+
+class fstream
+    : public std::fstream
+{
+public:
+    fstream() = default;
+    fstream(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in)
+    {
+        open(filename, mode);
+    }
+    void open(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in)
+    {
+        if (! (mode & std::ios_base::out)) mode |= std::ios_base::in;
+        exceptions(std::ios_base::badbit);
+        detail::static_method_holder::check_mode(filename, mode);
+        std::fstream::open(filename, mode);
+        detail::static_method_holder::check_open(this, filename, mode);
+        detail::static_method_holder::check_peek(this, filename, mode);
+    }
+}; // class fstream
+
+} // namespace strict_fstream
+
diff --git a/ext/zstr/src/zstr.hpp b/ext/zstr/src/zstr.hpp
new file mode 100644
index 00000000..bd330ea1
--- /dev/null
+++ b/ext/zstr/src/zstr.hpp
@@ -0,0 +1,502 @@
+//---------------------------------------------------------
+// Copyright 2015 Ontario Institute for Cancer Research
+// Written by Matei David (matei@cs.toronto.edu)
+//---------------------------------------------------------
+
+// Reference:
+// http://stackoverflow.com/questions/14086417/how-to-write-custom-input-stream-in-c
+
+#pragma once
+
+#include <cassert>
+#include <fstream>
+#include <sstream>
+#include <zlib.h>
+#include <memory>
+#include <iostream>
+#include "strict_fstream.hpp"
+
+#if defined(__GNUC__) && !defined(__clang__)
+#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__>0)
+#define CAN_MOVE_IOSTREAM
+#endif
+#else
+#define CAN_MOVE_IOSTREAM
+#endif
+
+namespace zstr
+{
+
+static const std::size_t default_buff_size = static_cast<std::size_t>(1 << 20);
+
+/// Exception class thrown by failed zlib operations.
+class Exception // std::ios_base::failure whose message is built from a zlib return code and the z_stream state
+    : public std::ios_base::failure
+{
+public:
+    static std::string error_to_message(z_stream * zstrm_p, int ret) // format `ret` plus zstrm_p->msg and the stream's in/out cursors; zstrm_p must be non-null
+    {
+        std::string msg = "zlib: ";
+        switch (ret) // spell out the zlib error codes handled by name
+        {
+        case Z_STREAM_ERROR:
+            msg += "Z_STREAM_ERROR: ";
+            break;
+        case Z_DATA_ERROR:
+            msg += "Z_DATA_ERROR: ";
+            break;
+        case Z_MEM_ERROR:
+            msg += "Z_MEM_ERROR: ";
+            break;
+        case Z_VERSION_ERROR:
+            msg += "Z_VERSION_ERROR: ";
+            break;
+        case Z_BUF_ERROR:
+            msg += "Z_BUF_ERROR: ";
+            break;
+        default: // any other code is shown numerically in brackets
+            std::ostringstream oss;
+            oss << ret;
+            msg += "[" + oss.str() + "]: ";
+            break;
+        }
+        if (zstrm_p->msg) { // append zlib's own description when it provided one
+            msg += zstrm_p->msg;
+        }
+        msg += " (" // diagnostic dump of the stream cursors
+                "next_in: " +
+                std::to_string(uintptr_t(zstrm_p->next_in)) + // pointer value printed as a decimal integer
+                ", avail_in: " +
+                std::to_string(uintptr_t(zstrm_p->avail_in)) +
+                ", next_out: " +
+                std::to_string(uintptr_t(zstrm_p->next_out)) + // pointer value printed as a decimal integer
+                ", avail_out: " +
+                std::to_string(uintptr_t(zstrm_p->avail_out)) +
+                ")";
+        return msg;
+    }
+
+    Exception(z_stream * zstrm_p, int ret) // the what() text is error_to_message(zstrm_p, ret)
+        : std::ios_base::failure(error_to_message(zstrm_p, ret))
+    {
+    }
+}; // class Exception
+
+namespace detail
+{
+
+class z_stream_wrapper // z_stream that initialises itself for inflate or deflate and ends itself in the destructor
+    : public z_stream
+{
+public:
+    z_stream_wrapper(bool _is_input, int _level, int _window_bits) // _is_input selects inflate; _level is used for deflate only; _window_bits == 0 selects the defaults below; throws Exception on init failure
+        : is_input(_is_input)
+    {
+        this->zalloc = nullptr;//Z_NULL
+        this->zfree = nullptr;//Z_NULL
+        this->opaque = nullptr;//Z_NULL
+        int ret;
+        if (is_input)
+        {
+            this->avail_in = 0;
+            this->next_in = nullptr;//Z_NULL
+            ret = inflateInit2(this, _window_bits ? _window_bits : 15+32); // default 15+32: per the zlib manual, adding 32 enables automatic zlib/gzip header detection
+        }
+        else
+        {
+            ret = deflateInit2(this, _level, Z_DEFLATED, _window_bits ? _window_bits : 15+16, 8, Z_DEFAULT_STRATEGY); // default 15+16: per the zlib manual, adding 16 writes a gzip header and trailer
+        }
+        if (ret != Z_OK) throw Exception(this, ret);
+    }
+    ~z_stream_wrapper() // NOTE(review): copy operations are not deleted; a copied wrapper would end the same zlib state twice
+    {
+        if (is_input)
+        {
+            inflateEnd(this);
+        }
+        else
+        {
+            deflateEnd(this);
+        }
+    }
+private:
+    bool is_input; // remembers which of inflateEnd/deflateEnd matches the init call
+}; // class z_stream_wrapper
+
+} // namespace detail
+
+class istreambuf // input streambuf that pulls bytes from *sbuf_p and serves them either inflated or, for non-compressed input, unchanged
+    : public std::streambuf
+{
+public:
+    istreambuf(std::streambuf * _sbuf_p, // source buffer; stored, not owned
+               std::size_t _buff_size = default_buff_size, bool _auto_detect = true, int _window_bits = 0) // _auto_detect = false always inflates; _window_bits is passed to z_stream_wrapper
+        : sbuf_p(_sbuf_p),
+          in_buff(),
+          in_buff_start(nullptr),
+          in_buff_end(nullptr),
+          out_buff(),
+          zstrm_p(nullptr),
+          buff_size(_buff_size),
+          auto_detect(_auto_detect),
+          auto_detect_run(false),
+          is_text(false),
+          window_bits(_window_bits)
+    {
+        assert(sbuf_p);
+        in_buff = std::unique_ptr<char[]>(new char[buff_size]);
+        in_buff_start = in_buff.get();
+        in_buff_end = in_buff.get(); // start == end marks the input buffer as empty
+        out_buff = std::unique_ptr<char[]>(new char[buff_size]);
+        setg(out_buff.get(), out_buff.get(), out_buff.get()); // empty get area, so the first read calls underflow()
+    }
+
+    istreambuf(const istreambuf &) = delete;
+    istreambuf & operator = (const istreambuf &) = delete;
+
+    pos_type seekoff(off_type off, std::ios_base::seekdir dir, // only the "where am I" query (off == 0, dir == cur) is answered here
+                     std::ios_base::openmode which) override
+    {
+        if (off != 0 || dir != std::ios_base::cur) {
+            return std::streambuf::seekoff(off, dir, which); // real seeks are delegated to the base class
+        }
+
+        if (!zstrm_p) {
+            return 0; // no inflater exists (none created yet, pass-through mode, or it was reset after Z_STREAM_END)
+        }
+
+        return static_cast<long int>(zstrm_p->total_out - static_cast<uLong>(in_avail())); // bytes inflated so far minus those still waiting in the get area
+    }
+
+    std::streambuf::int_type underflow() override // refill the get area; returns the next character or eof
+    {
+        if (this->gptr() == this->egptr())
+        {
+            // pointers for free region in output buffer
+            char * out_buff_free_start = out_buff.get();
+            int tries = 0;
+            do
+            {
+                if (++tries > 1000) { // guard against looping forever without producing output
+                    throw std::ios_base::failure("Failed to fill buffer after 1000 tries");
+                }
+
+                // read more input if none available
+                if (in_buff_start == in_buff_end)
+                {
+                    // empty input buffer: refill from the start
+                    in_buff_start = in_buff.get();
+                    std::streamsize sz = sbuf_p->sgetn(in_buff.get(), static_cast<std::streamsize>(buff_size));
+                    in_buff_end = in_buff_start + sz;
+                    if (in_buff_end == in_buff_start) break; // end of input
+                }
+                // auto detect if the stream contains text or deflate data
+                if (auto_detect && ! auto_detect_run)
+                {
+                    auto_detect_run = true; // detection runs once, on the first chunk only
+                    unsigned char b0 = *reinterpret_cast< unsigned char * >(in_buff_start);
+                    unsigned char b1 = (in_buff_end - in_buff_start >= 2) ? *reinterpret_cast< unsigned char * >(in_buff_start + 1) : static_cast< unsigned char >(0); // read the second byte only if it was filled (avoids an unfilled/out-of-bounds read on a 1-byte chunk)
+                    // Ref:
+                    // http://en.wikipedia.org/wiki/Gzip
+                    // http://stackoverflow.com/questions/9050260/what-does-a-zlib-header-look-like
+                    is_text = ! (in_buff_start + 2 <= in_buff_end
+                                 && ((b0 == 0x1F && b1 == 0x8B)         // gzip header
+                                     || (b0 == 0x78 && (b1 == 0x01      // zlib header
+                                                        || b1 == 0x9C
+                                                        || b1 == 0xDA))));
+                }
+                if (is_text)
+                {
+                    // simply swap in_buff and out_buff, and adjust pointers
+                    assert(in_buff_start == in_buff.get());
+                    std::swap(in_buff, out_buff);
+                    out_buff_free_start = in_buff_end; // the bytes just read now live in out_buff
+                    in_buff_start = in_buff.get();
+                    in_buff_end = in_buff.get();
+                }
+                else
+                {
+                    // run inflate() on input
+                    if (! zstrm_p) zstrm_p = std::unique_ptr<detail::z_stream_wrapper>(new detail::z_stream_wrapper(true, Z_DEFAULT_COMPRESSION, window_bits)); // created lazily, and again after a finished stream
+                    zstrm_p->next_in = reinterpret_cast< decltype(zstrm_p->next_in) >(in_buff_start);
+                    zstrm_p->avail_in = uint32_t(in_buff_end - in_buff_start);
+                    zstrm_p->next_out = reinterpret_cast< decltype(zstrm_p->next_out) >(out_buff_free_start);
+                    zstrm_p->avail_out = uint32_t((out_buff.get() + buff_size) - out_buff_free_start);
+                    int ret = inflate(zstrm_p.get(), Z_NO_FLUSH);
+                    // process return code
+                    if (ret != Z_OK && ret != Z_STREAM_END) throw Exception(zstrm_p.get(), ret);
+                    // update in&out pointers following inflate()
+                    in_buff_start = reinterpret_cast< decltype(in_buff_start) >(zstrm_p->next_in);
+                    in_buff_end = in_buff_start + zstrm_p->avail_in;
+                    out_buff_free_start = reinterpret_cast< decltype(out_buff_free_start) >(zstrm_p->next_out);
+                    assert(out_buff_free_start + zstrm_p->avail_out == out_buff.get() + buff_size);
+
+                    if (ret == Z_STREAM_END) {
+                        // if stream ended, deallocate inflator
+                        zstrm_p.reset(); // leftover input is fed to a new inflater on the next pass
+                    }
+                }
+            } while (out_buff_free_start == out_buff.get());
+            // 2 exit conditions:
+            // - end of input: there might or might not be output available
+            // - out_buff_free_start != out_buff: output available
+            this->setg(out_buff.get(), out_buff.get(), out_buff_free_start);
+        }
+        return this->gptr() == this->egptr()
+            ? traits_type::eof()
+            : traits_type::to_int_type(*this->gptr());
+    }
+private:
+    std::streambuf * sbuf_p; // source of raw bytes (not owned)
+    std::unique_ptr<char[]> in_buff; // raw bytes read from sbuf_p
+    char * in_buff_start; // first unconsumed byte in in_buff
+    char * in_buff_end; // one past the last valid byte in in_buff
+    std::unique_ptr<char[]> out_buff; // backing store of the get area
+    std::unique_ptr<detail::z_stream_wrapper> zstrm_p; // inflater; null until needed and after Z_STREAM_END
+    std::size_t buff_size; // size of each of the two buffers
+    bool auto_detect; // sniff the first two bytes for a gzip/zlib header
+    bool auto_detect_run; // true once the sniffing has happened
+    bool is_text; // true: pass bytes through without inflating
+    int window_bits; // forwarded to z_stream_wrapper (0 = its default)
+
+}; // class istreambuf
+
+class ostreambuf // output streambuf that deflates what is written to it and forwards the compressed bytes to *sbuf_p
+    : public std::streambuf
+{
+public:
+    ostreambuf(std::streambuf * _sbuf_p, // sink buffer; stored, not owned
+               std::size_t _buff_size = default_buff_size, int _level = Z_DEFAULT_COMPRESSION, int _window_bits = 0) // _level and _window_bits are passed to z_stream_wrapper
+        : sbuf_p(_sbuf_p),
+          in_buff(),
+          out_buff(),
+          zstrm_p(new detail::z_stream_wrapper(false, _level, _window_bits)),
+          buff_size(_buff_size)
+    {
+        assert(sbuf_p);
+        in_buff = std::unique_ptr<char[]>(new char[buff_size]);
+        out_buff = std::unique_ptr<char[]>(new char[buff_size]);
+        setp(in_buff.get(), in_buff.get() + buff_size); // the put area is the whole of in_buff
+    }
+
+    ostreambuf(const ostreambuf &) = delete;
+    ostreambuf & operator = (const ostreambuf &) = delete;
+
+    int deflate_loop(int flush) // deflate pending input with the given flush mode, forwarding output to the sink; 0 on success, -1 if the sink took fewer bytes than offered; throws Exception on a zlib error
+    {
+        while (true)
+        {
+            zstrm_p->next_out = reinterpret_cast< decltype(zstrm_p->next_out) >(out_buff.get());
+            zstrm_p->avail_out = uint32_t(buff_size);
+            int ret = deflate(zstrm_p.get(), flush);
+            if (ret != Z_OK && ret != Z_STREAM_END && ret != Z_BUF_ERROR) { // Z_BUF_ERROR is tolerated and ends the loop below
+                failed = true; // tells the destructor not to attempt another sync()
+                throw Exception(zstrm_p.get(), ret);
+            }
+            std::streamsize sz = sbuf_p->sputn(out_buff.get(), reinterpret_cast< decltype(out_buff.get()) >(zstrm_p->next_out) - out_buff.get()); // write exactly the bytes deflate() produced
+            if (sz != reinterpret_cast< decltype(out_buff.get()) >(zstrm_p->next_out) - out_buff.get())
+            {
+                // there was an error in the sink stream
+                return -1;
+            }
+            if (ret == Z_STREAM_END || ret == Z_BUF_ERROR || sz == 0) // stream finished, no progress possible, or nothing was produced
+            {
+                break;
+            }
+        }
+        return 0;
+    }
+
+    virtual ~ostreambuf()
+    {
+        // flush the zlib stream
+        //
+        // NOTE: Errors here (sync() return value not 0) are ignored, because we
+        // cannot throw in a destructor. This mirrors the behaviour of
+        // std::basic_filebuf::~basic_filebuf(). To see an exception on error,
+        // close the ofstream with an explicit call to close(), and do not rely
+        // on the implicit call in the destructor.
+        //
+        if (!failed) try {
+            sync();
+        } catch (...) {}
+    }
+    std::streambuf::int_type overflow(std::streambuf::int_type c = traits_type::eof()) override // compress everything in the put area, then store `c` unless it is eof
+    {
+        zstrm_p->next_in = reinterpret_cast< decltype(zstrm_p->next_in) >(pbase());
+        zstrm_p->avail_in = uint32_t(pptr() - pbase());
+        while (zstrm_p->avail_in > 0)
+        {
+            int r = deflate_loop(Z_NO_FLUSH);
+            if (r != 0)
+            {
+                setp(nullptr, nullptr); // a null put area is how sync() recognises this failure
+                return traits_type::eof();
+            }
+        }
+        setp(in_buff.get(), in_buff.get() + buff_size); // put area is empty again
+        return traits_type::eq_int_type(c, traits_type::eof()) ? traits_type::eof() : sputc(char_type(c)); // note: eof is also the return value when called with no character
+    }
+    int sync() override // 0 on success, -1 on sink failure; each successful call finishes the current compressed stream and resets the deflater
+    {
+        // first, call overflow to clear in_buff
+        overflow();
+        if (! pptr()) return -1; // overflow() nulled the put area: the sink failed
+        // then, call deflate asking to finish the zlib stream
+        zstrm_p->next_in = nullptr;
+        zstrm_p->avail_in = 0;
+        if (deflate_loop(Z_FINISH) != 0) return -1;
+        deflateReset(zstrm_p.get()); // later writes start a new compressed stream; the return value is not checked
+        return 0;
+    }
+private:
+    std::streambuf * sbuf_p = nullptr; // sink for compressed bytes (not owned)
+    std::unique_ptr<char[]> in_buff; // uncompressed bytes (the put area)
+    std::unique_ptr<char[]> out_buff; // scratch space for deflate() output
+    std::unique_ptr<detail::z_stream_wrapper> zstrm_p; // deflater state
+    std::size_t buff_size; // size of each of the two buffers
+    bool failed = false; // set when deflate() reported a hard error
+
+}; // class ostreambuf
+
+class istream // std::istream reading through a heap-allocated istreambuf that this object owns
+    : public std::istream
+{
+public:
+    istream(std::istream & is, // only is.rdbuf() is kept (not owned); remaining arguments are forwarded to istreambuf
+            std::size_t _buff_size = default_buff_size, bool _auto_detect = true, int _window_bits = 0)
+        : std::istream(new istreambuf(is.rdbuf(), _buff_size, _auto_detect, _window_bits))
+    {
+        exceptions(std::ios_base::badbit); // badbit raises std::ios_base::failure
+    }
+    explicit istream(std::streambuf * sbuf_p) // wrap a raw source buffer using istreambuf's default settings
+        : std::istream(new istreambuf(sbuf_p))
+    {
+        exceptions(std::ios_base::badbit);
+    }
+    virtual ~istream()
+    {
+        delete rdbuf(); // releases the istreambuf allocated in the constructor
+    }
+}; // class istream
+
+class ostream // std::ostream writing through a heap-allocated ostreambuf that this object owns
+    : public std::ostream
+{
+public:
+    ostream(std::ostream & os, // only os.rdbuf() is kept (not owned); remaining arguments are forwarded to ostreambuf
+            std::size_t _buff_size = default_buff_size, int _level = Z_DEFAULT_COMPRESSION, int _window_bits = 0)
+        : std::ostream(new ostreambuf(os.rdbuf(), _buff_size, _level, _window_bits))
+    {
+        exceptions(std::ios_base::badbit); // badbit raises std::ios_base::failure
+    }
+    explicit ostream(std::streambuf * sbuf_p) // wrap a raw sink buffer using ostreambuf's default settings
+        : std::ostream(new ostreambuf(sbuf_p))
+    {
+        exceptions(std::ios_base::badbit);
+    }
+    virtual ~ostream()
+    {
+        delete rdbuf(); // the ostreambuf destructor attempts a final sync(), ignoring errors
+    }
+}; // class ostream
+
+namespace detail
+{
+
+template < typename FStream_Type > // FStream_Type: a file stream constructible from (filename, mode) and default-constructible
+struct strict_fstream_holder // holds the file stream as a member so a class can inherit it ahead of its std::istream/std::ostream base
+{
+    strict_fstream_holder(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in) // opens the file while constructing _fs
+        : _fs(filename, mode)
+    {}
+    strict_fstream_holder() = default; // leaves _fs default-constructed
+    FStream_Type _fs {}; // the wrapped file stream
+}; // class strict_fstream_holder
+
+} // namespace detail
+
+class ifstream // input file stream that reads a strict_fstream::ifstream through an istreambuf (inflating, or passing plain data through)
+    : private detail::strict_fstream_holder< strict_fstream::ifstream >, // listed first so _fs exists before std::istream is given its rdbuf()
+      public std::istream
+{
+public:
+    explicit ifstream(const std::string filename, std::ios_base::openmode mode = std::ios_base::in, size_t buff_size = default_buff_size) // open `filename`; buff_size is passed to istreambuf
+        : detail::strict_fstream_holder< strict_fstream::ifstream >(filename, mode),
+          std::istream(new istreambuf(_fs.rdbuf(), buff_size))
+    {
+        exceptions(std::ios_base::badbit); // badbit raises std::ios_base::failure
+    }
+    explicit ifstream(): detail::strict_fstream_holder< strict_fstream::ifstream >(), std::istream(new istreambuf(_fs.rdbuf())){} // no file yet; call open() before reading
+    void close() { // closes the wrapped file stream only
+        _fs.close();
+    }
+    #ifdef CAN_MOVE_IOSTREAM
+    void open(const std::string filename, std::ios_base::openmode mode = std::ios_base::in) { // (re)open on `filename` with a fresh istreambuf
+        _fs.open(filename, mode);
+        delete rdbuf(new istreambuf(_fs.rdbuf())); // rdbuf(sb) installs sb, clears the error state and returns the previous buffer, which is freed here; move-assigning a std::istream never transfers rdbuf(), so that approach leaked the new buffer and kept the stale one
+    }
+    #endif
+    bool is_open() const { // state of the wrapped file stream
+        return _fs.is_open();
+    }
+    virtual ~ifstream()
+    {
+        if (_fs.is_open()) close();
+        if (rdbuf()) delete rdbuf(); // releases the istreambuf owned by this object
+    }
+
+    /// Return the position within the compressed file (wrapped filestream)
+    std::streampos compressed_tellg()
+    {
+        return _fs.tellg();
+    }
+}; // class ifstream
+
+class ofstream // output file stream that writes deflated data to a strict_fstream::ofstream through an ostreambuf
+    : private detail::strict_fstream_holder< strict_fstream::ofstream >, // listed first so _fs exists before std::ostream is given its rdbuf()
+      public std::ostream
+{
+public:
+    explicit ofstream(const std::string filename, std::ios_base::openmode mode = std::ios_base::out, // open `filename`; binary mode is always added
+                      int level = Z_DEFAULT_COMPRESSION, size_t buff_size = default_buff_size) // level and buff_size are passed to ostreambuf
+        : detail::strict_fstream_holder< strict_fstream::ofstream >(filename, mode | std::ios_base::binary),
+          std::ostream(new ostreambuf(_fs.rdbuf(), buff_size, level))
+    {
+        exceptions(std::ios_base::badbit); // badbit raises std::ios_base::failure
+    }
+    explicit ofstream(): detail::strict_fstream_holder< strict_fstream::ofstream >(), std::ostream(new ostreambuf(_fs.rdbuf())){} // no file yet; call open() before writing
+    void close() { // push pending compressed data to the file, then close it
+        std::ostream::flush();
+        _fs.close();
+    }
+    #ifdef CAN_MOVE_IOSTREAM
+    void open(const std::string filename, std::ios_base::openmode mode = std::ios_base::out, int level = Z_DEFAULT_COMPRESSION) { // flush, then (re)open on `filename`
+        flush();
+        _fs.open(filename, mode | std::ios_base::binary);
+        std::ostream::operator=(std::ostream(new ostreambuf(_fs.rdbuf(), default_buff_size, level))); // NOTE(review): std::ostream move assignment swaps stream state but not rdbuf(), so this new ostreambuf appears to be neither installed nor freed and `level` would have no effect -- confirm
+    }
+    #endif
+    bool is_open() const { // state of the wrapped file stream
+        return _fs.is_open();
+    }
+    ofstream& flush() { // hides std::ostream::flush so that the wrapped file stream is flushed as well
+        std::ostream::flush();
+        _fs.flush();
+        return *this;
+    }
+    virtual ~ofstream()
+    {
+        if (_fs.is_open()) close();
+        if (rdbuf()) delete rdbuf(); // releases the ostreambuf owned by this object
+    }
+
+    // Return the position within the compressed file (wrapped filestream)
+    std::streampos compressed_tellp()
+    {
+        return _fs.tellp();
+    }
+}; // class ofstream
+
+} // namespace zstr
+

From 7b3bc20d7b5ae39704b2d07490771d4e8b57352c Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Thu, 7 Dec 2023 10:49:15 +0100
Subject: [PATCH 10/32] Use poolstl to sort randstrobes in parallel
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

See https://github.com/alugowski/poolSTL

Sorting the randstrobes is currently a bottleneck in index generation as it
does not run in parallel. This is an attempt to parallelize it.

poolstl’s sort uses regular std::sort under the hood.
We currently use pdqsort_branchless, which is about twice as fast as
std::sort, so parallel sorting breaks even with pdqsort_branchless at about
2-3 threads. It gets faster with more threads, but not as much as one would
perhaps expect. Here are the sorting runtimes for CHM13:

- 31 s with pdqsort_branchless
- 59 s with std::sort
- 34 s with parallel sort, 2 threads
- 24 s with parallel sort, 4 threads
- 23 s with parallel sort, 8 threads

Another issue is that sorting is no longer in place, so memory usage goes
up by a couple of gigabytes, which is another reason for me not to make this
change.
---
 ext/poolstl/poolstl.hpp | 1697 +++++++++++++++++++++++++++++++++++++++
 src/index.cpp           |   12 +-
 src/index.hpp           |    2 +-
 3 files changed, 1707 insertions(+), 4 deletions(-)
 create mode 100644 ext/poolstl/poolstl.hpp

diff --git a/ext/poolstl/poolstl.hpp b/ext/poolstl/poolstl.hpp
new file mode 100644
index 00000000..ea79146e
--- /dev/null
+++ b/ext/poolstl/poolstl.hpp
@@ -0,0 +1,1697 @@
+// SPDX-License-Identifier: BSD-2-Clause OR MIT OR BSL-1.0
+/**
+ * @brief Thread pool-based implementation of parallel standard library algorithms. Single-file version.
+ * @see https://github.com/alugowski/poolSTL
+ * @author Adam Lugowski
+ * @copyright Copyright (C) 2023 Adam Lugowski.
+ *            Licensed under any of the following open-source licenses:
+ *            BSD-2-Clause license, MIT license, Boost Software License 1.0
+ *
+ *
+ * BSD-2-Clause license:
+ *
+ * Copyright (C) 2023 Adam Lugowski
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ *
+ * MIT License:
+ *
+ * Copyright (c) 2023 Adam Lugowski
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *
+ *
+ * Boost Software License 1.0:
+ *
+ * Permission is hereby granted, free of charge, to any person or organization
+ * obtaining a copy of the software and accompanying documentation covered by
+ * this license (the "Software") to use, reproduce, display, distribute, execute,
+ * and transmit the Software, and to prepare derivative works of the Software,
+ * and to permit third-parties to whom the Software is furnished to do so,
+ * all subject to the following:
+ *
+ * The copyright notices in the Software and this entire statement, including
+ * the above license grant, this restriction and the following disclaimer, must
+ * be included in all copies of the Software, in whole or in part, and all
+ * derivative works of the Software, unless such copies or derivative works
+ * are solely in the form of machine-executable object code generated by a
+ * source language processor.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+ * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+ * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef POOLSTL_HPP
+#define POOLSTL_HPP
+
+
+#ifndef POOLSTL_EXECUTION_HPP
+#define POOLSTL_EXECUTION_HPP
+
+#include <memory>
+#include <mutex>
+#include <type_traits>
+
+
+#ifndef AL_TASK_THREAD_POOL_HPP
+#define AL_TASK_THREAD_POOL_HPP
+
+// Version macros.
+#define TASK_THREAD_POOL_VERSION_MAJOR 1
+#define TASK_THREAD_POOL_VERSION_MINOR 0
+#define TASK_THREAD_POOL_VERSION_PATCH 9
+
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <type_traits>
+
+// MSVC does not correctly set the __cplusplus macro by default, so we must read it from _MSVC_LANG
+// See https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/
+#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#define TTP_CXX17 1
+#else
+#define TTP_CXX17 0
+#endif
+
+#if TTP_CXX17
+#define TTP_NODISCARD [[nodiscard]]
+#else
+#define TTP_NODISCARD
+#endif
+
+namespace task_thread_pool {
+
+#if !TTP_CXX17
+    /**
+     * A reimplementation of std::decay_t, which is only available since C++14.
+     */
+    template <class T>
+    using decay_t = typename std::decay<T>::type;
+#endif
+
+    /**
+     * A fast and lightweight thread pool that uses C++11 threads.
+     */
+    class task_thread_pool {
+    public:
+        /**
+         * Create a task_thread_pool and start worker threads.
+         *
+         * @param num_threads Number of worker threads. If 0 then number of threads is equal to the
+         *                    number of hardware threads reported by std::thread::hardware_concurrency() (at least 1).
+         */
+        explicit task_thread_pool(unsigned int num_threads = 0) {
+            if (num_threads < 1) {
+                num_threads = std::thread::hardware_concurrency();
+                if (num_threads < 1) { num_threads = 1; }
+            }
+            start_threads(num_threads);
+        }
+
+        /**
+         * Finish all tasks left in the queue then shut down worker threads.
+         * If the pool is currently paused then it is resumed.
+         */
+        ~task_thread_pool() {
+            unpause();
+            wait_for_queued_tasks();
+            stop_all_threads();
+        }
+
+        /**
+         * Drop all tasks that have been submitted but not yet started by a worker.
+         *
+         * Tasks already in progress continue executing.
+         */
+        void clear_task_queue() {
+            const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+            tasks = {};
+        }
+
+        /**
+         * Get number of enqueued tasks.
+         *
+         * @return Number of tasks that have been enqueued but not yet started.
+         */
+        TTP_NODISCARD size_t get_num_queued_tasks() const {
+            const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+            return tasks.size();
+        }
+
+        /**
+         * Get number of in-progress tasks.
+         *
+         * @return Approximate number of tasks currently being processed by worker threads.
+         */
+        TTP_NODISCARD size_t get_num_running_tasks() const {
+            const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+            return num_inflight_tasks;
+        }
+
+        /**
+         * Get total number of tasks in the pool.
+         *
+         * @return Approximate number of tasks both enqueued and running.
+         */
+        TTP_NODISCARD size_t get_num_tasks() const {
+            const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+            return tasks.size() + num_inflight_tasks;
+        }
+
+        /**
+         * Get number of worker threads.
+         *
+         * @return Number of worker threads.
+         */
+        TTP_NODISCARD unsigned int get_num_threads() const {
+            const std::lock_guard<std::recursive_mutex> threads_lock(thread_mutex);
+            return static_cast<unsigned int>(threads.size());
+        }
+
+        /**
+         * Set number of worker threads. Will start or stop worker threads as necessary.
+         *
+         * @param num_threads Number of worker threads. If 0 then number of threads is equal to the
+         *                    number of hardware threads reported by std::thread::hardware_concurrency() (at least 1).
+         * @return Previous number of worker threads.
+         */
+        unsigned int set_num_threads(unsigned int num_threads) {
+            const std::lock_guard<std::recursive_mutex> threads_lock(thread_mutex); // recursive: get_num_threads() below locks thread_mutex again
+            unsigned int previous_num_threads = get_num_threads();
+
+            if (num_threads < 1) { // 0 means "pick automatically"
+                num_threads = std::thread::hardware_concurrency();
+                if (num_threads < 1) { num_threads = 1; } // hardware_concurrency() may report 0
+            }
+
+            if (previous_num_threads <= num_threads) {
+                // expanding the thread pool
+                start_threads(num_threads - previous_num_threads); // starts only the difference (zero when the count is unchanged)
+            } else {
+                // contracting the thread pool
+                stop_all_threads(); // defined outside this excerpt; presumably clears pool_running and joins every worker
+                {
+                    const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+                    pool_running = true; // re-arm the flag that worker_main() checks before taking tasks
+                }
+                start_threads(num_threads); // rebuild the pool at the smaller size
+            }
+
+            return previous_num_threads;
+        }
+
+        /**
+         * Stop executing queued tasks. Use `unpause()` to resume. Note: Destroying the pool will implicitly unpause.
+         *
+         * Any in-progress tasks continue executing.
+         */
+        void pause() {
+            const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+            pool_paused = true;
+        }
+
+        /**
+         * Resume executing queued tasks.
+         */
+        void unpause() {
+            const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+            pool_paused = false;
+            task_cv.notify_all();
+        }
+
+        /**
+         * Check whether the pool is paused.
+         *
+         * @return true if pause() has been called without an intervening unpause().
+         */
+        TTP_NODISCARD bool is_paused() const {
+            const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+            return pool_paused;
+        }
+
+        /**
+         * Submit a Callable for the pool to execute and return a std::future.
+         *
+         * @param func The Callable to execute. Can be a function, a lambda, std::packaged_task, std::function, etc.
+         * @param args Arguments for func. Optional.
+         * @return std::future that can be used to get func's return value or thrown exception.
+         */
+        template <typename F, typename... A,
+#if TTP_CXX17
+            typename R = std::invoke_result_t<std::decay_t<F>, std::decay_t<A>...> // R: result type of calling the decayed callable with the decayed arguments
+#else
+            typename R = typename std::result_of<decay_t<F>(decay_t<A>...)>::type // same computation using the pre-C++17 trait
+#endif
+            >
+        TTP_NODISCARD std::future<R> submit(F&& func, A&&... args) {
+            std::shared_ptr<std::packaged_task<R()>> ptask = // std::bind stores func and args inside a zero-argument packaged_task
+                std::make_shared<std::packaged_task<R()>>(std::bind(std::forward<F>(func), std::forward<A>(args)...));
+            submit_detach([ptask] { (*ptask)(); }); // the queued lambda shares ownership of the task, keeping it alive until it has run
+            return ptask->get_future(); // obtained after enqueueing, so the task may already be running or finished
+        }
+
+        /**
+         * Submit a zero-argument Callable for the pool to execute.
+         *
+         * @param func The Callable to execute. Can be a function, a lambda, std::packaged_task, std::function, etc.
+         */
+        template <typename F>
+        void submit_detach(F&& func) {
+            const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+            tasks.emplace(std::forward<F>(func));
+            task_cv.notify_one();
+        }
+
+        /**
+         * Submit a Callable with arguments for the pool to execute.
+         *
+         * @param func The Callable to execute. Can be a function, a lambda, std::packaged_task, std::function, etc.
+         */
+        template <typename F, typename... A>
+        void submit_detach(F&& func, A&&... args) {
+            const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+            tasks.emplace(std::bind(std::forward<F>(func), std::forward<A>(args)...));
+            task_cv.notify_one();
+        }
+
+        /**
+         * Block until the task queue is empty. Some tasks may be in-progress when this method returns.
+         */
+        void wait_for_queued_tasks() {
+            std::unique_lock<std::mutex> tasks_lock(task_mutex);
+            notify_task_finish = true; // ask workers to signal task_finished_cv whenever they complete a task
+            task_finished_cv.wait(tasks_lock, [&] { return tasks.empty(); }); // re-checked on each such signal, i.e. when a task finishes rather than when one is dequeued
+            notify_task_finish = false;
+        }
+
+        /**
+         * Block until all tasks have finished, i.e. the queue is empty and no worker is still running a task.
+         */
+        void wait_for_tasks() {
+            std::unique_lock<std::mutex> tasks_lock(task_mutex);
+            notify_task_finish = true;  // ask workers to signal task_finished_cv whenever they finish a task
+            task_finished_cv.wait(tasks_lock, [&] { return tasks.empty() && num_inflight_tasks == 0; });
+            notify_task_finish = false;  // NOTE(review): flag is shared by all waiters - confirm concurrent waits are safe
+        }
+
+    protected:
+
+        /**
+         * Main function for worker threads: repeatedly pops a task from the queue and runs it until the pool stops.
+         */
+        void worker_main() {
+            bool finished_task = false;  // true once this worker has run a task in a previous iteration
+
+            while (true) {
+                std::unique_lock<std::mutex> tasks_lock(task_mutex);
+
+                if (finished_task) {  // account for the task run in the previous iteration, now under task_mutex
+                    --num_inflight_tasks;
+                    if (notify_task_finish) {  // set while a wait_for_*() call is blocked
+                        task_finished_cv.notify_all();
+                    }
+                }
+
+                task_cv.wait(tasks_lock, [&]() { return !pool_running || (!pool_paused && !tasks.empty()); });
+
+                if (!pool_running) {  // shutting down: exit even if tasks are still queued
+                    break;
+                }
+
+                // Must mean that (!pool_paused && !tasks.empty()) is true
+
+                std::packaged_task<void()> task{std::move(tasks.front())};
+                tasks.pop();
+                ++num_inflight_tasks;
+                tasks_lock.unlock();  // run the task without holding task_mutex
+
+                try {
+                    task();
+                } catch (...) {
+                    // std::packaged_task::operator() may throw in some error conditions, such as if the task
+                    // had already been run. Nothing that the pool can do anything about.
+                }
+
+                finished_task = true;
+            }
+        }
+
+        /**
+         * Launch additional worker threads, each running worker_main().
+         *
+         * @param num_threads The number of worker threads to add.
+         */
+        void start_threads(const unsigned int num_threads) {
+            const std::lock_guard<std::recursive_mutex> guard(thread_mutex);
+            // Counts down rather than up; exactly num_threads workers are appended either way.
+            for (unsigned int remaining = num_threads; remaining > 0; --remaining) {
+                threads.emplace_back(&task_thread_pool::worker_main, this);
+            }
+        }
+
+        /**
+         * Stop, join, and destroy all worker threads. Queued tasks are not removed from the queue by this method.
+         */
+        void stop_all_threads() {
+            const std::lock_guard<std::recursive_mutex> threads_lock(thread_mutex);
+
+            {  // task_mutex is held only while flipping the flag, and released before joining
+                const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+                pool_running = false;
+                task_cv.notify_all();  // wake every waiting worker so it sees pool_running == false
+            }
+
+            for (auto& thread : threads) {
+                if (thread.joinable()) {
+                    thread.join();
+                }
+            }
+            threads.clear();
+        }
+
+        /**
+         * The worker threads, each running worker_main().
+         *
+         * Access protected by thread_mutex.
+         */
+        std::vector<std::thread> threads;
+
+        /**
+         * A mutex for methods that start/stop threads.
+         */
+        mutable std::recursive_mutex thread_mutex;
+
+        /**
+         * The task queue. Tasks are appended by submit_detach() and popped from the front by worker_main().
+         *
+         * Access protected by task_mutex.
+         */
+        std::queue<std::packaged_task<void()>> tasks = {};
+
+        /**
+         * A mutex for all variables related to tasks.
+         */
+        mutable std::mutex task_mutex;
+
+        /**
+         * Used to notify changes to the task queue, such as a new task added, pause/unpause, etc.
+         */
+        std::condition_variable task_cv;
+
+        /**
+         * Used to notify of finished tasks. Waited on by wait_for_tasks() and wait_for_queued_tasks().
+         */
+        std::condition_variable task_finished_cv;
+
+        /**
+         * A signal for worker threads that the pool is either running or shutting down.
+         *
+         * Access protected by task_mutex.
+         */
+        bool pool_running = true;
+
+        /**
+         * A signal for worker threads to not pull new tasks from the queue.
+         *
+         * Access protected by task_mutex.
+         */
+        bool pool_paused = false;
+
+        /**
+         * A signal for worker threads that they should notify task_finished_cv when they finish a task.
+         * Set to true by wait_for_tasks() and wait_for_queued_tasks() for the duration of their wait.
+         * Access protected by task_mutex.
+         */
+        bool notify_task_finish = false;
+
+        /**
+         * A counter of the number of tasks in-progress by worker threads.
+         * Incremented when a task is popped off the task queue and decremented when that task is complete.
+         *
+         * Access protected by task_mutex.
+         */
+        int num_inflight_tasks = 0;
+    };
+}
+
+// clean up
+#undef TTP_NODISCARD
+#undef TTP_CXX17
+
+#endif
+
+#ifndef POOLSTL_INTERNAL_UTILS_HPP
+#define POOLSTL_INTERNAL_UTILS_HPP
+
+// Version macros.
+#define POOLSTL_VERSION_MAJOR 0
+#define POOLSTL_VERSION_MINOR 3
+#define POOLSTL_VERSION_PATCH 1
+
+#include <cstddef>
+#include <iterator>
+
+#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)  // compiling as C++17 or newer
+#define POOLSTL_HAVE_CXX17 1
+#define POOLSTL_NO_DISCARD [[nodiscard]]
+#else  // older standard: the attribute macro expands to nothing
+#define POOLSTL_HAVE_CXX17 0
+#define POOLSTL_NO_DISCARD
+#endif
+
+#if POOLSTL_HAVE_CXX17 && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 9)  // C++17, and _GLIBCXX_RELEASE (if set) >= 9
+#define POOLSTL_HAVE_CXX17_LIB 1
+#else  // when 0, poolstl::internal::cpp17 supplies its own reduce/transform instead of aliasing std
+#define POOLSTL_HAVE_CXX17_LIB 0
+#endif
+
+#if __cplusplus >= 201402L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L)  // compiling as C++14 or newer
+#define POOLSTL_HAVE_CXX14 1
+#else
+#define POOLSTL_HAVE_CXX14 0
+#endif
+
+namespace poolstl {
+    namespace internal {
+        // Ceiling of num_steps / num_threads. A thread count of 0 yields one chunk of num_steps (no division by zero).
+        inline constexpr std::size_t get_chunk_size(std::size_t num_steps, unsigned int num_threads) {
+            return num_threads == 0 ? num_steps : (num_steps / num_threads) + ((num_steps % num_threads) > 0 ? 1 : 0);
+        }
+        // Iterator flavour: sizes chunks for the range [first, last) through the std::size_t overload.
+        template<typename Iterator>
+        constexpr typename std::iterator_traits<Iterator>::difference_type
+        get_chunk_size(Iterator first, Iterator last, unsigned int num_threads) {
+            using distance_t = typename std::iterator_traits<Iterator>::difference_type;
+            return static_cast<distance_t>(get_chunk_size(static_cast<std::size_t>(std::distance(first, last)), num_threads));
+        }
+        // Length of the chunk starting at iter: chunk_size, or the distance to last when fewer elements remain.
+        template<typename Iterator>
+        constexpr typename std::iterator_traits<Iterator>::difference_type
+        get_iter_chunk_size(const Iterator& iter, const Iterator& last,
+                            typename std::iterator_traits<Iterator>::difference_type chunk_size) {
+            return std::min<typename std::iterator_traits<Iterator>::difference_type>(std::distance(iter, last), chunk_size);
+        }
+        // Returns a copy of iter moved by offset positions; the caller's iterator is left untouched.
+        template<typename Iterator>
+        Iterator advanced(Iterator iter, typename std::iterator_traits<Iterator>::difference_type offset) {
+            // iter is already a by-value copy, so it can be advanced in place.
+            std::advance(iter, offset);
+            return iter;
+        }
+
+        /**
+         * An iterator wrapper whose dereference calls std::future<>::get() on the element the wrapped iterator points at.
+         * Only the `iter` member tracks the position; the inherited Iterator base is default-constructed and never moved.
+         * @tparam Iterator An iterator over futures (anything whose dereferenced value has a get() method).
+         */
+        template<typename Iterator>
+        class getting_iter : public Iterator {
+        public:
+            using value_type = decltype((*std::declval<Iterator>()).get());
+            using difference_type = typename std::iterator_traits<Iterator>::difference_type;
+            using pointer = value_type*;
+            using reference = value_type&;
+            explicit getting_iter(Iterator iter) : iter(iter) {}
+            // Prefix increment returns *this by reference (not a copy), as iterators conventionally do.
+            getting_iter& operator++() { ++iter; return *this; }
+            getting_iter operator++(int) { getting_iter ret(*this); ++iter; return ret; }
+            // Both accessors call get() on the future they reach, returning whatever get() returns.
+            value_type operator*() { return (*iter).get(); }
+            value_type operator[](difference_type offset) { return iter[offset].get(); }
+            // Equality compares the wrapped iterators only.
+            bool operator==(const getting_iter<Iterator> &other) const { return iter == other.iter; }
+            bool operator!=(const getting_iter<Iterator> &other) const { return iter != other.iter; }
+
+        protected:
+            Iterator iter;
+        };
+        // Wraps iter in a getting_iter, whose dereference calls get() on the element iter points at.
+        template<typename Iterator>
+        auto get_wrap(Iterator iter) -> getting_iter<Iterator> {
+            return getting_iter<Iterator>(iter);
+        }
+        // Waits for every future in the container, then calls get() on each so a stored exception is rethrown.
+        template <class Container>
+        void get_futures(Container& futures) {
+            // Wait for all tasks first: they may reference the caller's locals, which must outlive them even if
+            // an earlier get() throws and unwinds the caller.
+            for (auto &future: futures) { future.wait(); }
+            for (auto &future: futures) { future.get(); }
+        }
+
+        /*
+         * Some methods are only available with C++17 and up. Reimplement on older standards.
+         */
+#if POOLSTL_HAVE_CXX17_LIB
+        namespace cpp17 = std;
+#else
+        namespace cpp17 {
+
+            // std::reduce
+            // Sequential left fold over [first, last): init = b(init, element) for each element, in order.
+            // Returns the final accumulator (init itself for an empty range).
+            template<class InputIt, class Tp, class BinOp>
+            Tp reduce(InputIt first, InputIt last, Tp init, BinOp b) {
+                while (first != last) { init = b(init, *first); ++first; }
+                return init;
+            }
+            // Overload without init or operation: folds the range with std::plus, starting from a
+            // value-initialized element.
+            template<class InputIt>
+            typename std::iterator_traits<InputIt>::value_type reduce(InputIt first, InputIt last) {
+                using value_t = typename std::iterator_traits<InputIt>::value_type;
+                return reduce(first, last, value_t{}, std::plus<value_t>());
+            }
+
+            // std::transform
+            // Writes unary_op(x) for each x in [first1, last1) to d_first onward; returns the end of the output.
+            template<class InputIt, class OutputIt, class UnaryOperation>
+            OutputIt transform(InputIt first1, InputIt last1, OutputIt d_first,
+                               UnaryOperation unary_op) {
+                for (; first1 != last1; ++first1) {
+                    *d_first = unary_op(*first1);
+                    ++d_first;
+                }
+                return d_first;
+            }
+            // Writes binary_op(x, y) for paired elements of the two inputs to d_first onward; returns the output end.
+            template<class InputIt1, class InputIt2, class OutputIt, class BinaryOperation>
+            OutputIt transform(InputIt1 first1, InputIt1 last1, InputIt2 first2, OutputIt d_first,
+                               BinaryOperation binary_op) {
+                for (; first1 != last1; ++first1) {
+                    *d_first = binary_op(*first1, *first2);
+                    ++first2;
+                    ++d_first;
+                }
+                return d_first;
+            }
+        }
+#endif
+    }
+}
+
+#endif
+
+#if POOLSTL_HAVE_CXX17
+#include <variant>
+#endif
+
+namespace poolstl {
+
+    namespace ttp = task_thread_pool;
+
+    namespace execution {
+        namespace internal {
+            /**
+             * Creates the default thread pool on first use and returns it; parallel_policy::pool() falls back to it.
+             */
+            inline std::shared_ptr<ttp::task_thread_pool> get_default_pool() {
+                static std::shared_ptr<ttp::task_thread_pool> default_pool;
+                static std::once_flag created;
+                std::call_once(created, [&]() { default_pool = std::make_shared<ttp::task_thread_pool>(); });
+                return default_pool;
+            }
+        }
+
+        /**
+         * A sequential policy that simply forwards to the non-policy overload. Empty tag type; `seq` is its instance.
+         */
+        struct sequenced_policy {};
+
+        /**
+         * Parallel execution policy. Runs on the pool given to the constructor or on(), else on the default pool.
+         */
+        struct parallel_policy {
+            parallel_policy() = default;
+            explicit parallel_policy(ttp::task_thread_pool& on_pool): on_pool(&on_pool) {}
+
+            // Returns a parallel policy bound to the given pool.
+            parallel_policy on(ttp::task_thread_pool& pool) const { return parallel_policy(pool); }
+
+            // The pool to submit tasks to: the bound one if any, otherwise the default pool.
+            POOLSTL_NO_DISCARD ttp::task_thread_pool& pool() const {
+                if (on_pool == nullptr) {
+                    return *(internal::get_default_pool());
+                }
+                return *on_pool;
+            }
+
+        protected:
+            // Pool bound by the user; nullptr means "use the default pool".
+            ttp::task_thread_pool *on_pool = nullptr;
+        };
+
+        constexpr sequenced_policy seq{};  // ready-made instance of the sequential policy tag
+        constexpr parallel_policy par{};  // ready-made parallel policy with no pool bound, so pool() yields the default pool
+
+
+#if POOLSTL_HAVE_CXX17
+        /**
+         * Wrapper that carries a policy picked at runtime (see par_if).
+         *
+         * @tparam Variant A std::variant<> listing the policies that may be picked.
+         */
+        template <typename Variant>
+        struct variant_policy {
+            Variant var;  // the policy that was picked
+            explicit variant_policy(const Variant& policy): var(policy) {}
+        };
+
+        namespace internal {
+            // The set of policies par_if() can choose between. Names resolve in the enclosing
+            // poolstl::execution namespace; the alternative order (parallel first) is unchanged.
+            using poolstl_policy_variant = std::variant<parallel_policy, sequenced_policy>;
+        }
+
+        /**
+         * Pick the parallel or the sequential policy at runtime.
+         *
+         * @param call_par True to run in parallel on the default pool, false to run sequentially.
+         * @return A variant_policy holding `par` if call_par is true, else `seq`.
+         */
+        inline variant_policy<internal::poolstl_policy_variant> par_if(bool call_par) {
+            using chosen_t = internal::poolstl_policy_variant;
+            if (!call_par) {
+                return variant_policy<chosen_t>(chosen_t(seq));
+            }
+            return variant_policy<chosen_t>(chosen_t(par));
+        }
+
+        /**
+         * Pick the parallel or the sequential policy at runtime, running parallel work on a given pool.
+         *
+         * @param call_par True to run in parallel on `pool`, false to run sequentially.
+         * @return A variant_policy holding `par.on(pool)` if call_par is true, else `seq`.
+         */
+        inline variant_policy<internal::poolstl_policy_variant> par_if(bool call_par, ttp::task_thread_pool& pool) {
+            using chosen_t = internal::poolstl_policy_variant;
+            if (!call_par) {
+                return variant_policy<chosen_t>(chosen_t(seq));
+            }
+            return variant_policy<chosen_t>(chosen_t(par.on(pool)));
+        }
+#endif
+    }
+
+    using execution::seq;
+    using execution::par;
+#if POOLSTL_HAVE_CXX17
+    using execution::variant_policy;
+    using execution::par_if;
+#endif
+
+    namespace internal {
+        /**
+         * SFINAE helper: names Tp only when ExecPolicy, with reference and cv-qualifiers removed, is
+         * sequenced_policy; otherwise the alias has no type.
+         * Intended for enabling/disabling seq overloads during overload resolution.
+         */
+        template <class ExecPolicy, class Tp>
+        using enable_if_seq = typename std::enable_if<
+            std::is_same<typename std::remove_cv<typename std::remove_reference<ExecPolicy>::type>::type,
+                         poolstl::execution::sequenced_policy>::value, Tp>::type;
+
+        /**
+         * SFINAE helper: names Tp only when ExecPolicy, with reference and cv-qualifiers removed, is
+         * parallel_policy; otherwise the alias has no type.
+         * Used as the return type of the parallel overloads so they only take part for that policy.
+         */
+        template <class ExecPolicy, class Tp>
+        using enable_if_par = typename std::enable_if<
+            std::is_same<typename std::remove_cv<typename std::remove_reference<ExecPolicy>::type>::type,
+                         poolstl::execution::parallel_policy>::value, Tp>::type;
+
+#if POOLSTL_HAVE_CXX17
+        /**
+         * Type trait used by enable_if_poolstl_variant: true_type for variant_policy<V>, false_type otherwise.
+         */
+        template <typename T> struct is_poolstl_variant_policy : std::false_type {};
+        template <typename V>
+        struct is_poolstl_variant_policy<poolstl::execution::variant_policy<V>> : std::true_type {};
+
+        /**
+         * SFINAE helper: names Tp only when ExecPolicy, with reference and cv-qualifiers removed, is a
+         * variant_policy<...> (the type returned by par_if); otherwise the alias has no type.
+         * Intended for enabling/disabling variant_policy overloads during overload resolution.
+         */
+        template <class ExecPolicy, class Tp>
+        using enable_if_poolstl_variant = typename std::enable_if<
+            is_poolstl_variant_policy<
+                typename std::remove_cv<typename std::remove_reference<ExecPolicy>::type>::type>::value, Tp>::type;
+#endif
+    }
+}
+
+#endif
+
+#ifndef POOLSTL_ALGORITHM_HPP
+#define POOLSTL_ALGORITHM_HPP
+
+#include <functional>
+
+
+#ifndef POOLSTL_INTERNAL_TTP_IMPL_HPP
+#define POOLSTL_INTERNAL_TTP_IMPL_HPP
+
+#include <algorithm>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+
+namespace poolstl {
+    namespace internal {
+
+#if POOLSTL_HAVE_CXX17_LIB
+        /**
+         * Submit one pool task per element of args_list; each task calls std::apply(op, element).
+         * Returns one future per element, in iteration order.
+         */
+        template <class ExecPolicy, class Op, class ArgContainer>
+        std::vector<std::future<void>>
+        parallel_apply(ExecPolicy &&policy, Op op, const ArgContainer& args_list) {
+            std::vector<std::future<void>> pending;
+            auto& pool = policy.pool();
+            // The closure owns a copy of op; submit() receives a copy of it and of the element for every task.
+            const auto invoke_op = [op](const auto& tuple_args) { std::apply(op, tuple_args); };
+            for (const auto& element : args_list) {
+                pending.emplace_back(pool.submit(invoke_op, element));
+            }
+            return pending;
+        }
+#endif
+
+        /**
+         * Split [first, last) into consecutive chunks and submit chunk(chunk_first, chunk_last) to the policy's
+         * pool once per chunk.
+         *
+         * @param extra_split_factor Multiplies the pool's thread count when sizing chunks (more, smaller tasks).
+         * @return One future per submitted chunk, in range order.
+         */
+        template <class ExecPolicy, class RandIt, class Chunk>
+        std::vector<std::future<decltype(std::declval<Chunk>()(std::declval<RandIt>(), std::declval<RandIt>()))>>
+        parallel_chunk_for(ExecPolicy &&policy, RandIt first, RandIt last, Chunk chunk, int extra_split_factor = 1) {
+            using chunk_result_t = decltype(std::declval<Chunk>()(std::declval<RandIt>(), std::declval<RandIt>()));
+            std::vector<std::future<chunk_result_t>> pending;
+            auto& pool = policy.pool();
+            const auto max_chunk_len = get_chunk_size(first, last, extra_split_factor * pool.get_num_threads());
+
+            // chunk_end is assigned at the top of every pass; the step expression then moves first onto it.
+            for (RandIt chunk_end = first; first < last; first = chunk_end) {
+                chunk_end = advanced(first, get_iter_chunk_size(first, last, max_chunk_len));
+                pending.emplace_back(pool.submit(chunk, first, chunk_end));
+            }
+
+            return pending;
+        }
+
+        /**
+         * Element-wise chunking of two ranges: [first1, last1) is split into consecutive chunks and
+         * chunk(chunk_first1, chunk_last1, chunk_first2) is submitted to the policy's pool once per chunk,
+         * where chunk_first2 is first2 advanced by the same offset as chunk_first1.
+         *
+         * @return One future per submitted chunk, in range order.
+         */
+        template <class ExecPolicy, class RandIt1, class RandIt2, class Chunk>
+        std::vector<std::future<decltype(std::declval<Chunk>()(
+            std::declval<RandIt1>(),
+            std::declval<RandIt1>(),
+            std::declval<RandIt2>()))>>
+        parallel_chunk_for(ExecPolicy &&policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, Chunk chunk) {
+            using chunk_result_t = decltype(std::declval<Chunk>()(
+                std::declval<RandIt1>(), std::declval<RandIt1>(), std::declval<RandIt2>()));
+            std::vector<std::future<chunk_result_t>> pending;
+            auto& pool = policy.pool();
+            const auto max_chunk_len = get_chunk_size(first1, last1, pool.get_num_threads());
+
+            // chunk_end is assigned at the top of every pass; the step expression then moves first1 onto it.
+            for (RandIt1 chunk_end = first1; first1 < last1; first1 = chunk_end) {
+                const auto chunk_len = get_iter_chunk_size(first1, last1, max_chunk_len);
+                chunk_end = advanced(first1, chunk_len);
+                pending.emplace_back(pool.submit(chunk, first1, chunk_end, first2));
+                std::advance(first2, chunk_len);  // keep the second range in step with the first
+            }
+
+            return pending;
+        }
+
+        /**
+         * Element-wise chunking of three ranges: [first1, last1) is split into consecutive chunks and
+         * chunk(chunk_first1, chunk_last1, chunk_first2, chunk_first3) is submitted to the policy's pool once
+         * per chunk, where the second and third iterators are advanced by the same offset as the first.
+         *
+         * @return One future per submitted chunk, in range order.
+         */
+        template <class ExecPolicy, class RandIt1, class RandIt2, class RandIt3, class Chunk>
+        std::vector<std::future<decltype(std::declval<Chunk>()(
+            std::declval<RandIt1>(),
+            std::declval<RandIt1>(),
+            std::declval<RandIt2>(),
+            std::declval<RandIt3>()))>>
+        parallel_chunk_for(ExecPolicy &&policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, RandIt3 first3,
+                           Chunk chunk) {
+            using chunk_result_t = decltype(std::declval<Chunk>()(
+                std::declval<RandIt1>(), std::declval<RandIt1>(),
+                std::declval<RandIt2>(), std::declval<RandIt3>()));
+            std::vector<std::future<chunk_result_t>> pending;
+            auto& pool = policy.pool();
+            const auto max_chunk_len = get_chunk_size(first1, last1, pool.get_num_threads());
+
+            // chunk_end is assigned at the top of every pass; the step expression then moves first1 onto it.
+            for (RandIt1 chunk_end = first1; first1 < last1; first1 = chunk_end) {
+                const auto chunk_len = get_iter_chunk_size(first1, last1, max_chunk_len);
+                chunk_end = advanced(first1, chunk_len);
+                pending.emplace_back(pool.submit(chunk, first1, chunk_end, first2, first3));
+                std::advance(first2, chunk_len);  // keep the other two ranges in step with the first
+                std::advance(first3, chunk_len);
+            }
+
+            return pending;
+        }
+
+        /**
+         * Sort a range in parallel: chunks are sorted by pool tasks, then adjacent sorted ranges are merged pairwise.
+         *
+         * @param stable Whether to use std::stable_sort or std::sort
+         */
+        template <class ExecPolicy, class RandIt, class Compare>
+        void parallel_sort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp, bool stable) {
+            if (first == last) {
+                return;
+            }
+
+            // Sort chunks in parallel; each task returns the (begin, end) pair of the chunk it sorted
+            auto futures = parallel_chunk_for(std::forward<ExecPolicy>(policy), first, last,
+                             [&comp, stable] (RandIt chunk_first, RandIt chunk_last) {
+                                 if (stable) {
+                                     std::stable_sort(chunk_first, chunk_last, comp);
+                                 } else {
+                                     std::sort(chunk_first, chunk_last, comp);
+                                 }
+                                 return std::make_pair(chunk_first, chunk_last);
+                             });
+
+            // Merge the sorted ranges; each pass of the do-loop merges neighbouring pairs, until one future is left
+            using SortedRange = std::pair<RandIt, RandIt>;
+            auto& task_pool = policy.pool();
+            std::vector<SortedRange> subranges;
+            do {
+                for (auto& future : futures) {  // collect the ranges finished by the previous pass
+                    subranges.emplace_back(future.get());
+                }
+                futures.clear();
+
+                for (std::size_t i = 0; i < subranges.size(); ++i) {
+                    if (i + 1 < subranges.size()) {
+                        // pair up and merge; lhs.second is passed as the middle of the merged range
+                        auto& lhs = subranges[i];
+                        auto& rhs = subranges[i + 1];
+                        futures.emplace_back(task_pool.submit([&comp] (RandIt chunk_first, RandIt chunk_middle,
+                                                                       RandIt chunk_last) {
+                            std::inplace_merge(chunk_first, chunk_middle, chunk_last, comp);
+                            return std::make_pair(chunk_first, chunk_last);
+                        }, lhs.first, lhs.second, rhs.second));
+                        ++i;  // skip rhs, which this merge consumes
+                    } else {
+                        // forward the final extra range through an already-satisfied future
+                        std::promise<SortedRange> p;
+                        futures.emplace_back(p.get_future());
+                        p.set_value(subranges[i]);
+                    }
+                }
+
+                subranges.clear();
+            } while (futures.size() > 1);
+            futures.front().get();  // wait for the last merge (or the only chunk) and rethrow any stored exception
+        }
+    }
+}
+
+#endif
+
+namespace std {
+
+    /**
+     * Parallel std::copy: each chunk of [first, last) is copied to the matching offset of dest by a pool task.
+     * NOTE: Iterators are expected to be random access. See https://en.cppreference.com/w/cpp/algorithm/copy
+     */
+    template <class ExecPolicy, class RandIt1, class RandIt2>
+    poolstl::internal::enable_if_par<ExecPolicy, RandIt2>
+    copy(ExecPolicy &&policy, RandIt1 first, RandIt1 last, RandIt2 dest) {
+        const auto copy_chunk = [](RandIt1 src, RandIt1 src_end, RandIt2 out) { std::copy(src, src_end, out); };
+        auto pending = poolstl::internal::parallel_chunk_for(
+            std::forward<ExecPolicy>(policy), first, last, dest, copy_chunk);
+        poolstl::internal::get_futures(pending);
+        // All chunks are done; the result is dest moved past the copied elements.
+        return poolstl::internal::advanced(dest, std::distance(first, last));
+    }
+
+    /**
+     * Parallel std::copy_n: copies n elements starting at first to dest; a non-positive n copies nothing.
+     * NOTE: Iterators are expected to be random access. See https://en.cppreference.com/w/cpp/algorithm/copy_n
+     */
+    template <class ExecPolicy, class RandIt1, class Size, class RandIt2>
+    poolstl::internal::enable_if_par<ExecPolicy, RandIt2>
+    copy_n(ExecPolicy &&policy, RandIt1 first, Size n, RandIt2 dest) {
+        if (n <= 0) {
+            return dest;
+        }
+        // The parallel copy already returns dest advanced by the n elements it copied.
+        const RandIt1 src_end = poolstl::internal::advanced(first, n);
+        return std::copy(std::forward<ExecPolicy>(policy), first, src_end, dest);
+    }
+
+    /**
+     * Parallel std::count_if: every chunk is counted by a pool task and the per-chunk counts are summed.
+     * NOTE: Iterators are expected to be random access. See https://en.cppreference.com/w/cpp/algorithm/count_if
+     */
+    template <class ExecPolicy, class RandIt, class UnaryPredicate>
+    poolstl::internal::enable_if_par<ExecPolicy, typename iterator_traits<RandIt>::difference_type>
+    count_if(ExecPolicy&& policy, RandIt first, RandIt last, UnaryPredicate p) {
+        using count_t = typename iterator_traits<RandIt>::difference_type;
+        const auto count_chunk = [&p](RandIt chunk_first, RandIt chunk_last) {
+            return std::count_if(chunk_first, chunk_last, p);
+        };
+        auto partial_counts =
+            poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first, last, count_chunk);
+        // Sum the per-chunk counts; dereferencing a wrapped iterator calls get() on that chunk's future.
+        return poolstl::internal::cpp17::reduce(
+            poolstl::internal::get_wrap(partial_counts.begin()),
+            poolstl::internal::get_wrap(partial_counts.end()), static_cast<count_t>(0), std::plus<count_t>());
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access. See std::count https://en.cppreference.com/w/cpp/algorithm/count
+     * Each element is compared in its own type; binding it to `const T&` could convert it first (e.g. 1.5 -> 1 for int).
+     */
+    template <class ExecPolicy, class RandIt, class T>
+    poolstl::internal::enable_if_par<ExecPolicy, typename iterator_traits<RandIt>::difference_type>
+    count(ExecPolicy&& policy, RandIt first, RandIt last, const T& value) {
+        return std::count_if(std::forward<ExecPolicy>(policy), first, last,
+            [&value](const typename iterator_traits<RandIt>::value_type& test) { return test == value; });
+    }
+
+    /**
+     * Parallel std::fill: every chunk of [first, last) is assigned value by a pool task.
+     * NOTE: Iterators are expected to be random access. See https://en.cppreference.com/w/cpp/algorithm/fill
+     */
+    template <class ExecPolicy, class RandIt, class Tp>
+    poolstl::internal::enable_if_par<ExecPolicy, void>
+    fill(ExecPolicy &&policy, RandIt first, RandIt last, const Tp& value) {
+        const auto fill_chunk = [&value](RandIt chunk_first, RandIt chunk_last) {
+            std::fill(chunk_first, chunk_last, value);
+        };
+        auto pending = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first, last, fill_chunk);
+        poolstl::internal::get_futures(pending);
+    }
+
+    /**
+     * Parallel std::fill_n: assigns value to n elements starting at first and returns the end of the filled range.
+     * NOTE: Iterators are expected to be random access. See https://en.cppreference.com/w/cpp/algorithm/fill_n
+     */
+    template <class ExecPolicy, class RandIt, class Size, class Tp>
+    poolstl::internal::enable_if_par<ExecPolicy, RandIt>
+    fill_n(ExecPolicy &&policy, RandIt first, Size n, const Tp& value) {
+        if (n <= 0) {
+            return first;  // nothing to fill
+        }
+        const RandIt fill_end = poolstl::internal::advanced(first, n);
+        std::fill(std::forward<ExecPolicy>(policy), first, fill_end, value);
+        return fill_end;
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access. Chunks are searched by pool tasks; the lowest match wins.
+     * See std::find_if https://en.cppreference.com/w/cpp/algorithm/find_if
+     */
+    template <class ExecPolicy, class RandIt, class UnaryPredicate>
+    poolstl::internal::enable_if_par<ExecPolicy, RandIt>
+    find_if(ExecPolicy &&policy, RandIt first, RandIt last, UnaryPredicate p) {
+        using diff_t = typename std::iterator_traits<RandIt>::difference_type;
+        diff_t n = std::distance(first, last);
+        std::atomic<diff_t> extremum(n);  // smallest matching offset from first seen so far; n means "none yet"
+
+        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first, last,
+                                        [&first, &extremum, &p](RandIt chunk_first, RandIt chunk_last) {
+                                            if (std::distance(first, chunk_first) > extremum) {
+                                             // already found by another task, at an offset before this chunk
+                                             return;
+                                            }
+
+                                            RandIt chunk_res = std::find_if(chunk_first, chunk_last, p);
+                                            if (chunk_res != chunk_last) {
+                                                // Found, update extremum using a priority update CAS, as discussed in
+                                                // "Reducing Contention Through Priority Updates", PPoPP '13
+                                                const diff_t k = std::distance(first, chunk_res);
+                                                for (diff_t old = extremum; k < old; old = extremum) {
+                                                    extremum.compare_exchange_weak(old, k);
+                                                }
+                                            }
+                                        }, 8); // use small tasks so later ones may exit early if item is already found
+        poolstl::internal::get_futures(futures);
+        return extremum == n ? last : first + extremum;  // extremum still equal to n means no element matched
+    }
+
+    /**
+     * Parallel std::find_if_not, implemented as the parallel find_if with the predicate negated.
+     * NOTE: Iterators are expected to be random access. See https://en.cppreference.com/w/cpp/algorithm/find_if_not
+     */
+    template <class ExecPolicy, class RandIt, class UnaryPredicate>
+    poolstl::internal::enable_if_par<ExecPolicy, RandIt>
+    find_if_not(ExecPolicy &&policy, RandIt first, RandIt last, UnaryPredicate p) {
+        const auto negated = [&p](const typename std::iterator_traits<RandIt>::value_type& test) { return !p(test); };
+        return std::find_if(std::forward<ExecPolicy>(policy), first, last, negated);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access. See std::find https://en.cppreference.com/w/cpp/algorithm/find
+     * Each element is compared in its own type; binding it to `const T&` could convert it first (e.g. 1.5 -> 1 for int).
+     */
+    template <class ExecPolicy, class RandIt, class T>
+    poolstl::internal::enable_if_par<ExecPolicy, RandIt>
+    find(ExecPolicy &&policy, RandIt first, RandIt last, const T& value) {
+        return std::find_if(std::forward<ExecPolicy>(policy), first, last,
+            [&value](const typename std::iterator_traits<RandIt>::value_type& test) { return value == test; });
+    }
+
+    /**
+     * Parallel std::for_each: f is called on every element, with each chunk of the range handled by a pool task.
+     * NOTE: Iterators are expected to be random access. See https://en.cppreference.com/w/cpp/algorithm/for_each
+     */
+    template <class ExecPolicy, class RandIt, class UnaryFunction>
+    poolstl::internal::enable_if_par<ExecPolicy, void>
+    for_each(ExecPolicy &&policy, RandIt first, RandIt last, UnaryFunction f) {
+        const auto apply_chunk = [&f](RandIt chunk_first, RandIt chunk_last) {
+            while (chunk_first != chunk_last) {
+                f(*chunk_first);
+                ++chunk_first;
+            }
+        };
+        auto pending = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first, last, apply_chunk);
+        poolstl::internal::get_futures(pending);
+    }
+
+    /**
+     * Parallel std::for_each_n: calls f on n elements starting at first and returns the end of that range.
+     * NOTE: Iterators are expected to be random access. See https://en.cppreference.com/w/cpp/algorithm/for_each_n
+     */
+    template <class ExecPolicy, class RandIt, class Size, class UnaryFunction>
+    poolstl::internal::enable_if_par<ExecPolicy, RandIt>
+    for_each_n(ExecPolicy &&policy, RandIt first, Size n, UnaryFunction f) {
+        const RandIt range_end = poolstl::internal::advanced(first, n);
+        std::for_each(std::forward<ExecPolicy>(policy), first, range_end, f);
+        return range_end;
+    }
+
+    /**
+     * Parallel std::sort with a caller-supplied comparator; delegates to parallel_sort in non-stable mode.
+     * NOTE: Iterators are expected to be random access. See https://en.cppreference.com/w/cpp/algorithm/sort
+     */
+    template <class ExecPolicy, class RandIt, class Compare>
+    poolstl::internal::enable_if_par<ExecPolicy, void>
+    sort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp) {
+        poolstl::internal::parallel_sort(std::forward<ExecPolicy>(policy), first, last, comp, /*stable=*/false);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access.
+     * See std::sort https://en.cppreference.com/w/cpp/algorithm/sort
+     */
+    template <class ExecPolicy, class RandIt>
+    poolstl::internal::enable_if_par<ExecPolicy, void>
+    sort(ExecPolicy &&policy, RandIt first, RandIt last) {
+        using T = typename std::iterator_traits<RandIt>::value_type;
+        poolstl::internal::parallel_sort(std::forward<ExecPolicy>(policy), first, last, std::less<T>(), false);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access.
+     * See std::stable_sort https://en.cppreference.com/w/cpp/algorithm/stable_sort
+     */
+    template <class ExecPolicy, class RandIt, class Compare>
+    poolstl::internal::enable_if_par<ExecPolicy, void>
+    stable_sort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp) {
+        poolstl::internal::parallel_sort(std::forward<ExecPolicy>(policy), first, last, comp, true);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access.
+     * See std::stable_sort https://en.cppreference.com/w/cpp/algorithm/stable_sort
+     */
+    template <class ExecPolicy, class RandIt>
+    poolstl::internal::enable_if_par<ExecPolicy, void>
+    stable_sort(ExecPolicy &&policy, RandIt first, RandIt last) {
+        using T = typename std::iterator_traits<RandIt>::value_type;
+        poolstl::internal::parallel_sort(std::forward<ExecPolicy>(policy), first, last, std::less<T>(), true);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access.
+     * See std::transform https://en.cppreference.com/w/cpp/algorithm/transform
+     */
+    template <class ExecPolicy, class RandIt1, class RandIt2, class UnaryOperation>
+    poolstl::internal::enable_if_par<ExecPolicy, RandIt2>
+    transform(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1,
+              RandIt2 dest, UnaryOperation unary_op) {
+
+        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first1, last1, dest,
+                 [&unary_op](RandIt1 chunk_first1, RandIt1 chunk_last1, RandIt2 dest_first) {
+                      return poolstl::internal::cpp17::transform(chunk_first1, chunk_last1, dest_first, unary_op);
+                 });
+        poolstl::internal::get_futures(futures);
+        return dest + std::distance(first1, last1);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access; the two inputs may use different iterator types.
+     * Returns `dest` advanced by `last1 - first1`. See https://en.cppreference.com/w/cpp/algorithm/transform
+     */
+    template <class ExecPolicy, class RandIt1, class RandIt2, class RandIt3, class BinaryOperation>
+    poolstl::internal::enable_if_par<ExecPolicy, RandIt3>
+    transform(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1,
+              RandIt2 first2, RandIt3 dest, BinaryOperation binary_op) {
+        // chunk_first2 must be typed RandIt2: the second range need not share the first range's iterator type.
+        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first1, last1,
+                                                             first2, dest,
+                 [&binary_op](RandIt1 chunk_first1, RandIt1 chunk_last1, RandIt2 chunk_first2, RandIt3 dest_first) {
+                     return poolstl::internal::cpp17::transform(chunk_first1, chunk_last1,
+                                                                chunk_first2, dest_first, binary_op);
+                 });
+        poolstl::internal::get_futures(futures);
+        return dest + std::distance(first1, last1);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access.
+     * See std::all_of https://en.cppreference.com/w/cpp/algorithm/all_of
+     */
+    template <class ExecPolicy, typename RandIt, typename Predicate>
+    poolstl::internal::enable_if_par<ExecPolicy, bool>
+    all_of(ExecPolicy&& policy, RandIt first, RandIt last, Predicate pred) {
+        return last == std::find_if_not(std::forward<ExecPolicy>(policy), first, last, pred);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access.
+     * See std::none_of https://en.cppreference.com/w/cpp/algorithm/none_of
+     */
+    template <class ExecPolicy, typename RandIt, typename Predicate>
+    poolstl::internal::enable_if_par<ExecPolicy, bool>
+    none_of(ExecPolicy&& policy, RandIt first, RandIt last, Predicate pred) {
+        return last == std::find_if(std::forward<ExecPolicy>(policy), first, last, pred);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access.
+     * See std::any_of https://en.cppreference.com/w/cpp/algorithm/any_of
+     */
+    template <class ExecPolicy, typename RandIt, typename Predicate>
+    poolstl::internal::enable_if_par<ExecPolicy, bool>
+    any_of(ExecPolicy&& policy, RandIt first, RandIt last, Predicate pred) {
+        return !std::none_of(std::forward<ExecPolicy>(policy), first, last, pred);
+    }
+}
+
+namespace poolstl {
+
+    template <class RandIt, class ChunkConstructor, class UnaryFunction>
+    void for_each_chunk(RandIt first, RandIt last, ChunkConstructor construct, UnaryFunction f) {
+        if (first == last) {
+            return;
+        }
+
+        auto chunk_data = construct();
+        for (; first != last; ++first) {
+            f(*first, chunk_data);
+        }
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access.
+     *
+     * Like `std::for_each`, but exposes the chunking. The `construct` method is called once per parallel chunk and
+     * its output is passed to `f`.
+     *
+     * Useful for cases where an expensive workspace can be shared between loop iterations
+     * but cannot be shared by all parallel iterations.
+     */
+    template <class ExecPolicy, class RandIt, class ChunkConstructor, class UnaryFunction>
+    poolstl::internal::enable_if_par<ExecPolicy, void>
+    for_each_chunk(ExecPolicy&& policy, RandIt first, RandIt last, ChunkConstructor construct, UnaryFunction f) {
+        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first, last,
+                                                             [&construct, &f](RandIt chunk_first, RandIt chunk_last) {
+                                                                 for_each_chunk(chunk_first, chunk_last, construct, f);
+                                                             });
+        poolstl::internal::get_futures(futures);
+    }
+}
+
+#endif
+
+#ifndef POOLSTL_NUMERIC_HPP
+#define POOLSTL_NUMERIC_HPP
+
+#include <tuple>
+
+
+namespace std {
+
+#if POOLSTL_HAVE_CXX17_LIB
+    /**
+     * NOTE: Iterators are expected to be random access. Chunk totals are seeded with `T{}` (see Pass 1), so `T{}`
+     * must be an identity of `binop`. See https://en.cppreference.com/w/cpp/algorithm/exclusive_scan
+     */
+    template <class ExecPolicy, class RandIt1, class RandIt2, class T, class BinaryOp>
+    poolstl::internal::enable_if_par<ExecPolicy, RandIt2>
+    exclusive_scan(ExecPolicy &&policy, RandIt1 first, RandIt1 last, RandIt2 dest, T init, BinaryOp binop) {
+        if (first == last) {
+            return dest;
+        }
+
+        // Pass 1: Chunk the input and find the sum of each chunk
+        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first, last,
+                             [binop](RandIt1 chunk_first, RandIt1 chunk_last) {
+                                 auto sum = std::accumulate(chunk_first, chunk_last, T{}, binop);
+                                 return std::make_tuple(std::make_pair(chunk_first, chunk_last), sum);
+                             });
+
+        std::vector<std::pair<RandIt1, RandIt1>> ranges;
+        std::vector<T> sums;
+
+        for (auto& future : futures) {
+            auto res = future.get();
+            ranges.push_back(std::get<0>(res));
+            sums.push_back(std::get<1>(res));
+        }
+
+        // turn the chunk totals into each chunk's starting value; `init` enters the result exactly once, here
+        std::exclusive_scan(sums.begin(), sums.end(), sums.begin(), init, binop);
+
+        // Pass 2: perform exclusive scan of each chunk, using the sum of previous chunks as init
+        std::vector<std::tuple<RandIt1, RandIt1, RandIt2, T>> args;
+        for (std::size_t i = 0; i < sums.size(); ++i) {
+            auto chunk_first = std::get<0>(ranges[i]);
+            args.emplace_back(std::make_tuple(
+                chunk_first, std::get<1>(ranges[i]),
+                dest + (chunk_first - first),
+                sums[i]));
+        }
+
+        auto futures2 = poolstl::internal::parallel_apply(std::forward<ExecPolicy>(policy),
+            [binop](RandIt1 chunk_first, RandIt1 chunk_last, RandIt2 chunk_dest, T chunk_init){
+                std::exclusive_scan(chunk_first, chunk_last, chunk_dest, chunk_init, binop);
+            }, args);
+
+        poolstl::internal::get_futures(futures2);
+        return dest + (last - first);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access.
+     * See std::exclusive_scan https://en.cppreference.com/w/cpp/algorithm/exclusive_scan
+     */
+    template <class ExecPolicy, class RandIt1, class RandIt2, class T>
+    poolstl::internal::enable_if_par<ExecPolicy, RandIt2>
+    exclusive_scan(ExecPolicy &&policy, RandIt1 first, RandIt1 last, RandIt2 dest, T init) {
+        return std::exclusive_scan(std::forward<ExecPolicy>(policy), first, last, dest, init, std::plus<T>());
+    }
+#endif
+
+    /**
+     * NOTE: Iterators are expected to be random access. `init` seeds every chunk and the final combine alike, so it
+     * should be an identity of `binop`. See std::reduce https://en.cppreference.com/w/cpp/algorithm/reduce
+     */
+    template <class ExecPolicy, class RandIt, class T, class BinaryOp>
+    poolstl::internal::enable_if_par<ExecPolicy, T>
+    reduce(ExecPolicy &&policy, RandIt first, RandIt last, T init, BinaryOp binop) {
+        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first, last,
+                                  [init, binop](RandIt chunk_first, RandIt chunk_last) {
+                                      return poolstl::internal::cpp17::reduce(chunk_first, chunk_last, init, binop);
+                                  });
+
+        return poolstl::internal::cpp17::reduce(
+            poolstl::internal::get_wrap(futures.begin()),
+            poolstl::internal::get_wrap(futures.end()), init, binop);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access.
+     * See std::reduce https://en.cppreference.com/w/cpp/algorithm/reduce
+     */
+    template <class ExecPolicy, class RandIt, class T>
+    poolstl::internal::enable_if_par<ExecPolicy, T>
+    reduce(ExecPolicy &&policy, RandIt first, RandIt last, T init) {
+        return std::reduce(std::forward<ExecPolicy>(policy), first, last, init, std::plus<T>());
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access.
+     * See std::reduce https://en.cppreference.com/w/cpp/algorithm/reduce
+     */
+    template <class ExecPolicy, class RandIt>
+    poolstl::internal::enable_if_par<
+        ExecPolicy, typename std::iterator_traits<RandIt>::value_type>
+    reduce(ExecPolicy &&policy, RandIt first, RandIt last) {
+        return std::reduce(std::forward<ExecPolicy>(policy), first, last,
+                           typename std::iterator_traits<RandIt>::value_type{});
+    }
+
+#if POOLSTL_HAVE_CXX17_LIB
+    /**
+     * NOTE: Iterators are expected to be random access. `init` seeds every chunk and the final combine alike, so it
+     * should be an identity of `reduce_op`. See https://en.cppreference.com/w/cpp/algorithm/transform_reduce
+     */
+    template <class ExecPolicy, class RandIt1, class T, class BinaryReductionOp, class UnaryTransformOp>
+    poolstl::internal::enable_if_par<ExecPolicy, T>
+    transform_reduce(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1, T init,
+                     BinaryReductionOp reduce_op, UnaryTransformOp transform_op) {
+
+        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first1, last1,
+             [&init, &reduce_op, &transform_op](RandIt1 chunk_first1, RandIt1 chunk_last1) {
+                 return std::transform_reduce(chunk_first1, chunk_last1, init, reduce_op, transform_op);
+             });
+
+        return poolstl::internal::cpp17::reduce(
+            poolstl::internal::get_wrap(futures.begin()),
+            poolstl::internal::get_wrap(futures.end()), init, reduce_op);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access. `init` seeds every chunk and the final combine alike, so it
+     * should be an identity of `reduce_op`. See https://en.cppreference.com/w/cpp/algorithm/transform_reduce
+     */
+    template <class ExecPolicy, class RandIt1, class RandIt2, class T, class BinaryReductionOp, class BinaryTransformOp>
+    poolstl::internal::enable_if_par<ExecPolicy, T>
+    transform_reduce(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, T init,
+                     BinaryReductionOp reduce_op, BinaryTransformOp transform_op) {
+
+        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first1, last1, first2,
+             [&init, &reduce_op, &transform_op](RandIt1 chunk_first1, RandIt1 chunk_last1, RandIt2 chunk_first2) {
+                 return std::transform_reduce(chunk_first1, chunk_last1, chunk_first2, init, reduce_op, transform_op);
+             });
+
+        return poolstl::internal::cpp17::reduce(
+            poolstl::internal::get_wrap(futures.begin()),
+            poolstl::internal::get_wrap(futures.end()), init, reduce_op);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access.
+     * See std::transform_reduce https://en.cppreference.com/w/cpp/algorithm/transform_reduce
+     */
+    template< class ExecPolicy, class RandIt1, class RandIt2, class T >
+    poolstl::internal::enable_if_par<ExecPolicy, T>
+    transform_reduce(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, T init ) {
+        return transform_reduce(std::forward<ExecPolicy>(policy),
+            first1, last1, first2, init, std::plus<>(), std::multiplies<>());
+    }
+#endif
+
+}
+
+#endif
+
+#ifndef POOLSTL_SEQ_FWD_HPP
+#define POOLSTL_SEQ_FWD_HPP
+
+
+/*
+ * Forward poolstl::seq to the native sequential (no policy) method.
+ */
+
+#define POOLSTL_DEFINE_SEQ_FWD(NS, FNAME)                                                                   \
+    template<class EP, typename...ARGS>                                                                     \
+    auto FNAME(EP&&, ARGS&&...args) ->                                                                      \
+                poolstl::internal::enable_if_seq<EP, decltype(NS::FNAME(std::forward<ARGS>(args)...))> {    \
+        return NS::FNAME(std::forward<ARGS>(args)...);                                                      \
+    }
+
+#define POOLSTL_DEFINE_SEQ_FWD_VOID(NS, FNAME)                                   \
+    template<class EP, typename...ARGS>                                          \
+    poolstl::internal::enable_if_seq<EP, void> FNAME(EP&&, ARGS&&... args) {     \
+        NS::FNAME(std::forward<ARGS>(args)...);                                  \
+    }
+
+#if POOLSTL_HAVE_CXX17
+
+/*
+ * Dynamically choose policy from a std::variant.
+ * Useful to choose between parallel and sequential policies at runtime via par_if.
+ */
+
+#define POOLSTL_DEFINE_PAR_IF_FWD_VOID(NS, FNAME)                                                         \
+    template<class EP, typename...ARGS>                                                                   \
+    poolstl::internal::enable_if_poolstl_variant<EP, void> FNAME(EP&& policy, ARGS&&...args) {            \
+        std::visit([&](auto&& pol) { NS::FNAME(pol, std::forward<ARGS>(args)...); }, policy.var);         \
+    }
+
+#define POOLSTL_DEFINE_PAR_IF_FWD(NS, FNAME)                                                                          \
+    template<class EP, typename...ARGS>                                                                               \
+    auto FNAME(EP&& policy, ARGS&&...args) ->                                                                         \
+                poolstl::internal::enable_if_poolstl_variant<EP, decltype(NS::FNAME(std::forward<ARGS>(args)...))> {  \
+        return std::visit([&](auto&& pol) { return NS::FNAME(pol, std::forward<ARGS>(args)...); }, policy.var);       \
+    }
+
+#else
+#define POOLSTL_DEFINE_PAR_IF_FWD_VOID(NS, FNAME)
+#define POOLSTL_DEFINE_PAR_IF_FWD(NS, FNAME)
+#endif
+/*
+ * Define both the sequential forward and dynamic chooser.
+ */
+#define POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(NS, FNAME)        \
+                    POOLSTL_DEFINE_SEQ_FWD(NS, FNAME)            \
+                    POOLSTL_DEFINE_PAR_IF_FWD(NS, FNAME)
+
+#define POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF_VOID(NS, FNAME)   \
+                    POOLSTL_DEFINE_SEQ_FWD_VOID(NS, FNAME)       \
+                    POOLSTL_DEFINE_PAR_IF_FWD_VOID(NS, FNAME)
+
+namespace std {
+    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, all_of)
+    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, any_of)
+    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, none_of)
+
+    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, count)
+    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, count_if)
+
+    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, copy)
+    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, copy_n)
+
+    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF_VOID(std, fill)
+    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, fill_n)
+
+    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, find)
+    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, find_if)
+    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, find_if_not)
+
+    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF_VOID(std, for_each)
+#if POOLSTL_HAVE_CXX17_LIB
+    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, for_each_n)
+#endif
+
+    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, transform)
+
+#if POOLSTL_HAVE_CXX17_LIB
+    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, exclusive_scan)
+    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, reduce)
+    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, transform_reduce)
+#endif
+}
+
+namespace poolstl {
+    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF_VOID(poolstl, for_each_chunk)
+}
+
+#endif
+
+// Note that iota_iter.hpp is self-contained in its own right.
+
+#ifndef POOLSTL_IOTA_ITER_HPP
+#define POOLSTL_IOTA_ITER_HPP
+
+#include <cstddef>
+#include <iterator>
+
+namespace poolstl {
+
+    /**
+     * An iterator over the integers.
+     *
+     * Effectively a view on a fictional vector populated by std::iota, but without materializing anything.
+     *
+     * Useful to parallelize loops that are not over a container, like this:
+     *
+     * \code{.cpp}
+     * for (int i = 0; i < 10; ++i) {
+     * }
+     *\endcode
+     *
+     * Becomes:
+     * \code{.cpp}
+     * std::for_each(poolstl::par, iota_iter<int>(0), iota_iter<int>(10), [](int i) {
+     * });
+     * \endcode
+     *
+     * @tparam T A type that acts as an integer.
+     */
+    template<typename T>
+    class iota_iter {
+    public:
+        using value_type = T;
+        using difference_type = std::ptrdiff_t;
+        using pointer = T *;
+        using reference = T;
+        using iterator_category = std::random_access_iterator_tag;
+
+        iota_iter() : value{} {}
+        explicit iota_iter(T rhs) : value(rhs) {}
+        iota_iter(const iota_iter<T> &rhs) : value(rhs.value) {}
+
+        iota_iter<T> &operator=(T rhs) { value = rhs; return *this; }
+        iota_iter<T> &operator=(const iota_iter &rhs) { value = rhs.value; return *this; }
+
+        reference operator*() const { return value; }
+        reference operator[](difference_type rhs) const { return value + rhs; }
+        // operator-> has no meaning in this application
+
+        bool operator==(const iota_iter<T> &rhs) const { return value == rhs.value; }
+        bool operator!=(const iota_iter<T> &rhs) const { return value != rhs.value; }
+        bool operator<(const iota_iter<T> &rhs) const { return value < rhs.value; }
+        bool operator>(const iota_iter<T> &rhs) const { return value > rhs.value; }
+        bool operator<=(const iota_iter<T> &rhs) const { return value <= rhs.value; }
+        bool operator>=(const iota_iter<T> &rhs) const { return value >= rhs.value; }
+
+        iota_iter<T> &operator+=(difference_type rhs) { value += rhs; return *this; }
+        iota_iter<T> &operator-=(difference_type rhs) { value -= rhs; return *this; }
+
+        iota_iter<T> &operator++() { ++value; return *this; }
+        iota_iter<T> &operator--() { --value; return *this; }
+        iota_iter<T> operator++(int) { iota_iter<T> ret(value); ++value; return ret; }
+        iota_iter<T> operator--(int) { iota_iter<T> ret(value); --value; return ret; }
+        // Subtract in difference_type: an unsigned T narrower than ptrdiff_t must not wrap when *this < rhs.
+        difference_type operator-(const iota_iter<T> &rhs) const {
+            return static_cast<difference_type>(value) - static_cast<difference_type>(rhs.value); }
+        iota_iter<T> operator-(difference_type rhs) const { return iota_iter(value - rhs); }
+        iota_iter<T> operator+(difference_type rhs) const { return iota_iter(value + rhs); }
+        friend inline iota_iter<T> operator+(difference_type lhs, const iota_iter<T> &rhs) {
+            return iota_iter(lhs + rhs.value);
+        }
+
+    protected:
+        T value;
+    };
+}
+
+namespace std {
+    /**
+     * Specialize std::iterator_traits for poolstl::iota_iter.
+     */
+    template <typename T>
+    struct iterator_traits<poolstl::iota_iter<T>> {
+        using value_type =        typename poolstl::iota_iter<T>::value_type;
+        using difference_type =   typename poolstl::iota_iter<T>::difference_type;
+        using pointer =           typename poolstl::iota_iter<T>::pointer;
+        using reference =         typename poolstl::iota_iter<T>::reference;
+        using iterator_category = typename poolstl::iota_iter<T>::iterator_category;
+    };
+}
+
+#endif
+
+/*
+ * Optionally alias `poolstl::par` as `std::execution::par` to enable poolSTL to fill in for missing compiler support.
+ *
+ * USE AT YOUR OWN RISK!
+ *
+ * To use this define POOLSTL_STD_SUPPLEMENT=1 before including poolstl.hpp.
+ *
+ * Attempts to autodetect native support by checking for <execution>, including it if it exists, and then checking for
+ * the __cpp_lib_parallel_algorithm feature macro.
+ *
+ * If native support is not found then the standard execution policies are declared as forwards to poolSTL.
+ *
+ * GCC and Clang: TBB is required if <execution> is #included. If you'd like to use the poolSTL supplement in cases
+ * that TBB is not available, have your build system define POOLSTL_STD_SUPPLEMENT_NO_INCLUDE if TBB is not found.
+ * PoolSTL will then not include <execution> and the supplement will kick in.
+ * Your code must not #include <execution>.
+ *
+ * MinGW: the compiler declares support, but actual performance is sequential (see poolSTL benchmark). To use
+ * the supplement anyway define POOLSTL_STD_SUPPLEMENT_FORCE to override the autodetection.
+ * Your code must not #include <execution>.
+ *
+ * Define POOLSTL_ALLOW_SUPPLEMENT=0 to override POOLSTL_STD_SUPPLEMENT and disable this feature.
+ */
+#ifndef POOLSTL_ALLOW_SUPPLEMENT
+#define POOLSTL_ALLOW_SUPPLEMENT 1
+#endif
+
+#if POOLSTL_ALLOW_SUPPLEMENT && defined(POOLSTL_STD_SUPPLEMENT)
+
+#if __cplusplus >= 201603L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201603L)
+#if __has_include(<execution>) && !defined(POOLSTL_STD_SUPPLEMENT_NO_INCLUDE)
+// Include the native header when it exists, as documented above, before the feature macro is tested below.
+#include <execution>
+#endif
+#endif
+
+#if !defined(__cpp_lib_parallel_algorithm) || defined(POOLSTL_STD_SUPPLEMENT_FORCE)
+namespace std {
+    namespace execution {
+        using ::poolstl::execution::sequenced_policy;
+        using ::poolstl::execution::seq;
+        using ::poolstl::execution::parallel_policy;
+        using ::poolstl::execution::par;
+        using parallel_unsequenced_policy = ::poolstl::execution::parallel_policy;
+        constexpr parallel_unsequenced_policy par_unseq{};
+    }
+}
+
+#endif
+#endif
+
+#endif
diff --git a/src/index.cpp b/src/index.cpp
index 7773e509..9b907257 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -11,6 +11,7 @@
 #include <cassert>
 #include <algorithm>
 #include "pdqsort/pdqsort.h"
+#include "poolstl/poolstl.hpp"
 #include <iostream>
 #include <thread>
 #include <atomic>
@@ -138,7 +139,7 @@ int StrobemerIndex::pick_bits(size_t size) const {
     return std::clamp(static_cast<int>(log2(estimated_number_of_randstrobes)) - 1, 8, 31);
 }
 
-void StrobemerIndex::populate(float f, size_t n_threads) {
+void StrobemerIndex::populate(float f, unsigned n_threads) {
     Timer count_hash;
     auto randstrobe_counts = count_all_randstrobes(references, parameters, n_threads);
     stats.elapsed_counting_hashes = count_hash.duration();
@@ -164,8 +165,13 @@ void StrobemerIndex::populate(float f, size_t n_threads) {
 
     Timer sorting_timer;
     logger.debug() << "  Sorting ...\n";
-    // sort by hash values
-    pdqsort_branchless(randstrobes.begin(), randstrobes.end());
+    if (true) {
+        task_thread_pool::task_thread_pool pool{n_threads};
+        std::sort(poolstl::par.on(pool), randstrobes.begin(), randstrobes.end());
+    } else {
+        // sort by hash values
+        pdqsort_branchless(randstrobes.begin(), randstrobes.end());
+    }
     stats.elapsed_sorting_seeds = sorting_timer.duration();
 
     Timer hash_index_timer;
diff --git a/src/index.hpp b/src/index.hpp
index 941db7de..a6b9f003 100644
--- a/src/index.hpp
+++ b/src/index.hpp
@@ -51,7 +51,7 @@ struct StrobemerIndex {
 
     void write(const std::string& filename) const;
     void read(const std::string& filename);
-    void populate(float f, size_t n_threads);
+    void populate(float f, unsigned n_threads);
     void print_diagnostics(const std::string& logfile_name, int k) const;
     int pick_bits(size_t size) const;
     size_t find(randstrobe_hash_t key) const {

From 2012420a4741685712baa49990065c707ef948ed Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Sun, 14 Jan 2024 16:00:48 +0100
Subject: [PATCH 11/32] Bump poolSTL and use poolstl::pluggable_sort

---
 ext/README.md           |    6 +
 ext/poolstl/poolstl.hpp | 1476 +++++++++++++++++++++++++++++++--------
 src/index.cpp           |    9 +-
 3 files changed, 1179 insertions(+), 312 deletions(-)

diff --git a/ext/README.md b/ext/README.md
index e8316d26..d80e5a2d 100644
--- a/ext/README.md
+++ b/ext/README.md
@@ -27,6 +27,12 @@ Homepage: https://github.com/orlp/pdqsort
 Commit used: b1ef26a55cdb60d236a5cb199c4234c704f46726
 License: See pdqsort/license.txt
 
+## poolstl
+
+Homepage: https://github.com/alugowski/poolSTL/
+Downloaded file: https://github.com/alugowski/poolSTL/releases/download/v0.3.3/poolstl.hpp
+Version: 0.3.3
+License: See poolstl.hpp
 
 ## robin_hood
 
diff --git a/ext/poolstl/poolstl.hpp b/ext/poolstl/poolstl.hpp
index ea79146e..d1340a1e 100644
--- a/ext/poolstl/poolstl.hpp
+++ b/ext/poolstl/poolstl.hpp
@@ -84,6 +84,7 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
+
 #ifndef POOLSTL_HPP
 #define POOLSTL_HPP
 
@@ -93,6 +94,767 @@
 
 #include <memory>
 #include <mutex>
+#include <stdexcept>
+#include <type_traits>
+
+
+#ifndef AL_TASK_THREAD_POOL_HPP
+#define AL_TASK_THREAD_POOL_HPP
+
+// Version macros.
+#define TASK_THREAD_POOL_VERSION_MAJOR 1
+#define TASK_THREAD_POOL_VERSION_MINOR 0
+#define TASK_THREAD_POOL_VERSION_PATCH 10
+
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <type_traits>
+
+// MSVC does not correctly set the __cplusplus macro by default, so we must read it from _MSVC_LANG
+// See https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/
+#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#define TTP_CXX17 1
+#else
+#define TTP_CXX17 0
+#endif
+
+#if TTP_CXX17
+#define TTP_NODISCARD [[nodiscard]]
+#else
+#define TTP_NODISCARD
+#endif
+
+namespace task_thread_pool {
+
+#if !TTP_CXX17
+    /**
+     * A reimplementation of std::decay_t, which is only available since C++14.
+     */
+    template <class T>
+    using decay_t = typename std::decay<T>::type;
+#endif
+
+    /**
+     * A fast and lightweight thread pool that uses C++11 threads.
+     */
+    class task_thread_pool {
+    public:
+        /**
+         * Create a task_thread_pool and start worker threads.
+         *
+         * @param num_threads Number of worker threads. If 0 then number of threads is equal to the number of
+         *                    hardware threads reported by std::thread::hardware_concurrency(), or 1 if that is 0.
+         */
+        explicit task_thread_pool(unsigned int num_threads = 0) {
+            if (num_threads < 1) {
+                num_threads = std::thread::hardware_concurrency();
+                if (num_threads < 1) { num_threads = 1; }
+            }
+            start_threads(num_threads);
+        }
+
+        /**
+         * Finish all tasks left in the queue then shut down worker threads.
+         * If the pool is currently paused then it is resumed.
+         */
+        ~task_thread_pool() {
+            unpause();
+            wait_for_queued_tasks();
+            stop_all_threads();
+        }
+
+        /**
+         * Drop all tasks that have been submitted but not yet started by a worker.
+         *
+         * Tasks already in progress continue executing.
+         */
+        void clear_task_queue() {
+            const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+            tasks = {};
+        }
+
+        /**
+         * Get number of enqueued tasks.
+         *
+         * @return Number of tasks that have been enqueued but not yet started.
+         */
+        TTP_NODISCARD size_t get_num_queued_tasks() const {
+            const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+            return tasks.size();
+        }
+
+        /**
+         * Get number of in-progress tasks.
+         *
+         * @return Approximate number of tasks currently being processed by worker threads.
+         */
+        TTP_NODISCARD size_t get_num_running_tasks() const {
+            const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+            return num_inflight_tasks;
+        }
+
+        /**
+         * Get total number of tasks in the pool.
+         *
+         * @return Approximate number of tasks both enqueued and running.
+         */
+        TTP_NODISCARD size_t get_num_tasks() const {
+            const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+            return tasks.size() + num_inflight_tasks;
+        }
+
+        /**
+         * Get number of worker threads.
+         *
+         * @return Number of worker threads.
+         */
+        TTP_NODISCARD unsigned int get_num_threads() const {
+            const std::lock_guard<std::recursive_mutex> threads_lock(thread_mutex);
+            return static_cast<unsigned int>(threads.size());
+        }
+
+        /**
+         * Set number of worker threads. Will start or stop worker threads as necessary.
+         *
+         * @param num_threads Number of worker threads. If 0 then number of threads is equal to the number of
+         *                    hardware threads reported by std::thread::hardware_concurrency(), or 1 if that is 0.
+         * @return Previous number of worker threads.
+         */
+        unsigned int set_num_threads(unsigned int num_threads) {
+            const std::lock_guard<std::recursive_mutex> threads_lock(thread_mutex);
+            unsigned int previous_num_threads = get_num_threads();
+
+            if (num_threads < 1) {
+                num_threads = std::thread::hardware_concurrency();
+                if (num_threads < 1) { num_threads = 1; }
+            }
+
+            if (previous_num_threads <= num_threads) {
+                // expanding the thread pool
+                start_threads(num_threads - previous_num_threads);
+            } else {
+                // contracting the thread pool
+                stop_all_threads();
+                {
+                    const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+                    pool_running = true;
+                }
+                start_threads(num_threads);
+            }
+
+            return previous_num_threads;
+        }
+
+        /**
+         * Stop executing queued tasks. Use `unpause()` to resume. Note: Destroying the pool will implicitly unpause.
+         *
+         * Any in-progress tasks continue executing.
+         */
+        void pause() {
+            const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+            pool_paused = true;
+        }
+
+        /**
+         * Resume executing queued tasks.
+         */
+        void unpause() {
+            const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+            pool_paused = false;
+            task_cv.notify_all();
+        }
+
+        /**
+         * Check whether the pool is paused.
+         *
+         * @return true if pause() has been called without an intervening unpause().
+         */
+        TTP_NODISCARD bool is_paused() const {
+            const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+            return pool_paused;
+        }
+
+        /**
+         * Submit a Callable for the pool to execute and return a std::future.
+         *
+         * @param func The Callable to execute. Can be a function, a lambda, std::packaged_task, std::function, etc.
+         * @param args Arguments for func. Optional.
+         * @return std::future that can be used to get func's return value or thrown exception.
+         */
+        template <typename F, typename... A,
+#if TTP_CXX17
+            typename R = std::invoke_result_t<std::decay_t<F>, std::decay_t<A>...>
+#else
+            typename R = typename std::result_of<decay_t<F>(decay_t<A>...)>::type
+#endif
+        >
+        TTP_NODISCARD std::future<R> submit(F&& func, A&&... args) {
+#if defined(_MSC_VER)
+            // MSVC's packaged_task is not movable even though it should be.
+            // Discussion about this bug and its future fix:
+            // https://developercommunity.visualstudio.com/t/unable-to-move-stdpackaged-task-into-any-stl-conta/108672
+            std::shared_ptr<std::packaged_task<R()>> ptask =
+                std::make_shared<std::packaged_task<R()>>(std::bind(std::forward<F>(func), std::forward<A>(args)...));
+            submit_detach([ptask] { (*ptask)(); });
+            return ptask->get_future();
+#else
+            std::packaged_task<R()> task(std::bind(std::forward<F>(func), std::forward<A>(args)...));
+            auto ret = task.get_future();
+            submit_detach(std::move(task));
+            return ret;
+#endif
+        }
+
+        /**
+         * Submit a zero-argument Callable for the pool to execute.
+         *
+         * @param func The Callable to execute. Can be a function, a lambda, std::packaged_task, std::function, etc.
+         */
+        template <typename F>
+        void submit_detach(F&& func) {
+            const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+            tasks.emplace(std::forward<F>(func));
+            task_cv.notify_one();
+        }
+
+        /**
+         * Submit a Callable with arguments for the pool to execute.
+         *
+         * @param func The Callable to execute. Can be a function, a lambda, std::packaged_task, std::function, etc.
+         */
+        template <typename F, typename... A>
+        void submit_detach(F&& func, A&&... args) {
+            const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+            tasks.emplace(std::bind(std::forward<F>(func), std::forward<A>(args)...));
+            task_cv.notify_one();
+        }
+
+        /**
+         * Block until the task queue is empty. Some tasks may be in-progress when this method returns.
+         */
+        void wait_for_queued_tasks() {
+            std::unique_lock<std::mutex> tasks_lock(task_mutex);
+            notify_task_finish = true;
+            task_finished_cv.wait(tasks_lock, [&] { return tasks.empty(); });
+            notify_task_finish = false;
+        }
+
+        /**
+         * Block until all tasks have finished.
+         */
+        void wait_for_tasks() {
+            std::unique_lock<std::mutex> tasks_lock(task_mutex);
+            notify_task_finish = true;
+            task_finished_cv.wait(tasks_lock, [&] { return tasks.empty() && num_inflight_tasks == 0; });
+            notify_task_finish = false;
+        }
+
+    protected:
+
+        /**
+         * Main function for worker threads.
+         */
+        void worker_main() {
+            bool finished_task = false;
+
+            while (true) {
+                std::unique_lock<std::mutex> tasks_lock(task_mutex);
+
+                if (finished_task) {
+                    --num_inflight_tasks;
+                    if (notify_task_finish) {
+                        task_finished_cv.notify_all();
+                    }
+                }
+
+                task_cv.wait(tasks_lock, [&]() { return !pool_running || (!pool_paused && !tasks.empty()); });
+
+                if (!pool_running) {
+                    break;
+                }
+
+                // Must mean that (!pool_paused && !tasks.empty()) is true
+
+                std::packaged_task<void()> task{std::move(tasks.front())};
+                tasks.pop();
+                ++num_inflight_tasks;
+                tasks_lock.unlock();
+
+                try {
+                    task();
+                } catch (...) {
+                    // std::packaged_task::operator() may throw in some error conditions, such as if the task
+                    // had already been run. Nothing that the pool can do anything about.
+                }
+
+                finished_task = true;
+            }
+        }
+
+        /**
+         * Start worker threads.
+         *
+         * @param num_threads How many threads to start.
+         */
+        void start_threads(const unsigned int num_threads) {
+            const std::lock_guard<std::recursive_mutex> threads_lock(thread_mutex);
+
+            for (unsigned int i = 0; i < num_threads; ++i) {
+                threads.emplace_back(&task_thread_pool::worker_main, this);
+            }
+        }
+
+        /**
+         * Stop, join, and destroy all worker threads.
+         */
+        void stop_all_threads() {
+            const std::lock_guard<std::recursive_mutex> threads_lock(thread_mutex);
+
+            {
+                const std::lock_guard<std::mutex> tasks_lock(task_mutex);
+                pool_running = false;
+                task_cv.notify_all();
+            }
+
+            for (auto& thread : threads) {
+                if (thread.joinable()) {
+                    thread.join();
+                }
+            }
+            threads.clear();
+        }
+
+        /**
+         * The worker threads.
+         *
+         * Access protected by thread_mutex
+         */
+        std::vector<std::thread> threads;
+
+        /**
+         * A mutex for methods that start/stop threads.
+         */
+        mutable std::recursive_mutex thread_mutex;
+
+        /**
+         * The task queue.
+         *
+         * Access protected by task_mutex.
+         */
+        std::queue<std::packaged_task<void()>> tasks = {};
+
+        /**
+         * A mutex for all variables related to tasks.
+         */
+        mutable std::mutex task_mutex;
+
+        /**
+         * Used to notify changes to the task queue, such as a new task added, pause/unpause, etc.
+         */
+        std::condition_variable task_cv;
+
+        /**
+         * Used to notify of finished tasks.
+         */
+        std::condition_variable task_finished_cv;
+
+        /**
+         * A signal for worker threads that the pool is either running or shutting down.
+         *
+         * Access protected by task_mutex.
+         */
+        bool pool_running = true;
+
+        /**
+         * A signal for worker threads to not pull new tasks from the queue.
+         *
+         * Access protected by task_mutex.
+         */
+        bool pool_paused = false;
+
+        /**
+         * A signal for worker threads that they should notify task_finished_cv when they finish a task.
+         *
+         * Access protected by task_mutex.
+         */
+        bool notify_task_finish = false;
+
+        /**
+         * A counter of the number of tasks in-progress by worker threads.
+         * Incremented when a task is popped off the task queue and decremented when that task is complete.
+         *
+         * Access protected by task_mutex.
+         */
+        int num_inflight_tasks = 0;
+    };
+}
+
+// clean up
+#undef TTP_NODISCARD
+#undef TTP_CXX17
+
+#endif
+
+#ifndef POOLSTL_INTERNAL_UTILS_HPP
+#define POOLSTL_INTERNAL_UTILS_HPP
+
+// Version macros.
+#define POOLSTL_VERSION_MAJOR 0
+#define POOLSTL_VERSION_MINOR 3
+#define POOLSTL_VERSION_PATCH 3
+
+#include <cstddef>
+#include <iterator>
+
+#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#define POOLSTL_HAVE_CXX17 1
+#define POOLSTL_NO_DISCARD [[nodiscard]]
+#else
+#define POOLSTL_HAVE_CXX17 0
+#define POOLSTL_NO_DISCARD
+#endif
+
+#if POOLSTL_HAVE_CXX17 && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 9)
+#define POOLSTL_HAVE_CXX17_LIB 1
+#else
+#define POOLSTL_HAVE_CXX17_LIB 0
+#endif
+
+#if __cplusplus >= 201402L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L)
+#define POOLSTL_HAVE_CXX14 1
+#else
+#define POOLSTL_HAVE_CXX14 0
+#endif
+
+namespace poolstl {
+    namespace internal {
+        // Ceiling of num_steps / num_threads. A thread count of 0 is treated as 1 so the division is always defined.
+        inline constexpr std::size_t get_chunk_size(std::size_t num_steps, unsigned int num_threads) {
+            return (num_threads == 0) ? num_steps : (num_steps / num_threads) + ((num_steps % num_threads) > 0 ? 1 : 0);
+        }
+
+        template<typename Iterator>
+        constexpr typename std::iterator_traits<Iterator>::difference_type
+        get_chunk_size(Iterator first, Iterator last, unsigned int num_threads) {
+            using diff_t = typename std::iterator_traits<Iterator>::difference_type;
+            return static_cast<diff_t>(get_chunk_size((std::size_t)std::distance(first, last), num_threads));
+        }
+
+        template<typename Iterator>
+        constexpr typename std::iterator_traits<Iterator>::difference_type
+        get_iter_chunk_size(const Iterator& iter, const Iterator& last,
+                            typename std::iterator_traits<Iterator>::difference_type chunk_size) {
+            return std::min(chunk_size, std::distance(iter, last));
+        }
+
+        template<typename Iterator>
+        Iterator advanced(Iterator iter, typename std::iterator_traits<Iterator>::difference_type offset) {
+            Iterator ret = iter;
+            std::advance(ret, offset);
+            return ret;
+        }
+
+        /**
+         * An iterator wrapper that calls std::future<>::get().
+         * @tparam Iterator Iterator over objects that have a get() method, such as std::future.
+         */
+        template<typename Iterator>
+        class getting_iter : public Iterator {
+        public:
+            using value_type = decltype((*std::declval<Iterator>()).get());
+            using difference_type = typename std::iterator_traits<Iterator>::difference_type;
+            using pointer = value_type*;
+            using reference = value_type&;
+            explicit getting_iter(Iterator iter) : iter(iter) {}
+
+            getting_iter operator++() { ++iter; return *this; }
+            getting_iter operator++(int) { getting_iter ret(*this); ++iter; return ret; }
+
+            value_type operator*() { return (*iter).get(); }
+            value_type operator[](difference_type offset) { return iter[offset].get(); }
+
+            bool operator==(const getting_iter<Iterator> &other) const { return iter == other.iter; }
+            bool operator!=(const getting_iter<Iterator> &other) const { return iter != other.iter; }
+
+        protected:
+            Iterator iter;
+        };
+
+        template<typename Iterator>
+        getting_iter<Iterator> get_wrap(Iterator iter) {
+            return getting_iter<Iterator>(iter);
+        }
+
+        template <class Container>
+        void get_futures(Container& futures) {
+            for (auto &future: futures) {
+                future.get();
+            }
+        }
+
+        /*
+         * Some methods are only available with C++17 and up. Reimplement on older standards.
+         */
+#if POOLSTL_HAVE_CXX17_LIB
+        namespace cpp17 = std;
+#else
+        namespace cpp17 {
+
+            // std::reduce
+
+            template<class InputIt, class Tp, class BinOp>
+            Tp reduce(InputIt first, InputIt last, Tp init, BinOp b) {
+                for (; first != last; ++first)
+                    init = b(init, *first);
+                return init;
+            }
+
+            template<class InputIt>
+            typename std::iterator_traits<InputIt>::value_type reduce(InputIt first, InputIt last) {
+                return reduce(first, last,
+                              typename std::iterator_traits<InputIt>::value_type{},
+                              std::plus<typename std::iterator_traits<InputIt>::value_type>());
+            }
+
+            // std::transform
+
+            template<class InputIt, class OutputIt, class UnaryOperation>
+            OutputIt transform(InputIt first1, InputIt last1, OutputIt d_first,
+                               UnaryOperation unary_op) {
+                while (first1 != last1) {
+                    *d_first++ = unary_op(*first1++);
+                }
+
+                return d_first;
+            }
+
+            template<class InputIt1, class InputIt2, class OutputIt, class BinaryOperation>
+            OutputIt transform(InputIt1 first1, InputIt1 last1,
+                               InputIt2 first2, OutputIt d_first,
+                               BinaryOperation binary_op) {
+                while (first1 != last1) {
+                    *d_first++ = binary_op(*first1++, *first2++);
+                }
+
+                return d_first;
+            }
+        }
+#endif
+    }
+}
+
+#endif
+
+namespace poolstl {
+
+    namespace ttp = task_thread_pool;
+
+    namespace execution {
+        namespace internal {
+            /**
+             * Holds the thread pool used by par.
+             */
+            inline std::shared_ptr<ttp::task_thread_pool> get_default_pool() {
+                static std::shared_ptr<ttp::task_thread_pool> pool;
+                static std::once_flag flag;
+                std::call_once(flag, [&](){ pool = std::make_shared<ttp::task_thread_pool>(); });
+                return pool;
+            }
+        }
+
+        /**
+         * Base class for all poolSTL policies.
+         */
+        struct poolstl_policy {
+        };
+
+        /**
+         * A sequential policy that simply forwards to the non-policy overload.
+         */
+        struct sequenced_policy : public poolstl_policy {
+            POOLSTL_NO_DISCARD ttp::task_thread_pool* pool() const {
+                // never called, but must exist for C++11 support
+                throw std::runtime_error("poolSTL: requested thread pool for seq policy.");
+            }
+
+            POOLSTL_NO_DISCARD bool par_allowed() const {
+                return false;
+            }
+        };
+
+        /**
+         * A parallel policy that can use a user-specified thread pool or a default one.
+         */
+        struct parallel_policy : public poolstl_policy {
+            parallel_policy() = default;
+            explicit parallel_policy(ttp::task_thread_pool* on_pool, bool par_ok): on_pool(on_pool), par_ok(par_ok) {}
+
+            parallel_policy on(ttp::task_thread_pool& pool) const {
+                return parallel_policy{&pool, par_ok};
+            }
+
+            parallel_policy par_if(bool call_par) const {
+                return parallel_policy{on_pool, call_par};
+            }
+
+            POOLSTL_NO_DISCARD ttp::task_thread_pool* pool() const {
+                if (on_pool) {
+                    return on_pool;
+                } else {
+                    return internal::get_default_pool().get();
+                }
+            }
+
+            POOLSTL_NO_DISCARD bool par_allowed() const {
+                return par_ok;
+            }
+
+        protected:
+            ttp::task_thread_pool *on_pool = nullptr;
+            bool par_ok = true;
+        };
+
+        constexpr sequenced_policy seq{};
+        constexpr parallel_policy par{};
+
+        /**
+         * EXPERIMENTAL: Subject to significant changes or removal.
+         * Use pure threads for each operation instead of a shared thread pool.
+         *
+         * Advantages:
+         *  - Fewer symbols (no packaged_task with its operators, destructors, vtable, etc) means smaller binary
+         *    which can mean a lot when there are many calls.
+         *  - No thread pool to manage.
+         *
+         * Disadvantages:
+         *  - Threads are started and joined for every operation, so it is harder to amortize that cost.
+         *  - Barely any algorithms are supported.
+         */
+        struct pure_threads_policy : public poolstl_policy {
+            explicit pure_threads_policy(unsigned int num_threads, bool par_ok): num_threads(num_threads),
+                                                                                 par_ok(par_ok) {}
+
+            POOLSTL_NO_DISCARD unsigned int get_num_threads() const {
+                // 0 requests autodetection; hardware_concurrency() may itself report 0, so never return less than 1.
+                if (num_threads != 0) { return num_threads; }
+                const unsigned int detected = std::thread::hardware_concurrency();
+                return detected == 0 ? 1u : detected;
+            }
+
+            POOLSTL_NO_DISCARD bool par_allowed() const {
+                return par_ok;
+            }
+
+        protected:
+            unsigned int num_threads = 1;
+            bool par_ok = true;
+        };
+
+        /**
+         * Choose parallel or sequential at runtime.
+         *
+         * @param call_par Whether to use a parallel policy.
+         * @return `par` if call_par is true, else a sequential policy (like `seq`).
+         */
+        inline parallel_policy par_if(bool call_par) {
+            return parallel_policy{nullptr, call_par};
+        }
+
+        /**
+         * Choose parallel or sequential at runtime, with pool selection.
+         *
+         * @param call_par Whether to use a parallel policy.
+         * @return `par.on(pool)` if call_par is true, else a sequential policy (like `seq`).
+         */
+        inline parallel_policy par_if(bool call_par, ttp::task_thread_pool& pool) {
+            return parallel_policy{&pool, call_par};
+        }
+
+        /**
+         * EXPERIMENTAL: Subject to significant changes or removal. See `pure_threads_policy`.
+         * Choose parallel or sequential at runtime, with thread count selection.
+         *
+         * @param call_par Whether to use a parallel policy.
+         * @return A `pure_threads_policy` for `num_threads` threads; parallel execution is allowed only if call_par is true.
+         */
+        inline pure_threads_policy par_if_threads(bool call_par, unsigned int num_threads) {
+            return pure_threads_policy{num_threads, call_par};
+        }
+    }
+
+    using execution::seq;
+    using execution::par;
+    using execution::par_if;
+
+    namespace internal {
+        /**
+         * To enable/disable seq overload resolution
+         */
+        template <class ExecPolicy, class Tp>
+        using enable_if_seq =
+            typename std::enable_if<
+                std::is_same<poolstl::execution::sequenced_policy,
+                    typename std::remove_cv<typename std::remove_reference<ExecPolicy>::type>::type>::value,
+                Tp>::type;
+
+        /**
+         * To enable/disable par overload resolution
+         */
+        template <class ExecPolicy, class Tp>
+        using enable_if_par =
+            typename std::enable_if<
+                std::is_same<poolstl::execution::parallel_policy,
+                    typename std::remove_cv<typename std::remove_reference<ExecPolicy>::type>::type>::value,
+                Tp>::type;
+
+        /**
+         * To enable/disable overload resolution for any policy derived from poolstl_policy
+         */
+        template <class ExecPolicy, class Tp>
+        using enable_if_poolstl_policy =
+            typename std::enable_if<
+                std::is_base_of<poolstl::execution::poolstl_policy,
+                    typename std::remove_cv<typename std::remove_reference<ExecPolicy>::type>::type>::value,
+                Tp>::type;
+
+        template <class ExecPolicy>
+        bool is_seq(const ExecPolicy& policy) {
+            return !policy.par_allowed();
+        }
+
+        template <class ExecPolicy>
+        using is_pure_threads_policy = std::is_same<poolstl::execution::pure_threads_policy,
+            typename std::remove_cv<typename std::remove_reference<ExecPolicy>::type>::type>;
+    }
+}
+
+#endif
+
+#ifndef POOLSTL_ALGORITHM_HPP
+#define POOLSTL_ALGORITHM_HPP
+
+#include <functional>
+
+
+#ifndef POOLSTL_INTERNAL_TTP_IMPL_HPP
+#define POOLSTL_INTERNAL_TTP_IMPL_HPP
+
+#include <algorithm>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+
+#ifndef POOLSTL_EXECUTION_HPP
+#define POOLSTL_EXECUTION_HPP
+
+#include <memory>
+#include <mutex>
+#include <stdexcept>
 #include <type_traits>
 
 
@@ -102,7 +864,7 @@
 // Version macros.
 #define TASK_THREAD_POOL_VERSION_MAJOR 1
 #define TASK_THREAD_POOL_VERSION_MINOR 0
-#define TASK_THREAD_POOL_VERSION_PATCH 9
+#define TASK_THREAD_POOL_VERSION_PATCH 10
 
 #include <condition_variable>
 #include <functional>
@@ -289,12 +1051,22 @@ namespace task_thread_pool {
 #else
             typename R = typename std::result_of<decay_t<F>(decay_t<A>...)>::type
 #endif
-            >
+        >
         TTP_NODISCARD std::future<R> submit(F&& func, A&&... args) {
+#if defined(_MSC_VER)
+            // MSVC's packaged_task is not movable even though it should be.
+            // Discussion about this bug and its future fix:
+            // https://developercommunity.visualstudio.com/t/unable-to-move-stdpackaged-task-into-any-stl-conta/108672
             std::shared_ptr<std::packaged_task<R()>> ptask =
                 std::make_shared<std::packaged_task<R()>>(std::bind(std::forward<F>(func), std::forward<A>(args)...));
             submit_detach([ptask] { (*ptask)(); });
             return ptask->get_future();
+#else
+            std::packaged_task<R()> task(std::bind(std::forward<F>(func), std::forward<A>(args)...));
+            auto ret = task.get_future();
+            submit_detach(std::move(task));
+            return ret;
+#endif
         }
 
         /**
@@ -493,7 +1265,7 @@ namespace task_thread_pool {
 // Version macros.
 #define POOLSTL_VERSION_MAJOR 0
 #define POOLSTL_VERSION_MINOR 3
-#define POOLSTL_VERSION_PATCH 1
+#define POOLSTL_VERSION_PATCH 3
 
 #include <cstddef>
 #include <iterator>
@@ -637,10 +1409,6 @@ namespace poolstl {
 
 #endif
 
-#if POOLSTL_HAVE_CXX17
-#include <variant>
-#endif
-
 namespace poolstl {
 
     namespace ttp = task_thread_pool;
@@ -658,92 +1426,129 @@ namespace poolstl {
             }
         }
 
+        /**
+         * Base class for all poolSTL policies.
+         */
+        struct poolstl_policy {
+        };
+
         /**
          * A sequential policy that simply forwards to the non-policy overload.
          */
-        struct sequenced_policy {};
+        struct sequenced_policy : public poolstl_policy {
+            POOLSTL_NO_DISCARD ttp::task_thread_pool* pool() const {
+                // never called, but must exist for C++11 support
+                throw std::runtime_error("poolSTL: requested thread pool for seq policy.");
+            }
+
+            POOLSTL_NO_DISCARD bool par_allowed() const {
+                return false;
+            }
+        };
 
         /**
          * A parallel policy that can use a user-specified thread pool or a default one.
          */
-        struct parallel_policy {
+        struct parallel_policy : public poolstl_policy {
             parallel_policy() = default;
-            explicit parallel_policy(ttp::task_thread_pool& on_pool): on_pool(&on_pool) {}
+            explicit parallel_policy(ttp::task_thread_pool* on_pool, bool par_ok): on_pool(on_pool), par_ok(par_ok) {}
 
             parallel_policy on(ttp::task_thread_pool& pool) const {
-                return parallel_policy{pool};
+                return parallel_policy{&pool, par_ok};
+            }
+
+            parallel_policy par_if(bool call_par) const {
+                return parallel_policy{on_pool, call_par};
             }
 
-            POOLSTL_NO_DISCARD ttp::task_thread_pool& pool() const {
+            POOLSTL_NO_DISCARD ttp::task_thread_pool* pool() const {
                 if (on_pool) {
-                    return *on_pool;
+                    return on_pool;
                 } else {
-                    return *(internal::get_default_pool());
+                    return internal::get_default_pool().get();
                 }
             }
 
+            POOLSTL_NO_DISCARD bool par_allowed() const {
+                return par_ok;
+            }
+
         protected:
             ttp::task_thread_pool *on_pool = nullptr;
+            bool par_ok = true;
         };
 
         constexpr sequenced_policy seq{};
         constexpr parallel_policy par{};
 
-
-#if POOLSTL_HAVE_CXX17
         /**
-         * A policy that allows selecting a policy at runtime.
+         * EXPERIMENTAL: Subject to significant changes or removal.
+         * Use pure threads for each operation instead of a shared thread pool.
          *
-         * @tparam Variant std::variant<> of policy options.
+         * Advantages:
+         *  - Fewer symbols (no packaged_task with its operators, destructors, vtable, etc) means smaller binary
+         *    which can mean a lot when there are many calls.
+         *  - No thread pool to manage.
+         *
+         * Disadvantages:
+         *  - Threads are started and joined for every operation, so it is harder to amortize that cost.
+         *  - Barely any algorithms are supported.
          */
-        template <typename Variant>
-        struct variant_policy {
-            explicit variant_policy(const Variant& policy): var(policy) {}
-            Variant var;
-        };
+        struct pure_threads_policy : public poolstl_policy {
+            explicit pure_threads_policy(unsigned int num_threads, bool par_ok): num_threads(num_threads),
+                                                                                 par_ok(par_ok) {}
 
-        namespace internal {
-            using poolstl_policy_variant = std::variant<
-                poolstl::execution::parallel_policy,
-                poolstl::execution::sequenced_policy>;
-        }
+            POOLSTL_NO_DISCARD unsigned int get_num_threads() const {
+                // 0 requests autodetection; hardware_concurrency() may itself report 0, so never return less than 1.
+                if (num_threads != 0) { return num_threads; }
+                const unsigned int detected = std::thread::hardware_concurrency();
+                return detected == 0 ? 1u : detected;
+            }
+
+            POOLSTL_NO_DISCARD bool par_allowed() const {
+                return par_ok;
+            }
+
+        protected:
+            unsigned int num_threads = 1;
+            bool par_ok = true;
+        };
 
         /**
          * Choose parallel or sequential at runtime.
          *
          * @param call_par Whether to use a parallel policy.
-         * @return `par` if call_par is true, else `seq`.
+         * @return `par` if call_par is true, else a sequential policy (like `seq`).
          */
-        inline variant_policy<internal::poolstl_policy_variant> par_if(bool call_par) {
-            if (call_par) {
-                return variant_policy(internal::poolstl_policy_variant(par));
-            } else {
-                return variant_policy(internal::poolstl_policy_variant(seq));
-            }
+        inline parallel_policy par_if(bool call_par) {
+            return parallel_policy{nullptr, call_par};
         }
 
         /**
          * Choose parallel or sequential at runtime, with pool selection.
          *
          * @param call_par Whether to use a parallel policy.
+         * @return `par.on(pool)` if call_par is true, else a sequential policy (like `seq`).
+         */
+        inline parallel_policy par_if(bool call_par, ttp::task_thread_pool& pool) {
+            return parallel_policy{&pool, call_par};
+        }
+
+        /**
+         * EXPERIMENTAL: Subject to significant changes or removal. See `pure_threads_policy`.
+         * Choose parallel or sequential at runtime, with thread count selection.
+         *
+         * @param call_par Whether to use a parallel policy.
          * @return `par.on(pool)` if call_par is true, else `seq`.
          */
-        inline variant_policy<internal::poolstl_policy_variant> par_if(bool call_par, ttp::task_thread_pool& pool) {
-            if (call_par) {
-                return variant_policy(internal::poolstl_policy_variant(par.on(pool)));
-            } else {
-                return variant_policy(internal::poolstl_policy_variant(seq));
-            }
+        inline pure_threads_policy par_if_threads(bool call_par, unsigned int num_threads) {
+            return pure_threads_policy{num_threads, call_par};
         }
-#endif
     }
 
     using execution::seq;
     using execution::par;
-#if POOLSTL_HAVE_CXX17
-    using execution::variant_policy;
     using execution::par_if;
-#endif
 
     namespace internal {
         /**
@@ -766,44 +1571,29 @@ namespace poolstl {
                     typename std::remove_cv<typename std::remove_reference<ExecPolicy>::type>::type>::value,
                 Tp>::type;
 
-#if POOLSTL_HAVE_CXX17
-        /**
-         * Helper for enable_if_poolstl_variant
-         */
-        template <typename T> struct is_poolstl_variant_policy : std::false_type {};
-        template <typename V> struct is_poolstl_variant_policy<
-            ::poolstl::execution::variant_policy<V>> :std::true_type {};
-
         /**
-         * To enable/disable variant_policy (for par_if) overload resolution
+         * To enable/disable overload resolution for any policy derived from poolstl_policy
          */
         template <class ExecPolicy, class Tp>
-        using enable_if_poolstl_variant =
+        using enable_if_poolstl_policy =
             typename std::enable_if<
-                is_poolstl_variant_policy<
+                std::is_base_of<poolstl::execution::poolstl_policy,
                     typename std::remove_cv<typename std::remove_reference<ExecPolicy>::type>::type>::value,
                 Tp>::type;
-#endif
+
+        template <class ExecPolicy>
+        bool is_seq(const ExecPolicy& policy) {
+            return !policy.par_allowed();
+        }
+
+        template <class ExecPolicy>
+        using is_pure_threads_policy = std::is_same<poolstl::execution::pure_threads_policy,
+            typename std::remove_cv<typename std::remove_reference<ExecPolicy>::type>::type>;
     }
 }
 
 #endif
 
-#ifndef POOLSTL_ALGORITHM_HPP
-#define POOLSTL_ALGORITHM_HPP
-
-#include <functional>
-
-
-#ifndef POOLSTL_INTERNAL_TTP_IMPL_HPP
-#define POOLSTL_INTERNAL_TTP_IMPL_HPP
-
-#include <algorithm>
-#include <numeric>
-#include <utility>
-#include <vector>
-
-
 namespace poolstl {
     namespace internal {
 
@@ -815,33 +1605,61 @@ namespace poolstl {
         std::vector<std::future<void>>
         parallel_apply(ExecPolicy &&policy, Op op, const ArgContainer& args_list) {
             std::vector<std::future<void>> futures;
-            auto& task_pool = policy.pool();
+            auto& task_pool = *policy.pool();
 
             for (const auto& args : args_list) {
-                futures.emplace_back(task_pool.submit([op](const auto& args_fwd) { std::apply(op, args_fwd); }, args));
+                futures.emplace_back(task_pool.submit([](Op op, const auto& args_fwd) {
+                        std::apply(op, args_fwd);
+                    }, op, args));
             }
 
             return futures;
         }
 #endif
 
+        /**
+         * Chunk a single range, with autodetected return types.
+         */
+        template <class ExecPolicy, class RandIt, class Chunk,
+            class ChunkRet = decltype(std::declval<Chunk>()(std::declval<RandIt>(), std::declval<RandIt>()))>
+        std::vector<std::future<ChunkRet>>
+        parallel_chunk_for_gen(ExecPolicy &&policy, RandIt first, RandIt last, Chunk chunk,
+                               ChunkRet* = (decltype(std::declval<Chunk>()(std::declval<RandIt>(),
+                                                     std::declval<RandIt>()))*)nullptr,
+                               int extra_split_factor = 1) {
+            std::vector<std::future<ChunkRet>> futures;
+            auto& task_pool = *policy.pool();
+            auto chunk_size = get_chunk_size(first, last, extra_split_factor * task_pool.get_num_threads());
+
+            while (first < last) {
+                auto iter_chunk_size = get_iter_chunk_size(first, last, chunk_size);
+                RandIt loop_end = advanced(first, iter_chunk_size);
+
+                futures.emplace_back(task_pool.submit(std::forward<Chunk>(chunk), first, loop_end));
+
+                first = loop_end;
+            }
+
+            return futures;
+        }
+
         /**
          * Chunk a single range.
          */
-        template <class ExecPolicy, class RandIt, class Chunk>
-        std::vector<std::future<decltype(std::declval<Chunk>()(std::declval<RandIt>(), std::declval<RandIt>()))>>
-        parallel_chunk_for(ExecPolicy &&policy, RandIt first, RandIt last, Chunk chunk, int extra_split_factor = 1) {
-            std::vector<std::future<
-                decltype(std::declval<Chunk>()(std::declval<RandIt>(), std::declval<RandIt>()))
-                >> futures;
-            auto& task_pool = policy.pool();
+        template <class ExecPolicy, class RandIt, class Chunk, class ChunkRet, typename... A>
+        std::vector<std::future<ChunkRet>>
+        parallel_chunk_for_1(ExecPolicy &&policy, RandIt first, RandIt last,
+                             Chunk chunk, ChunkRet*, int extra_split_factor, A&&... chunk_args) {
+            std::vector<std::future<ChunkRet>> futures;
+            auto& task_pool = *policy.pool();
             auto chunk_size = get_chunk_size(first, last, extra_split_factor * task_pool.get_num_threads());
 
             while (first < last) {
                 auto iter_chunk_size = get_iter_chunk_size(first, last, chunk_size);
                 RandIt loop_end = advanced(first, iter_chunk_size);
 
-                futures.emplace_back(task_pool.submit(chunk, first, loop_end));
+                futures.emplace_back(task_pool.submit(std::forward<Chunk>(chunk), first, loop_end,
+                                                      std::forward<A>(chunk_args)...));
 
                 first = loop_end;
             }
@@ -849,28 +1667,36 @@ namespace poolstl {
             return futures;
         }
 
+        /**
+         * Chunk a single range and pass the futures to get_futures(); not used for pure_threads_policy.
+         */
+        template <class ExecPolicy, class RandIt, class Chunk, class ChunkRet, typename... A>
+        typename std::enable_if<!is_pure_threads_policy<ExecPolicy>::value, void>::type
+        parallel_chunk_for_1_wait(ExecPolicy &&policy, RandIt first, RandIt last,
+                                  Chunk chunk, ChunkRet* rettype, int extra_split_factor, A&&... chunk_args) {
+            auto futures = parallel_chunk_for_1(std::forward<ExecPolicy>(policy), first, last,
+                                                std::forward<Chunk>(chunk), rettype, extra_split_factor,
+                                                std::forward<A>(chunk_args)...);
+            get_futures(futures);
+        }
+
         /**
          * Element-wise chunk two ranges.
          */
-        template <class ExecPolicy, class RandIt1, class RandIt2, class Chunk>
-        std::vector<std::future<decltype(std::declval<Chunk>()(
-            std::declval<RandIt1>(),
-            std::declval<RandIt1>(),
-            std::declval<RandIt2>()))>>
-        parallel_chunk_for(ExecPolicy &&policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, Chunk chunk) {
-            std::vector<std::future<decltype(std::declval<Chunk>()(
-                    std::declval<RandIt1>(),
-                    std::declval<RandIt1>(),
-                    std::declval<RandIt2>()))
-            >> futures;
-            auto& task_pool = policy.pool();
+        template <class ExecPolicy, class RandIt1, class RandIt2, class Chunk, class ChunkRet, typename... A>
+        std::vector<std::future<ChunkRet>>
+        parallel_chunk_for_2(ExecPolicy &&policy, RandIt1 first1, RandIt1 last1, RandIt2 first2,
+                             Chunk chunk, ChunkRet*, A&&... chunk_args) {
+            std::vector<std::future<ChunkRet>> futures;
+            auto& task_pool = *policy.pool();
             auto chunk_size = get_chunk_size(first1, last1, task_pool.get_num_threads());
 
             while (first1 < last1) {
                 auto iter_chunk_size = get_iter_chunk_size(first1, last1, chunk_size);
                 RandIt1 loop_end = advanced(first1, iter_chunk_size);
 
-                futures.emplace_back(task_pool.submit(chunk, first1, loop_end, first2));
+                futures.emplace_back(task_pool.submit(std::forward<Chunk>(chunk), first1, loop_end, first2,
+                                                      std::forward<A>(chunk_args)...));
 
                 first1 = loop_end;
                 std::advance(first2, iter_chunk_size);
@@ -882,28 +1708,21 @@ namespace poolstl {
         /**
          * Element-wise chunk three ranges.
          */
-        template <class ExecPolicy, class RandIt1, class RandIt2, class RandIt3, class Chunk>
-        std::vector<std::future<decltype(std::declval<Chunk>()(
-            std::declval<RandIt1>(),
-            std::declval<RandIt1>(),
-            std::declval<RandIt2>(),
-            std::declval<RandIt3>()))>>
-        parallel_chunk_for(ExecPolicy &&policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, RandIt3 first3,
-                           Chunk chunk) {
-            std::vector<std::future<decltype(std::declval<Chunk>()(
-                std::declval<RandIt1>(),
-                std::declval<RandIt1>(),
-                std::declval<RandIt2>(),
-                std::declval<RandIt3>()))
-            >> futures;
-            auto& task_pool = policy.pool();
+        template <class ExecPolicy, class RandIt1, class RandIt2, class RandIt3,
+                  class Chunk, class ChunkRet, typename... A>
+        std::vector<std::future<ChunkRet>>
+        parallel_chunk_for_3(ExecPolicy &&policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, RandIt3 first3,
+                           Chunk chunk, ChunkRet*, A&&... chunk_args) {
+            std::vector<std::future<ChunkRet>> futures;
+            auto& task_pool = *policy.pool();
             auto chunk_size = get_chunk_size(first1, last1, task_pool.get_num_threads());
 
             while (first1 < last1) {
                 auto iter_chunk_size = get_iter_chunk_size(first1, last1, chunk_size);
                 RandIt1 loop_end = advanced(first1, iter_chunk_size);
 
-                futures.emplace_back(task_pool.submit(chunk, first1, loop_end, first2, first3));
+                futures.emplace_back(task_pool.submit(std::forward<Chunk>(chunk), first1, loop_end, first2, first3,
+                                                      std::forward<A>(chunk_args)...));
 
                 first1 = loop_end;
                 std::advance(first2, iter_chunk_size);
@@ -916,28 +1735,26 @@ namespace poolstl {
         /**
          * Sort a range in parallel.
          *
-         * @param stable Whether to use std::stable_sort or std::sort
+         * @param sort_func Sequential sort method, like std::sort or std::stable_sort
+         * @param merge_func Sequential merge method, like std::inplace_merge
          */
-        template <class ExecPolicy, class RandIt, class Compare>
-        void parallel_sort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp, bool stable) {
+        template <class ExecPolicy, class RandIt, class Compare, class SortFunc, class MergeFunc>
+        void parallel_sort(ExecPolicy &&policy, RandIt first, RandIt last,
+                           Compare comp, SortFunc sort_func, MergeFunc merge_func) {
             if (first == last) {
                 return;
             }
 
             // Sort chunks in parallel
-            auto futures = parallel_chunk_for(std::forward<ExecPolicy>(policy), first, last,
-                             [&comp, stable] (RandIt chunk_first, RandIt chunk_last) {
-                                 if (stable) {
-                                     std::stable_sort(chunk_first, chunk_last, comp);
-                                 } else {
-                                     std::sort(chunk_first, chunk_last, comp);
-                                 }
+            auto futures = parallel_chunk_for_gen(std::forward<ExecPolicy>(policy), first, last,
+                             [&comp, sort_func] (RandIt chunk_first, RandIt chunk_last) {
+                                 sort_func(chunk_first, chunk_last, comp);
                                  return std::make_pair(chunk_first, chunk_last);
                              });
 
             // Merge the sorted ranges
             using SortedRange = std::pair<RandIt, RandIt>;
-            auto& task_pool = policy.pool();
+            auto& task_pool = *policy.pool();
             std::vector<SortedRange> subranges;
             do {
                 for (auto& future : futures) {
@@ -950,9 +1767,10 @@ namespace poolstl {
                         // pair up and merge
                         auto& lhs = subranges[i];
                         auto& rhs = subranges[i + 1];
-                        futures.emplace_back(task_pool.submit([&comp] (RandIt chunk_first, RandIt chunk_middle,
-                                                                       RandIt chunk_last) {
-                            std::inplace_merge(chunk_first, chunk_middle, chunk_last, comp);
+                        futures.emplace_back(task_pool.submit([&comp, merge_func] (RandIt chunk_first,
+                                                                                   RandIt chunk_middle,
+                                                                                   RandIt chunk_last) {
+                            merge_func(chunk_first, chunk_middle, chunk_last, comp);
                             return std::make_pair(chunk_first, chunk_last);
                         }, lhs.first, lhs.second, rhs.second));
                         ++i;
@@ -973,6 +1791,56 @@ namespace poolstl {
 
 #endif
 
+#ifndef POOLSTL_INTERNAL_THREAD_IMPL_HPP
+#define POOLSTL_INTERNAL_THREAD_IMPL_HPP
+
+/**
+ * EXPERIMENTAL: Subject to significant changes or removal.
+ * An implementation using only std::thread and no thread pool at all.
+ *
+ * Advantages:
+ *  - Fewer symbols (no packaged_task with its operators, destructors, vtable, etc.) mean a smaller binary,
+ *    which can matter a lot when there are many calls, e.g. with many template instantiations.
+ *  - No thread pool to manage.
+ *
+ * Disadvantages:
+ *  - Threads are started and joined for every operation, so it is harder to amortize that cost.
+ *  - Barely any algorithms are supported.
+ */
+
+
+
+namespace poolstl {
+    namespace internal {
+
+        template <class ExecPolicy, class RandIt, class Chunk, class ChunkRet, typename... A>
+        typename std::enable_if<is_pure_threads_policy<ExecPolicy>::value, void>::type
+        parallel_chunk_for_1_wait(ExecPolicy &&policy, RandIt first, RandIt last,
+                                  Chunk chunk, ChunkRet*, int extra_split_factor, A&&... chunk_args) {
+            std::vector<std::thread> threads;
+            auto chunk_size = get_chunk_size(first, last, extra_split_factor * policy.get_num_threads());
+
+            while (first < last) {
+                auto iter_chunk_size = get_iter_chunk_size(first, last, chunk_size);
+                RandIt loop_end = advanced(first, iter_chunk_size);
+
+                threads.emplace_back(std::thread(std::forward<Chunk>(chunk), first, loop_end,
+                                                 std::forward<A>(chunk_args)...));
+
+                first = loop_end;
+            }
+
+            for (auto& thread : threads) {
+                if (thread.joinable()) {
+                    thread.join();
+                }
+            }
+        }
+    }
+}
+
+#endif
+
 namespace std {
 
     /**
@@ -980,12 +1848,14 @@ namespace std {
      * See std::copy https://en.cppreference.com/w/cpp/algorithm/copy
      */
     template <class ExecPolicy, class RandIt1, class RandIt2>
-    poolstl::internal::enable_if_par<ExecPolicy, RandIt2>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, RandIt2>
     copy(ExecPolicy &&policy, RandIt1 first, RandIt1 last, RandIt2 dest) {
-        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first, last, dest,
-                     [](RandIt1 chunk_first, RandIt1 chunk_last, RandIt2 chunk_dest) {
-                          std::copy(chunk_first, chunk_last, chunk_dest);
-                     });
+        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
+            return std::copy(first, last, dest);
+        }
+
+        auto futures = poolstl::internal::parallel_chunk_for_2(std::forward<ExecPolicy>(policy), first, last, dest,
+                                                               std::copy<RandIt1, RandIt2>, (RandIt2*)nullptr);
         poolstl::internal::get_futures(futures);
         return poolstl::internal::advanced(dest, std::distance(first, last));
     }
@@ -995,7 +1865,7 @@ namespace std {
      * See std::copy_n https://en.cppreference.com/w/cpp/algorithm/copy_n
      */
     template <class ExecPolicy, class RandIt1, class Size, class RandIt2>
-    poolstl::internal::enable_if_par<ExecPolicy, RandIt2>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, RandIt2>
     copy_n(ExecPolicy &&policy, RandIt1 first, Size n, RandIt2 dest) {
         if (n <= 0) {
             return dest;
@@ -1010,14 +1880,17 @@ namespace std {
      * See std::count_if https://en.cppreference.com/w/cpp/algorithm/count_if
      */
     template <class ExecPolicy, class RandIt, class UnaryPredicate>
-    poolstl::internal::enable_if_par<ExecPolicy, typename iterator_traits<RandIt>::difference_type>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, typename iterator_traits<RandIt>::difference_type>
     count_if(ExecPolicy&& policy, RandIt first, RandIt last, UnaryPredicate p) {
+        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
+            return std::count_if(first, last, p);
+        }
+
         using T = typename iterator_traits<RandIt>::difference_type;
 
-        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first, last,
-                                                             [&p](RandIt chunk_first, RandIt chunk_last) {
-                                                                 return std::count_if(chunk_first, chunk_last, p);
-                                                             });
+        auto futures = poolstl::internal::parallel_chunk_for_1(std::forward<ExecPolicy>(policy), first, last,
+                                                               std::count_if<RandIt, UnaryPredicate>,
+                                                               (T*)nullptr, 1, p);
 
         return poolstl::internal::cpp17::reduce(
             poolstl::internal::get_wrap(futures.begin()),
@@ -1029,7 +1902,7 @@ namespace std {
      * See std::count https://en.cppreference.com/w/cpp/algorithm/count
      */
     template <class ExecPolicy, class RandIt, class T>
-    poolstl::internal::enable_if_par<ExecPolicy, typename iterator_traits<RandIt>::difference_type>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, typename iterator_traits<RandIt>::difference_type>
     count(ExecPolicy&& policy, RandIt first, RandIt last, const T& value) {
         return std::count_if(std::forward<ExecPolicy>(policy), first, last,
                              [&value](const T& test) { return test == value; });
@@ -1040,13 +1913,15 @@ namespace std {
      * See std::fill https://en.cppreference.com/w/cpp/algorithm/fill
      */
     template <class ExecPolicy, class RandIt, class Tp>
-    poolstl::internal::enable_if_par<ExecPolicy, void>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, void>
     fill(ExecPolicy &&policy, RandIt first, RandIt last, const Tp& value) {
-        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first, last,
-                                                             [&value](RandIt chunk_first, RandIt chunk_last) {
-                                                                 std::fill(chunk_first, chunk_last, value);
-                                                             });
-        poolstl::internal::get_futures(futures);
+        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
+            std::fill(first, last, value);
+            return;
+        }
+
+        poolstl::internal::parallel_chunk_for_1_wait(std::forward<ExecPolicy>(policy), first, last,
+                                                     std::fill<RandIt, Tp>, (void*)nullptr, 1, value);
     }
 
     /**
@@ -1054,7 +1929,7 @@ namespace std {
      * See std::fill_n https://en.cppreference.com/w/cpp/algorithm/fill_n
      */
     template <class ExecPolicy, class RandIt, class Size, class Tp>
-    poolstl::internal::enable_if_par<ExecPolicy, RandIt>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, RandIt>
     fill_n(ExecPolicy &&policy, RandIt first, Size n, const Tp& value) {
         if (n <= 0) {
             return first;
@@ -1069,13 +1944,17 @@ namespace std {
      * See std::find_if https://en.cppreference.com/w/cpp/algorithm/find_if
      */
     template <class ExecPolicy, class RandIt, class UnaryPredicate>
-    poolstl::internal::enable_if_par<ExecPolicy, RandIt>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, RandIt>
     find_if(ExecPolicy &&policy, RandIt first, RandIt last, UnaryPredicate p) {
+        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
+            return std::find_if(first, last, p);
+        }
+
         using diff_t = typename std::iterator_traits<RandIt>::difference_type;
         diff_t n = std::distance(first, last);
         std::atomic<diff_t> extremum(n);
 
-        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first, last,
+        poolstl::internal::parallel_chunk_for_1_wait(std::forward<ExecPolicy>(policy), first, last,
                                         [&first, &extremum, &p](RandIt chunk_first, RandIt chunk_last) {
                                             if (std::distance(first, chunk_first) > extremum) {
                                              // already found by another task
@@ -1091,8 +1970,8 @@ namespace std {
                                                     extremum.compare_exchange_weak(old, k);
                                                 }
                                             }
-                                        }, 8); // use small tasks so later ones may exit early if item is already found
-        poolstl::internal::get_futures(futures);
+                                        }, (void*)nullptr,
+                                        8); // use small tasks so later ones may exit early if item is already found
         return extremum == n ? last : first + extremum;
     }
 
@@ -1101,10 +1980,15 @@ namespace std {
      * See std::find_if_not https://en.cppreference.com/w/cpp/algorithm/find_if_not
      */
     template <class ExecPolicy, class RandIt, class UnaryPredicate>
-    poolstl::internal::enable_if_par<ExecPolicy, RandIt>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, RandIt>
     find_if_not(ExecPolicy &&policy, RandIt first, RandIt last, UnaryPredicate p) {
         return std::find_if(std::forward<ExecPolicy>(policy), first, last,
-                            [&p](const typename std::iterator_traits<RandIt>::value_type& test) { return !p(test); });
+#if POOLSTL_HAVE_CXX17_LIB
+                            std::not_fn(p)
+#else
+                            [&p](const typename std::iterator_traits<RandIt>::value_type& test) { return !p(test); }
+#endif
+                            );
     }
 
     /**
@@ -1112,7 +1996,7 @@ namespace std {
      * See std::find https://en.cppreference.com/w/cpp/algorithm/find
      */
     template <class ExecPolicy, class RandIt, class T>
-    poolstl::internal::enable_if_par<ExecPolicy, RandIt>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, RandIt>
     find(ExecPolicy &&policy, RandIt first, RandIt last, const T& value) {
         return std::find_if(std::forward<ExecPolicy>(policy), first, last,
                             [&value](const T& test) { return value == test; });
@@ -1123,16 +2007,23 @@ namespace std {
      * See std::for_each https://en.cppreference.com/w/cpp/algorithm/for_each
      */
     template <class ExecPolicy, class RandIt, class UnaryFunction>
-    poolstl::internal::enable_if_par<ExecPolicy, void>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, void>
     for_each(ExecPolicy &&policy, RandIt first, RandIt last, UnaryFunction f) {
-        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first, last,
-                                                             [&f](RandIt chunk_first, RandIt chunk_last) {
-                                                                 // std::for_each(chunk_first, chunk_last, f);
-                                                                 for (; chunk_first != chunk_last; ++chunk_first) {
-                                                                     f(*chunk_first);
-                                                                 }
-                                                             });
-        poolstl::internal::get_futures(futures);
+        // Using a lambda instead of just calling the non-policy std::for_each because it appears to
+        // result in a smaller binary.
+        auto chunk_func = [&f](RandIt chunk_first, RandIt chunk_last) {
+            for (; chunk_first != chunk_last; ++chunk_first) {
+                f(*chunk_first);
+            }
+        };
+
+        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
+            chunk_func(first, last);
+            return;
+        }
+
+        poolstl::internal::parallel_chunk_for_1_wait(std::forward<ExecPolicy>(policy), first, last,
+                                                     chunk_func, (void*)nullptr, 1);
     }
 
     /**
@@ -1140,7 +2031,7 @@ namespace std {
      * See std::for_each_n https://en.cppreference.com/w/cpp/algorithm/for_each_n
      */
     template <class ExecPolicy, class RandIt, class Size, class UnaryFunction>
-    poolstl::internal::enable_if_par<ExecPolicy, RandIt>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, RandIt>
     for_each_n(ExecPolicy &&policy, RandIt first, Size n, UnaryFunction f) {
         RandIt last = poolstl::internal::advanced(first, n);
         std::for_each(std::forward<ExecPolicy>(policy), first, last, f);
@@ -1152,9 +2043,15 @@ namespace std {
      * See std::sort https://en.cppreference.com/w/cpp/algorithm/sort
      */
     template <class ExecPolicy, class RandIt, class Compare>
-    poolstl::internal::enable_if_par<ExecPolicy, void>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, void>
     sort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp) {
-        poolstl::internal::parallel_sort(std::forward<ExecPolicy>(policy), first, last, comp, false);
+        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
+            std::sort(first, last, comp);
+            return;
+        }
+
+        poolstl::internal::parallel_sort(std::forward<ExecPolicy>(policy), first, last, comp,
+                                         std::sort<RandIt, Compare>, std::inplace_merge<RandIt, Compare>);
     }
 
     /**
@@ -1162,10 +2059,10 @@ namespace std {
      * See std::sort https://en.cppreference.com/w/cpp/algorithm/sort
      */
     template <class ExecPolicy, class RandIt>
-    poolstl::internal::enable_if_par<ExecPolicy, void>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, void>
     sort(ExecPolicy &&policy, RandIt first, RandIt last) {
         using T = typename std::iterator_traits<RandIt>::value_type;
-        poolstl::internal::parallel_sort(std::forward<ExecPolicy>(policy), first, last, std::less<T>(), false);
+        std::sort(std::forward<ExecPolicy>(policy), first, last, std::less<T>());
     }
 
     /**
@@ -1173,9 +2070,15 @@ namespace std {
      * See std::stable_sort https://en.cppreference.com/w/cpp/algorithm/stable_sort
      */
     template <class ExecPolicy, class RandIt, class Compare>
-    poolstl::internal::enable_if_par<ExecPolicy, void>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, void>
     stable_sort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp) {
-        poolstl::internal::parallel_sort(std::forward<ExecPolicy>(policy), first, last, comp, true);
+        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
+            std::stable_sort(first, last, comp);
+            return;
+        }
+
+        poolstl::internal::parallel_sort(std::forward<ExecPolicy>(policy), first, last, comp,
+                                         std::stable_sort<RandIt, Compare>, std::inplace_merge<RandIt, Compare>);
     }
 
     /**
@@ -1183,10 +2086,10 @@ namespace std {
      * See std::stable_sort https://en.cppreference.com/w/cpp/algorithm/stable_sort
      */
     template <class ExecPolicy, class RandIt>
-    poolstl::internal::enable_if_par<ExecPolicy, void>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, void>
     stable_sort(ExecPolicy &&policy, RandIt first, RandIt last) {
         using T = typename std::iterator_traits<RandIt>::value_type;
-        poolstl::internal::parallel_sort(std::forward<ExecPolicy>(policy), first, last, std::less<T>(), true);
+        std::stable_sort(std::forward<ExecPolicy>(policy), first, last, std::less<T>());
     }
 
     /**
@@ -1194,14 +2097,17 @@ namespace std {
      * See std::transform https://en.cppreference.com/w/cpp/algorithm/transform
      */
     template <class ExecPolicy, class RandIt1, class RandIt2, class UnaryOperation>
-    poolstl::internal::enable_if_par<ExecPolicy, RandIt2>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, RandIt2>
     transform(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1,
               RandIt2 dest, UnaryOperation unary_op) {
+        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
+            return poolstl::internal::cpp17::transform(first1, last1, dest, unary_op);
+        }
 
-        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first1, last1, dest,
-                 [&unary_op](RandIt1 chunk_first1, RandIt1 chunk_last1, RandIt2 dest_first) {
-                      return poolstl::internal::cpp17::transform(chunk_first1, chunk_last1, dest_first, unary_op);
-                 });
+        auto futures = poolstl::internal::parallel_chunk_for_2(std::forward<ExecPolicy>(policy), first1, last1, dest,
+                                                               poolstl::internal::cpp17::transform<RandIt1, RandIt2,
+                                                                                                   UnaryOperation>,
+                                                               (RandIt2*)nullptr, unary_op);
         poolstl::internal::get_futures(futures);
         return dest + std::distance(first1, last1);
     }
@@ -1211,16 +2117,18 @@ namespace std {
      * See std::transform https://en.cppreference.com/w/cpp/algorithm/transform
      */
     template <class ExecPolicy, class RandIt1, class RandIt2, class RandIt3, class BinaryOperation>
-    poolstl::internal::enable_if_par<ExecPolicy, RandIt3>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, RandIt3>
     transform(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1,
               RandIt2 first2, RandIt3 dest, BinaryOperation binary_op) {
+        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
+            return poolstl::internal::cpp17::transform(first1, last1, first2, dest, binary_op);
+        }
 
-        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first1, last1,
-                                                             first2, dest,
-                 [&binary_op](RandIt1 chunk_first1, RandIt1 chunk_last1, RandIt1 chunk_first2, RandIt3 dest_first) {
-                     return poolstl::internal::cpp17::transform(chunk_first1, chunk_last1,
-                                                                chunk_first2, dest_first, binary_op);
-                 });
+        auto futures = poolstl::internal::parallel_chunk_for_3(std::forward<ExecPolicy>(policy), first1, last1,
+                                                               first2, dest,
+                                                               poolstl::internal::cpp17::transform<RandIt1, RandIt2,
+                                                                                              RandIt3, BinaryOperation>,
+                                                               (RandIt3*)nullptr, binary_op);
         poolstl::internal::get_futures(futures);
         return dest + std::distance(first1, last1);
     }
@@ -1230,7 +2138,7 @@ namespace std {
      * See std::all_of https://en.cppreference.com/w/cpp/algorithm/all_of
      */
     template <class ExecPolicy, typename RandIt, typename Predicate>
-    poolstl::internal::enable_if_par<ExecPolicy, bool>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, bool>
     all_of(ExecPolicy&& policy, RandIt first, RandIt last, Predicate pred) {
         return last == std::find_if_not(std::forward<ExecPolicy>(policy), first, last, pred);
     }
@@ -1240,7 +2148,7 @@ namespace std {
      * See std::none_of https://en.cppreference.com/w/cpp/algorithm/none_of
      */
     template <class ExecPolicy, typename RandIt, typename Predicate>
-    poolstl::internal::enable_if_par<ExecPolicy, bool>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, bool>
     none_of(ExecPolicy&& policy, RandIt first, RandIt last, Predicate pred) {
         return last == std::find_if(std::forward<ExecPolicy>(policy), first, last, pred);
     }
@@ -1250,7 +2158,7 @@ namespace std {
      * See std::any_of https://en.cppreference.com/w/cpp/algorithm/any_of
      */
     template <class ExecPolicy, typename RandIt, typename Predicate>
-    poolstl::internal::enable_if_par<ExecPolicy, bool>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, bool>
     any_of(ExecPolicy&& policy, RandIt first, RandIt last, Predicate pred) {
         return !std::none_of(std::forward<ExecPolicy>(policy), first, last, pred);
     }
@@ -1280,13 +2188,52 @@ namespace poolstl {
      * but cannot be shared by all parallel iterations.
      */
     template <class ExecPolicy, class RandIt, class ChunkConstructor, class UnaryFunction>
-    poolstl::internal::enable_if_par<ExecPolicy, void>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, void>
     for_each_chunk(ExecPolicy&& policy, RandIt first, RandIt last, ChunkConstructor construct, UnaryFunction f) {
-        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first, last,
-                                                             [&construct, &f](RandIt chunk_first, RandIt chunk_last) {
-                                                                 for_each_chunk(chunk_first, chunk_last, construct, f);
-                                                             });
-        poolstl::internal::get_futures(futures);
+        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
+            for_each_chunk(first, last, construct, f);
+            return;
+        }
+
+        poolstl::internal::parallel_chunk_for_1_wait(std::forward<ExecPolicy>(policy), first, last,
+                                                     for_each_chunk <RandIt, ChunkConstructor, UnaryFunction>,
+                                                     (void*)nullptr, 1, construct, f);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access.
+     *
+     * Like `std::sort`, but allows specifying the sequential sort and merge methods. These methods must have the
+     * same signature as the comparator versions of `std::sort` and `std::inplace_merge`, respectively.
+     */
+    template <class ExecPolicy, class RandIt, class Compare>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, void>
+    pluggable_sort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp,
+                   void (sort_func)(RandIt, RandIt, Compare) = std::sort,
+                   void (merge_func)(RandIt, RandIt, RandIt, Compare) = std::inplace_merge) {
+        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
+            sort_func(first, last, comp);
+            return;
+        }
+
+        poolstl::internal::parallel_sort(std::forward<ExecPolicy>(policy), first, last, comp, sort_func, merge_func);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access.
+     *
+     * Like `std::sort`, but allows specifying the sequential sort and merge methods. These methods must have the
+     * same signature as the comparator versions of `std::sort` and `std::inplace_merge`, respectively.
+     */
+    template <class ExecPolicy, class RandIt>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, void>
+    pluggable_sort(ExecPolicy &&policy, RandIt first, RandIt last,
+                   void (sort_func)(RandIt, RandIt,
+                                    std::less<typename std::iterator_traits<RandIt>::value_type>) = std::sort,
+                   void (merge_func)(RandIt, RandIt, RandIt,
+                                    std::less<typename std::iterator_traits<RandIt>::value_type>) = std::inplace_merge){
+        using T = typename std::iterator_traits<RandIt>::value_type;
+        pluggable_sort(std::forward<ExecPolicy>(policy), first, last, std::less<T>(), sort_func, merge_func);
     }
 }
 
@@ -1306,14 +2253,18 @@ namespace std {
      * See std::exclusive_scan https://en.cppreference.com/w/cpp/algorithm/exclusive_scan
      */
     template <class ExecPolicy, class RandIt1, class RandIt2, class T, class BinaryOp>
-    poolstl::internal::enable_if_par<ExecPolicy, RandIt2>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, RandIt2>
     exclusive_scan(ExecPolicy &&policy, RandIt1 first, RandIt1 last, RandIt2 dest, T init, BinaryOp binop) {
         if (first == last) {
             return dest;
         }
 
+        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
+            return std::exclusive_scan(first, last, dest, init, binop);
+        }
+
         // Pass 1: Chunk the input and find the sum of each chunk
-        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first, last,
+        auto futures = poolstl::internal::parallel_chunk_for_gen(std::forward<ExecPolicy>(policy), first, last,
                              [binop](RandIt1 chunk_first, RandIt1 chunk_last) {
                                  auto sum = std::accumulate(chunk_first, chunk_last, T{}, binop);
                                  return std::make_tuple(std::make_pair(chunk_first, chunk_last), sum);
@@ -1355,7 +2306,7 @@ namespace std {
      * See std::exclusive_scan https://en.cppreference.com/w/cpp/algorithm/exclusive_scan
      */
     template <class ExecPolicy, class RandIt1, class RandIt2, class T>
-    poolstl::internal::enable_if_par<ExecPolicy, RandIt2>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, RandIt2>
     exclusive_scan(ExecPolicy &&policy, RandIt1 first, RandIt1 last, RandIt2 dest, T init) {
         return std::exclusive_scan(std::forward<ExecPolicy>(policy), first, last, dest, init, std::plus<T>());
     }
@@ -1366,12 +2317,15 @@ namespace std {
      * See std::reduce https://en.cppreference.com/w/cpp/algorithm/reduce
      */
     template <class ExecPolicy, class RandIt, class T, class BinaryOp>
-    poolstl::internal::enable_if_par<ExecPolicy, T>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, T>
     reduce(ExecPolicy &&policy, RandIt first, RandIt last, T init, BinaryOp binop) {
-        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first, last,
-                                  [init, binop](RandIt chunk_first, RandIt chunk_last) {
-                                      return poolstl::internal::cpp17::reduce(chunk_first, chunk_last, init, binop);
-                                  });
+        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
+            return poolstl::internal::cpp17::reduce(first, last, init, binop);
+        }
+
+        auto futures = poolstl::internal::parallel_chunk_for_1(std::forward<ExecPolicy>(policy), first, last,
+                                                               poolstl::internal::cpp17::reduce<RandIt, T, BinaryOp>,
+                                                               (T*)nullptr, 1, init, binop);
 
         return poolstl::internal::cpp17::reduce(
             poolstl::internal::get_wrap(futures.begin()),
@@ -1383,7 +2337,7 @@ namespace std {
      * See std::reduce https://en.cppreference.com/w/cpp/algorithm/reduce
      */
     template <class ExecPolicy, class RandIt, class T>
-    poolstl::internal::enable_if_par<ExecPolicy, T>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, T>
     reduce(ExecPolicy &&policy, RandIt first, RandIt last, T init) {
         return std::reduce(std::forward<ExecPolicy>(policy), first, last, init, std::plus<T>());
     }
@@ -1393,7 +2347,7 @@ namespace std {
      * See std::reduce https://en.cppreference.com/w/cpp/algorithm/reduce
      */
     template <class ExecPolicy, class RandIt>
-    poolstl::internal::enable_if_par<
+    poolstl::internal::enable_if_poolstl_policy<
         ExecPolicy, typename std::iterator_traits<RandIt>::value_type>
     reduce(ExecPolicy &&policy, RandIt first, RandIt last) {
         return std::reduce(std::forward<ExecPolicy>(policy), first, last,
@@ -1406,14 +2360,17 @@ namespace std {
      * See std::transform_reduce https://en.cppreference.com/w/cpp/algorithm/transform_reduce
      */
     template <class ExecPolicy, class RandIt1, class T, class BinaryReductionOp, class UnaryTransformOp>
-    poolstl::internal::enable_if_par<ExecPolicy, T>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, T>
     transform_reduce(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1, T init,
                      BinaryReductionOp reduce_op, UnaryTransformOp transform_op) {
+        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
+            return std::transform_reduce(first1, last1, init, reduce_op, transform_op);
+        }
 
-        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first1, last1,
-             [&init, &reduce_op, &transform_op](RandIt1 chunk_first1, RandIt1 chunk_last1) {
-                 return std::transform_reduce(chunk_first1, chunk_last1, init, reduce_op, transform_op);
-             });
+        auto futures = poolstl::internal::parallel_chunk_for_1(std::forward<ExecPolicy>(policy), first1, last1,
+                                                               std::transform_reduce<RandIt1, T,
+                                                                                   BinaryReductionOp, UnaryTransformOp>,
+                                                               (T*)nullptr, 1, init, reduce_op, transform_op);
 
         return poolstl::internal::cpp17::reduce(
             poolstl::internal::get_wrap(futures.begin()),
@@ -1425,14 +2382,17 @@ namespace std {
      * See std::transform_reduce https://en.cppreference.com/w/cpp/algorithm/transform_reduce
      */
     template <class ExecPolicy, class RandIt1, class RandIt2, class T, class BinaryReductionOp, class BinaryTransformOp>
-    poolstl::internal::enable_if_par<ExecPolicy, T>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, T>
     transform_reduce(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, T init,
                      BinaryReductionOp reduce_op, BinaryTransformOp transform_op) {
+        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
+            return std::transform_reduce(first1, last1, first2, init, reduce_op, transform_op);
+        }
 
-        auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first1, last1, first2,
-             [&init, &reduce_op, &transform_op](RandIt1 chunk_first1, RandIt1 chunk_last1, RandIt2 chunk_first2) {
-                 return std::transform_reduce(chunk_first1, chunk_last1, chunk_first2, init, reduce_op, transform_op);
-             });
+        auto futures = poolstl::internal::parallel_chunk_for_2(std::forward<ExecPolicy>(policy), first1, last1, first2,
+                                                               std::transform_reduce<RandIt1, RandIt2, T,
+                                                                                  BinaryReductionOp, BinaryTransformOp>,
+                                                               (T*)nullptr, init, reduce_op, transform_op);
 
         return poolstl::internal::cpp17::reduce(
             poolstl::internal::get_wrap(futures.begin()),
@@ -1444,7 +2404,7 @@ namespace std {
      * See std::transform_reduce https://en.cppreference.com/w/cpp/algorithm/transform_reduce
      */
     template< class ExecPolicy, class RandIt1, class RandIt2, class T >
-    poolstl::internal::enable_if_par<ExecPolicy, T>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, T>
     transform_reduce(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, T init ) {
         return transform_reduce(std::forward<ExecPolicy>(policy),
             first1, last1, first2, init, std::plus<>(), std::multiplies<>());
@@ -1455,100 +2415,6 @@ namespace std {
 
 #endif
 
-#ifndef POOLSTL_SEQ_FWD_HPP
-#define POOLSTL_SEQ_FWD_HPP
-
-
-/*
- * Forward poolstl::seq to the native sequential (no policy) method.
- */
-
-#define POOLSTL_DEFINE_SEQ_FWD(NS, FNAME)                                                                   \
-    template<class EP, typename...ARGS>                                                                     \
-    auto FNAME(EP&&, ARGS&&...args) ->                                                                      \
-                poolstl::internal::enable_if_seq<EP, decltype(NS::FNAME(std::forward<ARGS>(args)...))> {    \
-        return NS::FNAME(std::forward<ARGS>(args)...);                                                      \
-    }
-
-#define POOLSTL_DEFINE_SEQ_FWD_VOID(NS, FNAME)                                   \
-    template<class EP, typename...ARGS>                                          \
-    poolstl::internal::enable_if_seq<EP, void> FNAME(EP&&, ARGS&&... args) {     \
-        NS::FNAME(std::forward<ARGS>(args)...);                                  \
-    }
-
-#if POOLSTL_HAVE_CXX17
-
-/*
- * Dynamically choose policy from a std::variant.
- * Useful to choose between parallel and sequential policies at runtime via par_if.
- */
-
-#define POOLSTL_DEFINE_PAR_IF_FWD_VOID(NS, FNAME)                                                         \
-    template<class EP, typename...ARGS>                                                                   \
-    poolstl::internal::enable_if_poolstl_variant<EP, void> FNAME(EP&& policy, ARGS&&...args) {            \
-        std::visit([&](auto&& pol) { NS::FNAME(pol, std::forward<ARGS>(args)...); }, policy.var);         \
-    }
-
-#define POOLSTL_DEFINE_PAR_IF_FWD(NS, FNAME)                                                                          \
-    template<class EP, typename...ARGS>                                                                               \
-    auto FNAME(EP&& policy, ARGS&&...args) ->                                                                         \
-                poolstl::internal::enable_if_poolstl_variant<EP, decltype(NS::FNAME(std::forward<ARGS>(args)...))> {  \
-        return std::visit([&](auto&& pol) { return NS::FNAME(pol, std::forward<ARGS>(args)...); }, policy.var);       \
-    }
-
-#else
-#define POOLSTL_DEFINE_PAR_IF_FWD_VOID(NS, FNAME)
-#define POOLSTL_DEFINE_PAR_IF_FWD(NS, FNAME)
-#endif
-/*
- * Define both the sequential forward and dynamic chooser.
- */
-#define POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(NS, FNAME)        \
-                    POOLSTL_DEFINE_SEQ_FWD(NS, FNAME)            \
-                    POOLSTL_DEFINE_PAR_IF_FWD(NS, FNAME)
-
-#define POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF_VOID(NS, FNAME)   \
-                    POOLSTL_DEFINE_SEQ_FWD_VOID(NS, FNAME)       \
-                    POOLSTL_DEFINE_PAR_IF_FWD_VOID(NS, FNAME)
-
-namespace std {
-    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, all_of)
-    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, any_of)
-    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, none_of)
-
-    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, count)
-    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, count_if)
-
-    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, copy)
-    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, copy_n)
-
-    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF_VOID(std, fill)
-    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, fill_n)
-
-    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, find)
-    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, find_if)
-    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, find_if_not)
-
-    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF_VOID(std, for_each)
-#if POOLSTL_HAVE_CXX17_LIB
-    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, for_each_n)
-#endif
-
-    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, transform)
-
-#if POOLSTL_HAVE_CXX17_LIB
-    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, exclusive_scan)
-    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, reduce)
-    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, transform_reduce)
-#endif
-}
-
-namespace poolstl {
-    POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF_VOID(poolstl, for_each_chunk)
-}
-
-#endif
-
 // Note that iota_iter.hpp is self-contained in its own right.
 
 #ifndef POOLSTL_IOTA_ITER_HPP
diff --git a/src/index.cpp b/src/index.cpp
index 9b907257..9b3fbfab 100644
--- a/src/index.cpp
+++ b/src/index.cpp
@@ -165,13 +165,8 @@ void StrobemerIndex::populate(float f, unsigned n_threads) {
 
     Timer sorting_timer;
     logger.debug() << "  Sorting ...\n";
-    if (true) {
-        task_thread_pool::task_thread_pool pool{n_threads};
-        std::sort(poolstl::par.on(pool), randstrobes.begin(), randstrobes.end());
-    } else {
-        // sort by hash values
-        pdqsort_branchless(randstrobes.begin(), randstrobes.end());
-    }
+    task_thread_pool::task_thread_pool pool{n_threads};
+    poolstl::pluggable_sort(poolstl::par.on(pool), randstrobes.begin(), randstrobes.end(), pdqsort_branchless);
     stats.elapsed_sorting_seeds = sorting_timer.duration();
 
     Timer hash_index_timer;

From a905f7bdd2dcc2e843b0cbac23b51912adadfe7a Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Mon, 22 Jan 2024 13:46:09 +0100
Subject: [PATCH 12/32] Bump to poolSTL 0.3.4
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This version indeed gives us a very nice speed improvement without using
extra memory:

Sorting-only runtimes:
* 1 thread: 32.8 s
* 2 threads: 20.3 s
* 4 threads: 15.7 s
* 8 threads: 14.8 s

Overall indexing runtimes (before/after):
* 1 thread: 151 s → 153 s
* 2 threads: 100 s → 88 s
* 4 threads: 73 s → 57 s
* 8 threads: 63 s → 47 s
---
 CHANGES.md              |   4 +
 README.md               |   2 +-
 ext/README.md           |   4 +-
 ext/poolstl/poolstl.hpp | 284 +++++++++++++++++++++++++++++++++++++---
 4 files changed, 275 insertions(+), 19 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 266b5832..644e9644 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,6 +2,10 @@
 
 ## development version
 
+* #386: Parallelize indexing even more by using @alugowski’s
+  [poolSTL](https://github.com/alugowski/poolSTL/) `pluggable_sort`.
+  Indexing a human reference (measured on CHM13) now takes only ~45 s on a
+  recent machine (using 8 threads).
 * #376: Improve accuracy for read length 50 by optimizing the default
   indexing parameters. Paired-end accuracy increases by 0.3 percentage
   points on average. Single-end accuracy increases by 1 percentage point.
diff --git a/README.md b/README.md
index f7ec6e05..1ef03a51 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ Strobealign is a read mapper that is typically significantly faster than other r
 
 - Map single-end and paired-end reads
 - Multithreading support
-- Fast indexing (1-2 minutes for a human-sized reference genome using four cores)
+- Fast indexing (<1 minute for a human-sized reference genome using four cores)
 - On-the-fly indexing by default. Optionally create an on-disk index.
 - Output in standard SAM format or produce even faster results by writing PAF (without alignments)
 - Strobealign is most suited for read lengths between 100 and 500 bp
diff --git a/ext/README.md b/ext/README.md
index d80e5a2d..a3480467 100644
--- a/ext/README.md
+++ b/ext/README.md
@@ -30,8 +30,8 @@ License: See pdqsort/license.txt
 ## poolstl
 
 Homepage: https://github.com/alugowski/poolSTL/
-Downloaded file: https://github.com/alugowski/poolSTL/releases/download/v0.3.3/poolstl.hpp
-Version: 0.3.3
+Downloaded file: https://github.com/alugowski/poolSTL/releases/download/v0.3.4/poolstl.hpp
+Version: 0.3.4
 License: See poolstl.hpp
 
 ## robin_hood
diff --git a/ext/poolstl/poolstl.hpp b/ext/poolstl/poolstl.hpp
index d1340a1e..8d569ecb 100644
--- a/ext/poolstl/poolstl.hpp
+++ b/ext/poolstl/poolstl.hpp
@@ -505,7 +505,7 @@ namespace task_thread_pool {
 // Version macros.
 #define POOLSTL_VERSION_MAJOR 0
 #define POOLSTL_VERSION_MINOR 3
-#define POOLSTL_VERSION_PATCH 3
+#define POOLSTL_VERSION_PATCH 4
 
 #include <cstddef>
 #include <iterator>
@@ -596,6 +596,28 @@ namespace poolstl {
             }
         }
 
+        /**
+         * Identify a pivot element for quicksort: returns a copy of the middle element. Range must not be empty.
+         */
+        template <typename Iterator>
+        typename std::iterator_traits<Iterator>::value_type quicksort_pivot(Iterator first, Iterator last) {
+            return *(std::next(first, std::distance(first, last) / 2));
+        }
+
+        /**
+         * Predicate for std::partition (for quicksort): true when comp orders the element before the stored pivot.
+         */
+        template <class Compare, class T>
+        struct pivot_predicate {
+            pivot_predicate(Compare comp, const T& pivot) : comp(comp), pivot(pivot) {}
+
+            bool operator()(const T& em) {
+                return comp(em, pivot);
+            }
+            Compare comp;
+            const T pivot;  // held by value: a copy of the constructor argument
+        };
+
         /*
          * Some methods are only available with C++17 and up. Reimplement on older standards.
          */
@@ -1265,7 +1287,7 @@ namespace task_thread_pool {
 // Version macros.
 #define POOLSTL_VERSION_MAJOR 0
 #define POOLSTL_VERSION_MINOR 3
-#define POOLSTL_VERSION_PATCH 3
+#define POOLSTL_VERSION_PATCH 4
 
 #include <cstddef>
 #include <iterator>
@@ -1356,6 +1378,28 @@ namespace poolstl {
             }
         }
 
+        /**
+         * Identify a pivot element for quicksort: returns a copy of the middle element. Range must not be empty.
+         */
+        template <typename Iterator>
+        typename std::iterator_traits<Iterator>::value_type quicksort_pivot(Iterator first, Iterator last) {
+            return *(std::next(first, std::distance(first, last) / 2));
+        }
+
+        /**
+         * Predicate for std::partition (for quicksort): true when comp orders the element before the stored pivot.
+         */
+        template <class Compare, class T>
+        struct pivot_predicate {
+            pivot_predicate(Compare comp, const T& pivot) : comp(comp), pivot(pivot) {}
+
+            bool operator()(const T& em) {
+                return comp(em, pivot);
+            }
+            Compare comp;
+            const T pivot;  // held by value: a copy of the constructor argument
+        };
+
         /*
          * Some methods are only available with C++17 and up. Reimplement on older standards.
          */
@@ -1739,8 +1783,8 @@ namespace poolstl {
          * @param merge_func Sequential merge method, like std::inplace_merge
          */
         template <class ExecPolicy, class RandIt, class Compare, class SortFunc, class MergeFunc>
-        void parallel_sort(ExecPolicy &&policy, RandIt first, RandIt last,
-                           Compare comp, SortFunc sort_func, MergeFunc merge_func) {
+        void parallel_mergesort(ExecPolicy &&policy, RandIt first, RandIt last,
+                                Compare comp, SortFunc sort_func, MergeFunc merge_func) {
             if (first == last) {
                 return;
             }
@@ -1786,6 +1830,103 @@ namespace poolstl {
             } while (futures.size() > 1);
             futures.front().get();
         }
+
+        /**
+         * Quicksort worker: partition [first, last) and submit each half as a new task, or sort it with sort_func.
+         */
+        template <class RandIt, class Compare, class SortFunc, class PartFunc, class PivotFunc>
+        void quicksort_impl(task_thread_pool::task_thread_pool* task_pool, const RandIt first, const RandIt last,
+                            Compare comp, SortFunc sort_func, PartFunc part_func, PivotFunc pivot_func,
+                            std::ptrdiff_t target_leaf_size,
+                            std::vector<std::future<void>>* futures, std::mutex* mutex,
+                            std::condition_variable* cv, int* inflight_spawns) {
+            using T = typename std::iterator_traits<RandIt>::value_type;
+
+            auto partition_size = std::distance(first, last);
+
+            if (partition_size > target_leaf_size) {
+                // partition the range around the pivot chosen by pivot_func
+                auto mid = part_func(first, last, pivot_predicate<Compare, T>(comp, pivot_func(first, last)));
+
+                if (mid != first && mid != last) {
+                    // both sides non-empty, so recurse via two new tasks; the lock guards the counter and futures
+                    std::lock_guard<std::mutex> guard(*mutex);
+                    ++(*inflight_spawns);  // one call becomes two pending calls: net +1
+
+                    futures->emplace_back(task_pool->submit(
+                        quicksort_impl<RandIt, Compare, SortFunc, PartFunc, PivotFunc>,
+                        task_pool, first, mid, comp, sort_func, part_func, pivot_func, target_leaf_size,
+                        futures, mutex, cv, inflight_spawns));
+
+                    futures->emplace_back(task_pool->submit(
+                        quicksort_impl<RandIt, Compare, SortFunc, PartFunc, PivotFunc>,
+                        task_pool, mid, last, comp, sort_func, part_func, pivot_func, target_leaf_size,
+                        futures, mutex, cv, inflight_spawns));
+                    return;
+                }
+            }
+
+            // Range does not need to be subdivided (or was unable to subdivide). Run the sequential sort.
+            {
+                // this call submits no further tasks: decrement the count before sorting so the waiter can proceed
+                std::lock_guard<std::mutex> guard(*mutex);
+                --(*inflight_spawns);
+            }
+            cv->notify_one();
+
+            sort_func(first, last, comp);
+        }
+
+        /**
+         * Sort a range in parallel using quicksort.
+         *
+         * @param sort_func Sequential sort method, like std::sort or std::stable_sort
+         * @param part_func Method that partitions a range, like std::partition or std::stable_partition
+         * @param pivot_func Method that returns the pivot value for a range, called as pivot_func(first, last)
+         */
+        template <class ExecPolicy, class RandIt, class Compare, class SortFunc, class PartFunc, class PivotFunc>
+        void parallel_quicksort(ExecPolicy &&policy, RandIt first, RandIt last,
+                                Compare comp, SortFunc sort_func, PartFunc part_func, PivotFunc pivot_func) {
+            if (first == last) {
+                return;
+            }
+
+            auto& task_pool = *policy.pool();
+
+            // Target partition size. Ranges larger than this are partitioned further (unless a split leaves one
+            // side empty); the floor is 5 elements. Target approximately twice as many partitions as threads to
+            // reduce impact of uneven pivot selection.
+            std::ptrdiff_t target_leaf_size = std::max(std::distance(first, last) / (task_pool.get_num_threads() * 2),
+                                                       (std::ptrdiff_t)5);
+
+            // task_thread_pool does not support creating task DAGs, so organize the code such that
+            // all parallel tasks are independent. The parallel tasks can spawn additional parallel tasks, and they
+            // record their "child" task's std::future into a common vector to be waited on by the main thread.
+            std::mutex mutex;
+
+            // Futures of parallel tasks. Access protected by mutex.
+            std::vector<std::future<void>> futures;
+
+            // For signaling that all partitioning has been completed and futures vector is complete. Uses mutex.
+            std::condition_variable cv;
+
+            // Number of `quicksort_impl` calls that may still spawn tasks (decremented before the leaf sort runs).
+            // Nonzero means futures may still grow. Starts at 1 for the root call. Access protected by mutex.
+            int inflight_spawns = 1;
+
+            // Root task. Called directly, so the first partitioning step runs on the calling thread.
+            quicksort_impl(&task_pool, first, last, comp, sort_func, part_func, pivot_func, target_leaf_size,
+                           &futures, &mutex, &cv, &inflight_spawns);
+
+            // Wait for all partitioning to finish. After this no task appends to futures.
+            {
+                std::unique_lock<std::mutex> lock(mutex);
+                cv.wait(lock, [&] { return inflight_spawns == 0; });
+            }
+
+            // Wait on all the parallel tasks.
+            get_futures(futures);
+        }
     }
 }
 
@@ -2050,8 +2191,11 @@ namespace std {
             return;
         }
 
-        poolstl::internal::parallel_sort(std::forward<ExecPolicy>(policy), first, last, comp,
-                                         std::sort<RandIt, Compare>, std::inplace_merge<RandIt, Compare>);
+        poolstl::internal::parallel_quicksort(std::forward<ExecPolicy>(policy), first, last, comp,
+                                              std::sort<RandIt, Compare>,
+                                              std::partition<RandIt, poolstl::internal::pivot_predicate<Compare,
+                                                  typename std::iterator_traits<RandIt>::value_type>>,
+                                              poolstl::internal::quicksort_pivot<RandIt>);
     }
 
     /**
@@ -2077,8 +2221,11 @@ namespace std {
             return;
         }
 
-        poolstl::internal::parallel_sort(std::forward<ExecPolicy>(policy), first, last, comp,
-                                         std::stable_sort<RandIt, Compare>, std::inplace_merge<RandIt, Compare>);
+        poolstl::internal::parallel_quicksort(std::forward<ExecPolicy>(policy), first, last, comp,
+                                              std::stable_sort<RandIt, Compare>,
+                                              std::stable_partition<RandIt, poolstl::internal::pivot_predicate<Compare,
+                                                  typename std::iterator_traits<RandIt>::value_type>>,
+                                              poolstl::internal::quicksort_pivot<RandIt>);
     }
 
     /**
@@ -2203,37 +2350,142 @@ namespace poolstl {
     /**
      * NOTE: Iterators are expected to be random access.
      *
-     * Like `std::sort`, but allows specifying the sequential sort and merge methods. These methods must have the
-     * same signature as the comparator versions of `std::sort` and `std::inplace_merge`, respectively.
+     * Like `std::sort`, but allows specifying the sequential sort method, which must have the
+     * same signature as the comparator version of `std::sort`.
+     *
+     * Implemented as a high-level quicksort that delegates to `sort_func`, in parallel, once the range has been
+     * sufficiently partitioned.
      */
     template <class ExecPolicy, class RandIt, class Compare>
     poolstl::internal::enable_if_poolstl_policy<ExecPolicy, void>
     pluggable_sort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp,
-                   void (sort_func)(RandIt, RandIt, Compare) = std::sort,
-                   void (merge_func)(RandIt, RandIt, RandIt, Compare) = std::inplace_merge) {
+                   void (sort_func)(RandIt, RandIt, Compare) = std::sort) {
         if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
             sort_func(first, last, comp);
             return;
         }
 
-        poolstl::internal::parallel_sort(std::forward<ExecPolicy>(policy), first, last, comp, sort_func, merge_func);
+        poolstl::internal::parallel_quicksort(std::forward<ExecPolicy>(policy), first, last, comp, sort_func,
+                                              std::partition<RandIt, poolstl::internal::pivot_predicate<Compare,
+                                                  typename std::iterator_traits<RandIt>::value_type>>,
+                                              poolstl::internal::quicksort_pivot<RandIt>);
     }
 
     /**
      * NOTE: Iterators are expected to be random access.
      *
-     * Like `std::sort`, but allows specifying the sequential sort and merge methods. These methods must have the
-     * same signature as the comparator versions of `std::sort` and `std::inplace_merge`, respectively.
+     * Like `std::sort`, but allows specifying the sequential sort method, which must have the
+     * same signature as the comparator version of `std::sort`.
+     *
+     * Implemented as a parallel high-level quicksort that delegates to `sort_func` once the range has been
+     * sufficiently partitioned.
      */
     template <class ExecPolicy, class RandIt>
     poolstl::internal::enable_if_poolstl_policy<ExecPolicy, void>
     pluggable_sort(ExecPolicy &&policy, RandIt first, RandIt last,
+                   void (sort_func)(RandIt, RandIt,
+                                    std::less<typename std::iterator_traits<RandIt>::value_type>) = std::sort){
+        using T = typename std::iterator_traits<RandIt>::value_type;
+        pluggable_sort(std::forward<ExecPolicy>(policy), first, last, std::less<T>(), sort_func);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access.
+     *
+     * Parallel merge sort.
+     *
+     * @param comp Comparator.
+     * @param sort_func Sequential sort method. Must have the same signature as the comparator version of `std::sort`.
+     * @param merge_func Sequential merge method. Must have the same signature as `std::inplace_merge`.
+     */
+    template <class ExecPolicy, class RandIt, class Compare>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, void>
+    pluggable_mergesort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp,
+                        void (sort_func)(RandIt, RandIt, Compare) = std::sort,
+                        void (merge_func)(RandIt, RandIt, RandIt, Compare) = std::inplace_merge) {
+        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
+            sort_func(first, last, comp);
+            return;
+        }
+
+        poolstl::internal::parallel_mergesort(std::forward<ExecPolicy>(policy),
+                                              first, last, comp, sort_func, merge_func);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access.
+     *
+     * Parallel merge sort.
+     *
+     * Uses `std::less` comparator.
+     *
+     * @param sort_func Sequential sort method. Must have the same signature as the comparator version of `std::sort`.
+     * @param merge_func Sequential merge method. Must have the same signature as `std::inplace_merge`.
+     */
+    template <class ExecPolicy, class RandIt>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, void>
+    pluggable_mergesort(ExecPolicy &&policy, RandIt first, RandIt last,
                    void (sort_func)(RandIt, RandIt,
                                     std::less<typename std::iterator_traits<RandIt>::value_type>) = std::sort,
                    void (merge_func)(RandIt, RandIt, RandIt,
                                     std::less<typename std::iterator_traits<RandIt>::value_type>) = std::inplace_merge){
         using T = typename std::iterator_traits<RandIt>::value_type;
-        pluggable_sort(std::forward<ExecPolicy>(policy), first, last, std::less<T>(), sort_func, merge_func);
+        pluggable_mergesort(std::forward<ExecPolicy>(policy), first, last, std::less<T>(), sort_func, merge_func);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access.
+     *
+     * Parallel quicksort that allows specifying the sequential sort and partition methods.
+     *
+     * @param comp Comparator.
+     * @param sort_func Sequential sort method to use once range is sufficiently partitioned. Must have the same
+     *                  signature as the comparator version of `std::sort`.
+     * @param part_func Sequential partition method. Must have the same signature as `std::partition`.
+     * @param pivot_func Method that identifies the pivot element
+     */
+    template <class ExecPolicy, class RandIt, class Compare>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, void>
+    pluggable_quicksort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp,
+                        void (sort_func)(RandIt, RandIt, Compare) = std::sort,
+                        RandIt (part_func)(RandIt, RandIt, poolstl::internal::pivot_predicate<Compare,
+                            typename std::iterator_traits<RandIt>::value_type>) = std::partition,
+                        typename std::iterator_traits<RandIt>::value_type (pivot_func)(RandIt, RandIt) =
+                            poolstl::internal::quicksort_pivot) {
+        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
+            sort_func(first, last, comp);
+            return;
+        }
+
+        poolstl::internal::parallel_quicksort(std::forward<ExecPolicy>(policy),
+                                              first, last, comp, sort_func, part_func, pivot_func);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access.
+     *
+     * Parallel quicksort that allows specifying the sequential sort and partition methods.
+     *
+     * Uses `std::less` comparator.
+     *
+     * @param sort_func Sequential sort method to use once range is sufficiently partitioned. Must have the same
+     *                  signature as the comparator version of `std::sort`.
+     * @param part_func Sequential partition method. Must have the same signature as `std::partition`.
+     * @param pivot_func Method that identifies the pivot element
+     */
+    template <class ExecPolicy, class RandIt>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, void>
+    pluggable_quicksort(ExecPolicy &&policy, RandIt first, RandIt last,
+                        void (sort_func)(RandIt, RandIt,
+                                    std::less<typename std::iterator_traits<RandIt>::value_type>) = std::sort,
+                        RandIt (part_func)(RandIt, RandIt, poolstl::internal::pivot_predicate<
+                            std::less<typename std::iterator_traits<RandIt>::value_type>,
+                            typename std::iterator_traits<RandIt>::value_type>) = std::partition,
+                        typename std::iterator_traits<RandIt>::value_type (pivot_func)(RandIt, RandIt) =
+                            poolstl::internal::quicksort_pivot) {
+        using T = typename std::iterator_traits<RandIt>::value_type;
+        pluggable_quicksort(std::forward<ExecPolicy>(policy), first, last, std::less<T>(),
+                            sort_func, part_func, pivot_func);
     }
 }
 

From ffbb86366880f9c0e4bc671378ebe70ab0432c5a Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Mon, 22 Jan 2024 20:45:06 +0100
Subject: [PATCH 13/32] Update baseline commit

---
 tests/baseline-commit.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/baseline-commit.txt b/tests/baseline-commit.txt
index 9c1e9b61..e1ff8c67 100644
--- a/tests/baseline-commit.txt
+++ b/tests/baseline-commit.txt
@@ -1 +1 @@
-0ced9903276834e6b9bfe095a255952f0616d330
+a905f7bdd2dcc2e843b0cbac23b51912adadfe7a

From cc6928611965881a6b533d05483fb93ff18752b3 Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Wed, 31 Jan 2024 09:26:05 +0100
Subject: [PATCH 14/32] Bump poolSTL to 0.3.5

---
 ext/README.md           |   4 +-
 ext/poolstl/poolstl.hpp | 185 +++++++++++++++++++++++++++-------------
 2 files changed, 126 insertions(+), 63 deletions(-)

diff --git a/ext/README.md b/ext/README.md
index a3480467..55e874b7 100644
--- a/ext/README.md
+++ b/ext/README.md
@@ -30,8 +30,8 @@ License: See pdqsort/license.txt
 ## poolstl
 
 Homepage: https://github.com/alugowski/poolSTL/
-Downloaded file: https://github.com/alugowski/poolSTL/releases/download/v0.3.4/poolstl.hpp
-Version: 0.3.4
+Downloaded file: https://github.com/alugowski/poolSTL/releases/download/v0.3.5/poolstl.hpp
+Version: 0.3.5
 License: See poolstl.hpp
 
 ## robin_hood
diff --git a/ext/poolstl/poolstl.hpp b/ext/poolstl/poolstl.hpp
index 8d569ecb..77c3e7a0 100644
--- a/ext/poolstl/poolstl.hpp
+++ b/ext/poolstl/poolstl.hpp
@@ -505,7 +505,7 @@ namespace task_thread_pool {
 // Version macros.
 #define POOLSTL_VERSION_MAJOR 0
 #define POOLSTL_VERSION_MINOR 3
-#define POOLSTL_VERSION_PATCH 4
+#define POOLSTL_VERSION_PATCH 5
 
 #include <cstddef>
 #include <iterator>
@@ -1287,7 +1287,7 @@ namespace task_thread_pool {
 // Version macros.
 #define POOLSTL_VERSION_MAJOR 0
 #define POOLSTL_VERSION_MINOR 3
-#define POOLSTL_VERSION_PATCH 4
+#define POOLSTL_VERSION_PATCH 5
 
 #include <cstddef>
 #include <iterator>
@@ -1679,7 +1679,7 @@ namespace poolstl {
                 auto iter_chunk_size = get_iter_chunk_size(first, last, chunk_size);
                 RandIt loop_end = advanced(first, iter_chunk_size);
 
-                futures.emplace_back(task_pool.submit(std::forward<Chunk>(chunk), first, loop_end));
+                futures.emplace_back(task_pool.submit(chunk, first, loop_end));
 
                 first = loop_end;
             }
@@ -1702,8 +1702,7 @@ namespace poolstl {
                 auto iter_chunk_size = get_iter_chunk_size(first, last, chunk_size);
                 RandIt loop_end = advanced(first, iter_chunk_size);
 
-                futures.emplace_back(task_pool.submit(std::forward<Chunk>(chunk), first, loop_end,
-                                                      std::forward<A>(chunk_args)...));
+                futures.emplace_back(task_pool.submit(chunk, first, loop_end, chunk_args...));
 
                 first = loop_end;
             }
@@ -1719,8 +1718,7 @@ namespace poolstl {
         parallel_chunk_for_1_wait(ExecPolicy &&policy, RandIt first, RandIt last,
                                   Chunk chunk, ChunkRet* rettype, int extra_split_factor, A&&... chunk_args) {
             auto futures = parallel_chunk_for_1(std::forward<ExecPolicy>(policy), first, last,
-                                                std::forward<Chunk>(chunk), rettype, extra_split_factor,
-                                                std::forward<A>(chunk_args)...);
+                                                chunk, rettype, extra_split_factor, chunk_args...);
             get_futures(futures);
         }
 
@@ -1739,8 +1737,7 @@ namespace poolstl {
                 auto iter_chunk_size = get_iter_chunk_size(first1, last1, chunk_size);
                 RandIt1 loop_end = advanced(first1, iter_chunk_size);
 
-                futures.emplace_back(task_pool.submit(std::forward<Chunk>(chunk), first1, loop_end, first2,
-                                                      std::forward<A>(chunk_args)...));
+                futures.emplace_back(task_pool.submit(chunk, first1, loop_end, first2, chunk_args...));
 
                 first1 = loop_end;
                 std::advance(first2, iter_chunk_size);
@@ -1765,8 +1762,7 @@ namespace poolstl {
                 auto iter_chunk_size = get_iter_chunk_size(first1, last1, chunk_size);
                 RandIt1 loop_end = advanced(first1, iter_chunk_size);
 
-                futures.emplace_back(task_pool.submit(std::forward<Chunk>(chunk), first1, loop_end, first2, first3,
-                                                      std::forward<A>(chunk_args)...));
+                futures.emplace_back(task_pool.submit(chunk, first1, loop_end, first2, first3, chunk_args...));
 
                 first1 = loop_end;
                 std::advance(first2, iter_chunk_size);
@@ -1896,9 +1892,14 @@ namespace poolstl {
             // Target partition size. Range will be recursively partitioned into partitions no bigger than this
             // size. Target approximately twice as many partitions as threads to reduce impact of uneven pivot
             // selection.
-            std::ptrdiff_t target_leaf_size = std::max(std::distance(first, last) / (task_pool.get_num_threads() * 2),
+            auto num_threads = task_pool.get_num_threads();
+            std::ptrdiff_t target_leaf_size = std::max(std::distance(first, last) / (num_threads * 2),
                                                        (std::ptrdiff_t)5);
 
+            if (num_threads == 1) {
+                target_leaf_size = std::distance(first, last);
+            }
+
             // task_thread_pool does not support creating task DAGs, so organize the code such that
             // all parallel tasks are independent. The parallel tasks can spawn additional parallel tasks, and they
             // record their "child" task's std::future into a common vector to be waited on by the main thread.
@@ -1927,6 +1928,39 @@ namespace poolstl {
             // Wait on all the parallel tasks.
             get_futures(futures);
         }
+
+        /**
+         * Partition range according to predicate. Unstable.
+         *
+         * This implementation only parallelizes with p=2; will spawn and wait for only one task.
+         */
+        template <class RandIt, class Predicate>
+        RandIt partition_p2(task_thread_pool::task_thread_pool &task_pool, RandIt first, RandIt last, Predicate pred) {
+            auto range_size = std::distance(first, last);
+            if (range_size < 4) {
+                return std::partition(first, last, pred);
+            }
+
+            // approach should be generalizable to arbitrary p
+
+            RandIt mid = std::next(first + range_size / 2);
+
+            // partition left and right halves in parallel
+            auto left_future = task_pool.submit(std::partition<RandIt, Predicate>, first, mid, pred);
+            RandIt right_mid = std::partition(mid, last, pred);
+            RandIt left_mid = left_future.get();
+
+            // merge the two partitioned halves
+            auto left_highs_size = std::distance(left_mid, mid);
+            auto right_lows_size = std::distance(mid, right_mid);
+            if (left_highs_size <= right_lows_size) {
+                std::swap_ranges(left_mid, mid, right_mid - left_highs_size);
+                return right_mid - left_highs_size;
+            } else {
+                std::swap_ranges(mid, right_mid, left_mid);
+                return left_mid + right_lows_size;
+            }
+        }
     }
 }
 
@@ -1965,8 +1999,7 @@ namespace poolstl {
                 auto iter_chunk_size = get_iter_chunk_size(first, last, chunk_size);
                 RandIt loop_end = advanced(first, iter_chunk_size);
 
-                threads.emplace_back(std::thread(std::forward<Chunk>(chunk), first, loop_end,
-                                                 std::forward<A>(chunk_args)...));
+                threads.emplace_back(std::thread(chunk, first, loop_end, chunk_args...));
 
                 first = loop_end;
             }
@@ -1982,6 +2015,66 @@ namespace poolstl {
 
 #endif
 
+namespace poolstl {
+    /**
+     * NOTE: Iterators are expected to be random access.
+     *
+     * Like `std::sort`, but allows specifying the sequential sort method, which must have the
+     * same signature as the comparator version of `std::sort`.
+     *
+     * Implemented as a high-level quicksort that delegates to `sort_func`, in parallel, once the range has been
+     * sufficiently partitioned.
+     */
+    template <class ExecPolicy, class RandIt, class Compare>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, void>
+    pluggable_sort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp,
+                   void (sort_func)(RandIt, RandIt, Compare) = std::sort) {
+        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
+            sort_func(first, last, comp);
+            return;
+        }
+
+        // Parallel partition.
+        // The partition_p2 method spawns and waits for its own child task. A deadlock is possible if all worker
+        // threads are waiting for tasks that in turn have no workers to execute them. This is only an issue because
+        // our thread pool does not have the concept of dependencies.
+        // So limit the number of parallel partitions to half the number of threads.
+        auto& task_pool = *policy.pool();
+        std::atomic<int> allowed_parallel_partitions{(int)task_pool.get_num_threads() / 2};
+
+        auto part_func = [&task_pool, &allowed_parallel_partitions](RandIt chunk_first, RandIt chunk_last,
+                                   poolstl::internal::pivot_predicate<Compare,
+                                   typename std::iterator_traits<RandIt>::value_type> pred) {
+            if (allowed_parallel_partitions.fetch_sub(1) > 0) {
+                return poolstl::internal::partition_p2(task_pool, chunk_first, chunk_last, pred);
+            } else {
+                return std::partition(chunk_first, chunk_last, pred);
+            }
+        };
+
+        poolstl::internal::parallel_quicksort(std::forward<ExecPolicy>(policy), first, last, comp, sort_func, part_func,
+                                              poolstl::internal::quicksort_pivot<RandIt>);
+    }
+
+    /**
+     * NOTE: Iterators are expected to be random access.
+     *
+     * Like `std::sort`, but allows specifying the sequential sort method, which must have the
+     * same signature as the comparator version of `std::sort`.
+     *
+     * Implemented as a parallel high-level quicksort that delegates to `sort_func` once the range has been
+     * sufficiently partitioned.
+     */
+    template <class ExecPolicy, class RandIt>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, void>
+    pluggable_sort(ExecPolicy &&policy, RandIt first, RandIt last,
+                   void (sort_func)(RandIt, RandIt,
+                                    std::less<typename std::iterator_traits<RandIt>::value_type>) = std::sort){
+        using T = typename std::iterator_traits<RandIt>::value_type;
+        pluggable_sort(std::forward<ExecPolicy>(policy), first, last, std::less<T>(), sort_func);
+    }
+}
+
 namespace std {
 
     /**
@@ -2179,6 +2272,22 @@ namespace std {
         return last;
     }
 
+    /**
+     * NOTE: Iterators are expected to be random access.
+     * See std::partition https://en.cppreference.com/w/cpp/algorithm/partition
+     *
+     * Current implementation uses at most 2 threads.
+     */
+    template <class ExecPolicy, class RandIt, class Predicate>
+    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, RandIt>
+    partition(ExecPolicy &&policy, RandIt first, RandIt last, Predicate pred) {
+        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
+            return std::partition(first, last, pred);
+        }
+
+        return poolstl::internal::partition_p2(*policy.pool(), first, last, pred);
+    }
+
     /**
      * NOTE: Iterators are expected to be random access.
      * See std::sort https://en.cppreference.com/w/cpp/algorithm/sort
@@ -2191,11 +2300,7 @@ namespace std {
             return;
         }
 
-        poolstl::internal::parallel_quicksort(std::forward<ExecPolicy>(policy), first, last, comp,
-                                              std::sort<RandIt, Compare>,
-                                              std::partition<RandIt, poolstl::internal::pivot_predicate<Compare,
-                                                  typename std::iterator_traits<RandIt>::value_type>>,
-                                              poolstl::internal::quicksort_pivot<RandIt>);
+        poolstl::pluggable_sort(std::forward<ExecPolicy>(policy), first, last, comp, std::sort<RandIt, Compare>);
     }
 
     /**
@@ -2347,48 +2452,6 @@ namespace poolstl {
                                                      (void*)nullptr, 1, construct, f);
     }
 
-    /**
-     * NOTE: Iterators are expected to be random access.
-     *
-     * Like `std::sort`, but allows specifying the sequential sort method, which must have the
-     * same signature as the comparator version of `std::sort`.
-     *
-     * Implemented as a high-level quicksort that delegates to `sort_func`, in parallel, once the range has been
-     * sufficiently partitioned.
-     */
-    template <class ExecPolicy, class RandIt, class Compare>
-    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, void>
-    pluggable_sort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp,
-                   void (sort_func)(RandIt, RandIt, Compare) = std::sort) {
-        if (poolstl::internal::is_seq<ExecPolicy>(policy)) {
-            sort_func(first, last, comp);
-            return;
-        }
-
-        poolstl::internal::parallel_quicksort(std::forward<ExecPolicy>(policy), first, last, comp, sort_func,
-                                              std::partition<RandIt, poolstl::internal::pivot_predicate<Compare,
-                                                  typename std::iterator_traits<RandIt>::value_type>>,
-                                              poolstl::internal::quicksort_pivot<RandIt>);
-    }
-
-    /**
-     * NOTE: Iterators are expected to be random access.
-     *
-     * Like `std::sort`, but allows specifying the sequential sort method, which must have the
-     * same signature as the comparator version of `std::sort`.
-     *
-     * Implemented as a parallel high-level quicksort that delegates to `sort_func` once the range has been
-     * sufficiently partitioned.
-     */
-    template <class ExecPolicy, class RandIt>
-    poolstl::internal::enable_if_poolstl_policy<ExecPolicy, void>
-    pluggable_sort(ExecPolicy &&policy, RandIt first, RandIt last,
-                   void (sort_func)(RandIt, RandIt,
-                                    std::less<typename std::iterator_traits<RandIt>::value_type>) = std::sort){
-        using T = typename std::iterator_traits<RandIt>::value_type;
-        pluggable_sort(std::forward<ExecPolicy>(policy), first, last, std::less<T>(), sort_func);
-    }
-
     /**
      * NOTE: Iterators are expected to be random access.
      *

From 1024b5073a13dcdefb03fb6498858ab4e39e3ca8 Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Wed, 31 Jan 2024 09:29:27 +0100
Subject: [PATCH 15/32] Update baseline commit

---
 tests/baseline-commit.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/baseline-commit.txt b/tests/baseline-commit.txt
index e1ff8c67..d2baa623 100644
--- a/tests/baseline-commit.txt
+++ b/tests/baseline-commit.txt
@@ -1 +1 @@
-a905f7bdd2dcc2e843b0cbac23b51912adadfe7a
+cc6928611965881a6b533d05483fb93ff18752b3

From 7dc7192a018905db3c3cdb9752e65538c9ff4c6d Mon Sep 17 00:00:00 2001
From: Luis Pedro Coelho <luis@luispedro.org>
Date: Fri, 9 Feb 2024 15:19:38 +1000
Subject: [PATCH 16/32] Explicit error if too many sequences are used
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Right now, strobealign only supports up to 2²⁴ sequences. If the user
provides more, strobealign silently accepts them, but crashes later.

This was triggered when trying to map to the Greengenes database:

https://ftp.microbio.me/greengenes_release/2022.10/
https://ftp.microbio.me/greengenes_release/2022.10/2022.10.seqs.fna.gz

Even a single read like the one below triggers a crash:

```
@M05314:127:000000000-BWLLJ:1:1101:15267:1654 2:N:0:1
CCTGTTCGCTCCCCACGCTTTCGTCCCTCAGCGTCAATATTGTGCCAGAATGCTGCCTTCGCCATTGGTGTTCCTCCTGATATCTACGCATGTCACCGCTACACCAGGAATTCCACATTCCTCTCACATATTCTATTTTATCAGTTTTGAT
+
AAA1AF@1>AAAGG1A0EAFGGEHAAEGFCG1AAEE/F2FG2F2FF1CA0FBDED1BGFGFFE?AF1BFFCFHDGFFHB1FFGFGEEFE/?/BF2F@/EGEEB00/0//0BFG1>B1BGFEFHHGGFFD12BGH2FDFFFGG22GDD>@/F
```
---
 src/main.cpp        | 5 +++++
 src/randstrobes.hpp | 4 ++++
 2 files changed, 9 insertions(+)

diff --git a/src/main.cpp b/src/main.cpp
index dcc96774..f9b8f65a 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -25,6 +25,7 @@
 #include "timer.hpp"
 #include "readlen.hpp"
 #include "version.hpp"
+#include "randstrobes.hpp"
 #include "buildconfig.hpp"
 
 
@@ -209,6 +210,10 @@ int run_strobealign(int argc, char **argv) {
         throw InvalidFasta("No reference sequences found");
     }
 
+    if (references.size() > RefRandstrobe::max_number_of_references) {
+        throw InvalidFasta("Too many reference sequences. Current maximum is " + std::to_string(RefRandstrobe::max_number_of_references));
+    }
+
     StrobemerIndex index(references, index_parameters, opt.bits);
     if (opt.use_index) {
         // Read the index from a file
diff --git a/src/randstrobes.hpp b/src/randstrobes.hpp
index 7a117b3c..8bdaf779 100644
--- a/src/randstrobes.hpp
+++ b/src/randstrobes.hpp
@@ -41,10 +41,14 @@ struct RefRandstrobe {
         return m_packed & mask;
     }
 
+
 private:
     static constexpr int bit_alloc = 8;
     static constexpr int mask = (1 << bit_alloc) - 1;
     packed_t m_packed; // packed representation of ref_index and strobe offset
+
+public:
+    static constexpr uint32_t max_number_of_references = (1 << (32 - bit_alloc)) - 1;
 };
 
 struct QueryRandstrobe {

From 2e4ff9500e68d6e465735dd276d362cf71851dcd Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Wed, 14 Feb 2024 20:45:31 +0100
Subject: [PATCH 17/32] Ensure sorting of randstrobes is reproducible
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

... by sorting them also by position. This way, it does not matter in which
way the randstrobes vector is partitioned during sort. Otherwise, the order
would depend on the number of threads used to create the index, making
mapping results not reproducible across runs that do not use the same no. of
threads.

Note: There is still a tiny chance for collisions/nondeterminism because we
ignore RefRandstrobe::m_packed.

For efficiency, this uses a branchless comparison function inspired by
@alugowski’s comment in PR #386.

Runtimes for sorting with one thread

32 s - only by hash
45 s - by hash and position using std::tie
42 s - by hash and position using branchless_compare from the PR
35 s - by hash and position using __uint128_t (this commit)

Runtimes for sorting (four cores with hyperthreading)

threads | sorting time | index creation time
-|-|-
1 | 35 s | 154 s
2 | 25 s | 95 s
4 | 16 s | 60 s
8 | 15 s | 48 s
---
 src/randstrobes.hpp       | 8 +++++++-
 tests/baseline-commit.txt | 2 +-
 tests/compare-baseline.sh | 8 ++++++--
 3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/randstrobes.hpp b/src/randstrobes.hpp
index 7a117b3c..0c72aaed 100644
--- a/src/randstrobes.hpp
+++ b/src/randstrobes.hpp
@@ -30,7 +30,13 @@ struct RefRandstrobe {
         , m_packed(packed) { }
 
     bool operator< (const RefRandstrobe& other) const {
-        return hash < other.hash;
+        // Compare both hash and position to ensure that the order of the
+        // RefRandstrobes in the index is reproducible no matter which sorting
+        // function is used. This branchless comparison is faster than the
+        // equivalent one using std::tie.
+        __uint128_t lhs = (static_cast<__uint128_t>(hash) << 64) | position;
+        __uint128_t rhs = (static_cast<__uint128_t>(other.hash) << 64) | other.position;
+        return lhs < rhs;
     }
 
     int reference_index() const {
diff --git a/tests/baseline-commit.txt b/tests/baseline-commit.txt
index d2baa623..9ffa9ec8 100644
--- a/tests/baseline-commit.txt
+++ b/tests/baseline-commit.txt
@@ -1 +1 @@
-cc6928611965881a6b533d05483fb93ff18752b3
+24995f4168108232528fbe5132441c9f1d0401b3
diff --git a/tests/compare-baseline.sh b/tests/compare-baseline.sh
index 57f9faac..4bc2223b 100755
--- a/tests/compare-baseline.sh
+++ b/tests/compare-baseline.sh
@@ -13,8 +13,12 @@ set -euo pipefail
 python3 -c 'import pysam'
 
 ends="pe"
-while getopts "s" opt; do
+threads=4
+while getopts "st:" opt; do
   case "${opt}" in
+    t)
+      threads=$OPTARG
+      ;;
     s)
       ends=se  # single-end reads
       ;;
@@ -38,7 +42,7 @@ baseline_commit=$(< tests/baseline-commit.txt)
 baseline_bam=baseline/bam/${baseline_commit}.${ends}.bam
 baseline_binary=baseline/strobealign-${baseline_commit}
 cmake_options=-DCMAKE_BUILD_TYPE=RelWithDebInfo
-strobealign_options="-t 4"
+strobealign_options="-t ${threads}"
 
 # Generate the baseline BAM if necessary
 mkdir -p baseline/bam

From 724a1df6d518aa35a06e3deeb10c9254e79ede16 Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Wed, 14 Feb 2024 21:41:22 +0100
Subject: [PATCH 18/32] Update baseline commit

---
 tests/baseline-commit.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/baseline-commit.txt b/tests/baseline-commit.txt
index 9ffa9ec8..471fbbbb 100644
--- a/tests/baseline-commit.txt
+++ b/tests/baseline-commit.txt
@@ -1 +1 @@
-24995f4168108232528fbe5132441c9f1d0401b3
+2e4ff9500e68d6e465735dd276d362cf71851dcd

From 31309bd4322582c5fba24ff8e0145e6a380c9e64 Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Fri, 16 Feb 2024 15:10:08 +0100
Subject: [PATCH 19/32] Introduce canonical read length 75

Closes #395
---
 CHANGES.md              | 4 ++++
 README.md               | 6 +++---
 src/indexparameters.cpp | 3 ++-
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 644e9644..2d907665 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -9,6 +9,10 @@
 * #376: Improve accuracy for read length 50 by optimizing the default
   indexing parameters. Paired-end accuracy increases by 0.3 percentage
   points on average. Single-end accuracy increases by 1 percentage point.
+* #395: Previously, read length 75 used the same indexing parameters as length
+  50, but the improved settings for length 50 are not the best for length 75.
+  To avoid a decrease in accuracy, we introduced a new set of pre-defined
+  indexing parameters for read length 75 (a new canonical read length).
 * If `--details` is used, output `X0:i` SAM tag with the number of
   identically-scored best alignments
 * #378: Added `-C` option for appending the FASTA or FASTQ comment to SAM
diff --git a/README.md b/README.md
index 1ef03a51..f2d0f8c1 100644
--- a/README.md
+++ b/README.md
@@ -145,9 +145,9 @@ options. Some important ones are:
 Strobealign needs to build an index (strobemer index) of the reference before
 it can map reads to it.
 The optimal indexing parameters depend on the length of the input reads.
-There are currently seven different pre-defined sets of parameters that are
-optimized for different read lengths. These *canonical read lengths* are
-50, 100, 125, 150, 250 and 400. When deciding which of the pre-defined
+There are pre-defined sets of parameters that are optimized for different read
+lengths. These *canonical read lengths* are
+50, 75, 100, 125, 150, 250 and 400. When deciding which of the pre-defined
 indexing parameter sets to use, strobealign chooses one whose canonical
 read length is close to the average read length of the input.
 
diff --git a/src/indexparameters.cpp b/src/indexparameters.cpp
index 2f634e9f..0c655903 100644
--- a/src/indexparameters.cpp
+++ b/src/indexparameters.cpp
@@ -35,7 +35,8 @@ struct Profile {
 static auto max{std::numeric_limits<int>::max()};
 
 static std::vector<Profile> profiles = {
-        Profile{ 50,  90, 18, -4, -2,  1},
+        Profile{ 50,  70, 18, -4, -2,  1},
+        Profile{ 75,  90, 20, -4, -3,  2},
         Profile{100, 110, 20, -4, -2,  2},
         Profile{125, 135, 20, -4, -1,  4},
         Profile{150, 175, 20, -4,  1,  7},

From 02ea1966b5afa5617d7fc133cefbbc7b43e099b5 Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Wed, 15 Feb 2023 23:20:24 +0100
Subject: [PATCH 20/32] Update ksw2 to the most recent upstream version

---
 ext/ksw2.h           | 312 +++++++++--------------
 ext/ksw2_extz2_sse.c | 574 +++++++++++++++++++------------------------
 2 files changed, 367 insertions(+), 519 deletions(-)

diff --git a/ext/ksw2.h b/ext/ksw2.h
index 04eeda27..01edd45e 100644
--- a/ext/ksw2.h
+++ b/ext/ksw2.h
@@ -15,20 +15,30 @@
 #define KSW_EZ_SPLICE_FOR  0x100
 #define KSW_EZ_SPLICE_REV  0x200
 #define KSW_EZ_SPLICE_FLANK 0x400
+#define KSW_EZ_EQX         0x800
+
+// The subset of CIGAR operators used by ksw code.
+// Use MM_CIGAR_* from minimap.h if you need the full list.
+#define KSW_CIGAR_MATCH  0
+#define KSW_CIGAR_INS    1
+#define KSW_CIGAR_DEL    2
+#define KSW_CIGAR_N_SKIP 3
+#define KSW_CIGAR_EQ     7
+#define KSW_CIGAR_X      8
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 typedef struct {
-    uint32_t max: 31, zdropped: 1;
-    int max_q, max_t;      // max extension coordinate
-    int mqe, mqe_t;        // max score when reaching the end of query
-    int mte, mte_q;        // max score when reaching the end of target
-    int score;             // max score reaching both ends; may be KSW_NEG_INF
-    int m_cigar, n_cigar;
-    int reach_end;
-    uint32_t *cigar;
+	uint32_t max:31, zdropped:1;
+	int max_q, max_t;      // max extension coordinate
+	int mqe, mqe_t;        // max score when reaching the end of query
+	int mte, mte_q;        // max score when reaching the end of target
+	int score;             // max score reaching both ends; may be KSW_NEG_INF
+	int m_cigar, n_cigar;
+	int reach_end;
+	uint32_t *cigar;
 } ksw_extz_t;
 
 /**
@@ -49,69 +59,21 @@ typedef struct {
  * @param ez        (out) scores and cigar
  */
 void ksw_extz(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
-              int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez);
+			  int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez);
 
-void ksw_extz2_sse(void *km,
-                   int qlen,
-                   const uint8_t *query,
-                   int tlen,
-                   const uint8_t *target,
-                   int8_t m,
-                   const int8_t *mat,
-                   int8_t q,
-                   int8_t e,
-                   int w,
-                   int zdrop,
-                   int end_bonus,
-                   int flag,
-                   ksw_extz_t *ez);
+void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+				   int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez);
 
 void ksw_extd(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
-              int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez);
+			  int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez);
 
-void ksw_extd2_sse(void *km,
-                   int qlen,
-                   const uint8_t *query,
-                   int tlen,
-                   const uint8_t *target,
-                   int8_t m,
-                   const int8_t *mat,
-                   int8_t gapo,
-                   int8_t gape,
-                   int8_t gapo2,
-                   int8_t gape2,
-                   int w,
-                   int zdrop,
-                   int end_bonus,
-                   int flag,
-                   ksw_extz_t *ez);
+void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+				   int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez);
 
-void ksw_exts2_sse(void *km,
-                   int qlen,
-                   const uint8_t *query,
-                   int tlen,
-                   const uint8_t *target,
-                   int8_t m,
-                   const int8_t *mat,
-                   int8_t gapo,
-                   int8_t gape,
-                   int8_t gapo2,
-                   int8_t noncan,
-                   int zdrop,
-                   int flag,
-                   ksw_extz_t *ez);
+void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
+				   int8_t gapo, int8_t gape, int8_t gapo2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez);
 
-void ksw_extf2_sse(void *km,
-                   int qlen,
-                   const uint8_t *query,
-                   int tlen,
-                   const uint8_t *target,
-                   int8_t mch,
-                   int8_t mis,
-                   int8_t e,
-                   int w,
-                   int xdrop,
-                   ksw_extz_t *ez);
+void ksw_extf2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez);
 
 /**
  * Global alignment
@@ -123,45 +85,9 @@ void ksw_extf2_sse(void *km,
  *
  * @return          score of the alignment
  */
-int ksw_gg(void *km,
-           int qlen,
-           const uint8_t *query,
-           int tlen,
-           const uint8_t *target,
-           int8_t m,
-           const int8_t *mat,
-           int8_t gapo,
-           int8_t gape,
-           int w,
-           int *m_cigar_,
-           int *n_cigar_,
-           uint32_t **cigar_);
-int ksw_gg2(void *km,
-            int qlen,
-            const uint8_t *query,
-            int tlen,
-            const uint8_t *target,
-            int8_t m,
-            const int8_t *mat,
-            int8_t gapo,
-            int8_t gape,
-            int w,
-            int *m_cigar_,
-            int *n_cigar_,
-            uint32_t **cigar_);
-int ksw_gg2_sse(void *km,
-                int qlen,
-                const uint8_t *query,
-                int tlen,
-                const uint8_t *target,
-                int8_t m,
-                const int8_t *mat,
-                int8_t gapo,
-                int8_t gape,
-                int w,
-                int *m_cigar_,
-                int *n_cigar_,
-                uint32_t **cigar_);
+int ksw_gg(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_);
+int ksw_gg2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_);
+int ksw_gg2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_);
 
 void *ksw_ll_qinit(void *km, int size, int qlen, const uint8_t *query, int m, const int8_t *mat);
 int ksw_ll_i16(void *q, int tlen, const uint8_t *target, int gapo, int gape, int *qe, int *te);
@@ -184,107 +110,99 @@ int ksw_ll_i16(void *q, int tlen, const uint8_t *target, int gapo, int gape, int
 #define kfree(km, ptr) free((ptr))
 #endif
 
-static inline uint32_t *ksw_push_cigar(void *km, int *n_cigar, int *m_cigar, uint32_t *cigar, uint32_t op, int len) {
-    if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1] & 0xf)) {
-        if (*n_cigar == *m_cigar) {
-            *m_cigar = *m_cigar ? (*m_cigar) << 1 : 4;
-            cigar = (uint32_t *) krealloc(km, cigar, (*m_cigar) << 2);
-        }
-        cigar[(*n_cigar)++] = len << 4 | op;
-    } else
-        cigar[(*n_cigar) - 1] += len << 4;
-    return cigar;
+static inline uint32_t *ksw_push_cigar(void *km, int *n_cigar, int *m_cigar, uint32_t *cigar, uint32_t op, int len)
+{ // append `len` x `op` to cigar[]; returns the buffer pointer, which may differ from `cigar` after krealloc(), so callers must keep the result
+	if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) { // empty, or op differs from the last entry: start a new entry
+		if (*n_cigar == *m_cigar) { // buffer is full
+			*m_cigar = *m_cigar? (*m_cigar)<<1 : 4; // double the capacity, starting from 4 entries
+			cigar = (uint32_t*)krealloc(km, cigar, (*m_cigar) << 2); // << 2: byte size of m_cigar 32-bit entries
+		}
+		cigar[(*n_cigar)++] = len<<4 | op; // length above bit 4, operator in the low 4 bits
+	} else cigar[(*n_cigar)-1] += len<<4; // same op as the last entry: just extend its length
+	return cigar;
 }
 
 // In the backtrack matrix, value p[] has the following structure:
 //   bit 0-2: which type gets the max - 0 for H, 1 for E, 2 for F, 3 for \tilde{E} and 4 for \tilde{F}
 //   bit 3/0x08: 1 if a continuation on the E state (bit 5/0x20 for a continuation on \tilde{E})
 //   bit 4/0x10: 1 if a continuation on the F state (bit 6/0x40 for a continuation on \tilde{F})
-static inline void ksw_backtrack(void *km,
-                                 int is_rot,
-                                 int is_rev,
-                                 int min_intron_len,
-                                 const uint8_t *p,
-                                 const int *off,
-                                 const int *off_end,
-                                 int n_col,
-                                 int i0,
-                                 int j0,
-                                 int *m_cigar_,
-                                 int *n_cigar_,
-                                 uint32_t **cigar_) { // p[] - lower 3 bits: which type gets the max; bit
-    int n_cigar = 0, m_cigar = *m_cigar_, i = i0, j = j0, r, state = 0;
-    uint32_t *cigar = *cigar_, tmp;
-    while (i >= 0 && j >= 0) { // at the beginning of the loop, _state_ tells us which state to check
-        int force_state = -1;
-        if (is_rot) {
-            r = i + j;
-            if (i < off[r])
-                force_state = 2;
-            if (off_end && i > off_end[r])
-                force_state = 1;
-            tmp = force_state < 0 ? p[(size_t) r * n_col + i - off[r]] : 0;
-        } else {
-            if (j < off[i])
-                force_state = 2;
-            if (off_end && j > off_end[i])
-                force_state = 1;
-            tmp = force_state < 0 ? p[(size_t) i * n_col + j - off[i]] : 0;
-        }
-        if (state == 0)
-            state = tmp & 7; // if requesting the H state, find state one maximizes it.
-        else if (!(tmp >> (state + 2) & 1))
-            state = 0; // if requesting other states, _state_ stays the same if it is a continuation; otherwise, set to H
-        if (state == 0)
-            state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure
-        if (force_state >= 0)
-            state = force_state;
-        if (state == 0)
-            cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match
-        else if (state == 1 || (state == 3 && min_intron_len <= 0))
-            cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion
-        else if (state == 3 && min_intron_len > 0)
-            cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 3, 1), --i; // intron
-        else
-            cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion
-    }
-    if (i >= 0)
-        cigar = ksw_push_cigar(km,
-                               &n_cigar,
-                               &m_cigar,
-                               cigar,
-                               min_intron_len > 0 && i >= min_intron_len ? 3 : 2,
-                               i + 1); // first deletion
-    if (j >= 0)
-        cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, j + 1); // first insertion
-    if (!is_rev)
-        for (i = 0; i < n_cigar >> 1; ++i) // reverse CIGAR
-            tmp = cigar[i], cigar[i] = cigar[n_cigar - 1 - i], cigar[n_cigar - 1 - i] = tmp;
-    *m_cigar_ = m_cigar, *n_cigar_ = n_cigar, *cigar_ = cigar;
+static inline void ksw_backtrack(void *km, int is_rot, int is_rev, int min_intron_len, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0,
+								 int *m_cigar_, int *n_cigar_, uint32_t **cigar_)
+{ // rebuild the CIGAR by walking back from cell (i0, j0); p[] uses the bit layout described in the comment above this function
+	int n_cigar = 0, m_cigar = *m_cigar_, i = i0, j = j0, r, state = 0;
+	uint32_t *cigar = *cigar_, tmp;
+	while (i >= 0 && j >= 0) { // at the beginning of the loop, _state_ tells us which state to check
+		int force_state = -1;
+		if (is_rot) {
+			r = i + j;
+			if (i < off[r]) force_state = 2;
+			if (off_end && i > off_end[r]) force_state = 1;
+			tmp = force_state < 0? p[(size_t)r * n_col + i - off[r]] : 0;
+		} else {
+			if (j < off[i]) force_state = 2;
+			if (off_end && j > off_end[i]) force_state = 1;
+			tmp = force_state < 0? p[(size_t)i * n_col + j - off[i]] : 0;
+		}
+		if (state == 0) state = tmp & 7; // if requesting the H state, find state one maximizes it.
+		else if (!(tmp >> (state + 2) & 1)) state = 0; // if requesting other states, _state_ stays the same if it is a continuation; otherwise, set to H
+		if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure
+		if (force_state >= 0) state = force_state; // cell lies outside [off, off_end]: p[] was not read, the forced state wins
+		if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_MATCH, 1), --i, --j;
+		else if (state == 1 || (state == 3 && min_intron_len <= 0)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_DEL, 1), --i;
+		else if (state == 3 && min_intron_len > 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_N_SKIP, 1), --i;
+		else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_INS, 1), --j;
+	}
+	if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, min_intron_len > 0 && i >= min_intron_len? KSW_CIGAR_N_SKIP : KSW_CIGAR_DEL, i + 1); // first deletion
+	if (j >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_INS, j + 1); // first insertion
+	if (!is_rev) // operations were pushed while walking backwards from (i0, j0)
+		for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
+			tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
+	*m_cigar_ = m_cigar, *n_cigar_ = n_cigar, *cigar_ = cigar; // hand capacity, length and (possibly reallocated) buffer back to the caller
+}
+
+static inline void ksw_cigar2eqx(void *km, const uint8_t *query, const uint8_t *target, int nc0, const uint32_t *ci0, int *mc1, int *nc1, uint32_t **ci1)
+{ // convert CIGAR ci0[0..nc0) into *ci1, splitting every M run into =/X by comparing target[] with query[]; *nc1 is reset to 0, *mc1 and *ci1 follow the output buffer as it grows
+	int i, k, x = 0, y = 0; // x: offset into target, y: offset into query
+	*nc1 = 0;
+	for (k = 0; k < nc0; ++k) {
+		int op = ci0[k]&0xf, len = ci0[k]>>4;
+		if (op == KSW_CIGAR_MATCH) {
+			for (i = 0; i < len; ++i) {
+				if (target[x + i] == query[y + i]) *ci1 = ksw_push_cigar(km, nc1, mc1, *ci1, KSW_CIGAR_EQ, 1); // store the result: ksw_push_cigar() may reallocate the buffer
+				else *ci1 = ksw_push_cigar(km, nc1, mc1, *ci1, KSW_CIGAR_X, 1);
+			}
+			x += len, y += len;
+		} else {
+			*ci1 = ksw_push_cigar(km, nc1, mc1, *ci1, op, len); // every other operator is copied through unchanged
+			if (op == KSW_CIGAR_DEL || op == KSW_CIGAR_N_SKIP) x += len;
+			else if (op == KSW_CIGAR_INS) y += len;
+			else if (op == KSW_CIGAR_EQ || op == KSW_CIGAR_X) x += len, y += len;
+		}
+	}
 }
 
-static inline void ksw_reset_extz(ksw_extz_t *ez) {
-    ez->max_q = ez->max_t = ez->mqe_t = ez->mte_q = -1;
-    ez->max = 0, ez->score = ez->mqe = ez->mte = KSW_NEG_INF;
-    ez->n_cigar = 0, ez->zdropped = 0, ez->reach_end = 0;
+static inline void ksw_reset_extz(ksw_extz_t *ez)
+{ // reset the result fields of *ez before a new alignment; m_cigar and cigar are not touched here
+	ez->max_q = ez->max_t = ez->mqe_t = ez->mte_q = -1; // coordinates: -1
+	ez->max = 0, ez->score = ez->mqe = ez->mte = KSW_NEG_INF; // scores
+	ez->n_cigar = 0, ez->zdropped = 0, ez->reach_end = 0; // CIGAR length and flags
 }
 
-static inline int ksw_apply_zdrop(ksw_extz_t *ez, int is_rot, int32_t H, int a, int b, int zdrop, int8_t e) {
-    int r, t;
-    if (is_rot)
-        r = a, t = b;
-    else
-        r = a + b, t = a;
-    if (H > (int32_t) ez->max) {
-        ez->max = H, ez->max_t = t, ez->max_q = r - t;
-    } else if (t >= ez->max_t && r - t >= ez->max_q) {
-        int tl = t - ez->max_t, ql = (r - t) - ez->max_q, l;
-        l = tl > ql ? tl - ql : ql - tl;
-        if (zdrop >= 0 && ez->max - H > zdrop + l * e) {
-            ez->zdropped = 1;
-            return 1;
-        }
-    }
-    return 0;
+static inline int ksw_apply_zdrop(ksw_extz_t *ez, int is_rot, int32_t H, int a, int b, int zdrop, int8_t e)
+{ // update the running maximum in *ez with score H; returns 1 (and sets ez->zdropped) when H has fallen too far below the maximum, otherwise 0
+	int r, t;
+	if (is_rot) r = a, t = b; // (a, b) is already (r, t)
+	else r = a + b, t = a;
+	if (H > (int32_t)ez->max) {
+		ez->max = H, ez->max_t = t, ez->max_q = r - t; // new best score: remember where it was seen
+	} else if (t >= ez->max_t && r - t >= ez->max_q) { // only cells at or past the best position in both coordinates
+		int tl = t - ez->max_t, ql = (r - t) - ez->max_q, l;
+		l = tl > ql? tl - ql : ql - tl; // l = |tl - ql|
+		if (zdrop >= 0 && ez->max - H > zdrop + l * e) { // a negative zdrop disables the check
+			ez->zdropped = 1;
+			return 1;
+		}
+	}
+	return 0;
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/ext/ksw2_extz2_sse.c b/ext/ksw2_extz2_sse.c
index 767a749f..02bb4c2a 100644
--- a/ext/ksw2_extz2_sse.c
+++ b/ext/ksw2_extz2_sse.c
@@ -20,356 +20,286 @@ void ksw_extz2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const u
 void ksw_extz2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
 #endif
 #else
-void ksw_extz2_sse(void *km,
-                   int qlen,
-                   const uint8_t *query,
-                   int tlen,
-                   const uint8_t *target,
-                   int8_t m,
-                   const int8_t *mat,
-                   int8_t q,
-                   int8_t e,
-                   int w,
-                   int zdrop,
-                   int end_bonus,
-                   int flag,
-                   ksw_extz_t *ez)
+void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
 #endif // ~KSW_CPU_DISPATCH
 {
 #define __dp_code_block1 \
-    z = _mm_add_epi8(_mm_load_si128(&s[t]), qe2_); \
-    xt1 = _mm_load_si128(&x[t]);                     /* xt1 <- x[r-1][t..t+15] */ \
-    tmp = _mm_srli_si128(xt1, 15);                   /* tmp <- x[r-1][t+15] */ \
-    xt1 = _mm_or_si128(_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \
-    x1_ = tmp; \
-    vt1 = _mm_load_si128(&v[t]);                     /* vt1 <- v[r-1][t..t+15] */ \
-    tmp = _mm_srli_si128(vt1, 15);                   /* tmp <- v[r-1][t+15] */ \
-    vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \
-    v1_ = tmp; \
-    a = _mm_add_epi8(xt1, vt1);                      /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \
-    ut = _mm_load_si128(&u[t]);                      /* ut <- u[t..t+15] */ \
-    b = _mm_add_epi8(_mm_load_si128(&y[t]), ut);     /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */
+	z = _mm_add_epi8(_mm_load_si128(&s[t]), qe2_); \
+	xt1 = _mm_load_si128(&x[t]);                     /* xt1 <- x[r-1][t..t+15] */ \
+	tmp = _mm_srli_si128(xt1, 15);                   /* tmp <- x[r-1][t+15] */ \
+	xt1 = _mm_or_si128(_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \
+	x1_ = tmp; \
+	vt1 = _mm_load_si128(&v[t]);                     /* vt1 <- v[r-1][t..t+15] */ \
+	tmp = _mm_srli_si128(vt1, 15);                   /* tmp <- v[r-1][t+15] */ \
+	vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \
+	v1_ = tmp; \
+	a = _mm_add_epi8(xt1, vt1);                      /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \
+	ut = _mm_load_si128(&u[t]);                      /* ut <- u[t..t+15] */ \
+	b = _mm_add_epi8(_mm_load_si128(&y[t]), ut);     /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */
 
 #define __dp_code_block2 \
-    z = _mm_max_epu8(z, b);                          /* z = max(z, b); this works because both are non-negative */ \
-    z = _mm_min_epu8(z, max_sc_); \
-    _mm_store_si128(&u[t], _mm_sub_epi8(z, vt1));    /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \
-    _mm_store_si128(&v[t], _mm_sub_epi8(z, ut));     /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \
-    z = _mm_sub_epi8(z, q_); \
-    a = _mm_sub_epi8(a, z); \
-    b = _mm_sub_epi8(b, z);
+	z = _mm_max_epu8(z, b);                          /* z = max(z, b); this works because both are non-negative */ \
+	z = _mm_min_epu8(z, max_sc_); \
+	_mm_store_si128(&u[t], _mm_sub_epi8(z, vt1));    /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \
+	_mm_store_si128(&v[t], _mm_sub_epi8(z, ut));     /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \
+	z = _mm_sub_epi8(z, q_); \
+	a = _mm_sub_epi8(a, z); \
+	b = _mm_sub_epi8(b, z);
 
-  int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc;
-  int with_cigar = !(flag & KSW_EZ_SCORE_ONLY), approx_max = !!(flag & KSW_EZ_APPROX_MAX);
-  int32_t *H = 0, H0 = 0, last_H0_t = 0;
-  uint8_t *qr, *sf, *mem, *mem2 = 0;
-  __m128i q_, qe2_, zero_, flag1_, flag2_, flag8_, flag16_, sc_mch_, sc_mis_, sc_N_, m1_, max_sc_;
-  __m128i *u, *v, *x, *y, *s, *p = 0;
+	int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc;
+	int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX);
+	int32_t *H = 0, H0 = 0, last_H0_t = 0;
+	uint8_t *qr, *sf, *mem, *mem2 = 0;
+	__m128i q_, qe2_, zero_, flag1_, flag2_, flag8_, flag16_, sc_mch_, sc_mis_, sc_N_, m1_, max_sc_;
+	__m128i *u, *v, *x, *y, *s, *p = 0;
 
-  ksw_reset_extz(ez);
-  if (m <= 0 || qlen <= 0 || tlen <= 0)
-    return;
+	ksw_reset_extz(ez);
+	if (m <= 0 || qlen <= 0 || tlen <= 0) return;
 
-  zero_ = _mm_set1_epi8(0);
-  q_ = _mm_set1_epi8(q);
-  qe2_ = _mm_set1_epi8((q + e) * 2);
-  flag1_ = _mm_set1_epi8(1);
-  flag2_ = _mm_set1_epi8(2);
-  flag8_ = _mm_set1_epi8(0x08);
-  flag16_ = _mm_set1_epi8(0x10);
-  sc_mch_ = _mm_set1_epi8(mat[0]);
-  sc_mis_ = _mm_set1_epi8(mat[1]);
-  sc_N_ = mat[m * m - 1] == 0 ? _mm_set1_epi8(-e) : _mm_set1_epi8(mat[m * m - 1]);
-  m1_ = _mm_set1_epi8(m - 1); // wildcard
-  max_sc_ = _mm_set1_epi8(mat[0] + (q + e) * 2);
+	zero_   = _mm_set1_epi8(0);
+	q_      = _mm_set1_epi8(q);
+	qe2_    = _mm_set1_epi8((q + e) * 2);
+	flag1_  = _mm_set1_epi8(1);
+	flag2_  = _mm_set1_epi8(2);
+	flag8_  = _mm_set1_epi8(0x08);
+	flag16_ = _mm_set1_epi8(0x10);
+	sc_mch_ = _mm_set1_epi8(mat[0]);
+	sc_mis_ = _mm_set1_epi8(mat[1]);
+	sc_N_   = mat[m*m-1] == 0? _mm_set1_epi8(-e) : _mm_set1_epi8(mat[m*m-1]);
+	m1_     = _mm_set1_epi8(m - 1); // wildcard
+	max_sc_ = _mm_set1_epi8(mat[0] + (q + e) * 2);
 
-  if (w < 0)
-    w = tlen > qlen ? tlen : qlen;
-  wl = wr = w;
-  tlen_ = (tlen + 15) / 16;
-  n_col_ = qlen < tlen ? qlen : tlen;
-  n_col_ = ((n_col_ < w + 1 ? n_col_ : w + 1) + 15) / 16 + 1;
-  qlen_ = (qlen + 15) / 16;
-  for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) {
-    max_sc = max_sc > mat[t] ? max_sc : mat[t];
-    min_sc = min_sc < mat[t] ? min_sc : mat[t];
-  }
-  if (-min_sc > 2 * (q + e))
-    return; // otherwise, we won't see any mismatches
+	if (w < 0) w = tlen > qlen? tlen : qlen;
+	wl = wr = w;
+	tlen_ = (tlen + 15) / 16;
+	n_col_ = qlen < tlen? qlen : tlen;
+	n_col_ = ((n_col_ < w + 1? n_col_ : w + 1) + 15) / 16 + 1;
+	qlen_ = (qlen + 15) / 16;
+	for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) {
+		max_sc = max_sc > mat[t]? max_sc : mat[t];
+		min_sc = min_sc < mat[t]? min_sc : mat[t];
+	}
+	if (-min_sc > 2 * (q + e)) return; // otherwise, we won't see any mismatches
 
-  mem = (uint8_t *) kcalloc(km, tlen_ * 6 + qlen_ + 1, 16);
-  u = (__m128i *) (((size_t) mem + 15) >> 4 << 4); // 16-byte aligned
-  v = u + tlen_, x = v + tlen_, y = x + tlen_, s = y + tlen_, sf = (uint8_t *) (s + tlen_), qr = sf + tlen_ * 16;
-  if (!approx_max) {
-    H = (int32_t *) kmalloc(km, tlen_ * 16 * 4);
-    for (t = 0; t < tlen_ * 16; ++t)
-      H[t] = KSW_NEG_INF;
-  }
-  if (with_cigar) {
-    mem2 = (uint8_t *) kmalloc(km, ((size_t) (qlen + tlen - 1) * n_col_ + 1) * 16);
-    p = (__m128i *) (((size_t) mem2 + 15) >> 4 << 4);
-    off = (int *) kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2);
-    off_end = off + qlen + tlen - 1;
-  }
+	mem = (uint8_t*)kcalloc(km, tlen_ * 6 + qlen_ + 1, 16);
+	u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned
+	v = u + tlen_, x = v + tlen_, y = x + tlen_, s = y + tlen_, sf = (uint8_t*)(s + tlen_), qr = sf + tlen_ * 16;
+	if (!approx_max) {
+		H = (int32_t*)kmalloc(km, tlen_ * 16 * 4);
+		for (t = 0; t < tlen_ * 16; ++t) H[t] = KSW_NEG_INF;
+	}
+	if (with_cigar) {
+		mem2 = (uint8_t*)kmalloc(km, ((size_t)(qlen + tlen - 1) * n_col_ + 1) * 16);
+		p = (__m128i*)(((size_t)mem2 + 15) >> 4 << 4);
+		off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2);
+		off_end = off + qlen + tlen - 1;
+	}
 
-  for (t = 0; t < qlen; ++t)
-    qr[t] = query[qlen - 1 - t];
-  memcpy(sf, target, tlen);
+	for (t = 0; t < qlen; ++t) qr[t] = query[qlen - 1 - t];
+	memcpy(sf, target, tlen);
 
-  for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) {
-    int st = 0, en = tlen - 1, st0, en0, st_, en_;
-    int8_t x1, v1;
-    uint8_t *qrr = qr + (qlen - 1 - r), *u8 = (uint8_t *) u, *v8 = (uint8_t *) v;
-    __m128i x1_, v1_;
-    // find the boundaries
-    if (st < r - qlen + 1)
-      st = r - qlen + 1;
-    if (en > r)
-      en = r;
-    if (st < (r - wr + 1) >> 1)
-      st = (r - wr + 1) >> 1; // take the ceil
-    if (en > (r + wl) >> 1)
-      en = (r + wl) >> 1; // take the floor
-    if (st > en) {
-      ez->zdropped = 1;
-      break;
-    }
-    st0 = st, en0 = en;
-    st = st / 16 * 16, en = (en + 16) / 16 * 16 - 1;
-    // set boundary conditions
-    if (st > 0) {
-      if (st - 1 >= last_st && st - 1 <= last_en)
-        x1 = ((uint8_t *) x)[st - 1], v1 = v8[st - 1]; // (r-1,s-1) calculated in the last round
-      else
-        x1 = v1 = 0; // not calculated; set to zeros
-    } else
-      x1 = 0, v1 = r ? q : 0;
-    if (en >= r)
-      ((uint8_t *) y)[r] = 0, u8[r] = r ? q : 0;
-    // loop fission: set scores first
-    if (!(flag & KSW_EZ_GENERIC_SC)) {
-      for (t = st0; t <= en0; t += 16) {
-        __m128i sq, st, tmp, mask;
-        sq = _mm_loadu_si128((__m128i *) &sf[t]);
-        st = _mm_loadu_si128((__m128i *) &qrr[t]);
-        mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
-        tmp = _mm_cmpeq_epi8(sq, st);
+	for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) {
+		int st = 0, en = tlen - 1, st0, en0, st_, en_;
+		int8_t x1, v1;
+		uint8_t *qrr = qr + (qlen - 1 - r), *u8 = (uint8_t*)u, *v8 = (uint8_t*)v;
+		__m128i x1_, v1_;
+		// find the boundaries
+		if (st < r - qlen + 1) st = r - qlen + 1;
+		if (en > r) en = r;
+		if (st < (r-wr+1)>>1) st = (r-wr+1)>>1; // take the ceil
+		if (en > (r+wl)>>1) en = (r+wl)>>1; // take the floor
+		if (st > en) {
+			ez->zdropped = 1;
+			break;
+		}
+		st0 = st, en0 = en;
+		st = st / 16 * 16, en = (en + 16) / 16 * 16 - 1;
+		// set boundary conditions
+		if (st > 0) {
+			if (st - 1 >= last_st && st - 1 <= last_en)
+				x1 = ((uint8_t*)x)[st - 1], v1 = v8[st - 1]; // (r-1,s-1) calculated in the last round
+			else x1 = v1 = 0; // not calculated; set to zeros
+		} else x1 = 0, v1 = r? q : 0;
+		if (en >= r) ((uint8_t*)y)[r] = 0, u8[r] = r? q : 0;
+		// loop fission: set scores first
+		if (!(flag & KSW_EZ_GENERIC_SC)) {
+			for (t = st0; t <= en0; t += 16) {
+				__m128i sq, st, tmp, mask;
+				sq = _mm_loadu_si128((__m128i*)&sf[t]);
+				st = _mm_loadu_si128((__m128i*)&qrr[t]);
+				mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
+				tmp = _mm_cmpeq_epi8(sq, st);
 #ifdef __SSE4_1__
-        tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
-        tmp = _mm_blendv_epi8(tmp, sc_N_, mask);
+				tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
+				tmp = _mm_blendv_epi8(tmp,     sc_N_,   mask);
 #else
-        tmp = _mm_or_si128(_mm_andnot_si128(tmp,  sc_mis_), _mm_and_si128(tmp,  sc_mch_));
-        tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp),     _mm_and_si128(mask, sc_N_));
+				tmp = _mm_or_si128(_mm_andnot_si128(tmp,  sc_mis_), _mm_and_si128(tmp,  sc_mch_));
+				tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp),     _mm_and_si128(mask, sc_N_));
 #endif
-        _mm_storeu_si128((__m128i *) ((uint8_t *) s + t), tmp);
-      }
-    } else {
-      for (t = st0; t <= en0; ++t)
-        ((uint8_t *) s)[t] = mat[sf[t] * m + qrr[t]];
-    }
-    // core loop
-    x1_ = _mm_cvtsi32_si128(x1);
-    v1_ = _mm_cvtsi32_si128(v1);
-    st_ = st / 16, en_ = en / 16;
-    assert(en_ - st_ + 1 <= n_col_);
-    if (!with_cigar) { // score only
-      for (t = st_; t <= en_; ++t) {
-        __m128i z, a, b, xt1, vt1, ut, tmp;
-        __dp_code_block1;
+				_mm_storeu_si128((__m128i*)((uint8_t*)s + t), tmp);
+			}
+		} else {
+			for (t = st0; t <= en0; ++t)
+				((uint8_t*)s)[t] = mat[sf[t] * m + qrr[t]];
+		}
+		// core loop
+		x1_ = _mm_cvtsi32_si128(x1);
+		v1_ = _mm_cvtsi32_si128(v1);
+		st_ = st / 16, en_ = en / 16;
+		assert(en_ - st_ + 1 <= n_col_);
+		if (!with_cigar) { // score only
+			for (t = st_; t <= en_; ++t) {
+				__m128i z, a, b, xt1, vt1, ut, tmp;
+				__dp_code_block1;
 #ifdef __SSE4_1__
-        z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
+				z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
 #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8()
-        z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
-        z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
+				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
+				z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
 #endif
-        __dp_code_block2;
+				__dp_code_block2;
 #ifdef __SSE4_1__
-        _mm_store_si128(&x[t], _mm_max_epi8(a, zero_));
-        _mm_store_si128(&y[t], _mm_max_epi8(b, zero_));
+				_mm_store_si128(&x[t], _mm_max_epi8(a, zero_));
+				_mm_store_si128(&y[t], _mm_max_epi8(b, zero_));
 #else
-        tmp = _mm_cmpgt_epi8(a, zero_);
-        _mm_store_si128(&x[t], _mm_and_si128(a, tmp));
-        tmp = _mm_cmpgt_epi8(b, zero_);
-        _mm_store_si128(&y[t], _mm_and_si128(b, tmp));
+				tmp = _mm_cmpgt_epi8(a, zero_);
+				_mm_store_si128(&x[t], _mm_and_si128(a, tmp));
+				tmp = _mm_cmpgt_epi8(b, zero_);
+				_mm_store_si128(&y[t], _mm_and_si128(b, tmp));
 #endif
-      }
-    } else if (!(flag & KSW_EZ_RIGHT)) { // gap left-alignment
-      __m128i *pr = p + (size_t) r * n_col_ - st_;
-      off[r] = st, off_end[r] = en;
-      for (t = st_; t <= en_; ++t) {
-        __m128i d, z, a, b, xt1, vt1, ut, tmp;
-        __dp_code_block1;
-        d = _mm_and_si128(_mm_cmpgt_epi8(a, z), flag1_); // d = a > z? 1 : 0
+			}
+		} else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
+			__m128i *pr = p + (size_t)r * n_col_ - st_;
+			off[r] = st, off_end[r] = en;
+			for (t = st_; t <= en_; ++t) {
+				__m128i d, z, a, b, xt1, vt1, ut, tmp;
+				__dp_code_block1;
+				d = _mm_and_si128(_mm_cmpgt_epi8(a, z), flag1_); // d = a > z? 1 : 0
 #ifdef __SSE4_1__
-        z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
-        tmp = _mm_cmpgt_epi8(b, z);
-        d = _mm_blendv_epi8(d, flag2_, tmp);             // d = b > z? 2 : d
+				z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
+				tmp = _mm_cmpgt_epi8(b, z);
+				d = _mm_blendv_epi8(d, flag2_, tmp);             // d = b > z? 2 : d
 #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
-        z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
-        z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
-        tmp = _mm_cmpgt_epi8(b, z);
-        d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, flag2_)); // d = b > z? 2 : d; emulating blendv
+				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
+				z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
+				tmp = _mm_cmpgt_epi8(b, z);
+				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, flag2_)); // d = b > z? 2 : d; emulating blendv
 #endif
-        __dp_code_block2;
-        tmp = _mm_cmpgt_epi8(a, zero_);
-        _mm_store_si128(&x[t], _mm_and_si128(tmp, a));
-        d = _mm_or_si128(d, _mm_and_si128(tmp, flag8_));  // d = a > 0? 0x08 : 0
-        tmp = _mm_cmpgt_epi8(b, zero_);
-        _mm_store_si128(&y[t], _mm_and_si128(tmp, b));
-        d = _mm_or_si128(d, _mm_and_si128(tmp, flag16_)); // d = b > 0? 0x10 : 0
-        _mm_store_si128(&pr[t], d);
-      }
-    } else { // gap right-alignment
-      __m128i *pr = p + (size_t) r * n_col_ - st_;
-      off[r] = st, off_end[r] = en;
-      for (t = st_; t <= en_; ++t) {
-        __m128i d, z, a, b, xt1, vt1, ut, tmp;
-        __dp_code_block1;
-        d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), flag1_); // d = z > a? 0 : 1
+				__dp_code_block2;
+				tmp = _mm_cmpgt_epi8(a, zero_);
+				_mm_store_si128(&x[t], _mm_and_si128(tmp, a));
+				d = _mm_or_si128(d, _mm_and_si128(tmp, flag8_));  // d = a > 0? 0x08 : 0
+				tmp = _mm_cmpgt_epi8(b, zero_);
+				_mm_store_si128(&y[t], _mm_and_si128(tmp, b));
+				d = _mm_or_si128(d, _mm_and_si128(tmp, flag16_)); // d = b > 0? 0x10 : 0
+				_mm_store_si128(&pr[t], d);
+			}
+		} else { // gap right-alignment
+			__m128i *pr = p + (size_t)r * n_col_ - st_;
+			off[r] = st, off_end[r] = en;
+			for (t = st_; t <= en_; ++t) {
+				__m128i d, z, a, b, xt1, vt1, ut, tmp;
+				__dp_code_block1;
+				d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), flag1_); // d = z > a? 0 : 1
 #ifdef __SSE4_1__
-        z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
-        tmp = _mm_cmpgt_epi8(z, b);
-        d = _mm_blendv_epi8(flag2_, d, tmp);             // d = z > b? d : 2
+				z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
+				tmp = _mm_cmpgt_epi8(z, b);
+				d = _mm_blendv_epi8(flag2_, d, tmp);             // d = z > b? d : 2
 #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
-        z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
-        z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
-        tmp = _mm_cmpgt_epi8(z, b);
-        d = _mm_or_si128(_mm_andnot_si128(tmp, flag2_), _mm_and_si128(tmp, d)); // d = z > b? d : 2; emulating blendv
+				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
+				z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
+				tmp = _mm_cmpgt_epi8(z, b);
+				d = _mm_or_si128(_mm_andnot_si128(tmp, flag2_), _mm_and_si128(tmp, d)); // d = z > b? d : 2; emulating blendv
 #endif
-        __dp_code_block2;
-        tmp = _mm_cmpgt_epi8(zero_, a);
-        _mm_store_si128(&x[t], _mm_andnot_si128(tmp, a));
-        d = _mm_or_si128(d, _mm_andnot_si128(tmp, flag8_));  // d = 0 > a? 0 : 0x08
-        tmp = _mm_cmpgt_epi8(zero_, b);
-        _mm_store_si128(&y[t], _mm_andnot_si128(tmp, b));
-        d = _mm_or_si128(d, _mm_andnot_si128(tmp, flag16_)); // d = 0 > b? 0 : 0x10
-        _mm_store_si128(&pr[t], d);
-      }
-    }
-    if (!approx_max) { // find the exact max with a 32-bit score array
-      int32_t max_H, max_t;
-      // compute H[], max_H and max_t
-      if (r > 0) {
-        int32_t HH[4], tt[4], en1 = st0 + (en0 - st0) / 4 * 4, i;
-        __m128i max_H_, max_t_, qe_;
-        max_H = H[en0] = en0 > 0 ? H[en0 - 1] + u8[en0] - qe : H[en0] + v8[en0] - qe; // special casing the last element
-        max_t = en0;
-        max_H_ = _mm_set1_epi32(max_H);
-        max_t_ = _mm_set1_epi32(max_t);
-        qe_ = _mm_set1_epi32(q + e);
-        for (t = st0; t < en1; t += 4) { // this implements: H[t]+=v8[t]-qe; if(H[t]>max_H) max_H=H[t],max_t=t;
-          __m128i H1, tmp, t_;
-          H1 = _mm_loadu_si128((__m128i *) &H[t]);
-          t_ = _mm_setr_epi32(v8[t], v8[t + 1], v8[t + 2], v8[t + 3]);
-          H1 = _mm_add_epi32(H1, t_);
-          H1 = _mm_sub_epi32(H1, qe_);
-          _mm_storeu_si128((__m128i *) &H[t], H1);
-          t_ = _mm_set1_epi32(t);
-          tmp = _mm_cmpgt_epi32(H1, max_H_);
+				__dp_code_block2;
+				tmp = _mm_cmpgt_epi8(zero_, a);
+				_mm_store_si128(&x[t], _mm_andnot_si128(tmp, a));
+				d = _mm_or_si128(d, _mm_andnot_si128(tmp, flag8_));  // d = 0 > a? 0 : 0x08
+				tmp = _mm_cmpgt_epi8(zero_, b);
+				_mm_store_si128(&y[t], _mm_andnot_si128(tmp, b));
+				d = _mm_or_si128(d, _mm_andnot_si128(tmp, flag16_)); // d = 0 > b? 0 : 0x10
+				_mm_store_si128(&pr[t], d);
+			}
+		}
+		if (!approx_max) { // find the exact max with a 32-bit score array
+			int32_t max_H, max_t;
+			// compute H[], max_H and max_t
+			if (r > 0) {
+				int32_t HH[4], tt[4], en1 = st0 + (en0 - st0) / 4 * 4, i;
+				__m128i max_H_, max_t_, qe_;
+				max_H = H[en0] = en0 > 0? H[en0-1] + u8[en0] - qe : H[en0] + v8[en0] - qe; // special casing the last element
+				max_t = en0;
+				max_H_ = _mm_set1_epi32(max_H);
+				max_t_ = _mm_set1_epi32(max_t);
+				qe_    = _mm_set1_epi32(q + e);
+				for (t = st0; t < en1; t += 4) { // this implements: H[t]+=v8[t]-qe; if(H[t]>max_H) max_H=H[t],max_t=t;
+					__m128i H1, tmp, t_;
+					H1 = _mm_loadu_si128((__m128i*)&H[t]);
+					t_ = _mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]);
+					H1 = _mm_add_epi32(H1, t_);
+					H1 = _mm_sub_epi32(H1, qe_);
+					_mm_storeu_si128((__m128i*)&H[t], H1);
+					t_ = _mm_set1_epi32(t);
+					tmp = _mm_cmpgt_epi32(H1, max_H_);
 #ifdef __SSE4_1__
-          max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
-          max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
+					max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
+					max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
 #else
-          max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
-          max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
+					max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
+					max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
 #endif
-        }
-        _mm_storeu_si128((__m128i *) HH, max_H_);
-        _mm_storeu_si128((__m128i *) tt, max_t_);
-        for (i = 0; i < 4; ++i)
-          if (max_H < HH[i])
-            max_H = HH[i], max_t = tt[i] + i;
-        for (; t < en0; ++t) { // for the rest of values that haven't been computed with SSE
-          H[t] += (int32_t) v8[t] - qe;
-          if (H[t] > max_H)
-            max_H = H[t], max_t = t;
-        }
-      } else
-        H[0] = v8[0] - qe - qe, max_H = H[0], max_t = 0; // special casing r==0
-      // update ez
-      if (en0 == tlen - 1 && H[en0] > ez->mte)
-        ez->mte = H[en0], ez->mte_q = r - en;
-      if (r - st0 == qlen - 1 && H[st0] > ez->mqe)
-        ez->mqe = H[st0], ez->mqe_t = st0;
-      if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, e))
-        break;
-      if (r == qlen + tlen - 2 && en0 == tlen - 1)
-        ez->score = H[tlen - 1];
-    } else { // find approximate max; Z-drop might be inaccurate, too.
-      if (r > 0) {
-        if (last_H0_t >= st0 && last_H0_t <= en0 && last_H0_t + 1 >= st0 && last_H0_t + 1 <= en0) {
-          int32_t d0 = v8[last_H0_t] - qe;
-          int32_t d1 = u8[last_H0_t + 1] - qe;
-          if (d0 > d1)
-            H0 += d0;
-          else
-            H0 += d1, ++last_H0_t;
-        } else if (last_H0_t >= st0 && last_H0_t <= en0) {
-          H0 += v8[last_H0_t] - qe;
-        } else {
-          ++last_H0_t, H0 += u8[last_H0_t] - qe;
-        }
-        if ((flag & KSW_EZ_APPROX_DROP) && ksw_apply_zdrop(ez, 1, H0, r, last_H0_t, zdrop, e))
-          break;
-      } else
-        H0 = v8[0] - qe - qe, last_H0_t = 0;
-      if (r == qlen + tlen - 2 && en0 == tlen - 1)
-        ez->score = H0;
-    }
-    last_st = st, last_en = en;
-    //for (t = st0; t <= en0; ++t) printf("(%d,%d)\t(%d,%d,%d,%d)\t%d\n", r, t, ((int8_t*)u)[t], ((int8_t*)v)[t], ((int8_t*)x)[t], ((int8_t*)y)[t], H[t]); // for debugging
-  }
-  kfree(km, mem);
-  if (!approx_max)
-    kfree(km, H);
-  if (with_cigar) { // backtrack
-    int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
-    if (!ez->zdropped && !(flag & KSW_EZ_EXTZ_ONLY)) {
-      ksw_backtrack(km,
-                    1,
-                    rev_cigar,
-                    0,
-                    (uint8_t *) p,
-                    off,
-                    off_end,
-                    n_col_ * 16,
-                    tlen - 1,
-                    qlen - 1,
-                    &ez->m_cigar,
-                    &ez->n_cigar,
-                    &ez->cigar);
-    } else if (!ez->zdropped && (flag & KSW_EZ_EXTZ_ONLY) && ez->mqe + end_bonus > (int) ez->max) {
-      ez->reach_end = 1;
-      ksw_backtrack(km,
-                    1,
-                    rev_cigar,
-                    0,
-                    (uint8_t *) p,
-                    off,
-                    off_end,
-                    n_col_ * 16,
-                    ez->mqe_t,
-                    qlen - 1,
-                    &ez->m_cigar,
-                    &ez->n_cigar,
-                    &ez->cigar);
-    } else if (ez->max_t >= 0 && ez->max_q >= 0) {
-      ksw_backtrack(km,
-                    1,
-                    rev_cigar,
-                    0,
-                    (uint8_t *) p,
-                    off,
-                    off_end,
-                    n_col_ * 16,
-                    ez->max_t,
-                    ez->max_q,
-                    &ez->m_cigar,
-                    &ez->n_cigar,
-                    &ez->cigar);
-    }
-    kfree(km, mem2);
-    kfree(km, off);
-  }
+				}
+				_mm_storeu_si128((__m128i*)HH, max_H_);
+				_mm_storeu_si128((__m128i*)tt, max_t_);
+				for (i = 0; i < 4; ++i)
+					if (max_H < HH[i]) max_H = HH[i], max_t = tt[i] + i;
+				for (; t < en0; ++t) { // for the rest of values that haven't been computed with SSE
+					H[t] += (int32_t)v8[t] - qe;
+					if (H[t] > max_H)
+						max_H = H[t], max_t = t;
+				}
+			} else H[0] = v8[0] - qe - qe, max_H = H[0], max_t = 0; // special casing r==0
+			// update ez
+			if (en0 == tlen - 1 && H[en0] > ez->mte)
+				ez->mte = H[en0], ez->mte_q = r - en;
+			if (r - st0 == qlen - 1 && H[st0] > ez->mqe)
+				ez->mqe = H[st0], ez->mqe_t = st0;
+			if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, e)) break;
+			if (r == qlen + tlen - 2 && en0 == tlen - 1)
+				ez->score = H[tlen - 1];
+		} else { // find approximate max; Z-drop might be inaccurate, too.
+			if (r > 0) {
+				if (last_H0_t >= st0 && last_H0_t <= en0 && last_H0_t + 1 >= st0 && last_H0_t + 1 <= en0) {
+					int32_t d0 = v8[last_H0_t] - qe;
+					int32_t d1 = u8[last_H0_t + 1] - qe;
+					if (d0 > d1) H0 += d0;
+					else H0 += d1, ++last_H0_t;
+				} else if (last_H0_t >= st0 && last_H0_t <= en0) {
+					H0 += v8[last_H0_t] - qe;
+				} else {
+					++last_H0_t, H0 += u8[last_H0_t] - qe;
+				}
+				if ((flag & KSW_EZ_APPROX_DROP) && ksw_apply_zdrop(ez, 1, H0, r, last_H0_t, zdrop, e)) break;
+			} else H0 = v8[0] - qe - qe, last_H0_t = 0;
+			if (r == qlen + tlen - 2 && en0 == tlen - 1)
+				ez->score = H0;
+		}
+		last_st = st, last_en = en;
+		//for (t = st0; t <= en0; ++t) printf("(%d,%d)\t(%d,%d,%d,%d)\t%d\n", r, t, ((int8_t*)u)[t], ((int8_t*)v)[t], ((int8_t*)x)[t], ((int8_t*)y)[t], H[t]); // for debugging
+	}
+	kfree(km, mem);
+	if (!approx_max) kfree(km, H);
+	if (with_cigar) { // backtrack
+		int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR);
+		if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY)) {
+			ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
+		} else if (!ez->zdropped && (flag&KSW_EZ_EXTZ_ONLY) && ez->mqe + end_bonus > (int)ez->max) {
+			ez->reach_end = 1;
+			ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->mqe_t, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
+		} else if (ez->max_t >= 0 && ez->max_q >= 0) {
+			ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar);
+		}
+		kfree(km, mem2); kfree(km, off);
+	}
 }
-#endif // __SSE2__
\ No newline at end of file
+#endif // __SSE2__

From 4f6d50a60829cc96b2e84eae2652a1e69b0d386f Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Wed, 15 Feb 2023 23:25:57 +0100
Subject: [PATCH 21/32] Move ksw2 files to a subdirectory and add license

---
 ext/README.md                   |  7 +++++++
 ext/ksw2/LICENSE.txt            | 24 ++++++++++++++++++++++++
 ext/{ => ksw2}/ksw2.h           |  0
 ext/{ => ksw2}/ksw2_extz2_sse.c |  0
 4 files changed, 31 insertions(+)
 create mode 100644 ext/ksw2/LICENSE.txt
 rename ext/{ => ksw2}/ksw2.h (100%)
 rename ext/{ => ksw2}/ksw2_extz2_sse.c (100%)

diff --git a/ext/README.md b/ext/README.md
index 55e874b7..59e3503d 100644
--- a/ext/README.md
+++ b/ext/README.md
@@ -60,3 +60,10 @@ License: See xxhash.c
 Homepage: https://github.com/mateidavid/zstr
 Commit used: 755da7890ea22478a702e3139092e6c964fab1f5
 License: See zstr/LICENSE
+
+
+## ksw2
+
+https://github.com/lh3/ksw2
+https://raw.githubusercontent.com/lh3/ksw2/06b2183b0f6646d82f2e3f5884008a1b4582f5b5/ksw2.h
+https://raw.githubusercontent.com/lh3/ksw2/06b2183b0f6646d82f2e3f5884008a1b4582f5b5/ksw2_extz2_sse.c
diff --git a/ext/ksw2/LICENSE.txt b/ext/ksw2/LICENSE.txt
new file mode 100644
index 00000000..1a06f649
--- /dev/null
+++ b/ext/ksw2/LICENSE.txt
@@ -0,0 +1,24 @@
+The MIT License
+
+Copyright (c) 2018-     Dana-Farber Cancer Institute
+              2017-2018 Broad Institute, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/ext/ksw2.h b/ext/ksw2/ksw2.h
similarity index 100%
rename from ext/ksw2.h
rename to ext/ksw2/ksw2.h
diff --git a/ext/ksw2_extz2_sse.c b/ext/ksw2/ksw2_extz2_sse.c
similarity index 100%
rename from ext/ksw2_extz2_sse.c
rename to ext/ksw2/ksw2_extz2_sse.c

From 237cb91f710fcf6ce25a0e440ca8dbc890d9d3dc Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Fri, 17 Feb 2023 15:00:20 +0100
Subject: [PATCH 22/32] Implement Aligner::ksw_extend()

---
 CMakeLists.txt  |  1 +
 src/aligner.cpp | 91 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/aligner.hpp | 10 ++++--
 3 files changed, 100 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 988b1235..d1088d22 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -63,6 +63,7 @@ add_library(salib STATIC ${SOURCES}
   ext/xxhash.c
   ext/ssw/ssw_cpp.cpp
   ext/ssw/ssw.c
+  ext/ksw2/ksw2_extz2_sse.c
 )
 target_include_directories(salib PUBLIC src/ ext/ ${PROJECT_BINARY_DIR})
 target_link_libraries(salib PUBLIC ZLIB::ZLIB Threads::Threads zstr::zstr)
diff --git a/src/aligner.cpp b/src/aligner.cpp
index 6d775739..cd7e31af 100644
--- a/src/aligner.cpp
+++ b/src/aligner.cpp
@@ -3,11 +3,15 @@
  *
  * This is for anything that returns an aln_info object, currently
  * Aligner::align and hamming_align.
+ *
+ * ksw_extend code is based on https://github.com/lh3/ksw2/blob/master/cli.c
  */
 #include <sstream>
 #include <tuple>
 #include <algorithm>
 #include <cassert>
+#include <cstring>  // memset
+#include "ksw2/ksw2.h"
 #include "aligner.hpp"
 
 AlignmentInfo Aligner::align(const std::string &query, const std::string &ref) const {
@@ -199,3 +203,90 @@ AlignmentInfo hamming_align(
     aln.query_end = segment_end;
     return aln;
 }
+
+namespace {
+
+unsigned char seq_nt4_table[256] = {
+    0, 1, 2, 3,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
+    4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
+    4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
+    4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
+    4, 4, 4, 4,  3, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
+    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
+};
+
+}  // namespace
+
+void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b)
+{
+    int i, j;
+    a = a < 0? -a : a;
+    b = b > 0? -b : b;
+    for (i = 0; i < m - 1; ++i) {
+        for (j = 0; j < m - 1; ++j)
+            mat[i * m + j] = i == j? a : b;
+        mat[i * m + m - 1] = 0;
+    }
+    for (j = 0; j < m; ++j)
+        mat[(m - 1) * m + j] = 0;
+}
+
+
+AlignmentInfo Aligner::ksw_extend(const std::string& query, const std::string& ref, bool right_align) const {
+    int w = -1; // band width; -1 is inf
+    int zdrop = -1; // -1 to disable
+    int flag = KSW_EZ_EXTZ_ONLY;
+    if (right_align) {
+        flag |= KSW_EZ_RIGHT;
+    }
+    ksw_extz_t ez;
+    memset(&ez, 0, sizeof(ksw_extz_t));
+
+    ez.max_q = ez.max_t = ez.mqe_t = ez.mte_q = -1;
+    ez.max = 0; ez.mqe = ez.mte = KSW_NEG_INF;
+    ez.n_cigar = 0;
+    int qlen = query.length();
+    int tlen = ref.length();
+    uint8_t *qseq = (uint8_t*)calloc(qlen + 33, 1);
+    uint8_t *tseq = (uint8_t*)calloc(tlen + 33, 1);
+    for (int i = 0; i < qlen; ++i)
+        qseq[i] = seq_nt4_table[(uint8_t)query[i]];
+    for (int i = 0; i < tlen; ++i)
+        tseq[i] = seq_nt4_table[(uint8_t)ref[i]];
+
+    ksw_extz2_sse(
+        nullptr, qlen, (uint8_t*)qseq, tlen, (uint8_t*)tseq, ksw_matrix_m, ksw_matrix, parameters.gap_open, parameters.gap_extend, w, zdrop, parameters.end_bonus, flag, &ez
+    );
+    free(qseq);
+    free(tseq);
+
+
+    AlignmentInfo info;
+    auto cigar = Cigar(ez.cigar, ez.n_cigar).to_eqx(query, ref);
+    info.edit_distance = cigar.edit_distance();
+    info.cigar = std::move(cigar);
+    info.ref_start = 0;
+    info.query_start = 0;
+    if (ez.reach_end) {
+        info.ref_end = ez.mqe_t + 1;
+        info.query_end = query.size();
+        info.sw_score = ez.mqe + parameters.end_bonus;
+    } else {
+        info.ref_end = ez.max_t + 1;
+        info.query_end = ez.max_q + 1;
+        info.sw_score = ez.max;
+    }
+
+    kfree(km, ez.cigar);
+    return info;
+}
diff --git a/src/aligner.hpp b/src/aligner.hpp
index 0238885f..e0c6f8c1 100644
--- a/src/aligner.hpp
+++ b/src/aligner.hpp
@@ -28,15 +28,19 @@ struct AlignmentInfo {
     int ref_span() const { return ref_end - ref_start; }
 };
 
+void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b);
+
 struct Aligner {
 public:
     Aligner(AlignmentParameters parameters)
         : parameters(parameters)
         , ssw_aligner(StripedSmithWaterman::Aligner(parameters.match, parameters.mismatch, parameters.gap_open, parameters.gap_extend))
-    { }
+    {
+        ksw_gen_simple_mat(ksw_matrix_m, ksw_matrix, parameters.match, -parameters.mismatch);
+    }
 
     AlignmentInfo align(const std::string &query, const std::string &ref) const;
-
+    AlignmentInfo ksw_extend(const std::string& query, const std::string& ref, bool right_align) const;
     AlignmentParameters parameters;
 
     unsigned calls_count() {
@@ -47,6 +51,8 @@ struct Aligner {
     const StripedSmithWaterman::Aligner ssw_aligner;
     const StripedSmithWaterman::Filter filter;
     mutable unsigned m_align_calls{0};  // no. of calls to the align() method
+    const int8_t ksw_matrix_m{5};
+    int8_t ksw_matrix[25];
 };
 
 inline int hamming_distance(const std::string &s, const std::string &t) {

From a90d6979a3070c0886df5ed4326eeaa6d2dacd76 Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Mon, 13 Mar 2023 14:37:35 +0100
Subject: [PATCH 23/32] Add operator<< for ksw_extz_t

---
 src/aligner.cpp | 21 +++++++++++++++++++++
 src/cigar.cpp   |  5 +++++
 src/cigar.hpp   |  2 ++
 3 files changed, 28 insertions(+)

diff --git a/src/aligner.cpp b/src/aligner.cpp
index cd7e31af..1702a06b 100644
--- a/src/aligner.cpp
+++ b/src/aligner.cpp
@@ -11,6 +11,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cstring>  // memset
+#include <iostream>
 #include "ksw2/ksw2.h"
 #include "aligner.hpp"
 
@@ -241,6 +242,26 @@ void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b)
         mat[(m - 1) * m + j] = 0;
 }
 
+std::ostream& operator<<(std::ostream& os, const ksw_extz_t& ez) {
+    os << "ksw_extz_t("
+        //
+        << "\n max:             " << ez.max   // max overall score
+        << "\n coord max_q:     " << ez.max_q  // max extension coordinate
+        << "\n coord max_t:     " << ez.max_t  // max extension coordinate
+
+        << "\n score mqe:       " << ez.mqe // max score when reaching the end of query
+        << "\n mqe_t:           " << ez.mqe_t // coordinate in target corresponding to mqe
+
+        << "\n score mte:       " << ez.mte  // max score when reaching the end of target
+        << "\n mte_q:           " << ez.mte_q // coordinate in query corresponding to mte
+
+        << "\n score both ends: " << ez.score  // max score reaching both ends
+        << "\n cigar:           " << Cigar(ez.cigar, ez.n_cigar)
+        << "\n zdropped:        " << ez.zdropped
+        << "\n reach_end:       " << ez.reach_end
+        << "\n)";
+    return os;
+}
 
 AlignmentInfo Aligner::ksw_extend(const std::string& query, const std::string& ref, bool right_align) const {
     int w = -1; // band width; -1 is inf
diff --git a/src/cigar.cpp b/src/cigar.cpp
index 88676d53..39abf14e 100644
--- a/src/cigar.cpp
+++ b/src/cigar.cpp
@@ -89,6 +89,11 @@ Cigar::Cigar(const std::string& cig) {
     }
 }
 
+std::ostream& operator<<(std::ostream& os, const Cigar& cigar) {
+    os << cigar.to_string();
+    return os;
+}
+
 std::string compress_cigar(const std::string& ops) {
     char prev = 0;
     int count = 0;
diff --git a/src/cigar.hpp b/src/cigar.hpp
index ced193b1..9aa45c10 100644
--- a/src/cigar.hpp
+++ b/src/cigar.hpp
@@ -92,6 +92,8 @@ class Cigar {
     std::vector<uint32_t> m_ops;
 };
 
+std::ostream& operator<<(std::ostream& os, const Cigar& cigar);
+
 std::string compress_cigar(const std::string& ops);
 
 #endif

From b89e3d78b43c86a107299c3f4faec54ee17d1cf1 Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Thu, 14 Dec 2023 10:20:30 +0100
Subject: [PATCH 24/32] Turn the ref parameter of some functions into
 string_view

---
 src/aligner.cpp | 10 +++++-----
 src/aligner.hpp |  9 +++++----
 src/aln.cpp     |  4 ++--
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/aligner.cpp b/src/aligner.cpp
index 1702a06b..2345124e 100644
--- a/src/aligner.cpp
+++ b/src/aligner.cpp
@@ -15,7 +15,7 @@
 #include "ksw2/ksw2.h"
 #include "aligner.hpp"
 
-AlignmentInfo Aligner::align(const std::string &query, const std::string &ref) const {
+AlignmentInfo Aligner::align(const std::string& query, const std::string_view ref) const {
     m_align_calls++;
     AlignmentInfo aln;
     int32_t maskLen = query.length() / 2;
@@ -30,8 +30,8 @@ AlignmentInfo Aligner::align(const std::string &query, const std::string &ref) c
 
     StripedSmithWaterman::Alignment alignment_ssw;
 
-    // query must be NULL-terminated
-    auto flag = ssw_aligner.Align(query.c_str(), ref.c_str(), ref.size(), filter, &alignment_ssw, maskLen);
+    // only query must be NULL-terminated
+    auto flag = ssw_aligner.Align(query.c_str(), ref.begin(), ref.size(), filter, &alignment_ssw, maskLen);
     if (flag != 0) {
         aln.edit_distance = 100000;
         aln.ref_start = 0;
@@ -121,7 +121,7 @@ AlignmentInfo Aligner::align(const std::string &query, const std::string &ref) c
  * of the query, once for each end.
  */
 std::tuple<size_t, size_t, int> highest_scoring_segment(
-    const std::string& query, const std::string& ref, int match, int mismatch, int end_bonus
+    const std::string& query, const std::string_view ref, int match, int mismatch, int end_bonus
 ) {
     size_t n = query.length();
 
@@ -156,7 +156,7 @@ std::tuple<size_t, size_t, int> highest_scoring_segment(
 }
 
 AlignmentInfo hamming_align(
-    const std::string &query, const std::string &ref, int match, int mismatch, int end_bonus
+    const std::string &query, const std::string_view ref, int match, int mismatch, int end_bonus
 ) {
     AlignmentInfo aln;
     if (query.length() != ref.length()) {
diff --git a/src/aligner.hpp b/src/aligner.hpp
index e0c6f8c1..55abdd3d 100644
--- a/src/aligner.hpp
+++ b/src/aligner.hpp
@@ -39,8 +39,9 @@ struct Aligner {
         ksw_gen_simple_mat(ksw_matrix_m, ksw_matrix, parameters.match, -parameters.mismatch);
     }
 
-    AlignmentInfo align(const std::string &query, const std::string &ref) const;
+    AlignmentInfo align(const std::string& query, const std::string_view ref) const;
     AlignmentInfo ksw_extend(const std::string& query, const std::string& ref, bool right_align) const;
+
     AlignmentParameters parameters;
 
     unsigned calls_count() {
@@ -55,7 +56,7 @@ struct Aligner {
     int8_t ksw_matrix[25];
 };
 
-inline int hamming_distance(const std::string &s, const std::string &t) {
+inline int hamming_distance(const std::string& s, const std::string_view t) {
     if (s.length() != t.length()){
         return -1;
     }
@@ -71,11 +72,11 @@ inline int hamming_distance(const std::string &s, const std::string &t) {
 }
 
 std::tuple<size_t, size_t, int> highest_scoring_segment(
-    const std::string& query, const std::string& ref, int match, int mismatch, int end_bonus
+    const std::string& query, const std::string_view ref, int match, int mismatch, int end_bonus
 );
 
 AlignmentInfo hamming_align(
-    const std::string &query, const std::string &ref, int match, int mismatch, int end_bonus
+    const std::string& query, const std::string_view ref, int match, int mismatch, int end_bonus
 );
 
 #endif
diff --git a/src/aln.cpp b/src/aln.cpp
index b5105bb1..02e408b5 100644
--- a/src/aln.cpp
+++ b/src/aln.cpp
@@ -213,7 +213,7 @@ inline Alignment extend_seed(
     bool consistent_nam
 ) {
     const std::string query = nam.is_rc ? read.rc : read.seq;
-    const std::string& ref = references.sequences[nam.ref_id];
+    const std::string_view ref = references.sequences[nam.ref_id];
 
     const auto projected_ref_start = nam.projected_ref_start();
     const auto projected_ref_end = std::min(nam.ref_end + query.size() - nam.query_end, ref.size());
@@ -222,7 +222,7 @@ inline Alignment extend_seed(
     int result_ref_start;
     bool gapped = true;
     if (projected_ref_end - projected_ref_start == query.size() && consistent_nam) {
-        std::string ref_segm_ham = ref.substr(projected_ref_start, query.size());
+        std::string_view ref_segm_ham = ref.substr(projected_ref_start, query.size());
         auto hamming_dist = hamming_distance(query, ref_segm_ham);
 
         if (hamming_dist >= 0 && (((float) hamming_dist / query.size()) < 0.05) ) { //Hamming distance worked fine, no need to ksw align

From 36513bc8a53a43be6b8075fb7c4bed205f77c34c Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Fri, 15 Dec 2023 15:25:45 +0100
Subject: [PATCH 25/32] Add operator<< for AlignmentInfo

---
 src/aligner.cpp | 10 ++++++++++
 src/aligner.hpp |  2 ++
 2 files changed, 12 insertions(+)

diff --git a/src/aligner.cpp b/src/aligner.cpp
index 2345124e..999aea98 100644
--- a/src/aligner.cpp
+++ b/src/aligner.cpp
@@ -311,3 +311,13 @@ AlignmentInfo Aligner::ksw_extend(const std::string& query, const std::string& r
     kfree(km, ez.cigar);
     return info;
 }
+
+std::ostream& operator<<(std::ostream& os, const AlignmentInfo& info) {
+    os << "AlignmentInfo(cigar='" << info.cigar
+        << "', ref=" << info.ref_start << ".." << info.ref_end
+        << ", query=" << info.query_start << ".." << info.query_end
+        << ", NM=" << info.edit_distance
+        << ", AS=" << info.sw_score
+        << ")";
+    return os;
+}
diff --git a/src/aligner.hpp b/src/aligner.hpp
index 55abdd3d..780c18ae 100644
--- a/src/aligner.hpp
+++ b/src/aligner.hpp
@@ -28,6 +28,8 @@ struct AlignmentInfo {
     int ref_span() const { return ref_end - ref_start; }
 };
 
+std::ostream& operator<<(std::ostream& os, const AlignmentInfo& info);
+
 void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b);
 
 struct Aligner {

From 92d221b11bb7f60f5ef23bcf70faa34d49b27274 Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Fri, 15 Dec 2023 15:27:10 +0100
Subject: [PATCH 26/32] Explicitly set a wildcard score in the ksw score matrix

Otherwise, any aligned pair that involves a wildcard (non-ACGT) base
gets a score of 0, which is inconsistent with SSW alignments.
---
 src/aligner.cpp | 7 ++++---
 src/aligner.hpp | 5 +++--
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/aligner.cpp b/src/aligner.cpp
index 999aea98..77e0b495 100644
--- a/src/aligner.cpp
+++ b/src/aligner.cpp
@@ -228,18 +228,19 @@ unsigned char seq_nt4_table[256] = {
 
 }  // namespace
 
-void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b)
+void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t wildcard_score)
 {
     int i, j;
     a = a < 0? -a : a;
     b = b > 0? -b : b;
+    wildcard_score = wildcard_score > 0 ? -wildcard_score : wildcard_score;
     for (i = 0; i < m - 1; ++i) {
         for (j = 0; j < m - 1; ++j)
             mat[i * m + j] = i == j? a : b;
-        mat[i * m + m - 1] = 0;
+        mat[i * m + m - 1] = wildcard_score;
     }
     for (j = 0; j < m; ++j)
-        mat[(m - 1) * m + j] = 0;
+        mat[(m - 1) * m + j] = wildcard_score;
 }
 
 std::ostream& operator<<(std::ostream& os, const ksw_extz_t& ez) {
diff --git a/src/aligner.hpp b/src/aligner.hpp
index 780c18ae..e950ac8b 100644
--- a/src/aligner.hpp
+++ b/src/aligner.hpp
@@ -30,7 +30,7 @@ struct AlignmentInfo {
 
 std::ostream& operator<<(std::ostream& os, const AlignmentInfo& info);
 
-void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b);
+void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t wildcard_score);
 
 struct Aligner {
 public:
@@ -38,7 +38,8 @@ struct Aligner {
         : parameters(parameters)
         , ssw_aligner(StripedSmithWaterman::Aligner(parameters.match, parameters.mismatch, parameters.gap_open, parameters.gap_extend))
     {
-        ksw_gen_simple_mat(ksw_matrix_m, ksw_matrix, parameters.match, -parameters.mismatch);
+        int8_t wildcard_score = -parameters.mismatch;
+        ksw_gen_simple_mat(ksw_matrix_m, ksw_matrix, parameters.match, -parameters.mismatch, wildcard_score);
     }
 
     AlignmentInfo align(const std::string& query, const std::string_view ref) const;

From 13938c55bba7ef8cb816bf7ef1eaa36de94bd5d0 Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Fri, 15 Dec 2023 15:29:42 +0100
Subject: [PATCH 27/32] Pass ref as a string_view

---
 src/aligner.cpp | 2 +-
 src/aligner.hpp | 2 +-
 src/cigar.cpp   | 2 +-
 src/cigar.hpp   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/aligner.cpp b/src/aligner.cpp
index 77e0b495..442c8743 100644
--- a/src/aligner.cpp
+++ b/src/aligner.cpp
@@ -264,7 +264,7 @@ std::ostream& operator<<(std::ostream& os, const ksw_extz_t& ez) {
     return os;
 }
 
-AlignmentInfo Aligner::ksw_extend(const std::string& query, const std::string& ref, bool right_align) const {
+AlignmentInfo Aligner::ksw_extend(const std::string& query, const std::string_view ref, bool right_align) const {
     int w = -1; // band width; -1 is inf
     int zdrop = -1; // -1 to disable
     int flag = KSW_EZ_EXTZ_ONLY;
diff --git a/src/aligner.hpp b/src/aligner.hpp
index e950ac8b..3a1c02a5 100644
--- a/src/aligner.hpp
+++ b/src/aligner.hpp
@@ -43,7 +43,7 @@ struct Aligner {
     }
 
     AlignmentInfo align(const std::string& query, const std::string_view ref) const;
-    AlignmentInfo ksw_extend(const std::string& query, const std::string& ref, bool right_align) const;
+    AlignmentInfo ksw_extend(const std::string& query, const std::string_view ref, bool right_align) const;
 
     AlignmentParameters parameters;
 
diff --git a/src/cigar.cpp b/src/cigar.cpp
index 39abf14e..c2799a1c 100644
--- a/src/cigar.cpp
+++ b/src/cigar.cpp
@@ -17,7 +17,7 @@ Cigar Cigar::to_m() const {
     return cigar;
 }
 
-Cigar Cigar::to_eqx(const std::string& query, const std::string& ref) const {
+Cigar Cigar::to_eqx(const std::string& query, const std::string_view ref) const {
     size_t i = 0, j = 0;
     Cigar cigar;
     for (auto op_len : m_ops) {
diff --git a/src/cigar.hpp b/src/cigar.hpp
index 9aa45c10..12966fcd 100644
--- a/src/cigar.hpp
+++ b/src/cigar.hpp
@@ -85,7 +85,7 @@ class Cigar {
     Cigar to_m() const;
 
     /* Return a new Cigar that uses =/X instead of M */
-    Cigar to_eqx(const std::string& query, const std::string& ref) const;
+    Cigar to_eqx(const std::string& query, const std::string_view ref) const;
 
     std::string to_string() const;
 

From 9d72201f03cc0e9f1402a85b447ee1cfcbe15de0 Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Fri, 15 Dec 2023 15:30:18 +0100
Subject: [PATCH 28/32] Make ksw actually use the score matrix

Apparently, the KSW_EZ_GENERIC_SC flag is required for ksw to use the supplied score matrix.
---
 src/aligner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/aligner.cpp b/src/aligner.cpp
index 442c8743..1260e9bf 100644
--- a/src/aligner.cpp
+++ b/src/aligner.cpp
@@ -267,7 +267,7 @@ std::ostream& operator<<(std::ostream& os, const ksw_extz_t& ez) {
 AlignmentInfo Aligner::ksw_extend(const std::string& query, const std::string_view ref, bool right_align) const {
     int w = -1; // band width; -1 is inf
     int zdrop = -1; // -1 to disable
-    int flag = KSW_EZ_EXTZ_ONLY;
+    int flag = KSW_EZ_EXTZ_ONLY | KSW_EZ_GENERIC_SC;
     if (right_align) {
         flag |= KSW_EZ_RIGHT;
     }

From c36cc0a0b1c08c3026acd60563e59628c343d7d5 Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Fri, 15 Dec 2023 15:31:14 +0100
Subject: [PATCH 29/32] Return empty alignment if query is empty

---
 src/aligner.cpp | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/aligner.cpp b/src/aligner.cpp
index 1260e9bf..3c05f3d9 100644
--- a/src/aligner.cpp
+++ b/src/aligner.cpp
@@ -265,6 +265,18 @@ std::ostream& operator<<(std::ostream& os, const ksw_extz_t& ez) {
 }
 
 AlignmentInfo Aligner::ksw_extend(const std::string& query, const std::string_view ref, bool right_align) const {
+    AlignmentInfo info;
+    if (query.size() == 0) {
+        info.cigar = Cigar();
+        info.edit_distance = 0;
+        info.ref_start = 0;
+        info.query_start = 0;
+        info.ref_end = 0;
+        info.query_end = 0;
+        info.sw_score = parameters.end_bonus;
+        return info;
+    }
+
     int w = -1; // band width; -1 is inf
     int zdrop = -1; // -1 to disable
     int flag = KSW_EZ_EXTZ_ONLY | KSW_EZ_GENERIC_SC;
@@ -292,8 +304,6 @@ AlignmentInfo Aligner::ksw_extend(const std::string& query, const std::string_vi
     free(qseq);
     free(tseq);
 
-
-    AlignmentInfo info;
     auto cigar = Cigar(ez.cigar, ez.n_cigar).to_eqx(query, ref);
     info.edit_distance = cigar.edit_distance();
     info.cigar = std::move(cigar);

From 7d23a81e91fb8390bd19f87e9f3525837ed21b11 Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Fri, 15 Dec 2023 15:31:49 +0100
Subject: [PATCH 30/32] Add soft clipping to CIGAR if necessary

---
 src/aligner.cpp |  2 ++
 src/cigar.hpp   | 12 ++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/src/aligner.cpp b/src/aligner.cpp
index 3c05f3d9..93e8396c 100644
--- a/src/aligner.cpp
+++ b/src/aligner.cpp
@@ -317,7 +317,9 @@ AlignmentInfo Aligner::ksw_extend(const std::string& query, const std::string_vi
         info.ref_end = ez.max_t + 1;
         info.query_end = ez.max_q + 1;
         info.sw_score = ez.max;
+        info.cigar.push(CIGAR_SOFTCLIP, query.length() - info.query_end);
     }
+    assert(info.cigar.derived_sequence_length() == query.length());
 
     kfree(km, ez.cigar);
     return info;
diff --git a/src/cigar.hpp b/src/cigar.hpp
index 12966fcd..931ab8a1 100644
--- a/src/cigar.hpp
+++ b/src/cigar.hpp
@@ -77,6 +77,18 @@ class Cigar {
         return dist;
     }
 
+    size_t derived_sequence_length() const {  // total length of all match/eq/mismatch, insertion and soft-clip runs
+        size_t length = 0;
+        for (auto op_len : m_ops) {
+            auto op = op_len & 0xf;  // low 4 bits hold the operation code
+            auto len = op_len >> 4;  // remaining high bits hold the run length
+            if (op == CIGAR_MATCH || op == CIGAR_EQ || op == CIGAR_X || op == CIGAR_INS || op == CIGAR_SOFTCLIP) {  // any other operation is not counted
+                length += len;
+            }
+        }
+        return length;
+    }
+
     void reverse() {
         std::reverse(m_ops.begin(), m_ops.end());
     }

From 26b472e7a748aabab5e46328622b1cd46a0c0841 Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Fri, 15 Dec 2023 15:33:02 +0100
Subject: [PATCH 31/32] If ungapped alignment is soft-clipped, ksw_extend
 the soft-clipped ends

It is possible that gapped alignment gives a better score than soft-clipping
based on ungapped alignment.

Closes #357
---
 CHANGES.md         |  3 +++
 src/aligner.hpp    |  1 +
 src/aln.cpp        | 45 +++++++++++++++++++++++++++++++++++++++++----
 src/cigar.hpp      |  4 ++++
 tests/phix.1.fastq |  8 ++++----
 tests/phix.pe.sam  |  4 ++--
 tests/phix.se.sam  |  4 ++--
 7 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 2d907665..f9b67f88 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -21,6 +21,9 @@
 * Include [ZStr](https://github.com/mateidavid/zstr/) in our own repository
   instead of downloading it at build time. This should make it possible to
   build strobealign without internet access.
+* #357: Fix some suboptimal alignment ends. Sometimes an end was soft-clipped
+  although a better alignment with an insertion or deletion existed that
+  extends to the end of the read.
 
 ## v0.12.0 (2023-11-23)
 
diff --git a/src/aligner.hpp b/src/aligner.hpp
index 3a1c02a5..77f83c23 100644
--- a/src/aligner.hpp
+++ b/src/aligner.hpp
@@ -26,6 +26,7 @@ struct AlignmentInfo {
     int sw_score{0};
 
     int ref_span() const { return ref_end - ref_start; }
+    int query_span() const { return query_end - query_start; }
 };
 
 std::ostream& operator<<(std::ostream& os, const AlignmentInfo& info);
diff --git a/src/aln.cpp b/src/aln.cpp
index 02e408b5..a8bae9c3 100644
--- a/src/aln.cpp
+++ b/src/aln.cpp
@@ -222,11 +222,48 @@ inline Alignment extend_seed(
     int result_ref_start;
     bool gapped = true;
     if (projected_ref_end - projected_ref_start == query.size() && consistent_nam) {
-        std::string_view ref_segm_ham = ref.substr(projected_ref_start, query.size());
-        auto hamming_dist = hamming_distance(query, ref_segm_ham);
+        const std::string_view projected_ref = ref.substr(projected_ref_start, query.size());
+        info = hamming_align(query, projected_ref, aligner.parameters.match, aligner.parameters.mismatch, aligner.parameters.end_bonus);
+
+        if (info.edit_distance + (query.length() - info.query_span()) < 0.05 * query.length()) {
+            if (info.query_end < query.length()) {
+                // Right end is soft clipped, do gapped alignment on it
+                const std::string query_right = query.substr(info.query_end);
+                const int ext_right = std::min(ref.size() - projected_ref_end, size_t(50));
+                const std::string_view ref_right = ref.substr(projected_ref_start + info.ref_end, ext_right);
+                auto right = aligner.ksw_extend(query_right, ref_right, false);
+                info.query_end += right.query_end;
+                info.ref_end += right.ref_end;
+                info.edit_distance += right.edit_distance;
+                info.sw_score += right.sw_score;
+                assert(!info.cigar.empty());
+                info.cigar.pop_oplen();
+                info.cigar += right.cigar;
+            }
+
+            if (info.query_start > 0) {
+                // Left end is soft clipped, do gapped alignment on it
+                std::string query_left = query.substr(0, info.query_start);
+                const int ext_left = std::min(50, projected_ref_start);
+                const int ref_start = projected_ref_start - ext_left;
+                std::string ref_left{ref.substr(ref_start, ext_left + info.ref_start)};
+                std::reverse(query_left.begin(), query_left.end());
+                std::reverse(ref_left.begin(), ref_left.end());
+                auto left = aligner.ksw_extend(query_left, ref_left, true);
+                info.query_start -= left.query_end;
+                info.ref_start -= left.ref_end;
+                info.edit_distance += left.edit_distance;
+                info.sw_score += left.sw_score;
+
+                // TODO this just removes the soft-clipping from the beginning,
+                // a bit too complicated
+                info.cigar.reverse();
+                info.cigar.pop_oplen();
+                info.cigar += left.cigar;
+                info.cigar.reverse();
+            }
 
-        if (hamming_dist >= 0 && (((float) hamming_dist / query.size()) < 0.05) ) { //Hamming distance worked fine, no need to ksw align
-            info = hamming_align(query, ref_segm_ham, aligner.parameters.match, aligner.parameters.mismatch, aligner.parameters.end_bonus);
+            assert(info.cigar.derived_sequence_length() == query.length());
             result_ref_start = projected_ref_start + info.ref_start;
             gapped = false;
         }
diff --git a/src/cigar.hpp b/src/cigar.hpp
index 931ab8a1..55bda51d 100644
--- a/src/cigar.hpp
+++ b/src/cigar.hpp
@@ -93,6 +93,10 @@ class Cigar {
         std::reverse(m_ops.begin(), m_ops.end());
     }
 
+    void pop_oplen() {  // removes the last element of m_ops (one packed operation/length entry)
+        m_ops.pop_back();  // NOTE(review): no emptiness check here; confirm every caller guarantees a non-empty Cigar
+    }
+
     /* Return a new Cigar that uses M instead of =/X */
     Cigar to_m() const;
 
diff --git a/tests/phix.1.fastq b/tests/phix.1.fastq
index 148f0248..1ed483a6 100644
--- a/tests/phix.1.fastq
+++ b/tests/phix.1.fastq
@@ -31,9 +31,9 @@ NTCATTTGGCGAGAAAGCTCAGTCTCAGGAGGAAGCGGAGCAGGCCAAATGTTTTTGAGATGGCAGCAACGGAAACCATA
 +
 #88ABCFG<EC:CCCF<,6CFGE<;EFEGFE@;F,CFB7F7CF,6BCCCFFCFEE@CGCGGDCCECEFGC@@FGEEGGFGFC=?F@FCC=EGGF?CFFGAFFFFG<?FGF<EDFGC<FEG7CE=ECGGGG8EGFEFGGF+>=>+@+3,3CFB<CFGCF,EFGCGBFGGG9FGCFD:,@=@FFGGFF9CFGGGGGGGG6@CGCGGGGGGGGC7F:7CDB<CEGCGGGG46DC4CD37BCAFGFGAFA77AFFA;FAFC;CA:464<9<?#################################
 @SRR1377138.9
-NTTTGTTAACGTATTTAGCCACATAGAAACCAACAGCCATATAACTGGTAGCTTTAAGCGGCTCACCTTTAGCATCAACAGGCCACAACCAACCAGAACGTGAAAAAGCGTCCTGCGTGTAGCGAACTGCGATGGGCATACTGTAACCATAAGGCCACGTATTTTGCAAGCTATTTAACTGGCGGCGATTGCGTACCCGACGACCAAAATTAGGGTCAACGCTACCTGTAGGAAGTGTCCGCATAAAGTGCACCGCATGGAAATGAAGACGGCCATTAGCTGTACCATACTCAGGCACACA
+NTTTGTTAACGTATTTAGCCACATAGAAACCAACAGCCATATAACTGGTAGCTTTAAGCGGCTCACCTTTAGCATCAACAGGCCACAACCAACCAGAACGTGAAAAAGCGTCCTGCGTGTAGCGAACTGCGATGGGCATACTGTAACCATAAGGCCACGTATTTTGCAAGCTATTTAACTGGCGGCGATTGCGTACCCGACGACCAAAATTAGGGTCAACGCTACCTGTAGGAAGTGTCCGCATAAAGTGCACCGCATGGAAATGAAGACGGCCATTAGCTGTACCATACTCAGGACACA
 +
-#8BCCGGGGGGGGGGGGGGGCEFFGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGEDGGGGGGGGGGFGGGFGFGGFGFGFF=7??FFFEBBFBFBBFFFFFFF:-044;?FFFFFFFFABFB?FFF<?:<?FFB?
+#8BCCGGGGGGGGGGGGGGGCEFFGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGEDGGGGGGGGGGFGGGFGFGGFGFGFF=7??FFFEBBFBFBBFFFFFFF:-044;?FFFFFFFFABFB?FFF<?:?FFB?
 @SRR1377138.10
 NAAATGATTGGCGTATCCAACCTGCAGAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTG
 +
@@ -47,9 +47,9 @@ NAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCAGAGTTTTATCGCTTCCATGACGC
 +
 #8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGDCGGGGGGGGGGFGGGGGGGFGFGGFGFGFFFFFFFFFFFAFFFFFFFFFFFA3.1.48?A<:?FFFFFFBDFFFF?1<BAFFF#
 @SRR1377138.13
-NTATGTGGCTAAATACGTTAACAAAAAGTCAGATATGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACTTCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTGTCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTT
+NTATGTGGCTAAATACGTTAACAAAAAGTCAGATATGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACTTCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTGTCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCGACGTT
 +
-#8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFCFGGFGFGFFFFFFFFFFFDFFFFB:>FFFFF2.1.4::?FFFFF?BAAFFFFFFDFF>>9>9?
+#8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFCFGGFGFGFFFFFFFFFFFDFFFFB:>FFFFF2.1.4::?FFFFF?BAAFFFFFFDF>>9>9?
 @SRR1377138.14
 NTCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTAGATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATCTGAGTCCGATGCTGTT
 +
diff --git a/tests/phix.pe.sam b/tests/phix.pe.sam
index ef4157d7..ccf0dbca 100644
--- a/tests/phix.pe.sam
+++ b/tests/phix.pe.sam
@@ -17,7 +17,7 @@ SRR1377138.7	83	NC_001422.1	256	60	300=1X	=	141	-416	GGCACGTTCGTCAAGGACTGGTTTAGA
 SRR1377138.7	163	NC_001422.1	141	60	1X299=1X	=	256	416	NTTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTAGATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATCTGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTTTCCAGACCGCTTTGGCCTCTT	#8A<BFCFEGFFGGF9C:CFCEGGCCGECFF@EEGGGGGGFGCEEGEFGEGGECFCFGFGG<<F@@EFFEFGGDCFGGCFGGC:CB:68AA,BBC=FDA8FAFG8=FFFFEFCF<A+A??=FGCCFEFG:?F=AFDGGGGGFCFFFFCFGGGGGFGGFGGFC,7DDFFFFGGGGGFFEGGGFGGGGGG@7;8=+=36=DEDCABFF>F6BFFCFFF4AA>A4A4>:E>7?-55>;(>?E38*878CB6@EC7C;6C>C<)6<C))))/;;7459,2;>FFF@E>2)(4179:BB?FB####	NM:i:2	AS:i:602	RG:Z:1
 SRR1377138.8	83	NC_001422.1	1254	60	124=1X132=1X42=1X	=	1179	-376	TCATGAAGGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTTGGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCATGGTTATTTGAATATCTATAACAACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGCTCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGCCTGCTCCGCTTCCTCCTGAGACTGAGCTTTCTCGCCAAATGAN	#################################?<9<464:AC;CFAF;AFFA77AFAGFGFACB73DC4CD64GGGGCGEC<BDC7:F7CGGGGGGGGCGC@6GGGGGGGGFC9FFGGFF@=@,:DFCGF9GGGFBGCGFE,FCGFC<BFC3,3+@+>=>+FGGFEFGE8GGGGCE=EC7GEF<CGFDE<FGF?<GFFFFAGFFC?FGGE=CCF@F?=CFGFGGEEGF@@CGFECECCDGGCGC@EEFCFFCCCB6,FC7F7BFC,F;@EFGEFE;<EGFC6,<FCCC:CE<GFCBA88#	NM:i:3	AS:i:592	RG:Z:1
 SRR1377138.8	163	NC_001422.1	1179	60	1X79=1X112=1X88=1X15=1X2=	=	1254	376	NTATTGACTCTACTGTAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGCAGGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTTGGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTTTTTCAGGGTTATTTGAATATCTATAACAACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCCAGATGATGCTCGTTACGG	#8AAC,6C;EC@E,@FA,<CC<F,,CFFCGGEGAEECFGFFGG<E@@@@F:CCA<F<EF,@;;CC@@EC<FFGDFEF@<,,66,,B<,F<BC<5F@,,95BEFD,?9FEFEFGGCFE?BE=FC=?FFF:;AFCFEG;E=FGFFBE+CFEFEG9=B?FGGGGDD89=@DGFGGGD8DDGFFD?8A8,=+8=FFD+?DDFD+98F?F8C?+CF;3?===F7C7?@@)=A0@EF)*>EFF5>A@FFE=<)1*-..:@29@29@0:1;3=6@=(544?4)9;C>5))7).474CB)99?######	NM:i:5	AS:i:572	RG:Z:1
-SRR1377138.9	83	NC_001422.1	4731	60	300=1X	=	4650	-382	TGTGTGCCTGAGTATGGTACAGCTAATGGCCGTCTTCATTTCCATGCGGTGCACTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTCGTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCATCGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAGCCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAN	?BFF?<:?<FFF?BFBAFFFFFFFF?;440-:FFFFFFFBBFBFBBEFFF??7=FFGFGFGGFGFGGGFGGGGGGGGGGDEGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGFFECGGGGGGGGGGGGGGGCCB8#	NM:i:1	AS:i:612	RG:Z:1
+SRR1377138.9	83	NC_001422.1	4731	60	5=1D294=1X	=	4650	-382	TGTGTCCTGAGTATGGTACAGCTAATGGCCGTCTTCATTTCCATGCGGTGCACTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTCGTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCATCGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAGCCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAN	?BFF?:?<FFF?BFBAFFFFFFFF?;440-:FFFFFFFBBFBFBBEFFF??7=FFGFGFGGFGFGGGFGGGGGGGGGGDEGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGFFECGGGGGGGGGGGGGGGCCB8#	NM:i:2	AS:i:597	RG:Z:1
 SRR1377138.9	163	NC_001422.1	4650	60	1X253=1X45=1X	=	4731	382	NTTCGTGATATTGGTCGTATGGTTCTTGCTGCCGAGGGTCGCAAGGCTAATGATTCACACGCCGACTGCTATCAGTATTTTTGTGTGCCTGAGTATGGTACAGCTAATGGCCGTCTTCATTTCCATGCGGTGCACTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTCGTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCATCGCCGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTTGTGGG	#8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGDGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG9EFGEGEGGGGGGGGGGFFGGGGGGGGGGGGGGGGGGGAEGGGGGGGGGGGGGGGGGFGGFFG;BAFFFAFFFFFFF>FCF@FFF?EFFF@EFEFFFFEEEB?FD<<)8083:<F((4:??01,72??##################################	NM:i:3	AS:i:592	RG:Z:1
 SRR1377138.10	99	NC_001422.1	1	60	26S275=	=	58	358	NAAATGATTGGCGTATCCAACCTGCAGAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTG	#8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGDGGGGGGGGGGGGGGGGGGGGFBFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGFGGFGFGFFFFFFFFFFFFFFFFFF?BFFFF2.8:<BFFFFFFFFFFBFFFBF?BBF???F?B	NM:i:0	AS:i:560	RG:Z:1
 SRR1377138.10	147	NC_001422.1	58	60	300=1X	=	1	-358	GAAAAATTATCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTAGATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATCTGAGTCCN	###FBDA<5)2<BAAFBA>?>FFDFFECC49FFD?3<@FFFFEDEC?3E?C?0<FFFFFFFEFFDFBFFFFFFFFE6E;FFFFFF>EFFFFFFF>@5FECFFGGGFGD:FGGCDGFGGGGGG>GGGGGGGDGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCCB8#	NM:i:1	AS:i:612	RG:Z:1
@@ -25,7 +25,7 @@ SRR1377138.11	99	NC_001422.1	5117	60	1X268=32S	=	5220	270	NAAGCTGTTCAGAATCAGAATG
 SRR1377138.11	147	NC_001422.1	5220	60	167=134S	=	5117	-270	GACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACCTGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCAGAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAAN	########BFFBA>82><9B>4<2BAA2A<<:?:4,.(B?C?.37;)EE<;)C?6(((;FFFFFEE?7)?9/);FDFB>A=FDAFFFFFFFDEDDCCFAGFFGGGGGGF:GGGFGGGGGFGGGGFGCGGEECDEFCCGGFGGGGFGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCCA8#	NM:i:0	AS:i:344	RG:Z:1
 SRR1377138.12	99	NC_001422.1	1	60	57S238=1X5=	=	28	328	NAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCAGAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGATGTGGC	#8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGDCGGGGGGGGGGFGGGGGGGFGFGGFGFGFFFFFFFFFFFAFFFFFFFFFFFA3.1.48?A<:?FFFFFFBDFFFF?1<BAFFF#	NM:i:1	AS:i:488	RG:Z:1
 SRR1377138.12	147	NC_001422.1	28	60	211=1X88=1X	=	1	-328	GTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGATGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTAGATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTN	#FFFA<600B??A<4FA?BF?AFB?21DD9FFFFFECEFE<CFECEFFFFFFFFFEE?FFFFFFBBFFC9BFFFFFFFCFEFDBFFFFFFFFFFFFFFFGFFGGGGGF>GGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCCB8#	NM:i:2	AS:i:602	RG:Z:1
-SRR1377138.13	99	NC_001422.1	5006	60	1X300=	=	5049	344	NTATGTGGCTAAATACGTTAACAAAAAGTCAGATATGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACTTCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTGTCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTT	#8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFCFGGFGFGFFFFFFFFFFFDFFFFB:>FFFFF2.1.4::?FFFFF?BAAFFFFFFDFF>>9>9?	NM:i:1	AS:i:612	RG:Z:1
+SRR1377138.13	99	NC_001422.1	5006	60	1X293=1D6=	=	5049	344	NTATGTGGCTAAATACGTTAACAAAAAGTCAGATATGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACTTCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTGTCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCGACGTT	#8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFCFGGFGFGFFFFFFFFFFFDFFFFB:>FFFFF2.1.4::?FFFFF?BAAFFFFFFDF>>9>9?	NM:i:2	AS:i:597	RG:Z:1
 SRR1377138.13	147	NC_001422.1	5049	60	300=1X	=	5006	-344	GCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACTTCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTGTCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACCTGTGACGACAAATCTGCTCAAATTTATGN	#FFFFBFFFBD>A<::64>AAFFFB><<A>?FFBABBAFFFFFFBAFFFFFFFFFFFFFE;@F@FFFFFFFD@FE?FFFFFFFFFFFFFEBFFFFFFFFGFFGGGDGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGDGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCGGCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCCC8#	NM:i:1	AS:i:612	RG:Z:1
 SRR1377138.14	99	NC_001422.1	66	60	1X300=	=	156	391	NTCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTAGATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATCTGAGTCCGATGCTGTT	#8@CCGGGGGGFFGGGGGGGGGGGGGFGGGGGGGGGGGGGFGGEEFGGGGGGGGGGGGGFGDCFEFGGGGGFGGGGGGGGGGGGGGGGGGGG7FGGGGGGGGGGGGGGGGGGGDFGGGGGGGFGGEGGGGGFGGGGGGGGGGGGGGGGGFGGGGGGGGGGDGGGGGGGGDGGGGGGGFEGGGGGGGFGGFFGFGGGGGGGGGFGGGFG7FGGFGDGGGGGFFCGGGGGGGGGGGGGFGFGGC>FGCF=A@FFFFFFFFFFEFFFFFFE<=/04(34:5<BFFA<AFFAAABFF>01:AB>4	NM:i:1	AS:i:612	RG:Z:1
 SRR1377138.14	147	NC_001422.1	156	60	300=1X	=	66	-391	GCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTAGATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATCTGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTTTCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGN	############?>BFAAA45FFFFB:7/(9>9;B@=63?FFED>52FE?>5*>852BCEEFFAA@A>8@8C>=7EFCFFEAFFFF:FAFFFFFFFFD?9*DDGGGGGDGGGGGGFGFGGFFCFFGDGFGGGGGFGFEF?GGFCGGGGGGGGGGGGFEEGFFGGFE9GGGFFGFAGGGGGGGGFAGGGFGFFFGGGGGGGEFAFFCFGGGF@GF@AE@FFCEF<FGFGFDGGGGEEFAFGAGGFFFFCGFCFGGFDF@FEEGGE@C,GF@CCCF@CGGGFEEC,GGFF@FFEF<FF@@A8#	NM:i:1	AS:i:612	RG:Z:1
diff --git a/tests/phix.se.sam b/tests/phix.se.sam
index 4ce2a053..58b66faf 100644
--- a/tests/phix.se.sam
+++ b/tests/phix.se.sam
@@ -9,11 +9,11 @@ SRR1377138.5	0	NC_001422.1	2494	60	1X236=1X63=	*	0	0	NCACCTAAAGCTACATCGTCAACGTTA
 SRR1377138.6	0	NC_001422.1	648	60	1X184=1X115=	*	0	0	NATCCCGTCAACATTCAAACGGCCTGTCTCATCATGGAAGGCGCTGAATTTACGGAAAACATTATTAATGGCGTCGAGCGTCCGGTTAAAGCCGCTGAATTGTTCGCGTTTACCTTGCGTGTACGCGCAGGAAACACTGACGTTCTTACTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCAGAAGGAGTGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACTAAAGGCAAGCGTAAAGGCGCTCGTCTTTGGTATGTAGG	#8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGDGGGGGGFGFGGFGFGFFFFFF>EEFFFFFFFFFFFFFFF0.044:6B0>BF>003>>:(:<>B<ADFFFFF	NM:i:2	AS:i:602	RG:Z:1
 SRR1377138.7	16	NC_001422.1	256	60	300=1X	*	0	0	GGCACGTTCGTCAAGGACTGGTTTAGATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATCTGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTTTCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGATTTCGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGTTGN	46:800>?FFFA>:5)9FFFFDAAB<4<60.,AB>FFFFDFFEFA6FFFFFFA8=FGFCFCGFGFF7CGGGGFGGGGFECCGGGGGGGGGGGGGGGGGGGEGGGGGGGFFFGGGGGFEFGGCGGGGGGGGFGEAFGGGFGGGGGGGGGGFCGFFGEGGGGGGGGFGGGGGGGGGDFCFFF<9GGGGGGGGGGGGGGGFFF=8GFGGGGGGGGGGGGGGGGGGEGGGEFFEFGGGGGGGGGGGEFGFFEFGGEFDFCFGGFCCGGDFGGGGGFCGGGGGEGEFGEDEGGGEGGGGGGBCA8#	NM:i:1	AS:i:612	RG:Z:1
 SRR1377138.8	16	NC_001422.1	1254	60	124=1X132=1X42=1X	*	0	0	TCATGAAGGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTTGGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCATGGTTATTTGAATATCTATAACAACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGCTCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGCCTGCTCCGCTTCCTCCTGAGACTGAGCTTTCTCGCCAAATGAN	#################################?<9<464:AC;CFAF;AFFA77AFAGFGFACB73DC4CD64GGGGCGEC<BDC7:F7CGGGGGGGGCGC@6GGGGGGGGFC9FFGGFF@=@,:DFCGF9GGGFBGCGFE,FCGFC<BFC3,3+@+>=>+FGGFEFGE8GGGGCE=EC7GEF<CGFDE<FGF?<GFFFFAGFFC?FGGE=CCF@F?=CFGFGGEEGF@@CGFECECCDGGCGC@EEFCFFCCCB6,FC7F7BFC,F;@EFGEFE;<EGFC6,<FCCC:CE<GFCBA88#	NM:i:3	AS:i:592	RG:Z:1
-SRR1377138.9	16	NC_001422.1	4731	60	300=1X	*	0	0	TGTGTGCCTGAGTATGGTACAGCTAATGGCCGTCTTCATTTCCATGCGGTGCACTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTCGTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCATCGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAGCCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAN	?BFF?<:?<FFF?BFBAFFFFFFFF?;440-:FFFFFFFBBFBFBBEFFF??7=FFGFGFGGFGFGGGFGGGGGGGGGGDEGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGFFECGGGGGGGGGGGGGGGCCB8#	NM:i:1	AS:i:612	RG:Z:1
+SRR1377138.9	16	NC_001422.1	4731	60	5=1D294=1X	*	0	0	TGTGTCCTGAGTATGGTACAGCTAATGGCCGTCTTCATTTCCATGCGGTGCACTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTCGTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCATCGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAGCCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAN	?BFF?:?<FFF?BFBAFFFFFFFF?;440-:FFFFFFFBBFBFBBEFFF??7=FFGFGFGGFGFGGGFGGGGGGGGGGDEGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGFFECGGGGGGGGGGGGGGGCCB8#	NM:i:2	AS:i:597	RG:Z:1
 SRR1377138.10	0	NC_001422.1	1	60	26S275=	*	0	0	NAAATGATTGGCGTATCCAACCTGCAGAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTG	#8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGDGGGGGGGGGGGGGGGGGGGGFBFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGFGGFGFGFFFFFFFFFFFFFFFFFF?BFFFF2.8:<BFFFFFFFFFFBFFFBF?BBF???F?B	NM:i:0	AS:i:560	RG:Z:1
 SRR1377138.11	0	NC_001422.1	5117	60	1X268=32S	*	0	0	NAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTGTCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACCTGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCGGAGTTTTATCGCTTCCATGACGCAGAAGTTA	#8ACCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFFGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGDGGGGF8FFGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGFGGGCFGGGGGGGGGGEE5EGGGGGGGGGFGGGGFDGEGDGGDGGGE7FFGGG>*?*:<>9BDECDFGF<*9@F?*7B?;F;F;B:AFFFFFF7*(03:0:AAAFFF:18?BA:<B##########	NM:i:1	AS:i:538	RG:Z:1
 SRR1377138.12	0	NC_001422.1	1	60	57S238=1X5=	*	0	0	NAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCAGAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGATGTGGC	#8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGDCGGGGGGGGGGFGGGGGGGFGFGGFGFGFFFFFFFFFFFAFFFFFFFFFFFA3.1.48?A<:?FFFFFFBDFFFF?1<BAFFF#	NM:i:1	AS:i:488	RG:Z:1
-SRR1377138.13	0	NC_001422.1	5006	60	1X300=	*	0	0	NTATGTGGCTAAATACGTTAACAAAAAGTCAGATATGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACTTCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTGTCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTT	#8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFCFGGFGFGFFFFFFFFFFFDFFFFB:>FFFFF2.1.4::?FFFFF?BAAFFFFFFDFF>>9>9?	NM:i:1	AS:i:612	RG:Z:1
+SRR1377138.13	0	NC_001422.1	5006	60	1X293=1D6=	*	0	0	NTATGTGGCTAAATACGTTAACAAAAAGTCAGATATGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACTTCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTGTCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCGACGTT	#8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFCFGGFGFGFFFFFFFFFFFDFFFFB:>FFFFF2.1.4::?FFFFF?BAAFFFFFFDF>>9>9?	NM:i:2	AS:i:597	RG:Z:1
 SRR1377138.14	0	NC_001422.1	66	60	1X300=	*	0	0	NTCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTAGATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATCTGAGTCCGATGCTGTT	#8@CCGGGGGGFFGGGGGGGGGGGGGFGGGGGGGGGGGGGFGGEEFGGGGGGGGGGGGGFGDCFEFGGGGGFGGGGGGGGGGGGGGGGGGGG7FGGGGGGGGGGGGGGGGGGGDFGGGGGGGFGGEGGGGGFGGGGGGGGGGGGGGGGGFGGGGGGGGGGDGGGGGGGGDGGGGGGGFEGGGGGGGFGGFFGFGGGGGGGGGFGGGFG7FGGFGDGGGGGFFCGGGGGGGGGGGGGFGFGGC>FGCF=A@FFFFFFFFFFEFFFFFFE<=/04(34:5<BFFA<AFFAAABFF>01:AB>4	NM:i:1	AS:i:612	RG:Z:1
 SRR1377138.15	0	NC_001422.1	1907	60	1X185=115S	*	0	0	NTTAACGCTAAAGGTGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTTGTATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGTTTAAGATTGCTGAGGGTCAGTGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACAAGAACGGAAAACATCCTTCATAGAAATTTCACGCAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTGAAAAAAAAAAAAAAAGATA	#8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGFGGFGFGFFFFFFFFFFFFF>73?FFFFFFF3-048:BDFFAFFFFFF06>BFF#########	NM:i:1	AS:i:372	RG:Z:1
 SRR1377138.16	16	NC_001422.1	2072	60	27=1X20=1X83=1X50=1X116=1X	*	0	0	TATGTTTCTCCTGCTTATCACCTTCTTTAAGGCTTCCCATTCATTCAGCAACCGCCTTCTGGTGATTTGCAAGAACGCGTACTTATTCGCCACCATGATTATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGAAGTGGAATAGTCAGGTTAAATTTAATGTGACCGTTTATCGCAATCTGCCGTCCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGAGTGTGAGGTTATAACGCCGAAGCGGTAAAAATTTTAATTTTTGCCGCTGAGGGGTTGACCAAGCGAAGCGCGGTAGN	############################################################################################################GC;D>,5E>B*,,;98F>FE,9E4,@F@7,;CF>3,,BF7=@,3CF>,E,,F@83++3>@3++CBC,5B+68+FB,7=>=F=F,<?AF9E,,,DEE:+=F?99,<FC,?,C<F@FAEGFC69F<GF<6=@C88+@6+:6<CGGGF@GFGGFCGFF<@:C@@8F66C6;AGDGF;6@C@CF@BF:@CGGCCA8#	NM:i:5	AS:i:572	RG:Z:1

From 83edcf20dc576a3e0ee9ad92ba1be2bb9a356c2a Mon Sep 17 00:00:00 2001
From: Marcel Martin <marcel.martin@scilifelab.se>
Date: Sun, 17 Dec 2023 21:50:37 +0100
Subject: [PATCH 32/32] Update baseline commit

---
 tests/baseline-commit.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/baseline-commit.txt b/tests/baseline-commit.txt
index 471fbbbb..12f828e5 100644
--- a/tests/baseline-commit.txt
+++ b/tests/baseline-commit.txt
@@ -1 +1 @@
-2e4ff9500e68d6e465735dd276d362cf71851dcd
+26b472e7a748aabab5e46328622b1cd46a0c0841