From be706cc02d0f38c020029d4b9cd00d9d03433dbf Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Mon, 18 Dec 2023 10:24:23 +0100 Subject: [PATCH 01/32] Fix misspelled "strobealign" --- src/cmdline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cmdline.cpp b/src/cmdline.cpp index 48ce9e06..c00c5479 100644 --- a/src/cmdline.cpp +++ b/src/cmdline.cpp @@ -8,7 +8,7 @@ class Version {}; CommandLineOptions parse_command_line_arguments(int argc, char **argv) { - args::ArgumentParser parser("strobelign " + version_string()); + args::ArgumentParser parser("strobealign " + version_string()); parser.helpParams.showTerminator = false; parser.helpParams.helpindent = 20; parser.helpParams.width = 90; From 32fb1486a1c2f4c92122e219d386397c96f04827 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Fri, 19 Jan 2024 13:58:53 +0100 Subject: [PATCH 02/32] Fix lost mapping-only (PAF) accuracy Introduced by buggy refactor in 1fe3e341b829cb6c9697d88dffec29c8bcb3a6ec. NamPair.score is not actually the sum of the scores of its constituent NAMs, but the sum of the n_hits. --- src/aln.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aln.cpp b/src/aln.cpp index 082e4285..bdb70929 100644 --- a/src/aln.cpp +++ b/src/aln.cpp @@ -883,7 +883,7 @@ inline void get_best_map_location( Nam n1_joint_max, n2_joint_max; for (auto &[score, nam1, nam2] : nam_pairs) { // already sorted by descending score if (nam1.ref_start >= 0 && nam2.ref_start >=0) { // Valid pair - score_joint = score; + score_joint = nam1.score + nam2.score; n1_joint_max = nam1; n2_joint_max = nam2; break; From b20253a6668e2095fc3a0b2b160ad8b4f9a09712 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Fri, 19 Jan 2024 14:01:41 +0100 Subject: [PATCH 03/32] Rename score attribute of NamPair to n_hits To make it clearer what it represents. 
--- src/aln.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/aln.cpp b/src/aln.cpp index bdb70929..4322a1d5 100644 --- a/src/aln.cpp +++ b/src/aln.cpp @@ -15,7 +15,7 @@ using namespace klibpp; namespace { struct NamPair { - int score; + int n_hits; Nam nam1; Nam nam2; }; @@ -415,7 +415,7 @@ inline std::vector get_best_scoring_nam_pairs( std::sort( nam_pairs.begin(), nam_pairs.end(), - [](const NamPair& a, const NamPair& b) -> bool { return a.score > b.score; } + [](const NamPair& a, const NamPair& b) -> bool { return a.n_hits > b.n_hits; } ); // Sort by highest score first return nam_pairs; @@ -778,7 +778,7 @@ std::vector align_paired( // Turn pairs of high-scoring NAMs into pairs of alignments std::vector high_scores; - auto max_score = nam_pairs[0].score; + auto max_score = nam_pairs[0].n_hits; for (auto &[score_, n1, n2] : nam_pairs) { float score_dropoff = (float) score_ / max_score; From df612a078a713b518812592c64fad9a4a1a9d444 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Fri, 19 Jan 2024 16:24:53 +0100 Subject: [PATCH 04/32] Fix wrong insert size computation in is_proper_nam_pair --- src/aln.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aln.cpp b/src/aln.cpp index 4322a1d5..4dec133a 100644 --- a/src/aln.cpp +++ b/src/aln.cpp @@ -333,7 +333,7 @@ bool is_proper_nam_pair(const Nam nam1, const Nam nam2, float mu, float sigma) { if (nam1.ref_id != nam2.ref_id || nam1.is_rc == nam2.is_rc) { return false; } - int a = std::max(0, nam1.ref_start - nam2.query_start); + int a = std::max(0, nam1.ref_start - nam1.query_start); int b = std::max(0, nam2.ref_start - nam2.query_start); // r1 ---> <---- r2 From d8a064c0cd3aacbb2fb6d2cb79c649024632b98f Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Fri, 19 Jan 2024 16:46:37 +0100 Subject: [PATCH 05/32] Update baseline commit --- tests/baseline-commit.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/baseline-commit.txt 
b/tests/baseline-commit.txt index b2a2c68f..94995525 100644 --- a/tests/baseline-commit.txt +++ b/tests/baseline-commit.txt @@ -1 +1 @@ -baseline_commit=54f9fe4266ae0ab7843ee7fb70ddfbc2d95dc729 +baseline_commit=df612a078a713b518812592c64fad9a4a1a9d444 From 0ced9903276834e6b9bfe095a255952f0616d330 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Fri, 19 Jan 2024 16:52:52 +0100 Subject: [PATCH 06/32] Add Nam::projected_ref_start() --- src/aln.cpp | 14 +++++++------- src/nam.hpp | 4 ++++ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/aln.cpp b/src/aln.cpp index 4dec133a..b5105bb1 100644 --- a/src/aln.cpp +++ b/src/aln.cpp @@ -215,7 +215,7 @@ inline Alignment extend_seed( const std::string query = nam.is_rc ? read.rc : read.seq; const std::string& ref = references.sequences[nam.ref_id]; - const auto projected_ref_start = std::max(0, nam.ref_start - nam.query_start); + const auto projected_ref_start = nam.projected_ref_start(); const auto projected_ref_end = std::min(nam.ref_end + query.size() - nam.query_end, ref.size()); AlignmentInfo info; @@ -333,14 +333,14 @@ bool is_proper_nam_pair(const Nam nam1, const Nam nam2, float mu, float sigma) { if (nam1.ref_id != nam2.ref_id || nam1.is_rc == nam2.is_rc) { return false; } - int a = std::max(0, nam1.ref_start - nam1.query_start); - int b = std::max(0, nam2.ref_start - nam2.query_start); + int r1_ref_start = nam1.projected_ref_start(); + int r2_ref_start = nam2.projected_ref_start(); // r1 ---> <---- r2 - bool r1_r2 = nam2.is_rc && (a <= b) && (b - a < mu + 10*sigma); + bool r1_r2 = nam2.is_rc && (r1_ref_start <= r2_ref_start) && (r2_ref_start - r1_ref_start < mu + 10*sigma); // r2 ---> <---- r1 - bool r2_r1 = nam1.is_rc && (b <= a) && (a - b < mu + 10*sigma); + bool r2_r1 = nam1.is_rc && (r2_ref_start <= r1_ref_start) && (r1_ref_start - r2_ref_start < mu + 10*sigma); return r1_r2 || r2_r1; } @@ -442,8 +442,8 @@ inline Alignment rescue_align( if (mate_nam.is_rc) { r_tmp = read.seq; - a = 
mate_nam.ref_start - mate_nam.query_start - (mu+5*sigma); - b = mate_nam.ref_start - mate_nam.query_start + read_len/2; // at most half read overlap + a = mate_nam.projected_ref_start() - (mu+5*sigma); + b = mate_nam.projected_ref_start() + read_len/2; // at most half read overlap } else { r_tmp = read.rc; // mate is rc since fr orientation a = mate_nam.ref_end + (read_len - mate_nam.query_end) - read_len/2; // at most half read overlap diff --git a/src/nam.hpp b/src/nam.hpp index 6fc807fa..b052b1e3 100644 --- a/src/nam.hpp +++ b/src/nam.hpp @@ -29,6 +29,10 @@ struct Nam { int query_span() const { return query_end - query_start; } + + int projected_ref_start() const { + return std::max(0, ref_start - query_start); + } }; std::pair> find_nams( From 781b17a6cbdad7ad8182680261439060be55723e Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Mon, 22 Jan 2024 13:04:29 +0100 Subject: [PATCH 07/32] Update baseline commit --- .github/workflows/ci.yml | 2 +- tests/baseline-commit.txt | 2 +- tests/compare-baseline.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5c34eb67..0cdc4ce4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -68,7 +68,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - run: cat tests/baseline-commit.txt >> $GITHUB_ENV + - run: "echo baseline_commit=$(< tests/baseline-commit.txt) >> $GITHUB_ENV" - uses: actions/checkout@v3 with: ref: ${{ env.baseline_commit }} diff --git a/tests/baseline-commit.txt b/tests/baseline-commit.txt index 94995525..9c1e9b61 100644 --- a/tests/baseline-commit.txt +++ b/tests/baseline-commit.txt @@ -1 +1 @@ -baseline_commit=df612a078a713b518812592c64fad9a4a1a9d444 +0ced9903276834e6b9bfe095a255952f0616d330 diff --git a/tests/compare-baseline.sh b/tests/compare-baseline.sh index 5f13ba2f..a5176503 100755 --- a/tests/compare-baseline.sh +++ b/tests/compare-baseline.sh @@ -33,7 +33,7 @@ fi # Ensure test data is 
available tests/download.sh -source tests/baseline-commit.txt +baseline_commit=$(< tests/baseline-commit.txt) baseline_bam=baseline/baseline-${baseline_commit}.${ends}.bam baseline_binary=baseline/strobealign-${baseline_commit} From f749f21e43c4acce6cc27b1fbddb84cabdc9cfc1 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Mon, 22 Jan 2024 13:30:19 +0100 Subject: [PATCH 08/32] Simplify baseline comparison on CI Do not let GitHub Actions do the work, but rely on the baseline comparison script. Only ensure caches are filled before starting it. This reduces code duplication. --- .github/workflows/ci.yml | 40 ++++++++++----------------------------- tests/compare-baseline.sh | 6 +++--- 2 files changed, 13 insertions(+), 33 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0cdc4ce4..d6f1d976 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check for tab characters run: "! 
grep -P -R '\\t' src/ tests/*.{cpp,py}" @@ -27,7 +27,7 @@ jobs: matrix: os: [ubuntu-latest, macos-latest] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Build run: | cmake -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo @@ -49,7 +49,7 @@ jobs: github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v4 with: @@ -67,48 +67,28 @@ jobs: github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - run: "echo baseline_commit=$(< tests/baseline-commit.txt) >> $GITHUB_ENV" - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: - ref: ${{ env.baseline_commit }} - path: baseline + fetch-depth: 0 # Baseline comparison needs older commits - name: Install Linux dependencies if: runner.os == 'Linux' run: sudo apt-get install samtools python3-pysam picard-tools - name: Install macOS dependencies if: runner.os == 'macOS' run: brew install samtools pysam picard-tools - - name: Cache test dataset - uses: actions/cache@v3 + uses: actions/cache@v4 with: key: test-data-${{ hashFiles('tests/download.sh') }} path: tests/drosophila/ - - name: Download test dataset - run: tests/download.sh - - name: Cache baseline BAM id: cache-baseline-bam - uses: actions/cache@v3 + uses: actions/cache@v4 with: key: baseline-bam-${{ hashFiles('tests/baseline-commit.txt') }} - path: baseline.bam - - name: Generate baseline BAM - if: ${{ steps.cache-baseline-bam.outputs.cache-hit != 'true' }} - run: | - ( cd baseline && cmake -B build ) - make -j3 -C baseline/build - baseline/build/strobealign tests/drosophila/ref.fasta tests/drosophila/reads.1.fastq.gz tests/drosophila/reads.2.fastq.gz | samtools view -o baseline.bam - - - name: Build HEAD version - run: | - cmake -B build 
-DCMAKE_BUILD_TYPE=RelWithDebInfo - make -j3 -C build - - name: Generate HEAD BAM - run: build/strobealign tests/drosophila/ref.fasta tests/drosophila/reads.1.fastq.gz tests/drosophila/reads.2.fastq.gz | samtools view -o head.bam - - name: Compare - run: python3 tests/samdiff.py baseline.bam head.bam + path: baseline/bam/ + - name: Compare to baseline + run: tests/compare-baseline.sh - name: Validate with Picard run: | PicardCommandLine ValidateSamFile IGNORE=RECORD_MISSING_READ_GROUP IGNORE=MISSING_READ_GROUP I=head.bam diff --git a/tests/compare-baseline.sh b/tests/compare-baseline.sh index a5176503..57f9faac 100755 --- a/tests/compare-baseline.sh +++ b/tests/compare-baseline.sh @@ -35,18 +35,18 @@ tests/download.sh baseline_commit=$(< tests/baseline-commit.txt) -baseline_bam=baseline/baseline-${baseline_commit}.${ends}.bam +baseline_bam=baseline/bam/${baseline_commit}.${ends}.bam baseline_binary=baseline/strobealign-${baseline_commit} cmake_options=-DCMAKE_BUILD_TYPE=RelWithDebInfo strobealign_options="-t 4" # Generate the baseline BAM if necessary -mkdir -p baseline +mkdir -p baseline/bam if ! test -f ${baseline_bam}; then if ! test -f ${baseline_binary}; then srcdir=$(mktemp -p . -d compile.XXXXXXX) git clone . ${srcdir} - ( cd ${srcdir} && git checkout ${baseline_commit} ) + ( cd ${srcdir} && git checkout -d ${baseline_commit} ) cmake ${srcdir} -B ${srcdir}/build ${cmake_options} if ! make -j 4 -C ${srcdir}/build strobealign; then exit 1 From 40c9c09e3fc31d19e8a5c6df5bda26c1f90e6d3c Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Mon, 22 Jan 2024 15:08:14 +0100 Subject: [PATCH 09/32] Vendor zstr instead of fetching it from Git at build time This way, the strobealign sources are more self-contained and it should not be necessary to have internet access at build time. 
--- CHANGES.md | 3 + CMakeLists.txt | 6 +- ext/README.md | 7 + ext/zstr/CMakeLists.txt | 46 +++ ext/zstr/LICENSE | 21 ++ ext/zstr/README.org | 103 +++++++ ext/zstr/src/strict_fstream.hpp | 237 +++++++++++++++ ext/zstr/src/zstr.hpp | 502 ++++++++++++++++++++++++++++++++ 8 files changed, 920 insertions(+), 5 deletions(-) create mode 100644 ext/zstr/CMakeLists.txt create mode 100644 ext/zstr/LICENSE create mode 100644 ext/zstr/README.org create mode 100644 ext/zstr/src/strict_fstream.hpp create mode 100644 ext/zstr/src/zstr.hpp diff --git a/CHANGES.md b/CHANGES.md index 5e653588..266b5832 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -10,6 +10,9 @@ * #378: Added `-C` option for appending the FASTA or FASTQ comment to SAM output. (Idea and name of the option taken from BWA-MEM.) * #371: Added `--no-PG` option for not outputting the PG SAM header +* Include [ZStr](https://github.com/mateidavid/zstr/) in our own repository + instead of downloading it at build time. This should make it possible to + build strobealign without internet access. ## v0.12.0 (2023-11-23) diff --git a/CMakeLists.txt b/CMakeLists.txt index 19ace7a2..988b1235 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,11 +25,7 @@ endif() message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") add_compile_options(-Wall -Wextra -Werror=maybe-uninitialized) -FetchContent_Declare(ZStrGitRepo - GIT_REPOSITORY "https://github.com/mateidavid/zstr" - GIT_TAG "755da7890ea22478a702e3139092e6c964fab1f5" -) -FetchContent_MakeAvailable(ZStrGitRepo) +add_subdirectory(ext/zstr) # Obtain version from Git or fall back to PROJECT_VERSION if not building # from a Git repository diff --git a/ext/README.md b/ext/README.md index e473fdf5..e8316d26 100644 --- a/ext/README.md +++ b/ext/README.md @@ -47,3 +47,10 @@ License: See ssw/README.md Homepage: https://www.xxhash.com Version: ? 
License: See xxhash.c + + +## zstr + +Homepage: https://github.com/mateidavid/zstr +Commit used: 755da7890ea22478a702e3139092e6c964fab1f5 +License: See zstr/LICENSE diff --git a/ext/zstr/CMakeLists.txt b/ext/zstr/CMakeLists.txt new file mode 100644 index 00000000..8a015618 --- /dev/null +++ b/ext/zstr/CMakeLists.txt @@ -0,0 +1,46 @@ +cmake_minimum_required(VERSION 3.10 FATAL_ERROR) + +project(zstr LANGUAGES CXX) + +if (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) + cmake_policy(SET CMP0074 NEW) # find_package uses _ROOT variables +endif() + +if(${CMAKE_VERSION} VERSION_LESS 3.13) + message(WARNING + "Interface library targets are not well supported before cmake 3.13 .... " + "You may need to add \${ZSTR_INCLUDE_DIRS} to your include directories\n" + "target_include_directories(YourTarget PRIVATE \${ZSTR_INCLUDE_DIRS}) " + ) +endif() + +# -- locate zlib + +find_package(ZLIB 1.2.3 REQUIRED) # defines imported target ZLIB::ZLIB +message(STATUS "zstr - found ZLIB (version: ${ZLIB_VERSION_STRING})") + +# -- add target + +add_library(zstr INTERFACE) +add_library(zstr::zstr ALIAS zstr) + +# -- set target properties + +target_include_directories(zstr INTERFACE "${PROJECT_SOURCE_DIR}/src") +target_link_libraries(zstr INTERFACE ZLIB::ZLIB) +target_compile_features(zstr INTERFACE cxx_std_11) # require c++11 flag + +# -- set cache variables + +# NOTE: these vars are mostly useful to people using cmake < 3.13 +set(ZSTR_INCLUDE_DIRS "${PROJECT_SOURCE_DIR}/src;${ZLIB_INCLUDE_DIRS}" CACHE PATH "" FORCE) +set(ZSTR_LIBRARIES "${ZLIB_LIBRARIES}" CACHE PATH "" FORCE) + +# -- print target summary + +message(STATUS + "zstr - added INTERFACE target 'zstr::zstr' + includes : ${ZSTR_INCLUDE_DIRS} + libraries: ZLIB::ZLIB + features : cxx_std_11" +) diff --git a/ext/zstr/LICENSE b/ext/zstr/LICENSE new file mode 100644 index 00000000..841c7214 --- /dev/null +++ b/ext/zstr/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Matei David, Ontario Institute for Cancer Research + 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/ext/zstr/README.org b/ext/zstr/README.org new file mode 100644 index 00000000..dea53589 --- /dev/null +++ b/ext/zstr/README.org @@ -0,0 +1,103 @@ +# -*- mode:org; mode:visual-line; coding:utf-8; -*- + +** A C++ ZLib wrapper + +[[http://travis-ci.org/mateidavid/zstr][http://travis-ci.org/mateidavid/zstr.svg?branch=master]] [[https://tldrlegal.com/license/mit-license][http://img.shields.io/:license-mit-blue.svg]] + +This C++ header-only library enables the use of C++ standard iostreams to access ZLib-compressed streams. + +For input access (decompression), the compression format is auto-detected, and multiple concatenated compressed streams are decompressed seamlessly. + +For output access (compression), the only parameter exposed by this API is the compression level. + +Alternatives to this library include: + +- The original [[http://www.zlib.net/][ZLib]], through its [[http://www.zlib.net/manual.html][C API]]. 
This does not interact nicely with C++ iostreams. + +- The [[http://www.cs.unc.edu/Research/compgeom/gzstream/][GZStream]] library. This library does not auto-detect input compression, and it cannot wrap streams (only files). + +- The [[http://www.boost.org/doc/libs/release/libs/iostreams/][Boost IOStreams]] library. The library does not auto-detect input compression (by default, though that can be easily implemented with filters), and more importantly, it is not a header-only Boost library. + +- The [[https://github.com/tmaklin/bxzstr][bxzstr]] library, if you want support for BZ2 and/or LZMA as well. + +For an example usage, see [[examples/ztxtpipe.cpp]] and [[examples/zc.cpp]]. + +It is compatible with [[https://github.com/richgel999/miniz][miniz]] in case you don't want to get frustrated with zlib e. g. on Windows. + +**** Input Auto-detection + +For input access, the library seamlessly auto-detects whether the source stream is compressed or not. The following compressed streams are detected: + +- GZip header, when stream starts with =1F 8B=. See [[http://en.wikipedia.org/wiki/Gzip][GZip format]]. + +- ZLib header, when stream starts with =78 01=, =78 9C=, and =78 DA=. See [[http://stackoverflow.com/a/17176881][answer here]]. + +If none of these formats are detected, the library assumes the input is not compressed, and it produces a plain copy of the source stream. + +**** Classes + +The package provides 6 classes for accessing ZLib streams: + +- =zstr::istreambuf= is the core decompression class. This is constructed from an existing =std::streambuf= that contains source data. The =zstr::istreambuf= constructor accepts explicit settings for the internal buffer size (default: 1 MB) and the auto-detection option (default: on). ZLib errors cause exceptions to be thrown. + +- =zstr::ostreambuf= is the core compression class. This is constructed from an existing =std::streambuf= that contains sink data. 
The =zstr::ostreambuf= constructor accepts explicit settings for the internal buffer size (default: 1 MB) and the compression option (default: ZLib default). ZLib errors cause exceptions to be thrown. + +- =zstr::istream= is a wrapper for a =zstr::istreambuf= that accesses an /external/ =std::streambuf=. It can be constructed from an existing =std::istream= (such as =std::cin=) or =std::streambuf=. + +- =zstr::ostream= is a wrapper for a =zstr::ostreambuf= that accesses an /external/ =std::streambuf=. It can be constructed from an existing =std::ostream= (such as =std::cout=) or =std::streambuf=. + +- =zstr::ifstream= is a wrapper for a =zstr::istreambuf= that accesses an /internal/ =std::ifstream=. This can be used to open a file and read decompressed data from it. + +- =zstr::ofstream= is a wrapper for a =zstr::ostreambuf= that accesses an /internal/ =std::ofstream=. This can be used to open a file and write compressed data to it. + +For all stream objects, the =badbit= of their exception mask is turned on in order to propagate exceptions. + +**** CMake + +There are three simple ways to add zstr to a CMake project. + +Method 1. Add zstr as a subdirectory and link to the =zstr::zstr= target + + #+BEGIN_SRC cmake + add_subdirectory(zstr) # defines INTERFACE target 'zstr::zstr' + + add_executable(YourTarget main.cpp) + target_link_libraries(YourTarget PRIVATE zstr::zstr) + # if using cmake < 3.13 you may also need the following line + # target_include_directories(YourTarget PRIVATE ${ZSTR_INCLUDE_DIRS}) + #+END_SRC + +Method 2. Fetch a copy of zstr from an external repository and link to the =zstr::zstr= target + + /NOTE: The FetchContent functions shown here were introduced in CMake 3.14/ + + #+BEGIN_SRC cmake + include(FetchContent) + FetchContent_Declare(ZStrGitRepo + GIT_REPOSITORY "https://github.com/mateidavid/zstr" # can also be a local filesystem path! 
+ GIT_TAG "master" + ) + FetchContent_MakeAvailable(ZStrGitRepo) # defines INTERFACE target 'zstr::zstr' + + add_executable(YourTarget main.cpp) + target_link_libraries(YourTarget PRIVATE zstr::zstr) + #+END_SRC + +Method 3. Add path containing 'zstr.hpp' to your target's include directories + + /NOTE: With this method you're responsible for finding and linking to ZLIB !/ + + #+BEGIN_SRC cmake + find_package(ZLIB REQUIRED) + add_executable(YourTarget main.cpp) + target_link_libraries(YourTarget PRIVATE ZLIB::ZLIB) + target_include_directories(YourTarget PRIVATE /path/to/zstr/src) + #+END_SRC + +**** Requisites + +If you use GCC and want to use the `fs.open()` function, you need to deploy at least GCC version 5.1. + +**** License + +Released under the [[file:LICENSE][MIT license]]. diff --git a/ext/zstr/src/strict_fstream.hpp b/ext/zstr/src/strict_fstream.hpp new file mode 100644 index 00000000..7d03ea66 --- /dev/null +++ b/ext/zstr/src/strict_fstream.hpp @@ -0,0 +1,237 @@ +#pragma once + +#include +#include +#include +#include +#include + +/** + * This namespace defines wrappers for std::ifstream, std::ofstream, and + * std::fstream objects. The wrappers perform the following steps: + * - check the open modes make sense + * - check that the call to open() is successful + * - (for input streams) check that the opened file is peek-able + * - turn on the badbit in the exception mask + */ +namespace strict_fstream +{ + +// Help people out a bit, it seems like this is a common recommenation since +// musl breaks all over the place. +#if defined(__NEED_size_t) && !defined(__MUSL__) +#warning "It seems to be recommended to patch in a define for __MUSL__ if you use musl globally: https://www.openwall.com/lists/musl/2013/02/10/5" +#define __MUSL__ +#endif + +// Workaround for broken musl implementation +// Since musl insists that they are perfectly compatible, ironically enough, +// they don't officially have a __musl__ or similar. 
But __NEED_size_t is defined in their +// relevant header (and not in working implementations), so we can use that. +#ifdef __MUSL__ +#warning "Working around broken strerror_r() implementation in musl, remove when musl is fixed" +#endif + +// Non-gnu variants of strerror_* don't necessarily null-terminate if +// truncating, so we have to do things manually. +inline std::string trim_to_null(const std::vector &buff) +{ + std::string ret(buff.begin(), buff.end()); + + const std::string::size_type pos = ret.find('\0'); + if (pos == std::string::npos) { + ret += " [...]"; // it has been truncated + } else { + ret.resize(pos); + } + return ret; +} + +/// Overload of error-reporting function, to enable use with VS and non-GNU +/// POSIX libc's +/// Ref: +/// - http://stackoverflow.com/a/901316/717706 +static std::string strerror() +{ + // Can't use std::string since we're pre-C++17 + std::vector buff(256, '\0'); + +#ifdef _WIN32 + // Since strerror_s might set errno itself, we need to store it. + const int err_num = errno; + if (strerror_s(buff.data(), buff.size(), err_num) != 0) { + return trim_to_null(buff); + } else { + return "Unknown error (" + std::to_string(err_num) + ")"; + } +#elif ((_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || defined(__APPLE__) || defined(__FreeBSD__)) && ! _GNU_SOURCE) || defined(__MUSL__) +// XSI-compliant strerror_r() + const int err_num = errno; // See above + if (strerror_r(err_num, buff.data(), buff.size()) == 0) { + return trim_to_null(buff); + } else { + return "Unknown error (" + std::to_string(err_num) + ")"; + } +#else +// GNU-specific strerror_r() + char * p = strerror_r(errno, &buff[0], buff.size()); + return std::string(p, std::strlen(p)); +#endif +} + +/// Exception class thrown by failed operations. 
+class Exception + : public std::exception +{ +public: + Exception(const std::string& msg) : _msg(msg) {} + const char * what() const noexcept { return _msg.c_str(); } +private: + std::string _msg; +}; // class Exception + +namespace detail +{ + +struct static_method_holder +{ + static std::string mode_to_string(std::ios_base::openmode mode) + { + static const int n_modes = 6; + static const std::ios_base::openmode mode_val_v[n_modes] = + { + std::ios_base::in, + std::ios_base::out, + std::ios_base::app, + std::ios_base::ate, + std::ios_base::trunc, + std::ios_base::binary + }; + + static const char * mode_name_v[n_modes] = + { + "in", + "out", + "app", + "ate", + "trunc", + "binary" + }; + std::string res; + for (int i = 0; i < n_modes; ++i) + { + if (mode & mode_val_v[i]) + { + res += (! res.empty()? "|" : ""); + res += mode_name_v[i]; + } + } + if (res.empty()) res = "none"; + return res; + } + static void check_mode(const std::string& filename, std::ios_base::openmode mode) + { + if ((mode & std::ios_base::trunc) && ! (mode & std::ios_base::out)) + { + throw Exception(std::string("strict_fstream: open('") + filename + "'): mode error: trunc and not out"); + } + else if ((mode & std::ios_base::app) && ! 
(mode & std::ios_base::out)) + { + throw Exception(std::string("strict_fstream: open('") + filename + "'): mode error: app and not out"); + } + else if ((mode & std::ios_base::trunc) && (mode & std::ios_base::app)) + { + throw Exception(std::string("strict_fstream: open('") + filename + "'): mode error: trunc and app"); + } + } + static void check_open(std::ios * s_p, const std::string& filename, std::ios_base::openmode mode) + { + if (s_p->fail()) + { + throw Exception(std::string("strict_fstream: open('") + + filename + "'," + mode_to_string(mode) + "): open failed: " + + strerror()); + } + } + static void check_peek(std::istream * is_p, const std::string& filename, std::ios_base::openmode mode) + { + bool peek_failed = true; + try + { + is_p->peek(); + peek_failed = is_p->fail(); + } + catch (const std::ios_base::failure &) {} + if (peek_failed) + { + throw Exception(std::string("strict_fstream: open('") + + filename + "'," + mode_to_string(mode) + "): peek failed: " + + strerror()); + } + is_p->clear(); + } +}; // struct static_method_holder + +} // namespace detail + +class ifstream + : public std::ifstream +{ +public: + ifstream() = default; + ifstream(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in) + { + open(filename, mode); + } + void open(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in) + { + mode |= std::ios_base::in; + exceptions(std::ios_base::badbit); + detail::static_method_holder::check_mode(filename, mode); + std::ifstream::open(filename, mode); + detail::static_method_holder::check_open(this, filename, mode); + detail::static_method_holder::check_peek(this, filename, mode); + } +}; // class ifstream + +class ofstream + : public std::ofstream +{ +public: + ofstream() = default; + ofstream(const std::string& filename, std::ios_base::openmode mode = std::ios_base::out) + { + open(filename, mode); + } + void open(const std::string& filename, std::ios_base::openmode mode = std::ios_base::out) 
+ { + mode |= std::ios_base::out; + exceptions(std::ios_base::badbit); + detail::static_method_holder::check_mode(filename, mode); + std::ofstream::open(filename, mode); + detail::static_method_holder::check_open(this, filename, mode); + } +}; // class ofstream + +class fstream + : public std::fstream +{ +public: + fstream() = default; + fstream(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in) + { + open(filename, mode); + } + void open(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in) + { + if (! (mode & std::ios_base::out)) mode |= std::ios_base::in; + exceptions(std::ios_base::badbit); + detail::static_method_holder::check_mode(filename, mode); + std::fstream::open(filename, mode); + detail::static_method_holder::check_open(this, filename, mode); + detail::static_method_holder::check_peek(this, filename, mode); + } +}; // class fstream + +} // namespace strict_fstream + diff --git a/ext/zstr/src/zstr.hpp b/ext/zstr/src/zstr.hpp new file mode 100644 index 00000000..bd330ea1 --- /dev/null +++ b/ext/zstr/src/zstr.hpp @@ -0,0 +1,502 @@ +//--------------------------------------------------------- +// Copyright 2015 Ontario Institute for Cancer Research +// Written by Matei David (matei@cs.toronto.edu) +//--------------------------------------------------------- + +// Reference: +// http://stackoverflow.com/questions/14086417/how-to-write-custom-input-stream-in-c + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "strict_fstream.hpp" + +#if defined(__GNUC__) && !defined(__clang__) +#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__>0) +#define CAN_MOVE_IOSTREAM +#endif +#else +#define CAN_MOVE_IOSTREAM +#endif + +namespace zstr +{ + +static const std::size_t default_buff_size = static_cast(1 << 20); + +/// Exception class thrown by failed zlib operations. 
+class Exception + : public std::ios_base::failure +{ +public: + static std::string error_to_message(z_stream * zstrm_p, int ret) + { + std::string msg = "zlib: "; + switch (ret) + { + case Z_STREAM_ERROR: + msg += "Z_STREAM_ERROR: "; + break; + case Z_DATA_ERROR: + msg += "Z_DATA_ERROR: "; + break; + case Z_MEM_ERROR: + msg += "Z_MEM_ERROR: "; + break; + case Z_VERSION_ERROR: + msg += "Z_VERSION_ERROR: "; + break; + case Z_BUF_ERROR: + msg += "Z_BUF_ERROR: "; + break; + default: + std::ostringstream oss; + oss << ret; + msg += "[" + oss.str() + "]: "; + break; + } + if (zstrm_p->msg) { + msg += zstrm_p->msg; + } + msg += " (" + "next_in: " + + std::to_string(uintptr_t(zstrm_p->next_in)) + + ", avail_in: " + + std::to_string(uintptr_t(zstrm_p->avail_in)) + + ", next_out: " + + std::to_string(uintptr_t(zstrm_p->next_out)) + + ", avail_out: " + + std::to_string(uintptr_t(zstrm_p->avail_out)) + + ")"; + return msg; + } + + Exception(z_stream * zstrm_p, int ret) + : std::ios_base::failure(error_to_message(zstrm_p, ret)) + { + } +}; // class Exception + +namespace detail +{ + +class z_stream_wrapper + : public z_stream +{ +public: + z_stream_wrapper(bool _is_input, int _level, int _window_bits) + : is_input(_is_input) + { + this->zalloc = nullptr;//Z_NULL + this->zfree = nullptr;//Z_NULL + this->opaque = nullptr;//Z_NULL + int ret; + if (is_input) + { + this->avail_in = 0; + this->next_in = nullptr;//Z_NULL + ret = inflateInit2(this, _window_bits ? _window_bits : 15+32); + } + else + { + ret = deflateInit2(this, _level, Z_DEFLATED, _window_bits ? 
_window_bits : 15+16, 8, Z_DEFAULT_STRATEGY); + } + if (ret != Z_OK) throw Exception(this, ret); + } + ~z_stream_wrapper() + { + if (is_input) + { + inflateEnd(this); + } + else + { + deflateEnd(this); + } + } +private: + bool is_input; +}; // class z_stream_wrapper + +} // namespace detail + +class istreambuf + : public std::streambuf +{ +public: + istreambuf(std::streambuf * _sbuf_p, + std::size_t _buff_size = default_buff_size, bool _auto_detect = true, int _window_bits = 0) + : sbuf_p(_sbuf_p), + in_buff(), + in_buff_start(nullptr), + in_buff_end(nullptr), + out_buff(), + zstrm_p(nullptr), + buff_size(_buff_size), + auto_detect(_auto_detect), + auto_detect_run(false), + is_text(false), + window_bits(_window_bits) + { + assert(sbuf_p); + in_buff = std::unique_ptr(new char[buff_size]); + in_buff_start = in_buff.get(); + in_buff_end = in_buff.get(); + out_buff = std::unique_ptr(new char[buff_size]); + setg(out_buff.get(), out_buff.get(), out_buff.get()); + } + + istreambuf(const istreambuf &) = delete; + istreambuf & operator = (const istreambuf &) = delete; + + pos_type seekoff(off_type off, std::ios_base::seekdir dir, + std::ios_base::openmode which) override + { + if (off != 0 || dir != std::ios_base::cur) { + return std::streambuf::seekoff(off, dir, which); + } + + if (!zstrm_p) { + return 0; + } + + return static_cast(zstrm_p->total_out - static_cast(in_avail())); + } + + std::streambuf::int_type underflow() override + { + if (this->gptr() == this->egptr()) + { + // pointers for free region in output buffer + char * out_buff_free_start = out_buff.get(); + int tries = 0; + do + { + if (++tries > 1000) { + throw std::ios_base::failure("Failed to fill buffer after 1000 tries"); + } + + // read more input if none available + if (in_buff_start == in_buff_end) + { + // empty input buffer: refill from the start + in_buff_start = in_buff.get(); + std::streamsize sz = sbuf_p->sgetn(in_buff.get(), static_cast(buff_size)); + in_buff_end = in_buff_start + sz; + if 
(in_buff_end == in_buff_start) break; // end of input + } + // auto detect if the stream contains text or deflate data + if (auto_detect && ! auto_detect_run) + { + auto_detect_run = true; + unsigned char b0 = *reinterpret_cast< unsigned char * >(in_buff_start); + unsigned char b1 = *reinterpret_cast< unsigned char * >(in_buff_start + 1); + // Ref: + // http://en.wikipedia.org/wiki/Gzip + // http://stackoverflow.com/questions/9050260/what-does-a-zlib-header-look-like + is_text = ! (in_buff_start + 2 <= in_buff_end + && ((b0 == 0x1F && b1 == 0x8B) // gzip header + || (b0 == 0x78 && (b1 == 0x01 // zlib header + || b1 == 0x9C + || b1 == 0xDA)))); + } + if (is_text) + { + // simply swap in_buff and out_buff, and adjust pointers + assert(in_buff_start == in_buff.get()); + std::swap(in_buff, out_buff); + out_buff_free_start = in_buff_end; + in_buff_start = in_buff.get(); + in_buff_end = in_buff.get(); + } + else + { + // run inflate() on input + if (! zstrm_p) zstrm_p = std::unique_ptr(new detail::z_stream_wrapper(true, Z_DEFAULT_COMPRESSION, window_bits)); + zstrm_p->next_in = reinterpret_cast< decltype(zstrm_p->next_in) >(in_buff_start); + zstrm_p->avail_in = uint32_t(in_buff_end - in_buff_start); + zstrm_p->next_out = reinterpret_cast< decltype(zstrm_p->next_out) >(out_buff_free_start); + zstrm_p->avail_out = uint32_t((out_buff.get() + buff_size) - out_buff_free_start); + int ret = inflate(zstrm_p.get(), Z_NO_FLUSH); + // process return code + if (ret != Z_OK && ret != Z_STREAM_END) throw Exception(zstrm_p.get(), ret); + // update in&out pointers following inflate() + in_buff_start = reinterpret_cast< decltype(in_buff_start) >(zstrm_p->next_in); + in_buff_end = in_buff_start + zstrm_p->avail_in; + out_buff_free_start = reinterpret_cast< decltype(out_buff_free_start) >(zstrm_p->next_out); + assert(out_buff_free_start + zstrm_p->avail_out == out_buff.get() + buff_size); + + if (ret == Z_STREAM_END) { + // if stream ended, deallocate inflator + zstrm_p.reset(); + } + } + 
} while (out_buff_free_start == out_buff.get()); + // 2 exit conditions: + // - end of input: there might or might not be output available + // - out_buff_free_start != out_buff: output available + this->setg(out_buff.get(), out_buff.get(), out_buff_free_start); + } + return this->gptr() == this->egptr() + ? traits_type::eof() + : traits_type::to_int_type(*this->gptr()); + } +private: + std::streambuf * sbuf_p; + std::unique_ptr in_buff; + char * in_buff_start; + char * in_buff_end; + std::unique_ptr out_buff; + std::unique_ptr zstrm_p; + std::size_t buff_size; + bool auto_detect; + bool auto_detect_run; + bool is_text; + int window_bits; + +}; // class istreambuf + +class ostreambuf + : public std::streambuf +{ +public: + ostreambuf(std::streambuf * _sbuf_p, + std::size_t _buff_size = default_buff_size, int _level = Z_DEFAULT_COMPRESSION, int _window_bits = 0) + : sbuf_p(_sbuf_p), + in_buff(), + out_buff(), + zstrm_p(new detail::z_stream_wrapper(false, _level, _window_bits)), + buff_size(_buff_size) + { + assert(sbuf_p); + in_buff = std::unique_ptr(new char[buff_size]); + out_buff = std::unique_ptr(new char[buff_size]); + setp(in_buff.get(), in_buff.get() + buff_size); + } + + ostreambuf(const ostreambuf &) = delete; + ostreambuf & operator = (const ostreambuf &) = delete; + + int deflate_loop(int flush) + { + while (true) + { + zstrm_p->next_out = reinterpret_cast< decltype(zstrm_p->next_out) >(out_buff.get()); + zstrm_p->avail_out = uint32_t(buff_size); + int ret = deflate(zstrm_p.get(), flush); + if (ret != Z_OK && ret != Z_STREAM_END && ret != Z_BUF_ERROR) { + failed = true; + throw Exception(zstrm_p.get(), ret); + } + std::streamsize sz = sbuf_p->sputn(out_buff.get(), reinterpret_cast< decltype(out_buff.get()) >(zstrm_p->next_out) - out_buff.get()); + if (sz != reinterpret_cast< decltype(out_buff.get()) >(zstrm_p->next_out) - out_buff.get()) + { + // there was an error in the sink stream + return -1; + } + if (ret == Z_STREAM_END || ret == Z_BUF_ERROR || sz 
== 0) + { + break; + } + } + return 0; + } + + virtual ~ostreambuf() + { + // flush the zlib stream + // + // NOTE: Errors here (sync() return value not 0) are ignored, because we + // cannot throw in a destructor. This mirrors the behaviour of + // std::basic_filebuf::~basic_filebuf(). To see an exception on error, + // close the ofstream with an explicit call to close(), and do not rely + // on the implicit call in the destructor. + // + if (!failed) try { + sync(); + } catch (...) {} + } + std::streambuf::int_type overflow(std::streambuf::int_type c = traits_type::eof()) override + { + zstrm_p->next_in = reinterpret_cast< decltype(zstrm_p->next_in) >(pbase()); + zstrm_p->avail_in = uint32_t(pptr() - pbase()); + while (zstrm_p->avail_in > 0) + { + int r = deflate_loop(Z_NO_FLUSH); + if (r != 0) + { + setp(nullptr, nullptr); + return traits_type::eof(); + } + } + setp(in_buff.get(), in_buff.get() + buff_size); + return traits_type::eq_int_type(c, traits_type::eof()) ? traits_type::eof() : sputc(char_type(c)); + } + int sync() override + { + // first, call overflow to clear in_buff + overflow(); + if (! 
pptr()) return -1; + // then, call deflate asking to finish the zlib stream + zstrm_p->next_in = nullptr; + zstrm_p->avail_in = 0; + if (deflate_loop(Z_FINISH) != 0) return -1; + deflateReset(zstrm_p.get()); + return 0; + } +private: + std::streambuf * sbuf_p = nullptr; + std::unique_ptr in_buff; + std::unique_ptr out_buff; + std::unique_ptr zstrm_p; + std::size_t buff_size; + bool failed = false; + +}; // class ostreambuf + +class istream + : public std::istream +{ +public: + istream(std::istream & is, + std::size_t _buff_size = default_buff_size, bool _auto_detect = true, int _window_bits = 0) + : std::istream(new istreambuf(is.rdbuf(), _buff_size, _auto_detect, _window_bits)) + { + exceptions(std::ios_base::badbit); + } + explicit istream(std::streambuf * sbuf_p) + : std::istream(new istreambuf(sbuf_p)) + { + exceptions(std::ios_base::badbit); + } + virtual ~istream() + { + delete rdbuf(); + } +}; // class istream + +class ostream + : public std::ostream +{ +public: + ostream(std::ostream & os, + std::size_t _buff_size = default_buff_size, int _level = Z_DEFAULT_COMPRESSION, int _window_bits = 0) + : std::ostream(new ostreambuf(os.rdbuf(), _buff_size, _level, _window_bits)) + { + exceptions(std::ios_base::badbit); + } + explicit ostream(std::streambuf * sbuf_p) + : std::ostream(new ostreambuf(sbuf_p)) + { + exceptions(std::ios_base::badbit); + } + virtual ~ostream() + { + delete rdbuf(); + } +}; // class ostream + +namespace detail +{ + +template < typename FStream_Type > +struct strict_fstream_holder +{ + strict_fstream_holder(const std::string& filename, std::ios_base::openmode mode = std::ios_base::in) + : _fs(filename, mode) + {} + strict_fstream_holder() = default; + FStream_Type _fs {}; +}; // class strict_fstream_holder + +} // namespace detail + +class ifstream + : private detail::strict_fstream_holder< strict_fstream::ifstream >, + public std::istream +{ +public: + explicit ifstream(const std::string filename, std::ios_base::openmode mode = 
std::ios_base::in, size_t buff_size = default_buff_size) + : detail::strict_fstream_holder< strict_fstream::ifstream >(filename, mode), + std::istream(new istreambuf(_fs.rdbuf(), buff_size)) + { + exceptions(std::ios_base::badbit); + } + explicit ifstream(): detail::strict_fstream_holder< strict_fstream::ifstream >(), std::istream(new istreambuf(_fs.rdbuf())){} + void close() { + _fs.close(); + } + #ifdef CAN_MOVE_IOSTREAM + void open(const std::string filename, std::ios_base::openmode mode = std::ios_base::in) { + _fs.open(filename, mode); + std::istream::operator=(std::istream(new istreambuf(_fs.rdbuf()))); + } + #endif + bool is_open() const { + return _fs.is_open(); + } + virtual ~ifstream() + { + if (_fs.is_open()) close(); + if (rdbuf()) delete rdbuf(); + } + + /// Return the position within the compressed file (wrapped filestream) + std::streampos compressed_tellg() + { + return _fs.tellg(); + } +}; // class ifstream + +class ofstream + : private detail::strict_fstream_holder< strict_fstream::ofstream >, + public std::ostream +{ +public: + explicit ofstream(const std::string filename, std::ios_base::openmode mode = std::ios_base::out, + int level = Z_DEFAULT_COMPRESSION, size_t buff_size = default_buff_size) + : detail::strict_fstream_holder< strict_fstream::ofstream >(filename, mode | std::ios_base::binary), + std::ostream(new ostreambuf(_fs.rdbuf(), buff_size, level)) + { + exceptions(std::ios_base::badbit); + } + explicit ofstream(): detail::strict_fstream_holder< strict_fstream::ofstream >(), std::ostream(new ostreambuf(_fs.rdbuf())){} + void close() { + std::ostream::flush(); + _fs.close(); + } + #ifdef CAN_MOVE_IOSTREAM + void open(const std::string filename, std::ios_base::openmode mode = std::ios_base::out, int level = Z_DEFAULT_COMPRESSION) { + flush(); + _fs.open(filename, mode | std::ios_base::binary); + std::ostream::operator=(std::ostream(new ostreambuf(_fs.rdbuf(), default_buff_size, level))); + } + #endif + bool is_open() const { + return 
_fs.is_open(); + } + ofstream& flush() { + std::ostream::flush(); + _fs.flush(); + return *this; + } + virtual ~ofstream() + { + if (_fs.is_open()) close(); + if (rdbuf()) delete rdbuf(); + } + + // Return the position within the compressed file (wrapped filestream) + std::streampos compressed_tellp() + { + return _fs.tellp(); + } +}; // class ofstream + +} // namespace zstr + From 7b3bc20d7b5ae39704b2d07490771d4e8b57352c Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Thu, 7 Dec 2023 10:49:15 +0100 Subject: [PATCH 10/32] Use poolstl to sort randstrobes in parallel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/alugowski/poolSTL Sorting the randstrobes is currently a bottleneck in index generation as it does not run in parallel. This is an attempt to parallelize it. poolstl’s sort uses regular std::sort under the hood. We currently use pdqsort_branchless, which is about twice as fast as std::sort, so parallel sorting breaks even with pdqsort_branchless at about 2-3 threads. It gets faster with more threads, but not as much as one would perhaps expect. Here are the sorting runtimes for CHM13: - 31 s with pdqsort_branchless - 59 s with std::sort - 34 s with parallel sort, 2 threads - 24 s with parallel sort, 4 threads - 23 s with parallel sort, 8 threads Another issue is that sorting is no longer in place, so memory usage goes up by a couple of gigabytes, which is another reason for me not to make this change. 
--- ext/poolstl/poolstl.hpp | 1697 +++++++++++++++++++++++++++++++++++++++ src/index.cpp | 12 +- src/index.hpp | 2 +- 3 files changed, 1707 insertions(+), 4 deletions(-) create mode 100644 ext/poolstl/poolstl.hpp diff --git a/ext/poolstl/poolstl.hpp b/ext/poolstl/poolstl.hpp new file mode 100644 index 00000000..ea79146e --- /dev/null +++ b/ext/poolstl/poolstl.hpp @@ -0,0 +1,1697 @@ +// SPDX-License-Identifier: BSD-2-Clause OR MIT OR BSL-1.0 +/** + * @brief Thread pool-based implementation of parallel standard library algorithms. Single-file version. + * @see https://github.com/alugowski/poolSTL + * @author Adam Lugowski + * @copyright Copyright (C) 2023 Adam Lugowski. + * Licensed under any of the following open-source licenses: + * BSD-2-Clause license, MIT license, Boost Software License 1.0 + * + * + * BSD-2-Clause license: + * + * Copyright (C) 2023 Adam Lugowski + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + * + * + * + * MIT License: + * + * Copyright (c) 2023 Adam Lugowski + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * + * + * Boost Software License 1.0: + * + * Permission is hereby granted, free of charge, to any person or organization + * obtaining a copy of the software and accompanying documentation covered by + * this license (the "Software") to use, reproduce, display, distribute, execute, + * and transmit the Software, and to prepare derivative works of the Software, + * and to permit third-parties to whom the Software is furnished to do so, + * all subject to the following: + * + * The copyright notices in the Software and this entire statement, including + * the above license grant, this restriction and the following disclaimer, must + * be included in all copies of the Software, in whole or in part, and all + * derivative works of the Software, unless such copies or derivative works + * are solely in the form of machine-executable object code generated by a + * source language processor. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT + * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE + * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef POOLSTL_HPP +#define POOLSTL_HPP + + +#ifndef POOLSTL_EXECUTION_HPP +#define POOLSTL_EXECUTION_HPP + +#include +#include +#include + + +#ifndef AL_TASK_THREAD_POOL_HPP +#define AL_TASK_THREAD_POOL_HPP + +// Version macros. 
+#define TASK_THREAD_POOL_VERSION_MAJOR 1 +#define TASK_THREAD_POOL_VERSION_MINOR 0 +#define TASK_THREAD_POOL_VERSION_PATCH 9 + +#include +#include +#include +#include +#include +#include +#include + +// MSVC does not correctly set the __cplusplus macro by default, so we must read it from _MSVC_LANG +// See https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/ +#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) +#define TTP_CXX17 1 +#else +#define TTP_CXX17 0 +#endif + +#if TTP_CXX17 +#define TTP_NODISCARD [[nodiscard]] +#else +#define TTP_NODISCARD +#endif + +namespace task_thread_pool { + +#if !TTP_CXX17 + /** + * A reimplementation of std::decay_t, which is only available since C++14. + */ + template + using decay_t = typename std::decay::type; +#endif + + /** + * A fast and lightweight thread pool that uses C++11 threads. + */ + class task_thread_pool { + public: + /** + * Create a task_thread_pool and start worker threads. + * + * @param num_threads Number of worker threads. If 0 then number of threads is equal to the + * number of physical cores on the machine, as given by std::thread::hardware_concurrency(). + */ + explicit task_thread_pool(unsigned int num_threads = 0) { + if (num_threads < 1) { + num_threads = std::thread::hardware_concurrency(); + if (num_threads < 1) { num_threads = 1; } + } + start_threads(num_threads); + } + + /** + * Finish all tasks left in the queue then shut down worker threads. + * If the pool is currently paused then it is resumed. + */ + ~task_thread_pool() { + unpause(); + wait_for_queued_tasks(); + stop_all_threads(); + } + + /** + * Drop all tasks that have been submitted but not yet started by a worker. + * + * Tasks already in progress continue executing. + */ + void clear_task_queue() { + const std::lock_guard tasks_lock(task_mutex); + tasks = {}; + } + + /** + * Get number of enqueued tasks. + * + * @return Number of tasks that have been enqueued but not yet started. 
+ */ + TTP_NODISCARD size_t get_num_queued_tasks() const { + const std::lock_guard tasks_lock(task_mutex); + return tasks.size(); + } + + /** + * Get number of in-progress tasks. + * + * @return Approximate number of tasks currently being processed by worker threads. + */ + TTP_NODISCARD size_t get_num_running_tasks() const { + const std::lock_guard tasks_lock(task_mutex); + return num_inflight_tasks; + } + + /** + * Get total number of tasks in the pool. + * + * @return Approximate number of tasks both enqueued and running. + */ + TTP_NODISCARD size_t get_num_tasks() const { + const std::lock_guard tasks_lock(task_mutex); + return tasks.size() + num_inflight_tasks; + } + + /** + * Get number of worker threads. + * + * @return Number of worker threads. + */ + TTP_NODISCARD unsigned int get_num_threads() const { + const std::lock_guard threads_lock(thread_mutex); + return static_cast(threads.size()); + } + + /** + * Set number of worker threads. Will start or stop worker threads as necessary. + * + * @param num_threads Number of worker threads. If 0 then number of threads is equal to the + * number of physical cores on the machine, as given by std::thread::hardware_concurrency(). + * @return Previous number of worker threads. + */ + unsigned int set_num_threads(unsigned int num_threads) { + const std::lock_guard threads_lock(thread_mutex); + unsigned int previous_num_threads = get_num_threads(); + + if (num_threads < 1) { + num_threads = std::thread::hardware_concurrency(); + if (num_threads < 1) { num_threads = 1; } + } + + if (previous_num_threads <= num_threads) { + // expanding the thread pool + start_threads(num_threads - previous_num_threads); + } else { + // contracting the thread pool + stop_all_threads(); + { + const std::lock_guard tasks_lock(task_mutex); + pool_running = true; + } + start_threads(num_threads); + } + + return previous_num_threads; + } + + /** + * Stop executing queued tasks. Use `unpause()` to resume. 
Note: Destroying the pool will implicitly unpause. + * + * Any in-progress tasks continue executing. + */ + void pause() { + const std::lock_guard tasks_lock(task_mutex); + pool_paused = true; + } + + /** + * Resume executing queued tasks. + */ + void unpause() { + const std::lock_guard tasks_lock(task_mutex); + pool_paused = false; + task_cv.notify_all(); + } + + /** + * Check whether the pool is paused. + * + * @return true if pause() has been called without an intervening unpause(). + */ + TTP_NODISCARD bool is_paused() const { + const std::lock_guard tasks_lock(task_mutex); + return pool_paused; + } + + /** + * Submit a Callable for the pool to execute and return a std::future. + * + * @param func The Callable to execute. Can be a function, a lambda, std::packaged_task, std::function, etc. + * @param args Arguments for func. Optional. + * @return std::future that can be used to get func's return value or thrown exception. + */ + template , std::decay_t...> +#else + typename R = typename std::result_of(decay_t...)>::type +#endif + > + TTP_NODISCARD std::future submit(F&& func, A&&... args) { + std::shared_ptr> ptask = + std::make_shared>(std::bind(std::forward(func), std::forward(args)...)); + submit_detach([ptask] { (*ptask)(); }); + return ptask->get_future(); + } + + /** + * Submit a zero-argument Callable for the pool to execute. + * + * @param func The Callable to execute. Can be a function, a lambda, std::packaged_task, std::function, etc. + */ + template + void submit_detach(F&& func) { + const std::lock_guard tasks_lock(task_mutex); + tasks.emplace(std::forward(func)); + task_cv.notify_one(); + } + + /** + * Submit a Callable with arguments for the pool to execute. + * + * @param func The Callable to execute. Can be a function, a lambda, std::packaged_task, std::function, etc. + */ + template + void submit_detach(F&& func, A&&... 
args) { + const std::lock_guard tasks_lock(task_mutex); + tasks.emplace(std::bind(std::forward(func), std::forward(args)...)); + task_cv.notify_one(); + } + + /** + * Block until the task queue is empty. Some tasks may be in-progress when this method returns. + */ + void wait_for_queued_tasks() { + std::unique_lock tasks_lock(task_mutex); + notify_task_finish = true; + task_finished_cv.wait(tasks_lock, [&] { return tasks.empty(); }); + notify_task_finish = false; + } + + /** + * Block until all tasks have finished. + */ + void wait_for_tasks() { + std::unique_lock tasks_lock(task_mutex); + notify_task_finish = true; + task_finished_cv.wait(tasks_lock, [&] { return tasks.empty() && num_inflight_tasks == 0; }); + notify_task_finish = false; + } + + protected: + + /** + * Main function for worker threads. + */ + void worker_main() { + bool finished_task = false; + + while (true) { + std::unique_lock tasks_lock(task_mutex); + + if (finished_task) { + --num_inflight_tasks; + if (notify_task_finish) { + task_finished_cv.notify_all(); + } + } + + task_cv.wait(tasks_lock, [&]() { return !pool_running || (!pool_paused && !tasks.empty()); }); + + if (!pool_running) { + break; + } + + // Must mean that (!pool_paused && !tasks.empty()) is true + + std::packaged_task task{std::move(tasks.front())}; + tasks.pop(); + ++num_inflight_tasks; + tasks_lock.unlock(); + + try { + task(); + } catch (...) { + // std::packaged_task::operator() may throw in some error conditions, such as if the task + // had already been run. Nothing that the pool can do anything about. + } + + finished_task = true; + } + } + + /** + * Start worker threads. + * + * @param num_threads How many threads to start. + */ + void start_threads(const unsigned int num_threads) { + const std::lock_guard threads_lock(thread_mutex); + + for (unsigned int i = 0; i < num_threads; ++i) { + threads.emplace_back(&task_thread_pool::worker_main, this); + } + } + + /** + * Stop, join, and destroy all worker threads. 
+ */ + void stop_all_threads() { + const std::lock_guard threads_lock(thread_mutex); + + { + const std::lock_guard tasks_lock(task_mutex); + pool_running = false; + task_cv.notify_all(); + } + + for (auto& thread : threads) { + if (thread.joinable()) { + thread.join(); + } + } + threads.clear(); + } + + /** + * The worker threads. + * + * Access protected by thread_mutex + */ + std::vector threads; + + /** + * A mutex for methods that start/stop threads. + */ + mutable std::recursive_mutex thread_mutex; + + /** + * The task queue. + * + * Access protected by task_mutex. + */ + std::queue> tasks = {}; + + /** + * A mutex for all variables related to tasks. + */ + mutable std::mutex task_mutex; + + /** + * Used to notify changes to the task queue, such as a new task added, pause/unpause, etc. + */ + std::condition_variable task_cv; + + /** + * Used to notify of finished tasks. + */ + std::condition_variable task_finished_cv; + + /** + * A signal for worker threads that the pool is either running or shutting down. + * + * Access protected by task_mutex. + */ + bool pool_running = true; + + /** + * A signal for worker threads to not pull new tasks from the queue. + * + * Access protected by task_mutex. + */ + bool pool_paused = false; + + /** + * A signal for worker threads that they should notify task_finished_cv when they finish a task. + * + * Access protected by task_mutex. + */ + bool notify_task_finish = false; + + /** + * A counter of the number of tasks in-progress by worker threads. + * Incremented when a task is popped off the task queue and decremented when that task is complete. + * + * Access protected by task_mutex. + */ + int num_inflight_tasks = 0; + }; +} + +// clean up +#undef TTP_NODISCARD +#undef TTP_CXX17 + +#endif + +#ifndef POOLSTL_INTERNAL_UTILS_HPP +#define POOLSTL_INTERNAL_UTILS_HPP + +// Version macros. 
+#define POOLSTL_VERSION_MAJOR 0 +#define POOLSTL_VERSION_MINOR 3 +#define POOLSTL_VERSION_PATCH 1 + +#include +#include + +#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) +#define POOLSTL_HAVE_CXX17 1 +#define POOLSTL_NO_DISCARD [[nodiscard]] +#else +#define POOLSTL_HAVE_CXX17 0 +#define POOLSTL_NO_DISCARD +#endif + +#if POOLSTL_HAVE_CXX17 && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 9) +#define POOLSTL_HAVE_CXX17_LIB 1 +#else +#define POOLSTL_HAVE_CXX17_LIB 0 +#endif + +#if __cplusplus >= 201402L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L) +#define POOLSTL_HAVE_CXX14 1 +#else +#define POOLSTL_HAVE_CXX14 0 +#endif + +namespace poolstl { + namespace internal { + + inline constexpr std::size_t get_chunk_size(std::size_t num_steps, unsigned int num_threads) { + return (num_steps / num_threads) + ((num_steps % num_threads) > 0 ? 1 : 0); + } + + template + constexpr typename std::iterator_traits::difference_type + get_chunk_size(Iterator first, Iterator last, unsigned int num_threads) { + using diff_t = typename std::iterator_traits::difference_type; + return static_cast(get_chunk_size((std::size_t)std::distance(first, last), num_threads)); + } + + template + constexpr typename std::iterator_traits::difference_type + get_iter_chunk_size(const Iterator& iter, const Iterator& last, + typename std::iterator_traits::difference_type chunk_size) { + return std::min(chunk_size, std::distance(iter, last)); + } + + template + Iterator advanced(Iterator iter, typename std::iterator_traits::difference_type offset) { + Iterator ret = iter; + std::advance(ret, offset); + return ret; + } + + /** + * An iterator wrapper that calls std::future<>::get(). 
+ * @tparam Iterator + */ + template + class getting_iter : public Iterator { + public: + using value_type = decltype((*std::declval()).get()); + using difference_type = typename std::iterator_traits::difference_type; + using pointer = value_type*; + using reference = value_type&; + explicit getting_iter(Iterator iter) : iter(iter) {} + + getting_iter operator++() { ++iter; return *this; } + getting_iter operator++(int) { getting_iter ret(*this); ++iter; return ret; } + + value_type operator*() { return (*iter).get(); } + value_type operator[](difference_type offset) { return iter[offset].get(); } + + bool operator==(const getting_iter &other) const { return iter == other.iter; } + bool operator!=(const getting_iter &other) const { return iter != other.iter; } + + protected: + Iterator iter; + }; + + template + getting_iter get_wrap(Iterator iter) { + return getting_iter(iter); + } + + template + void get_futures(Container& futures) { + for (auto &future: futures) { + future.get(); + } + } + + /* + * Some methods are only available with C++17 and up. Reimplement on older standards. 
+ */ +#if POOLSTL_HAVE_CXX17_LIB + namespace cpp17 = std; +#else + namespace cpp17 { + + // std::reduce + + template + Tp reduce(InputIt first, InputIt last, Tp init, BinOp b) { + for (; first != last; ++first) + init = b(init, *first); + return init; + } + + template + typename std::iterator_traits::value_type reduce(InputIt first, InputIt last) { + return reduce(first, last, + typename std::iterator_traits::value_type{}, + std::plus::value_type>()); + } + + // std::transform + + template + OutputIt transform(InputIt first1, InputIt last1, OutputIt d_first, + UnaryOperation unary_op) { + while (first1 != last1) { + *d_first++ = unary_op(*first1++); + } + + return d_first; + } + + template + OutputIt transform(InputIt1 first1, InputIt1 last1, + InputIt2 first2, OutputIt d_first, + BinaryOperation binary_op) { + while (first1 != last1) { + *d_first++ = binary_op(*first1++, *first2++); + } + + return d_first; + } + } +#endif + } +} + +#endif + +#if POOLSTL_HAVE_CXX17 +#include +#endif + +namespace poolstl { + + namespace ttp = task_thread_pool; + + namespace execution { + namespace internal { + /** + * Holds the thread pool used by par. + */ + inline std::shared_ptr get_default_pool() { + static std::shared_ptr pool; + static std::once_flag flag; + std::call_once(flag, [&](){ pool = std::make_shared(); }); + return pool; + } + } + + /** + * A sequential policy that simply forwards to the non-policy overload. + */ + struct sequenced_policy {}; + + /** + * A parallel policy that can use a user-specified thread pool or a default one. 
+ */ + struct parallel_policy { + parallel_policy() = default; + explicit parallel_policy(ttp::task_thread_pool& on_pool): on_pool(&on_pool) {} + + parallel_policy on(ttp::task_thread_pool& pool) const { + return parallel_policy{pool}; + } + + POOLSTL_NO_DISCARD ttp::task_thread_pool& pool() const { + if (on_pool) { + return *on_pool; + } else { + return *(internal::get_default_pool()); + } + } + + protected: + ttp::task_thread_pool *on_pool = nullptr; + }; + + constexpr sequenced_policy seq{}; + constexpr parallel_policy par{}; + + +#if POOLSTL_HAVE_CXX17 + /** + * A policy that allows selecting a policy at runtime. + * + * @tparam Variant std::variant<> of policy options. + */ + template + struct variant_policy { + explicit variant_policy(const Variant& policy): var(policy) {} + Variant var; + }; + + namespace internal { + using poolstl_policy_variant = std::variant< + poolstl::execution::parallel_policy, + poolstl::execution::sequenced_policy>; + } + + /** + * Choose parallel or sequential at runtime. + * + * @param call_par Whether to use a parallel policy. + * @return `par` if call_par is true, else `seq`. + */ + inline variant_policy par_if(bool call_par) { + if (call_par) { + return variant_policy(internal::poolstl_policy_variant(par)); + } else { + return variant_policy(internal::poolstl_policy_variant(seq)); + } + } + + /** + * Choose parallel or sequential at runtime, with pool selection. + * + * @param call_par Whether to use a parallel policy. + * @return `par.on(pool)` if call_par is true, else `seq`. 
+ */ + inline variant_policy par_if(bool call_par, ttp::task_thread_pool& pool) { + if (call_par) { + return variant_policy(internal::poolstl_policy_variant(par.on(pool))); + } else { + return variant_policy(internal::poolstl_policy_variant(seq)); + } + } +#endif + } + + using execution::seq; + using execution::par; +#if POOLSTL_HAVE_CXX17 + using execution::variant_policy; + using execution::par_if; +#endif + + namespace internal { + /** + * To enable/disable seq overload resolution + */ + template + using enable_if_seq = + typename std::enable_if< + std::is_same::type>::type>::value, + Tp>::type; + + /** + * To enable/disable par overload resolution + */ + template + using enable_if_par = + typename std::enable_if< + std::is_same::type>::type>::value, + Tp>::type; + +#if POOLSTL_HAVE_CXX17 + /** + * Helper for enable_if_poolstl_variant + */ + template struct is_poolstl_variant_policy : std::false_type {}; + template struct is_poolstl_variant_policy< + ::poolstl::execution::variant_policy> :std::true_type {}; + + /** + * To enable/disable variant_policy (for par_if) overload resolution + */ + template + using enable_if_poolstl_variant = + typename std::enable_if< + is_poolstl_variant_policy< + typename std::remove_cv::type>::type>::value, + Tp>::type; +#endif + } +} + +#endif + +#ifndef POOLSTL_ALGORITHM_HPP +#define POOLSTL_ALGORITHM_HPP + +#include + + +#ifndef POOLSTL_INTERNAL_TTP_IMPL_HPP +#define POOLSTL_INTERNAL_TTP_IMPL_HPP + +#include +#include +#include +#include + + +namespace poolstl { + namespace internal { + +#if POOLSTL_HAVE_CXX17_LIB + /** + * Call std::apply in parallel. + */ + template + std::vector> + parallel_apply(ExecPolicy &&policy, Op op, const ArgContainer& args_list) { + std::vector> futures; + auto& task_pool = policy.pool(); + + for (const auto& args : args_list) { + futures.emplace_back(task_pool.submit([op](const auto& args_fwd) { std::apply(op, args_fwd); }, args)); + } + + return futures; + } +#endif + + /** + * Chunk a single range. 
+ */ + template + std::vector()(std::declval(), std::declval()))>> + parallel_chunk_for(ExecPolicy &&policy, RandIt first, RandIt last, Chunk chunk, int extra_split_factor = 1) { + std::vector()(std::declval(), std::declval())) + >> futures; + auto& task_pool = policy.pool(); + auto chunk_size = get_chunk_size(first, last, extra_split_factor * task_pool.get_num_threads()); + + while (first < last) { + auto iter_chunk_size = get_iter_chunk_size(first, last, chunk_size); + RandIt loop_end = advanced(first, iter_chunk_size); + + futures.emplace_back(task_pool.submit(chunk, first, loop_end)); + + first = loop_end; + } + + return futures; + } + + /** + * Element-wise chunk two ranges. + */ + template + std::vector()( + std::declval(), + std::declval(), + std::declval()))>> + parallel_chunk_for(ExecPolicy &&policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, Chunk chunk) { + std::vector()( + std::declval(), + std::declval(), + std::declval())) + >> futures; + auto& task_pool = policy.pool(); + auto chunk_size = get_chunk_size(first1, last1, task_pool.get_num_threads()); + + while (first1 < last1) { + auto iter_chunk_size = get_iter_chunk_size(first1, last1, chunk_size); + RandIt1 loop_end = advanced(first1, iter_chunk_size); + + futures.emplace_back(task_pool.submit(chunk, first1, loop_end, first2)); + + first1 = loop_end; + std::advance(first2, iter_chunk_size); + } + + return futures; + } + + /** + * Element-wise chunk three ranges. 
+ */ + template + std::vector()( + std::declval(), + std::declval(), + std::declval(), + std::declval()))>> + parallel_chunk_for(ExecPolicy &&policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, RandIt3 first3, + Chunk chunk) { + std::vector()( + std::declval(), + std::declval(), + std::declval(), + std::declval())) + >> futures; + auto& task_pool = policy.pool(); + auto chunk_size = get_chunk_size(first1, last1, task_pool.get_num_threads()); + + while (first1 < last1) { + auto iter_chunk_size = get_iter_chunk_size(first1, last1, chunk_size); + RandIt1 loop_end = advanced(first1, iter_chunk_size); + + futures.emplace_back(task_pool.submit(chunk, first1, loop_end, first2, first3)); + + first1 = loop_end; + std::advance(first2, iter_chunk_size); + std::advance(first3, iter_chunk_size); + } + + return futures; + } + + /** + * Sort a range in parallel. + * + * @param stable Whether to use std::stable_sort or std::sort + */ + template + void parallel_sort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp, bool stable) { + if (first == last) { + return; + } + + // Sort chunks in parallel + auto futures = parallel_chunk_for(std::forward(policy), first, last, + [&comp, stable] (RandIt chunk_first, RandIt chunk_last) { + if (stable) { + std::stable_sort(chunk_first, chunk_last, comp); + } else { + std::sort(chunk_first, chunk_last, comp); + } + return std::make_pair(chunk_first, chunk_last); + }); + + // Merge the sorted ranges + using SortedRange = std::pair; + auto& task_pool = policy.pool(); + std::vector subranges; + do { + for (auto& future : futures) { + subranges.emplace_back(future.get()); + } + futures.clear(); + + for (std::size_t i = 0; i < subranges.size(); ++i) { + if (i + 1 < subranges.size()) { + // pair up and merge + auto& lhs = subranges[i]; + auto& rhs = subranges[i + 1]; + futures.emplace_back(task_pool.submit([&comp] (RandIt chunk_first, RandIt chunk_middle, + RandIt chunk_last) { + std::inplace_merge(chunk_first, chunk_middle, chunk_last, 
comp); + return std::make_pair(chunk_first, chunk_last); + }, lhs.first, lhs.second, rhs.second)); + ++i; + } else { + // forward the final extra range + std::promise p; + futures.emplace_back(p.get_future()); + p.set_value(subranges[i]); + } + } + + subranges.clear(); + } while (futures.size() > 1); + futures.front().get(); + } + } +} + +#endif + +namespace std { + + /** + * NOTE: Iterators are expected to be random access. + * See std::copy https://en.cppreference.com/w/cpp/algorithm/copy + */ + template + poolstl::internal::enable_if_par + copy(ExecPolicy &&policy, RandIt1 first, RandIt1 last, RandIt2 dest) { + auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first, last, dest, + [](RandIt1 chunk_first, RandIt1 chunk_last, RandIt2 chunk_dest) { + std::copy(chunk_first, chunk_last, chunk_dest); + }); + poolstl::internal::get_futures(futures); + return poolstl::internal::advanced(dest, std::distance(first, last)); + } + + /** + * NOTE: Iterators are expected to be random access. + * See std::copy_n https://en.cppreference.com/w/cpp/algorithm/copy_n + */ + template + poolstl::internal::enable_if_par + copy_n(ExecPolicy &&policy, RandIt1 first, Size n, RandIt2 dest) { + if (n <= 0) { + return dest; + } + RandIt1 last = poolstl::internal::advanced(first, n); + std::copy(std::forward(policy), first, last, dest); + return poolstl::internal::advanced(dest, n); + } + + /** + * NOTE: Iterators are expected to be random access. 
+ * See std::count_if https://en.cppreference.com/w/cpp/algorithm/count_if + */ + template + poolstl::internal::enable_if_par::difference_type> + count_if(ExecPolicy&& policy, RandIt first, RandIt last, UnaryPredicate p) { + using T = typename iterator_traits::difference_type; + + auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first, last, + [&p](RandIt chunk_first, RandIt chunk_last) { + return std::count_if(chunk_first, chunk_last, p); + }); + + return poolstl::internal::cpp17::reduce( + poolstl::internal::get_wrap(futures.begin()), + poolstl::internal::get_wrap(futures.end()), (T)0, std::plus()); + } + + /** + * NOTE: Iterators are expected to be random access. + * See std::count https://en.cppreference.com/w/cpp/algorithm/count + */ + template + poolstl::internal::enable_if_par::difference_type> + count(ExecPolicy&& policy, RandIt first, RandIt last, const T& value) { + return std::count_if(std::forward(policy), first, last, + [&value](const T& test) { return test == value; }); + } + + /** + * NOTE: Iterators are expected to be random access. + * See std::fill https://en.cppreference.com/w/cpp/algorithm/fill + */ + template + poolstl::internal::enable_if_par + fill(ExecPolicy &&policy, RandIt first, RandIt last, const Tp& value) { + auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first, last, + [&value](RandIt chunk_first, RandIt chunk_last) { + std::fill(chunk_first, chunk_last, value); + }); + poolstl::internal::get_futures(futures); + } + + /** + * NOTE: Iterators are expected to be random access. 
+ * See std::fill_n https://en.cppreference.com/w/cpp/algorithm/fill_n + */ + template + poolstl::internal::enable_if_par + fill_n(ExecPolicy &&policy, RandIt first, Size n, const Tp& value) { + if (n <= 0) { + return first; + } + RandIt last = poolstl::internal::advanced(first, n); + std::fill(std::forward(policy), first, last, value); + return last; + } + + /** + * NOTE: Iterators are expected to be random access. + * See std::find_if https://en.cppreference.com/w/cpp/algorithm/find_if + */ + template + poolstl::internal::enable_if_par + find_if(ExecPolicy &&policy, RandIt first, RandIt last, UnaryPredicate p) { + using diff_t = typename std::iterator_traits::difference_type; + diff_t n = std::distance(first, last); + std::atomic extremum(n); + + auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first, last, + [&first, &extremum, &p](RandIt chunk_first, RandIt chunk_last) { + if (std::distance(first, chunk_first) > extremum) { + // already found by another task + return; + } + + RandIt chunk_res = std::find_if(chunk_first, chunk_last, p); + if (chunk_res != chunk_last) { + // Found, update exremum using a priority update CAS, as discussed in + // "Reducing Contention Through Priority Updates", PPoPP '13 + const diff_t k = std::distance(first, chunk_res); + for (diff_t old = extremum; k < old; old = extremum) { + extremum.compare_exchange_weak(old, k); + } + } + }, 8); // use small tasks so later ones may exit early if item is already found + poolstl::internal::get_futures(futures); + return extremum == n ? last : first + extremum; + } + + /** + * NOTE: Iterators are expected to be random access. 
+ * See std::find_if_not https://en.cppreference.com/w/cpp/algorithm/find_if_not + */ + template + poolstl::internal::enable_if_par + find_if_not(ExecPolicy &&policy, RandIt first, RandIt last, UnaryPredicate p) { + return std::find_if(std::forward(policy), first, last, + [&p](const typename std::iterator_traits::value_type& test) { return !p(test); }); + } + + /** + * NOTE: Iterators are expected to be random access. + * See std::find https://en.cppreference.com/w/cpp/algorithm/find + */ + template + poolstl::internal::enable_if_par + find(ExecPolicy &&policy, RandIt first, RandIt last, const T& value) { + return std::find_if(std::forward(policy), first, last, + [&value](const T& test) { return value == test; }); + } + + /** + * NOTE: Iterators are expected to be random access. + * See std::for_each https://en.cppreference.com/w/cpp/algorithm/for_each + */ + template + poolstl::internal::enable_if_par + for_each(ExecPolicy &&policy, RandIt first, RandIt last, UnaryFunction f) { + auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first, last, + [&f](RandIt chunk_first, RandIt chunk_last) { + // std::for_each(chunk_first, chunk_last, f); + for (; chunk_first != chunk_last; ++chunk_first) { + f(*chunk_first); + } + }); + poolstl::internal::get_futures(futures); + } + + /** + * NOTE: Iterators are expected to be random access. + * See std::for_each_n https://en.cppreference.com/w/cpp/algorithm/for_each_n + */ + template + poolstl::internal::enable_if_par + for_each_n(ExecPolicy &&policy, RandIt first, Size n, UnaryFunction f) { + RandIt last = poolstl::internal::advanced(first, n); + std::for_each(std::forward(policy), first, last, f); + return last; + } + + /** + * NOTE: Iterators are expected to be random access. 
+ * See std::sort https://en.cppreference.com/w/cpp/algorithm/sort + */ + template + poolstl::internal::enable_if_par + sort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp) { + poolstl::internal::parallel_sort(std::forward(policy), first, last, comp, false); + } + + /** + * NOTE: Iterators are expected to be random access. + * See std::sort https://en.cppreference.com/w/cpp/algorithm/sort + */ + template + poolstl::internal::enable_if_par + sort(ExecPolicy &&policy, RandIt first, RandIt last) { + using T = typename std::iterator_traits::value_type; + poolstl::internal::parallel_sort(std::forward(policy), first, last, std::less(), false); + } + + /** + * NOTE: Iterators are expected to be random access. + * See std::stable_sort https://en.cppreference.com/w/cpp/algorithm/stable_sort + */ + template + poolstl::internal::enable_if_par + stable_sort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp) { + poolstl::internal::parallel_sort(std::forward(policy), first, last, comp, true); + } + + /** + * NOTE: Iterators are expected to be random access. + * See std::stable_sort https://en.cppreference.com/w/cpp/algorithm/stable_sort + */ + template + poolstl::internal::enable_if_par + stable_sort(ExecPolicy &&policy, RandIt first, RandIt last) { + using T = typename std::iterator_traits::value_type; + poolstl::internal::parallel_sort(std::forward(policy), first, last, std::less(), true); + } + + /** + * NOTE: Iterators are expected to be random access. 
+ * See std::transform https://en.cppreference.com/w/cpp/algorithm/transform + */ + template + poolstl::internal::enable_if_par + transform(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1, + RandIt2 dest, UnaryOperation unary_op) { + + auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first1, last1, dest, + [&unary_op](RandIt1 chunk_first1, RandIt1 chunk_last1, RandIt2 dest_first) { + return poolstl::internal::cpp17::transform(chunk_first1, chunk_last1, dest_first, unary_op); + }); + poolstl::internal::get_futures(futures); + return dest + std::distance(first1, last1); + } + + /** + * NOTE: Iterators are expected to be random access. + * See std::transform https://en.cppreference.com/w/cpp/algorithm/transform + */ + template + poolstl::internal::enable_if_par + transform(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1, + RandIt2 first2, RandIt3 dest, BinaryOperation binary_op) { + + auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first1, last1, + first2, dest, + [&binary_op](RandIt1 chunk_first1, RandIt1 chunk_last1, RandIt1 chunk_first2, RandIt3 dest_first) { + return poolstl::internal::cpp17::transform(chunk_first1, chunk_last1, + chunk_first2, dest_first, binary_op); + }); + poolstl::internal::get_futures(futures); + return dest + std::distance(first1, last1); + } + + /** + * NOTE: Iterators are expected to be random access. + * See std::all_of https://en.cppreference.com/w/cpp/algorithm/all_of + */ + template + poolstl::internal::enable_if_par + all_of(ExecPolicy&& policy, RandIt first, RandIt last, Predicate pred) { + return last == std::find_if_not(std::forward(policy), first, last, pred); + } + + /** + * NOTE: Iterators are expected to be random access. 
+ * See std::none_of https://en.cppreference.com/w/cpp/algorithm/none_of + */ + template + poolstl::internal::enable_if_par + none_of(ExecPolicy&& policy, RandIt first, RandIt last, Predicate pred) { + return last == std::find_if(std::forward(policy), first, last, pred); + } + + /** + * NOTE: Iterators are expected to be random access. + * See std::any_of https://en.cppreference.com/w/cpp/algorithm/any_of + */ + template + poolstl::internal::enable_if_par + any_of(ExecPolicy&& policy, RandIt first, RandIt last, Predicate pred) { + return !std::none_of(std::forward(policy), first, last, pred); + } +} + +namespace poolstl { + + template + void for_each_chunk(RandIt first, RandIt last, ChunkConstructor construct, UnaryFunction f) { + if (first == last) { + return; + } + + auto chunk_data = construct(); + for (; first != last; ++first) { + f(*first, chunk_data); + } + } + + /** + * NOTE: Iterators are expected to be random access. + * + * Like `std::for_each`, but exposes the chunking. The `construct` method is called once per parallel chunk and + * its output is passed to `f`. + * + * Useful for cases where an expensive workspace can be shared between loop iterations + * but cannot be shared by all parallel iterations. + */ + template + poolstl::internal::enable_if_par + for_each_chunk(ExecPolicy&& policy, RandIt first, RandIt last, ChunkConstructor construct, UnaryFunction f) { + auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first, last, + [&construct, &f](RandIt chunk_first, RandIt chunk_last) { + for_each_chunk(chunk_first, chunk_last, construct, f); + }); + poolstl::internal::get_futures(futures); + } +} + +#endif + +#ifndef POOLSTL_NUMERIC_HPP +#define POOLSTL_NUMERIC_HPP + +#include + + +namespace std { + +#if POOLSTL_HAVE_CXX17_LIB + /** + * NOTE: Iterators are expected to be random access. 
+ * See std::exclusive_scan https://en.cppreference.com/w/cpp/algorithm/exclusive_scan + */ + template + poolstl::internal::enable_if_par + exclusive_scan(ExecPolicy &&policy, RandIt1 first, RandIt1 last, RandIt2 dest, T init, BinaryOp binop) { + if (first == last) { + return dest; + } + + // Pass 1: Chunk the input and find the sum of each chunk + auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first, last, + [binop](RandIt1 chunk_first, RandIt1 chunk_last) { + auto sum = std::accumulate(chunk_first, chunk_last, T{}, binop); + return std::make_tuple(std::make_pair(chunk_first, chunk_last), sum); + }); + + std::vector> ranges; + std::vector sums; + + for (auto& future : futures) { + auto res = future.get(); + ranges.push_back(std::get<0>(res)); + sums.push_back(std::get<1>(res)); + } + + // find initial values for each range + std::exclusive_scan(sums.begin(), sums.end(), sums.begin(), init, binop); + + // Pass 2: perform exclusive scan of each chunk, using the sum of previous chunks as init + std::vector> args; + for (std::size_t i = 0; i < sums.size(); ++i) { + auto chunk_first = std::get<0>(ranges[i]); + args.emplace_back(std::make_tuple( + chunk_first, std::get<1>(ranges[i]), + dest + (chunk_first - first), + sums[i])); + } + + auto futures2 = poolstl::internal::parallel_apply(std::forward(policy), + [binop](RandIt1 chunk_first, RandIt1 chunk_last, RandIt2 chunk_dest, T chunk_init){ + std::exclusive_scan(chunk_first, chunk_last, chunk_dest, chunk_init, binop); + }, args); + + poolstl::internal::get_futures(futures2); + return dest + (last - first); + } + + /** + * NOTE: Iterators are expected to be random access. 
+ * See std::exclusive_scan https://en.cppreference.com/w/cpp/algorithm/exclusive_scan + */ + template + poolstl::internal::enable_if_par + exclusive_scan(ExecPolicy &&policy, RandIt1 first, RandIt1 last, RandIt2 dest, T init) { + return std::exclusive_scan(std::forward(policy), first, last, dest, init, std::plus()); + } +#endif + + /** + * NOTE: Iterators are expected to be random access. + * See std::reduce https://en.cppreference.com/w/cpp/algorithm/reduce + */ + template + poolstl::internal::enable_if_par + reduce(ExecPolicy &&policy, RandIt first, RandIt last, T init, BinaryOp binop) { + auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first, last, + [init, binop](RandIt chunk_first, RandIt chunk_last) { + return poolstl::internal::cpp17::reduce(chunk_first, chunk_last, init, binop); + }); + + return poolstl::internal::cpp17::reduce( + poolstl::internal::get_wrap(futures.begin()), + poolstl::internal::get_wrap(futures.end()), init, binop); + } + + /** + * NOTE: Iterators are expected to be random access. + * See std::reduce https://en.cppreference.com/w/cpp/algorithm/reduce + */ + template + poolstl::internal::enable_if_par + reduce(ExecPolicy &&policy, RandIt first, RandIt last, T init) { + return std::reduce(std::forward(policy), first, last, init, std::plus()); + } + + /** + * NOTE: Iterators are expected to be random access. + * See std::reduce https://en.cppreference.com/w/cpp/algorithm/reduce + */ + template + poolstl::internal::enable_if_par< + ExecPolicy, typename std::iterator_traits::value_type> + reduce(ExecPolicy &&policy, RandIt first, RandIt last) { + return std::reduce(std::forward(policy), first, last, + typename std::iterator_traits::value_type{}); + } + +#if POOLSTL_HAVE_CXX17_LIB + /** + * NOTE: Iterators are expected to be random access. 
+ * See std::transform_reduce https://en.cppreference.com/w/cpp/algorithm/transform_reduce + */ + template + poolstl::internal::enable_if_par + transform_reduce(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1, T init, + BinaryReductionOp reduce_op, UnaryTransformOp transform_op) { + + auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first1, last1, + [&init, &reduce_op, &transform_op](RandIt1 chunk_first1, RandIt1 chunk_last1) { + return std::transform_reduce(chunk_first1, chunk_last1, init, reduce_op, transform_op); + }); + + return poolstl::internal::cpp17::reduce( + poolstl::internal::get_wrap(futures.begin()), + poolstl::internal::get_wrap(futures.end()), init, reduce_op); + } + + /** + * NOTE: Iterators are expected to be random access. + * See std::transform_reduce https://en.cppreference.com/w/cpp/algorithm/transform_reduce + */ + template + poolstl::internal::enable_if_par + transform_reduce(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, T init, + BinaryReductionOp reduce_op, BinaryTransformOp transform_op) { + + auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first1, last1, first2, + [&init, &reduce_op, &transform_op](RandIt1 chunk_first1, RandIt1 chunk_last1, RandIt2 chunk_first2) { + return std::transform_reduce(chunk_first1, chunk_last1, chunk_first2, init, reduce_op, transform_op); + }); + + return poolstl::internal::cpp17::reduce( + poolstl::internal::get_wrap(futures.begin()), + poolstl::internal::get_wrap(futures.end()), init, reduce_op); + } + + /** + * NOTE: Iterators are expected to be random access. 
+ * See std::transform_reduce https://en.cppreference.com/w/cpp/algorithm/transform_reduce + */ + template< class ExecPolicy, class RandIt1, class RandIt2, class T > + poolstl::internal::enable_if_par + transform_reduce(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, T init ) { + return transform_reduce(std::forward(policy), + first1, last1, first2, init, std::plus<>(), std::multiplies<>()); + } +#endif + +} + +#endif + +#ifndef POOLSTL_SEQ_FWD_HPP +#define POOLSTL_SEQ_FWD_HPP + + +/* + * Forward poolstl::seq to the native sequential (no policy) method. + */ + +#define POOLSTL_DEFINE_SEQ_FWD(NS, FNAME) \ + template \ + auto FNAME(EP&&, ARGS&&...args) -> \ + poolstl::internal::enable_if_seq(args)...))> { \ + return NS::FNAME(std::forward(args)...); \ + } + +#define POOLSTL_DEFINE_SEQ_FWD_VOID(NS, FNAME) \ + template \ + poolstl::internal::enable_if_seq FNAME(EP&&, ARGS&&... args) { \ + NS::FNAME(std::forward(args)...); \ + } + +#if POOLSTL_HAVE_CXX17 + +/* + * Dynamically choose policy from a std::variant. + * Useful to choose between parallel and sequential policies at runtime via par_if. + */ + +#define POOLSTL_DEFINE_PAR_IF_FWD_VOID(NS, FNAME) \ + template \ + poolstl::internal::enable_if_poolstl_variant FNAME(EP&& policy, ARGS&&...args) { \ + std::visit([&](auto&& pol) { NS::FNAME(pol, std::forward(args)...); }, policy.var); \ + } + +#define POOLSTL_DEFINE_PAR_IF_FWD(NS, FNAME) \ + template \ + auto FNAME(EP&& policy, ARGS&&...args) -> \ + poolstl::internal::enable_if_poolstl_variant(args)...))> { \ + return std::visit([&](auto&& pol) { return NS::FNAME(pol, std::forward(args)...); }, policy.var); \ + } + +#else +#define POOLSTL_DEFINE_PAR_IF_FWD_VOID(NS, FNAME) +#define POOLSTL_DEFINE_PAR_IF_FWD(NS, FNAME) +#endif +/* + * Define both the sequential forward and dynamic chooser. 
+ */ +#define POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(NS, FNAME) \ + POOLSTL_DEFINE_SEQ_FWD(NS, FNAME) \ + POOLSTL_DEFINE_PAR_IF_FWD(NS, FNAME) + +#define POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF_VOID(NS, FNAME) \ + POOLSTL_DEFINE_SEQ_FWD_VOID(NS, FNAME) \ + POOLSTL_DEFINE_PAR_IF_FWD_VOID(NS, FNAME) + +namespace std { + POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, all_of) + POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, any_of) + POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, none_of) + + POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, count) + POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, count_if) + + POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, copy) + POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, copy_n) + + POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF_VOID(std, fill) + POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, fill_n) + + POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, find) + POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, find_if) + POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, find_if_not) + + POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF_VOID(std, for_each) +#if POOLSTL_HAVE_CXX17_LIB + POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, for_each_n) +#endif + + POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, transform) + +#if POOLSTL_HAVE_CXX17_LIB + POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, exclusive_scan) + POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, reduce) + POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, transform_reduce) +#endif +} + +namespace poolstl { + POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF_VOID(poolstl, for_each_chunk) +} + +#endif + +// Note that iota_iter.hpp is self-contained in its own right. + +#ifndef POOLSTL_IOTA_ITER_HPP +#define POOLSTL_IOTA_ITER_HPP + +#include +#include + +namespace poolstl { + + /** + * An iterator over the integers. + * + * Effectively a view on a fictional vector populated by std::iota, but without materializing anything. 
+ * + * Useful to parallelize loops that are not over a container, like this: + * + * \code{.cpp} + * for (int i = 0; i < 10; ++i) { + * } + *\endcode + * + * Becomes: + * \code{.cpp} + * std::for_each(iota_iter(0), iota_iter(10), [](int i) { + * }); + * \endcode + * + * @tparam T A type that acts as an integer. + */ + template + class iota_iter { + public: + using value_type = T; + using difference_type = std::ptrdiff_t; + using pointer = T *; + using reference = T; + using iterator_category = std::random_access_iterator_tag; + + iota_iter() : value{} {} + explicit iota_iter(T rhs) : value(rhs) {} + iota_iter(const iota_iter &rhs) : value(rhs.value) {} + + iota_iter &operator=(T rhs) { value = rhs; return *this; } + iota_iter &operator=(const iota_iter &rhs) { value = rhs.value; return *this; } + + reference operator*() const { return value; } + reference operator[](difference_type rhs) const { return value + rhs; } + // operator-> has no meaning in this application + + bool operator==(const iota_iter &rhs) const { return value == rhs.value; } + bool operator!=(const iota_iter &rhs) const { return value != rhs.value; } + bool operator<(const iota_iter &rhs) const { return value < rhs.value; } + bool operator>(const iota_iter &rhs) const { return value > rhs.value; } + bool operator<=(const iota_iter &rhs) const { return value <= rhs.value; } + bool operator>=(const iota_iter &rhs) const { return value >= rhs.value; } + + iota_iter &operator+=(difference_type rhs) { value += rhs; return *this; } + iota_iter &operator-=(difference_type rhs) { value -= rhs; return *this; } + + iota_iter &operator++() { ++value; return *this; } + iota_iter &operator--() { --value; return *this; } + iota_iter operator++(int) { iota_iter ret(value); ++value; return ret; } + iota_iter operator--(int) { iota_iter ret(value); --value; return ret; } + + difference_type operator-(const iota_iter &rhs) const { return value - rhs.value; } + iota_iter operator-(difference_type rhs) const { 
return iota_iter(value - rhs); } + iota_iter operator+(difference_type rhs) const { return iota_iter(value + rhs); } + + friend inline iota_iter operator+(difference_type lhs, const iota_iter &rhs) { + return iota_iter(lhs + rhs.value); + } + + protected: + T value; + }; +} + +namespace std { + /** + * Specialize std::iterator_traits for poolstl::iota_iter. + */ + template + struct iterator_traits> { + using value_type = typename poolstl::iota_iter::value_type; + using difference_type = typename poolstl::iota_iter::difference_type; + using pointer = typename poolstl::iota_iter::pointer; + using reference = typename poolstl::iota_iter::reference; + using iterator_category = typename poolstl::iota_iter::iterator_category; + }; +} + +#endif + +/* + * Optionally alias `poolstl::par` as `std::execution::par` to enable poolSTL to fill in for missing compiler support. + * + * USE AT YOUR OWN RISK! + * + * To use this define POOLSTL_STD_SUPPLEMENT=1 before including poolstl.hpp. + * + * Attempts to autodetect native support by checking for , including it if it exists, and then checking for + * the __cpp_lib_parallel_algorithm feature macro. + * + * If native support is not found then the standard execution policies are declared as forwards to poolSTL. + * + * GCC and Clang: TBB is required if is #included. If you'd like to use the poolSTL supplement in cases + * that TBB is not available, have your build system define POOLSTL_STD_SUPPLEMENT_NO_INCLUDE if TBB is not found. + * PoolSTL will then not include and the supplement will kick in. + * Your code must not #include . + * + * MinGW: the compiler declares support, but actual performance is sequential (see poolSTL benchmark). To use + * the supplement anyway define POOLSTL_STD_SUPPLEMENT_FORCE to override the autodetection. + * Your code must not #include . + * + * Define POOLSTL_ALLOW_SUPPLEMENT=0 to override POOLSTL_STD_SUPPLEMENT and disable this feature. 
+ */ +#ifndef POOLSTL_ALLOW_SUPPLEMENT +#define POOLSTL_ALLOW_SUPPLEMENT 1 +#endif + +#if POOLSTL_ALLOW_SUPPLEMENT && defined(POOLSTL_STD_SUPPLEMENT) + +#if __cplusplus >= 201603L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201603L) +#if __has_include() +#ifndef POOLSTL_STD_SUPPLEMENT_NO_INCLUDE +#endif +#endif +#endif + +#if !defined(__cpp_lib_parallel_algorithm) || defined(POOLSTL_STD_SUPPLEMENT_FORCE) +namespace std { + namespace execution { + using ::poolstl::execution::sequenced_policy; + using ::poolstl::execution::seq; + using ::poolstl::execution::parallel_policy; + using ::poolstl::execution::par; + using parallel_unsequenced_policy = ::poolstl::execution::parallel_policy; + constexpr parallel_unsequenced_policy par_unseq{}; + } +} + +#endif +#endif + +#endif diff --git a/src/index.cpp b/src/index.cpp index 7773e509..9b907257 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -11,6 +11,7 @@ #include #include #include "pdqsort/pdqsort.h" +#include "poolstl/poolstl.hpp" #include #include #include @@ -138,7 +139,7 @@ int StrobemerIndex::pick_bits(size_t size) const { return std::clamp(static_cast(log2(estimated_number_of_randstrobes)) - 1, 8, 31); } -void StrobemerIndex::populate(float f, size_t n_threads) { +void StrobemerIndex::populate(float f, unsigned n_threads) { Timer count_hash; auto randstrobe_counts = count_all_randstrobes(references, parameters, n_threads); stats.elapsed_counting_hashes = count_hash.duration(); @@ -164,8 +165,13 @@ void StrobemerIndex::populate(float f, size_t n_threads) { Timer sorting_timer; logger.debug() << " Sorting ...\n"; - // sort by hash values - pdqsort_branchless(randstrobes.begin(), randstrobes.end()); + if (true) { + task_thread_pool::task_thread_pool pool{n_threads}; + std::sort(poolstl::par.on(pool), randstrobes.begin(), randstrobes.end()); + } else { + // sort by hash values + pdqsort_branchless(randstrobes.begin(), randstrobes.end()); + } stats.elapsed_sorting_seeds = sorting_timer.duration(); Timer hash_index_timer; 
diff --git a/src/index.hpp b/src/index.hpp index 941db7de..a6b9f003 100644 --- a/src/index.hpp +++ b/src/index.hpp @@ -51,7 +51,7 @@ struct StrobemerIndex { void write(const std::string& filename) const; void read(const std::string& filename); - void populate(float f, size_t n_threads); + void populate(float f, unsigned n_threads); void print_diagnostics(const std::string& logfile_name, int k) const; int pick_bits(size_t size) const; size_t find(randstrobe_hash_t key) const { From 2012420a4741685712baa49990065c707ef948ed Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Sun, 14 Jan 2024 16:00:48 +0100 Subject: [PATCH 11/32] Bump poolSTL and use poolstl::pluggable_sort --- ext/README.md | 6 + ext/poolstl/poolstl.hpp | 1476 +++++++++++++++++++++++++++++++-------- src/index.cpp | 9 +- 3 files changed, 1179 insertions(+), 312 deletions(-) diff --git a/ext/README.md b/ext/README.md index e8316d26..d80e5a2d 100644 --- a/ext/README.md +++ b/ext/README.md @@ -27,6 +27,12 @@ Homepage: https://github.com/orlp/pdqsort Commit used: b1ef26a55cdb60d236a5cb199c4234c704f46726 License: See pdqsort/license.txt +## poolstl + +Homepage: https://github.com/alugowski/poolSTL/ +Downloaded file: https://github.com/alugowski/poolSTL/releases/download/v0.3.3/poolstl.hpp +Version: 0.3.3 +License: See poolstl.hpp ## robin_hood diff --git a/ext/poolstl/poolstl.hpp b/ext/poolstl/poolstl.hpp index ea79146e..d1340a1e 100644 --- a/ext/poolstl/poolstl.hpp +++ b/ext/poolstl/poolstl.hpp @@ -84,6 +84,7 @@ * DEALINGS IN THE SOFTWARE. */ + #ifndef POOLSTL_HPP #define POOLSTL_HPP @@ -93,6 +94,767 @@ #include #include +#include +#include + + +#ifndef AL_TASK_THREAD_POOL_HPP +#define AL_TASK_THREAD_POOL_HPP + +// Version macros. 
+#define TASK_THREAD_POOL_VERSION_MAJOR 1 +#define TASK_THREAD_POOL_VERSION_MINOR 0 +#define TASK_THREAD_POOL_VERSION_PATCH 10 + +#include +#include +#include +#include +#include +#include +#include + +// MSVC does not correctly set the __cplusplus macro by default, so we must read it from _MSVC_LANG +// See https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/ +#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) +#define TTP_CXX17 1 +#else +#define TTP_CXX17 0 +#endif + +#if TTP_CXX17 +#define TTP_NODISCARD [[nodiscard]] +#else +#define TTP_NODISCARD +#endif + +namespace task_thread_pool { + +#if !TTP_CXX17 + /** + * A reimplementation of std::decay_t, which is only available since C++14. + */ + template + using decay_t = typename std::decay::type; +#endif + + /** + * A fast and lightweight thread pool that uses C++11 threads. + */ + class task_thread_pool { + public: + /** + * Create a task_thread_pool and start worker threads. + * + * @param num_threads Number of worker threads. If 0 then number of threads is equal to the + * number of physical cores on the machine, as given by std::thread::hardware_concurrency(). + */ + explicit task_thread_pool(unsigned int num_threads = 0) { + if (num_threads < 1) { + num_threads = std::thread::hardware_concurrency(); + if (num_threads < 1) { num_threads = 1; } + } + start_threads(num_threads); + } + + /** + * Finish all tasks left in the queue then shut down worker threads. + * If the pool is currently paused then it is resumed. + */ + ~task_thread_pool() { + unpause(); + wait_for_queued_tasks(); + stop_all_threads(); + } + + /** + * Drop all tasks that have been submitted but not yet started by a worker. + * + * Tasks already in progress continue executing. + */ + void clear_task_queue() { + const std::lock_guard tasks_lock(task_mutex); + tasks = {}; + } + + /** + * Get number of enqueued tasks. + * + * @return Number of tasks that have been enqueued but not yet started. 
+ */ + TTP_NODISCARD size_t get_num_queued_tasks() const { + const std::lock_guard tasks_lock(task_mutex); + return tasks.size(); + } + + /** + * Get number of in-progress tasks. + * + * @return Approximate number of tasks currently being processed by worker threads. + */ + TTP_NODISCARD size_t get_num_running_tasks() const { + const std::lock_guard tasks_lock(task_mutex); + return num_inflight_tasks; + } + + /** + * Get total number of tasks in the pool. + * + * @return Approximate number of tasks both enqueued and running. + */ + TTP_NODISCARD size_t get_num_tasks() const { + const std::lock_guard tasks_lock(task_mutex); + return tasks.size() + num_inflight_tasks; + } + + /** + * Get number of worker threads. + * + * @return Number of worker threads. + */ + TTP_NODISCARD unsigned int get_num_threads() const { + const std::lock_guard threads_lock(thread_mutex); + return static_cast(threads.size()); + } + + /** + * Set number of worker threads. Will start or stop worker threads as necessary. + * + * @param num_threads Number of worker threads. If 0 then number of threads is equal to the + * number of physical cores on the machine, as given by std::thread::hardware_concurrency(). + * @return Previous number of worker threads. + */ + unsigned int set_num_threads(unsigned int num_threads) { + const std::lock_guard threads_lock(thread_mutex); + unsigned int previous_num_threads = get_num_threads(); + + if (num_threads < 1) { + num_threads = std::thread::hardware_concurrency(); + if (num_threads < 1) { num_threads = 1; } + } + + if (previous_num_threads <= num_threads) { + // expanding the thread pool + start_threads(num_threads - previous_num_threads); + } else { + // contracting the thread pool + stop_all_threads(); + { + const std::lock_guard tasks_lock(task_mutex); + pool_running = true; + } + start_threads(num_threads); + } + + return previous_num_threads; + } + + /** + * Stop executing queued tasks. Use `unpause()` to resume. 
Note: Destroying the pool will implicitly unpause. + * + * Any in-progress tasks continue executing. + */ + void pause() { + const std::lock_guard tasks_lock(task_mutex); + pool_paused = true; + } + + /** + * Resume executing queued tasks. + */ + void unpause() { + const std::lock_guard tasks_lock(task_mutex); + pool_paused = false; + task_cv.notify_all(); + } + + /** + * Check whether the pool is paused. + * + * @return true if pause() has been called without an intervening unpause(). + */ + TTP_NODISCARD bool is_paused() const { + const std::lock_guard tasks_lock(task_mutex); + return pool_paused; + } + + /** + * Submit a Callable for the pool to execute and return a std::future. + * + * @param func The Callable to execute. Can be a function, a lambda, std::packaged_task, std::function, etc. + * @param args Arguments for func. Optional. + * @return std::future that can be used to get func's return value or thrown exception. + */ + template , std::decay_t...> +#else + typename R = typename std::result_of(decay_t...)>::type +#endif + > + TTP_NODISCARD std::future submit(F&& func, A&&... args) { +#if defined(_MSC_VER) + // MSVC's packaged_task is not movable even though it should be. + // Discussion about this bug and its future fix: + // https://developercommunity.visualstudio.com/t/unable-to-move-stdpackaged-task-into-any-stl-conta/108672 + std::shared_ptr> ptask = + std::make_shared>(std::bind(std::forward(func), std::forward(args)...)); + submit_detach([ptask] { (*ptask)(); }); + return ptask->get_future(); +#else + std::packaged_task task(std::bind(std::forward(func), std::forward(args)...)); + auto ret = task.get_future(); + submit_detach(std::move(task)); + return ret; +#endif + } + + /** + * Submit a zero-argument Callable for the pool to execute. + * + * @param func The Callable to execute. Can be a function, a lambda, std::packaged_task, std::function, etc. 
+ */ + template + void submit_detach(F&& func) { + const std::lock_guard tasks_lock(task_mutex); + tasks.emplace(std::forward(func)); + task_cv.notify_one(); + } + + /** + * Submit a Callable with arguments for the pool to execute. + * + * @param func The Callable to execute. Can be a function, a lambda, std::packaged_task, std::function, etc. + */ + template + void submit_detach(F&& func, A&&... args) { + const std::lock_guard tasks_lock(task_mutex); + tasks.emplace(std::bind(std::forward(func), std::forward(args)...)); + task_cv.notify_one(); + } + + /** + * Block until the task queue is empty. Some tasks may be in-progress when this method returns. + */ + void wait_for_queued_tasks() { + std::unique_lock tasks_lock(task_mutex); + notify_task_finish = true; + task_finished_cv.wait(tasks_lock, [&] { return tasks.empty(); }); + notify_task_finish = false; + } + + /** + * Block until all tasks have finished. + */ + void wait_for_tasks() { + std::unique_lock tasks_lock(task_mutex); + notify_task_finish = true; + task_finished_cv.wait(tasks_lock, [&] { return tasks.empty() && num_inflight_tasks == 0; }); + notify_task_finish = false; + } + + protected: + + /** + * Main function for worker threads. + */ + void worker_main() { + bool finished_task = false; + + while (true) { + std::unique_lock tasks_lock(task_mutex); + + if (finished_task) { + --num_inflight_tasks; + if (notify_task_finish) { + task_finished_cv.notify_all(); + } + } + + task_cv.wait(tasks_lock, [&]() { return !pool_running || (!pool_paused && !tasks.empty()); }); + + if (!pool_running) { + break; + } + + // Must mean that (!pool_paused && !tasks.empty()) is true + + std::packaged_task task{std::move(tasks.front())}; + tasks.pop(); + ++num_inflight_tasks; + tasks_lock.unlock(); + + try { + task(); + } catch (...) { + // std::packaged_task::operator() may throw in some error conditions, such as if the task + // had already been run. Nothing that the pool can do anything about. 
+ } + + finished_task = true; + } + } + + /** + * Start worker threads. + * + * @param num_threads How many threads to start. + */ + void start_threads(const unsigned int num_threads) { + const std::lock_guard threads_lock(thread_mutex); + + for (unsigned int i = 0; i < num_threads; ++i) { + threads.emplace_back(&task_thread_pool::worker_main, this); + } + } + + /** + * Stop, join, and destroy all worker threads. + */ + void stop_all_threads() { + const std::lock_guard threads_lock(thread_mutex); + + { + const std::lock_guard tasks_lock(task_mutex); + pool_running = false; + task_cv.notify_all(); + } + + for (auto& thread : threads) { + if (thread.joinable()) { + thread.join(); + } + } + threads.clear(); + } + + /** + * The worker threads. + * + * Access protected by thread_mutex + */ + std::vector threads; + + /** + * A mutex for methods that start/stop threads. + */ + mutable std::recursive_mutex thread_mutex; + + /** + * The task queue. + * + * Access protected by task_mutex. + */ + std::queue> tasks = {}; + + /** + * A mutex for all variables related to tasks. + */ + mutable std::mutex task_mutex; + + /** + * Used to notify changes to the task queue, such as a new task added, pause/unpause, etc. + */ + std::condition_variable task_cv; + + /** + * Used to notify of finished tasks. + */ + std::condition_variable task_finished_cv; + + /** + * A signal for worker threads that the pool is either running or shutting down. + * + * Access protected by task_mutex. + */ + bool pool_running = true; + + /** + * A signal for worker threads to not pull new tasks from the queue. + * + * Access protected by task_mutex. + */ + bool pool_paused = false; + + /** + * A signal for worker threads that they should notify task_finished_cv when they finish a task. + * + * Access protected by task_mutex. + */ + bool notify_task_finish = false; + + /** + * A counter of the number of tasks in-progress by worker threads. 
+ * Incremented when a task is popped off the task queue and decremented when that task is complete. + * + * Access protected by task_mutex. + */ + int num_inflight_tasks = 0; + }; +} + +// clean up +#undef TTP_NODISCARD +#undef TTP_CXX17 + +#endif + +#ifndef POOLSTL_INTERNAL_UTILS_HPP +#define POOLSTL_INTERNAL_UTILS_HPP + +// Version macros. +#define POOLSTL_VERSION_MAJOR 0 +#define POOLSTL_VERSION_MINOR 3 +#define POOLSTL_VERSION_PATCH 3 + +#include +#include + +#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) +#define POOLSTL_HAVE_CXX17 1 +#define POOLSTL_NO_DISCARD [[nodiscard]] +#else +#define POOLSTL_HAVE_CXX17 0 +#define POOLSTL_NO_DISCARD +#endif + +#if POOLSTL_HAVE_CXX17 && (!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 9) +#define POOLSTL_HAVE_CXX17_LIB 1 +#else +#define POOLSTL_HAVE_CXX17_LIB 0 +#endif + +#if __cplusplus >= 201402L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L) +#define POOLSTL_HAVE_CXX14 1 +#else +#define POOLSTL_HAVE_CXX14 0 +#endif + +namespace poolstl { + namespace internal { + + inline constexpr std::size_t get_chunk_size(std::size_t num_steps, unsigned int num_threads) { + return (num_steps / num_threads) + ((num_steps % num_threads) > 0 ? 
1 : 0); + } + + template + constexpr typename std::iterator_traits::difference_type + get_chunk_size(Iterator first, Iterator last, unsigned int num_threads) { + using diff_t = typename std::iterator_traits::difference_type; + return static_cast(get_chunk_size((std::size_t)std::distance(first, last), num_threads)); + } + + template + constexpr typename std::iterator_traits::difference_type + get_iter_chunk_size(const Iterator& iter, const Iterator& last, + typename std::iterator_traits::difference_type chunk_size) { + return std::min(chunk_size, std::distance(iter, last)); + } + + template + Iterator advanced(Iterator iter, typename std::iterator_traits::difference_type offset) { + Iterator ret = iter; + std::advance(ret, offset); + return ret; + } + + /** + * An iterator wrapper that calls std::future<>::get(). + * @tparam Iterator + */ + template + class getting_iter : public Iterator { + public: + using value_type = decltype((*std::declval()).get()); + using difference_type = typename std::iterator_traits::difference_type; + using pointer = value_type*; + using reference = value_type&; + explicit getting_iter(Iterator iter) : iter(iter) {} + + getting_iter operator++() { ++iter; return *this; } + getting_iter operator++(int) { getting_iter ret(*this); ++iter; return ret; } + + value_type operator*() { return (*iter).get(); } + value_type operator[](difference_type offset) { return iter[offset].get(); } + + bool operator==(const getting_iter &other) const { return iter == other.iter; } + bool operator!=(const getting_iter &other) const { return iter != other.iter; } + + protected: + Iterator iter; + }; + + template + getting_iter get_wrap(Iterator iter) { + return getting_iter(iter); + } + + template + void get_futures(Container& futures) { + for (auto &future: futures) { + future.get(); + } + } + + /* + * Some methods are only available with C++17 and up. Reimplement on older standards. 
+ */ +#if POOLSTL_HAVE_CXX17_LIB + namespace cpp17 = std; +#else + namespace cpp17 { + + // std::reduce + + template + Tp reduce(InputIt first, InputIt last, Tp init, BinOp b) { + for (; first != last; ++first) + init = b(init, *first); + return init; + } + + template + typename std::iterator_traits::value_type reduce(InputIt first, InputIt last) { + return reduce(first, last, + typename std::iterator_traits::value_type{}, + std::plus::value_type>()); + } + + // std::transform + + template + OutputIt transform(InputIt first1, InputIt last1, OutputIt d_first, + UnaryOperation unary_op) { + while (first1 != last1) { + *d_first++ = unary_op(*first1++); + } + + return d_first; + } + + template + OutputIt transform(InputIt1 first1, InputIt1 last1, + InputIt2 first2, OutputIt d_first, + BinaryOperation binary_op) { + while (first1 != last1) { + *d_first++ = binary_op(*first1++, *first2++); + } + + return d_first; + } + } +#endif + } +} + +#endif + +namespace poolstl { + + namespace ttp = task_thread_pool; + + namespace execution { + namespace internal { + /** + * Holds the thread pool used by par. + */ + inline std::shared_ptr get_default_pool() { + static std::shared_ptr pool; + static std::once_flag flag; + std::call_once(flag, [&](){ pool = std::make_shared(); }); + return pool; + } + } + + /** + * Base class for all poolSTL policies. + */ + struct poolstl_policy { + }; + + /** + * A sequential policy that simply forwards to the non-policy overload. + */ + struct sequenced_policy : public poolstl_policy { + POOLSTL_NO_DISCARD ttp::task_thread_pool* pool() const { + // never called, but must exist for C++11 support + throw std::runtime_error("poolSTL: requested thread pool for seq policy."); + } + + POOLSTL_NO_DISCARD bool par_allowed() const { + return false; + } + }; + + /** + * A parallel policy that can use a user-specified thread pool or a default one. 
+ */ + struct parallel_policy : public poolstl_policy { + parallel_policy() = default; + explicit parallel_policy(ttp::task_thread_pool* on_pool, bool par_ok): on_pool(on_pool), par_ok(par_ok) {} + + parallel_policy on(ttp::task_thread_pool& pool) const { + return parallel_policy{&pool, par_ok}; + } + + parallel_policy par_if(bool call_par) const { + return parallel_policy{on_pool, call_par}; + } + + POOLSTL_NO_DISCARD ttp::task_thread_pool* pool() const { + if (on_pool) { + return on_pool; + } else { + return internal::get_default_pool().get(); + } + } + + POOLSTL_NO_DISCARD bool par_allowed() const { + return par_ok; + } + + protected: + ttp::task_thread_pool *on_pool = nullptr; + bool par_ok = true; + }; + + constexpr sequenced_policy seq{}; + constexpr parallel_policy par{}; + + /** + * EXPERIMENTAL: Subject to significant changes or removal. + * Use pure threads for each operation instead of a shared thread pool. + * + * Advantage: + * - Fewer symbols (no packaged_task with its operators, destructors, vtable, etc) means smaller binary + * which can mean a lot when there are many calls. + * - No thread pool to manage. + * + * Disadvantages: + * - Threads are started and joined for every operation, so it is harder to amortize that cost. + * - Barely any algorithms are supported. + */ + struct pure_threads_policy : public poolstl_policy { + explicit pure_threads_policy(unsigned int num_threads, bool par_ok): num_threads(num_threads), + par_ok(par_ok) {} + + POOLSTL_NO_DISCARD unsigned int get_num_threads() const { + if (num_threads == 0) { + return std::thread::hardware_concurrency(); + } + return num_threads; + } + + POOLSTL_NO_DISCARD bool par_allowed() const { + return par_ok; + } + + protected: + unsigned int num_threads = 1; + bool par_ok = true; + }; + + /** + * Choose parallel or sequential at runtime. + * + * @param call_par Whether to use a parallel policy. + * @return `par` if call_par is true, else a sequential policy (like `seq`). 
+ */ + inline parallel_policy par_if(bool call_par) { + return parallel_policy{nullptr, call_par}; + } + + /** + * Choose parallel or sequential at runtime, with pool selection. + * + * @param call_par Whether to use a parallel policy. + * @return `par.on(pool)` if call_par is true, else a sequential policy (like `seq`). + */ + inline parallel_policy par_if(bool call_par, ttp::task_thread_pool& pool) { + return parallel_policy{&pool, call_par}; + } + + /** + * EXPERIMENTAL: Subject to significant changes or removal. See `pure_threads_policy`. + * Choose parallel or sequential at runtime, with thread count selection. + * + * @param call_par Whether to use a parallel policy. + * @return `par.on(pool)` if call_par is true, else `seq`. + */ + inline pure_threads_policy par_if_threads(bool call_par, unsigned int num_threads) { + return pure_threads_policy{num_threads, call_par}; + } + } + + using execution::seq; + using execution::par; + using execution::par_if; + + namespace internal { + /** + * To enable/disable seq overload resolution + */ + template + using enable_if_seq = + typename std::enable_if< + std::is_same::type>::type>::value, + Tp>::type; + + /** + * To enable/disable par overload resolution + */ + template + using enable_if_par = + typename std::enable_if< + std::is_same::type>::type>::value, + Tp>::type; + + /** + * To enable/disable par overload resolution + */ + template + using enable_if_poolstl_policy = + typename std::enable_if< + std::is_base_of::type>::type>::value, + Tp>::type; + + template + bool is_seq(const ExecPolicy& policy) { + return !policy.par_allowed(); + } + + template + using is_pure_threads_policy = std::is_same::type>::type>; + } +} + +#endif + +#ifndef POOLSTL_ALGORITHM_HPP +#define POOLSTL_ALGORITHM_HPP + +#include + + +#ifndef POOLSTL_INTERNAL_TTP_IMPL_HPP +#define POOLSTL_INTERNAL_TTP_IMPL_HPP + +#include +#include +#include +#include + + +#ifndef POOLSTL_EXECUTION_HPP +#define POOLSTL_EXECUTION_HPP + +#include +#include 
+#include #include @@ -102,7 +864,7 @@ // Version macros. #define TASK_THREAD_POOL_VERSION_MAJOR 1 #define TASK_THREAD_POOL_VERSION_MINOR 0 -#define TASK_THREAD_POOL_VERSION_PATCH 9 +#define TASK_THREAD_POOL_VERSION_PATCH 10 #include #include @@ -289,12 +1051,22 @@ namespace task_thread_pool { #else typename R = typename std::result_of(decay_t...)>::type #endif - > + > TTP_NODISCARD std::future submit(F&& func, A&&... args) { +#if defined(_MSC_VER) + // MSVC's packaged_task is not movable even though it should be. + // Discussion about this bug and its future fix: + // https://developercommunity.visualstudio.com/t/unable-to-move-stdpackaged-task-into-any-stl-conta/108672 std::shared_ptr> ptask = std::make_shared>(std::bind(std::forward(func), std::forward(args)...)); submit_detach([ptask] { (*ptask)(); }); return ptask->get_future(); +#else + std::packaged_task task(std::bind(std::forward(func), std::forward(args)...)); + auto ret = task.get_future(); + submit_detach(std::move(task)); + return ret; +#endif } /** @@ -493,7 +1265,7 @@ namespace task_thread_pool { // Version macros. #define POOLSTL_VERSION_MAJOR 0 #define POOLSTL_VERSION_MINOR 3 -#define POOLSTL_VERSION_PATCH 1 +#define POOLSTL_VERSION_PATCH 3 #include #include @@ -637,10 +1409,6 @@ namespace poolstl { #endif -#if POOLSTL_HAVE_CXX17 -#include -#endif - namespace poolstl { namespace ttp = task_thread_pool; @@ -658,92 +1426,129 @@ namespace poolstl { } } + /** + * Base class for all poolSTL policies. + */ + struct poolstl_policy { + }; + /** * A sequential policy that simply forwards to the non-policy overload. 
*/ - struct sequenced_policy {}; + struct sequenced_policy : public poolstl_policy { + POOLSTL_NO_DISCARD ttp::task_thread_pool* pool() const { + // never called, but must exist for C++11 support + throw std::runtime_error("poolSTL: requested thread pool for seq policy."); + } + + POOLSTL_NO_DISCARD bool par_allowed() const { + return false; + } + }; /** * A parallel policy that can use a user-specified thread pool or a default one. */ - struct parallel_policy { + struct parallel_policy : public poolstl_policy { parallel_policy() = default; - explicit parallel_policy(ttp::task_thread_pool& on_pool): on_pool(&on_pool) {} + explicit parallel_policy(ttp::task_thread_pool* on_pool, bool par_ok): on_pool(on_pool), par_ok(par_ok) {} parallel_policy on(ttp::task_thread_pool& pool) const { - return parallel_policy{pool}; + return parallel_policy{&pool, par_ok}; + } + + parallel_policy par_if(bool call_par) const { + return parallel_policy{on_pool, call_par}; } - POOLSTL_NO_DISCARD ttp::task_thread_pool& pool() const { + POOLSTL_NO_DISCARD ttp::task_thread_pool* pool() const { if (on_pool) { - return *on_pool; + return on_pool; } else { - return *(internal::get_default_pool()); + return internal::get_default_pool().get(); } } + POOLSTL_NO_DISCARD bool par_allowed() const { + return par_ok; + } + protected: ttp::task_thread_pool *on_pool = nullptr; + bool par_ok = true; }; constexpr sequenced_policy seq{}; constexpr parallel_policy par{}; - -#if POOLSTL_HAVE_CXX17 /** - * A policy that allows selecting a policy at runtime. + * EXPERIMENTAL: Subject to significant changes or removal. + * Use pure threads for each operation instead of a shared thread pool. * - * @tparam Variant std::variant<> of policy options. + * Advantage: + * - Fewer symbols (no packaged_task with its operators, destructors, vtable, etc) means smaller binary + * which can mean a lot when there are many calls. + * - No thread pool to manage. 
+ * + * Disadvantages: + * - Threads are started and joined for every operation, so it is harder to amortize that cost. + * - Barely any algorithms are supported. */ - template - struct variant_policy { - explicit variant_policy(const Variant& policy): var(policy) {} - Variant var; - }; + struct pure_threads_policy : public poolstl_policy { + explicit pure_threads_policy(unsigned int num_threads, bool par_ok): num_threads(num_threads), + par_ok(par_ok) {} - namespace internal { - using poolstl_policy_variant = std::variant< - poolstl::execution::parallel_policy, - poolstl::execution::sequenced_policy>; - } + POOLSTL_NO_DISCARD unsigned int get_num_threads() const { + if (num_threads == 0) { + return std::thread::hardware_concurrency(); + } + return num_threads; + } + + POOLSTL_NO_DISCARD bool par_allowed() const { + return par_ok; + } + + protected: + unsigned int num_threads = 1; + bool par_ok = true; + }; /** * Choose parallel or sequential at runtime. * * @param call_par Whether to use a parallel policy. - * @return `par` if call_par is true, else `seq`. + * @return `par` if call_par is true, else a sequential policy (like `seq`). */ - inline variant_policy par_if(bool call_par) { - if (call_par) { - return variant_policy(internal::poolstl_policy_variant(par)); - } else { - return variant_policy(internal::poolstl_policy_variant(seq)); - } + inline parallel_policy par_if(bool call_par) { + return parallel_policy{nullptr, call_par}; } /** * Choose parallel or sequential at runtime, with pool selection. * * @param call_par Whether to use a parallel policy. + * @return `par.on(pool)` if call_par is true, else a sequential policy (like `seq`). + */ + inline parallel_policy par_if(bool call_par, ttp::task_thread_pool& pool) { + return parallel_policy{&pool, call_par}; + } + + /** + * EXPERIMENTAL: Subject to significant changes or removal. See `pure_threads_policy`. + * Choose parallel or sequential at runtime, with thread count selection. 
+ * + * @param call_par Whether to use a parallel policy. * @return `par.on(pool)` if call_par is true, else `seq`. */ - inline variant_policy par_if(bool call_par, ttp::task_thread_pool& pool) { - if (call_par) { - return variant_policy(internal::poolstl_policy_variant(par.on(pool))); - } else { - return variant_policy(internal::poolstl_policy_variant(seq)); - } + inline pure_threads_policy par_if_threads(bool call_par, unsigned int num_threads) { + return pure_threads_policy{num_threads, call_par}; } -#endif } using execution::seq; using execution::par; -#if POOLSTL_HAVE_CXX17 - using execution::variant_policy; using execution::par_if; -#endif namespace internal { /** @@ -766,44 +1571,29 @@ namespace poolstl { typename std::remove_cv::type>::type>::value, Tp>::type; -#if POOLSTL_HAVE_CXX17 - /** - * Helper for enable_if_poolstl_variant - */ - template struct is_poolstl_variant_policy : std::false_type {}; - template struct is_poolstl_variant_policy< - ::poolstl::execution::variant_policy> :std::true_type {}; - /** - * To enable/disable variant_policy (for par_if) overload resolution + * To enable/disable par overload resolution */ template - using enable_if_poolstl_variant = + using enable_if_poolstl_policy = typename std::enable_if< - is_poolstl_variant_policy< + std::is_base_of::type>::type>::value, Tp>::type; -#endif + + template + bool is_seq(const ExecPolicy& policy) { + return !policy.par_allowed(); + } + + template + using is_pure_threads_policy = std::is_same::type>::type>; } } #endif -#ifndef POOLSTL_ALGORITHM_HPP -#define POOLSTL_ALGORITHM_HPP - -#include - - -#ifndef POOLSTL_INTERNAL_TTP_IMPL_HPP -#define POOLSTL_INTERNAL_TTP_IMPL_HPP - -#include -#include -#include -#include - - namespace poolstl { namespace internal { @@ -815,33 +1605,61 @@ namespace poolstl { std::vector> parallel_apply(ExecPolicy &&policy, Op op, const ArgContainer& args_list) { std::vector> futures; - auto& task_pool = policy.pool(); + auto& task_pool = *policy.pool(); for (const 
auto& args : args_list) { - futures.emplace_back(task_pool.submit([op](const auto& args_fwd) { std::apply(op, args_fwd); }, args)); + futures.emplace_back(task_pool.submit([](Op op, const auto& args_fwd) { + std::apply(op, args_fwd); + }, op, args)); } return futures; } #endif + /** + * Chunk a single range, with autodetected return types. + */ + template ()(std::declval(), std::declval()))> + std::vector> + parallel_chunk_for_gen(ExecPolicy &&policy, RandIt first, RandIt last, Chunk chunk, + ChunkRet* = (decltype(std::declval()(std::declval(), + std::declval()))*)nullptr, + int extra_split_factor = 1) { + std::vector> futures; + auto& task_pool = *policy.pool(); + auto chunk_size = get_chunk_size(first, last, extra_split_factor * task_pool.get_num_threads()); + + while (first < last) { + auto iter_chunk_size = get_iter_chunk_size(first, last, chunk_size); + RandIt loop_end = advanced(first, iter_chunk_size); + + futures.emplace_back(task_pool.submit(std::forward(chunk), first, loop_end)); + + first = loop_end; + } + + return futures; + } + /** * Chunk a single range. */ - template - std::vector()(std::declval(), std::declval()))>> - parallel_chunk_for(ExecPolicy &&policy, RandIt first, RandIt last, Chunk chunk, int extra_split_factor = 1) { - std::vector()(std::declval(), std::declval())) - >> futures; - auto& task_pool = policy.pool(); + template + std::vector> + parallel_chunk_for_1(ExecPolicy &&policy, RandIt first, RandIt last, + Chunk chunk, ChunkRet*, int extra_split_factor, A&&... 
chunk_args) { + std::vector> futures; + auto& task_pool = *policy.pool(); auto chunk_size = get_chunk_size(first, last, extra_split_factor * task_pool.get_num_threads()); while (first < last) { auto iter_chunk_size = get_iter_chunk_size(first, last, chunk_size); RandIt loop_end = advanced(first, iter_chunk_size); - futures.emplace_back(task_pool.submit(chunk, first, loop_end)); + futures.emplace_back(task_pool.submit(std::forward(chunk), first, loop_end, + std::forward(chunk_args)...)); first = loop_end; } @@ -849,28 +1667,36 @@ namespace poolstl { return futures; } + /** + * Chunk a single range. + */ + template + typename std::enable_if::value, void>::type + parallel_chunk_for_1_wait(ExecPolicy &&policy, RandIt first, RandIt last, + Chunk chunk, ChunkRet* rettype, int extra_split_factor, A&&... chunk_args) { + auto futures = parallel_chunk_for_1(std::forward(policy), first, last, + std::forward(chunk), rettype, extra_split_factor, + std::forward(chunk_args)...); + get_futures(futures); + } + /** * Element-wise chunk two ranges. */ - template - std::vector()( - std::declval(), - std::declval(), - std::declval()))>> - parallel_chunk_for(ExecPolicy &&policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, Chunk chunk) { - std::vector()( - std::declval(), - std::declval(), - std::declval())) - >> futures; - auto& task_pool = policy.pool(); + template + std::vector> + parallel_chunk_for_2(ExecPolicy &&policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, + Chunk chunk, ChunkRet*, A&&... 
chunk_args) { + std::vector> futures; + auto& task_pool = *policy.pool(); auto chunk_size = get_chunk_size(first1, last1, task_pool.get_num_threads()); while (first1 < last1) { auto iter_chunk_size = get_iter_chunk_size(first1, last1, chunk_size); RandIt1 loop_end = advanced(first1, iter_chunk_size); - futures.emplace_back(task_pool.submit(chunk, first1, loop_end, first2)); + futures.emplace_back(task_pool.submit(std::forward(chunk), first1, loop_end, first2, + std::forward(chunk_args)...)); first1 = loop_end; std::advance(first2, iter_chunk_size); @@ -882,28 +1708,21 @@ namespace poolstl { /** * Element-wise chunk three ranges. */ - template - std::vector()( - std::declval(), - std::declval(), - std::declval(), - std::declval()))>> - parallel_chunk_for(ExecPolicy &&policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, RandIt3 first3, - Chunk chunk) { - std::vector()( - std::declval(), - std::declval(), - std::declval(), - std::declval())) - >> futures; - auto& task_pool = policy.pool(); + template + std::vector> + parallel_chunk_for_3(ExecPolicy &&policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, RandIt3 first3, + Chunk chunk, ChunkRet*, A&&... chunk_args) { + std::vector> futures; + auto& task_pool = *policy.pool(); auto chunk_size = get_chunk_size(first1, last1, task_pool.get_num_threads()); while (first1 < last1) { auto iter_chunk_size = get_iter_chunk_size(first1, last1, chunk_size); RandIt1 loop_end = advanced(first1, iter_chunk_size); - futures.emplace_back(task_pool.submit(chunk, first1, loop_end, first2, first3)); + futures.emplace_back(task_pool.submit(std::forward(chunk), first1, loop_end, first2, first3, + std::forward(chunk_args)...)); first1 = loop_end; std::advance(first2, iter_chunk_size); @@ -916,28 +1735,26 @@ namespace poolstl { /** * Sort a range in parallel. 
* - * @param stable Whether to use std::stable_sort or std::sort + * @param sort_func Sequential sort method, like std::sort or std::stable_sort + * @param merge_func Sequential merge method, like std::inplace_merge */ - template - void parallel_sort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp, bool stable) { + template + void parallel_sort(ExecPolicy &&policy, RandIt first, RandIt last, + Compare comp, SortFunc sort_func, MergeFunc merge_func) { if (first == last) { return; } // Sort chunks in parallel - auto futures = parallel_chunk_for(std::forward(policy), first, last, - [&comp, stable] (RandIt chunk_first, RandIt chunk_last) { - if (stable) { - std::stable_sort(chunk_first, chunk_last, comp); - } else { - std::sort(chunk_first, chunk_last, comp); - } + auto futures = parallel_chunk_for_gen(std::forward(policy), first, last, + [&comp, sort_func] (RandIt chunk_first, RandIt chunk_last) { + sort_func(chunk_first, chunk_last, comp); return std::make_pair(chunk_first, chunk_last); }); // Merge the sorted ranges using SortedRange = std::pair; - auto& task_pool = policy.pool(); + auto& task_pool = *policy.pool(); std::vector subranges; do { for (auto& future : futures) { @@ -950,9 +1767,10 @@ namespace poolstl { // pair up and merge auto& lhs = subranges[i]; auto& rhs = subranges[i + 1]; - futures.emplace_back(task_pool.submit([&comp] (RandIt chunk_first, RandIt chunk_middle, - RandIt chunk_last) { - std::inplace_merge(chunk_first, chunk_middle, chunk_last, comp); + futures.emplace_back(task_pool.submit([&comp, merge_func] (RandIt chunk_first, + RandIt chunk_middle, + RandIt chunk_last) { + merge_func(chunk_first, chunk_middle, chunk_last, comp); return std::make_pair(chunk_first, chunk_last); }, lhs.first, lhs.second, rhs.second)); ++i; @@ -973,6 +1791,56 @@ namespace poolstl { #endif +#ifndef POOLSTL_INTERNAL_THREAD_IMPL_HPP +#define POOLSTL_INTERNAL_THREAD_IMPL_HPP + +/** + * EXPERIMENTAL: Subject to significant changes or removal. 
+ * An implementation using only std::thread and no thread pool at all. + * + * Advantage: + * - Fewer symbols (no packaged_task with its operators, destructors, vtable, etc) means smaller binary + * which can mean a lot when there are many calls like with many templates. + * - No thread pool to manage. + * + * Disadvantages: + * - Threads are started and joined for every operation, so it is harder to amortize that cost. + * - Barely any algorithms are supported. + */ + + + +namespace poolstl { + namespace internal { + + template + typename std::enable_if::value, void>::type + parallel_chunk_for_1_wait(ExecPolicy &&policy, RandIt first, RandIt last, + Chunk chunk, ChunkRet*, int extra_split_factor, A&&... chunk_args) { + std::vector threads; + auto chunk_size = get_chunk_size(first, last, extra_split_factor * policy.get_num_threads()); + + while (first < last) { + auto iter_chunk_size = get_iter_chunk_size(first, last, chunk_size); + RandIt loop_end = advanced(first, iter_chunk_size); + + threads.emplace_back(std::thread(std::forward(chunk), first, loop_end, + std::forward(chunk_args)...)); + + first = loop_end; + } + + for (auto& thread : threads) { + if (thread.joinable()) { + thread.join(); + } + } + } + } +} + +#endif + namespace std { /** @@ -980,12 +1848,14 @@ namespace std { * See std::copy https://en.cppreference.com/w/cpp/algorithm/copy */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy copy(ExecPolicy &&policy, RandIt1 first, RandIt1 last, RandIt2 dest) { - auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first, last, dest, - [](RandIt1 chunk_first, RandIt1 chunk_last, RandIt2 chunk_dest) { - std::copy(chunk_first, chunk_last, chunk_dest); - }); + if (poolstl::internal::is_seq(policy)) { + return std::copy(first, last, dest); + } + + auto futures = poolstl::internal::parallel_chunk_for_2(std::forward(policy), first, last, dest, + std::copy, (RandIt2*)nullptr); 
poolstl::internal::get_futures(futures); return poolstl::internal::advanced(dest, std::distance(first, last)); } @@ -995,7 +1865,7 @@ namespace std { * See std::copy_n https://en.cppreference.com/w/cpp/algorithm/copy_n */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy copy_n(ExecPolicy &&policy, RandIt1 first, Size n, RandIt2 dest) { if (n <= 0) { return dest; @@ -1010,14 +1880,17 @@ namespace std { * See std::count_if https://en.cppreference.com/w/cpp/algorithm/count_if */ template - poolstl::internal::enable_if_par::difference_type> + poolstl::internal::enable_if_poolstl_policy::difference_type> count_if(ExecPolicy&& policy, RandIt first, RandIt last, UnaryPredicate p) { + if (poolstl::internal::is_seq(policy)) { + return std::count_if(first, last, p); + } + using T = typename iterator_traits::difference_type; - auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first, last, - [&p](RandIt chunk_first, RandIt chunk_last) { - return std::count_if(chunk_first, chunk_last, p); - }); + auto futures = poolstl::internal::parallel_chunk_for_1(std::forward(policy), first, last, + std::count_if, + (T*)nullptr, 1, p); return poolstl::internal::cpp17::reduce( poolstl::internal::get_wrap(futures.begin()), @@ -1029,7 +1902,7 @@ namespace std { * See std::count https://en.cppreference.com/w/cpp/algorithm/count */ template - poolstl::internal::enable_if_par::difference_type> + poolstl::internal::enable_if_poolstl_policy::difference_type> count(ExecPolicy&& policy, RandIt first, RandIt last, const T& value) { return std::count_if(std::forward(policy), first, last, [&value](const T& test) { return test == value; }); @@ -1040,13 +1913,15 @@ namespace std { * See std::fill https://en.cppreference.com/w/cpp/algorithm/fill */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy fill(ExecPolicy &&policy, RandIt first, RandIt last, const Tp& value) { - auto futures = 
poolstl::internal::parallel_chunk_for(std::forward(policy), first, last, - [&value](RandIt chunk_first, RandIt chunk_last) { - std::fill(chunk_first, chunk_last, value); - }); - poolstl::internal::get_futures(futures); + if (poolstl::internal::is_seq(policy)) { + std::fill(first, last, value); + return; + } + + poolstl::internal::parallel_chunk_for_1_wait(std::forward(policy), first, last, + std::fill, (void*)nullptr, 1, value); } /** @@ -1054,7 +1929,7 @@ namespace std { * See std::fill_n https://en.cppreference.com/w/cpp/algorithm/fill_n */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy fill_n(ExecPolicy &&policy, RandIt first, Size n, const Tp& value) { if (n <= 0) { return first; @@ -1069,13 +1944,17 @@ namespace std { * See std::find_if https://en.cppreference.com/w/cpp/algorithm/find_if */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy find_if(ExecPolicy &&policy, RandIt first, RandIt last, UnaryPredicate p) { + if (poolstl::internal::is_seq(policy)) { + return std::find_if(first, last, p); + } + using diff_t = typename std::iterator_traits::difference_type; diff_t n = std::distance(first, last); std::atomic extremum(n); - auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first, last, + poolstl::internal::parallel_chunk_for_1_wait(std::forward(policy), first, last, [&first, &extremum, &p](RandIt chunk_first, RandIt chunk_last) { if (std::distance(first, chunk_first) > extremum) { // already found by another task @@ -1091,8 +1970,8 @@ namespace std { extremum.compare_exchange_weak(old, k); } } - }, 8); // use small tasks so later ones may exit early if item is already found - poolstl::internal::get_futures(futures); + }, (void*)nullptr, + 8); // use small tasks so later ones may exit early if item is already found return extremum == n ? 
last : first + extremum; } @@ -1101,10 +1980,15 @@ namespace std { * See std::find_if_not https://en.cppreference.com/w/cpp/algorithm/find_if_not */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy find_if_not(ExecPolicy &&policy, RandIt first, RandIt last, UnaryPredicate p) { return std::find_if(std::forward(policy), first, last, - [&p](const typename std::iterator_traits::value_type& test) { return !p(test); }); +#if POOLSTL_HAVE_CXX17_LIB + std::not_fn(p) +#else + [&p](const typename std::iterator_traits::value_type& test) { return !p(test); } +#endif + ); } /** @@ -1112,7 +1996,7 @@ namespace std { * See std::find https://en.cppreference.com/w/cpp/algorithm/find */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy find(ExecPolicy &&policy, RandIt first, RandIt last, const T& value) { return std::find_if(std::forward(policy), first, last, [&value](const T& test) { return value == test; }); @@ -1123,16 +2007,23 @@ namespace std { * See std::for_each https://en.cppreference.com/w/cpp/algorithm/for_each */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy for_each(ExecPolicy &&policy, RandIt first, RandIt last, UnaryFunction f) { - auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first, last, - [&f](RandIt chunk_first, RandIt chunk_last) { - // std::for_each(chunk_first, chunk_last, f); - for (; chunk_first != chunk_last; ++chunk_first) { - f(*chunk_first); - } - }); - poolstl::internal::get_futures(futures); + // Using a lambda instead of just calling the non-policy std::for_each because it appears to + // result in a smaller binary. 
+ auto chunk_func = [&f](RandIt chunk_first, RandIt chunk_last) { + for (; chunk_first != chunk_last; ++chunk_first) { + f(*chunk_first); + } + }; + + if (poolstl::internal::is_seq(policy)) { + chunk_func(first, last); + return; + } + + poolstl::internal::parallel_chunk_for_1_wait(std::forward(policy), first, last, + chunk_func, (void*)nullptr, 1); } /** @@ -1140,7 +2031,7 @@ namespace std { * See std::for_each_n https://en.cppreference.com/w/cpp/algorithm/for_each_n */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy for_each_n(ExecPolicy &&policy, RandIt first, Size n, UnaryFunction f) { RandIt last = poolstl::internal::advanced(first, n); std::for_each(std::forward(policy), first, last, f); @@ -1152,9 +2043,15 @@ namespace std { * See std::sort https://en.cppreference.com/w/cpp/algorithm/sort */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy sort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp) { - poolstl::internal::parallel_sort(std::forward(policy), first, last, comp, false); + if (poolstl::internal::is_seq(policy)) { + std::sort(first, last, comp); + return; + } + + poolstl::internal::parallel_sort(std::forward(policy), first, last, comp, + std::sort, std::inplace_merge); } /** @@ -1162,10 +2059,10 @@ namespace std { * See std::sort https://en.cppreference.com/w/cpp/algorithm/sort */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy sort(ExecPolicy &&policy, RandIt first, RandIt last) { using T = typename std::iterator_traits::value_type; - poolstl::internal::parallel_sort(std::forward(policy), first, last, std::less(), false); + std::sort(std::forward(policy), first, last, std::less()); } /** @@ -1173,9 +2070,15 @@ namespace std { * See std::stable_sort https://en.cppreference.com/w/cpp/algorithm/stable_sort */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy stable_sort(ExecPolicy 
&&policy, RandIt first, RandIt last, Compare comp) { - poolstl::internal::parallel_sort(std::forward(policy), first, last, comp, true); + if (poolstl::internal::is_seq(policy)) { + std::stable_sort(first, last, comp); + return; + } + + poolstl::internal::parallel_sort(std::forward(policy), first, last, comp, + std::stable_sort, std::inplace_merge); } /** @@ -1183,10 +2086,10 @@ namespace std { * See std::stable_sort https://en.cppreference.com/w/cpp/algorithm/stable_sort */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy stable_sort(ExecPolicy &&policy, RandIt first, RandIt last) { using T = typename std::iterator_traits::value_type; - poolstl::internal::parallel_sort(std::forward(policy), first, last, std::less(), true); + std::stable_sort(std::forward(policy), first, last, std::less()); } /** @@ -1194,14 +2097,17 @@ namespace std { * See std::transform https://en.cppreference.com/w/cpp/algorithm/transform */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy transform(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1, RandIt2 dest, UnaryOperation unary_op) { + if (poolstl::internal::is_seq(policy)) { + return poolstl::internal::cpp17::transform(first1, last1, dest, unary_op); + } - auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first1, last1, dest, - [&unary_op](RandIt1 chunk_first1, RandIt1 chunk_last1, RandIt2 dest_first) { - return poolstl::internal::cpp17::transform(chunk_first1, chunk_last1, dest_first, unary_op); - }); + auto futures = poolstl::internal::parallel_chunk_for_2(std::forward(policy), first1, last1, dest, + poolstl::internal::cpp17::transform, + (RandIt2*)nullptr, unary_op); poolstl::internal::get_futures(futures); return dest + std::distance(first1, last1); } @@ -1211,16 +2117,18 @@ namespace std { * See std::transform https://en.cppreference.com/w/cpp/algorithm/transform */ template - poolstl::internal::enable_if_par + 
poolstl::internal::enable_if_poolstl_policy transform(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, RandIt3 dest, BinaryOperation binary_op) { + if (poolstl::internal::is_seq(policy)) { + return poolstl::internal::cpp17::transform(first1, last1, first2, dest, binary_op); + } - auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first1, last1, - first2, dest, - [&binary_op](RandIt1 chunk_first1, RandIt1 chunk_last1, RandIt1 chunk_first2, RandIt3 dest_first) { - return poolstl::internal::cpp17::transform(chunk_first1, chunk_last1, - chunk_first2, dest_first, binary_op); - }); + auto futures = poolstl::internal::parallel_chunk_for_3(std::forward(policy), first1, last1, + first2, dest, + poolstl::internal::cpp17::transform, + (RandIt3*)nullptr, binary_op); poolstl::internal::get_futures(futures); return dest + std::distance(first1, last1); } @@ -1230,7 +2138,7 @@ namespace std { * See std::all_of https://en.cppreference.com/w/cpp/algorithm/all_of */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy all_of(ExecPolicy&& policy, RandIt first, RandIt last, Predicate pred) { return last == std::find_if_not(std::forward(policy), first, last, pred); } @@ -1240,7 +2148,7 @@ namespace std { * See std::none_of https://en.cppreference.com/w/cpp/algorithm/none_of */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy none_of(ExecPolicy&& policy, RandIt first, RandIt last, Predicate pred) { return last == std::find_if(std::forward(policy), first, last, pred); } @@ -1250,7 +2158,7 @@ namespace std { * See std::any_of https://en.cppreference.com/w/cpp/algorithm/any_of */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy any_of(ExecPolicy&& policy, RandIt first, RandIt last, Predicate pred) { return !std::none_of(std::forward(policy), first, last, pred); } @@ -1280,13 +2188,52 @@ namespace poolstl { * but cannot be shared by 
all parallel iterations. */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy for_each_chunk(ExecPolicy&& policy, RandIt first, RandIt last, ChunkConstructor construct, UnaryFunction f) { - auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first, last, - [&construct, &f](RandIt chunk_first, RandIt chunk_last) { - for_each_chunk(chunk_first, chunk_last, construct, f); - }); - poolstl::internal::get_futures(futures); + if (poolstl::internal::is_seq(policy)) { + for_each_chunk(first, last, construct, f); + return; + } + + poolstl::internal::parallel_chunk_for_1_wait(std::forward(policy), first, last, + for_each_chunk , + (void*)nullptr, 1, construct, f); + } + + /** + * NOTE: Iterators are expected to be random access. + * + * Like `std::sort`, but allows specifying the sequential sort and merge methods. These methods must have the + * same signature as the comparator versions of `std::sort` and `std::inplace_merge`, respectively. + */ + template + poolstl::internal::enable_if_poolstl_policy + pluggable_sort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp, + void (sort_func)(RandIt, RandIt, Compare) = std::sort, + void (merge_func)(RandIt, RandIt, RandIt, Compare) = std::inplace_merge) { + if (poolstl::internal::is_seq(policy)) { + sort_func(first, last, comp); + return; + } + + poolstl::internal::parallel_sort(std::forward(policy), first, last, comp, sort_func, merge_func); + } + + /** + * NOTE: Iterators are expected to be random access. + * + * Like `std::sort`, but allows specifying the sequential sort and merge methods. These methods must have the + * same signature as the comparator versions of `std::sort` and `std::inplace_merge`, respectively. 
+ */ + template + poolstl::internal::enable_if_poolstl_policy + pluggable_sort(ExecPolicy &&policy, RandIt first, RandIt last, + void (sort_func)(RandIt, RandIt, + std::less::value_type>) = std::sort, + void (merge_func)(RandIt, RandIt, RandIt, + std::less::value_type>) = std::inplace_merge){ + using T = typename std::iterator_traits::value_type; + pluggable_sort(std::forward(policy), first, last, std::less(), sort_func, merge_func); } } @@ -1306,14 +2253,18 @@ namespace std { * See std::exclusive_scan https://en.cppreference.com/w/cpp/algorithm/exclusive_scan */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy exclusive_scan(ExecPolicy &&policy, RandIt1 first, RandIt1 last, RandIt2 dest, T init, BinaryOp binop) { if (first == last) { return dest; } + if (poolstl::internal::is_seq(policy)) { + return std::exclusive_scan(first, last, dest, init, binop); + } + // Pass 1: Chunk the input and find the sum of each chunk - auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first, last, + auto futures = poolstl::internal::parallel_chunk_for_gen(std::forward(policy), first, last, [binop](RandIt1 chunk_first, RandIt1 chunk_last) { auto sum = std::accumulate(chunk_first, chunk_last, T{}, binop); return std::make_tuple(std::make_pair(chunk_first, chunk_last), sum); @@ -1355,7 +2306,7 @@ namespace std { * See std::exclusive_scan https://en.cppreference.com/w/cpp/algorithm/exclusive_scan */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy exclusive_scan(ExecPolicy &&policy, RandIt1 first, RandIt1 last, RandIt2 dest, T init) { return std::exclusive_scan(std::forward(policy), first, last, dest, init, std::plus()); } @@ -1366,12 +2317,15 @@ namespace std { * See std::reduce https://en.cppreference.com/w/cpp/algorithm/reduce */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy reduce(ExecPolicy &&policy, RandIt first, RandIt last, T init, 
BinaryOp binop) { - auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first, last, - [init, binop](RandIt chunk_first, RandIt chunk_last) { - return poolstl::internal::cpp17::reduce(chunk_first, chunk_last, init, binop); - }); + if (poolstl::internal::is_seq(policy)) { + return poolstl::internal::cpp17::reduce(first, last, init, binop); + } + + auto futures = poolstl::internal::parallel_chunk_for_1(std::forward(policy), first, last, + poolstl::internal::cpp17::reduce, + (T*)nullptr, 1, init, binop); return poolstl::internal::cpp17::reduce( poolstl::internal::get_wrap(futures.begin()), @@ -1383,7 +2337,7 @@ namespace std { * See std::reduce https://en.cppreference.com/w/cpp/algorithm/reduce */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy reduce(ExecPolicy &&policy, RandIt first, RandIt last, T init) { return std::reduce(std::forward(policy), first, last, init, std::plus()); } @@ -1393,7 +2347,7 @@ namespace std { * See std::reduce https://en.cppreference.com/w/cpp/algorithm/reduce */ template - poolstl::internal::enable_if_par< + poolstl::internal::enable_if_poolstl_policy< ExecPolicy, typename std::iterator_traits::value_type> reduce(ExecPolicy &&policy, RandIt first, RandIt last) { return std::reduce(std::forward(policy), first, last, @@ -1406,14 +2360,17 @@ namespace std { * See std::transform_reduce https://en.cppreference.com/w/cpp/algorithm/transform_reduce */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy transform_reduce(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1, T init, BinaryReductionOp reduce_op, UnaryTransformOp transform_op) { + if (poolstl::internal::is_seq(policy)) { + return std::transform_reduce(first1, last1, init, reduce_op, transform_op); + } - auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first1, last1, - [&init, &reduce_op, &transform_op](RandIt1 chunk_first1, RandIt1 chunk_last1) { - return 
std::transform_reduce(chunk_first1, chunk_last1, init, reduce_op, transform_op); - }); + auto futures = poolstl::internal::parallel_chunk_for_1(std::forward(policy), first1, last1, + std::transform_reduce, + (T*)nullptr, 1, init, reduce_op, transform_op); return poolstl::internal::cpp17::reduce( poolstl::internal::get_wrap(futures.begin()), @@ -1425,14 +2382,17 @@ namespace std { * See std::transform_reduce https://en.cppreference.com/w/cpp/algorithm/transform_reduce */ template - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy transform_reduce(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, T init, BinaryReductionOp reduce_op, BinaryTransformOp transform_op) { + if (poolstl::internal::is_seq(policy)) { + return std::transform_reduce(first1, last1, first2, init, reduce_op, transform_op); + } - auto futures = poolstl::internal::parallel_chunk_for(std::forward(policy), first1, last1, first2, - [&init, &reduce_op, &transform_op](RandIt1 chunk_first1, RandIt1 chunk_last1, RandIt2 chunk_first2) { - return std::transform_reduce(chunk_first1, chunk_last1, chunk_first2, init, reduce_op, transform_op); - }); + auto futures = poolstl::internal::parallel_chunk_for_2(std::forward(policy), first1, last1, first2, + std::transform_reduce, + (T*)nullptr, init, reduce_op, transform_op); return poolstl::internal::cpp17::reduce( poolstl::internal::get_wrap(futures.begin()), @@ -1444,7 +2404,7 @@ namespace std { * See std::transform_reduce https://en.cppreference.com/w/cpp/algorithm/transform_reduce */ template< class ExecPolicy, class RandIt1, class RandIt2, class T > - poolstl::internal::enable_if_par + poolstl::internal::enable_if_poolstl_policy transform_reduce(ExecPolicy&& policy, RandIt1 first1, RandIt1 last1, RandIt2 first2, T init ) { return transform_reduce(std::forward(policy), first1, last1, first2, init, std::plus<>(), std::multiplies<>()); @@ -1455,100 +2415,6 @@ namespace std { #endif -#ifndef POOLSTL_SEQ_FWD_HPP 
-#define POOLSTL_SEQ_FWD_HPP - - -/* - * Forward poolstl::seq to the native sequential (no policy) method. - */ - -#define POOLSTL_DEFINE_SEQ_FWD(NS, FNAME) \ - template \ - auto FNAME(EP&&, ARGS&&...args) -> \ - poolstl::internal::enable_if_seq(args)...))> { \ - return NS::FNAME(std::forward(args)...); \ - } - -#define POOLSTL_DEFINE_SEQ_FWD_VOID(NS, FNAME) \ - template \ - poolstl::internal::enable_if_seq FNAME(EP&&, ARGS&&... args) { \ - NS::FNAME(std::forward(args)...); \ - } - -#if POOLSTL_HAVE_CXX17 - -/* - * Dynamically choose policy from a std::variant. - * Useful to choose between parallel and sequential policies at runtime via par_if. - */ - -#define POOLSTL_DEFINE_PAR_IF_FWD_VOID(NS, FNAME) \ - template \ - poolstl::internal::enable_if_poolstl_variant FNAME(EP&& policy, ARGS&&...args) { \ - std::visit([&](auto&& pol) { NS::FNAME(pol, std::forward(args)...); }, policy.var); \ - } - -#define POOLSTL_DEFINE_PAR_IF_FWD(NS, FNAME) \ - template \ - auto FNAME(EP&& policy, ARGS&&...args) -> \ - poolstl::internal::enable_if_poolstl_variant(args)...))> { \ - return std::visit([&](auto&& pol) { return NS::FNAME(pol, std::forward(args)...); }, policy.var); \ - } - -#else -#define POOLSTL_DEFINE_PAR_IF_FWD_VOID(NS, FNAME) -#define POOLSTL_DEFINE_PAR_IF_FWD(NS, FNAME) -#endif -/* - * Define both the sequential forward and dynamic chooser. 
- */ -#define POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(NS, FNAME) \ - POOLSTL_DEFINE_SEQ_FWD(NS, FNAME) \ - POOLSTL_DEFINE_PAR_IF_FWD(NS, FNAME) - -#define POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF_VOID(NS, FNAME) \ - POOLSTL_DEFINE_SEQ_FWD_VOID(NS, FNAME) \ - POOLSTL_DEFINE_PAR_IF_FWD_VOID(NS, FNAME) - -namespace std { - POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, all_of) - POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, any_of) - POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, none_of) - - POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, count) - POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, count_if) - - POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, copy) - POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, copy_n) - - POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF_VOID(std, fill) - POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, fill_n) - - POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, find) - POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, find_if) - POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, find_if_not) - - POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF_VOID(std, for_each) -#if POOLSTL_HAVE_CXX17_LIB - POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, for_each_n) -#endif - - POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, transform) - -#if POOLSTL_HAVE_CXX17_LIB - POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, exclusive_scan) - POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, reduce) - POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(std, transform_reduce) -#endif -} - -namespace poolstl { - POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF_VOID(poolstl, for_each_chunk) -} - -#endif - // Note that iota_iter.hpp is self-contained in its own right. 
#ifndef POOLSTL_IOTA_ITER_HPP diff --git a/src/index.cpp b/src/index.cpp index 9b907257..9b3fbfab 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -165,13 +165,8 @@ void StrobemerIndex::populate(float f, unsigned n_threads) { Timer sorting_timer; logger.debug() << " Sorting ...\n"; - if (true) { - task_thread_pool::task_thread_pool pool{n_threads}; - std::sort(poolstl::par.on(pool), randstrobes.begin(), randstrobes.end()); - } else { - // sort by hash values - pdqsort_branchless(randstrobes.begin(), randstrobes.end()); - } + task_thread_pool::task_thread_pool pool{n_threads}; + poolstl::pluggable_sort(poolstl::par.on(pool), randstrobes.begin(), randstrobes.end(), pdqsort_branchless); stats.elapsed_sorting_seeds = sorting_timer.duration(); Timer hash_index_timer; From a905f7bdd2dcc2e843b0cbac23b51912adadfe7a Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Mon, 22 Jan 2024 13:46:09 +0100 Subject: [PATCH 12/32] Bump to poolSTL 0.3.4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This version indeed gives us a very nice speed improvement without using extra memory: Sorting-only runtimes: * 1 thread: 32.8 s * 2 threads: 20.3 s * 4 threads: 15.7 s * 8 threads: 14.8 s Overall indexing runtimes (before/after): * 1 thread: 151 s → 153 s * 2 threads: 100 s → 88 s * 4 threads: 73 s → 57 s * 8 threads: 63 s → 47 s --- CHANGES.md | 4 + README.md | 2 +- ext/README.md | 4 +- ext/poolstl/poolstl.hpp | 284 +++++++++++++++++++++++++++++++++++++--- 4 files changed, 275 insertions(+), 19 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 266b5832..644e9644 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,10 @@ ## development version +* #386: Parallelize indexing even more by using @alugowski’s + [poolSTL](https://github.com/alugowski/) `pluggable_sort`. + Indexing a human reference (measured on CHM13) now takes only ~45 s on a + recent machine (using 8 threads). 
* #376: Improve accuracy for read length 50 by optimizing the default indexing parameters. Paired-end accuracy increases by 0.3 percentage points on average. Single-end accuracy increases by 1 percentage point. diff --git a/README.md b/README.md index f7ec6e05..1ef03a51 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Strobealign is a read mapper that is typically significantly faster than other r - Map single-end and paired-end reads - Multithreading support -- Fast indexing (1-2 minutes for a human-sized reference genome using four cores) +- Fast indexing (<1 minute for a human-sized reference genome using four cores) - On-the-fly indexing by default. Optionally create an on-disk index. - Output in standard SAM format or produce even faster results by writing PAF (without alignments) - Strobealign is most suited for read lengths between 100 and 500 bp diff --git a/ext/README.md b/ext/README.md index d80e5a2d..a3480467 100644 --- a/ext/README.md +++ b/ext/README.md @@ -30,8 +30,8 @@ License: See pdqsort/license.txt ## poolstl Homepage: https://github.com/alugowski/poolSTL/ -Downloaded file: https://github.com/alugowski/poolSTL/releases/download/v0.3.3/poolstl.hpp -Version: 0.3.3 +Downloaded file: https://github.com/alugowski/poolSTL/releases/download/v0.3.4/poolstl.hpp +Version: 0.3.4 License: See poolstl.hpp ## robin_hood diff --git a/ext/poolstl/poolstl.hpp b/ext/poolstl/poolstl.hpp index d1340a1e..8d569ecb 100644 --- a/ext/poolstl/poolstl.hpp +++ b/ext/poolstl/poolstl.hpp @@ -505,7 +505,7 @@ namespace task_thread_pool { // Version macros. #define POOLSTL_VERSION_MAJOR 0 #define POOLSTL_VERSION_MINOR 3 -#define POOLSTL_VERSION_PATCH 3 +#define POOLSTL_VERSION_PATCH 4 #include #include @@ -596,6 +596,28 @@ namespace poolstl { } } + /** + * Identify a pivot element for quicksort. Chooses the middle element of the range. 
+ */ + template + typename std::iterator_traits::value_type quicksort_pivot(Iterator first, Iterator last) { + return *(std::next(first, std::distance(first, last) / 2)); + } + + /** + * Predicate for std::partition (for quicksort) + */ + template + struct pivot_predicate { + pivot_predicate(Compare comp, const T& pivot) : comp(comp), pivot(pivot) {} + + bool operator()(const T& em) { + return comp(em, pivot); + } + Compare comp; + const T pivot; + }; + /* * Some methods are only available with C++17 and up. Reimplement on older standards. */ @@ -1265,7 +1287,7 @@ namespace task_thread_pool { // Version macros. #define POOLSTL_VERSION_MAJOR 0 #define POOLSTL_VERSION_MINOR 3 -#define POOLSTL_VERSION_PATCH 3 +#define POOLSTL_VERSION_PATCH 4 #include #include @@ -1356,6 +1378,28 @@ namespace poolstl { } } + /** + * Identify a pivot element for quicksort. Chooses the middle element of the range. + */ + template + typename std::iterator_traits::value_type quicksort_pivot(Iterator first, Iterator last) { + return *(std::next(first, std::distance(first, last) / 2)); + } + + /** + * Predicate for std::partition (for quicksort) + */ + template + struct pivot_predicate { + pivot_predicate(Compare comp, const T& pivot) : comp(comp), pivot(pivot) {} + + bool operator()(const T& em) { + return comp(em, pivot); + } + Compare comp; + const T pivot; + }; + /* * Some methods are only available with C++17 and up. Reimplement on older standards. 
*/ @@ -1739,8 +1783,8 @@ namespace poolstl { * @param merge_func Sequential merge method, like std::inplace_merge */ template - void parallel_sort(ExecPolicy &&policy, RandIt first, RandIt last, - Compare comp, SortFunc sort_func, MergeFunc merge_func) { + void parallel_mergesort(ExecPolicy &&policy, RandIt first, RandIt last, + Compare comp, SortFunc sort_func, MergeFunc merge_func) { if (first == last) { return; } @@ -1786,6 +1830,103 @@ namespace poolstl { } while (futures.size() > 1); futures.front().get(); } + + /** + * Quicksort worker function. + */ + template + void quicksort_impl(task_thread_pool::task_thread_pool* task_pool, const RandIt first, const RandIt last, + Compare comp, SortFunc sort_func, PartFunc part_func, PivotFunc pivot_func, + std::ptrdiff_t target_leaf_size, + std::vector>* futures, std::mutex* mutex, + std::condition_variable* cv, int* inflight_spawns) { + using T = typename std::iterator_traits::value_type; + + auto partition_size = std::distance(first, last); + + if (partition_size > target_leaf_size) { + // partition the range + auto mid = part_func(first, last, pivot_predicate(comp, pivot_func(first, last))); + + if (mid != first && mid != last) { + // was able to partition the range, so recurse + std::lock_guard guard(*mutex); + ++(*inflight_spawns); + + futures->emplace_back(task_pool->submit( + quicksort_impl, + task_pool, first, mid, comp, sort_func, part_func, pivot_func, target_leaf_size, + futures, mutex, cv, inflight_spawns)); + + futures->emplace_back(task_pool->submit( + quicksort_impl, + task_pool, mid, last, comp, sort_func, part_func, pivot_func, target_leaf_size, + futures, mutex, cv, inflight_spawns)); + return; + } + } + + // Range does not need to be subdivided (or was unable to subdivide). Run the sequential sort. 
+ { + // notify main thread that partitioning may be finished + std::lock_guard guard(*mutex); + --(*inflight_spawns); + } + cv->notify_one(); + + sort_func(first, last, comp); + } + + /** + * Sort a range in parallel using quicksort. + * + * @param sort_func Sequential sort method, like std::sort or std::stable_sort + * @param part_func Method that partitions a range, like std::partition or std::stable_partition + * @param pivot_func Method that identifies the pivot + */ + template + void parallel_quicksort(ExecPolicy &&policy, RandIt first, RandIt last, + Compare comp, SortFunc sort_func, PartFunc part_func, PivotFunc pivot_func) { + if (first == last) { + return; + } + + auto& task_pool = *policy.pool(); + + // Target partition size. Range will be recursively partitioned into partitions no bigger than this + // size. Target approximately twice as many partitions as threads to reduce impact of uneven pivot + // selection. + std::ptrdiff_t target_leaf_size = std::max(std::distance(first, last) / (task_pool.get_num_threads() * 2), + (std::ptrdiff_t)5); + + // task_thread_pool does not support creating task DAGs, so organize the code such that + // all parallel tasks are independent. The parallel tasks can spawn additional parallel tasks, and they + // record their "child" task's std::future into a common vector to be waited on by the main thread. + std::mutex mutex; + + // Futures of parallel tasks. Access protected by mutex. + std::vector> futures; + + // For signaling that all partitioning has been completed and futures vector is complete. Uses mutex. + std::condition_variable cv; + + // Number of `quicksort_impl` calls that haven't finished yet. Nonzero value means futures vector may + // still be modified. Access protected by mutex. + int inflight_spawns = 1; + + // Root task. + quicksort_impl(&task_pool, first, last, comp, sort_func, part_func, pivot_func, target_leaf_size, + &futures, &mutex, &cv, &inflight_spawns); + + // Wait for all partitioning to finish. 
+ { + std::unique_lock lock(mutex); + cv.wait(lock, [&] { return inflight_spawns == 0; }); + } + + // Wait on all the parallel tasks. + get_futures(futures); + } } } @@ -2050,8 +2191,11 @@ namespace std { return; } - poolstl::internal::parallel_sort(std::forward(policy), first, last, comp, - std::sort, std::inplace_merge); + poolstl::internal::parallel_quicksort(std::forward(policy), first, last, comp, + std::sort, + std::partition::value_type>>, + poolstl::internal::quicksort_pivot); } /** @@ -2077,8 +2221,11 @@ namespace std { return; } - poolstl::internal::parallel_sort(std::forward(policy), first, last, comp, - std::stable_sort, std::inplace_merge); + poolstl::internal::parallel_quicksort(std::forward(policy), first, last, comp, + std::stable_sort, + std::stable_partition::value_type>>, + poolstl::internal::quicksort_pivot); } /** @@ -2203,37 +2350,142 @@ namespace poolstl { /** * NOTE: Iterators are expected to be random access. * - * Like `std::sort`, but allows specifying the sequential sort and merge methods. These methods must have the - * same signature as the comparator versions of `std::sort` and `std::inplace_merge`, respectively. + * Like `std::sort`, but allows specifying the sequential sort method, which must have the + * same signature as the comparator version of `std::sort`. + * + * Implemented as a high-level quicksort that delegates to `sort_func`, in parallel, once the range has been + * sufficiently partitioned. 
*/ template poolstl::internal::enable_if_poolstl_policy pluggable_sort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp, - void (sort_func)(RandIt, RandIt, Compare) = std::sort, - void (merge_func)(RandIt, RandIt, RandIt, Compare) = std::inplace_merge) { + void (sort_func)(RandIt, RandIt, Compare) = std::sort) { if (poolstl::internal::is_seq(policy)) { sort_func(first, last, comp); return; } - poolstl::internal::parallel_sort(std::forward(policy), first, last, comp, sort_func, merge_func); + poolstl::internal::parallel_quicksort(std::forward(policy), first, last, comp, sort_func, + std::partition::value_type>>, + poolstl::internal::quicksort_pivot); } /** * NOTE: Iterators are expected to be random access. * - * Like `std::sort`, but allows specifying the sequential sort and merge methods. These methods must have the - * same signature as the comparator versions of `std::sort` and `std::inplace_merge`, respectively. + * Like `std::sort`, but allows specifying the sequential sort method, which must have the + * same signature as the comparator version of `std::sort`. + * + * Implemented as a parallel high-level quicksort that delegates to `sort_func` once the range has been + * sufficiently partitioned. */ template poolstl::internal::enable_if_poolstl_policy pluggable_sort(ExecPolicy &&policy, RandIt first, RandIt last, + void (sort_func)(RandIt, RandIt, + std::less::value_type>) = std::sort){ + using T = typename std::iterator_traits::value_type; + pluggable_sort(std::forward(policy), first, last, std::less(), sort_func); + } + + /** + * NOTE: Iterators are expected to be random access. + * + * Parallel merge sort. + * + * @param comp Comparator. + * @param sort_func Sequential sort method. Must have the same signature as the comparator version of `std::sort`. + * @param merge_func Sequential merge method. Must have the same signature as `std::inplace_merge`. 
+ */ + template + poolstl::internal::enable_if_poolstl_policy + pluggable_mergesort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp, + void (sort_func)(RandIt, RandIt, Compare) = std::sort, + void (merge_func)(RandIt, RandIt, RandIt, Compare) = std::inplace_merge) { + if (poolstl::internal::is_seq(policy)) { + sort_func(first, last, comp); + return; + } + + poolstl::internal::parallel_mergesort(std::forward(policy), + first, last, comp, sort_func, merge_func); + } + + /** + * NOTE: Iterators are expected to be random access. + * + * Parallel merge sort. + * + * Uses `std::less` comparator. + * + * @param sort_func Sequential sort method. Must have the same signature as the comparator version of `std::sort`. + * @param merge_func Sequential merge method. Must have the same signature as `std::inplace_merge`. + */ + template + poolstl::internal::enable_if_poolstl_policy + pluggable_mergesort(ExecPolicy &&policy, RandIt first, RandIt last, void (sort_func)(RandIt, RandIt, std::less::value_type>) = std::sort, void (merge_func)(RandIt, RandIt, RandIt, std::less::value_type>) = std::inplace_merge){ using T = typename std::iterator_traits::value_type; - pluggable_sort(std::forward(policy), first, last, std::less(), sort_func, merge_func); + pluggable_mergesort(std::forward(policy), first, last, std::less(), sort_func, merge_func); + } + + /** + * NOTE: Iterators are expected to be random access. + * + * Parallel quicksort that allows specifying the sequential sort and partition methods. + * + * @param comp Comparator. + * @param sort_func Sequential sort method to use once range is sufficiently partitioned. Must have the same + * signature as the comparator version of `std::sort`. + * @param part_func Sequential partition method. Must have the same signature as `std::partition`. 
+ * @param pivot_func Method that identifies the pivot element + */ + template + poolstl::internal::enable_if_poolstl_policy + pluggable_quicksort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp, + void (sort_func)(RandIt, RandIt, Compare) = std::sort, + RandIt (part_func)(RandIt, RandIt, poolstl::internal::pivot_predicate::value_type>) = std::partition, + typename std::iterator_traits::value_type (pivot_func)(RandIt, RandIt) = + poolstl::internal::quicksort_pivot) { + if (poolstl::internal::is_seq(policy)) { + sort_func(first, last, comp); + return; + } + + poolstl::internal::parallel_quicksort(std::forward(policy), + first, last, comp, sort_func, part_func, pivot_func); + } + + /** + * NOTE: Iterators are expected to be random access. + * + * Parallel quicksort that allows specifying the sequential sort and partition methods. + * + * Uses `std::less` comparator. + * + * @param sort_func Sequential sort method to use once range is sufficiently partitioned. Must have the same + * signature as the comparator version of `std::sort`. + * @param part_func Sequential partition method. Must have the same signature as `std::partition`. 
+ * @param pivot_func Method that identifies the pivot element + */ + template + poolstl::internal::enable_if_poolstl_policy + pluggable_quicksort(ExecPolicy &&policy, RandIt first, RandIt last, + void (sort_func)(RandIt, RandIt, + std::less::value_type>) = std::sort, + RandIt (part_func)(RandIt, RandIt, poolstl::internal::pivot_predicate< + std::less::value_type>, + typename std::iterator_traits::value_type>) = std::partition, + typename std::iterator_traits::value_type (pivot_func)(RandIt, RandIt) = + poolstl::internal::quicksort_pivot) { + using T = typename std::iterator_traits::value_type; + pluggable_quicksort(std::forward(policy), first, last, std::less(), + sort_func, part_func, pivot_func); } } From ffbb86366880f9c0e4bc671378ebe70ab0432c5a Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Mon, 22 Jan 2024 20:45:06 +0100 Subject: [PATCH 13/32] Update baseline commit --- tests/baseline-commit.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/baseline-commit.txt b/tests/baseline-commit.txt index 9c1e9b61..e1ff8c67 100644 --- a/tests/baseline-commit.txt +++ b/tests/baseline-commit.txt @@ -1 +1 @@ -0ced9903276834e6b9bfe095a255952f0616d330 +a905f7bdd2dcc2e843b0cbac23b51912adadfe7a From cc6928611965881a6b533d05483fb93ff18752b3 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Wed, 31 Jan 2024 09:26:05 +0100 Subject: [PATCH 14/32] Bump poolSTL to 0.3.5 --- ext/README.md | 4 +- ext/poolstl/poolstl.hpp | 185 +++++++++++++++++++++++++++------------- 2 files changed, 126 insertions(+), 63 deletions(-) diff --git a/ext/README.md b/ext/README.md index a3480467..55e874b7 100644 --- a/ext/README.md +++ b/ext/README.md @@ -30,8 +30,8 @@ License: See pdqsort/license.txt ## poolstl Homepage: https://github.com/alugowski/poolSTL/ -Downloaded file: https://github.com/alugowski/poolSTL/releases/download/v0.3.4/poolstl.hpp -Version: 0.3.4 +Downloaded file: https://github.com/alugowski/poolSTL/releases/download/v0.3.5/poolstl.hpp +Version: 0.3.5 
License: See poolstl.hpp ## robin_hood diff --git a/ext/poolstl/poolstl.hpp b/ext/poolstl/poolstl.hpp index 8d569ecb..77c3e7a0 100644 --- a/ext/poolstl/poolstl.hpp +++ b/ext/poolstl/poolstl.hpp @@ -505,7 +505,7 @@ namespace task_thread_pool { // Version macros. #define POOLSTL_VERSION_MAJOR 0 #define POOLSTL_VERSION_MINOR 3 -#define POOLSTL_VERSION_PATCH 4 +#define POOLSTL_VERSION_PATCH 5 #include #include @@ -1287,7 +1287,7 @@ namespace task_thread_pool { // Version macros. #define POOLSTL_VERSION_MAJOR 0 #define POOLSTL_VERSION_MINOR 3 -#define POOLSTL_VERSION_PATCH 4 +#define POOLSTL_VERSION_PATCH 5 #include #include @@ -1679,7 +1679,7 @@ namespace poolstl { auto iter_chunk_size = get_iter_chunk_size(first, last, chunk_size); RandIt loop_end = advanced(first, iter_chunk_size); - futures.emplace_back(task_pool.submit(std::forward(chunk), first, loop_end)); + futures.emplace_back(task_pool.submit(chunk, first, loop_end)); first = loop_end; } @@ -1702,8 +1702,7 @@ namespace poolstl { auto iter_chunk_size = get_iter_chunk_size(first, last, chunk_size); RandIt loop_end = advanced(first, iter_chunk_size); - futures.emplace_back(task_pool.submit(std::forward(chunk), first, loop_end, - std::forward(chunk_args)...)); + futures.emplace_back(task_pool.submit(chunk, first, loop_end, chunk_args...)); first = loop_end; } @@ -1719,8 +1718,7 @@ namespace poolstl { parallel_chunk_for_1_wait(ExecPolicy &&policy, RandIt first, RandIt last, Chunk chunk, ChunkRet* rettype, int extra_split_factor, A&&... 
chunk_args) { auto futures = parallel_chunk_for_1(std::forward(policy), first, last, - std::forward(chunk), rettype, extra_split_factor, - std::forward(chunk_args)...); + chunk, rettype, extra_split_factor, chunk_args...); get_futures(futures); } @@ -1739,8 +1737,7 @@ namespace poolstl { auto iter_chunk_size = get_iter_chunk_size(first1, last1, chunk_size); RandIt1 loop_end = advanced(first1, iter_chunk_size); - futures.emplace_back(task_pool.submit(std::forward(chunk), first1, loop_end, first2, - std::forward(chunk_args)...)); + futures.emplace_back(task_pool.submit(chunk, first1, loop_end, first2, chunk_args...)); first1 = loop_end; std::advance(first2, iter_chunk_size); @@ -1765,8 +1762,7 @@ namespace poolstl { auto iter_chunk_size = get_iter_chunk_size(first1, last1, chunk_size); RandIt1 loop_end = advanced(first1, iter_chunk_size); - futures.emplace_back(task_pool.submit(std::forward(chunk), first1, loop_end, first2, first3, - std::forward(chunk_args)...)); + futures.emplace_back(task_pool.submit(chunk, first1, loop_end, first2, first3, chunk_args...)); first1 = loop_end; std::advance(first2, iter_chunk_size); @@ -1896,9 +1892,14 @@ namespace poolstl { // Target partition size. Range will be recursively partitioned into partitions no bigger than this // size. Target approximately twice as many partitions as threads to reduce impact of uneven pivot // selection. - std::ptrdiff_t target_leaf_size = std::max(std::distance(first, last) / (task_pool.get_num_threads() * 2), + auto num_threads = task_pool.get_num_threads(); + std::ptrdiff_t target_leaf_size = std::max(std::distance(first, last) / (num_threads * 2), (std::ptrdiff_t)5); + if (num_threads == 1) { + target_leaf_size = std::distance(first, last); + } + // task_thread_pool does not support creating task DAGs, so organize the code such that // all parallel tasks are independent. 
The parallel tasks can spawn additional parallel tasks, and they // record their "child" task's std::future into a common vector to be waited on by the main thread. @@ -1927,6 +1928,39 @@ namespace poolstl { // Wait on all the parallel tasks. get_futures(futures); } + + /** + * Partition range according to predicate. Unstable. + * + * This implementation only parallelizes with p=2; will spawn and wait for only one task. + */ + template + RandIt partition_p2(task_thread_pool::task_thread_pool &task_pool, RandIt first, RandIt last, Predicate pred) { + auto range_size = std::distance(first, last); + if (range_size < 4) { + return std::partition(first, last, pred); + } + + // approach should be generalizable to arbitrary p + + RandIt mid = std::next(first + range_size / 2); + + // partition left and right halves in parallel + auto left_future = task_pool.submit(std::partition, first, mid, pred); + RandIt right_mid = std::partition(mid, last, pred); + RandIt left_mid = left_future.get(); + + // merge the two partitioned halves + auto left_highs_size = std::distance(left_mid, mid); + auto right_lows_size = std::distance(mid, right_mid); + if (left_highs_size <= right_lows_size) { + std::swap_ranges(left_mid, mid, right_mid - left_highs_size); + return right_mid - left_highs_size; + } else { + std::swap_ranges(mid, right_mid, left_mid); + return left_mid + right_lows_size; + } + } } } @@ -1965,8 +1999,7 @@ namespace poolstl { auto iter_chunk_size = get_iter_chunk_size(first, last, chunk_size); RandIt loop_end = advanced(first, iter_chunk_size); - threads.emplace_back(std::thread(std::forward(chunk), first, loop_end, - std::forward(chunk_args)...)); + threads.emplace_back(std::thread(chunk, first, loop_end, chunk_args...)); first = loop_end; } @@ -1982,6 +2015,66 @@ namespace poolstl { #endif +namespace poolstl { + /** + * NOTE: Iterators are expected to be random access. 
+ * + * Like `std::sort`, but allows specifying the sequential sort method, which must have the + * same signature as the comparator version of `std::sort`. + * + * Implemented as a high-level quicksort that delegates to `sort_func`, in parallel, once the range has been + * sufficiently partitioned. + */ + template + poolstl::internal::enable_if_poolstl_policy + pluggable_sort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp, + void (sort_func)(RandIt, RandIt, Compare) = std::sort) { + if (poolstl::internal::is_seq(policy)) { + sort_func(first, last, comp); + return; + } + + // Parallel partition. + // The partition_p2 method spawns and waits for its own child task. A deadlock is possible if all worker + // threads are waiting for tasks that in turn have no workers to execute them. This is only an issue because + // our thread pool does not have the concept of dependencies. + // So ensure that at most (number of threads / 2) partition calls use partition_p2; all later calls fall back to the sequential std::partition. + auto& task_pool = *policy.pool(); + std::atomic allowed_parallel_partitions{(int)task_pool.get_num_threads() / 2}; + + auto part_func = [&task_pool, &allowed_parallel_partitions](RandIt chunk_first, RandIt chunk_last, + poolstl::internal::pivot_predicate::value_type> pred) { + if (allowed_parallel_partitions.fetch_sub(1) > 0) { + return poolstl::internal::partition_p2(task_pool, chunk_first, chunk_last, pred); + } else { + return std::partition(chunk_first, chunk_last, pred); + } + }; + + poolstl::internal::parallel_quicksort(std::forward(policy), first, last, comp, sort_func, part_func, + poolstl::internal::quicksort_pivot); + } + + /** + * NOTE: Iterators are expected to be random access. + * + * Like `std::sort`, but allows specifying the sequential sort method, which must have the + * same signature as the comparator version of `std::sort`. + * + * Implemented as a parallel high-level quicksort that delegates to `sort_func` once the range has been + * sufficiently partitioned. 
+ */ + template + poolstl::internal::enable_if_poolstl_policy + pluggable_sort(ExecPolicy &&policy, RandIt first, RandIt last, + void (sort_func)(RandIt, RandIt, + std::less::value_type>) = std::sort){ + using T = typename std::iterator_traits::value_type; + pluggable_sort(std::forward(policy), first, last, std::less(), sort_func); + } +} + namespace std { /** @@ -2179,6 +2272,22 @@ namespace std { return last; } + /** + * NOTE: Iterators are expected to be random access. + * See std::partition https://en.cppreference.com/w/cpp/algorithm/partition + * + * Current implementation uses at most 2 threads. + */ + template + poolstl::internal::enable_if_poolstl_policy + partition(ExecPolicy &&policy, RandIt first, RandIt last, Predicate pred) { + if (poolstl::internal::is_seq(policy)) { + return std::partition(first, last, pred); + } + + return poolstl::internal::partition_p2(*policy.pool(), first, last, pred); + } + /** * NOTE: Iterators are expected to be random access. * See std::sort https://en.cppreference.com/w/cpp/algorithm/sort @@ -2191,11 +2300,7 @@ namespace std { return; } - poolstl::internal::parallel_quicksort(std::forward(policy), first, last, comp, - std::sort, - std::partition::value_type>>, - poolstl::internal::quicksort_pivot); + poolstl::pluggable_sort(std::forward(policy), first, last, comp, std::sort); } /** @@ -2347,48 +2452,6 @@ namespace poolstl { (void*)nullptr, 1, construct, f); } - /** - * NOTE: Iterators are expected to be random access. - * - * Like `std::sort`, but allows specifying the sequential sort method, which must have the - * same signature as the comparator version of `std::sort`. - * - * Implemented as a high-level quicksort that delegates to `sort_func`, in parallel, once the range has been - * sufficiently partitioned. 
- */ - template - poolstl::internal::enable_if_poolstl_policy - pluggable_sort(ExecPolicy &&policy, RandIt first, RandIt last, Compare comp, - void (sort_func)(RandIt, RandIt, Compare) = std::sort) { - if (poolstl::internal::is_seq(policy)) { - sort_func(first, last, comp); - return; - } - - poolstl::internal::parallel_quicksort(std::forward(policy), first, last, comp, sort_func, - std::partition::value_type>>, - poolstl::internal::quicksort_pivot); - } - - /** - * NOTE: Iterators are expected to be random access. - * - * Like `std::sort`, but allows specifying the sequential sort method, which must have the - * same signature as the comparator version of `std::sort`. - * - * Implemented as a parallel high-level quicksort that delegates to `sort_func` once the range has been - * sufficiently partitioned. - */ - template - poolstl::internal::enable_if_poolstl_policy - pluggable_sort(ExecPolicy &&policy, RandIt first, RandIt last, - void (sort_func)(RandIt, RandIt, - std::less::value_type>) = std::sort){ - using T = typename std::iterator_traits::value_type; - pluggable_sort(std::forward(policy), first, last, std::less(), sort_func); - } - /** * NOTE: Iterators are expected to be random access. 
 * From 1024b5073a13dcdefb03fb6498858ab4e39e3ca8 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Wed, 31 Jan 2024 09:29:27 +0100 Subject: [PATCH 15/32] Update baseline commit --- tests/baseline-commit.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/baseline-commit.txt b/tests/baseline-commit.txt index e1ff8c67..d2baa623 100644 --- a/tests/baseline-commit.txt +++ b/tests/baseline-commit.txt @@ -1 +1 @@ -a905f7bdd2dcc2e843b0cbac23b51912adadfe7a +cc6928611965881a6b533d05483fb93ff18752b3 From 7dc7192a018905db3c3cdb9752e65538c9ff4c6d Mon Sep 17 00:00:00 2001 From: Luis Pedro Coelho Date: Fri, 9 Feb 2024 15:19:38 +1000 Subject: [PATCH 16/32] Explicit error if too many sequences are used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Right now, strobealign only supports up to 2²⁴ sequences. If the user tries more, it would silently accept it, but later crash. This was triggered when trying to map to the Greengenes database https://ftp.microbio.me/greengenes_release/2022.10/ https://ftp.microbio.me/greengenes_release/2022.10/2022.10.seqs.fna.gz Even a single read like the one below triggers a crash ``` @M05314:127:000000000-BWLLJ:1:1101:15267:1654 2:N:0:1 CCTGTTCGCTCCCCACGCTTTCGTCCCTCAGCGTCAATATTGTGCCAGAATGCTGCCTTCGCCATTGGTGTTCCTCCTGATATCTACGCATGTCACCGCTACACCAGGAATTCCACATTCCTCTCACATATTCTATTTTATCAGTTTTGAT + AAA1AF@1>AAAGG1A0EAFGGEHAAEGFCG1AAEE/F2FG2F2FF1CA0FBDED1BGFGFFE?AF1BFFCFHDGFFHB1FFGFGEEFE/?/BF2F@/EGEEB00/0//0BFG1>B1BGFEFHHGGFFD12BGH2FDFFFGG22GDD>@/F ``` --- src/main.cpp | 5 +++++ src/randstrobes.hpp | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/src/main.cpp b/src/main.cpp index dcc96774..f9b8f65a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -25,6 +25,7 @@ #include "timer.hpp" #include "readlen.hpp" #include "version.hpp" +#include "randstrobes.hpp" #include "buildconfig.hpp" @@ -209,6 +210,10 @@ int run_strobealign(int argc, char **argv) { throw InvalidFasta("No reference sequences 
found"); } + if (references.size() > RefRandstrobe::max_number_of_references) { + throw InvalidFasta("Too many reference sequences. Current maximum is " + std::to_string(RefRandstrobe::max_number_of_references)); + } + StrobemerIndex index(references, index_parameters, opt.bits); if (opt.use_index) { // Read the index from a file diff --git a/src/randstrobes.hpp b/src/randstrobes.hpp index 7a117b3c..8bdaf779 100644 --- a/src/randstrobes.hpp +++ b/src/randstrobes.hpp @@ -41,10 +41,14 @@ struct RefRandstrobe { return m_packed & mask; } + private: static constexpr int bit_alloc = 8; static constexpr int mask = (1 << bit_alloc) - 1; packed_t m_packed; // packed representation of ref_index and strobe offset + +public: + static constexpr uint32_t max_number_of_references = (1 << (32 - bit_alloc)) - 1; }; struct QueryRandstrobe { From 2e4ff9500e68d6e465735dd276d362cf71851dcd Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Wed, 14 Feb 2024 20:45:31 +0100 Subject: [PATCH 17/32] Ensure sorting of randstrobes is reproducible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... by sorting them also by position. This way, it does not matter in which way the randstrobes vector is partitioned during sort. Otherwise, the order would depend on the number of threads used to create the index, making mapping results not reproducible across runs that do not use the same no. of threads. Note: There is still a tiny chance for collisions/nondeterminism because we ignore RefRandstrobe::m_packed. For efficiency, this uses a branchless comparison function inspired by @alugowski’s comment in PR #386. 
Runtimes for sorting with one thread 32 s - only by hash 45 s - by hash and position using std::tie 42 s - by hash and position using branchless_compare from the PR 35 s - by hash and position using __uint128_t (this commit) Runtimes for sorting (four cores with hyperthreading) threads | sorting time | index creation time -|-|- 1 | 35 s | 154 s 2 | 25 s | 95 s 4 | 16 s | 60 s 8 | 15 s | 48 s --- src/randstrobes.hpp | 8 +++++++- tests/baseline-commit.txt | 2 +- tests/compare-baseline.sh | 8 ++++++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/randstrobes.hpp b/src/randstrobes.hpp index 7a117b3c..0c72aaed 100644 --- a/src/randstrobes.hpp +++ b/src/randstrobes.hpp @@ -30,7 +30,13 @@ struct RefRandstrobe { , m_packed(packed) { } bool operator< (const RefRandstrobe& other) const { - return hash < other.hash; + // Compare both hash and position to ensure that the order of the + // RefRandstrobes in the index is reproducible no matter which sorting + // function is used. This branchless comparison is faster than the + // equivalent one using std::tie. 
+ __uint128_t lhs = (static_cast<__uint128_t>(hash) << 64) | position; + __uint128_t rhs = (static_cast<__uint128_t>(other.hash) << 64) | other.position; + return lhs < rhs; } int reference_index() const { diff --git a/tests/baseline-commit.txt b/tests/baseline-commit.txt index d2baa623..9ffa9ec8 100644 --- a/tests/baseline-commit.txt +++ b/tests/baseline-commit.txt @@ -1 +1 @@ -cc6928611965881a6b533d05483fb93ff18752b3 +24995f4168108232528fbe5132441c9f1d0401b3 diff --git a/tests/compare-baseline.sh b/tests/compare-baseline.sh index 57f9faac..4bc2223b 100755 --- a/tests/compare-baseline.sh +++ b/tests/compare-baseline.sh @@ -13,8 +13,12 @@ set -euo pipefail python3 -c 'import pysam' ends="pe" -while getopts "s" opt; do +threads=4 +while getopts "st:" opt; do case "${opt}" in + t) + threads=$OPTARG + ;; s) ends=se # single-end reads ;; @@ -38,7 +42,7 @@ baseline_commit=$(< tests/baseline-commit.txt) baseline_bam=baseline/bam/${baseline_commit}.${ends}.bam baseline_binary=baseline/strobealign-${baseline_commit} cmake_options=-DCMAKE_BUILD_TYPE=RelWithDebInfo -strobealign_options="-t 4" +strobealign_options="-t ${threads}" # Generate the baseline BAM if necessary mkdir -p baseline/bam From 724a1df6d518aa35a06e3deeb10c9254e79ede16 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Wed, 14 Feb 2024 21:41:22 +0100 Subject: [PATCH 18/32] Update baseline commit --- tests/baseline-commit.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/baseline-commit.txt b/tests/baseline-commit.txt index 9ffa9ec8..471fbbbb 100644 --- a/tests/baseline-commit.txt +++ b/tests/baseline-commit.txt @@ -1 +1 @@ -24995f4168108232528fbe5132441c9f1d0401b3 +2e4ff9500e68d6e465735dd276d362cf71851dcd From 31309bd4322582c5fba24ff8e0145e6a380c9e64 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Fri, 16 Feb 2024 15:10:08 +0100 Subject: [PATCH 19/32] Introduce canonical read length 75 Closes #395 --- CHANGES.md | 4 ++++ README.md | 6 +++--- src/indexparameters.cpp | 3 ++- 3 
files changed, 9 insertions(+), 4 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 644e9644..2d907665 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -9,6 +9,10 @@ * #376: Improve accuracy for read length 50 by optimizing the default indexing parameters. Paired-end accuracy increases by 0.3 percentage points on average. Single-end accuracy increases by 1 percentage point. +* #395: Previously, read length 75 used the same indexing parameters as length + 50, but the improved settings for length 50 are not the best for length 75. + To avoid a decrease in accuracy, we introduced a new set of pre-defined + indexing parameters for read length 75 (a new canonical read length). * If `--details` is used, output `X0:i` SAM tag with the number of identically-scored best alignments * #378: Added `-C` option for appending the FASTA or FASTQ comment to SAM diff --git a/README.md b/README.md index 1ef03a51..f2d0f8c1 100644 --- a/README.md +++ b/README.md @@ -145,9 +145,9 @@ options. Some important ones are: Strobealign needs to build an index (strobemer index) of the reference before it can map reads to it. The optimal indexing parameters depend on the length of the input reads. -There are currently seven different pre-defined sets of parameters that are -optimized for different read lengths. These *canonical read lengths* are -50, 100, 125, 150, 250 and 400. When deciding which of the pre-defined +There are pre-defined sets of parameters that are optimized for different read +lengths. These *canonical read lengths* are +50, 75, 100, 125, 150, 250 and 400. When deciding which of the pre-defined indexing parameter sets to use, strobealign chooses one whose canonical read length is close to the average read length of the input. 
diff --git a/src/indexparameters.cpp b/src/indexparameters.cpp index 2f634e9f..0c655903 100644 --- a/src/indexparameters.cpp +++ b/src/indexparameters.cpp @@ -35,7 +35,8 @@ struct Profile { static auto max{std::numeric_limits::max()}; static std::vector profiles = { - Profile{ 50, 90, 18, -4, -2, 1}, + Profile{ 50, 70, 18, -4, -2, 1}, + Profile{ 75, 90, 20, -4, -3, 2}, Profile{100, 110, 20, -4, -2, 2}, Profile{125, 135, 20, -4, -1, 4}, Profile{150, 175, 20, -4, 1, 7}, From 02ea1966b5afa5617d7fc133cefbbc7b43e099b5 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Wed, 15 Feb 2023 23:20:24 +0100 Subject: [PATCH 20/32] Update ksw2 to the most recent upstream version --- ext/ksw2.h | 312 +++++++++-------------- ext/ksw2_extz2_sse.c | 574 +++++++++++++++++++------------------------ 2 files changed, 367 insertions(+), 519 deletions(-) diff --git a/ext/ksw2.h b/ext/ksw2.h index 04eeda27..01edd45e 100644 --- a/ext/ksw2.h +++ b/ext/ksw2.h @@ -15,20 +15,30 @@ #define KSW_EZ_SPLICE_FOR 0x100 #define KSW_EZ_SPLICE_REV 0x200 #define KSW_EZ_SPLICE_FLANK 0x400 +#define KSW_EZ_EQX 0x800 + +// The subset of CIGAR operators used by ksw code. +// Use MM_CIGAR_* from minimap.h if you need the full list. 
+#define KSW_CIGAR_MATCH 0 +#define KSW_CIGAR_INS 1 +#define KSW_CIGAR_DEL 2 +#define KSW_CIGAR_N_SKIP 3 +#define KSW_CIGAR_EQ 7 +#define KSW_CIGAR_X 8 #ifdef __cplusplus extern "C" { #endif typedef struct { - uint32_t max: 31, zdropped: 1; - int max_q, max_t; // max extension coordinate - int mqe, mqe_t; // max score when reaching the end of query - int mte, mte_q; // max score when reaching the end of target - int score; // max score reaching both ends; may be KSW_NEG_INF - int m_cigar, n_cigar; - int reach_end; - uint32_t *cigar; + uint32_t max:31, zdropped:1; + int max_q, max_t; // max extension coordinate + int mqe, mqe_t; // max score when reaching the end of query + int mte, mte_q; // max score when reaching the end of target + int score; // max score reaching both ends; may be KSW_NEG_INF + int m_cigar, n_cigar; + int reach_end; + uint32_t *cigar; } ksw_extz_t; /** @@ -49,69 +59,21 @@ typedef struct { * @param ez (out) scores and cigar */ void ksw_extz(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez); + int8_t q, int8_t e, int w, int zdrop, int flag, ksw_extz_t *ez); -void ksw_extz2_sse(void *km, - int qlen, - const uint8_t *query, - int tlen, - const uint8_t *target, - int8_t m, - const int8_t *mat, - int8_t q, - int8_t e, - int w, - int zdrop, - int end_bonus, - int flag, - ksw_extz_t *ez); +void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez); void ksw_extd(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, - int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez); + int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int flag, ksw_extz_t *ez); -void ksw_extd2_sse(void *km, 
- int qlen, - const uint8_t *query, - int tlen, - const uint8_t *target, - int8_t m, - const int8_t *mat, - int8_t gapo, - int8_t gape, - int8_t gapo2, - int8_t gape2, - int w, - int zdrop, - int end_bonus, - int flag, - ksw_extz_t *ez); +void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t gapo, int8_t gape, int8_t gapo2, int8_t gape2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez); -void ksw_exts2_sse(void *km, - int qlen, - const uint8_t *query, - int tlen, - const uint8_t *target, - int8_t m, - const int8_t *mat, - int8_t gapo, - int8_t gape, - int8_t gapo2, - int8_t noncan, - int zdrop, - int flag, - ksw_extz_t *ez); +void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, + int8_t gapo, int8_t gape, int8_t gapo2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez); -void ksw_extf2_sse(void *km, - int qlen, - const uint8_t *query, - int tlen, - const uint8_t *target, - int8_t mch, - int8_t mis, - int8_t e, - int w, - int xdrop, - ksw_extz_t *ez); +void ksw_extf2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t mch, int8_t mis, int8_t e, int w, int xdrop, ksw_extz_t *ez); /** * Global alignment @@ -123,45 +85,9 @@ void ksw_extf2_sse(void *km, * * @return score of the alignment */ -int ksw_gg(void *km, - int qlen, - const uint8_t *query, - int tlen, - const uint8_t *target, - int8_t m, - const int8_t *mat, - int8_t gapo, - int8_t gape, - int w, - int *m_cigar_, - int *n_cigar_, - uint32_t **cigar_); -int ksw_gg2(void *km, - int qlen, - const uint8_t *query, - int tlen, - const uint8_t *target, - int8_t m, - const int8_t *mat, - int8_t gapo, - int8_t gape, - int w, - int *m_cigar_, - int *n_cigar_, - uint32_t **cigar_); -int ksw_gg2_sse(void *km, - int qlen, - const uint8_t *query, - int tlen, - const uint8_t *target, - int8_t m, - 
const int8_t *mat, - int8_t gapo, - int8_t gape, - int w, - int *m_cigar_, - int *n_cigar_, - uint32_t **cigar_); +int ksw_gg(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_); +int ksw_gg2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_); +int ksw_gg2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t gapo, int8_t gape, int w, int *m_cigar_, int *n_cigar_, uint32_t **cigar_); void *ksw_ll_qinit(void *km, int size, int qlen, const uint8_t *query, int m, const int8_t *mat); int ksw_ll_i16(void *q, int tlen, const uint8_t *target, int gapo, int gape, int *qe, int *te); @@ -184,107 +110,99 @@ int ksw_ll_i16(void *q, int tlen, const uint8_t *target, int gapo, int gape, int #define kfree(km, ptr) free((ptr)) #endif -static inline uint32_t *ksw_push_cigar(void *km, int *n_cigar, int *m_cigar, uint32_t *cigar, uint32_t op, int len) { - if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1] & 0xf)) { - if (*n_cigar == *m_cigar) { - *m_cigar = *m_cigar ? (*m_cigar) << 1 : 4; - cigar = (uint32_t *) krealloc(km, cigar, (*m_cigar) << 2); - } - cigar[(*n_cigar)++] = len << 4 | op; - } else - cigar[(*n_cigar) - 1] += len << 4; - return cigar; +static inline uint32_t *ksw_push_cigar(void *km, int *n_cigar, int *m_cigar, uint32_t *cigar, uint32_t op, int len) +{ + if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) { + if (*n_cigar == *m_cigar) { + *m_cigar = *m_cigar? 
(*m_cigar)<<1 : 4; + cigar = (uint32_t*)krealloc(km, cigar, (*m_cigar) << 2); + } + cigar[(*n_cigar)++] = len<<4 | op; + } else cigar[(*n_cigar)-1] += len<<4; + return cigar; } // In the backtrack matrix, value p[] has the following structure: // bit 0-2: which type gets the max - 0 for H, 1 for E, 2 for F, 3 for \tilde{E} and 4 for \tilde{F} // bit 3/0x08: 1 if a continuation on the E state (bit 5/0x20 for a continuation on \tilde{E}) // bit 4/0x10: 1 if a continuation on the F state (bit 6/0x40 for a continuation on \tilde{F}) -static inline void ksw_backtrack(void *km, - int is_rot, - int is_rev, - int min_intron_len, - const uint8_t *p, - const int *off, - const int *off_end, - int n_col, - int i0, - int j0, - int *m_cigar_, - int *n_cigar_, - uint32_t **cigar_) { // p[] - lower 3 bits: which type gets the max; bit - int n_cigar = 0, m_cigar = *m_cigar_, i = i0, j = j0, r, state = 0; - uint32_t *cigar = *cigar_, tmp; - while (i >= 0 && j >= 0) { // at the beginning of the loop, _state_ tells us which state to check - int force_state = -1; - if (is_rot) { - r = i + j; - if (i < off[r]) - force_state = 2; - if (off_end && i > off_end[r]) - force_state = 1; - tmp = force_state < 0 ? p[(size_t) r * n_col + i - off[r]] : 0; - } else { - if (j < off[i]) - force_state = 2; - if (off_end && j > off_end[i]) - force_state = 1; - tmp = force_state < 0 ? p[(size_t) i * n_col + j - off[i]] : 0; - } - if (state == 0) - state = tmp & 7; // if requesting the H state, find state one maximizes it. 
- else if (!(tmp >> (state + 2) & 1)) - state = 0; // if requesting other states, _state_ stays the same if it is a continuation; otherwise, set to H - if (state == 0) - state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure - if (force_state >= 0) - state = force_state; - if (state == 0) - cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 0, 1), --i, --j; // match - else if (state == 1 || (state == 3 && min_intron_len <= 0)) - cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 2, 1), --i; // deletion - else if (state == 3 && min_intron_len > 0) - cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 3, 1), --i; // intron - else - cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, 1), --j; // insertion - } - if (i >= 0) - cigar = ksw_push_cigar(km, - &n_cigar, - &m_cigar, - cigar, - min_intron_len > 0 && i >= min_intron_len ? 3 : 2, - i + 1); // first deletion - if (j >= 0) - cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, 1, j + 1); // first insertion - if (!is_rev) - for (i = 0; i < n_cigar >> 1; ++i) // reverse CIGAR - tmp = cigar[i], cigar[i] = cigar[n_cigar - 1 - i], cigar[n_cigar - 1 - i] = tmp; - *m_cigar_ = m_cigar, *n_cigar_ = n_cigar, *cigar_ = cigar; +static inline void ksw_backtrack(void *km, int is_rot, int is_rev, int min_intron_len, const uint8_t *p, const int *off, const int *off_end, int n_col, int i0, int j0, + int *m_cigar_, int *n_cigar_, uint32_t **cigar_) +{ // p[] - lower 3 bits: which type gets the max; bit + int n_cigar = 0, m_cigar = *m_cigar_, i = i0, j = j0, r, state = 0; + uint32_t *cigar = *cigar_, tmp; + while (i >= 0 && j >= 0) { // at the beginning of the loop, _state_ tells us which state to check + int force_state = -1; + if (is_rot) { + r = i + j; + if (i < off[r]) force_state = 2; + if (off_end && i > off_end[r]) force_state = 1; + tmp = force_state < 0? 
p[(size_t)r * n_col + i - off[r]] : 0; + } else { + if (j < off[i]) force_state = 2; + if (off_end && j > off_end[i]) force_state = 1; + tmp = force_state < 0? p[(size_t)i * n_col + j - off[i]] : 0; + } + if (state == 0) state = tmp & 7; // if requesting the H state, find state one maximizes it. + else if (!(tmp >> (state + 2) & 1)) state = 0; // if requesting other states, _state_ stays the same if it is a continuation; otherwise, set to H + if (state == 0) state = tmp & 7; // TODO: probably this line can be merged into the "else if" line right above; not 100% sure + if (force_state >= 0) state = force_state; + if (state == 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_MATCH, 1), --i, --j; + else if (state == 1 || (state == 3 && min_intron_len <= 0)) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_DEL, 1), --i; + else if (state == 3 && min_intron_len > 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_N_SKIP, 1), --i; + else cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_INS, 1), --j; + } + if (i >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, min_intron_len > 0 && i >= min_intron_len? 
KSW_CIGAR_N_SKIP : KSW_CIGAR_DEL, i + 1); // first deletion + if (j >= 0) cigar = ksw_push_cigar(km, &n_cigar, &m_cigar, cigar, KSW_CIGAR_INS, j + 1); // first insertion + if (!is_rev) + for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR + tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp; + *m_cigar_ = m_cigar, *n_cigar_ = n_cigar, *cigar_ = cigar; +} + +static inline void ksw_cigar2eqx(void *km, const uint8_t *query, const uint8_t *target, int nc0, const uint32_t *ci0, int *mc1, int *nc1, uint32_t **ci1) +{ + int i, k, x = 0, y = 0; + *nc1 = 0; + for (k = 0; k < nc0; ++k) { + int op = ci0[k]&0xf, len = ci0[k]>>4; + if (op == KSW_CIGAR_MATCH) { + for (i = 0; i < len; ++i) { + if (target[x + i] == query[y + i]) ksw_push_cigar(km, nc1, mc1, *ci1, KSW_CIGAR_EQ, 1); + else ksw_push_cigar(km, nc1, mc1, *ci1, KSW_CIGAR_X, 1); + } + x += len, y += len; + } else { + ksw_push_cigar(km, nc1, mc1, *ci1, op, len); + if (op == KSW_CIGAR_DEL || op == KSW_CIGAR_N_SKIP) x += len; + else if (op == KSW_CIGAR_INS) y += len; + else if (op == KSW_CIGAR_EQ || op == KSW_CIGAR_X) x += len, y += len; + } + } } -static inline void ksw_reset_extz(ksw_extz_t *ez) { - ez->max_q = ez->max_t = ez->mqe_t = ez->mte_q = -1; - ez->max = 0, ez->score = ez->mqe = ez->mte = KSW_NEG_INF; - ez->n_cigar = 0, ez->zdropped = 0, ez->reach_end = 0; +static inline void ksw_reset_extz(ksw_extz_t *ez) +{ + ez->max_q = ez->max_t = ez->mqe_t = ez->mte_q = -1; + ez->max = 0, ez->score = ez->mqe = ez->mte = KSW_NEG_INF; + ez->n_cigar = 0, ez->zdropped = 0, ez->reach_end = 0; } -static inline int ksw_apply_zdrop(ksw_extz_t *ez, int is_rot, int32_t H, int a, int b, int zdrop, int8_t e) { - int r, t; - if (is_rot) - r = a, t = b; - else - r = a + b, t = a; - if (H > (int32_t) ez->max) { - ez->max = H, ez->max_t = t, ez->max_q = r - t; - } else if (t >= ez->max_t && r - t >= ez->max_q) { - int tl = t - ez->max_t, ql = (r - t) - ez->max_q, l; - l = tl > ql ? 
tl - ql : ql - tl; - if (zdrop >= 0 && ez->max - H > zdrop + l * e) { - ez->zdropped = 1; - return 1; - } - } - return 0; +static inline int ksw_apply_zdrop(ksw_extz_t *ez, int is_rot, int32_t H, int a, int b, int zdrop, int8_t e) +{ + int r, t; + if (is_rot) r = a, t = b; + else r = a + b, t = a; + if (H > (int32_t)ez->max) { + ez->max = H, ez->max_t = t, ez->max_q = r - t; + } else if (t >= ez->max_t && r - t >= ez->max_q) { + int tl = t - ez->max_t, ql = (r - t) - ez->max_q, l; + l = tl > ql? tl - ql : ql - tl; + if (zdrop >= 0 && ez->max - H > zdrop + l * e) { + ez->zdropped = 1; + return 1; + } + } + return 0; } -#endif \ No newline at end of file +#endif diff --git a/ext/ksw2_extz2_sse.c b/ext/ksw2_extz2_sse.c index 767a749f..02bb4c2a 100644 --- a/ext/ksw2_extz2_sse.c +++ b/ext/ksw2_extz2_sse.c @@ -20,356 +20,286 @@ void ksw_extz2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const u void ksw_extz2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez) #endif #else -void ksw_extz2_sse(void *km, - int qlen, - const uint8_t *query, - int tlen, - const uint8_t *target, - int8_t m, - const int8_t *mat, - int8_t q, - int8_t e, - int w, - int zdrop, - int end_bonus, - int flag, - ksw_extz_t *ez) +void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez) #endif // ~KSW_CPU_DISPATCH { #define __dp_code_block1 \ - z = _mm_add_epi8(_mm_load_si128(&s[t]), qe2_); \ - xt1 = _mm_load_si128(&x[t]); /* xt1 <- x[r-1][t..t+15] */ \ - tmp = _mm_srli_si128(xt1, 15); /* tmp <- x[r-1][t+15] */ \ - xt1 = _mm_or_si128(_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \ - x1_ = tmp; \ - vt1 = _mm_load_si128(&v[t]); /* vt1 <- v[r-1][t..t+15] */ \ - tmp = _mm_srli_si128(vt1, 15); /* tmp <- 
v[r-1][t+15] */ \ - vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \ - v1_ = tmp; \ - a = _mm_add_epi8(xt1, vt1); /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \ - ut = _mm_load_si128(&u[t]); /* ut <- u[t..t+15] */ \ - b = _mm_add_epi8(_mm_load_si128(&y[t]), ut); /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */ + z = _mm_add_epi8(_mm_load_si128(&s[t]), qe2_); \ + xt1 = _mm_load_si128(&x[t]); /* xt1 <- x[r-1][t..t+15] */ \ + tmp = _mm_srli_si128(xt1, 15); /* tmp <- x[r-1][t+15] */ \ + xt1 = _mm_or_si128(_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \ + x1_ = tmp; \ + vt1 = _mm_load_si128(&v[t]); /* vt1 <- v[r-1][t..t+15] */ \ + tmp = _mm_srli_si128(vt1, 15); /* tmp <- v[r-1][t+15] */ \ + vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \ + v1_ = tmp; \ + a = _mm_add_epi8(xt1, vt1); /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \ + ut = _mm_load_si128(&u[t]); /* ut <- u[t..t+15] */ \ + b = _mm_add_epi8(_mm_load_si128(&y[t]), ut); /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */ #define __dp_code_block2 \ - z = _mm_max_epu8(z, b); /* z = max(z, b); this works because both are non-negative */ \ - z = _mm_min_epu8(z, max_sc_); \ - _mm_store_si128(&u[t], _mm_sub_epi8(z, vt1)); /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \ - _mm_store_si128(&v[t], _mm_sub_epi8(z, ut)); /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \ - z = _mm_sub_epi8(z, q_); \ - a = _mm_sub_epi8(a, z); \ - b = _mm_sub_epi8(b, z); + z = _mm_max_epu8(z, b); /* z = max(z, b); this works because both are non-negative */ \ + z = _mm_min_epu8(z, max_sc_); \ + _mm_store_si128(&u[t], _mm_sub_epi8(z, vt1)); /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \ + _mm_store_si128(&v[t], _mm_sub_epi8(z, ut)); /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \ + z = _mm_sub_epi8(z, q_); \ + a = _mm_sub_epi8(a, z); \ + b = _mm_sub_epi8(b, z); - int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc; - 
int with_cigar = !(flag & KSW_EZ_SCORE_ONLY), approx_max = !!(flag & KSW_EZ_APPROX_MAX); - int32_t *H = 0, H0 = 0, last_H0_t = 0; - uint8_t *qr, *sf, *mem, *mem2 = 0; - __m128i q_, qe2_, zero_, flag1_, flag2_, flag8_, flag16_, sc_mch_, sc_mis_, sc_N_, m1_, max_sc_; - __m128i *u, *v, *x, *y, *s, *p = 0; + int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc; + int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX); + int32_t *H = 0, H0 = 0, last_H0_t = 0; + uint8_t *qr, *sf, *mem, *mem2 = 0; + __m128i q_, qe2_, zero_, flag1_, flag2_, flag8_, flag16_, sc_mch_, sc_mis_, sc_N_, m1_, max_sc_; + __m128i *u, *v, *x, *y, *s, *p = 0; - ksw_reset_extz(ez); - if (m <= 0 || qlen <= 0 || tlen <= 0) - return; + ksw_reset_extz(ez); + if (m <= 0 || qlen <= 0 || tlen <= 0) return; - zero_ = _mm_set1_epi8(0); - q_ = _mm_set1_epi8(q); - qe2_ = _mm_set1_epi8((q + e) * 2); - flag1_ = _mm_set1_epi8(1); - flag2_ = _mm_set1_epi8(2); - flag8_ = _mm_set1_epi8(0x08); - flag16_ = _mm_set1_epi8(0x10); - sc_mch_ = _mm_set1_epi8(mat[0]); - sc_mis_ = _mm_set1_epi8(mat[1]); - sc_N_ = mat[m * m - 1] == 0 ? _mm_set1_epi8(-e) : _mm_set1_epi8(mat[m * m - 1]); - m1_ = _mm_set1_epi8(m - 1); // wildcard - max_sc_ = _mm_set1_epi8(mat[0] + (q + e) * 2); + zero_ = _mm_set1_epi8(0); + q_ = _mm_set1_epi8(q); + qe2_ = _mm_set1_epi8((q + e) * 2); + flag1_ = _mm_set1_epi8(1); + flag2_ = _mm_set1_epi8(2); + flag8_ = _mm_set1_epi8(0x08); + flag16_ = _mm_set1_epi8(0x10); + sc_mch_ = _mm_set1_epi8(mat[0]); + sc_mis_ = _mm_set1_epi8(mat[1]); + sc_N_ = mat[m*m-1] == 0? _mm_set1_epi8(-e) : _mm_set1_epi8(mat[m*m-1]); + m1_ = _mm_set1_epi8(m - 1); // wildcard + max_sc_ = _mm_set1_epi8(mat[0] + (q + e) * 2); - if (w < 0) - w = tlen > qlen ? tlen : qlen; - wl = wr = w; - tlen_ = (tlen + 15) / 16; - n_col_ = qlen < tlen ? qlen : tlen; - n_col_ = ((n_col_ < w + 1 ? 
n_col_ : w + 1) + 15) / 16 + 1; - qlen_ = (qlen + 15) / 16; - for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) { - max_sc = max_sc > mat[t] ? max_sc : mat[t]; - min_sc = min_sc < mat[t] ? min_sc : mat[t]; - } - if (-min_sc > 2 * (q + e)) - return; // otherwise, we won't see any mismatches + if (w < 0) w = tlen > qlen? tlen : qlen; + wl = wr = w; + tlen_ = (tlen + 15) / 16; + n_col_ = qlen < tlen? qlen : tlen; + n_col_ = ((n_col_ < w + 1? n_col_ : w + 1) + 15) / 16 + 1; + qlen_ = (qlen + 15) / 16; + for (t = 1, max_sc = mat[0], min_sc = mat[1]; t < m * m; ++t) { + max_sc = max_sc > mat[t]? max_sc : mat[t]; + min_sc = min_sc < mat[t]? min_sc : mat[t]; + } + if (-min_sc > 2 * (q + e)) return; // otherwise, we won't see any mismatches - mem = (uint8_t *) kcalloc(km, tlen_ * 6 + qlen_ + 1, 16); - u = (__m128i *) (((size_t) mem + 15) >> 4 << 4); // 16-byte aligned - v = u + tlen_, x = v + tlen_, y = x + tlen_, s = y + tlen_, sf = (uint8_t *) (s + tlen_), qr = sf + tlen_ * 16; - if (!approx_max) { - H = (int32_t *) kmalloc(km, tlen_ * 16 * 4); - for (t = 0; t < tlen_ * 16; ++t) - H[t] = KSW_NEG_INF; - } - if (with_cigar) { - mem2 = (uint8_t *) kmalloc(km, ((size_t) (qlen + tlen - 1) * n_col_ + 1) * 16); - p = (__m128i *) (((size_t) mem2 + 15) >> 4 << 4); - off = (int *) kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2); - off_end = off + qlen + tlen - 1; - } + mem = (uint8_t*)kcalloc(km, tlen_ * 6 + qlen_ + 1, 16); + u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned + v = u + tlen_, x = v + tlen_, y = x + tlen_, s = y + tlen_, sf = (uint8_t*)(s + tlen_), qr = sf + tlen_ * 16; + if (!approx_max) { + H = (int32_t*)kmalloc(km, tlen_ * 16 * 4); + for (t = 0; t < tlen_ * 16; ++t) H[t] = KSW_NEG_INF; + } + if (with_cigar) { + mem2 = (uint8_t*)kmalloc(km, ((size_t)(qlen + tlen - 1) * n_col_ + 1) * 16); + p = (__m128i*)(((size_t)mem2 + 15) >> 4 << 4); + off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2); + off_end = off + qlen + tlen - 1; + } 
- for (t = 0; t < qlen; ++t) - qr[t] = query[qlen - 1 - t]; - memcpy(sf, target, tlen); + for (t = 0; t < qlen; ++t) qr[t] = query[qlen - 1 - t]; + memcpy(sf, target, tlen); - for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) { - int st = 0, en = tlen - 1, st0, en0, st_, en_; - int8_t x1, v1; - uint8_t *qrr = qr + (qlen - 1 - r), *u8 = (uint8_t *) u, *v8 = (uint8_t *) v; - __m128i x1_, v1_; - // find the boundaries - if (st < r - qlen + 1) - st = r - qlen + 1; - if (en > r) - en = r; - if (st < (r - wr + 1) >> 1) - st = (r - wr + 1) >> 1; // take the ceil - if (en > (r + wl) >> 1) - en = (r + wl) >> 1; // take the floor - if (st > en) { - ez->zdropped = 1; - break; - } - st0 = st, en0 = en; - st = st / 16 * 16, en = (en + 16) / 16 * 16 - 1; - // set boundary conditions - if (st > 0) { - if (st - 1 >= last_st && st - 1 <= last_en) - x1 = ((uint8_t *) x)[st - 1], v1 = v8[st - 1]; // (r-1,s-1) calculated in the last round - else - x1 = v1 = 0; // not calculated; set to zeros - } else - x1 = 0, v1 = r ? q : 0; - if (en >= r) - ((uint8_t *) y)[r] = 0, u8[r] = r ? 
q : 0; - // loop fission: set scores first - if (!(flag & KSW_EZ_GENERIC_SC)) { - for (t = st0; t <= en0; t += 16) { - __m128i sq, st, tmp, mask; - sq = _mm_loadu_si128((__m128i *) &sf[t]); - st = _mm_loadu_si128((__m128i *) &qrr[t]); - mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_)); - tmp = _mm_cmpeq_epi8(sq, st); + for (r = 0, last_st = last_en = -1; r < qlen + tlen - 1; ++r) { + int st = 0, en = tlen - 1, st0, en0, st_, en_; + int8_t x1, v1; + uint8_t *qrr = qr + (qlen - 1 - r), *u8 = (uint8_t*)u, *v8 = (uint8_t*)v; + __m128i x1_, v1_; + // find the boundaries + if (st < r - qlen + 1) st = r - qlen + 1; + if (en > r) en = r; + if (st < (r-wr+1)>>1) st = (r-wr+1)>>1; // take the ceil + if (en > (r+wl)>>1) en = (r+wl)>>1; // take the floor + if (st > en) { + ez->zdropped = 1; + break; + } + st0 = st, en0 = en; + st = st / 16 * 16, en = (en + 16) / 16 * 16 - 1; + // set boundary conditions + if (st > 0) { + if (st - 1 >= last_st && st - 1 <= last_en) + x1 = ((uint8_t*)x)[st - 1], v1 = v8[st - 1]; // (r-1,s-1) calculated in the last round + else x1 = v1 = 0; // not calculated; set to zeros + } else x1 = 0, v1 = r? q : 0; + if (en >= r) ((uint8_t*)y)[r] = 0, u8[r] = r? 
q : 0; + // loop fission: set scores first + if (!(flag & KSW_EZ_GENERIC_SC)) { + for (t = st0; t <= en0; t += 16) { + __m128i sq, st, tmp, mask; + sq = _mm_loadu_si128((__m128i*)&sf[t]); + st = _mm_loadu_si128((__m128i*)&qrr[t]); + mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_)); + tmp = _mm_cmpeq_epi8(sq, st); #ifdef __SSE4_1__ - tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp); - tmp = _mm_blendv_epi8(tmp, sc_N_, mask); + tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp); + tmp = _mm_blendv_epi8(tmp, sc_N_, mask); #else - tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_)); - tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp), _mm_and_si128(mask, sc_N_)); + tmp = _mm_or_si128(_mm_andnot_si128(tmp, sc_mis_), _mm_and_si128(tmp, sc_mch_)); + tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp), _mm_and_si128(mask, sc_N_)); #endif - _mm_storeu_si128((__m128i *) ((uint8_t *) s + t), tmp); - } - } else { - for (t = st0; t <= en0; ++t) - ((uint8_t *) s)[t] = mat[sf[t] * m + qrr[t]]; - } - // core loop - x1_ = _mm_cvtsi32_si128(x1); - v1_ = _mm_cvtsi32_si128(v1); - st_ = st / 16, en_ = en / 16; - assert(en_ - st_ + 1 <= n_col_); - if (!with_cigar) { // score only - for (t = st_; t <= en_; ++t) { - __m128i z, a, b, xt1, vt1, ut, tmp; - __dp_code_block1; + _mm_storeu_si128((__m128i*)((uint8_t*)s + t), tmp); + } + } else { + for (t = st0; t <= en0; ++t) + ((uint8_t*)s)[t] = mat[sf[t] * m + qrr[t]]; + } + // core loop + x1_ = _mm_cvtsi32_si128(x1); + v1_ = _mm_cvtsi32_si128(v1); + st_ = st / 16, en_ = en / 16; + assert(en_ - st_ + 1 <= n_col_); + if (!with_cigar) { // score only + for (t = st_; t <= en_; ++t) { + __m128i z, a, b, xt1, vt1, ut, tmp; + __dp_code_block1; #ifdef __SSE4_1__ - z = _mm_max_epi8(z, a); // z = z > a? z : a (signed) + z = _mm_max_epi8(z, a); // z = z > a? z : a (signed) #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() - z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_)); // z = z > 0? 
z : 0; - z = _mm_max_epu8(z, a); // z = max(z, a); this works because both are non-negative + z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_)); // z = z > 0? z : 0; + z = _mm_max_epu8(z, a); // z = max(z, a); this works because both are non-negative #endif - __dp_code_block2; + __dp_code_block2; #ifdef __SSE4_1__ - _mm_store_si128(&x[t], _mm_max_epi8(a, zero_)); - _mm_store_si128(&y[t], _mm_max_epi8(b, zero_)); + _mm_store_si128(&x[t], _mm_max_epi8(a, zero_)); + _mm_store_si128(&y[t], _mm_max_epi8(b, zero_)); #else - tmp = _mm_cmpgt_epi8(a, zero_); - _mm_store_si128(&x[t], _mm_and_si128(a, tmp)); - tmp = _mm_cmpgt_epi8(b, zero_); - _mm_store_si128(&y[t], _mm_and_si128(b, tmp)); + tmp = _mm_cmpgt_epi8(a, zero_); + _mm_store_si128(&x[t], _mm_and_si128(a, tmp)); + tmp = _mm_cmpgt_epi8(b, zero_); + _mm_store_si128(&y[t], _mm_and_si128(b, tmp)); #endif - } - } else if (!(flag & KSW_EZ_RIGHT)) { // gap left-alignment - __m128i *pr = p + (size_t) r * n_col_ - st_; - off[r] = st, off_end[r] = en; - for (t = st_; t <= en_; ++t) { - __m128i d, z, a, b, xt1, vt1, ut, tmp; - __dp_code_block1; - d = _mm_and_si128(_mm_cmpgt_epi8(a, z), flag1_); // d = a > z? 1 : 0 + } + } else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment + __m128i *pr = p + (size_t)r * n_col_ - st_; + off[r] = st, off_end[r] = en; + for (t = st_; t <= en_; ++t) { + __m128i d, z, a, b, xt1, vt1, ut, tmp; + __dp_code_block1; + d = _mm_and_si128(_mm_cmpgt_epi8(a, z), flag1_); // d = a > z? 1 : 0 #ifdef __SSE4_1__ - z = _mm_max_epi8(z, a); // z = z > a? z : a (signed) - tmp = _mm_cmpgt_epi8(b, z); - d = _mm_blendv_epi8(d, flag2_, tmp); // d = b > z? 2 : d + z = _mm_max_epi8(z, a); // z = z > a? z : a (signed) + tmp = _mm_cmpgt_epi8(b, z); + d = _mm_blendv_epi8(d, flag2_, tmp); // d = b > z? 2 : d #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() - z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_)); // z = z > 0? 
z : 0; - z = _mm_max_epu8(z, a); // z = max(z, a); this works because both are non-negative - tmp = _mm_cmpgt_epi8(b, z); - d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, flag2_)); // d = b > z? 2 : d; emulating blendv + z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_)); // z = z > 0? z : 0; + z = _mm_max_epu8(z, a); // z = max(z, a); this works because both are non-negative + tmp = _mm_cmpgt_epi8(b, z); + d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, flag2_)); // d = b > z? 2 : d; emulating blendv #endif - __dp_code_block2; - tmp = _mm_cmpgt_epi8(a, zero_); - _mm_store_si128(&x[t], _mm_and_si128(tmp, a)); - d = _mm_or_si128(d, _mm_and_si128(tmp, flag8_)); // d = a > 0? 0x08 : 0 - tmp = _mm_cmpgt_epi8(b, zero_); - _mm_store_si128(&y[t], _mm_and_si128(tmp, b)); - d = _mm_or_si128(d, _mm_and_si128(tmp, flag16_)); // d = b > 0? 0x10 : 0 - _mm_store_si128(&pr[t], d); - } - } else { // gap right-alignment - __m128i *pr = p + (size_t) r * n_col_ - st_; - off[r] = st, off_end[r] = en; - for (t = st_; t <= en_; ++t) { - __m128i d, z, a, b, xt1, vt1, ut, tmp; - __dp_code_block1; - d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), flag1_); // d = z > a? 0 : 1 + __dp_code_block2; + tmp = _mm_cmpgt_epi8(a, zero_); + _mm_store_si128(&x[t], _mm_and_si128(tmp, a)); + d = _mm_or_si128(d, _mm_and_si128(tmp, flag8_)); // d = a > 0? 0x08 : 0 + tmp = _mm_cmpgt_epi8(b, zero_); + _mm_store_si128(&y[t], _mm_and_si128(tmp, b)); + d = _mm_or_si128(d, _mm_and_si128(tmp, flag16_)); // d = b > 0? 0x10 : 0 + _mm_store_si128(&pr[t], d); + } + } else { // gap right-alignment + __m128i *pr = p + (size_t)r * n_col_ - st_; + off[r] = st, off_end[r] = en; + for (t = st_; t <= en_; ++t) { + __m128i d, z, a, b, xt1, vt1, ut, tmp; + __dp_code_block1; + d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), flag1_); // d = z > a? 0 : 1 #ifdef __SSE4_1__ - z = _mm_max_epi8(z, a); // z = z > a? z : a (signed) - tmp = _mm_cmpgt_epi8(z, b); - d = _mm_blendv_epi8(flag2_, d, tmp); // d = z > b? 
d : 2 + z = _mm_max_epi8(z, a); // z = z > a? z : a (signed) + tmp = _mm_cmpgt_epi8(z, b); + d = _mm_blendv_epi8(flag2_, d, tmp); // d = z > b? d : 2 #else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8() - z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_)); // z = z > 0? z : 0; - z = _mm_max_epu8(z, a); // z = max(z, a); this works because both are non-negative - tmp = _mm_cmpgt_epi8(z, b); - d = _mm_or_si128(_mm_andnot_si128(tmp, flag2_), _mm_and_si128(tmp, d)); // d = z > b? d : 2; emulating blendv + z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_)); // z = z > 0? z : 0; + z = _mm_max_epu8(z, a); // z = max(z, a); this works because both are non-negative + tmp = _mm_cmpgt_epi8(z, b); + d = _mm_or_si128(_mm_andnot_si128(tmp, flag2_), _mm_and_si128(tmp, d)); // d = z > b? d : 2; emulating blendv #endif - __dp_code_block2; - tmp = _mm_cmpgt_epi8(zero_, a); - _mm_store_si128(&x[t], _mm_andnot_si128(tmp, a)); - d = _mm_or_si128(d, _mm_andnot_si128(tmp, flag8_)); // d = 0 > a? 0 : 0x08 - tmp = _mm_cmpgt_epi8(zero_, b); - _mm_store_si128(&y[t], _mm_andnot_si128(tmp, b)); - d = _mm_or_si128(d, _mm_andnot_si128(tmp, flag16_)); // d = 0 > b? 0 : 0x10 - _mm_store_si128(&pr[t], d); - } - } - if (!approx_max) { // find the exact max with a 32-bit score array - int32_t max_H, max_t; - // compute H[], max_H and max_t - if (r > 0) { - int32_t HH[4], tt[4], en1 = st0 + (en0 - st0) / 4 * 4, i; - __m128i max_H_, max_t_, qe_; - max_H = H[en0] = en0 > 0 ? 
H[en0 - 1] + u8[en0] - qe : H[en0] + v8[en0] - qe; // special casing the last element - max_t = en0; - max_H_ = _mm_set1_epi32(max_H); - max_t_ = _mm_set1_epi32(max_t); - qe_ = _mm_set1_epi32(q + e); - for (t = st0; t < en1; t += 4) { // this implements: H[t]+=v8[t]-qe; if(H[t]>max_H) max_H=H[t],max_t=t; - __m128i H1, tmp, t_; - H1 = _mm_loadu_si128((__m128i *) &H[t]); - t_ = _mm_setr_epi32(v8[t], v8[t + 1], v8[t + 2], v8[t + 3]); - H1 = _mm_add_epi32(H1, t_); - H1 = _mm_sub_epi32(H1, qe_); - _mm_storeu_si128((__m128i *) &H[t], H1); - t_ = _mm_set1_epi32(t); - tmp = _mm_cmpgt_epi32(H1, max_H_); + __dp_code_block2; + tmp = _mm_cmpgt_epi8(zero_, a); + _mm_store_si128(&x[t], _mm_andnot_si128(tmp, a)); + d = _mm_or_si128(d, _mm_andnot_si128(tmp, flag8_)); // d = 0 > a? 0 : 0x08 + tmp = _mm_cmpgt_epi8(zero_, b); + _mm_store_si128(&y[t], _mm_andnot_si128(tmp, b)); + d = _mm_or_si128(d, _mm_andnot_si128(tmp, flag16_)); // d = 0 > b? 0 : 0x10 + _mm_store_si128(&pr[t], d); + } + } + if (!approx_max) { // find the exact max with a 32-bit score array + int32_t max_H, max_t; + // compute H[], max_H and max_t + if (r > 0) { + int32_t HH[4], tt[4], en1 = st0 + (en0 - st0) / 4 * 4, i; + __m128i max_H_, max_t_, qe_; + max_H = H[en0] = en0 > 0? 
H[en0-1] + u8[en0] - qe : H[en0] + v8[en0] - qe; // special casing the last element + max_t = en0; + max_H_ = _mm_set1_epi32(max_H); + max_t_ = _mm_set1_epi32(max_t); + qe_ = _mm_set1_epi32(q + e); + for (t = st0; t < en1; t += 4) { // this implements: H[t]+=v8[t]-qe; if(H[t]>max_H) max_H=H[t],max_t=t; + __m128i H1, tmp, t_; + H1 = _mm_loadu_si128((__m128i*)&H[t]); + t_ = _mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]); + H1 = _mm_add_epi32(H1, t_); + H1 = _mm_sub_epi32(H1, qe_); + _mm_storeu_si128((__m128i*)&H[t], H1); + t_ = _mm_set1_epi32(t); + tmp = _mm_cmpgt_epi32(H1, max_H_); #ifdef __SSE4_1__ - max_H_ = _mm_blendv_epi8(max_H_, H1, tmp); - max_t_ = _mm_blendv_epi8(max_t_, t_, tmp); + max_H_ = _mm_blendv_epi8(max_H_, H1, tmp); + max_t_ = _mm_blendv_epi8(max_t_, t_, tmp); #else - max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_)); - max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_)); + max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_)); + max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_)); #endif - } - _mm_storeu_si128((__m128i *) HH, max_H_); - _mm_storeu_si128((__m128i *) tt, max_t_); - for (i = 0; i < 4; ++i) - if (max_H < HH[i]) - max_H = HH[i], max_t = tt[i] + i; - for (; t < en0; ++t) { // for the rest of values that haven't been computed with SSE - H[t] += (int32_t) v8[t] - qe; - if (H[t] > max_H) - max_H = H[t], max_t = t; - } - } else - H[0] = v8[0] - qe - qe, max_H = H[0], max_t = 0; // special casing r==0 - // update ez - if (en0 == tlen - 1 && H[en0] > ez->mte) - ez->mte = H[en0], ez->mte_q = r - en; - if (r - st0 == qlen - 1 && H[st0] > ez->mqe) - ez->mqe = H[st0], ez->mqe_t = st0; - if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, e)) - break; - if (r == qlen + tlen - 2 && en0 == tlen - 1) - ez->score = H[tlen - 1]; - } else { // find approximate max; Z-drop might be inaccurate, too. 
- if (r > 0) { - if (last_H0_t >= st0 && last_H0_t <= en0 && last_H0_t + 1 >= st0 && last_H0_t + 1 <= en0) { - int32_t d0 = v8[last_H0_t] - qe; - int32_t d1 = u8[last_H0_t + 1] - qe; - if (d0 > d1) - H0 += d0; - else - H0 += d1, ++last_H0_t; - } else if (last_H0_t >= st0 && last_H0_t <= en0) { - H0 += v8[last_H0_t] - qe; - } else { - ++last_H0_t, H0 += u8[last_H0_t] - qe; - } - if ((flag & KSW_EZ_APPROX_DROP) && ksw_apply_zdrop(ez, 1, H0, r, last_H0_t, zdrop, e)) - break; - } else - H0 = v8[0] - qe - qe, last_H0_t = 0; - if (r == qlen + tlen - 2 && en0 == tlen - 1) - ez->score = H0; - } - last_st = st, last_en = en; - //for (t = st0; t <= en0; ++t) printf("(%d,%d)\t(%d,%d,%d,%d)\t%d\n", r, t, ((int8_t*)u)[t], ((int8_t*)v)[t], ((int8_t*)x)[t], ((int8_t*)y)[t], H[t]); // for debugging - } - kfree(km, mem); - if (!approx_max) - kfree(km, H); - if (with_cigar) { // backtrack - int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR); - if (!ez->zdropped && !(flag & KSW_EZ_EXTZ_ONLY)) { - ksw_backtrack(km, - 1, - rev_cigar, - 0, - (uint8_t *) p, - off, - off_end, - n_col_ * 16, - tlen - 1, - qlen - 1, - &ez->m_cigar, - &ez->n_cigar, - &ez->cigar); - } else if (!ez->zdropped && (flag & KSW_EZ_EXTZ_ONLY) && ez->mqe + end_bonus > (int) ez->max) { - ez->reach_end = 1; - ksw_backtrack(km, - 1, - rev_cigar, - 0, - (uint8_t *) p, - off, - off_end, - n_col_ * 16, - ez->mqe_t, - qlen - 1, - &ez->m_cigar, - &ez->n_cigar, - &ez->cigar); - } else if (ez->max_t >= 0 && ez->max_q >= 0) { - ksw_backtrack(km, - 1, - rev_cigar, - 0, - (uint8_t *) p, - off, - off_end, - n_col_ * 16, - ez->max_t, - ez->max_q, - &ez->m_cigar, - &ez->n_cigar, - &ez->cigar); - } - kfree(km, mem2); - kfree(km, off); - } + } + _mm_storeu_si128((__m128i*)HH, max_H_); + _mm_storeu_si128((__m128i*)tt, max_t_); + for (i = 0; i < 4; ++i) + if (max_H < HH[i]) max_H = HH[i], max_t = tt[i] + i; + for (; t < en0; ++t) { // for the rest of values that haven't been computed with SSE + H[t] += (int32_t)v8[t] - qe; + if (H[t] > max_H) 
+ max_H = H[t], max_t = t; + } + } else H[0] = v8[0] - qe - qe, max_H = H[0], max_t = 0; // special casing r==0 + // update ez + if (en0 == tlen - 1 && H[en0] > ez->mte) + ez->mte = H[en0], ez->mte_q = r - en; + if (r - st0 == qlen - 1 && H[st0] > ez->mqe) + ez->mqe = H[st0], ez->mqe_t = st0; + if (ksw_apply_zdrop(ez, 1, max_H, r, max_t, zdrop, e)) break; + if (r == qlen + tlen - 2 && en0 == tlen - 1) + ez->score = H[tlen - 1]; + } else { // find approximate max; Z-drop might be inaccurate, too. + if (r > 0) { + if (last_H0_t >= st0 && last_H0_t <= en0 && last_H0_t + 1 >= st0 && last_H0_t + 1 <= en0) { + int32_t d0 = v8[last_H0_t] - qe; + int32_t d1 = u8[last_H0_t + 1] - qe; + if (d0 > d1) H0 += d0; + else H0 += d1, ++last_H0_t; + } else if (last_H0_t >= st0 && last_H0_t <= en0) { + H0 += v8[last_H0_t] - qe; + } else { + ++last_H0_t, H0 += u8[last_H0_t] - qe; + } + if ((flag & KSW_EZ_APPROX_DROP) && ksw_apply_zdrop(ez, 1, H0, r, last_H0_t, zdrop, e)) break; + } else H0 = v8[0] - qe - qe, last_H0_t = 0; + if (r == qlen + tlen - 2 && en0 == tlen - 1) + ez->score = H0; + } + last_st = st, last_en = en; + //for (t = st0; t <= en0; ++t) printf("(%d,%d)\t(%d,%d,%d,%d)\t%d\n", r, t, ((int8_t*)u)[t], ((int8_t*)v)[t], ((int8_t*)x)[t], ((int8_t*)y)[t], H[t]); // for debugging + } + kfree(km, mem); + if (!approx_max) kfree(km, H); + if (with_cigar) { // backtrack + int rev_cigar = !!(flag & KSW_EZ_REV_CIGAR); + if (!ez->zdropped && !(flag&KSW_EZ_EXTZ_ONLY)) { + ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, tlen-1, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + } else if (!ez->zdropped && (flag&KSW_EZ_EXTZ_ONLY) && ez->mqe + end_bonus > (int)ez->max) { + ez->reach_end = 1; + ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->mqe_t, qlen-1, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + } else if (ez->max_t >= 0 && ez->max_q >= 0) { + ksw_backtrack(km, 1, rev_cigar, 0, (uint8_t*)p, off, off_end, n_col_*16, ez->max_t, 
ez->max_q, &ez->m_cigar, &ez->n_cigar, &ez->cigar); + } + kfree(km, mem2); kfree(km, off); + } } -#endif // __SSE2__ \ No newline at end of file +#endif // __SSE2__ From 4f6d50a60829cc96b2e84eae2652a1e69b0d386f Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Wed, 15 Feb 2023 23:25:57 +0100 Subject: [PATCH 21/32] Move ksw2 files to a subdirectory and add license --- ext/README.md | 7 +++++++ ext/ksw2/LICENSE.txt | 24 ++++++++++++++++++++++++ ext/{ => ksw2}/ksw2.h | 0 ext/{ => ksw2}/ksw2_extz2_sse.c | 0 4 files changed, 31 insertions(+) create mode 100644 ext/ksw2/LICENSE.txt rename ext/{ => ksw2}/ksw2.h (100%) rename ext/{ => ksw2}/ksw2_extz2_sse.c (100%) diff --git a/ext/README.md b/ext/README.md index 55e874b7..59e3503d 100644 --- a/ext/README.md +++ b/ext/README.md @@ -60,3 +60,10 @@ License: See xxhash.c Homepage: https://github.com/mateidavid/zstr Commit used: 755da7890ea22478a702e3139092e6c964fab1f5 License: See zstr/LICENSE + + +## ksw2 + +https://github.com/lh3/ksw2 +https://raw.githubusercontent.com/lh3/ksw2/06b2183b0f6646d82f2e3f5884008a1b4582f5b5/ksw2.h +https://raw.githubusercontent.com/lh3/ksw2/06b2183b0f6646d82f2e3f5884008a1b4582f5b5/ksw2_extz2_sse.c diff --git a/ext/ksw2/LICENSE.txt b/ext/ksw2/LICENSE.txt new file mode 100644 index 00000000..1a06f649 --- /dev/null +++ b/ext/ksw2/LICENSE.txt @@ -0,0 +1,24 @@ +The MIT License + +Copyright (c) 2018- Dana-Farber Cancer Institute + 2017-2018 Broad Institute, Inc. 
+ +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/ext/ksw2.h b/ext/ksw2/ksw2.h similarity index 100% rename from ext/ksw2.h rename to ext/ksw2/ksw2.h diff --git a/ext/ksw2_extz2_sse.c b/ext/ksw2/ksw2_extz2_sse.c similarity index 100% rename from ext/ksw2_extz2_sse.c rename to ext/ksw2/ksw2_extz2_sse.c From 237cb91f710fcf6ce25a0e440ca8dbc890d9d3dc Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Fri, 17 Feb 2023 15:00:20 +0100 Subject: [PATCH 22/32] Implement Aligner::ksw_extend() --- CMakeLists.txt | 1 + src/aligner.cpp | 91 +++++++++++++++++++++++++++++++++++++++++++++++++ src/aligner.hpp | 10 ++++-- 3 files changed, 100 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 988b1235..d1088d22 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -63,6 +63,7 @@ add_library(salib STATIC ${SOURCES} ext/xxhash.c ext/ssw/ssw_cpp.cpp ext/ssw/ssw.c + ext/ksw2/ksw2_extz2_sse.c ) target_include_directories(salib PUBLIC src/ ext/ ${PROJECT_BINARY_DIR}) target_link_libraries(salib PUBLIC ZLIB::ZLIB Threads::Threads zstr::zstr) diff --git a/src/aligner.cpp b/src/aligner.cpp index 6d775739..cd7e31af 100644 --- a/src/aligner.cpp +++ b/src/aligner.cpp @@ -3,11 +3,15 @@ * * This is for anything that returns an aln_info object, currently * Aligner::align and hamming_align. 
+ * + * ksw_extend code is based on https://github.com/lh3/ksw2/blob/master/cli.c */ #include #include #include #include +#include // memset +#include "ksw2/ksw2.h" #include "aligner.hpp" AlignmentInfo Aligner::align(const std::string &query, const std::string &ref) const { @@ -199,3 +203,90 @@ AlignmentInfo hamming_align( aln.query_end = segment_end; return aln; } + +namespace { + +unsigned char seq_nt4_table[256] = { + 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +} // namespace + +void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b) +{ + int i, j; + a = a < 0? -a : a; + b = b > 0? -b : b; + for (i = 0; i < m - 1; ++i) { + for (j = 0; j < m - 1; ++j) + mat[i * m + j] = i == j? 
a : b; + mat[i * m + m - 1] = 0; + } + for (j = 0; j < m; ++j) + mat[(m - 1) * m + j] = 0; +} + + +AlignmentInfo Aligner::ksw_extend(const std::string& query, const std::string& ref, bool right_align) const { + int w = -1; // band width; -1 is inf + int zdrop = -1; // -1 to disable + int flag = KSW_EZ_EXTZ_ONLY; + if (right_align) { + flag |= KSW_EZ_RIGHT; + } + ksw_extz_t ez; + memset(&ez, 0, sizeof(ksw_extz_t)); + + ez.max_q = ez.max_t = ez.mqe_t = ez.mte_q = -1; + ez.max = 0; ez.mqe = ez.mte = KSW_NEG_INF; + ez.n_cigar = 0; + int qlen = query.length(); + int tlen = ref.length(); + uint8_t *qseq = (uint8_t*)calloc(qlen + 33, 1); + uint8_t *tseq = (uint8_t*)calloc(tlen + 33, 1); + for (int i = 0; i < qlen; ++i) + qseq[i] = seq_nt4_table[(uint8_t)query[i]]; + for (int i = 0; i < tlen; ++i) + tseq[i] = seq_nt4_table[(uint8_t)ref[i]]; + + ksw_extz2_sse( + nullptr, qlen, (uint8_t*)qseq, tlen, (uint8_t*)tseq, ksw_matrix_m, ksw_matrix, parameters.gap_open, parameters.gap_extend, w, zdrop, parameters.end_bonus, flag, &ez + ); + free(qseq); + free(tseq); + + + AlignmentInfo info; + auto cigar = Cigar(ez.cigar, ez.n_cigar).to_eqx(query, ref); + info.edit_distance = cigar.edit_distance(); + info.cigar = std::move(cigar); + info.ref_start = 0; + info.query_start = 0; + if (ez.reach_end) { + info.ref_end = ez.mqe_t + 1; + info.query_end = query.size(); + info.sw_score = ez.mqe + parameters.end_bonus; + } else { + info.ref_end = ez.max_t + 1; + info.query_end = ez.max_q + 1; + info.sw_score = ez.max; + } + + kfree(km, ez.cigar); + return info; +} diff --git a/src/aligner.hpp b/src/aligner.hpp index 0238885f..e0c6f8c1 100644 --- a/src/aligner.hpp +++ b/src/aligner.hpp @@ -28,15 +28,19 @@ struct AlignmentInfo { int ref_span() const { return ref_end - ref_start; } }; +void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b); + struct Aligner { public: Aligner(AlignmentParameters parameters) : parameters(parameters) , 
ssw_aligner(StripedSmithWaterman::Aligner(parameters.match, parameters.mismatch, parameters.gap_open, parameters.gap_extend)) - { } + { + ksw_gen_simple_mat(ksw_matrix_m, ksw_matrix, parameters.match, -parameters.mismatch); + } AlignmentInfo align(const std::string &query, const std::string &ref) const; - + AlignmentInfo ksw_extend(const std::string& query, const std::string& ref, bool right_align) const; AlignmentParameters parameters; unsigned calls_count() { @@ -47,6 +51,8 @@ struct Aligner { const StripedSmithWaterman::Aligner ssw_aligner; const StripedSmithWaterman::Filter filter; mutable unsigned m_align_calls{0}; // no. of calls to the align() method + const int8_t ksw_matrix_m{5}; + int8_t ksw_matrix[25]; }; inline int hamming_distance(const std::string &s, const std::string &t) { From a90d6979a3070c0886df5ed4326eeaa6d2dacd76 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Mon, 13 Mar 2023 14:37:35 +0100 Subject: [PATCH 23/32] Add operator<< for ksw_extz_t --- src/aligner.cpp | 21 +++++++++++++++++++++ src/cigar.cpp | 5 +++++ src/cigar.hpp | 2 ++ 3 files changed, 28 insertions(+) diff --git a/src/aligner.cpp b/src/aligner.cpp index cd7e31af..1702a06b 100644 --- a/src/aligner.cpp +++ b/src/aligner.cpp @@ -11,6 +11,7 @@ #include #include #include // memset +#include #include "ksw2/ksw2.h" #include "aligner.hpp" @@ -241,6 +242,26 @@ void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b) mat[(m - 1) * m + j] = 0; } +std::ostream& operator<<(std::ostream& os, const ksw_extz_t& ez) { + os << "ksw_extz_t(" + // + << "\n max: " << ez.max // max overall score + << "\n coord max_q: " << ez.max_q // max extension coordinate + << "\n coord max_t: " << ez.max_t // max extension coordinate + + << "\n score mqe: " << ez.mqe // max score when reaching the end of query + << "\n mqe_t: " << ez.mqe_t // coordinate in target corresponding to mqe + + << "\n score mte: " << ez.mte // max score when reaching the end of target + << "\n mte_q: " << ez.mte_q // 
coordinate in query corresponding to mte + + << "\n score both ends: " << ez.score // max score reaching both ends + << "\n cigar: " << Cigar(ez.cigar, ez.n_cigar) + << "\n zdropped: " << ez.zdropped + << "\n reach_end: " << ez.reach_end + << "\n)"; + return os; +} AlignmentInfo Aligner::ksw_extend(const std::string& query, const std::string& ref, bool right_align) const { int w = -1; // band width; -1 is inf diff --git a/src/cigar.cpp b/src/cigar.cpp index 88676d53..39abf14e 100644 --- a/src/cigar.cpp +++ b/src/cigar.cpp @@ -89,6 +89,11 @@ Cigar::Cigar(const std::string& cig) { } } +std::ostream& operator<<(std::ostream& os, const Cigar& cigar) { + os << cigar.to_string(); + return os; +} + std::string compress_cigar(const std::string& ops) { char prev = 0; int count = 0; diff --git a/src/cigar.hpp b/src/cigar.hpp index ced193b1..9aa45c10 100644 --- a/src/cigar.hpp +++ b/src/cigar.hpp @@ -92,6 +92,8 @@ class Cigar { std::vector m_ops; }; +std::ostream& operator<<(std::ostream& os, const Cigar& cigar); + std::string compress_cigar(const std::string& ops); #endif From b89e3d78b43c86a107299c3f4faec54ee17d1cf1 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Thu, 14 Dec 2023 10:20:30 +0100 Subject: [PATCH 24/32] Turn the ref parameter of some functions into string_view --- src/aligner.cpp | 10 +++++----- src/aligner.hpp | 9 +++++---- src/aln.cpp | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/aligner.cpp b/src/aligner.cpp index 1702a06b..2345124e 100644 --- a/src/aligner.cpp +++ b/src/aligner.cpp @@ -15,7 +15,7 @@ #include "ksw2/ksw2.h" #include "aligner.hpp" -AlignmentInfo Aligner::align(const std::string &query, const std::string &ref) const { +AlignmentInfo Aligner::align(const std::string& query, const std::string_view ref) const { m_align_calls++; AlignmentInfo aln; int32_t maskLen = query.length() / 2; @@ -30,8 +30,8 @@ AlignmentInfo Aligner::align(const std::string &query, const std::string &ref) c StripedSmithWaterman::Alignment 
alignment_ssw; - // query must be NULL-terminated - auto flag = ssw_aligner.Align(query.c_str(), ref.c_str(), ref.size(), filter, &alignment_ssw, maskLen); + // only query must be NULL-terminated + auto flag = ssw_aligner.Align(query.c_str(), ref.begin(), ref.size(), filter, &alignment_ssw, maskLen); if (flag != 0) { aln.edit_distance = 100000; aln.ref_start = 0; @@ -121,7 +121,7 @@ AlignmentInfo Aligner::align(const std::string &query, const std::string &ref) c * of the query, once for each end. */ std::tuple highest_scoring_segment( - const std::string& query, const std::string& ref, int match, int mismatch, int end_bonus + const std::string& query, const std::string_view ref, int match, int mismatch, int end_bonus ) { size_t n = query.length(); @@ -156,7 +156,7 @@ std::tuple highest_scoring_segment( } AlignmentInfo hamming_align( - const std::string &query, const std::string &ref, int match, int mismatch, int end_bonus + const std::string &query, const std::string_view ref, int match, int mismatch, int end_bonus ) { AlignmentInfo aln; if (query.length() != ref.length()) { diff --git a/src/aligner.hpp b/src/aligner.hpp index e0c6f8c1..55abdd3d 100644 --- a/src/aligner.hpp +++ b/src/aligner.hpp @@ -39,8 +39,9 @@ struct Aligner { ksw_gen_simple_mat(ksw_matrix_m, ksw_matrix, parameters.match, -parameters.mismatch); } - AlignmentInfo align(const std::string &query, const std::string &ref) const; + AlignmentInfo align(const std::string& query, const std::string_view ref) const; AlignmentInfo ksw_extend(const std::string& query, const std::string& ref, bool right_align) const; + AlignmentParameters parameters; unsigned calls_count() { @@ -55,7 +56,7 @@ struct Aligner { int8_t ksw_matrix[25]; }; -inline int hamming_distance(const std::string &s, const std::string &t) { +inline int hamming_distance(const std::string& s, const std::string_view t) { if (s.length() != t.length()){ return -1; } @@ -71,11 +72,11 @@ inline int hamming_distance(const std::string &s, const 
std::string &t) { } std::tuple highest_scoring_segment( - const std::string& query, const std::string& ref, int match, int mismatch, int end_bonus + const std::string& query, const std::string_view ref, int match, int mismatch, int end_bonus ); AlignmentInfo hamming_align( - const std::string &query, const std::string &ref, int match, int mismatch, int end_bonus + const std::string& query, const std::string_view ref, int match, int mismatch, int end_bonus ); #endif diff --git a/src/aln.cpp b/src/aln.cpp index b5105bb1..02e408b5 100644 --- a/src/aln.cpp +++ b/src/aln.cpp @@ -213,7 +213,7 @@ inline Alignment extend_seed( bool consistent_nam ) { const std::string query = nam.is_rc ? read.rc : read.seq; - const std::string& ref = references.sequences[nam.ref_id]; + const std::string_view ref = references.sequences[nam.ref_id]; const auto projected_ref_start = nam.projected_ref_start(); const auto projected_ref_end = std::min(nam.ref_end + query.size() - nam.query_end, ref.size()); @@ -222,7 +222,7 @@ inline Alignment extend_seed( int result_ref_start; bool gapped = true; if (projected_ref_end - projected_ref_start == query.size() && consistent_nam) { - std::string ref_segm_ham = ref.substr(projected_ref_start, query.size()); + std::string_view ref_segm_ham = ref.substr(projected_ref_start, query.size()); auto hamming_dist = hamming_distance(query, ref_segm_ham); if (hamming_dist >= 0 && (((float) hamming_dist / query.size()) < 0.05) ) { //Hamming distance worked fine, no need to ksw align From 36513bc8a53a43be6b8075fb7c4bed205f77c34c Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Fri, 15 Dec 2023 15:25:45 +0100 Subject: [PATCH 25/32] Add operator<< for AlignmentInfo --- src/aligner.cpp | 10 ++++++++++ src/aligner.hpp | 2 ++ 2 files changed, 12 insertions(+) diff --git a/src/aligner.cpp b/src/aligner.cpp index 2345124e..999aea98 100644 --- a/src/aligner.cpp +++ b/src/aligner.cpp @@ -311,3 +311,13 @@ AlignmentInfo Aligner::ksw_extend(const std::string& query, const 
std::string& r kfree(km, ez.cigar); return info; } + +std::ostream& operator<<(std::ostream& os, const AlignmentInfo& info) { + os << "AlignmentInfo(cigar='" << info.cigar + << "', ref=" << info.ref_start << ".." << info.ref_end + << ", query=" << info.query_start << ".." << info.query_end + << ", NM=" << info.edit_distance + << ", AS=" << info.sw_score + << ")"; + return os; +} diff --git a/src/aligner.hpp b/src/aligner.hpp index 55abdd3d..780c18ae 100644 --- a/src/aligner.hpp +++ b/src/aligner.hpp @@ -28,6 +28,8 @@ struct AlignmentInfo { int ref_span() const { return ref_end - ref_start; } }; +std::ostream& operator<<(std::ostream& os, const AlignmentInfo& info); + void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b); struct Aligner { From 92d221b11bb7f60f5ef23bcf70faa34d49b27274 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Fri, 15 Dec 2023 15:27:10 +0100 Subject: [PATCH 26/32] Explicitly set a wildcard score in the ksw score matrix Otherwise, alignments between wildcards get a score of 0, which is inconsistent with SSW alignments. --- src/aligner.cpp | 7 ++++--- src/aligner.hpp | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/aligner.cpp b/src/aligner.cpp index 999aea98..77e0b495 100644 --- a/src/aligner.cpp +++ b/src/aligner.cpp @@ -228,18 +228,19 @@ unsigned char seq_nt4_table[256] = { } // namespace -void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b) +void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t wildcard_score) { int i, j; a = a < 0? -a : a; b = b > 0? -b : b; + wildcard_score = wildcard_score > 0 ? -wildcard_score : wildcard_score; for (i = 0; i < m - 1; ++i) { for (j = 0; j < m - 1; ++j) mat[i * m + j] = i == j? 
a : b; - mat[i * m + m - 1] = 0; + mat[i * m + m - 1] = wildcard_score; } for (j = 0; j < m; ++j) - mat[(m - 1) * m + j] = 0; + mat[(m - 1) * m + j] = wildcard_score; } std::ostream& operator<<(std::ostream& os, const ksw_extz_t& ez) { diff --git a/src/aligner.hpp b/src/aligner.hpp index 780c18ae..e950ac8b 100644 --- a/src/aligner.hpp +++ b/src/aligner.hpp @@ -30,7 +30,7 @@ struct AlignmentInfo { std::ostream& operator<<(std::ostream& os, const AlignmentInfo& info); -void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b); +void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t wildcard_score); struct Aligner { public: @@ -38,7 +38,8 @@ struct Aligner { : parameters(parameters) , ssw_aligner(StripedSmithWaterman::Aligner(parameters.match, parameters.mismatch, parameters.gap_open, parameters.gap_extend)) { - ksw_gen_simple_mat(ksw_matrix_m, ksw_matrix, parameters.match, -parameters.mismatch); + int8_t wildcard_score = -parameters.mismatch; + ksw_gen_simple_mat(ksw_matrix_m, ksw_matrix, parameters.match, -parameters.mismatch, wildcard_score); } AlignmentInfo align(const std::string& query, const std::string_view ref) const; From 13938c55bba7ef8cb816bf7ef1eaa36de94bd5d0 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Fri, 15 Dec 2023 15:29:42 +0100 Subject: [PATCH 27/32] Pass ref as a string_view --- src/aligner.cpp | 2 +- src/aligner.hpp | 2 +- src/cigar.cpp | 2 +- src/cigar.hpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/aligner.cpp b/src/aligner.cpp index 77e0b495..442c8743 100644 --- a/src/aligner.cpp +++ b/src/aligner.cpp @@ -264,7 +264,7 @@ std::ostream& operator<<(std::ostream& os, const ksw_extz_t& ez) { return os; } -AlignmentInfo Aligner::ksw_extend(const std::string& query, const std::string& ref, bool right_align) const { +AlignmentInfo Aligner::ksw_extend(const std::string& query, const std::string_view ref, bool right_align) const { int w = -1; // band width; -1 is inf int zdrop = -1; // -1 to 
disable int flag = KSW_EZ_EXTZ_ONLY; diff --git a/src/aligner.hpp b/src/aligner.hpp index e950ac8b..3a1c02a5 100644 --- a/src/aligner.hpp +++ b/src/aligner.hpp @@ -43,7 +43,7 @@ struct Aligner { } AlignmentInfo align(const std::string& query, const std::string_view ref) const; - AlignmentInfo ksw_extend(const std::string& query, const std::string& ref, bool right_align) const; + AlignmentInfo ksw_extend(const std::string& query, const std::string_view ref, bool right_align) const; AlignmentParameters parameters; diff --git a/src/cigar.cpp b/src/cigar.cpp index 39abf14e..c2799a1c 100644 --- a/src/cigar.cpp +++ b/src/cigar.cpp @@ -17,7 +17,7 @@ Cigar Cigar::to_m() const { return cigar; } -Cigar Cigar::to_eqx(const std::string& query, const std::string& ref) const { +Cigar Cigar::to_eqx(const std::string& query, const std::string_view ref) const { size_t i = 0, j = 0; Cigar cigar; for (auto op_len : m_ops) { diff --git a/src/cigar.hpp b/src/cigar.hpp index 9aa45c10..12966fcd 100644 --- a/src/cigar.hpp +++ b/src/cigar.hpp @@ -85,7 +85,7 @@ class Cigar { Cigar to_m() const; /* Return a new Cigar that uses =/X instead of M */ - Cigar to_eqx(const std::string& query, const std::string& ref) const; + Cigar to_eqx(const std::string& query, const std::string_view ref) const; std::string to_string() const; From 9d72201f03cc0e9f1402a85b447ee1cfcbe15de0 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Fri, 15 Dec 2023 15:30:18 +0100 Subject: [PATCH 28/32] Make ksw actually use the score matrix Apparently, the KSW_EZ_GENERIC_SC flag is required --- src/aligner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aligner.cpp b/src/aligner.cpp index 442c8743..1260e9bf 100644 --- a/src/aligner.cpp +++ b/src/aligner.cpp @@ -267,7 +267,7 @@ std::ostream& operator<<(std::ostream& os, const ksw_extz_t& ez) { AlignmentInfo Aligner::ksw_extend(const std::string& query, const std::string_view ref, bool right_align) const { int w = -1; // band width; -1 is inf int 
zdrop = -1; // -1 to disable - int flag = KSW_EZ_EXTZ_ONLY; + int flag = KSW_EZ_EXTZ_ONLY | KSW_EZ_GENERIC_SC; if (right_align) { flag |= KSW_EZ_RIGHT; } From c36cc0a0b1c08c3026acd60563e59628c343d7d5 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Fri, 15 Dec 2023 15:31:14 +0100 Subject: [PATCH 29/32] Return empty alignment if query is empty --- src/aligner.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/aligner.cpp b/src/aligner.cpp index 1260e9bf..3c05f3d9 100644 --- a/src/aligner.cpp +++ b/src/aligner.cpp @@ -265,6 +265,18 @@ std::ostream& operator<<(std::ostream& os, const ksw_extz_t& ez) { } AlignmentInfo Aligner::ksw_extend(const std::string& query, const std::string_view ref, bool right_align) const { + AlignmentInfo info; + if (query.size() == 0) { + info.cigar = Cigar(); + info.edit_distance = 0; + info.ref_start = 0; + info.query_start = 0; + info.ref_end = 0; + info.query_end = 0; + info.sw_score = parameters.end_bonus; + return info; + } + int w = -1; // band width; -1 is inf int zdrop = -1; // -1 to disable int flag = KSW_EZ_EXTZ_ONLY | KSW_EZ_GENERIC_SC; @@ -292,8 +304,6 @@ AlignmentInfo Aligner::ksw_extend(const std::string& query, const std::string_vi free(qseq); free(tseq); - - AlignmentInfo info; auto cigar = Cigar(ez.cigar, ez.n_cigar).to_eqx(query, ref); info.edit_distance = cigar.edit_distance(); info.cigar = std::move(cigar); From 7d23a81e91fb8390bd19f87e9f3525837ed21b11 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Fri, 15 Dec 2023 15:31:49 +0100 Subject: [PATCH 30/32] Add soft clipping to CIGAR if necessary --- src/aligner.cpp | 2 ++ src/cigar.hpp | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/src/aligner.cpp b/src/aligner.cpp index 3c05f3d9..93e8396c 100644 --- a/src/aligner.cpp +++ b/src/aligner.cpp @@ -317,7 +317,9 @@ AlignmentInfo Aligner::ksw_extend(const std::string& query, const std::string_vi info.ref_end = ez.max_t + 1; info.query_end = ez.max_q + 1; info.sw_score = 
ez.max; + info.cigar.push(CIGAR_SOFTCLIP, query.length() - info.query_end); } + assert(info.cigar.derived_sequence_length() == query.length()); kfree(km, ez.cigar); return info; diff --git a/src/cigar.hpp b/src/cigar.hpp index 12966fcd..931ab8a1 100644 --- a/src/cigar.hpp +++ b/src/cigar.hpp @@ -77,6 +77,18 @@ class Cigar { return dist; } + size_t derived_sequence_length() const { + size_t length = 0; + for (auto op_len : m_ops) { + auto op = op_len & 0xf; + auto len = op_len >> 4; + if (op == CIGAR_MATCH || op == CIGAR_EQ || op == CIGAR_X || op == CIGAR_INS || op == CIGAR_SOFTCLIP) { + length += len; + } + } + return length; + } + void reverse() { std::reverse(m_ops.begin(), m_ops.end()); } From 26b472e7a748aabab5e46328622b1cd46a0c0841 Mon Sep 17 00:00:00 2001 From: Marcel Martin Date: Fri, 15 Dec 2023 15:33:02 +0100 Subject: [PATCH 31/32] If ungapped alignment is softclipped, ksw_extend soft-clipped ends It is possible that gapped alignment gives a better score than soft-clipping based on ungapped alignment. Closes #357 --- CHANGES.md | 3 +++ src/aligner.hpp | 1 + src/aln.cpp | 45 +++++++++++++++++++++++++++++++++++++++++---- src/cigar.hpp | 4 ++++ tests/phix.1.fastq | 8 ++++---- tests/phix.pe.sam | 4 ++-- tests/phix.se.sam | 4 ++-- 7 files changed, 57 insertions(+), 12 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 2d907665..f9b67f88 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -21,6 +21,9 @@ * Include [ZStr](https://github.com/mateidavid/zstr/) in our own repository instead of downloading it at build time. This should make it possible to build strobealign without internet access. +* #357: Fix some suboptimal alignment ends. Sometimes an end was soft-clipped + although a better alignment with an insertion or deletion existed that + extends to the end of the read. 
## v0.12.0 (2023-11-23) diff --git a/src/aligner.hpp b/src/aligner.hpp index 3a1c02a5..77f83c23 100644 --- a/src/aligner.hpp +++ b/src/aligner.hpp @@ -26,6 +26,7 @@ struct AlignmentInfo { int sw_score{0}; int ref_span() const { return ref_end - ref_start; } + int query_span() const { return query_end - query_start; } }; std::ostream& operator<<(std::ostream& os, const AlignmentInfo& info); diff --git a/src/aln.cpp b/src/aln.cpp index 02e408b5..a8bae9c3 100644 --- a/src/aln.cpp +++ b/src/aln.cpp @@ -222,11 +222,48 @@ inline Alignment extend_seed( int result_ref_start; bool gapped = true; if (projected_ref_end - projected_ref_start == query.size() && consistent_nam) { - std::string_view ref_segm_ham = ref.substr(projected_ref_start, query.size()); - auto hamming_dist = hamming_distance(query, ref_segm_ham); + const std::string_view projected_ref = ref.substr(projected_ref_start, query.size()); + info = hamming_align(query, projected_ref, aligner.parameters.match, aligner.parameters.mismatch, aligner.parameters.end_bonus); + + if (info.edit_distance + (query.length() - info.query_span()) < 0.05 * query.length()) { + if (info.query_end < query.length()) { + // Right end is soft clipped, do gapped alignment on it + const std::string query_right = query.substr(info.query_end); + const int ext_right = std::min(ref.size() - projected_ref_end, size_t(50)); + const std::string_view ref_right = ref.substr(projected_ref_start + info.ref_end, ext_right); + auto right = aligner.ksw_extend(query_right, ref_right, false); + info.query_end += right.query_end; + info.ref_end += right.ref_end; + info.edit_distance += right.edit_distance; + info.sw_score += right.sw_score; + assert(!info.cigar.empty()); + info.cigar.pop_oplen(); + info.cigar += right.cigar; + } + + if (info.query_start > 0) { + // Left end is soft clipped, do gapped alignment on it + std::string query_left = query.substr(0, info.query_start); + const int ext_left = std::min(50, projected_ref_start); + const int 
ref_start = projected_ref_start - ext_left; + std::string ref_left{ref.substr(ref_start, ext_left + info.ref_start)}; + std::reverse(query_left.begin(), query_left.end()); + std::reverse(ref_left.begin(), ref_left.end()); + auto left = aligner.ksw_extend(query_left, ref_left, true); + info.query_start -= left.query_end; + info.ref_start -= left.ref_end; + info.edit_distance += left.edit_distance; + info.sw_score += left.sw_score; + + // TODO this just removes the soft-clipping from the beginning, + // a bit too complicated + info.cigar.reverse(); + info.cigar.pop_oplen(); + info.cigar += left.cigar; + info.cigar.reverse(); + } - if (hamming_dist >= 0 && (((float) hamming_dist / query.size()) < 0.05) ) { //Hamming distance worked fine, no need to ksw align - info = hamming_align(query, ref_segm_ham, aligner.parameters.match, aligner.parameters.mismatch, aligner.parameters.end_bonus); + assert(info.cigar.derived_sequence_length() == query.length()); result_ref_start = projected_ref_start + info.ref_start; gapped = false; } diff --git a/src/cigar.hpp b/src/cigar.hpp index 931ab8a1..55bda51d 100644 --- a/src/cigar.hpp +++ b/src/cigar.hpp @@ -93,6 +93,10 @@ class Cigar { std::reverse(m_ops.begin(), m_ops.end()); } + void pop_oplen() { + m_ops.pop_back(); + } + /* Return a new Cigar that uses M instead of =/X */ Cigar to_m() const; diff --git a/tests/phix.1.fastq b/tests/phix.1.fastq index 148f0248..1ed483a6 100644 --- a/tests/phix.1.fastq +++ b/tests/phix.1.fastq @@ -31,9 +31,9 @@ NTCATTTGGCGAGAAAGCTCAGTCTCAGGAGGAAGCGGAGCAGGCCAAATGTTTTTGAGATGGCAGCAACGGAAACCATA + #88ABCFG=>+@+3,3CFBFFFFF2.1.4::?FFFFF?BAAFFFFFFDFF>>9>9? +#8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFCFGGFGFGFFFFFFFFFFFDFFFFB:>FFFFF2.1.4::?FFFFF?BAAFFFFFFDF>>9>9? 
@SRR1377138.14 NTCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTAGATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATCTGAGTCCGATGCTGTT + diff --git a/tests/phix.pe.sam b/tests/phix.pe.sam index ef4157d7..ccf0dbca 100644 --- a/tests/phix.pe.sam +++ b/tests/phix.pe.sam @@ -17,7 +17,7 @@ SRR1377138.7 83 NC_001422.1 256 60 300=1X = 141 -416 GGCACGTTCGTCAAGGACTGGTTTAGA SRR1377138.7 163 NC_001422.1 141 60 1X299=1X = 256 416 NTTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTAGATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATCTGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTTTCCAGACCGCTTTGGCCTCTT #8AF6BFFCFFF4AA>A4A4>:E>7?-55>;(>?E38*878CB6@EC7C;6C>C<)6FFF@E>2)(4179:BB?FB#### NM:i:2 AS:i:602 RG:Z:1 SRR1377138.8 83 NC_001422.1 1254 60 124=1X132=1X42=1X = 1179 -376 TCATGAAGGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTTGGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCATGGTTATTTGAATATCTATAACAACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGCTCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGCCTGCTCCGCTTCCTCCTGAGACTGAGCTTTCTCGCCAAATGAN #################################?<9<464:AC;CFAF;AFFA77AFAGFGFACB73DC4CD64GGGGCGEC=>+FGGFEFGE8GGGGCE=EC7GEFEFF5>A@FFE=<)1*-..:@29@29@0:1;3=6@=(544?4)9;C>5))7).474CB)99?###### NM:i:5 AS:i:572 RG:Z:1 -SRR1377138.9 83 NC_001422.1 4731 60 300=1X = 4650 -382 TGTGTGCCTGAGTATGGTACAGCTAATGGCCGTCTTCATTTCCATGCGGTGCACTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTCGTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCATCGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAGCCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAN 
?BFF?<:?FCF@FFF?EFFF@EFEFFFFEEEB?FD<<)8083:?>FFDFFECC49FFD?3<@FFFFEDEC?3E?C?0EFFFFFFF>@5FECFFGGGFGD:FGGCDGFGGGGGG>GGGGGGGDGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCCB8# NM:i:1 AS:i:612 RG:Z:1 @@ -25,7 +25,7 @@ SRR1377138.11 99 NC_001422.1 5117 60 1X268=32S = 5220 270 NAAGCTGTTCAGAATCAGAATG SRR1377138.11 147 NC_001422.1 5220 60 167=134S = 5117 -270 GACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACCTGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCAGAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAAN ########BFFBA>82><9B>4<2BAA2A<<:?:4,.(B?C?.37;)EE<;)C?6(((;FFFFFEE?7)?9/);FDFB>A=FDAFFFFFFFDEDDCCFAGFFGGGGGGF:GGGFGGGGGFGGGGFGCGGEECDEFCCGGFGGGGFGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCCA8# NM:i:0 AS:i:344 RG:Z:1 SRR1377138.12 99 NC_001422.1 1 60 57S238=1X5= = 28 328 NAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCAGAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGATGTGGC #8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGDCGGGGGGGGGGFGGGGGGGFGFGGFGFGFFFFFFFFFFFAFFFFFFFFFFFA3.1.48?A<:?FFFFFFBDFFFF?1GGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCCB8# NM:i:2 AS:i:602 RG:Z:1 -SRR1377138.13 99 NC_001422.1 
5006 60 1X300= = 5049 344 NTATGTGGCTAAATACGTTAACAAAAAGTCAGATATGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACTTCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTGTCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTT #8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFCFGGFGFGFFFFFFFFFFFDFFFFB:>FFFFF2.1.4::?FFFFF?BAAFFFFFFDFF>>9>9? NM:i:1 AS:i:612 RG:Z:1 +SRR1377138.13 99 NC_001422.1 5006 60 1X293=1D6= = 5049 344 NTATGTGGCTAAATACGTTAACAAAAAGTCAGATATGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACTTCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTGTCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCGACGTT #8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFCFGGFGFGFFFFFFFFFFFDFFFFB:>FFFFF2.1.4::?FFFFF?BAAFFFFFFDF>>9>9? 
NM:i:2 AS:i:597 RG:Z:1 SRR1377138.13 147 NC_001422.1 5049 60 300=1X = 5006 -344 GCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACTTCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTGTCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACCTGTGACGACAAATCTGCTCAAATTTATGN #FFFFBFFFBD>A<::64>AAFFFB><?FFBABBAFFFFFFBAFFFFFFFFFFFFFE;@F@FFFFFFFD@FE?FFFFFFFFFFFFFEBFFFFFFFFGFFGGGDGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGDGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCGGCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCCC8# NM:i:1 AS:i:612 RG:Z:1 SRR1377138.14 99 NC_001422.1 66 60 1X300= = 156 391 NTCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTAGATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATCTGAGTCCGATGCTGTT #8@CCGGGGGGFFGGGGGGGGGGGGGFGGGGGGGGGGGGGFGGEEFGGGGGGGGGGGGGFGDCFEFGGGGGFGGGGGGGGGGGGGGGGGGGG7FGGGGGGGGGGGGGGGGGGGDFGGGGGGGFGGEGGGGGFGGGGGGGGGGGGGGGGGFGGGGGGGGGGDGGGGGGGGDGGGGGGGFEGGGGGGGFGGFFGFGGGGGGGGGFGGGFG7FGGFGDGGGGGFFCGGGGGGGGGGGGGFGFGGC>FGCF=A@FFFFFFFFFFEFFFFFFE<=/04(34:501:AB>4 NM:i:1 AS:i:612 RG:Z:1 SRR1377138.14 147 NC_001422.1 156 60 300=1X = 66 -391 GCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTAGATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATCTGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTTTCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGN 
############?>BFAAA45FFFFB:7/(9>9;B@=63?FFED>52FE?>5*>852BCEEFFAA@A>8@8C>=7EFCFFEAFFFF:FAFFFFFFFFD?9*DDGGGGGDGGGGGGFGFGGFFCFFGDGFGGGGGFGFEF?GGFCGGGGGGGGGGGGFEEGFFGGFE9GGGFFGFAGGGGGGGGFAGGGFGFFFGGGGGGGEFAFFCFGGGF@GF@AE@FFCEFEEFFFFFFFFFFFFFFF0.044:6B0>BF>003>>:(:<>B?FFFA>:5)9FFFFDAAB<4<60.,AB>FFFFDFFEFA6FFFFFFA8=FGFCFCGFGFF7CGGGGFGGGGFECCGGGGGGGGGGGGGGGGGGGEGGGGGGGFFFGGGGGFEFGGCGGGGGGGGFGEAFGGGFGGGGGGGGGGFCGFFGEGGGGGGGGFGGGGGGGGGDFCFFF<9GGGGGGGGGGGGGGGFFF=8GFGGGGGGGGGGGGGGGGGGEGGGEFFEFGGGGGGGGGGGEFGFFEFGGEFDFCFGGFCCGGDFGGGGGFCGGGGGEGEFGEDEGGGEGGGGGGBCA8# NM:i:1 AS:i:612 RG:Z:1 SRR1377138.8 16 NC_001422.1 1254 60 124=1X132=1X42=1X * 0 0 TCATGAAGGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTTGGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCATGGTTATTTGAATATCTATAACAACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGCTCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGCCTGCTCCGCTTCCTCCTGAGACTGAGCTTTCTCGCCAAATGAN #################################?<9<464:AC;CFAF;AFFA77AFAGFGFACB73DC4CD64GGGGCGEC=>+FGGFEFGE8GGGGCE=EC7GEF*?*:<>9BDECDFGF<*9@F?*7B?;F;F;B:AFFFFFF7*(03:0:AAAFFF:18?BA:FFFFF2.1.4::?FFFFF?BAAFFFFFFDFF>>9>9? NM:i:1 AS:i:612 RG:Z:1 +SRR1377138.13 0 NC_001422.1 5006 60 1X293=1D6= * 0 0 NTATGTGGCTAAATACGTTAACAAAAAGTCAGATATGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACTTCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTGTCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGCAGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCGACGTT #8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFCFGGFGFGFFFFFFFFFFFDFFFFB:>FFFFF2.1.4::?FFFFF?BAAFFFFFFDF>>9>9? 
NM:i:2 AS:i:597 RG:Z:1 SRR1377138.14 0 NC_001422.1 66 60 1X300= * 0 0 NTCTTGATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAAATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTGTCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTAGATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATCTGAGTCCGATGCTGTT #8@CCGGGGGGFFGGGGGGGGGGGGGFGGGGGGGGGGGGGFGGEEFGGGGGGGGGGGGGFGDCFEFGGGGGFGGGGGGGGGGGGGGGGGGGG7FGGGGGGGGGGGGGGGGGGGDFGGGGGGGFGGEGGGGGFGGGGGGGGGGGGGGGGGFGGGGGGGGGGDGGGGGGGGDGGGGGGGFEGGGGGGGFGGFFGFGGGGGGGGGFGGGFG7FGGFGDGGGGGFFCGGGGGGGGGGGGGFGFGGC>FGCF=A@FFFFFFFFFFEFFFFFFE<=/04(34:501:AB>4 NM:i:1 AS:i:612 RG:Z:1 SRR1377138.15 0 NC_001422.1 1907 60 1X185=115S * 0 0 NTTAACGCTAAAGGTGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTTGTATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGTTTAAGATTGCTGAGGGTCAGTGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACAAGAACGGAAAACATCCTTCATAGAAATTTCACGCAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTGAAAAAAAAAAAAAAAGATA #8BCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGFGGFGFGFFFFFFFFFFFFF>73?FFFFFFF3-048:BDFFAFFFFFF06>BFF######### NM:i:1 AS:i:372 RG:Z:1 SRR1377138.16 16 NC_001422.1 2072 60 27=1X20=1X83=1X50=1X116=1X * 0 0 TATGTTTCTCCTGCTTATCACCTTCTTTAAGGCTTCCCATTCATTCAGCAACCGCCTTCTGGTGATTTGCAAGAACGCGTACTTATTCGCCACCATGATTATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGAAGTGGAATAGTCAGGTTAAATTTAATGTGACCGTTTATCGCAATCTGCCGTCCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGAGTGTGAGGTTATAACGCCGAAGCGGTAAAAATTTTAATTTTTGCCGCTGAGGGGTTGACCAAGCGAAGCGCGGTAGN ############################################################################################################GC;D>,5E>B*,,;98F>FE,9E4,@F@7,;CF>3,,BF7=@,3CF>,E,,F@83++3>@3++CBC,5B+68+FB,7=>=F=F, Date: Sun, 17 Dec 2023 21:50:37 +0100 Subject: [PATCH 32/32] Update 
baseline commit --- tests/baseline-commit.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/baseline-commit.txt b/tests/baseline-commit.txt index 471fbbbb..12f828e5 100644 --- a/tests/baseline-commit.txt +++ b/tests/baseline-commit.txt @@ -1 +1 @@ -2e4ff9500e68d6e465735dd276d362cf71851dcd +26b472e7a748aabab5e46328622b1cd46a0c0841