From 0ee78e09b410e47b1fb8114761bad75652937f7d Mon Sep 17 00:00:00 2001
From: Andrew Duffy
Date: Wed, 21 Aug 2024 09:16:19 -0400
Subject: [PATCH 1/3] remove spurious check (#18)
---
src/builder.rs | 4 ----
1 file changed, 4 deletions(-)
diff --git a/src/builder.rs b/src/builder.rs
index 42f2fb5..1dda9c9 100644
--- a/src/builder.rs
+++ b/src/builder.rs
@@ -58,10 +58,6 @@ impl<'a> Iterator for CodesIterator<'a> {
self.reference = self.index * 64;
}
- if self.block == 0 {
- return None;
- }
-
// Find the next set bit in the current block.
let position = self.block.trailing_zeros() as usize;
let code = self.reference + position;
From d15cc4e8cd6109a5f141e6db2f5bae6f6cc04590 Mon Sep 17 00:00:00 2001
From: Andrew Duffy
Date: Wed, 21 Aug 2024 11:47:00 -0400
Subject: [PATCH 2/3] implement second bitmap, ~2x speedup for train (#21)
![image](https://github.com/user-attachments/assets/5a30710f-8025-4708-ad74-eee6dd14187b)
^ this is a sad flamegraph. `bzero` is not a fun place to be spending
60% of your time.
![image](https://github.com/user-attachments/assets/fbe55d00-d7fd-4999-83be-3d9b8e6b48b5)
How did we get from one to two?
1. Avoid initializing our `counts1` and `counts2` vectors
2. Implement a second bitmap index that limits our outer-loop iterations
in `optimize`. Because `counts1` and `counts2` are not initialized, we
check the bitmap before all accesses
This also gives us another 2x speedup on the train benchmark, which is
nice
---
src/builder.rs | 61 +++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 53 insertions(+), 8 deletions(-)
diff --git a/src/builder.rs b/src/builder.rs
index 1dda9c9..cc58190 100644
--- a/src/builder.rs
+++ b/src/builder.rs
@@ -26,6 +26,14 @@ impl CodesBitmap {
self.codes[map] |= 1 << (index % 64);
}
+ /// Check if `index` is present in the bitmap
+ pub(crate) fn is_set(&self, index: usize) -> bool {
+ debug_assert!(index <= MAX_CODE as usize, "code cannot exceed {MAX_CODE}");
+
+ let map = index >> 6;
+ self.codes[map] & 1 << (index % 64) != 0
+ }
+
/// Get all codes set in this bitmap
pub(crate) fn codes(&self) -> CodesIterator {
CodesIterator {
@@ -82,6 +90,9 @@ struct Counter {
/// Frequency count for each code-pair.
counts2: Vec,
+ /// Bitmap index for codes that appear in counts1
+ code1_index: CodesBitmap,
+
/// Bitmap index of pairs that have been set.
///
/// `pair_index[code1].codes()` yields an iterator that can
@@ -96,36 +107,70 @@ const COUNTS2_SIZE: usize = COUNTS1_SIZE * COUNTS1_SIZE;
impl Counter {
fn new() -> Self {
+ let mut counts1 = Vec::with_capacity(COUNTS1_SIZE);
+ let mut counts2 = Vec::with_capacity(COUNTS2_SIZE);
+ // SAFETY: all accesses to the vector go through the bitmap to ensure no uninitialized
+ // data is ever read from these vectors.
+ unsafe {
+ counts1.set_len(COUNTS1_SIZE);
+ counts2.set_len(COUNTS2_SIZE);
+ }
+
Self {
- counts1: vec![0; COUNTS1_SIZE],
- counts2: vec![0; COUNTS2_SIZE],
+ counts1,
+ counts2,
+ code1_index: CodesBitmap::default(),
pair_index: vec![CodesBitmap::default(); COUNTS1_SIZE],
}
}
#[inline]
fn record_count1(&mut self, code1: u16) {
- self.counts1[code1 as usize] += 1;
+ if self.code1_index.is_set(code1 as usize) {
+ self.counts1[code1 as usize] += 1;
+ } else {
+ self.counts1[code1 as usize] = 1;
+ }
+ self.code1_index.set(code1 as usize);
}
#[inline]
fn record_count2(&mut self, code1: u16, code2: u16) {
+ debug_assert!(self.code1_index.is_set(code1 as usize));
+ debug_assert!(self.code1_index.is_set(code2 as usize));
+
let idx = (code1 as usize) * 511 + (code2 as usize);
- self.counts2[idx] += 1;
+ if self.pair_index[code1 as usize].is_set(code2 as usize) {
+ self.counts2[idx] += 1;
+ } else {
+ self.counts2[idx] = 1;
+ }
self.pair_index[code1 as usize].set(code2 as usize);
}
#[inline]
- fn count1(&self, code: u16) -> usize {
- self.counts1[code as usize]
+ fn count1(&self, code1: u16) -> usize {
+ debug_assert!(self.code1_index.is_set(code1 as usize));
+
+ self.counts1[code1 as usize]
}
#[inline]
fn count2(&self, code1: u16, code2: u16) -> usize {
+ debug_assert!(self.code1_index.is_set(code1 as usize));
+ debug_assert!(self.code1_index.is_set(code2 as usize));
+ debug_assert!(self.pair_index[code1 as usize].is_set(code2 as usize));
+
let idx = (code1 as usize) * 511 + (code2 as usize);
self.counts2[idx]
}
+ /// Returns an ordered iterator over the codes that were observed
+ /// in a call to [`Self::record_count1`].
+ fn first_codes(&self) -> CodesIterator {
+ self.code1_index.codes()
+ }
+
/// Returns an iterator over the codes that have been observed
/// to follow `code1`.
///
@@ -217,7 +262,7 @@ impl Compressor {
let mut res = Compressor::default();
let mut pqueue = BinaryHeap::with_capacity(65_536);
- for code1 in 0u16..(256u16 + self.n_symbols as u16) {
+ for code1 in counters.first_codes() {
let symbol1 = self.symbols[code1 as usize];
let count = counters.count1(code1);
// If count is zero, we can skip the whole inner loop.
@@ -375,7 +420,7 @@ mod test {
// empty case
let map = CodesBitmap::default();
- assert_eq!(map.codes().collect::>(), vec![]);
+ assert!(map.codes().collect::>().is_empty());
// edge case: first bit in each block is set
let mut map = CodesBitmap::default();
From ced11ef7b3be45cdbf29c4f52891652035f3caeb Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
<41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 21 Aug 2024 11:50:08 -0400
Subject: [PATCH 3/3] chore: release v0.2.2 (#19)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
## 🤖 New release
* `fsst-rs`: 0.2.1 -> 0.2.2
Changelog
## [0.2.2](https://github.com/spiraldb/fsst/compare/v0.2.1...v0.2.2) -
2024-08-21
### Other
- implement second bitmap, ~2x speedup for train
([#21](https://github.com/spiraldb/fsst/pull/21))
- remove spurious check
([#18](https://github.com/spiraldb/fsst/pull/18))
---
This PR was generated with
[release-plz](https://github.com/MarcoIeni/release-plz/).
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
CHANGELOG.md | 6 ++++++
Cargo.lock | 2 +-
Cargo.toml | 2 +-
3 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 520bd21..6cb51c5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
+## [0.2.2](https://github.com/spiraldb/fsst/compare/v0.2.1...v0.2.2) - 2024-08-21
+
+### Other
+- implement second bitmap, ~2x speedup for train ([#21](https://github.com/spiraldb/fsst/pull/21))
+- remove spurious check ([#18](https://github.com/spiraldb/fsst/pull/18))
+
## [0.2.1](https://github.com/spiraldb/fsst/compare/v0.2.0...v0.2.1) - 2024-08-20
### Added
diff --git a/Cargo.lock b/Cargo.lock
index a6e41af..ea88d59 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -174,7 +174,7 @@ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
[[package]]
name = "fsst-rs"
-version = "0.2.1"
+version = "0.2.2"
dependencies = [
"criterion",
]
diff --git a/Cargo.toml b/Cargo.toml
index 8157245..a3cd415 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "fsst-rs"
-version = "0.2.1"
+version = "0.2.2"
description = "Pure-Rust implementation of Fast Static Symbol Tables algorithm for string compression"
authors = ["SpiralDB Developers "]
license = "Apache-2.0"