Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: hash_table_sizing, inline hints, lint rule #29

Merged
merged 4 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ impl CompressorBuilder {
/// with an existing symbol.
pub fn insert(&mut self, symbol: Symbol, len: usize) -> bool {
assert!(self.n_symbols < 255, "cannot insert into full symbol table");
debug_assert!(len == symbol.len(), "provided len != symbol.len()");
assert_eq!(len, symbol.len(), "provided len must equal symbol.len()");

if len == 2 {
// shortCodes
Expand Down Expand Up @@ -387,7 +387,6 @@ impl CompressorBuilder {
///
/// Also returns the lengths vector, which is of length `n_symbols` and contains the
/// length for each of the values.
#[inline(never)]
fn finalize(&mut self) -> (u8, Vec<u8>) {
// Create a cumulative sum of each of the elements of the input line numbers.
// Do a map that includes the previously seen value as well.
Expand Down Expand Up @@ -534,7 +533,7 @@ const FSST_SAMPLELINE: usize = 512;
/// SAFETY: sample_buf must have a capacity of at least FSST_SAMPLEMAX bytes. Providing something less may cause unexpected failures.
#[allow(clippy::ptr_arg)]
fn make_sample<'a, 'b: 'a>(sample_buf: &'a mut Vec<u8>, str_in: &Vec<&'b [u8]>) -> Vec<&'a [u8]> {
debug_assert!(
assert!(
sample_buf.capacity() >= FSST_SAMPLEMAX,
"sample_buf.len() < FSST_SAMPLEMAX"
);
Expand Down Expand Up @@ -700,7 +699,7 @@ impl CompressorBuilder {
}

let remaining_bytes = unsafe { in_end.byte_offset_from(in_ptr) };
debug_assert!(
assert!(
remaining_bytes.is_positive(),
"in_ptr exceeded in_end, should not be possible"
);
Expand Down
21 changes: 5 additions & 16 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,7 @@ impl Symbol {

/// Constructor for a `Symbol` from an 8-element byte slice.
pub fn from_slice(slice: &[u8; 8]) -> Self {
let num: u64 = slice[0] as u64
| (slice[1] as u64) << 8
| (slice[2] as u64) << 16
| (slice[3] as u64) << 24
| (slice[4] as u64) << 32
| (slice[5] as u64) << 40
| (slice[6] as u64) << 48
| (slice[7] as u64) << 56;
let num: u64 = u64::from_le_bytes(*slice);

Self(num)
}
Expand Down Expand Up @@ -106,7 +99,7 @@ impl Symbol {

/// Return a new `Symbol` by logically concatenating ourselves with another `Symbol`.
pub fn concat(self, other: Self) -> Self {
debug_assert!(
assert!(
self.len() + other.len() <= 8,
"cannot build symbol with length > 8"
);
Expand Down Expand Up @@ -171,9 +164,6 @@ pub const FSST_CODE_BITS: usize = 9;
/// First bit of the "length" portion of an extended code.
pub const FSST_LEN_BITS: usize = 12;

/// A code that never appears in practice, indicating an unused slot.
pub const FSST_CODE_UNUSED: u16 = 1u16 << FSST_CODE_BITS;

/// Maximum code value in the extended code range.
pub const FSST_CODE_MAX: u16 = 1 << FSST_CODE_BITS;

Expand Down Expand Up @@ -253,7 +243,7 @@ impl<'a> Decompressor<'a> {
/// If the provided symbol table has length greater than 255
pub fn new(symbols: &'a [Symbol], lengths: &'a [u8]) -> Self {
assert!(
symbols.len() <= 255,
symbols.len() < FSST_CODE_BASE as usize,
"symbol table cannot have size exceeding 255"
);

Expand Down Expand Up @@ -295,7 +285,7 @@ impl<'a> Decompressor<'a> {
}
}

debug_assert!(
assert!(
in_pos >= compressed.len(),
"decompression should exhaust input before output"
);
Expand Down Expand Up @@ -350,7 +340,7 @@ pub struct Compressor {
/// The core structure of the FSST codec, holding a mapping between `Symbol`s and `Code`s.
///
/// The symbol table is trained on a corpus of data in the form of a single byte array, building up
/// a mapping of 1-byte "codes" to sequences of up to `N` plaintext bytse, or "symbols".
/// a mapping of 1-byte "codes" to sequences of up to 8 plaintext bytes, or "symbols".
impl Compressor {
/// Using the symbol table, runs a single cycle of compression on an input word, writing
/// the output into `out_ptr`.
Expand All @@ -367,7 +357,6 @@ impl Compressor {
/// # Safety
///
/// `out_ptr` must never be NULL or otherwise point to invalid memory.
#[inline(never)]
pub unsafe fn compress_word(&self, word: u64, out_ptr: *mut u8) -> (usize, usize) {
// Speculatively write the first byte of `word` at offset 1. This is necessary if it is an escape, and
// if it isn't, it will be overwritten anyway.
Expand Down
8 changes: 2 additions & 6 deletions src/lossy_pht.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,15 @@
// TODO: remove
#![allow(unused)]

use std::fmt::Debug;

use crate::builder::fsst_hash;
use crate::Code;
use crate::Symbol;
use crate::FSST_CODE_MASK;
use crate::{Code, FSST_CODE_UNUSED};

/// Size of the perfect hash table.
///
/// NOTE: this differs from the paper, which recommends a 64KB total
/// table size. The paper does not account for the fact that most
/// vendors split the L1 cache into 32KB of instruction and 32KB of data.
pub const HASH_TABLE_SIZE: usize = 1 << 12;
pub const HASH_TABLE_SIZE: usize = 1 << 11;

/// A single entry in the [Lossy Perfect Hash Table][`LossyPHT`].
///
Expand Down
Loading