Skip to content

Commit

Permalink
Refactor code a little bit more.
Browse files Browse the repository at this point in the history
  • Loading branch information
KochetovNicolai committed Dec 21, 2020
1 parent c3a99e2 commit 29e0b4e
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 5 deletions.
9 changes: 7 additions & 2 deletions src/Functions/ExtractString.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,16 @@ namespace DB
template <size_t N, bool CaseInsensitive>
struct ExtractStringImpl
{
/// Padding from ColumnsString. It is the number of bytes we can always read starting from pos if pos < end.
static constexpr size_t default_padding = 16;

// the length of code_points = default_padding + N -1
/// Functions read `default_padding - (N - 1)` bytes into the buffer. A window of size N is used.
/// Each read copies the last `N - 1` bytes of the buffer to its beginning, and then reads new bytes.
static constexpr size_t buffer_size = default_padding + N - 1;

// the length of code_points = buffer_size
// pos: the current beginning location that we want to copy data
// end: the end loction of the string
// end: the end location of the string
static ALWAYS_INLINE size_t readASCIICodePoints(UInt8 * code_points, const char *& pos, const char * end)
{
/// Offset before which we copy some data.
Expand Down
6 changes: 3 additions & 3 deletions src/Functions/FunctionsStringHash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ struct Hash
return crc;
}

static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt32 * hashes, size_t size, size_t offset)
static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, size_t size, size_t offset)
{
UInt64 crc1 = -1ULL;
UInt64 crc2 = -1ULL;
Expand Down Expand Up @@ -122,7 +122,7 @@ struct SimhashImpl
// we assume that the size of one word can't exceed 128, which may not be true
// if a word's size exceeds 128, it is cut up into several words
static constexpr size_t max_string_size = 1u << 15;
static constexpr size_t simultaneously_codepoints_num = StrOp::default_padding + N - 1;
static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size;

// Simhash ngram calculate function: String ->UInt64
// this function extracts ngrams from the input string, and maintains a 64-dimensional vector
Expand Down Expand Up @@ -323,7 +323,7 @@ struct MinhashImpl
using MinHeap = FixedHeap<std::greater<size_t>, K, 0>;
using StrOp = ExtractStringImpl<N, CaseInsensitive>;
static constexpr size_t max_string_size = 1u << 15;
static constexpr size_t simultaneously_codepoints_num = StrOp::default_padding + N - 1;
static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size;

// Minhash ngram calculate function, String -> Tuple(UInt64, UInt64)
// we extract ngram from input string, and calculate a hash value for each ngram
Expand Down

0 comments on commit 29e0b4e

Please sign in to comment.