Skip to content

Commit

Permalink
Merge pull request #92 from ytmimi/fix-generic-simd
Browse files Browse the repository at this point in the history
remove `packed_simd` in favor of `std::simd` and `#![feature(portable_simd)]`
  • Loading branch information
llogiq authored Apr 17, 2024
2 parents 934ea0e + 32a098d commit 68ecb39
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 17 deletions.
3 changes: 1 addition & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,11 @@ appveyor = { repository = "llogiq/bytecount" }
bench = false

[features]
generic-simd = ["packed_simd"]
generic-simd = []
runtime-dispatch-simd = []
html_report = []

[dependencies]
packed_simd = { version = "0.3.8", optional = true }

[dev-dependencies]
quickcheck = "1.0"
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ Your users can then compile with runtime dispatch using:
cargo build --release --features runtime-dispatch-simd
```

The second, `generic-simd`, uses `packed_simd` to provide a fast
The second, `generic-simd`, uses [`std::simd`](https://doc.rust-lang.org/std/simd/index.html) and [`#![feature(portable_simd)]`](https://github.com/rust-lang/rust/issues/86656) to provide a fast
architecture-agnostic SIMD codepath, but requires running on nightly.

Your users can compile with this codepath using:
Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
//! [`naive_count_32`](fn.naive_count_32.html) method can be faster
//! still on small strings.

#![cfg_attr(feature = "generic-simd", feature(portable_simd))]

#![deny(missing_docs)]
#![cfg_attr(not(feature = "runtime-dispatch-simd"), no_std)]

Expand Down
29 changes: 15 additions & 14 deletions src/simd/generic.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
extern crate packed_simd;


#[cfg(not(feature = "runtime-dispatch-simd"))]
use core::mem;
use core::{mem, simd};

#[cfg(feature = "runtime-dispatch-simd")]
use std::mem;
use std::{mem, simd};

use self::packed_simd::{u8x32, u8x64, FromCast};
use simd::{u8x32, u8x64, cmp::SimdPartialEq, num::SimdInt};

const MASK: [u8; 64] = [
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
Expand All @@ -14,20 +15,20 @@ const MASK: [u8; 64] = [
];

/// Loads 64 consecutive bytes of `slice`, starting at `offset`, into a
/// `u8x64` SIMD vector.
///
/// # Safety
///
/// `offset` must be `<= slice.len()` (required by `get_unchecked`, which
/// performs no bounds check).  Callers additionally ensure at least 64
/// bytes remain past `offset`, since `Simd::from_slice` panics when the
/// slice is shorter than the vector's lane count.
unsafe fn u8x64_from_offset(slice: &[u8], offset: usize) -> u8x64 {
    u8x64::from_slice(slice.get_unchecked(offset..))
}
/// Loads 32 consecutive bytes of `slice`, starting at `offset`, into a
/// `u8x32` SIMD vector.
///
/// # Safety
///
/// `offset` must be `<= slice.len()` (required by `get_unchecked`, which
/// performs no bounds check).  Callers additionally ensure at least 32
/// bytes remain past `offset`, since `Simd::from_slice` panics when the
/// slice is shorter than the vector's lane count.
unsafe fn u8x32_from_offset(slice: &[u8], offset: usize) -> u8x32 {
    u8x32::from_slice(slice.get_unchecked(offset..))
}

fn sum_x64(u8s: &u8x64) -> usize {
let mut store = [0; mem::size_of::<u8x64>()];
u8s.write_to_slice_unaligned(&mut store);
u8s.copy_to_slice(&mut store);
store.iter().map(|&e| e as usize).sum()
}
fn sum_x32(u8s: &u8x32) -> usize {
let mut store = [0; mem::size_of::<u8x32>()];
u8s.write_to_slice_unaligned(&mut store);
u8s.copy_to_slice(&mut store);
store.iter().map(|&e| e as usize).sum()
}

Expand All @@ -44,7 +45,7 @@ pub fn chunk_count(haystack: &[u8], needle: u8) -> usize {
while haystack.len() >= offset + 64 * 255 {
let mut counts = u8x64::splat(0);
for _ in 0..255 {
counts -= u8x64::from_cast(u8x64_from_offset(haystack, offset).eq(needles_x64));
counts -= u8x64_from_offset(haystack, offset).simd_eq(needles_x64).to_int().cast();
offset += 64;
}
count += sum_x64(&counts);
Expand All @@ -54,7 +55,7 @@ pub fn chunk_count(haystack: &[u8], needle: u8) -> usize {
if haystack.len() >= offset + 64 * 128 {
let mut counts = u8x64::splat(0);
for _ in 0..128 {
counts -= u8x64::from_cast(u8x64_from_offset(haystack, offset).eq(needles_x64));
counts -= u8x64_from_offset(haystack, offset).simd_eq(needles_x64).to_int().cast();
offset += 64;
}
count += sum_x64(&counts);
Expand All @@ -66,15 +67,15 @@ pub fn chunk_count(haystack: &[u8], needle: u8) -> usize {
let mut counts = u8x32::splat(0);
for i in 0..(haystack.len() - offset) / 32 {
counts -=
u8x32::from_cast(u8x32_from_offset(haystack, offset + i * 32).eq(needles_x32));
u8x32_from_offset(haystack, offset + i * 32).simd_eq(needles_x32).to_int().cast();
}
count += sum_x32(&counts);

// Straggler; need to reset counts because prior loop can run 255 times
counts = u8x32::splat(0);
if haystack.len() % 32 != 0 {
counts -=
u8x32::from_cast(u8x32_from_offset(haystack, haystack.len() - 32).eq(needles_x32))
u8x32_from_offset(haystack, haystack.len() - 32).simd_eq(needles_x32).to_int().cast()
& u8x32_from_offset(&MASK, haystack.len() % 32);
}
count += sum_x32(&counts);
Expand All @@ -84,11 +85,11 @@ pub fn chunk_count(haystack: &[u8], needle: u8) -> usize {
}

fn is_leading_utf8_byte_x64(u8s: u8x64) -> u8x64 {
u8x64::from_cast((u8s & u8x64::splat(0b1100_0000)).ne(u8x64::splat(0b1000_0000)))
(u8s & u8x64::splat(0b1100_0000)).simd_ne(u8x64::splat(0b1000_0000)).to_int().cast()
}

fn is_leading_utf8_byte_x32(u8s: u8x32) -> u8x32 {
u8x32::from_cast((u8s & u8x32::splat(0b1100_0000)).ne(u8x32::splat(0b1000_0000)))
(u8s & u8x32::splat(0b1100_0000)).simd_ne(u8x32::splat(0b1000_0000)).to_int().cast()
}

pub fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
Expand Down

0 comments on commit 68ecb39

Please sign in to comment.