From 599d7f8f6dbead4c9f6dcf6705c38b68b5c45eff Mon Sep 17 00:00:00 2001 From: TheIronBorn Date: Wed, 6 Jul 2022 23:44:05 -0700 Subject: [PATCH 1/5] switch to std::simd, expand SIMD stuff & docs move __m128i to stable, expand documentation, add SIMD to Bernoulli, add maskNxM, add __m512i genericize simd uniform int remove some debug stuff remove bernoulli foo foo --- Cargo.toml | 11 +-- src/distributions/bernoulli.rs | 5 +- src/distributions/float.rs | 74 +++++++++--------- src/distributions/integer.rs | 104 +++++++++++++------------ src/distributions/mod.rs | 15 +++- src/distributions/other.rs | 33 ++++++++ src/distributions/uniform.rs | 134 ++++++++++++++------------------- src/distributions/utils.rs | 133 ++++++++++++++++++-------------- src/lib.rs | 2 +- 9 files changed, 281 insertions(+), 230 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 98ba373c68f..32c92fe9cdc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,8 +41,8 @@ alloc = ["rand_core/alloc"] # Option: use getrandom package for seeding getrandom = ["rand_core/getrandom"] -# Option (requires nightly): experimental SIMD support -simd_support = ["packed_simd"] +# Option (requires nightly Rust): experimental SIMD support +simd_support = [] # Option (enabled by default): enable StdRng std_rng = ["rand_chacha"] @@ -68,13 +68,6 @@ log = { version = "0.4.4", optional = true } serde = { version = "1.0.103", features = ["derive"], optional = true } rand_chacha = { path = "rand_chacha", version = "0.3.0", default-features = false, optional = true } -[dependencies.packed_simd] -# NOTE: so far no version works reliably due to dependence on unstable features -package = "packed_simd_2" -version = "0.3.7" -optional = true -features = ["into_bits"] - [target.'cfg(unix)'.dependencies] # Used for fork protection (reseeding.rs) libc = { version = "0.2.22", optional = true, default-features = false } diff --git a/src/distributions/bernoulli.rs b/src/distributions/bernoulli.rs index 226db79fa9c..676b79a5c10 100644 --- a/src/distributions/bernoulli.rs +++ b/src/distributions/bernoulli.rs @@ -14,6 +14,7 @@ use core::{fmt, u64}; #[cfg(feature = "serde1")] use serde::{Serialize, Deserialize}; + /// The Bernoulli distribution. /// /// This is a special case of the Binomial distribution where `n = 1`. @@ -147,10 +148,10 @@ mod test { use crate::Rng; #[test] - #[cfg(feature="serde1")] + #[cfg(feature = "serde1")] fn test_serializing_deserializing_bernoulli() { let coin_flip = Bernoulli::new(0.5).unwrap(); - let de_coin_flip : Bernoulli = bincode::deserialize(&bincode::serialize(&coin_flip).unwrap()).unwrap(); + let de_coin_flip: Bernoulli = bincode::deserialize(&bincode::serialize(&coin_flip).unwrap()).unwrap(); assert_eq!(coin_flip.p_int, de_coin_flip.p_int); } diff --git a/src/distributions/float.rs b/src/distributions/float.rs index ce5946f7f01..54aebad4dc5 100644 --- a/src/distributions/float.rs +++ b/src/distributions/float.rs @@ -8,11 +8,11 @@ //! Basic floating-point number distributions -use crate::distributions::utils::FloatSIMDUtils; +use crate::distributions::utils::{IntAsSIMD, FloatAsSIMD, FloatSIMDUtils}; use crate::distributions::{Distribution, Standard}; use crate::Rng; use core::mem; -#[cfg(feature = "simd_support")] use packed_simd::*; +#[cfg(feature = "simd_support")] use core::simd::*; #[cfg(feature = "serde1")] use serde::{Serialize, Deserialize}; @@ -99,7 +99,7 @@ macro_rules! 
float_impls { // The exponent is encoded using an offset-binary representation let exponent_bits: $u_scalar = (($exponent_bias + exponent) as $u_scalar) << $fraction_bits; - $ty::from_bits(self | exponent_bits) + $ty::from_bits(self | $uty::splat(exponent_bits)) } } @@ -108,13 +108,13 @@ macro_rules! float_impls { // Multiply-based method; 24/53 random bits; [0, 1) interval. // We use the most significant bits because for simple RNGs // those are usually more random. - let float_size = mem::size_of::<$f_scalar>() as u32 * 8; + let float_size = mem::size_of::<$f_scalar>() as $u_scalar * 8; let precision = $fraction_bits + 1; let scale = 1.0 / ((1 as $u_scalar << precision) as $f_scalar); let value: $uty = rng.gen(); - let value = value >> (float_size - precision); - scale * $ty::cast_from_int(value) + let value = value >> $uty::splat(float_size - precision); + $ty::splat(scale) * $ty::cast_from_int(value) } } @@ -123,14 +123,14 @@ macro_rules! float_impls { // Multiply-based method; 24/53 random bits; (0, 1] interval. // We use the most significant bits because for simple RNGs // those are usually more random. - let float_size = mem::size_of::<$f_scalar>() as u32 * 8; + let float_size = mem::size_of::<$f_scalar>() as $u_scalar * 8; let precision = $fraction_bits + 1; let scale = 1.0 / ((1 as $u_scalar << precision) as $f_scalar); let value: $uty = rng.gen(); - let value = value >> (float_size - precision); + let value = value >> $uty::splat(float_size - precision); // Add 1 to shift up; will not overflow because of right-shift: - scale * $ty::cast_from_int(value + 1) + $ty::splat(scale) * $ty::cast_from_int(value + $uty::splat(1)) } } @@ -140,11 +140,11 @@ macro_rules! float_impls { // We use the most significant bits because for simple RNGs // those are usually more random. use core::$f_scalar::EPSILON; - let float_size = mem::size_of::<$f_scalar>() as u32 * 8; + let float_size = mem::size_of::<$f_scalar>() as $u_scalar * 8; let value: $uty = rng.gen(); - let fraction = value >> (float_size - $fraction_bits); - fraction.into_float_with_exponent(0) - (1.0 - EPSILON / 2.0) + let fraction = value >> $uty::splat(float_size - $fraction_bits); + fraction.into_float_with_exponent(0) - $ty::splat(1.0 - EPSILON / 2.0) } } } @@ -169,10 +169,10 @@ float_impls! { f64x4, u64x4, f64, u64, 52, 1023 } #[cfg(feature = "simd_support")] float_impls! 
{ f64x8, u64x8, f64, u64, 52, 1023 } - #[cfg(test)] mod tests { use super::*; + use crate::distributions::utils::FloatAsSIMD; use crate::rngs::mock::StepRng; const EPSILON32: f32 = ::core::f32::EPSILON; @@ -182,29 +182,31 @@ mod tests { ($fnn:ident, $ty:ident, $ZERO:expr, $EPSILON:expr) => { #[test] fn $fnn() { + let two = $ty::splat(2.0); + // Standard let mut zeros = StepRng::new(0, 0); assert_eq!(zeros.gen::<$ty>(), $ZERO); let mut one = StepRng::new(1 << 8 | 1 << (8 + 32), 0); - assert_eq!(one.gen::<$ty>(), $EPSILON / 2.0); + assert_eq!(one.gen::<$ty>(), $EPSILON / two); let mut max = StepRng::new(!0, 0); - assert_eq!(max.gen::<$ty>(), 1.0 - $EPSILON / 2.0); + assert_eq!(max.gen::<$ty>(), $ty::splat(1.0) - $EPSILON / two); // OpenClosed01 let mut zeros = StepRng::new(0, 0); - assert_eq!(zeros.sample::<$ty, _>(OpenClosed01), 0.0 + $EPSILON / 2.0); + assert_eq!(zeros.sample::<$ty, _>(OpenClosed01), $ZERO + $EPSILON / two); let mut one = StepRng::new(1 << 8 | 1 << (8 + 32), 0); assert_eq!(one.sample::<$ty, _>(OpenClosed01), $EPSILON); let mut max = StepRng::new(!0, 0); - assert_eq!(max.sample::<$ty, _>(OpenClosed01), $ZERO + 1.0); + assert_eq!(max.sample::<$ty, _>(OpenClosed01), $ZERO + $ty::splat(1.0)); // Open01 let mut zeros = StepRng::new(0, 0); - assert_eq!(zeros.sample::<$ty, _>(Open01), 0.0 + $EPSILON / 2.0); + assert_eq!(zeros.sample::<$ty, _>(Open01), $ZERO + $EPSILON / two); let mut one = StepRng::new(1 << 9 | 1 << (9 + 32), 0); - assert_eq!(one.sample::<$ty, _>(Open01), $EPSILON / 2.0 * 3.0); + assert_eq!(one.sample::<$ty, _>(Open01), $EPSILON / two * $ty::splat(3.0)); let mut max = StepRng::new(!0, 0); - assert_eq!(max.sample::<$ty, _>(Open01), 1.0 - $EPSILON / 2.0); + assert_eq!(max.sample::<$ty, _>(Open01), $ty::splat(1.0) - $EPSILON / two); } }; } @@ -222,29 +224,31 @@ mod tests { ($fnn:ident, $ty:ident, $ZERO:expr, $EPSILON:expr) => { #[test] fn $fnn() { + let two = $ty::splat(2.0); + // Standard let mut zeros = StepRng::new(0, 0); assert_eq!(zeros.gen::<$ty>(), $ZERO); let mut one = StepRng::new(1 << 11, 0); - assert_eq!(one.gen::<$ty>(), $EPSILON / 2.0); + assert_eq!(one.gen::<$ty>(), $EPSILON / two); let mut max = StepRng::new(!0, 0); - assert_eq!(max.gen::<$ty>(), 1.0 - $EPSILON / 2.0); + assert_eq!(max.gen::<$ty>(), $ty::splat(1.0) - $EPSILON / two); // OpenClosed01 let mut zeros = StepRng::new(0, 0); - assert_eq!(zeros.sample::<$ty, _>(OpenClosed01), 0.0 + $EPSILON / 2.0); + assert_eq!(zeros.sample::<$ty, _>(OpenClosed01), $ZERO + $EPSILON / two); let mut one = StepRng::new(1 << 11, 0); assert_eq!(one.sample::<$ty, _>(OpenClosed01), $EPSILON); let mut max = StepRng::new(!0, 0); - assert_eq!(max.sample::<$ty, _>(OpenClosed01), $ZERO + 1.0); + assert_eq!(max.sample::<$ty, _>(OpenClosed01), $ZERO + $ty::splat(1.0)); // Open01 let mut zeros = StepRng::new(0, 0); - assert_eq!(zeros.sample::<$ty, _>(Open01), 0.0 + $EPSILON / 2.0); + assert_eq!(zeros.sample::<$ty, _>(Open01), $ZERO + $EPSILON / two); let mut one = StepRng::new(1 << 12, 0); - assert_eq!(one.sample::<$ty, _>(Open01), $EPSILON / 2.0 * 3.0); + assert_eq!(one.sample::<$ty, _>(Open01), $EPSILON / two * $ty::splat(3.0)); let mut max = StepRng::new(!0, 0); - assert_eq!(max.sample::<$ty, _>(Open01), 1.0 - $EPSILON / 2.0); + assert_eq!(max.sample::<$ty, _>(Open01), $ty::splat(1.0) - $EPSILON / two); } }; } @@ -296,16 +300,16 @@ mod tests { // non-SIMD types; we assume this pattern continues across all // SIMD types. 
- test_samples(&Standard, f32x2::new(0.0, 0.0), &[ - f32x2::new(0.0035963655, 0.7346052), - f32x2::new(0.09778172, 0.20298547), - f32x2::new(0.34296435, 0.81664366), + test_samples(&Standard, f32x2::from([0.0, 0.0]), &[ + f32x2::from([0.0035963655, 0.7346052]), + f32x2::from([0.09778172, 0.20298547]), + f32x2::from([0.34296435, 0.81664366]), ]); - test_samples(&Standard, f64x2::new(0.0, 0.0), &[ - f64x2::new(0.7346051961657583, 0.20298547462974248), - f64x2::new(0.8166436635290655, 0.7423708925400552), - f64x2::new(0.16387782224016323, 0.9087068770169618), + test_samples(&Standard, f64x2::from([0.0, 0.0]), &[ + f64x2::from([0.7346051961657583, 0.20298547462974248]), + f64x2::from([0.8166436635290655, 0.7423708925400552]), + f64x2::from([0.16387782224016323, 0.9087068770169618]), ]); } } diff --git a/src/distributions/integer.rs b/src/distributions/integer.rs index 19ce71599cb..d905044ed1e 100644 --- a/src/distributions/integer.rs +++ b/src/distributions/integer.rs @@ -11,12 +11,17 @@ use crate::distributions::{Distribution, Standard}; use crate::Rng; #[cfg(all(target_arch = "x86", feature = "simd_support"))] +use core::arch::x86::__m512i; +#[cfg(target_arch = "x86")] use core::arch::x86::{__m128i, __m256i}; #[cfg(all(target_arch = "x86_64", feature = "simd_support"))] +use core::arch::x86_64::__m512i; +#[cfg(target_arch = "x86_64")] use core::arch::x86_64::{__m128i, __m256i}; use core::num::{NonZeroU16, NonZeroU32, NonZeroU64, NonZeroU8, NonZeroUsize, NonZeroU128}; -#[cfg(feature = "simd_support")] use packed_simd::*; +#[cfg(feature = "simd_support")] use core::simd::*; +use core::mem; impl Distribution for Standard { #[inline] @@ -109,53 +114,63 @@ impl_nzint!(NonZeroU64, NonZeroU64::new); impl_nzint!(NonZeroU128, NonZeroU128::new); impl_nzint!(NonZeroUsize, NonZeroUsize::new); -#[cfg(feature = "simd_support")] -macro_rules! simd_impl { - ($(($intrinsic:ident, $vec:ty),)+) => {$( +macro_rules! intrinsic_impl { + ($($intrinsic:ident),+) => {$( + /// Available only on x86/64 platforms impl Distribution<$intrinsic> for Standard { #[inline] fn sample(&self, rng: &mut R) -> $intrinsic { - $intrinsic::from_bits(rng.gen::<$vec>()) + // On proper hardware, this should compile to SIMD instructions + // Verified on x86 Haswell with __m128i, __m256i + let mut buf = [0_u8; mem::size_of::<$intrinsic>()]; + rng.fill_bytes(&mut buf); + // x86 is little endian so no need for conversion + // SAFETY: we know [u8; N] and $intrinsic have the same size + unsafe { mem::transmute_copy(&buf) } } } )+}; +} - ($bits:expr,) => {}; - ($bits:expr, $ty:ty, $($ty_more:ty,)*) => { - simd_impl!($bits, $($ty_more,)*); - +#[cfg(feature = "simd_support")] +macro_rules! 
simd_impl {
+    ($($ty:ty),+) => {$(
+        /// Requires nightly Rust and the [`simd_support`] feature
+        ///
+        /// [`simd_support`]: https://github.com/rust-random/rand#crate-features
         impl Distribution<$ty> for Standard {
             #[inline]
             fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
-                let mut vec: $ty = Default::default();
-                unsafe {
-                    let ptr = &mut vec;
-                    let b_ptr = &mut *(ptr as *mut $ty as *mut [u8; $bits/8]);
-                    rng.fill_bytes(b_ptr);
+                // TODO: impl this generically once const generics are robust enough
+                let mut vec: Simd<u8, { mem::size_of::<$ty>() }> = Default::default();
+                rng.fill_bytes(vec.as_mut_array());
+                // NOTE: replace with `to_le` if added to core::simd
+                #[cfg(not(target_endian = "little"))]
+                {
+                    vec = vec.reverse();
                 }
-                vec.to_le()
+                // SAFETY: we know u8xN and $ty have the same size
+                unsafe { mem::transmute_copy(&vec) }
             }
         }
-    };
+    )+};
 }
 
 #[cfg(feature = "simd_support")]
-simd_impl!(16, u8x2, i8x2,);
-#[cfg(feature = "simd_support")]
-simd_impl!(32, u8x4, i8x4, u16x2, i16x2,);
-#[cfg(feature = "simd_support")]
-simd_impl!(64, u8x8, i8x8, u16x4, i16x4, u32x2, i32x2,);
-#[cfg(feature = "simd_support")]
-simd_impl!(128, u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2,);
-#[cfg(feature = "simd_support")]
-simd_impl!(256, u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, u64x4, i64x4,);
-#[cfg(feature = "simd_support")]
-simd_impl!(512, u8x64, i8x64, u16x32, i16x32, u32x16, i32x16, u64x8, i64x8,);
+simd_impl!(
+    i8x4, i8x8, i8x16, i8x32, i8x64, i16x2, i16x4, i16x8, i16x16, i16x32, i32x2, i32x4, i32x8,
+    i32x16, i64x2, i64x4, i64x8, isizex2, isizex4, isizex8, u8x4, u8x8, u8x16, u8x32, u8x64, u16x2,
+    u16x4, u16x8, u16x16, u16x32, u32x2, u32x4, u32x8, u32x16, u64x2, u64x4, u64x8, usizex2,
+    usizex4, usizex8
+);
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+intrinsic_impl!(__m128i, __m256i);
 
 #[cfg(all(
-    feature = "simd_support",
-    any(target_arch = "x86", target_arch = "x86_64")
+    any(target_arch = "x86", target_arch = "x86_64"),
+    feature = "simd_support"
 ))]
-simd_impl!((__m128i, u8x16), (__m256i, u8x32),);
+intrinsic_impl!(__m512i);
 
 #[cfg(test)]
 mod tests {
@@ -221,24 +236,19 @@ mod tests {
     {
         // We only test a sub-set of types here and make assumptions about the rest.
- test_samples(u8x2::default(), &[ - u8x2::new(9, 126), - u8x2::new(247, 167), - u8x2::new(111, 149), - ]); test_samples(u8x4::default(), &[ - u8x4::new(9, 126, 87, 132), - u8x4::new(247, 167, 123, 153), - u8x4::new(111, 149, 73, 120), + u8x4::from([9, 126, 87, 132]), + u8x4::from([247, 167, 123, 153]), + u8x4::from([111, 149, 73, 120]), ]); test_samples(u8x8::default(), &[ - u8x8::new(9, 126, 87, 132, 247, 167, 123, 153), - u8x8::new(111, 149, 73, 120, 68, 171, 98, 223), - u8x8::new(24, 121, 1, 50, 13, 46, 164, 20), + u8x8::from([9, 126, 87, 132, 247, 167, 123, 153]), + u8x8::from([111, 149, 73, 120, 68, 171, 98, 223]), + u8x8::from([24, 121, 1, 50, 13, 46, 164, 20]), ]); test_samples(i64x8::default(), &[ - i64x8::new( + i64x8::from([ -7387126082252079607, -2350127744969763473, 1487364411147516184, @@ -247,8 +257,8 @@ mod tests { 6022086574635100741, -5080089175222015595, -4066367846667249123, - ), - i64x8::new( + ]), + i64x8::from([ 9180885022207963908, 3095981199532211089, 6586075293021332726, @@ -257,8 +267,8 @@ mod tests { 5287129228749947252, 444726432079249540, -1587028029513790706, - ), - i64x8::new( + ]), + i64x8::from([ 6075236523189346388, 1351763722368165432, -6192309979959753740, @@ -267,7 +277,7 @@ mod tests { 7522501477800909500, -1837258847956201231, -586926753024886735, - ), + ]), ]); } } diff --git a/src/distributions/mod.rs b/src/distributions/mod.rs index 05ca80606b0..9b0a8867000 100644 --- a/src/distributions/mod.rs +++ b/src/distributions/mod.rs @@ -149,8 +149,14 @@ use crate::Rng; /// * `bool`: Generates `false` or `true`, each with probability 0.5. /// * Floating point types (`f32` and `f64`): Uniformly distributed in the /// half-open range `[0, 1)`. See notes below. -/// * Wrapping integers (`Wrapping`), besides the type identical to their +/// * Wrapping integers ([`Wrapping`]), besides the type identical to their /// normal integer variants. +/// * Non-zero integers ([`NonZeroU8`]), which are like their normal integer +/// variants but cannot produce zero. +/// * SIMD types like x86's [`__m128i`], `std::simd`'s [`u32x4`]/[`f32x4`]/ +/// [`mask32x4`] (requires [`simd_support`]), where each lane is distributed +/// like their scalar `Standard` variants. See the list of `Standard` +/// implementations for more. /// /// The `Standard` distribution also supports generation of the following /// compound types where all component types are supported: @@ -213,6 +219,13 @@ use crate::Rng; /// CPUs all methods have approximately equal performance). 
 ///
 /// [`Uniform`]: uniform::Uniform
+/// [`Wrapping`]: std::num::Wrapping
+/// [`NonZeroU8`]: std::num::NonZeroU8
+/// [`__m128i`]: https://doc.rust-lang.org/nightly/core/arch/x86/struct.__m128i.html
+/// [`u32x4`]: std::simd::u32x4
+/// [`f32x4`]: std::simd::f32x4
+/// [`mask32x4`]: std::simd::mask32x4
+/// [`simd_support`]: https://github.com/rust-random/rand#crate-features
 #[derive(Clone, Copy, Debug)]
 #[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))]
 pub struct Standard;
diff --git a/src/distributions/other.rs b/src/distributions/other.rs
index 03802a76d5f..b3b5e4e0201 100644
--- a/src/distributions/other.rs
+++ b/src/distributions/other.rs
@@ -22,6 +22,8 @@ use crate::Rng;
 use serde::{Serialize, Deserialize};
 #[cfg(feature = "min_const_gen")]
 use core::mem::{self, MaybeUninit};
+#[cfg(feature = "simd_support")]
+use core::simd::{Mask, Simd, LaneCount, SupportedLaneCount, MaskElement, SimdElement};
 
 // ----- Sampling distributions -----
 
@@ -145,6 +147,37 @@ impl Distribution<bool> for Standard {
     }
 }
 
+/// Requires nightly Rust and the [`simd_support`] feature
+///
+/// Note that on some hardware like x86/64 mask operations like [`_mm_blendv_epi8`]
+/// only care about a single bit. This means that you could use uniform random bits
+/// directly:
+///
+/// ```ignore
+/// // this may be faster...
+/// let x = unsafe { _mm_blendv_epi8(a.into(), b.into(), rng.gen::<__m128i>()) };
+///
+/// // ...than this
+/// let x = rng.gen::<mask8x16>().select(b, a);
+/// ```
+///
+/// Since most bits are unused you could also generate only as many bits as you need.
+///
+/// [`_mm_blendv_epi8`]: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8&ig_expand=514/
+/// [`simd_support`]: https://github.com/rust-random/rand#crate-features
+#[cfg(feature = "simd_support")]
+impl<T, const LANES: usize> Distribution<Mask<T, LANES>> for Standard
+where
+    T: MaskElement + PartialOrd + SimdElement<Mask = T> + Default,
+    LaneCount<LANES>: SupportedLaneCount,
+    Standard: Distribution<Simd<T, LANES>>,
+{
+    #[inline]
+    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Mask<T, LANES> {
+        rng.gen().lanes_lt(Simd::default())
+    }
+}
+
 macro_rules! tuple_impl {
     // use variables to indicate the arity of the tuple
     ($($tyvar:ident),* ) => {
diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs
index 261357b2456..d9197c68045 100644
--- a/src/distributions/uniform.rs
+++ b/src/distributions/uniform.rs
@@ -110,15 +110,17 @@ use core::time::Duration;
 use core::ops::{Range, RangeInclusive};
 
 use crate::distributions::float::IntoFloat;
-use crate::distributions::utils::{BoolAsSIMD, FloatAsSIMD, FloatSIMDUtils, WideningMultiply};
+use crate::distributions::utils::{BoolAsSIMD, FloatAsSIMD, FloatSIMDUtils, IntAsSIMD, WideningMultiply};
 use crate::distributions::Distribution;
+#[cfg(feature = "simd_support")]
+use crate::distributions::Standard;
 use crate::{Rng, RngCore};
 
 #[cfg(not(feature = "std"))]
 #[allow(unused_imports)] // rustc doesn't detect that this is actually used
 use crate::distributions::utils::Float;
 
-#[cfg(feature = "simd_support")] use packed_simd::*;
+#[cfg(feature = "simd_support")] use core::simd::*;
 
 #[cfg(feature = "serde1")]
 use serde::{Serialize, Deserialize};
@@ -571,21 +573,30 @@ uniform_int_impl! { u128, u128, u128 }
 
 #[cfg(feature = "simd_support")]
 macro_rules! uniform_simd_int_impl {
-    ($ty:ident, $unsigned:ident, $u_scalar:ident) => {
+    ($ty:ident, $unsigned:ident) => {
         // The "pick the largest zone that can fit in an `u32`" optimization
        // is less useful here. Multiple lanes complicate things, we don't
        // know the PRNG's minimal output size, and casting to a larger vector
         // is generally a bad idea for SIMD performance. The user can still
         // implement it manually.
-
-        // TODO: look into `Uniform::<u32x4>::new(0u32, 100)` functionality
-        // perhaps `impl SampleUniform for $u_scalar`?
-
-        impl SampleUniform for $ty {
-            type Sampler = UniformInt<$ty>;
+        impl<const LANES: usize> SampleUniform for Simd<$ty, LANES>
+        where
+            LaneCount<LANES>: SupportedLaneCount,
+            Simd<$unsigned, LANES>:
+                WideningMultiply<Output = (Simd<$unsigned, LANES>, Simd<$unsigned, LANES>)>,
+            Standard: Distribution<Simd<$unsigned, LANES>>,
+        {
+            type Sampler = UniformInt<Simd<$ty, LANES>>;
         }
 
-        impl UniformSampler for UniformInt<$ty> {
-            type X = $ty;
+        impl<const LANES: usize> UniformSampler for UniformInt<Simd<$ty, LANES>>
+        where
+            LaneCount<LANES>: SupportedLaneCount,
+            Simd<$unsigned, LANES>:
+                WideningMultiply<Output = (Simd<$unsigned, LANES>, Simd<$unsigned, LANES>)>,
+            Standard: Distribution<Simd<$unsigned, LANES>>,
+        {
+            type X = Simd<$ty, LANES>;
 
             #[inline] // if the range is constant, this helps LLVM to do the
                       // calculations at compile-time.
             fn new<B1, B2>(low_b: B1, high_b: B2) -> Self
             where
                 B1: SampleBorrow<Self::X> + Sized,
                 B2: SampleBorrow<Self::X> + Sized,
             {
                 let low = *low_b.borrow();
                 let high = *high_b.borrow();
-                assert!(low.lt(high).all(), "Uniform::new called with `low >= high`");
-                UniformSampler::new_inclusive(low, high - 1)
+                assert!(low.lanes_lt(high).all(), "Uniform::new called with `low >= high`");
+                UniformSampler::new_inclusive(low, high - Simd::splat(1))
             }
 
             #[inline] // if the range is constant, this helps LLVM to do the
                       // calculations at compile-time.
             fn new_inclusive<B1, B2>(low_b: B1, high_b: B2) -> Self
             where
                 B1: SampleBorrow<Self::X> + Sized,
                 B2: SampleBorrow<Self::X> + Sized,
             {
                 let low = *low_b.borrow();
                 let high = *high_b.borrow();
-                assert!(low.le(high).all(),
+                assert!(low.lanes_le(high).all(),
                         "Uniform::new_inclusive called with `low > high`");
-                let unsigned_max = ::core::$u_scalar::MAX;
+                let unsigned_max = Simd::splat(::core::$unsigned::MAX);
 
-                // NOTE: these may need to be replaced with explicitly
-                // wrapping operations if `packed_simd` changes
-                let range: $unsigned = ((high - low) + 1).cast();
+                // NOTE: all `Simd` operations are inherently wrapping,
+                // see https://doc.rust-lang.org/std/simd/struct.Simd.html
+                let range: Simd<$unsigned, LANES> = ((high - low) + Simd::splat(1)).cast();
                 // `% 0` will panic at runtime.
-                let not_full_range = range.gt($unsigned::splat(0));
+                let not_full_range = range.lanes_gt(Simd::splat(0));
                 // replacing 0 with `unsigned_max` allows a faster `select`
                 // with bitwise OR
-                let modulo = not_full_range.select(range, $unsigned::splat(unsigned_max));
+                let modulo = not_full_range.select(range, unsigned_max);
                 // wrapping addition
-                let ints_to_reject = (unsigned_max - range + 1) % modulo;
+                let ints_to_reject = (unsigned_max - range + Simd::splat(1)) % modulo;
                 // When `range` is 0, `lo` of `v.wmul(range)` will always be
                 // zero which means only one sample is needed.
                 let zone = unsigned_max - ints_to_reject;
@@ -634,8 +645,8 @@ macro_rules! uniform_simd_int_impl {
             }
 
             fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Self::X {
-                let range: $unsigned = self.range.cast();
-                let zone: $unsigned = self.z.cast();
+                let range: Simd<$unsigned, LANES> = self.range.cast();
+                let zone: Simd<$unsigned, LANES> = self.z.cast();
 
                 // This might seem very slow, generating a whole new
                 // SIMD vector for every sample rejection. For most uses
                 // though, the chance of rejection is small and avoiding
                 // shuffling the vector provides, on average, a large gain
                 // in performance. The replacement method does however add a little
                 // overhead. Benchmarking or calculating probabilities might
                 // reveal contexts where this replacement method is slower.
- let mut v: $unsigned = rng.gen(); + let mut v: Simd<$unsigned, LANES> = rng.gen(); loop { let (hi, lo) = v.wmul(range); - let mask = lo.le(zone); + let mask = lo.lanes_le(zone); if mask.all() { - let hi: $ty = hi.cast(); + let hi: Simd<$ty, LANES> = hi.cast(); // wrapping addition let result = self.low + hi; // `select` here compiles to a blend operation // When `range.eq(0).none()` the compare and blend // operations are avoided. - let v: $ty = v.cast(); - return range.gt($unsigned::splat(0)).select(result, v); + let v: Simd<$ty, LANES> = v.cast(); + return range.lanes_gt(Simd::splat(0)).select(result, v); } // Replace only the failing lanes v = mask.select(v, rng.gen()); @@ -668,51 +679,16 @@ macro_rules! uniform_simd_int_impl { }; // bulk implementation - ($(($unsigned:ident, $signed:ident),)+ $u_scalar:ident) => { + ($(($unsigned:ident, $signed:ident)),+) => { $( - uniform_simd_int_impl!($unsigned, $unsigned, $u_scalar); - uniform_simd_int_impl!($signed, $unsigned, $u_scalar); + uniform_simd_int_impl!($unsigned, $unsigned); + uniform_simd_int_impl!($signed, $unsigned); )+ }; } #[cfg(feature = "simd_support")] -uniform_simd_int_impl! { - (u64x2, i64x2), - (u64x4, i64x4), - (u64x8, i64x8), - u64 -} - -#[cfg(feature = "simd_support")] -uniform_simd_int_impl! { - (u32x2, i32x2), - (u32x4, i32x4), - (u32x8, i32x8), - (u32x16, i32x16), - u32 -} - -#[cfg(feature = "simd_support")] -uniform_simd_int_impl! { - (u16x2, i16x2), - (u16x4, i16x4), - (u16x8, i16x8), - (u16x16, i16x16), - (u16x32, i16x32), - u16 -} - -#[cfg(feature = "simd_support")] -uniform_simd_int_impl! { - (u8x2, i8x2), - (u8x4, i8x4), - (u8x8, i8x8), - (u8x16, i8x16), - (u8x32, i8x32), - (u8x64, i8x64), - u8 -} +uniform_simd_int_impl! { (u8, i8), (u16, i16), (u32, i32), (u64, i64) } impl SampleUniform for char { type Sampler = UniformChar; @@ -847,7 +823,7 @@ macro_rules! uniform_float_impl { loop { let mask = (scale * max_rand + low).ge_mask(high); - if mask.none() { + if !mask.any() { break; } scale = scale.decrease_masked(mask); @@ -886,7 +862,7 @@ macro_rules! uniform_float_impl { loop { let mask = (scale * max_rand + low).gt_mask(high); - if mask.none() { + if !mask.any() { break; } scale = scale.decrease_masked(mask); @@ -899,11 +875,11 @@ macro_rules! uniform_float_impl { fn sample(&self, rng: &mut R) -> Self::X { // Generate a value in the range [1, 2) - let value1_2 = (rng.gen::<$uty>() >> $bits_to_discard).into_float_with_exponent(0); + let value1_2 = (rng.gen::<$uty>() >> $uty::splat($bits_to_discard)).into_float_with_exponent(0); // Get a value in the range [0, 1) in order to avoid // overflowing into infinity when multiplying with scale - let value0_1 = value1_2 - 1.0; + let value0_1 = value1_2 - <$ty>::splat(1.0); // We don't use `f64::mul_add`, because it is not available with // `no_std`. Furthermore, it is slower for some targets (but @@ -939,11 +915,11 @@ macro_rules! uniform_float_impl { loop { // Generate a value in the range [1, 2) let value1_2 = - (rng.gen::<$uty>() >> $bits_to_discard).into_float_with_exponent(0); + (rng.gen::<$uty>() >> $uty::splat($bits_to_discard)).into_float_with_exponent(0); // Get a value in the range [0, 1) in order to avoid // overflowing into infinity when multiplying with scale - let value0_1 = value1_2 - 1.0; + let value0_1 = value1_2 - <$ty>::splat(1.0); // Doing multiply before addition allows some architectures // to use a single instruction. 
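The lane-replacement rejection loop above is easier to follow outside the macro. The sketch below is not part of the patch: it hard-codes `u32x4`, inlines the widening multiply instead of going through the `wmul` trait, uses the later `simd_*` comparison names (see PATCH 4/5), assumes every lane's `range` is non-zero, and assumes nightly Rust with `portable_simd` plus this series' `simd_support` feature for `rng.gen::<u32x4>()`:

```rust
#![feature(portable_simd)]
use std::simd::*;
use rand::prelude::*;

/// Uniform sample in [0, range) per lane via widening multiply-shift,
/// redrawing only the lanes whose low product lands in the rejection zone.
fn sample_below<R: Rng>(rng: &mut R, range: u32x4, zone: u32x4) -> u32x4 {
    let mut v: u32x4 = rng.gen();
    loop {
        // 64-bit product per lane: `hi` is the candidate sample,
        // `lo` is the acceptance test (the same split `wmul` computes).
        let wide = v.cast::<u64>() * range.cast::<u64>();
        let hi: u32x4 = (wide >> u64x4::splat(32)).cast();
        let lo: u32x4 = wide.cast();
        let mask = lo.simd_le(zone);
        if mask.all() {
            return hi;
        }
        // Keep the accepted lanes; draw fresh values only for the rest.
        v = mask.select(v, rng.gen());
    }
}
```

Only the lanes that fail the zone test consume fresh randomness; that is the trade-off the comment above weighs against regenerating the whole vector on every rejection.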
@@ -1184,7 +1160,7 @@ mod tests { _ => panic!("`UniformDurationMode` was not serialized/deserialized correctly") } } - + #[test] #[cfg(feature = "serde1")] fn test_uniform_serialization() { @@ -1289,8 +1265,8 @@ mod tests { ($ty::splat(10), $ty::splat(127)), ($ty::splat($scalar::MIN), $ty::splat($scalar::MAX)), ], - |x: $ty, y| x.le(y).all(), - |x: $ty, y| x.lt(y).all() + |x: $ty, y| x.lanes_le(y).all(), + |x: $ty, y| x.lanes_lt(y).all() );)* }}; } @@ -1298,8 +1274,8 @@ mod tests { #[cfg(feature = "simd_support")] { - t!(u8x2, u8x4, u8x8, u8x16, u8x32, u8x64 => u8); - t!(i8x2, i8x4, i8x8, i8x16, i8x32, i8x64 => i8); + t!(u8x4, u8x8, u8x16, u8x32, u8x64 => u8); + t!(i8x4, i8x8, i8x16, i8x32, i8x64 => i8); t!(u16x2, u16x4, u16x8, u16x16, u16x32 => u16); t!(i16x2, i16x4, i16x8, i16x16, i16x32 => i16); t!(u32x2, u32x4, u32x8, u32x16 => u32); @@ -1351,7 +1327,7 @@ mod tests { (-::core::$f_scalar::MAX * 0.2, ::core::$f_scalar::MAX * 0.7), ]; for &(low_scalar, high_scalar) in v.iter() { - for lane in 0..<$ty>::lanes() { + for lane in 0..<$ty>::LANES { let low = <$ty>::splat(0.0 as $f_scalar).replace(lane, low_scalar); let high = <$ty>::splat(1.0 as $f_scalar).replace(lane, high_scalar); let my_uniform = Uniform::new(low, high); @@ -1474,7 +1450,7 @@ mod tests { (::std::$f_scalar::NEG_INFINITY, ::std::$f_scalar::INFINITY), ]; for &(low_scalar, high_scalar) in v.iter() { - for lane in 0..<$ty>::lanes() { + for lane in 0..<$ty>::LANES { let low = <$ty>::splat(0.0 as $f_scalar).replace(lane, low_scalar); let high = <$ty>::splat(1.0 as $f_scalar).replace(lane, high_scalar); assert!(catch_unwind(|| range(low, high)).is_err()); diff --git a/src/distributions/utils.rs b/src/distributions/utils.rs index 89da5fd7aad..689a4a1d8ea 100644 --- a/src/distributions/utils.rs +++ b/src/distributions/utils.rs @@ -8,7 +8,7 @@ //! Math helper functions -#[cfg(feature = "simd_support")] use packed_simd::*; +#[cfg(feature = "simd_support")] use core::simd::*; pub(crate) trait WideningMultiply { @@ -45,7 +45,7 @@ macro_rules! wmul_impl { let y: $wide = self.cast(); let x: $wide = x.cast(); let tmp = y * x; - let hi: $ty = (tmp >> $shift).cast(); + let hi: $ty = (tmp >> Simd::splat($shift)).cast(); let lo: $ty = tmp.cast(); (hi, lo) } @@ -99,19 +99,20 @@ macro_rules! wmul_impl_large { #[inline(always)] fn wmul(self, b: $ty) -> Self::Output { // needs wrapping multiplication - const LOWER_MASK: $scalar = !0 >> $half; + const LOWER_MASK: $ty = <$ty>::splat(!0 >> $half); + const HALF: $ty = <$ty>::splat($half); let mut low = (self & LOWER_MASK) * (b & LOWER_MASK); - let mut t = low >> $half; + let mut t = low >> HALF; low &= LOWER_MASK; - t += (self >> $half) * (b & LOWER_MASK); - low += (t & LOWER_MASK) << $half; - let mut high = t >> $half; - t = low >> $half; + t += (self >> HALF) * (b & LOWER_MASK); + low += (t & LOWER_MASK) << HALF; + let mut high = t >> HALF; + t = low >> HALF; low &= LOWER_MASK; - t += (b >> $half) * (self & LOWER_MASK); - low += (t & LOWER_MASK) << $half; - high += t >> $half; - high += (self >> $half) * (b >> $half); + t += (b >> HALF) * (self & LOWER_MASK); + low += (t & LOWER_MASK) << HALF; + high += t >> HALF; + high += (self >> HALF) * (b >> HALF); (high, low) } @@ -148,7 +149,6 @@ mod simd_wmul { #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; wmul_impl! { - (u8x2, u16x2), (u8x4, u16x4), (u8x8, u16x8), (u8x16, u16x16), @@ -167,16 +167,14 @@ mod simd_wmul { // means `wmul` can be implemented with only two instructions. #[allow(unused_macros)] macro_rules! 
wmul_impl_16 { - ($ty:ident, $intrinsic:ident, $mulhi:ident, $mullo:ident) => { + ($ty:ident, $mulhi:ident, $mullo:ident) => { impl WideningMultiply for $ty { type Output = ($ty, $ty); #[inline(always)] fn wmul(self, x: $ty) -> Self::Output { - let b = $intrinsic::from_bits(x); - let a = $intrinsic::from_bits(self); - let hi = $ty::from_bits(unsafe { $mulhi(a, b) }); - let lo = $ty::from_bits(unsafe { $mullo(a, b) }); + let hi = unsafe { $mulhi(self.into(), x.into()) }.into(); + let lo = unsafe { $mullo(self.into(), x.into()) }.into(); (hi, lo) } } @@ -184,11 +182,11 @@ mod simd_wmul { } #[cfg(target_feature = "sse2")] - wmul_impl_16! { u16x8, __m128i, _mm_mulhi_epu16, _mm_mullo_epi16 } + wmul_impl_16! { u16x8, _mm_mulhi_epu16, _mm_mullo_epi16 } #[cfg(target_feature = "avx2")] - wmul_impl_16! { u16x16, __m256i, _mm256_mulhi_epu16, _mm256_mullo_epi16 } - // FIXME: there are no `__m512i` types in stdsimd yet, so `wmul::` - // cannot use the same implementation. + wmul_impl_16! { u16x16, _mm256_mulhi_epu16, _mm256_mullo_epi16 } + #[cfg(target_feature = "avx512bw")] + wmul_impl_16! { u16x32, _mm512_mulhi_epu16, _mm512_mullo_epi16 } wmul_impl! { (u32x2, u64x2), @@ -199,6 +197,7 @@ mod simd_wmul { // TODO: optimize, this seems to seriously slow things down wmul_impl_large! { (u8x64,) u8, 4 } + #[cfg(not(target_feature = "avx512bw"))] wmul_impl_large! { (u16x32,) u16, 8 } wmul_impl_large! { (u32x16,) u32, 16 } wmul_impl_large! { (u64x2, u64x4, u64x8,) u64, 32 } @@ -229,6 +228,10 @@ pub(crate) trait FloatSIMDUtils { // value, not by retaining the binary representation. type UInt; fn cast_from_int(i: Self::UInt) -> Self; + + type Scalar; + fn replace(self, index: usize, new_value: Self::Scalar) -> Self; + fn extract(self, index: usize) -> Self::Scalar; } /// Implement functions available in std builds but missing from core primitives @@ -243,26 +246,23 @@ pub(crate) trait Float: Sized { /// Implement functions on f32/f64 to give them APIs similar to SIMD types pub(crate) trait FloatAsSIMD: Sized { - #[inline(always)] - fn lanes() -> usize { - 1 - } + const LANES: usize = 1; #[inline(always)] fn splat(scalar: Self) -> Self { scalar } +} + +pub(crate) trait IntAsSIMD: Sized { #[inline(always)] - fn extract(self, index: usize) -> Self { - debug_assert_eq!(index, 0); - self - } - #[inline(always)] - fn replace(self, index: usize, new_value: Self) -> Self { - debug_assert_eq!(index, 0); - new_value + fn splat(scalar: Self) -> Self { + scalar } } +impl IntAsSIMD for u32 {} +impl IntAsSIMD for u64 {} + pub(crate) trait BoolAsSIMD: Sized { fn any(self) -> bool; fn all(self) -> bool; @@ -308,6 +308,7 @@ macro_rules! scalar_float_impl { impl FloatSIMDUtils for $ty { type Mask = bool; + type Scalar = $ty; type UInt = $uty; #[inline(always)] @@ -350,6 +351,18 @@ macro_rules! scalar_float_impl { fn cast_from_int(i: Self::UInt) -> Self { i as $ty } + + #[inline] + fn replace(self, index: usize, new_value: Self::Scalar) -> Self { + debug_assert_eq!(index, 0); + new_value + } + + #[inline] + fn extract(self, index: usize) -> Self::Scalar { + debug_assert_eq!(index, 0); + self + } } impl FloatAsSIMD for $ty {} @@ -362,42 +375,42 @@ scalar_float_impl!(f64, u64); #[cfg(feature = "simd_support")] macro_rules! 
simd_impl { - ($ty:ident, $f_scalar:ident, $mty:ident, $uty:ident) => { - impl FloatSIMDUtils for $ty { - type Mask = $mty; - type UInt = $uty; + ($fty:ident, $uty:ident) => { + impl FloatSIMDUtils for Simd<$fty, LANES> + where LaneCount: SupportedLaneCount + { + type Mask = Mask<<$fty as SimdElement>::Mask, LANES>; + type Scalar = $fty; + type UInt = Simd<$uty, LANES>; #[inline(always)] fn all_lt(self, other: Self) -> bool { - self.lt(other).all() + self.lanes_lt(other).all() } #[inline(always)] fn all_le(self, other: Self) -> bool { - self.le(other).all() + self.lanes_le(other).all() } #[inline(always)] fn all_finite(self) -> bool { - self.finite_mask().all() + self.is_finite().all() } #[inline(always)] fn finite_mask(self) -> Self::Mask { - // This can possibly be done faster by checking bit patterns - let neg_inf = $ty::splat(::core::$f_scalar::NEG_INFINITY); - let pos_inf = $ty::splat(::core::$f_scalar::INFINITY); - self.gt(neg_inf) & self.lt(pos_inf) + self.is_finite() } #[inline(always)] fn gt_mask(self, other: Self) -> Self::Mask { - self.gt(other) + self.lanes_gt(other) } #[inline(always)] fn ge_mask(self, other: Self) -> Self::Mask { - self.ge(other) + self.lanes_ge(other) } #[inline(always)] @@ -406,24 +419,32 @@ macro_rules! simd_impl { // true, and 0 for false. Adding that to the binary // representation of a float means subtracting one from // the binary representation, resulting in the next lower - // value representable by $ty. This works even when the + // value representable by $fty. This works even when the // current value is infinity. debug_assert!(mask.any(), "At least one lane must be set"); - <$ty>::from_bits(<$uty>::from_bits(self) + <$uty>::from_bits(mask)) + Self::from_bits(self.to_bits() + mask.to_int().cast()) } #[inline] fn cast_from_int(i: Self::UInt) -> Self { i.cast() } + + #[inline] + fn replace(mut self, index: usize, new_value: Self::Scalar) -> Self { + self.as_mut_array()[index] = new_value; + self + } + + #[inline] + fn extract(self, index: usize) -> Self::Scalar { + self.as_array()[index] + } } }; } -#[cfg(feature="simd_support")] simd_impl! { f32x2, f32, m32x2, u32x2 } -#[cfg(feature="simd_support")] simd_impl! { f32x4, f32, m32x4, u32x4 } -#[cfg(feature="simd_support")] simd_impl! { f32x8, f32, m32x8, u32x8 } -#[cfg(feature="simd_support")] simd_impl! { f32x16, f32, m32x16, u32x16 } -#[cfg(feature="simd_support")] simd_impl! { f64x2, f64, m64x2, u64x2 } -#[cfg(feature="simd_support")] simd_impl! { f64x4, f64, m64x4, u64x4 } -#[cfg(feature="simd_support")] simd_impl! 
{ f64x8, f64, m64x8, u64x8 } +#[cfg(feature = "simd_support")] +simd_impl!(f32, u32); +#[cfg(feature = "simd_support")] +simd_impl!(f64, u64); diff --git a/src/lib.rs b/src/lib.rs index 6d847180111..ef5c8a5a56c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -49,7 +49,7 @@ #![deny(missing_debug_implementations)] #![doc(test(attr(allow(unused_variables), deny(warnings))))] #![no_std] -#![cfg_attr(feature = "simd_support", feature(stdsimd))] +#![cfg_attr(feature = "simd_support", feature(stdsimd, portable_simd))] #![cfg_attr(doc_cfg, feature(doc_cfg))] #![allow( clippy::float_cmp, From d4b8748004ee5c0fa41aa6b430ca701dd982605b Mon Sep 17 00:00:00 2001 From: TheIronBorn Date: Mon, 11 Jul 2022 11:29:13 -0700 Subject: [PATCH 2/5] fix simd ints, clarify mask behavior --- src/distributions/integer.rs | 27 +++++++++------------------ src/distributions/other.rs | 2 ++ 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/src/distributions/integer.rs b/src/distributions/integer.rs index d905044ed1e..c660095bdf7 100644 --- a/src/distributions/integer.rs +++ b/src/distributions/integer.rs @@ -138,31 +138,22 @@ macro_rules! simd_impl { /// Requires nightly Rust and the [`simd_support`] feature /// /// [`simd_support`]: https://github.com/rust-random/rand#crate-features - impl Distribution<$ty> for Standard { + impl Distribution> for Standard + where + LaneCount: SupportedLaneCount, + { #[inline] - fn sample(&self, rng: &mut R) -> $ty { - // TODO: impl this generically once const generics are robust enough - let mut vec: Simd() }> = Default::default(); - rng.fill_bytes(vec.as_mut_array()); - // NOTE: replace with `to_le` if added to core::simd - #[cfg(not(target_endian = "little"))] - { - vec = vec.reverse(); - } - // SAFETY: we know u8xN and $ty have the same size - unsafe { mem::transmute_copy(&vec) } + fn sample(&self, rng: &mut R) -> Simd<$ty, LANES> { + let mut vec = Simd::default(); + rng.fill(vec.as_mut_array().as_mut_slice()); + vec } } )+}; } #[cfg(feature = "simd_support")] -simd_impl!( - i8x4, i8x8, i8x16, i8x32, i8x64, i16x2, i16x4, i16x8, i16x16, i16x32, i32x2, i32x4, i32x8, - i32x16, i64x2, i64x4, i64x8, isizex2, isizex4, isizex8, u8x4, u8x8, u8x16, u8x32, u8x64, u16x2, - u16x4, u16x8, u16x16, u16x32, u32x2, u32x4, u32x8, u32x16, u64x2, u64x4, u64x8, usizex2, - usizex4, usizex8 -); +simd_impl!(u8, i8, u16, i16, u32, i32, u64, i64, usize, isize); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] intrinsic_impl!(__m128i, __m256i); diff --git a/src/distributions/other.rs b/src/distributions/other.rs index b3b5e4e0201..b53977e1052 100644 --- a/src/distributions/other.rs +++ b/src/distributions/other.rs @@ -174,6 +174,8 @@ where { #[inline] fn sample(&self, rng: &mut R) -> Mask { + // `MaskElement` must be a signed integer, so this is equivalent + // to the scalar `i32 < 0` method rng.gen().lanes_lt(Simd::default()) } } From f89f15fc1f846c53de4cd830c997314112dab0e4 Mon Sep 17 00:00:00 2001 From: TheIronBorn Date: Thu, 4 Aug 2022 20:42:35 +0000 Subject: [PATCH 3/5] fix doc link --- src/distributions/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/distributions/mod.rs b/src/distributions/mod.rs index 9b0a8867000..716536d4ee8 100644 --- a/src/distributions/mod.rs +++ b/src/distributions/mod.rs @@ -221,7 +221,7 @@ use crate::Rng; /// [`Uniform`]: uniform::Uniform /// [`Wrapping`]: std::num::Wrapping /// [`NonZeroU8`]: std::num::NonZeroU8 -/// [`__m128i`]: https://doc.rust-lang.org/nightly/core/arch/x86/struct.__m128i.html +/// [`__m128i`]: 
https://doc.rust-lang.org/core/arch/x86/struct.__m128i.html
 /// [`u32x4`]: std::simd::u32x4
 /// [`f32x4`]: std::simd::f32x4
 /// [`mask32x4`]: std::simd::mask32x4

From 2fab15dcd716fc568c9ca26e75465eef19a1f07e Mon Sep 17 00:00:00 2001
From: TheIronBorn
Date: Sun, 7 Aug 2022 17:22:19 -0700
Subject: [PATCH 4/5] fix stdsimd, add mask opt notes

---
 src/distributions/integer.rs |  6 +++---
 src/distributions/other.rs   | 15 +++++++++++----
 src/distributions/uniform.rs | 14 +++++++-------
 src/distributions/utils.rs   | 36 ++++++++++++++++++------------------
 4 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/src/distributions/integer.rs b/src/distributions/integer.rs
index c660095bdf7..418eea9ff13 100644
--- a/src/distributions/integer.rs
+++ b/src/distributions/integer.rs
@@ -114,7 +114,7 @@ impl_nzint!(NonZeroU64, NonZeroU64::new);
 impl_nzint!(NonZeroU128, NonZeroU128::new);
 impl_nzint!(NonZeroUsize, NonZeroUsize::new);
 
-macro_rules! intrinsic_impl {
+macro_rules! x86_intrinsic_impl {
     ($($intrinsic:ident),+) => {$(
         /// Available only on x86/64 platforms
         impl Distribution<$intrinsic> for Standard {
@@ -156,12 +156,12 @@ macro_rules! simd_impl {
 simd_impl!(u8, i8, u16, i16, u32, i32, u64, i64, usize, isize);
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-intrinsic_impl!(__m128i, __m256i);
+x86_intrinsic_impl!(__m128i, __m256i);
 #[cfg(all(
     any(target_arch = "x86", target_arch = "x86_64"),
     feature = "simd_support"
 ))]
-intrinsic_impl!(__m512i);
+x86_intrinsic_impl!(__m512i);
 
 #[cfg(test)]
 mod tests {
diff --git a/src/distributions/other.rs b/src/distributions/other.rs
index b53977e1052..cb1b801f5b5 100644
--- a/src/distributions/other.rs
+++ b/src/distributions/other.rs
@@ -23,7 +23,7 @@ use serde::{Serialize, Deserialize};
 #[cfg(feature = "min_const_gen")]
 use core::mem::{self, MaybeUninit};
 #[cfg(feature = "simd_support")]
-use core::simd::{Mask, Simd, LaneCount, SupportedLaneCount, MaskElement, SimdElement};
+use core::simd::*;
 
 // ----- Sampling distributions -----
 
@@ -161,22 +161,29 @@ impl Distribution<bool> for Standard {
 /// let x = rng.gen::<mask8x16>().select(b, a);
 /// ```
 ///
-/// Since most bits are unused you could also generate only as many bits as you need.
+/// Since most bits are unused you could also generate only as many bits as you need, i.e.:
+/// ```
+/// let x = u16x8::splat(rng.gen::<u8> as u16);
+/// let mask = u16x8::splat(1) << u16x8::from([0, 1, 2, 3, 4, 5, 6, 7]);
+/// let rand_mask = (x & mask).simd_eq(mask);
+/// ```
 ///
 /// [`_mm_blendv_epi8`]: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8&ig_expand=514/
 /// [`simd_support`]: https://github.com/rust-random/rand#crate-features
 #[cfg(feature = "simd_support")]
 impl<T, const LANES: usize> Distribution<Mask<T, LANES>> for Standard
 where
-    T: MaskElement + PartialOrd + SimdElement<Mask = T> + Default,
+    T: MaskElement + Default,
     LaneCount<LANES>: SupportedLaneCount,
     Standard: Distribution<Simd<T, LANES>>,
+    Simd<T, LANES>: SimdPartialOrd<Mask = Mask<T, LANES>>,
 {
     #[inline]
     fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Mask<T, LANES> {
         // `MaskElement` must be a signed integer, so this is equivalent
         // to the scalar `i32 < 0` method
-        rng.gen().lanes_lt(Simd::default())
+        let var = rng.gen::<Simd<T, LANES>>();
+        var.simd_lt(Simd::default())
     }
 }
diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs
index d9197c68045..a7b4cb1a777 100644
--- a/src/distributions/uniform.rs
+++ b/src/distributions/uniform.rs
@@ -606,7 +606,7 @@ macro_rules!
uniform_simd_int_impl { { let low = *low_b.borrow(); let high = *high_b.borrow(); - assert!(low.lanes_lt(high).all(), "Uniform::new called with `low >= high`"); + assert!(low.simd_lt(high).all(), "Uniform::new called with `low >= high`"); UniformSampler::new_inclusive(low, high - Simd::splat(1)) } @@ -618,7 +618,7 @@ macro_rules! uniform_simd_int_impl { { let low = *low_b.borrow(); let high = *high_b.borrow(); - assert!(low.lanes_le(high).all(), + assert!(low.simd_le(high).all(), "Uniform::new_inclusive called with `low > high`"); let unsigned_max = Simd::splat(::core::$unsigned::MAX); @@ -626,7 +626,7 @@ macro_rules! uniform_simd_int_impl { // see https://doc.rust-lang.org/std/simd/struct.Simd.html let range: Simd<$unsigned, LANES> = ((high - low) + Simd::splat(1)).cast(); // `% 0` will panic at runtime. - let not_full_range = range.lanes_gt(Simd::splat(0)); + let not_full_range = range.simd_gt(Simd::splat(0)); // replacing 0 with `unsigned_max` allows a faster `select` // with bitwise OR let modulo = not_full_range.select(range, unsigned_max); @@ -660,7 +660,7 @@ macro_rules! uniform_simd_int_impl { let mut v: Simd<$unsigned, LANES> = rng.gen(); loop { let (hi, lo) = v.wmul(range); - let mask = lo.lanes_le(zone); + let mask = lo.simd_le(zone); if mask.all() { let hi: Simd<$ty, LANES> = hi.cast(); // wrapping addition @@ -669,7 +669,7 @@ macro_rules! uniform_simd_int_impl { // When `range.eq(0).none()` the compare and blend // operations are avoided. let v: Simd<$ty, LANES> = v.cast(); - return range.lanes_gt(Simd::splat(0)).select(result, v); + return range.simd_gt(Simd::splat(0)).select(result, v); } // Replace only the failing lanes v = mask.select(v, rng.gen()); @@ -1265,8 +1265,8 @@ mod tests { ($ty::splat(10), $ty::splat(127)), ($ty::splat($scalar::MIN), $ty::splat($scalar::MAX)), ], - |x: $ty, y| x.lanes_le(y).all(), - |x: $ty, y| x.lanes_lt(y).all() + |x: $ty, y| x.simd_le(y).all(), + |x: $ty, y| x.simd_lt(y).all() );)* }}; } diff --git a/src/distributions/utils.rs b/src/distributions/utils.rs index 689a4a1d8ea..895756462f9 100644 --- a/src/distributions/utils.rs +++ b/src/distributions/utils.rs @@ -99,20 +99,20 @@ macro_rules! wmul_impl_large { #[inline(always)] fn wmul(self, b: $ty) -> Self::Output { // needs wrapping multiplication - const LOWER_MASK: $ty = <$ty>::splat(!0 >> $half); - const HALF: $ty = <$ty>::splat($half); - let mut low = (self & LOWER_MASK) * (b & LOWER_MASK); - let mut t = low >> HALF; - low &= LOWER_MASK; - t += (self >> HALF) * (b & LOWER_MASK); - low += (t & LOWER_MASK) << HALF; - let mut high = t >> HALF; - t = low >> HALF; - low &= LOWER_MASK; - t += (b >> HALF) * (self & LOWER_MASK); - low += (t & LOWER_MASK) << HALF; - high += t >> HALF; - high += (self >> HALF) * (b >> HALF); + let lower_mask = <$ty>::splat(!0 >> $half); + let half = <$ty>::splat($half); + let mut low = (self & lower_mask) * (b & lower_mask); + let mut t = low >> half; + low &= lower_mask; + t += (self >> half) * (b & lower_mask); + low += (t & lower_mask) << half; + let mut high = t >> half; + t = low >> half; + low &= lower_mask; + t += (b >> half) * (self & lower_mask); + low += (t & lower_mask) << half; + high += t >> half; + high += (self >> half) * (b >> half); (high, low) } @@ -385,12 +385,12 @@ macro_rules! 
simd_impl {
 
         #[inline(always)]
         fn all_lt(self, other: Self) -> bool {
-            self.lanes_lt(other).all()
+            self.simd_lt(other).all()
         }
 
         #[inline(always)]
         fn all_le(self, other: Self) -> bool {
-            self.lanes_le(other).all()
+            self.simd_le(other).all()
         }
 
         #[inline(always)]
@@ -405,12 +405,12 @@ macro_rules! simd_impl {
 
         #[inline(always)]
         fn gt_mask(self, other: Self) -> Self::Mask {
-            self.lanes_gt(other)
+            self.simd_gt(other)
         }
 
         #[inline(always)]
         fn ge_mask(self, other: Self) -> Self::Mask {
-            self.lanes_ge(other)
+            self.simd_ge(other)
         }
 
         #[inline(always)]

From 949d70f6a5bcb74adb28c6d3183c92591122b988 Mon Sep 17 00:00:00 2001
From: TheIronBorn
Date: Sun, 7 Aug 2022 17:42:24 -0700
Subject: [PATCH 5/5] fix doc test

---
 src/distributions/other.rs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/distributions/other.rs b/src/distributions/other.rs
index cb1b801f5b5..184fce9d4e9 100644
--- a/src/distributions/other.rs
+++ b/src/distributions/other.rs
@@ -163,7 +163,12 @@ impl Distribution<bool> for Standard {
 ///
 /// Since most bits are unused you could also generate only as many bits as you need, i.e.:
 /// ```
-/// let x = u16x8::splat(rng.gen::<u8> as u16);
+/// #![feature(portable_simd)]
+/// use std::simd::*;
+/// use rand::prelude::*;
+/// let mut rng = thread_rng();
+///
+/// let x = u16x8::splat(rng.gen::<u8>() as u16);
 /// let mask = u16x8::splat(1) << u16x8::from([0, 1, 2, 3, 4, 5, 6, 7]);
 /// let rand_mask = (x & mask).simd_eq(mask);
 /// ```
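Taken together, the series enables downstream code along these lines. This is a usage sketch rather than part of the patch: it assumes a nightly toolchain with the `simd_support` feature enabled, the lane types (`u8x16`, `i32x4`, `mask32x4`) are arbitrary choices, and the printed values depend on the RNG:

```rust
#![feature(portable_simd)]
use std::simd::*;
use rand::distributions::Uniform;
use rand::prelude::*;

fn main() {
    let mut rng = thread_rng();

    // Every lane is an independent `Standard` sample.
    let bytes: u8x16 = rng.gen();

    // Per-lane uniform ranges via the generic SIMD `UniformInt`.
    let die = Uniform::new_inclusive(i32x4::splat(1), i32x4::splat(6));
    let rolls = die.sample(&mut rng);

    // A random mask: each lane is `true` with probability 1/2,
    // via the sign-bit test added in other.rs.
    let coin: mask32x4 = rng.gen();

    println!("{bytes:?} {rolls:?} {:?}", coin.select(rolls, i32x4::splat(0)));
}
```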