From b48e7e5496202a3a93b24060ec782b0eec08b67b Mon Sep 17 00:00:00 2001 From: Chris Denton Date: Sat, 6 Apr 2024 15:30:17 +0000 Subject: [PATCH 1/3] Add const UTF-8 to UTF-16 conversion macros `wide_str!` creates a null terminated UTF-16 string whereas `utf16!` just creates a UTF-16 string without adding a null. --- library/std/src/sys/pal/windows/api.rs | 94 ++++++++++++++++++++ library/std/src/sys/pal/windows/api/tests.rs | 16 ++++ library/std/src/sys/pal/windows/mod.rs | 5 +- 3 files changed, 113 insertions(+), 2 deletions(-) create mode 100644 library/std/src/sys/pal/windows/api/tests.rs diff --git a/library/std/src/sys/pal/windows/api.rs b/library/std/src/sys/pal/windows/api.rs index 90e1bff52a365..8613dba42d2dc 100644 --- a/library/std/src/sys/pal/windows/api.rs +++ b/library/std/src/sys/pal/windows/api.rs @@ -34,6 +34,100 @@ use core::ptr::addr_of; use super::c; +/// Creates a null-terminated UTF-16 string from a str. +pub macro wide_str($str:literal) {{ + const _: () = { + if core::slice::memchr::memchr(0, $str.as_bytes()).is_some() { + panic!("null terminated strings cannot contain interior nulls"); + } + }; + crate::sys::pal::windows::api::utf16!(concat!($str, '\0')) +}} + +/// Creates a UTF-16 string from a str without null termination. +pub macro utf16($str:expr) {{ + const UTF8: &str = $str; + const UTF16_LEN: usize = crate::sys::pal::windows::api::utf16_len(UTF8); + const UTF16: [u16; UTF16_LEN] = crate::sys::pal::windows::api::to_utf16(UTF8); + &UTF16 +}} + +#[cfg(test)] +mod tests; + +/// Gets the UTF-16 length of a UTF-8 string, for use in the wide_str macro. +pub const fn utf16_len(s: &str) -> usize { + let s = s.as_bytes(); + let mut i = 0; + let mut len = 0; + while i < s.len() { + // the length of a UTF-8 encoded code-point is given by the number of + // leading ones, except in the case of ASCII. + let utf8_len = match s[i].leading_ones() { + 0 => 1, + n => n as usize, + }; + i += utf8_len; + len += if utf8_len < 4 { 1 } else { 2 }; + } + len +} + +/// Const convert UTF-8 to UTF-16, for use in the wide_str macro. +/// +/// Note that this is designed for use in const contexts so is not optimized. +pub const fn to_utf16(s: &str) -> [u16; UTF16_LEN] { + let mut output = [0_u16; UTF16_LEN]; + let mut pos = 0; + let s = s.as_bytes(); + let mut i = 0; + while i < s.len() { + match s[i].leading_ones() { + // Decode UTF-8 based on its length. + // See https://en.wikipedia.org/wiki/UTF-8 + 0 => { + // ASCII is the same in both encodings + output[pos] = s[i] as u16; + i += 1; + pos += 1; + } + 2 => { + // Bits: 110xxxxx 10xxxxxx + output[pos] = ((s[i] as u16 & 0b11111) << 6) | (s[i + 1] as u16 & 0b111111); + i += 2; + pos += 1; + } + 3 => { + // Bits: 1110xxxx 10xxxxxx 10xxxxxx + output[pos] = ((s[i] as u16 & 0b1111) << 12) + | ((s[i + 1] as u16 & 0b111111) << 6) + | (s[i + 2] as u16 & 0b111111); + i += 3; + pos += 1; + } + 4 => { + // Bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + let mut c = ((s[i] as u32 & 0b111) << 18) + | ((s[i + 1] as u32 & 0b111111) << 12) + | ((s[i + 2] as u32 & 0b111111) << 6) + | (s[i + 3] as u32 & 0b111111); + // re-encode as UTF-16 (see https://en.wikipedia.org/wiki/UTF-16) + // - Subtract 0x10000 from the code point + // - For the high surrogate, shift right by 10 then add 0xD800 + // - For the low surrogate, take the low 10 bits then add 0xDC00 + c -= 0x10000; + output[pos] = ((c >> 10) + 0xD800) as u16; + output[pos + 1] = ((c & 0b1111111111) + 0xDC00) as u16; + i += 4; + pos += 2; + } + // valid UTF-8 cannot have any other values + _ => unreachable!(), + } + } + output +} + /// Helper method for getting the size of `T` as a u32. /// Errors at compile time if the size would overflow. /// diff --git a/library/std/src/sys/pal/windows/api/tests.rs b/library/std/src/sys/pal/windows/api/tests.rs new file mode 100644 index 0000000000000..fab022c7b93d8 --- /dev/null +++ b/library/std/src/sys/pal/windows/api/tests.rs @@ -0,0 +1,16 @@ +use crate::sys::pal::windows::api::{utf16, wide_str}; + +macro_rules! check_utf16 { + ($str:literal) => {{ + assert!(wide_str!($str).iter().copied().eq($str.encode_utf16().chain([0]))); + assert!(utf16!($str).iter().copied().eq($str.encode_utf16())); + }}; +} + +#[test] +fn test_utf16_macros() { + check_utf16!("hello world"); + check_utf16!("€4.50"); + check_utf16!("𨉟呐㗂越"); + check_utf16!("Pchnąć w tę łódź jeża lub ośm skrzyń fig"); +} diff --git a/library/std/src/sys/pal/windows/mod.rs b/library/std/src/sys/pal/windows/mod.rs index 6a561518fad65..e68954d447aff 100644 --- a/library/std/src/sys/pal/windows/mod.rs +++ b/library/std/src/sys/pal/windows/mod.rs @@ -5,6 +5,7 @@ use crate::io::ErrorKind; use crate::mem::MaybeUninit; use crate::os::windows::ffi::{OsStrExt, OsStringExt}; use crate::path::PathBuf; +use crate::sys::pal::windows::api::wide_str; use crate::time::Duration; pub use self::rand::hashmap_random_keys; @@ -12,6 +13,8 @@ pub use self::rand::hashmap_random_keys; #[macro_use] pub mod compat; +mod api; + pub mod alloc; pub mod args; pub mod c; @@ -41,8 +44,6 @@ cfg_if::cfg_if! { } } -mod api; - /// Map a Result to io::Result. trait IoResult { fn io_result(self) -> crate::io::Result; From 952d432666e6b1a8c76c332375e3483213532670 Mon Sep 17 00:00:00 2001 From: Chris Denton Date: Sat, 6 Apr 2024 03:15:06 +0000 Subject: [PATCH 2/3] Windows: set main thread name without reencoding --- library/std/src/sys/pal/windows/mod.rs | 2 +- library/std/src/sys/pal/windows/thread.rs | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/library/std/src/sys/pal/windows/mod.rs b/library/std/src/sys/pal/windows/mod.rs index e68954d447aff..a734c2bd4c71e 100644 --- a/library/std/src/sys/pal/windows/mod.rs +++ b/library/std/src/sys/pal/windows/mod.rs @@ -61,7 +61,7 @@ pub unsafe fn init(_argc: isize, _argv: *const *const u8, _sigpipe: u8) { // Normally, `thread::spawn` will call `Thread::set_name` but since this thread already // exists, we have to call it ourselves. - thread::Thread::set_name(&c"main"); + thread::Thread::set_name_wide(wide_str!("main")); } // SAFETY: must be called only once during runtime cleanup. diff --git a/library/std/src/sys/pal/windows/thread.rs b/library/std/src/sys/pal/windows/thread.rs index c0c63c3340f4f..9b1c5b34bbf27 100644 --- a/library/std/src/sys/pal/windows/thread.rs +++ b/library/std/src/sys/pal/windows/thread.rs @@ -59,13 +59,17 @@ impl Thread { pub fn set_name(name: &CStr) { if let Ok(utf8) = name.to_str() { if let Ok(utf16) = to_u16s(utf8) { - unsafe { - c::SetThreadDescription(c::GetCurrentThread(), utf16.as_ptr()); - }; + Self::set_name_wide(&utf16) }; }; } + pub fn set_name_wide(name: &[u16]) { + unsafe { + c::SetThreadDescription(c::GetCurrentThread(), name.as_ptr()); + }; + } + pub fn join(self) { let rc = unsafe { c::WaitForSingleObject(self.handle.as_raw_handle(), c::INFINITE) }; if rc == c::WAIT_FAILED { From 19f04a7d6878fc4c258ba3d4374e81c8bbeca2e0 Mon Sep 17 00:00:00 2001 From: Chris Denton Date: Mon, 8 Apr 2024 11:42:16 +0000 Subject: [PATCH 3/3] Add comment on UTF-16 surrogates --- library/std/src/sys/pal/windows/api.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/library/std/src/sys/pal/windows/api.rs b/library/std/src/sys/pal/windows/api.rs index 8613dba42d2dc..555ad581b8568 100644 --- a/library/std/src/sys/pal/windows/api.rs +++ b/library/std/src/sys/pal/windows/api.rs @@ -68,6 +68,8 @@ pub const fn utf16_len(s: &str) -> usize { n => n as usize, }; i += utf8_len; + // Note that UTF-16 surrogates (U+D800 to U+DFFF) are not encodable as UTF-8, + // so (unlike with WTF-8) we don't have to worry about how they'll get re-encoded. len += if utf8_len < 4 { 1 } else { 2 }; } len