Replace `thread_local::guard` on Windows with an FLS-based impl
diff --git a/library/std/src/sys/pal/windows/c/bindings.txt b/library/std/src/sys/pal/windows/c/bindings.txt index 044d304..50ea67a 100644 --- a/library/std/src/sys/pal/windows/c/bindings.txt +++ b/library/std/src/sys/pal/windows/c/bindings.txt
@@ -2263,6 +2263,7 @@ IPV6_MREQ IPV6_MULTICAST_LOOP IPV6_V6ONLY +IsThreadAFiber LINGER listen LocalFree
diff --git a/library/std/src/sys/pal/windows/c/windows_sys.rs b/library/std/src/sys/pal/windows/c/windows_sys.rs index f3d5d5e..d4e522b 100644 --- a/library/std/src/sys/pal/windows/c/windows_sys.rs +++ b/library/std/src/sys/pal/windows/c/windows_sys.rs
@@ -71,6 +71,7 @@ windows_link::link!("kernel32.dll" "system" fn InitOnceBeginInitialize(lpinitonce : *mut INIT_ONCE, dwflags : u32, fpending : *mut BOOL, lpcontext : *mut *mut core::ffi::c_void) -> BOOL); windows_link::link!("kernel32.dll" "system" fn InitOnceComplete(lpinitonce : *mut INIT_ONCE, dwflags : u32, lpcontext : *const core::ffi::c_void) -> BOOL); windows_link::link!("kernel32.dll" "system" fn InitializeProcThreadAttributeList(lpattributelist : LPPROC_THREAD_ATTRIBUTE_LIST, dwattributecount : u32, dwflags : u32, lpsize : *mut usize) -> BOOL); +windows_link::link!("kernel32.dll" "system" fn IsThreadAFiber() -> BOOL); windows_link::link!("kernel32.dll" "system" fn LocalFree(hmem : HLOCAL) -> HLOCAL); windows_link::link!("kernel32.dll" "system" fn LockFileEx(hfile : HANDLE, dwflags : LOCK_FILE_FLAGS, dwreserved : u32, nnumberofbytestolocklow : u32, nnumberofbytestolockhigh : u32, lpoverlapped : *mut OVERLAPPED) -> BOOL); windows_link::link!("kernel32.dll" "system" fn MoveFileExW(lpexistingfilename : PCWSTR, lpnewfilename : PCWSTR, dwflags : MOVE_FILE_FLAGS) -> BOOL);
diff --git a/library/std/src/sys/thread_local/guard/windows.rs b/library/std/src/sys/thread_local/guard/windows.rs index f747129..7e56c8b 100644 --- a/library/std/src/sys/thread_local/guard/windows.rs +++ b/library/std/src/sys/thread_local/guard/windows.rs
@@ -1,103 +1,99 @@ //! Support for Windows TLS destructors. //! -//! Unfortunately, Windows does not provide a nice API to provide a destructor -//! for a TLS variable. Thus, the solution here ended up being a little more -//! obscure, but fear not, the internet has informed me [1][2] that this solution -//! is not unique (no way I could have thought of it as well!). The key idea is -//! to insert some hook somewhere to run arbitrary code on thread termination. -//! With this in place we'll be able to run anything we like, including all -//! TLS destructors! +//! Windows has an API to provide a destructor for a FLS (fiber local storage) variable, +//! which behaves similarly to a TLS variable for our purpose [1]. //! -//! In order to realize this, all TLS destructors are tracked by *us*, not the -//! Windows runtime. This means that we have a global list of destructors for +//! All TLS destructors are tracked by *us*, not the Windows runtime. +//! This means that we have a global list of destructors for //! each TLS key or variable that we know about. //! -//! # What's up with CRT$XLB? -//! -//! For anything about TLS destructors to work on Windows, we have to be able -//! to run *something* when a thread exits. To do so, we place a very special -//! static in a very special location. If this is encoded in just the right -//! way, the kernel's loader is apparently nice enough to run some function -//! of ours whenever a thread exits! How nice of the kernel! -//! -//! Lots of detailed information can be found in source [1] above, but the -//! gist of it is that this is leveraging a feature of Microsoft's PE format -//! (executable format) which is not actually used by any compilers today. -//! This apparently translates to any callbacks in the ".CRT$XLB" section -//! being run on certain events. -//! -//! So after all that, we use the compiler's `#[link_section]` feature to place -//! a callback pointer into the magic section so it ends up being called. -//! -//! # What's up with this callback? -//! -//! The callback specified receives a number of parameters from... someone! -//! (the kernel? the runtime? I'm not quite sure!) There are a few events that -//! this gets invoked for, but we're currently only interested on when a -//! thread or a process "detaches" (exits). The process part happens for the -//! last thread and the thread part happens for any normal thread. -//! -//! # The article mentions weird stuff about "/INCLUDE"? -//! -//! It sure does! Specifically we're talking about this quote: -//! -//! ```quote -//! The Microsoft run-time library facilitates this process by defining a -//! memory image of the TLS Directory and giving it the special name -//! “__tls_used” (Intel x86 platforms) or “_tls_used” (other platforms). The -//! linker looks for this memory image and uses the data there to create the -//! TLS Directory. Other compilers that support TLS and work with the -//! Microsoft linker must use this same technique. -//! ``` -//! -//! Basically what this means is that if we want support for our TLS -//! destructors/our hook being called then we need to make sure the linker does -//! not omit this symbol. Otherwise it will omit it and our callback won't be -//! wired up. -//! -//! We don't actually use the `/INCLUDE` linker flag here like the article -//! mentions because the Rust compiler doesn't propagate linker flags, but -//! instead we use a shim function which performs a volatile 1-byte load from -//! the address of the _tls_used symbol to ensure it sticks around. -//! -//! [1]: https://www.codeproject.com/Articles/8113/Thread-Local-Storage-The-C-Way -//! [2]: https://github.com/ChromiumWebApps/chromium/blob/master/base/threading/thread_local_storage_win.cc#L42 +//! [1]: https://devblogs.microsoft.com/oldnewthing/20191011-00/?p=102989 use core::ffi::c_void; +use core::sync::atomic::{AtomicU32, Ordering}; +use crate::cell::Cell; use crate::ptr; -use crate::sys::c; +use crate::sys::c::{self, FLS_OUT_OF_INDEXES}; -unsafe extern "C" { - #[link_name = "_tls_used"] - static TLS_USED: u8; -} -pub fn enable() { - // When destructors are used, we need to add a reference to the _tls_used - // symbol provided by the CRT, otherwise the TLS support code will get - // GC'd by the linker and our callback won't be called. - unsafe { ptr::from_ref(&TLS_USED).read_volatile() }; - // We also need to reference CALLBACK to make sure it does not get GC'd - // by the compiler/LLVM. The callback will end up inside the TLS - // callback array pointed to by _TLS_USED through linker shenanigans, - // but as far as the compiler is concerned, it looks like the data is - // unused, so we need this hack to prevent it from disappearing. - unsafe { ptr::from_ref(&CALLBACK).read_volatile() }; -} +pub type Key = u32; -#[unsafe(link_section = ".CRT$XLB")] -#[cfg_attr(miri, used)] // Miri only considers explicitly `#[used]` statics for `lookup_link_section` -pub static CALLBACK: unsafe extern "system" fn(*mut c_void, u32, *mut c_void) = tls_callback; +unsafe fn create(dtor: c::PFLS_CALLBACK_FUNCTION) -> Key { + let key_result = unsafe { c::FlsAlloc(dtor) }; -unsafe extern "system" fn tls_callback(_h: *mut c_void, dw_reason: u32, _pv: *mut c_void) { - if dw_reason == c::DLL_THREAD_DETACH || dw_reason == c::DLL_PROCESS_DETACH { - unsafe { - #[cfg(target_thread_local)] - super::super::destructors::run(); - #[cfg(not(target_thread_local))] - super::super::key::run_dtors(); - - crate::rt::thread_cleanup(); - } + if key_result == c::FLS_OUT_OF_INDEXES { + rtabort!("out of FLS keys"); } + + key_result +} + +unsafe fn set(key: Key, ptr: *const c_void) { + let result = unsafe { c::FlsSetValue(key, ptr) }; + + if result == c::FALSE { + rtabort!("failed to set FLS value"); + } +} + +fn is_thread_a_fiber() -> bool { + let res = unsafe { c::IsThreadAFiber() }; + res == c::TRUE +} + +static KEY: AtomicU32 = AtomicU32::new(FLS_OUT_OF_INDEXES); + +pub fn enable() { + #[thread_local] + static REGISTERED: Cell<bool> = Cell::new(false); + + if !REGISTERED.replace(true) { + let current_key = KEY.load(Ordering::Acquire); + + // If we already allocated a key, we only need to set it to a non-null value so that the dtor is run. + let key = if current_key != FLS_OUT_OF_INDEXES { + current_key + } else { + // Otherwise, we try to allocate a key. + let new_key = unsafe { create(Some(cleanup)) }; + + // Now we need to set this key to be used by everyone else. + // If we won the race, our key is the right one and we can set it to non-null value. + // If we lost, we'll use the winning key. + // Note: we are not freeing our losing key since according to the docs + // > It is expected that DLLs call [the FlsFree] function (if at all) only during DLL_PROCESS_DETACH. + match KEY.compare_exchange(current_key, new_key, Ordering::Release, Ordering::Acquire) { + Ok(_) => new_key, + Err(other_key) => other_key, + } + }; + + // Setting the key's value to non-zero will cause the dtor callback to be called when the thread exits. + // We only set the key once per thread, so the destructors are guaranteed to run at most once (fibers cannot be moved between threads). + unsafe { set(key, ptr::without_provenance(1)) }; + } +} + +unsafe extern "system" fn cleanup(_ptr: *const c_void) { + // Avoid running the hook if we are in a fiber. + // This will cause destructors of thread locals to not run, leaking them. + // Thread-local runtime state will not be cleaned. + // + // We need to verify that we won't run the destructors *before* the thread exits, + // but if the fiber that registered the callback is deleted, the thread might still be running other fibers. + // + // By checking that we are not running in a fiber here, we are guaranteed that the hook is only running during the thread's exit. + // See also the `fiber_does_not_trigger_dtor` test. + if is_thread_a_fiber() { + return; + } + + unsafe { + #[cfg(target_thread_local)] + super::super::destructors::run(); + #[cfg(not(target_thread_local))] + super::super::key::run_dtors(); + } + + crate::rt::thread_cleanup(); }
diff --git a/library/std/src/thread/local.rs b/library/std/src/thread/local.rs index 5de0018..041211d 100644 --- a/library/std/src/thread/local.rs +++ b/library/std/src/thread/local.rs
@@ -98,17 +98,11 @@ /// run on the thread that causes the process to exit. This is because the /// other threads may be forcibly terminated. /// -/// ## Synchronization in thread-local destructors +/// If a thread is [converted into a fiber], destructors will not be run unless +/// the fiber is [converted back into a thread] before the underlying thread exits. /// -/// On Windows, synchronization operations (such as [`JoinHandle::join`]) in -/// thread local destructors are prone to deadlocks and so should be avoided. -/// This is because the [loader lock] is held while a destructor is run. The -/// lock is acquired whenever a thread starts or exits or when a DLL is loaded -/// or unloaded. Therefore these events are blocked for as long as a thread -/// local destructor is running. -/// -/// [loader lock]: https://docs.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-best-practices -/// [`JoinHandle::join`]: crate::thread::JoinHandle::join +/// [converted into a fiber]: https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-convertthreadtofiber +/// [converted back into a thread]: https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-convertfibertothread /// [`with`]: LocalKey::with #[cfg_attr(not(test), rustc_diagnostic_item = "LocalKey")] #[stable(feature = "rust1", since = "1.0.0")]
diff --git a/library/std/tests/thread_local/tests.rs b/library/std/tests/thread_local/tests.rs index 5df1a0e..7324b88 100644 --- a/library/std/tests/thread_local/tests.rs +++ b/library/std/tests/thread_local/tests.rs
@@ -21,6 +21,12 @@ fn wait(&self) { set = cvar.wait(set).unwrap(); } } + + fn is_set(&self) -> bool { + let (set, _cvar) = &*self.0; + let set = set.lock().unwrap(); + *set + } } struct NotifyOnDrop(Signal); @@ -97,6 +103,7 @@ fn run(key: &'static LocalKey<UnsafeCell<Option<NotifyOnDrop>>>) { }); }); signal.wait(); + assert!(signal.is_set()); t.join().unwrap(); } } @@ -393,3 +400,51 @@ fn drop(&mut self) { let name = name.as_ref().unwrap(); assert_eq!(name, "test"); } + +// Test that if a thread uses fibers while the dtors callback is running, +// we do NOT trigger dtors. +// This prevents a UAF if a fiber is the first to use Tls and is deleted, +// like in the example here: +// https://github.com/rust-lang/rust/pull/148799#issuecomment-3731806901 +#[cfg(target_os = "windows")] +#[test] +fn fiber_does_not_trigger_dtor() { + use core::ffi::c_void; + use std::ptr; + + unsafe extern "system" { + fn ConvertFiberToThread() -> i32; + fn ConvertThreadToFiber(lpParameter: *const c_void) -> *mut c_void; + } + + thread_local!(static FOO: UnsafeCell<Option<NotifyOnDrop>> = UnsafeCell::new(None)); + let signal = Signal::default(); + + let signal2 = signal.clone(); + let t = thread::spawn(move || unsafe { + let mut signal = Some(signal2); + FOO.with(|f| { + *f.get() = Some(NotifyOnDrop(signal.take().unwrap())); + }); + let _main = ConvertThreadToFiber(ptr::null()); + // A user can then switch to a new fiber and delete the main fiber. + // If destructors are triggered when the fiber is deleted, + // the new fiber will be able to observe already-destructed values. + }); + t.join().unwrap(); + assert!(!signal.is_set()); + + // As long as we stop using fibers before thread teardown, everything works as expected. + let signal2 = signal.clone(); + let t = thread::spawn(move || unsafe { + let mut signal = Some(signal2); + let _ = ConvertThreadToFiber(ptr::null()); + FOO.with(|f| { + *f.get() = Some(NotifyOnDrop(signal.take().unwrap())); + }); + let _ = ConvertFiberToThread(); + }); + signal.wait(); + assert!(signal.is_set()); + t.join().unwrap(); +}