Replace `thread_local::guard` on Windows with an FLS-based impl
diff --git a/library/std/src/sys/pal/windows/c/bindings.txt b/library/std/src/sys/pal/windows/c/bindings.txt
index 044d304..50ea67a 100644
--- a/library/std/src/sys/pal/windows/c/bindings.txt
+++ b/library/std/src/sys/pal/windows/c/bindings.txt
@@ -2263,6 +2263,7 @@
 IPV6_MREQ
 IPV6_MULTICAST_LOOP
 IPV6_V6ONLY
+IsThreadAFiber
 LINGER
 listen
 LocalFree
diff --git a/library/std/src/sys/pal/windows/c/windows_sys.rs b/library/std/src/sys/pal/windows/c/windows_sys.rs
index f3d5d5e..d4e522b 100644
--- a/library/std/src/sys/pal/windows/c/windows_sys.rs
+++ b/library/std/src/sys/pal/windows/c/windows_sys.rs
@@ -71,6 +71,7 @@
 windows_link::link!("kernel32.dll" "system" fn InitOnceBeginInitialize(lpinitonce : *mut INIT_ONCE, dwflags : u32, fpending : *mut BOOL, lpcontext : *mut *mut core::ffi::c_void) -> BOOL);
 windows_link::link!("kernel32.dll" "system" fn InitOnceComplete(lpinitonce : *mut INIT_ONCE, dwflags : u32, lpcontext : *const core::ffi::c_void) -> BOOL);
 windows_link::link!("kernel32.dll" "system" fn InitializeProcThreadAttributeList(lpattributelist : LPPROC_THREAD_ATTRIBUTE_LIST, dwattributecount : u32, dwflags : u32, lpsize : *mut usize) -> BOOL);
+windows_link::link!("kernel32.dll" "system" fn IsThreadAFiber() -> BOOL);
 windows_link::link!("kernel32.dll" "system" fn LocalFree(hmem : HLOCAL) -> HLOCAL);
 windows_link::link!("kernel32.dll" "system" fn LockFileEx(hfile : HANDLE, dwflags : LOCK_FILE_FLAGS, dwreserved : u32, nnumberofbytestolocklow : u32, nnumberofbytestolockhigh : u32, lpoverlapped : *mut OVERLAPPED) -> BOOL);
 windows_link::link!("kernel32.dll" "system" fn MoveFileExW(lpexistingfilename : PCWSTR, lpnewfilename : PCWSTR, dwflags : MOVE_FILE_FLAGS) -> BOOL);
diff --git a/library/std/src/sys/thread_local/guard/windows.rs b/library/std/src/sys/thread_local/guard/windows.rs
index f747129..7e56c8b 100644
--- a/library/std/src/sys/thread_local/guard/windows.rs
+++ b/library/std/src/sys/thread_local/guard/windows.rs
@@ -1,103 +1,99 @@
 //! Support for Windows TLS destructors.
 //!
-//! Unfortunately, Windows does not provide a nice API to provide a destructor
-//! for a TLS variable. Thus, the solution here ended up being a little more
-//! obscure, but fear not, the internet has informed me [1][2] that this solution
-//! is not unique (no way I could have thought of it as well!). The key idea is
-//! to insert some hook somewhere to run arbitrary code on thread termination.
-//! With this in place we'll be able to run anything we like, including all
-//! TLS destructors!
+//! Windows has an API to provide a destructor for a FLS (fiber local storage) variable,
+//! which behaves similarly to a TLS variable for our purpose [1].
 //!
-//! In order to realize this, all TLS destructors are tracked by *us*, not the
-//! Windows runtime. This means that we have a global list of destructors for
+//! All TLS destructors are tracked by *us*, not the Windows runtime.
+//! This means that we have a global list of destructors for
 //! each TLS key or variable that we know about.
 //!
-//! # What's up with CRT$XLB?
-//!
-//! For anything about TLS destructors to work on Windows, we have to be able
-//! to run *something* when a thread exits. To do so, we place a very special
-//! static in a very special location. If this is encoded in just the right
-//! way, the kernel's loader is apparently nice enough to run some function
-//! of ours whenever a thread exits! How nice of the kernel!
-//!
-//! Lots of detailed information can be found in source [1] above, but the
-//! gist of it is that this is leveraging a feature of Microsoft's PE format
-//! (executable format) which is not actually used by any compilers today.
-//! This apparently translates to any callbacks in the ".CRT$XLB" section
-//! being run on certain events.
-//!
-//! So after all that, we use the compiler's `#[link_section]` feature to place
-//! a callback pointer into the magic section so it ends up being called.
-//!
-//! # What's up with this callback?
-//!
-//! The callback specified receives a number of parameters from... someone!
-//! (the kernel? the runtime? I'm not quite sure!) There are a few events that
-//! this gets invoked for, but we're currently only interested on when a
-//! thread or a process "detaches" (exits). The process part happens for the
-//! last thread and the thread part happens for any normal thread.
-//!
-//! # The article mentions weird stuff about "/INCLUDE"?
-//!
-//! It sure does! Specifically we're talking about this quote:
-//!
-//! ```quote
-//! The Microsoft run-time library facilitates this process by defining a
-//! memory image of the TLS Directory and giving it the special name
-//! “__tls_used” (Intel x86 platforms) or “_tls_used” (other platforms). The
-//! linker looks for this memory image and uses the data there to create the
-//! TLS Directory. Other compilers that support TLS and work with the
-//! Microsoft linker must use this same technique.
-//! ```
-//!
-//! Basically what this means is that if we want support for our TLS
-//! destructors/our hook being called then we need to make sure the linker does
-//! not omit this symbol. Otherwise it will omit it and our callback won't be
-//! wired up.
-//!
-//! We don't actually use the `/INCLUDE` linker flag here like the article
-//! mentions because the Rust compiler doesn't propagate linker flags, but
-//! instead we use a shim function which performs a volatile 1-byte load from
-//! the address of the _tls_used symbol to ensure it sticks around.
-//!
-//! [1]: https://www.codeproject.com/Articles/8113/Thread-Local-Storage-The-C-Way
-//! [2]: https://github.com/ChromiumWebApps/chromium/blob/master/base/threading/thread_local_storage_win.cc#L42
+//! [1]: https://devblogs.microsoft.com/oldnewthing/20191011-00/?p=102989
 
 use core::ffi::c_void;
+use core::sync::atomic::{AtomicU32, Ordering};
 
+use crate::cell::Cell;
 use crate::ptr;
-use crate::sys::c;
+use crate::sys::c::{self, FLS_OUT_OF_INDEXES};
 
-unsafe extern "C" {
-    #[link_name = "_tls_used"]
-    static TLS_USED: u8;
-}
-pub fn enable() {
-    // When destructors are used, we need to add a reference to the _tls_used
-    // symbol provided by the CRT, otherwise the TLS support code will get
-    // GC'd by the linker and our callback won't be called.
-    unsafe { ptr::from_ref(&TLS_USED).read_volatile() };
-    // We also need to reference CALLBACK to make sure it does not get GC'd
-    // by the compiler/LLVM. The callback will end up inside the TLS
-    // callback array pointed to by _TLS_USED through linker shenanigans,
-    // but as far as the compiler is concerned, it looks like the data is
-    // unused, so we need this hack to prevent it from disappearing.
-    unsafe { ptr::from_ref(&CALLBACK).read_volatile() };
-}
+pub type Key = u32;
 
-#[unsafe(link_section = ".CRT$XLB")]
-#[cfg_attr(miri, used)] // Miri only considers explicitly `#[used]` statics for `lookup_link_section`
-pub static CALLBACK: unsafe extern "system" fn(*mut c_void, u32, *mut c_void) = tls_callback;
+unsafe fn create(dtor: c::PFLS_CALLBACK_FUNCTION) -> Key {
+    let key_result = unsafe { c::FlsAlloc(dtor) };
 
-unsafe extern "system" fn tls_callback(_h: *mut c_void, dw_reason: u32, _pv: *mut c_void) {
-    if dw_reason == c::DLL_THREAD_DETACH || dw_reason == c::DLL_PROCESS_DETACH {
-        unsafe {
-            #[cfg(target_thread_local)]
-            super::super::destructors::run();
-            #[cfg(not(target_thread_local))]
-            super::super::key::run_dtors();
-
-            crate::rt::thread_cleanup();
-        }
+    if key_result == c::FLS_OUT_OF_INDEXES {
+        rtabort!("out of FLS keys");
     }
+
+    key_result
+}
+
+unsafe fn set(key: Key, ptr: *const c_void) {
+    let result = unsafe { c::FlsSetValue(key, ptr) };
+
+    if result == c::FALSE {
+        rtabort!("failed to set FLS value");
+    }
+}
+
+fn is_thread_a_fiber() -> bool {
+    let res = unsafe { c::IsThreadAFiber() };
+    res == c::TRUE
+}
+
+static KEY: AtomicU32 = AtomicU32::new(FLS_OUT_OF_INDEXES);
+
+pub fn enable() {
+    #[thread_local]
+    static REGISTERED: Cell<bool> = Cell::new(false);
+
+    if !REGISTERED.replace(true) {
+        let current_key = KEY.load(Ordering::Acquire);
+
+        // If we already allocated a key, we only need to set it to a non-null value so that the dtor is run.
+        let key = if current_key != FLS_OUT_OF_INDEXES {
+            current_key
+        } else {
+            // Otherwise, we try to allocate a key.
+            let new_key = unsafe { create(Some(cleanup)) };
+
+            // Now we need to set this key to be used by everyone else.
+            // If we won the race, our key is the right one and we can set it to non-null value.
+            // If we lost, we'll use the winning key.
+            // Note: we are not freeing our losing key since according to the docs
+            // > It is expected that DLLs call [the FlsFree] function (if at all) only during DLL_PROCESS_DETACH.
+            match KEY.compare_exchange(current_key, new_key, Ordering::Release, Ordering::Acquire) {
+                Ok(_) => new_key,
+                Err(other_key) => other_key,
+            }
+        };
+
+        // Setting the key's value to non-zero will cause the dtor callback to be called when the thread exits.
+        // We only set the key once per thread, so the destructors are guaranteed to run at most once (fibers cannot be moved between threads).
+        unsafe { set(key, ptr::without_provenance(1)) };
+    }
+}
+
+unsafe extern "system" fn cleanup(_ptr: *const c_void) {
+    // Avoid running the hook if we are in a fiber.
+    // This will cause destructors of thread locals to not run, leaking them.
+    // Thread-local runtime state will not be cleaned.
+    //
+    // We need to verify that we won't run the destructors *before* the thread exits,
+    // but if the fiber that registered the callback is deleted, the thread might still be running other fibers.
+    //
+    // By checking that we are not running in a fiber here, we are guaranteed that the hook is only running during the thread's exit.
+    // See also the `fiber_does_not_trigger_dtor` test.
+    if is_thread_a_fiber() {
+        return;
+    }
+
+    unsafe {
+        #[cfg(target_thread_local)]
+        super::super::destructors::run();
+        #[cfg(not(target_thread_local))]
+        super::super::key::run_dtors();
+    }
+
+    crate::rt::thread_cleanup();
 }
diff --git a/library/std/src/thread/local.rs b/library/std/src/thread/local.rs
index 5de0018..041211d 100644
--- a/library/std/src/thread/local.rs
+++ b/library/std/src/thread/local.rs
@@ -98,17 +98,11 @@
 ///    run on the thread that causes the process to exit. This is because the
 ///    other threads may be forcibly terminated.
 ///
-/// ## Synchronization in thread-local destructors
+///    If a thread is [converted into a fiber], destructors will not be run unless
+///    the fiber is [converted back into a thread] before the underlying thread exits.
 ///
-/// On Windows, synchronization operations (such as [`JoinHandle::join`]) in
-/// thread local destructors are prone to deadlocks and so should be avoided.
-/// This is because the [loader lock] is held while a destructor is run. The
-/// lock is acquired whenever a thread starts or exits or when a DLL is loaded
-/// or unloaded. Therefore these events are blocked for as long as a thread
-/// local destructor is running.
-///
-/// [loader lock]: https://docs.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-best-practices
-/// [`JoinHandle::join`]: crate::thread::JoinHandle::join
+/// [converted into a fiber]: https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-convertthreadtofiber
+/// [converted back into a thread]: https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-convertfibertothread
 /// [`with`]: LocalKey::with
 #[cfg_attr(not(test), rustc_diagnostic_item = "LocalKey")]
 #[stable(feature = "rust1", since = "1.0.0")]
diff --git a/library/std/tests/thread_local/tests.rs b/library/std/tests/thread_local/tests.rs
index 5df1a0e..7324b88 100644
--- a/library/std/tests/thread_local/tests.rs
+++ b/library/std/tests/thread_local/tests.rs
@@ -21,6 +21,12 @@ fn wait(&self) {
             set = cvar.wait(set).unwrap();
         }
     }
+
+    fn is_set(&self) -> bool {
+        let (set, _cvar) = &*self.0;
+        let set = set.lock().unwrap();
+        *set
+    }
 }
 
 struct NotifyOnDrop(Signal);
@@ -97,6 +103,7 @@ fn run(key: &'static LocalKey<UnsafeCell<Option<NotifyOnDrop>>>) {
             });
         });
         signal.wait();
+        assert!(signal.is_set());
         t.join().unwrap();
     }
 }
@@ -393,3 +400,51 @@ fn drop(&mut self) {
     let name = name.as_ref().unwrap();
     assert_eq!(name, "test");
 }
+
+// Test that if a thread uses fibers while the dtors callback is running,
+// we do NOT trigger dtors.
+// This prevents a UAF if a fiber is the first to use Tls and is deleted,
+// like in the example here:
+// https://github.com/rust-lang/rust/pull/148799#issuecomment-3731806901
+#[cfg(target_os = "windows")]
+#[test]
+fn fiber_does_not_trigger_dtor() {
+    use core::ffi::c_void;
+    use std::ptr;
+
+    unsafe extern "system" {
+        fn ConvertFiberToThread() -> i32;
+        fn ConvertThreadToFiber(lpParameter: *const c_void) -> *mut c_void;
+    }
+
+    thread_local!(static FOO: UnsafeCell<Option<NotifyOnDrop>> = UnsafeCell::new(None));
+    let signal = Signal::default();
+
+    let signal2 = signal.clone();
+    let t = thread::spawn(move || unsafe {
+        let mut signal = Some(signal2);
+        FOO.with(|f| {
+            *f.get() = Some(NotifyOnDrop(signal.take().unwrap()));
+        });
+        let _main = ConvertThreadToFiber(ptr::null());
+        // A user can then switch to a new fiber and delete the main fiber.
+        // If destructors are triggered when the fiber is deleted,
+        // the new fiber will be able to observe already-destructed values.
+    });
+    t.join().unwrap();
+    assert!(!signal.is_set());
+
+    // As long as we stop using fibers before thread teardown, everything works as expected.
+    let signal2 = signal.clone();
+    let t = thread::spawn(move || unsafe {
+        let mut signal = Some(signal2);
+        let _ = ConvertThreadToFiber(ptr::null());
+        FOO.with(|f| {
+            *f.get() = Some(NotifyOnDrop(signal.take().unwrap()));
+        });
+        let _ = ConvertFiberToThread();
+    });
+    signal.wait();
+    assert!(signal.is_set());
+    t.join().unwrap();
+}