use std::sync::atomic::{AtomicPtr, AtomicUsize};
use ipc_channel::ipc;
use nix::sys::{ptrace, signal, wait};
use nix::unistd;
use super::CALLBACK_STACK_SIZE;
use super::messages::{Confirmation, StartFfiInfo, TraceRequest};
use crate::shims::native_lib::{AccessEvent, AccessRange, MemEvents};
/// The flags to use when calling `waitid()`.
const WAIT_FLAGS: wait::WaitPidFlag =
wait::WaitPidFlag::WUNTRACED.union(wait::WaitPidFlag::WEXITED);
/// The default word size on a given platform, in bytes.
#[cfg(target_arch = "x86")]
const ARCH_WORD_SIZE: usize = 4;
#[cfg(target_arch = "x86_64")]
const ARCH_WORD_SIZE: usize = 8;
// x86 max instruction length is 15 bytes:
// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html
// See vol. 3B section 24.25.
const ARCH_MAX_INSTR_SIZE: usize = 15;
/// The address of the page set to be edited, initialised to a sentinel null
/// pointer.
static PAGE_ADDR: AtomicPtr<u8> = AtomicPtr::new(std::ptr::null_mut());
/// The host pagesize, initialised to a sentinel zero value.
pub static PAGE_SIZE: AtomicUsize = AtomicUsize::new(0);
/// How many consecutive pages to unprotect. 1 by default, unlikely to be set
/// higher than 2.
static PAGE_COUNT: AtomicUsize = AtomicUsize::new(1);
/// Allows us to get common arguments from the `user_regs_t` across architectures.
/// Normally this would land us in ABI hell, but thankfully all of our use cases
/// consist of functions with a small number of register-sized integer arguments.
/// See <https://man7.org/linux/man-pages/man2/syscall.2.html> for sources.
trait ArchIndependentRegs {
/// Gets the address of the instruction pointer.
fn ip(&self) -> usize;
/// Set the instruction pointer; remember to also set the stack pointer, or
/// else the stack might get messed up!
fn set_ip(&mut self, ip: usize);
/// Set the stack pointer, ideally to a zeroed-out area.
fn set_sp(&mut self, sp: usize);
}
// It's fine / desirable behaviour for values to wrap here, we care about just
// preserving the bit pattern.
#[cfg(target_arch = "x86_64")]
#[rustfmt::skip]
impl ArchIndependentRegs for libc::user_regs_struct {
#[inline]
fn ip(&self) -> usize { self.rip.try_into().unwrap() }
#[inline]
fn set_ip(&mut self, ip: usize) { self.rip = ip.try_into().unwrap() }
#[inline]
fn set_sp(&mut self, sp: usize) { self.rsp = sp.try_into().unwrap() }
}
#[cfg(target_arch = "x86")]
#[rustfmt::skip]
impl ArchIndependentRegs for libc::user_regs_struct {
#[inline]
fn ip(&self) -> usize { self.eip.cast_unsigned().try_into().unwrap() }
#[inline]
fn set_ip(&mut self, ip: usize) { self.eip = ip.cast_signed().try_into().unwrap() }
#[inline]
fn set_sp(&mut self, sp: usize) { self.esp = sp.cast_signed().try_into().unwrap() }
}
/// A unified event representing something happening on the child process. Wraps
/// `nix`'s `WaitStatus` and our custom signals so it can all be done with one
/// `match` statement.
pub enum ExecEvent {
/// Child process requests that we begin monitoring it.
Start(StartFfiInfo),
/// Child requests that we stop monitoring and pass over the events we
/// detected.
End,
/// The child process with the specified pid was stopped by the given signal.
Status(unistd::Pid, signal::Signal),
/// The child process with the specified pid entered or exited a syscall.
Syscall(unistd::Pid),
/// A child process exited or was killed; if we have a return code, it is
/// specified.
Died(Option<i32>),
}
/// A listener for the FFI start info channel along with relevant state.
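///
/// A construction sketch (marked `ignore` so it is not compiled as a doctest; it assumes
/// the caller sets up the `ipc` channel pairs itself and hands the matching ends to the
/// child side):
/// ```ignore
/// let (message_tx, message_rx) = ipc::channel().unwrap();
/// let (confirm_tx, confirm_rx) = ipc::channel().unwrap();
/// let listener = ChildListener::new(message_rx, confirm_tx);
/// for event in listener {
///     // Dispatch on the `ExecEvent`; see `sv_loop` below for the real loop.
/// }
/// ```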
pub struct ChildListener {
/// The matching channel for the child's `Supervisor` struct.
message_rx: ipc::IpcReceiver<TraceRequest>,
/// The sender for confirming back to the child that a request was processed.
confirm_tx: ipc::IpcSender<Confirmation>,
/// Whether an FFI call is currently ongoing.
attached: bool,
/// If `Some`, overrides the return code with the given value.
override_retcode: Option<i32>,
/// Last code obtained from a child exiting.
last_code: Option<i32>,
}
impl ChildListener {
pub fn new(
message_rx: ipc::IpcReceiver<TraceRequest>,
confirm_tx: ipc::IpcSender<Confirmation>,
) -> Self {
Self { message_rx, confirm_tx, attached: false, override_retcode: None, last_code: None }
}
}
impl Iterator for ChildListener {
type Item = ExecEvent;
// Allows us to monitor the child process by just iterating over the listener.
// NB: This should never return None!
fn next(&mut self) -> Option<Self::Item> {
// Do not block if the child has nothing to report for `waitid`.
let opts = WAIT_FLAGS | wait::WaitPidFlag::WNOHANG;
loop {
// Listen to any child, not just the main one. Important if we want
// to allow the C code to fork further, along with being a bit of
// defensive programming since Linux sometimes assigns threads of
// the same process different PIDs with unpredictable rules...
match wait::waitid(wait::Id::All, opts) {
Ok(stat) =>
match stat {
// Child exited normally with a specific code set.
wait::WaitStatus::Exited(_, code) => self.last_code = Some(code),
// Child was killed by a signal, without giving a code.
wait::WaitStatus::Signaled(_, _, _) => self.last_code = None,
// Child entered or exited a syscall.
wait::WaitStatus::PtraceSyscall(pid) =>
if self.attached {
return Some(ExecEvent::Syscall(pid));
},
// Child with the given pid was stopped by the given signal.
// It's somewhat unclear which of these two is returned when;
// we just treat them the same.
wait::WaitStatus::Stopped(pid, signal)
| wait::WaitStatus::PtraceEvent(pid, signal, _) =>
if self.attached {
// This is our end-of-FFI signal!
if signal == signal::SIGUSR1 {
self.attached = false;
return Some(ExecEvent::End);
} else {
return Some(ExecEvent::Status(pid, signal));
}
} else {
// Just pass along the signal.
ptrace::cont(pid, signal).unwrap();
},
_ => (),
},
// This case should only trigger when all children died.
Err(_) => return Some(ExecEvent::Died(self.override_retcode.or(self.last_code))),
}
// Similarly, do a non-blocking poll of the IPC channel.
if let Ok(req) = self.message_rx.try_recv() {
match req {
TraceRequest::StartFfi(info) =>
// Should never trigger - but better to panic explicitly than deadlock!
if self.attached {
panic!("Attempting to begin FFI multiple times!");
} else {
self.attached = true;
return Some(ExecEvent::Start(info));
},
TraceRequest::OverrideRetcode(code) => {
self.override_retcode = Some(code);
self.confirm_tx.send(Confirmation).unwrap();
}
}
}
// Not ideal, but doing anything else might sacrifice performance.
std::thread::yield_now();
}
}
}
/// An error came up while waiting on the child process to do something.
/// It likely died, with this return code if we have one.
#[derive(Debug)]
pub struct ExecEnd(pub Option<i32>);
/// Whether to call `ptrace::cont()` immediately. Used exclusively by `wait_for_signal`.
enum InitialCont {
Yes,
No,
}
/// This is the main loop of the supervisor process. It runs in a separate
/// process from the rest of Miri (but because we fork, addresses for anything
/// created before the fork - like statics - are the same).
pub fn sv_loop(
listener: ChildListener,
init_pid: unistd::Pid,
event_tx: ipc::IpcSender<MemEvents>,
confirm_tx: ipc::IpcSender<Confirmation>,
) -> Result<!, ExecEnd> {
// Get the pagesize set and make sure it isn't still on the zero sentinel value!
let page_size = PAGE_SIZE.load(std::sync::atomic::Ordering::Relaxed);
assert_ne!(page_size, 0);
// Things that we return to the child process.
let mut acc_events = Vec::new();
// Memory allocated for the MiriMachine.
let mut ch_pages = Vec::new();
let mut ch_stack = None;
// An instance of the Capstone disassembler, so we don't spawn one on every access.
let cs = get_disasm();
// The pid of the last process we interacted with, used by default if we don't have a
// reason to use a different one.
let mut curr_pid = init_pid;
// There's an initial sigstop we need to deal with.
wait_for_signal(Some(curr_pid), signal::SIGSTOP, InitialCont::No)?;
ptrace::cont(curr_pid, None).unwrap();
for evt in listener {
match evt {
// Child started ffi, so prep memory.
ExecEvent::Start(ch_info) => {
// All the pages that the child process is "allowed to" access.
ch_pages = ch_info.page_ptrs;
// And the temporary callback stack it allocated for us to use later.
ch_stack = Some(ch_info.stack_ptr);
// We received the signal and are no longer in the main listener loop,
// so we can let the child move on to the end of the ffi prep where it will
// raise a SIGSTOP. We need it to be signal-stopped *and waited for* in
// order to do most ptrace operations!
confirm_tx.send(Confirmation).unwrap();
// We can't trust simply calling `Pid::this()` in the child process to give the right
// PID for us, so we get it this way.
curr_pid = wait_for_signal(None, signal::SIGSTOP, InitialCont::No).unwrap();
// Continue until next syscall.
ptrace::syscall(curr_pid, None).unwrap();
}
// Child wants to end tracing.
ExecEvent::End => {
// Hand over the access info we traced.
event_tx.send(MemEvents { acc_events }).unwrap();
// And reset our values.
acc_events = Vec::new();
ch_stack = None;
// No need to monitor syscalls anymore, they'd just be ignored.
ptrace::cont(curr_pid, None).unwrap();
}
// Child process was stopped by a signal
ExecEvent::Status(pid, signal) =>
match signal {
// If it was a segfault, check if it was an artificial one
// caused by it trying to access the MiriMachine memory.
signal::SIGSEGV =>
handle_segfault(
pid,
&ch_pages,
ch_stack.unwrap(),
page_size,
&cs,
&mut acc_events,
)?,
// Something weird happened.
_ => {
eprintln!("Process unexpectedly got {signal}; continuing...");
// In case we're not tracing
if ptrace::syscall(pid, None).is_err() {
// If *this* fails too, something really weird happened
// and it's probably best to just panic.
signal::kill(pid, signal::SIGCONT).unwrap();
}
}
},
// Child entered or exited a syscall. For now we ignore this and just continue.
ExecEvent::Syscall(pid) => {
ptrace::syscall(pid, None).unwrap();
}
ExecEvent::Died(code) => {
return Err(ExecEnd(code));
}
}
}
unreachable!()
}
/// Spawns a Capstone disassembler for the host architecture.
#[rustfmt::skip]
fn get_disasm() -> capstone::Capstone {
use capstone::prelude::*;
let cs_pre = Capstone::new();
{
#[cfg(target_arch = "x86_64")]
{cs_pre.x86().mode(arch::x86::ArchMode::Mode64)}
#[cfg(target_arch = "x86")]
{cs_pre.x86().mode(arch::x86::ArchMode::Mode32)}
}
.detail(true)
.build()
.unwrap()
}
/// Waits for `wait_signal`. If `init_cont` is `InitialCont::Yes`, it will first do a `ptrace::cont`.
/// We want to avoid that in some cases, like at the beginning of FFI.
///
/// If `pid` is `None`, only one wait will be done and `init_cont` should be `InitialCont::No`.
fn wait_for_signal(
pid: Option<unistd::Pid>,
wait_signal: signal::Signal,
init_cont: InitialCont,
) -> Result<unistd::Pid, ExecEnd> {
if matches!(init_cont, InitialCont::Yes) {
ptrace::cont(pid.unwrap(), None).unwrap();
}
// Repeatedly call `waitid` until we get the signal we want, or the process dies.
loop {
let wait_id = match pid {
Some(pid) => wait::Id::Pid(pid),
None => wait::Id::All,
};
let stat = wait::waitid(wait_id, WAIT_FLAGS).map_err(|_| ExecEnd(None))?;
let (signal, pid) = match stat {
// Report the cause of death, if we know it.
wait::WaitStatus::Exited(_, code) => {
return Err(ExecEnd(Some(code)));
}
wait::WaitStatus::Signaled(_, _, _) => return Err(ExecEnd(None)),
wait::WaitStatus::Stopped(pid, signal)
| wait::WaitStatus::PtraceEvent(pid, signal, _) => (signal, pid),
// This covers PtraceSyscall and variants that are impossible with
// the flags set (e.g. WaitStatus::StillAlive).
_ => {
ptrace::cont(pid.unwrap(), None).unwrap();
continue;
}
};
if signal == wait_signal {
return Ok(pid);
} else {
ptrace::cont(pid, signal).map_err(|_| ExecEnd(None))?;
}
}
}
/// Add the memory events from `op` being executed while there is a memory access at `addr` to
/// `acc_events`. Return whether this was a memory operand.
fn capstone_find_events(
addr: usize,
op: &capstone::arch::ArchOperand,
acc_events: &mut Vec<AccessEvent>,
) -> bool {
use capstone::prelude::*;
match op {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
arch::ArchOperand::X86Operand(x86_operand) => {
match x86_operand.op_type {
// We only care about memory accesses
arch::x86::X86OperandType::Mem(_) => {
let push = AccessRange { addr, size: x86_operand.size.into() };
// It's called a "RegAccessType" but it also applies to memory
let acc_ty = x86_operand.access.unwrap();
// The same instruction might do both reads and writes, so potentially add both.
// We do not know the order in which they happened, but writing and then reading
// makes little sense so we put the read first. That is also the more
// conservative choice.
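// (For instance, `add qword ptr [rax], 1` both reads and writes the same 8 bytes,
// so it gets a Read event followed by a Write event.)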
if acc_ty.is_readable() {
acc_events.push(AccessEvent::Read(push.clone()));
}
if acc_ty.is_writable() {
// FIXME: This could be made certain; either determine all cases where
// only reads happen, or have an intermediate mempr_* function to first
// map the page(s) as readonly and check if a segfault occurred.
// Per https://docs.rs/iced-x86/latest/iced_x86/enum.OpAccess.html,
// we know that the possible access types are Read, CondRead, Write,
// CondWrite, ReadWrite, and ReadCondWrite. Since we got a segfault
// we know some kind of access happened so Cond{Read, Write}s are
// certain reads and writes; the only uncertainty is with an RW op
// as it might be a ReadCondWrite with the write condition unmet.
acc_events.push(AccessEvent::Write(push, !acc_ty.is_readable()));
}
return true;
}
_ => (),
}
}
// FIXME: arm64
_ => unimplemented!(),
}
false
}
/// Extract the events from the given instruction.
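///
/// For example (a sketch of the expected behaviour, not a doctest): on x86_64 the bytes
/// `[0x48, 0x8b, 0x03]` decode to `mov rax, qword ptr [rbx]`, whose single memory operand
/// is an 8-byte read, so one `AccessEvent::Read` covering 8 bytes at `addr` would be
/// pushed onto `acc_events`.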
fn capstone_disassemble(
instr: &[u8],
addr: usize,
cs: &capstone::Capstone,
acc_events: &mut Vec<AccessEvent>,
) -> capstone::CsResult<()> {
// The arch_detail is what we care about, but it relies on these temporaries
// that we can't drop. 0x1000 is the default base address for Capstone, and
// we're expecting 1 instruction.
let insns = cs.disasm_count(instr, 0x1000, 1)?;
let ins_detail = cs.insn_detail(&insns[0])?;
let arch_detail = ins_detail.arch_detail();
let mut found_mem_op = false;
for op in arch_detail.operands() {
if capstone_find_events(addr, &op, acc_events) {
if found_mem_op {
panic!("more than one memory operand found; we don't know which one accessed what");
}
found_mem_op = true;
}
}
Ok(())
}
/// Grabs the access that caused a segfault and logs it down if it's to our memory,
/// or kills the child and returns the appropriate error otherwise.
fn handle_segfault(
pid: unistd::Pid,
ch_pages: &[usize],
ch_stack: usize,
page_size: usize,
cs: &capstone::Capstone,
acc_events: &mut Vec<AccessEvent>,
) -> Result<(), ExecEnd> {
// Get information on what caused the segfault. This contains the address
// that triggered it.
let siginfo = ptrace::getsiginfo(pid).unwrap();
// All x86 instructions only have at most one memory operand (thankfully!)
// SAFETY: si_addr is safe to call.
let addr = unsafe { siginfo.si_addr().addr() };
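// Round the faulting address down to its page: e.g. with 4 KiB pages, an access at
// 0x7fff_1234 gives a `page_addr` of 0x7fff_1000.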
let page_addr = addr.strict_sub(addr.strict_rem(page_size));
if !ch_pages.iter().any(|pg| (*pg..pg.strict_add(page_size)).contains(&addr)) {
// This was a real segfault (not one of the Miri memory pages), so print some debug info and
// quit.
let regs = ptrace::getregs(pid).unwrap();
eprintln!("Segfault occurred during FFI at {addr:#018x}");
eprintln!("Expected access on pages: {ch_pages:#018x?}");
eprintln!("Register dump: {regs:#x?}");
ptrace::kill(pid).unwrap();
return Err(ExecEnd(None));
}
// Overall structure:
// - Get the address that caused the segfault
// - Unprotect the memory: we force the child to execute `mempr_off`, passing parameters via
// global atomic variables. This is what we use the temporary callback stack for.
// - Step 1 instruction
// - Parse executed code to estimate size & type of access
// - Reprotect the memory by executing `mempr_on` in the child, using the callback stack again.
// - Continue
// Ensure the stack is properly zeroed out!
for a in (ch_stack..ch_stack.strict_add(CALLBACK_STACK_SIZE)).step_by(ARCH_WORD_SIZE) {
ptrace::write(pid, std::ptr::with_exposed_provenance_mut(a), 0).unwrap();
}
// Point the stack pointer at the middle of the scratch area so this works on
// architectures with upwards- as well as downwards-growing stacks.
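// (For instance, assuming a hypothetical `CALLBACK_STACK_SIZE` of 1024 bytes, the scratch
// `sp` would sit 512 bytes in, leaving half the area free in either direction.)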
let stack_ptr = ch_stack.strict_add(CALLBACK_STACK_SIZE / 2);
let regs_bak = ptrace::getregs(pid).unwrap();
let mut new_regs = regs_bak;
// Read at least one instruction from the ip. It's possible that the instruction
// that triggered the segfault was short and at the end of the mapped text area,
// so some of these reads may fail; in that case, just write empty bytes. If all
// reads failed, the disassembler will report an error.
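// On x86_64 (8-byte words) that is `15.div_ceil(8)` = 2 reads; on x86 (4-byte words) it
// is `15.div_ceil(4)` = 4 reads.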
let instr = (0..(ARCH_MAX_INSTR_SIZE.div_ceil(ARCH_WORD_SIZE)))
.flat_map(|ofs| {
// This reads one word of memory; we divided by `ARCH_WORD_SIZE` above to compensate for that.
ptrace::read(
pid,
std::ptr::without_provenance_mut(
regs_bak.ip().strict_add(ARCH_WORD_SIZE.strict_mul(ofs)),
),
)
.unwrap_or_default()
.to_ne_bytes()
})
.collect::<Vec<_>>();
// Now figure out the size + type of access and log it down.
capstone_disassemble(&instr, addr, cs, acc_events).expect("Failed to disassemble instruction");
// Move the instr ptr into the deprotection code.
#[expect(clippy::as_conversions)]
new_regs.set_ip(mempr_off as *const () as usize);
// Don't mess up the stack by accident!
new_regs.set_sp(stack_ptr);
// Modify the PAGE_ADDR global on the child process to point to the page
// that we want unprotected.
ptrace::write(
pid,
(&raw const PAGE_ADDR).cast_mut().cast(),
libc::c_long::try_from(page_addr.cast_signed()).unwrap(),
)
.unwrap();
// Check if we also own the next page, and if so unprotect it in case
// the access spans the page boundary.
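// (E.g. an 8-byte access that starts 4 bytes before the end of the page spills over
// into the next page.)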
let flag = if ch_pages.contains(&page_addr.strict_add(page_size)) { 2 } else { 1 };
ptrace::write(pid, (&raw const PAGE_COUNT).cast_mut().cast(), flag).unwrap();
ptrace::setregs(pid, new_regs).unwrap();
// Our mempr_* functions end with a raise(SIGSTOP).
wait_for_signal(Some(pid), signal::SIGSTOP, InitialCont::Yes)?;
// Step 1 instruction.
ptrace::setregs(pid, regs_bak).unwrap();
ptrace::step(pid, None).unwrap();
// Don't use `wait_for_signal` here, since a single instruction leaves no room for
// uncertainty and we don't want it `cont()`ing randomly by accident.
// Also, don't let the child continue with unprotected memory if something errors!
let stat = wait::waitid(wait::Id::Pid(pid), WAIT_FLAGS).map_err(|_| ExecEnd(None))?;
match stat {
wait::WaitStatus::Signaled(_, s, _)
| wait::WaitStatus::Stopped(_, s)
| wait::WaitStatus::PtraceEvent(_, s, _) =>
assert!(
!matches!(s, signal::SIGSEGV),
"native code segfaulted when re-trying memory access\n\
is the native code trying to call a Rust function?"
),
_ => (),
}
// Zero out again to be safe
for a in (ch_stack..ch_stack.strict_add(CALLBACK_STACK_SIZE)).step_by(ARCH_WORD_SIZE) {
ptrace::write(pid, std::ptr::with_exposed_provenance_mut(a), 0).unwrap();
}
let regs_bak = ptrace::getregs(pid).unwrap();
new_regs = regs_bak;
// Reprotect everything and continue.
#[expect(clippy::as_conversions)]
new_regs.set_ip(mempr_on as *const () as usize);
new_regs.set_sp(stack_ptr);
ptrace::setregs(pid, new_regs).unwrap();
wait_for_signal(Some(pid), signal::SIGSTOP, InitialCont::Yes)?;
ptrace::setregs(pid, regs_bak).unwrap();
ptrace::syscall(pid, None).unwrap();
Ok(())
}
// We only get dropped into these functions via offsetting the instr pointer
// manually, so we *must not ever* unwind from them.
/// Disables protections on the page whose address is currently in `PAGE_ADDR`.
///
/// SAFETY: `PAGE_ADDR` should be set to a page-aligned pointer to an owned page,
/// `PAGE_SIZE` should be the host pagesize, and the `PAGE_SIZE * PAGE_COUNT` bytes
/// starting at `PAGE_ADDR` must be owned and allocated memory. No other threads
/// should be running.
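///
/// The supervisor arranges this by writing `PAGE_ADDR` and `PAGE_COUNT` into the child via
/// `ptrace::write` and then pointing the child's instruction pointer here (see
/// `handle_segfault`).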
pub unsafe extern "C" fn mempr_off() {
use std::sync::atomic::Ordering;
// Again, cannot allow unwinds to happen here.
let len = PAGE_SIZE.load(Ordering::Relaxed).saturating_mul(PAGE_COUNT.load(Ordering::Relaxed));
// SAFETY: Upheld by "caller".
unsafe {
// It's up to the caller to make sure this doesn't actually overflow, but
// we mustn't unwind from here, so...
if libc::mprotect(
PAGE_ADDR.load(Ordering::Relaxed).cast(),
len,
libc::PROT_READ | libc::PROT_WRITE,
) != 0
{
// Can't return or unwind, but we can do this.
std::process::exit(-1);
}
}
// If this fails somehow we're doomed.
if signal::raise(signal::SIGSTOP).is_err() {
std::process::exit(-1);
}
}
/// Reenables protection on the page set by `PAGE_ADDR`.
///
/// SAFETY: See `mempr_off()`.
pub unsafe extern "C" fn mempr_on() {
use std::sync::atomic::Ordering;
let len = PAGE_SIZE.load(Ordering::Relaxed).wrapping_mul(PAGE_COUNT.load(Ordering::Relaxed));
// SAFETY: Upheld by "caller".
unsafe {
if libc::mprotect(PAGE_ADDR.load(Ordering::Relaxed).cast(), len, libc::PROT_NONE) != 0 {
std::process::exit(-1);
}
}
if signal::raise(signal::SIGSTOP).is_err() {
std::process::exit(-1);
}
}