1
0
Fork 0
mirror of https://github.com/containers/youki synced 2024-06-08 15:56:16 +02:00
youki/crates/libcontainer/src/process/fork.rs
anti-entropy123 59937cf490 update nix to 0.27.1
Signed-off-by: anti-entropy123 <1348651580@qq.com>
2023-10-07 11:10:07 +08:00

394 lines
16 KiB
Rust

use std::{ffi::c_int, fs::File, num::NonZeroUsize};
use libc::SIGCHLD;
use nix::{
sys::{mman, resource},
unistd::Pid,
};
/// Errors that can be returned while cloning a process in this module.
#[derive(Debug, thiserror::Error)]
pub enum CloneError {
/// The `clone3` syscall or the `clone` libc call returned -1; the source
/// carries the errno that was reported.
#[error("failed to clone process")]
Clone(#[source] nix::Error),
/// `sysconf` failed while querying the memory page size.
#[error("failed to get system memory page size")]
PageSize(#[source] nix::Error),
/// `getrlimit` failed while querying the stack size limit.
#[error("failed to get resource limit")]
ResourceLimit(#[source] nix::Error),
/// The stack size chosen for the child stack is zero, so no stack can be
/// mapped.
#[error("the stack size is zero")]
ZeroStackSize,
/// `mmap` failed to allocate the child stack.
#[error("failed to allocate stack")]
StackAllocation(#[source] nix::Error),
/// `mprotect` failed to turn the lowest page of the child stack into a
/// guard page.
#[error("failed to create stack guard page")]
GuardPage(#[source] nix::Error),
/// The `clone3` syscall returned a negative value other than -1. The raw
/// return value is stored.
#[error("unknown error code {0}")]
UnknownErrno(i32),
}
/// The callback function used in the clone system calls. The return value is
/// an `i32`, which is consistent with the return code of C functions. The
/// trait has to be `FnMut` because the closure must stay callable across two
/// attempts: it is first lent to `clone3` by mutable reference and, if a
/// fallback is required, handed over to `clone`. The closure is boxed because
/// `clone` needs it to live on the heap, not on the stack. Unlike `fork` or
/// `clone3`, the glibc `clone` wrapper requires us to pass in a child stack,
/// which starts out empty. By storing the closure on the heap, the new process
/// can re-box the raw heap pointer back into a closure correctly.
pub type CloneCb = Box<dyn FnMut() -> i32>;
// Clone a sibling process, i.e. a process that shares the same parent as the
// calling process. This is used to launch the container init process so that
// the parent of the calling process receives ownership of it. If the init
// process were cloned as a child instead, the calling process (likely the
// youki main process) would exit and the init process would be re-parented to
// process 1 (the system init process), which is not the behavior we want.
pub fn container_clone_sibling(cb: CloneCb) -> Result<Pid, CloneError> {
    let sibling_flags = libc::CLONE_PARENT as u64;
    // Note: normally an exit signal is required, but with `CLONE_PARENT` the
    // `clone3` call returns EINVAL if an exit signal is set. The older `clone`
    // does not return EINVAL in this case; instead the glibc wrapper ignores
    // the exit signal bits. Therefore no exit signal is requested at all, so
    // the same arguments work for both versions of clone.
    let no_exit_signal: Option<u64> = None;
    clone_internal(cb, sibling_flags, no_exit_signal)
}
// Clone a child process of the calling process and run the callback in it.
// No extra clone flags are requested and `SIGCHLD` is used as the exit signal.
pub fn container_clone(cb: CloneCb) -> Result<Pid, CloneError> {
    let no_extra_flags: u64 = 0;
    let exit_signal = Some(SIGCHLD as u64);
    clone_internal(cb, no_extra_flags, exit_signal)
}
// An internal wrapper to manage the clone3 vs clone fallback logic.
fn clone_internal(
mut cb: CloneCb,
flags: u64,
exit_signal: Option<u64>,
) -> Result<Pid, CloneError> {
match clone3(&mut cb, flags, exit_signal) {
Ok(pid) => Ok(pid),
// For now, we decide to only fallback on ENOSYS
Err(CloneError::Clone(nix::Error::ENOSYS)) => {
tracing::debug!("clone3 is not supported, fallback to clone");
let pid = clone(cb, flags, exit_signal)?;
Ok(pid)
}
Err(err) => Err(err),
}
}
// Unlike the clone call, clone3 is currently using the kernel syscall, mimicking
// the interface of fork. There is not need to explicitly manage the memory, so
// we can safely passing the callback closure as reference.
fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid, CloneError> {
#[repr(C)]
struct clone3_args {
flags: u64,
pidfd: u64,
child_tid: u64,
parent_tid: u64,
exit_signal: u64,
stack: u64,
stack_size: u64,
tls: u64,
set_tid: u64,
set_tid_size: u64,
cgroup: u64,
}
let mut args = clone3_args {
flags,
pidfd: 0,
child_tid: 0,
parent_tid: 0,
exit_signal: exit_signal.unwrap_or(0),
stack: 0,
stack_size: 0,
tls: 0,
set_tid: 0,
set_tid_size: 0,
cgroup: 0,
};
let args_ptr = &mut args as *mut clone3_args;
let args_size = std::mem::size_of::<clone3_args>();
// For now, we can only use clone3 as a kernel syscall. Libc wrapper is not
// available yet. This can have undefined behavior because libc authors do
// not like people calling kernel syscall to directly create processes. Libc
// does perform additional bookkeeping when calling clone or fork. So far,
// we have not observed any issues with calling clone3 directly, but we
// should keep an eye on it.
match unsafe { libc::syscall(libc::SYS_clone3, args_ptr, args_size) } {
-1 => Err(CloneError::Clone(nix::Error::last())),
0 => {
// Inside the cloned process, we execute the callback and exit with
// the return code.
std::process::exit(cb());
}
ret if ret >= 0 => Ok(Pid::from_raw(ret as i32)),
ret => Err(CloneError::UnknownErrno(ret as i32)),
}
}
// Clone a process through the libc `clone` wrapper. This is the fallback used
// when `clone3` is not available.
//
// A dedicated stack is mapped for the new process, its lowest page is turned
// into a guard page, and the boxed callback is run on that stack in the new
// process. In the calling process this returns the pid of the new process.
//
// Errors:
// - `PageSize` / `ResourceLimit` if the page size or stack limit can't be read
// - `ZeroStackSize` if the stack size limit is zero
// - `StackAllocation` / `GuardPage` if setting up the child stack fails
// - `Clone` if the clone call itself fails
//
// The stack mapping made here is released again in the calling process on
// every path, as long as `flags` does not contain `CLONE_VM`.
fn clone(cb: CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid, CloneError> {
    const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8M
    const DEFAULT_PAGE_SIZE: usize = 4 * 1024; // 4K

    // Use sysconf to find the page size. A sysconf failure is reported to the
    // caller; only when sysconf reports no value for the page size do we
    // assume the default 4K page size.
    let page_size = nix::unistd::sysconf(nix::unistd::SysconfVar::PAGE_SIZE)
        .map_err(CloneError::PageSize)?
        .map(|size| size as usize)
        .unwrap_or(DEFAULT_PAGE_SIZE);

    // Find out the default stack max size through getrlimit.
    let (rlim_cur, _) =
        resource::getrlimit(resource::Resource::RLIMIT_STACK).map_err(CloneError::ResourceLimit)?;

    // mmap will return ENOMEM if stack size is unlimited when we create the
    // child stack, so we need to set a reasonable default stack size.
    let default_stack_size = if rlim_cur != u64::MAX {
        rlim_cur as usize
    } else {
        tracing::debug!(
            "stack size returned by getrlimit() is unlimited, use DEFAULT_STACK_SIZE(8MB)"
        );
        DEFAULT_STACK_SIZE
    };
    let stack_len = NonZeroUsize::new(default_stack_size).ok_or(CloneError::ZeroStackSize)?;

    // Using the clone syscall requires us to create the stack space for the
    // child process instead of having it taken care of for us like the fork
    // call does. We use mmap here to create the stack. Instead of guessing how
    // much space the child process needs, we allocate through mmap up to the
    // system default limit, which is 8MB on most Linux systems today. This is
    // OK since mmap only reserves the address space upfront instead of
    // allocating physical memory upfront. The stack will grow as needed, up to
    // the size reserved, so no memory is wasted here. Lastly, the child stack
    // only needs to support the container init process set up code in Youki.
    // When Youki calls exec into the container payload, exec will reset the
    // stack. Note, do not use MAP_GROWSDOWN since it is not well supported.
    // Ref: https://man7.org/linux/man-pages/man2/mmap.2.html
    let child_stack = unsafe {
        // Since nix = "0.27.1", `mmap()` requires a generic type `F: AsFd`.
        // `::<File>` doesn't have any meaning because we won't use it.
        mman::mmap::<File>(
            None,
            stack_len,
            mman::ProtFlags::PROT_READ | mman::ProtFlags::PROT_WRITE,
            mman::MapFlags::MAP_PRIVATE | mman::MapFlags::MAP_ANONYMOUS | mman::MapFlags::MAP_STACK,
            None,
            0,
        )
        .map_err(CloneError::StackAllocation)?
    };

    // Best-effort release of the stack mapping in the calling process. A
    // failure to unmap is logged rather than returned, so it never hides the
    // real outcome of the clone.
    let release_stack = || {
        // SAFETY: `child_stack` and `default_stack_size` describe exactly the
        // mapping created by the mmap call above, and the calling process
        // never runs on that stack.
        if let Err(err) = unsafe { mman::munmap(child_stack, default_stack_size) } {
            tracing::warn!("failed to unmap the child stack: {}", err);
        }
    };

    // Consistent with how pthread_create sets up the stack, we create a guard
    // page of 1 page, to protect against child stack collision. Note, for the
    // clone call, the child stack will grow downward, so the bottom of the
    // child stack is at the beginning of the mapping.
    if let Err(err) =
        unsafe { mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE) }
    {
        // No process was created, so nothing else can be using the stack.
        release_stack();
        return Err(CloneError::GuardPage(err));
    }

    // Since the child stack for clone grows downward, we need to pass in
    // the top of the stack address.
    let child_stack_top = unsafe { child_stack.add(default_stack_size) };

    // Combine the clone flags with exit signals.
    let combined_flags = (flags | exit_signal.unwrap_or(0)) as c_int;

    // We are passing the boxed closure "cb" into the clone function as a
    // function pointer in C. The boxed closure in Rust is both a function
    // pointer and a struct. However, when casting the boxed closure into
    // libc::c_void, the function pointer will be lost. Therefore, to work
    // around the issue, we double box the closure. This is consistent with how
    // std::unix::thread handles the closure.
    // Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs
    let data = Box::into_raw(Box::new(cb));

    // The main is a wrapper function passed into the clone call below. The
    // "data" arg is actually a raw pointer to the boxed closure, so here we
    // re-box the pointer back into a boxed closure so that main takes
    // ownership of the memory. Then we can call the closure.
    extern "C" fn main(data: *mut libc::c_void) -> libc::c_int {
        unsafe { Box::from_raw(data as *mut CloneCb)() }
    }

    // The nix::sched::clone wrapper doesn't provide the right interface. Using
    // the clone syscall is one of the rare cases where we don't want rust to
    // manage the child stack memory. Instead, we want to use c_void directly
    // here. Therefore, here we are using libc::clone directly for better
    // control. The child's copy of the stack will be cleaned up when exec is
    // called or the child process terminates. The nix wrapper also does not
    // treat the closure memory correctly. The wrapper implementation fails to
    // pass the right ownership to the new child process.
    // Ref: https://github.com/nix-rust/nix/issues/919
    // Ref: https://github.com/nix-rust/nix/pull/920
    let ret = unsafe {
        libc::clone(
            main,
            child_stack_top,
            combined_flags,
            data as *mut libc::c_void,
        )
    };

    // Turn the return value into a result right away: errno has to be read
    // before the clean up below gets a chance to overwrite it.
    let result = match ret {
        -1 => Err(CloneError::Clone(nix::Error::last())),
        pid if pid > 0 => Ok(Pid::from_raw(pid)),
        _ => unreachable!("clone returned a negative pid {ret}"),
    };

    // After the clone returns, the heap memory associated with the boxed
    // closure is duplicated in the cloned process. Therefore, we can safely
    // re-box the closure from the raw pointer and let rust continue managing
    // the memory. We call drop here explicitly to avoid the warning that the
    // closure is not used. This is correct since the closure is called in the
    // cloned process, not the parent process.
    unsafe { drop(Box::from_raw(data)) };

    // The same reasoning applies to the stack: the cloned process runs on its
    // own copy of the mapping, so the calling process can release its copy
    // instead of leaking it on every call. The only exception is `CLONE_VM`,
    // where both processes would share one address space and the mapping has
    // to stay alive for the cloned process.
    if (flags & (libc::CLONE_VM as u64)) == 0 {
        release_stack();
    }

    result
}
// Tests for the clone helpers. Every test creates real processes and checks
// their exit status through `waitpid`.
#[cfg(test)]
mod test {
use crate::channel::channel;
use super::*;
use anyhow::{bail, Context, Result};
use nix::sys::wait::{waitpid, WaitStatus};
use nix::unistd;
// A callback returning 0 must make the cloned child exit with status 0.
#[test]
fn test_container_fork() -> Result<()> {
let pid = container_clone(Box::new(|| 0))?;
match waitpid(pid, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(pid, p);
assert_eq!(status, 0);
Ok(())
}
_ => bail!("test failed"),
}
}
// A callback returning -1 is expected to surface as exit status 255.
#[test]
fn test_container_err_fork() -> Result<()> {
let pid = container_clone(Box::new(|| -1))?;
match waitpid(pid, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(pid, p);
assert_eq!(status, 255);
Ok(())
}
_ => bail!("test failed"),
}
}
#[test]
fn test_container_clone_sibling() -> Result<()> {
// The `container_clone_sibling` will create a sibling process (sharing
// the same parent) of the calling process. In Unix, a process can only
// wait on its immediate children and can't wait on a sibling process.
// Therefore, to test the logic, we have to fork a process first and
// then let the forked process call `container_clone_sibling`. Then the
// testing process (the process where the test is called), which is the
// parent of both this forked process and the sibling process cloned by
// `container_clone_sibling`, can wait on both processes.
// We need to use a channel so that the forked process can pass the pid
// of the sibling process to the testing process.
let (sender, receiver) = &mut channel::<i32>()?;
match unsafe { unistd::fork()? } {
unistd::ForkResult::Parent { child } => {
// Inside the testing process: learn the sibling's pid from the
// forked process before waiting on anything.
let sibling_process_pid =
Pid::from_raw(receiver.recv().with_context(|| {
"failed to receive the sibling pid from forked process"
})?);
receiver.close()?;
match waitpid(sibling_process_pid, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(sibling_process_pid, p);
assert_eq!(status, 0);
}
_ => bail!("failed to wait on the sibling process"),
}
// After sibling process exits, we can wait on the forked process.
match waitpid(child, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(child, p);
assert_eq!(status, 0);
}
_ => bail!("failed to wait on the forked process"),
}
}
unistd::ForkResult::Child => {
// Inside the forked process. We call `container_clone_sibling`
// and pass the pid of the sibling to the parent process.
let pid = container_clone_sibling(Box::new(|| 0))?;
sender.send(pid.as_raw())?;
sender.close()?;
std::process::exit(0);
}
};
Ok(())
}
// This test depends on libseccomp to work.
// It blocks `clone3` with seccomp inside a child process and checks that
// `container_clone` still manages to create a process.
#[cfg(feature = "libseccomp")]
#[test]
fn test_clone_fallback() -> Result<()> {
use crate::test_utils::TestCallbackError;
use oci_spec::runtime::{
Arch, LinuxSeccompAction, LinuxSeccompBuilder, LinuxSyscallBuilder,
};
fn has_clone3() -> bool {
// We use the probe syscall to check if the kernel supports clone3 or
// seccomp has successfully blocked clone3. The probe passes null
// arguments, so it is expected to fail; only the errno matters.
let res = unsafe { libc::syscall(libc::SYS_clone3, 0, 0) };
let err = (res == -1)
.then(std::io::Error::last_os_error)
.expect("probe syscall should not succeed");
err.raw_os_error() != Some(libc::ENOSYS)
}
// To test the fallback behavior, we will create a seccomp rule that
// blocks `clone3` as ENOSYS.
let syscall = LinuxSyscallBuilder::default()
.names(vec![String::from("clone3")])
.action(LinuxSeccompAction::ScmpActErrno)
.errno_ret(libc::ENOSYS as u32)
.build()?;
let seccomp_profile = LinuxSeccompBuilder::default()
.default_action(LinuxSeccompAction::ScmpActAllow)
.architectures(vec![Arch::ScmpArchNative])
.syscalls(vec![syscall])
.build()?;
crate::test_utils::test_in_child_process(|| {
// We use seccomp to block `clone3`
let _ = prctl::set_no_new_privileges(true);
crate::seccomp::initialize_seccomp(&seccomp_profile)
.expect("failed to initialize seccomp");
// Make sure the seccomp rule is effective before testing the fallback.
if has_clone3() {
return Err(TestCallbackError::Custom(
"clone3 is not blocked by seccomp".into(),
));
}
let pid = container_clone(Box::new(|| 0)).map_err(|err| err.to_string())?;
match waitpid(pid, None).expect("wait pid failed.") {
WaitStatus::Exited(p, status) => {
assert_eq!(pid, p);
assert_eq!(status, 0);
}
status => {
return Err(TestCallbackError::Custom(format!(
"failed to wait on child process: {:?}",
status
)));
}
};
Ok(())
})?;
Ok(())
}
}