mirror of
https://github.com/containers/youki
synced 2024-06-08 15:56:16 +02:00
59937cf490
Signed-off-by: anti-entropy123 <1348651580@qq.com>
394 lines
16 KiB
Rust
394 lines
16 KiB
Rust
use std::{ffi::c_int, fs::File, num::NonZeroUsize};
|
|
|
|
use libc::SIGCHLD;
|
|
use nix::{
|
|
sys::{mman, resource},
|
|
unistd::Pid,
|
|
};
|
|
|
|
#[derive(Debug, thiserror::Error)]
|
|
pub enum CloneError {
|
|
#[error("failed to clone process")]
|
|
Clone(#[source] nix::Error),
|
|
#[error("failed to get system memory page size")]
|
|
PageSize(#[source] nix::Error),
|
|
#[error("failed to get resource limit")]
|
|
ResourceLimit(#[source] nix::Error),
|
|
#[error("the stack size is zero")]
|
|
ZeroStackSize,
|
|
#[error("failed to allocate stack")]
|
|
StackAllocation(#[source] nix::Error),
|
|
#[error("failed to create stack guard page")]
|
|
GuardPage(#[source] nix::Error),
|
|
#[error("unknown error code {0}")]
|
|
UnknownErrno(i32),
|
|
}
|
|
|
|
/// Callback executed inside the newly cloned process.
///
/// The returned `i32` is used the same way a C function's return code would
/// be. The closure must be `FnMut` so that it can be invoked more than once:
/// first by the `clone3` path and, if a fallback is needed, again by the
/// `clone` path.
///
/// The closure is boxed so that it lives on the heap rather than on the
/// stack. Unlike `fork` or `clone3`, the glibc `clone` wrapper requires the
/// caller to pass in a child stack, and that stack starts out empty. Because
/// the closure lives on the heap, the new process can re-box the raw heap
/// pointer and recover a valid closure.
pub type CloneCb = Box<dyn FnMut() -> i32>;
|
|
|
|
// Clone a sibling process that shares the same parent as the calling
|
|
// process. This is used to launch the container init process so the parent
|
|
// process of the calling process can receive ownership of the process. If we
|
|
// clone a child process as the init process, the calling process (likely the
|
|
// youki main process) will exit and the init process will be re-parented to the
|
|
// process 1 (system init process), which is not the right behavior of what we
|
|
// look for.
|
|
pub fn container_clone_sibling(cb: CloneCb) -> Result<Pid, CloneError> {
|
|
// Note: normally, an exit signal is required, but when using
|
|
// `CLONE_PARENT`, the `clone3` will return EINVAL if an exit signal is set.
|
|
// The older `clone` will not return EINVAL in this case. Instead it ignores
|
|
// the exit signal bits in the glibc wrapper. Therefore, we explicitly set
|
|
// the exit_signal to None here, so this works for both version of clone.
|
|
clone_internal(cb, libc::CLONE_PARENT as u64, None)
|
|
}
|
|
|
|
// Clone a child process and execute the callback.
|
|
pub fn container_clone(cb: CloneCb) -> Result<Pid, CloneError> {
|
|
clone_internal(cb, 0, Some(SIGCHLD as u64))
|
|
}
|
|
|
|
// An internal wrapper to manage the clone3 vs clone fallback logic.
|
|
fn clone_internal(
|
|
mut cb: CloneCb,
|
|
flags: u64,
|
|
exit_signal: Option<u64>,
|
|
) -> Result<Pid, CloneError> {
|
|
match clone3(&mut cb, flags, exit_signal) {
|
|
Ok(pid) => Ok(pid),
|
|
// For now, we decide to only fallback on ENOSYS
|
|
Err(CloneError::Clone(nix::Error::ENOSYS)) => {
|
|
tracing::debug!("clone3 is not supported, fallback to clone");
|
|
let pid = clone(cb, flags, exit_signal)?;
|
|
|
|
Ok(pid)
|
|
}
|
|
Err(err) => Err(err),
|
|
}
|
|
}
|
|
|
|
// Unlike the clone call, clone3 is currently using the kernel syscall, mimicking
|
|
// the interface of fork. There is not need to explicitly manage the memory, so
|
|
// we can safely passing the callback closure as reference.
|
|
fn clone3(cb: &mut CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid, CloneError> {
|
|
#[repr(C)]
|
|
struct clone3_args {
|
|
flags: u64,
|
|
pidfd: u64,
|
|
child_tid: u64,
|
|
parent_tid: u64,
|
|
exit_signal: u64,
|
|
stack: u64,
|
|
stack_size: u64,
|
|
tls: u64,
|
|
set_tid: u64,
|
|
set_tid_size: u64,
|
|
cgroup: u64,
|
|
}
|
|
let mut args = clone3_args {
|
|
flags,
|
|
pidfd: 0,
|
|
child_tid: 0,
|
|
parent_tid: 0,
|
|
exit_signal: exit_signal.unwrap_or(0),
|
|
stack: 0,
|
|
stack_size: 0,
|
|
tls: 0,
|
|
set_tid: 0,
|
|
set_tid_size: 0,
|
|
cgroup: 0,
|
|
};
|
|
let args_ptr = &mut args as *mut clone3_args;
|
|
let args_size = std::mem::size_of::<clone3_args>();
|
|
// For now, we can only use clone3 as a kernel syscall. Libc wrapper is not
|
|
// available yet. This can have undefined behavior because libc authors do
|
|
// not like people calling kernel syscall to directly create processes. Libc
|
|
// does perform additional bookkeeping when calling clone or fork. So far,
|
|
// we have not observed any issues with calling clone3 directly, but we
|
|
// should keep an eye on it.
|
|
match unsafe { libc::syscall(libc::SYS_clone3, args_ptr, args_size) } {
|
|
-1 => Err(CloneError::Clone(nix::Error::last())),
|
|
0 => {
|
|
// Inside the cloned process, we execute the callback and exit with
|
|
// the return code.
|
|
std::process::exit(cb());
|
|
}
|
|
ret if ret >= 0 => Ok(Pid::from_raw(ret as i32)),
|
|
ret => Err(CloneError::UnknownErrno(ret as i32)),
|
|
}
|
|
}
|
|
|
|
fn clone(cb: CloneCb, flags: u64, exit_signal: Option<u64>) -> Result<Pid, CloneError> {
|
|
const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8M
|
|
const DEFAULT_PAGE_SIZE: usize = 4 * 1024; // 4K
|
|
|
|
// Use sysconf to find the page size. If there is an error, we assume
|
|
// the default 4K page size.
|
|
let page_size = nix::unistd::sysconf(nix::unistd::SysconfVar::PAGE_SIZE)
|
|
.map_err(CloneError::PageSize)?
|
|
.map(|size| size as usize)
|
|
.unwrap_or(DEFAULT_PAGE_SIZE);
|
|
|
|
// Find out the default stack max size through getrlimit.
|
|
let (rlim_cur, _) =
|
|
resource::getrlimit(resource::Resource::RLIMIT_STACK).map_err(CloneError::ResourceLimit)?;
|
|
// mmap will return ENOMEM if stack size is unlimited when we create the
|
|
// child stack, so we need to set a reasonable default stack size.
|
|
let default_stack_size = if rlim_cur != u64::MAX {
|
|
rlim_cur as usize
|
|
} else {
|
|
tracing::debug!(
|
|
"stack size returned by getrlimit() is unlimited, use DEFAULT_STACK_SIZE(8MB)"
|
|
);
|
|
DEFAULT_STACK_SIZE
|
|
};
|
|
|
|
// Using the clone syscall requires us to create the stack space for the
|
|
// child process instead of taken cared for us like fork call. We use mmap
|
|
// here to create the stack. Instead of guessing how much space the child
|
|
// process needs, we allocate through mmap to the system default limit,
|
|
// which is 8MB on most of the linux system today. This is OK since mmap
|
|
// will only reserve the address space upfront, instead of allocating
|
|
// physical memory upfront. The stack will grow as needed, up to the size
|
|
// reserved, so no wasted memory here. Lastly, the child stack only needs
|
|
// to support the container init process set up code in Youki. When Youki
|
|
// calls exec into the container payload, exec will reset the stack. Note,
|
|
// do not use MAP_GROWSDOWN since it is not well supported.
|
|
// Ref: https://man7.org/linux/man-pages/man2/mmap.2.html
|
|
let child_stack = unsafe {
|
|
// Since nix = "0.27.1", `mmap()` requires a generic type `F: AsFd`.
|
|
// `::<File>` doesn't have any meaning because we won't use it.
|
|
mman::mmap::<File>(
|
|
None,
|
|
NonZeroUsize::new(default_stack_size).ok_or(CloneError::ZeroStackSize)?,
|
|
mman::ProtFlags::PROT_READ | mman::ProtFlags::PROT_WRITE,
|
|
mman::MapFlags::MAP_PRIVATE | mman::MapFlags::MAP_ANONYMOUS | mman::MapFlags::MAP_STACK,
|
|
None,
|
|
0,
|
|
)
|
|
.map_err(CloneError::StackAllocation)?
|
|
};
|
|
unsafe {
|
|
// Consistent with how pthread_create sets up the stack, we create a
|
|
// guard page of 1 page, to protect the child stack collision. Note, for
|
|
// clone call, the child stack will grow downward, so the bottom of the
|
|
// child stack is in the beginning.
|
|
mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE)
|
|
.map_err(CloneError::GuardPage)?;
|
|
};
|
|
|
|
// Since the child stack for clone grows downward, we need to pass in
|
|
// the top of the stack address.
|
|
let child_stack_top = unsafe { child_stack.add(default_stack_size) };
|
|
|
|
// Combine the clone flags with exit signals.
|
|
let combined_flags = (flags | exit_signal.unwrap_or(0)) as c_int;
|
|
|
|
// We are passing the boxed closure "cb" into the clone function as the a
|
|
// function pointer in C. The box closure in Rust is both a function pointer
|
|
// and a struct. However, when casting the box closure into libc::c_void,
|
|
// the function pointer will be lost. Therefore, to work around the issue,
|
|
// we double box the closure. This is consistent with how std::unix::thread
|
|
// handles the closure.
|
|
// Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs
|
|
let data = Box::into_raw(Box::new(cb));
|
|
// The main is a wrapper function passed into clone call below. The "data"
|
|
// arg is actually a raw pointer to the Box closure. so here, we re-box the
|
|
// pointer back into a box closure so the main takes ownership of the
|
|
// memory. Then we can call the closure.
|
|
extern "C" fn main(data: *mut libc::c_void) -> libc::c_int {
|
|
unsafe { Box::from_raw(data as *mut CloneCb)() }
|
|
}
|
|
|
|
// The nix::sched::clone wrapper doesn't provide the right interface. Using
|
|
// the clone syscall is one of the rare cases where we don't want rust to
|
|
// manage the child stack memory. Instead, we want to use c_void directly
|
|
// here. Therefore, here we are using libc::clone syscall directly for
|
|
// better control. The child stack will be cleaned when exec is called or
|
|
// the child process terminates. The nix wrapper also does not treat the
|
|
// closure memory correctly. The wrapper implementation fails to pass the
|
|
// right ownership to the new child process.
|
|
// Ref: https://github.com/nix-rust/nix/issues/919
|
|
// Ref: https://github.com/nix-rust/nix/pull/920
|
|
let ret = unsafe {
|
|
libc::clone(
|
|
main,
|
|
child_stack_top,
|
|
combined_flags,
|
|
data as *mut libc::c_void,
|
|
)
|
|
};
|
|
|
|
// After the clone returns, the heap memory associated with the Box closure
|
|
// is duplicated in the cloned process. Therefore, we can safely re-box the
|
|
// closure from the raw pointer and let rust to continue managing the
|
|
// memory. We call drop here explicitly to avoid the warning that the
|
|
// closure is not used. This is correct since the closure is called in the
|
|
// cloned process, not the parent process.
|
|
unsafe { drop(Box::from_raw(data)) };
|
|
match ret {
|
|
-1 => Err(CloneError::Clone(nix::Error::last())),
|
|
pid if ret > 0 => Ok(Pid::from_raw(pid)),
|
|
_ => unreachable!("clone returned a negative pid {ret}"),
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod test {
    use crate::channel::channel;

    use super::*;
    use anyhow::{bail, Context, Result};
    use nix::sys::wait::{waitpid, WaitStatus};
    use nix::unistd;

    /// Waits on `pid` and asserts that it exited normally with
    /// `expected_status`. Any other wait status fails with `failure`.
    fn expect_exit(pid: Pid, expected_status: i32, failure: &str) -> Result<()> {
        let outcome = waitpid(pid, None).expect("wait pid failed.");
        if let WaitStatus::Exited(p, status) = outcome {
            assert_eq!(pid, p);
            assert_eq!(status, expected_status);
            return Ok(());
        }
        bail!("{}", failure)
    }

    /// A callback returning 0 yields a process that exits with status 0.
    #[test]
    fn test_container_fork() -> Result<()> {
        let pid = container_clone(Box::new(|| 0))?;
        expect_exit(pid, 0, "test failed")
    }

    /// A callback returning -1 yields a process that exits with status 255.
    #[test]
    fn test_container_err_fork() -> Result<()> {
        let pid = container_clone(Box::new(|| -1))?;
        expect_exit(pid, 255, "test failed")
    }

    #[test]
    fn test_container_clone_sibling() -> Result<()> {
        // `container_clone_sibling` creates a sibling process (one sharing
        // the same parent) of the calling process. In Unix, a process can
        // only wait on its immediate children, not on a sibling. To test the
        // logic we therefore fork a process first and let the forked process
        // call `container_clone_sibling`. The testing process (the one the
        // test runs in) is then the parent of both the forked process and
        // the sibling process, and can wait on both.

        // A channel lets the forked process pass the pid of the sibling
        // process back to the testing process.
        let (sender, receiver) = &mut channel::<i32>()?;

        let fork_result = unsafe { unistd::fork()? };
        match fork_result {
            unistd::ForkResult::Child => {
                // Inside the forked process: create the sibling and report
                // its pid to the testing process.
                let pid = container_clone_sibling(Box::new(|| 0))?;
                sender.send(pid.as_raw())?;
                sender.close()?;
                std::process::exit(0);
            }
            unistd::ForkResult::Parent { child } => {
                let raw_sibling_pid = receiver
                    .recv()
                    .with_context(|| "failed to receive the sibling pid from forked process")?;
                let sibling_process_pid = Pid::from_raw(raw_sibling_pid);
                receiver.close()?;

                expect_exit(
                    sibling_process_pid,
                    0,
                    "failed to wait on the sibling process",
                )?;
                // After the sibling process exits, we can wait on the forked
                // process.
                expect_exit(child, 0, "failed to wait on the forked process")?;
            }
        };

        Ok(())
    }

    // This test depends on libseccomp to work.
    #[cfg(feature = "libseccomp")]
    #[test]
    fn test_clone_fallback() -> Result<()> {
        use crate::test_utils::TestCallbackError;
        use oci_spec::runtime::{
            Arch, LinuxSeccompAction, LinuxSeccompBuilder, LinuxSyscallBuilder,
        };

        fn has_clone3() -> bool {
            // Probe the syscall with null arguments to find out whether the
            // kernel supports clone3 or seccomp has successfully blocked it.
            let probe = unsafe { libc::syscall(libc::SYS_clone3, 0, 0) };
            // Read errno immediately after the syscall, before anything else
            // gets a chance to touch it.
            let errno = std::io::Error::last_os_error().raw_os_error();
            assert!(probe == -1, "probe syscall should not succeed");
            errno != Some(libc::ENOSYS)
        }

        // To test the fallback behavior, we create a seccomp rule that
        // blocks `clone3` with ENOSYS.
        let block_clone3 = LinuxSyscallBuilder::default()
            .names(vec![String::from("clone3")])
            .action(LinuxSeccompAction::ScmpActErrno)
            .errno_ret(libc::ENOSYS as u32)
            .build()?;
        let seccomp_profile = LinuxSeccompBuilder::default()
            .default_action(LinuxSeccompAction::ScmpActAllow)
            .architectures(vec![Arch::ScmpArchNative])
            .syscalls(vec![block_clone3])
            .build()?;

        crate::test_utils::test_in_child_process(|| {
            // We use seccomp to block `clone3`.
            let _ = prctl::set_no_new_privileges(true);
            crate::seccomp::initialize_seccomp(&seccomp_profile)
                .expect("failed to initialize seccomp");

            if has_clone3() {
                return Err(TestCallbackError::Custom(
                    "clone3 is not blocked by seccomp".into(),
                ));
            }

            let pid = container_clone(Box::new(|| 0)).map_err(|err| err.to_string())?;
            let outcome = waitpid(pid, None).expect("wait pid failed.");
            if let WaitStatus::Exited(p, status) = outcome {
                assert_eq!(pid, p);
                assert_eq!(status, 0);
                Ok(())
            } else {
                Err(TestCallbackError::Custom(format!(
                    "failed to wait on child process: {:?}",
                    outcome
                )))
            }
        })?;

        Ok(())
    }
}
|