mirror of
https://github.com/containers/youki
synced 2025-04-30 13:20:17 +02:00
Merge pull request #173 from yihuaf/yihuaf/163
Fix how closure is transferred to the clone call.
This commit is contained in:
commit
307e44a7ba
@ -30,8 +30,6 @@ pub(super) struct ContainerBuilderImpl {
|
||||
pub use_systemd: bool,
|
||||
/// Id of the container
|
||||
pub container_id: String,
|
||||
/// Directory where the state of the container will be stored
|
||||
pub container_dir: PathBuf,
|
||||
/// OCI complient runtime spec
|
||||
pub spec: Spec,
|
||||
/// Root filesystem of the container
|
||||
@ -66,19 +64,26 @@ impl ContainerBuilderImpl {
|
||||
|
||||
// create the parent and child process structure so the parent and child process can sync with each other
|
||||
let (mut parent, parent_channel) = parent::ParentProcess::new(self.rootless.clone())?;
|
||||
let mut child = child::ChildProcess::new(parent_channel)?;
|
||||
let child = child::ChildProcess::new(parent_channel)?;
|
||||
|
||||
let cb = Box::new(|| {
|
||||
if let Err(error) = container_init(
|
||||
self.init,
|
||||
self.rootless.clone(),
|
||||
self.spec.clone(),
|
||||
self.syscall.clone(),
|
||||
self.rootfs.clone(),
|
||||
self.console_socket.clone(),
|
||||
self.notify_path.clone(),
|
||||
&mut child,
|
||||
) {
|
||||
// This init_args will be passed to the container init process,
|
||||
// therefore we will have to move all the variable by value. Since self
|
||||
// is a shared reference, we have to clone these variables here.
|
||||
let init_args = ContainerInitArgs {
|
||||
init: self.init,
|
||||
syscall: self.syscall.clone(),
|
||||
spec: self.spec.clone(),
|
||||
rootfs: self.rootfs.clone(),
|
||||
console_socket: self.console_socket.clone(),
|
||||
rootless: self.rootless.clone(),
|
||||
notify_path: self.notify_path.clone(),
|
||||
child: child,
|
||||
};
|
||||
|
||||
// We have to box up this closure to correctly pass to the init function
|
||||
// of the new process.
|
||||
let cb = Box::new(move || {
|
||||
if let Err(error) = container_init(init_args) {
|
||||
log::debug!("failed to run container_init: {:?}", error);
|
||||
return -1;
|
||||
}
|
||||
@ -114,22 +119,36 @@ impl ContainerBuilderImpl {
|
||||
}
|
||||
}
|
||||
|
||||
fn container_init(
|
||||
init: bool,
|
||||
rootless: Option<Rootless>,
|
||||
spec: Spec,
|
||||
command: LinuxSyscall,
|
||||
rootfs: PathBuf,
|
||||
console_socket: Option<FileDescriptor>,
|
||||
notify_name: PathBuf,
|
||||
child: &mut child::ChildProcess,
|
||||
) -> Result<()> {
|
||||
struct ContainerInitArgs {
|
||||
/// Flag indicating if an init or a tenant container should be created
|
||||
pub init: bool,
|
||||
/// Interface to operating system primitives
|
||||
pub syscall: LinuxSyscall,
|
||||
/// OCI complient runtime spec
|
||||
pub spec: Spec,
|
||||
/// Root filesystem of the container
|
||||
pub rootfs: PathBuf,
|
||||
/// Socket to communicate the file descriptor of the ptty
|
||||
pub console_socket: Option<FileDescriptor>,
|
||||
/// Options for rootless containers
|
||||
pub rootless: Option<Rootless>,
|
||||
/// Path to the Unix Domain Socket to communicate container start
|
||||
pub notify_path: PathBuf,
|
||||
/// Pipe used to communicate with the child process
|
||||
pub child: child::ChildProcess,
|
||||
}
|
||||
|
||||
fn container_init(args: ContainerInitArgs) -> Result<()> {
|
||||
let command = &args.syscall;
|
||||
let spec = &args.spec;
|
||||
let linux = &spec.linux.as_ref().context("no linux in spec")?;
|
||||
let namespaces: Namespaces = linux.namespaces.clone().into();
|
||||
// need to create the notify socket before we pivot root, since the unix
|
||||
// domain socket used here is outside of the rootfs of container
|
||||
let mut notify_socket: NotifyListener = NotifyListener::new(¬ify_name)?;
|
||||
let mut notify_socket: NotifyListener = NotifyListener::new(&args.notify_path)?;
|
||||
let proc = &spec.process.as_ref().context("no process in spec")?;
|
||||
let rootfs = &args.rootfs;
|
||||
let mut child = args.child;
|
||||
|
||||
// if Out-of-memory score adjustment is set in specification. set the score
|
||||
// value for the current process check
|
||||
@ -146,7 +165,7 @@ fn container_init(
|
||||
// namespace will be created, check
|
||||
// https://man7.org/linux/man-pages/man7/user_namespaces.7.html for more
|
||||
// information
|
||||
if rootless.is_some() {
|
||||
if args.rootless.is_some() {
|
||||
// child needs to be dumpable, otherwise the non root parent is not
|
||||
// allowed to write the uid/gid maps
|
||||
prctl::set_dumpable(true).unwrap();
|
||||
@ -165,7 +184,7 @@ fn container_init(
|
||||
.context("failed to become root")?;
|
||||
|
||||
// set up tty if specified
|
||||
if let Some(csocketfd) = console_socket {
|
||||
if let Some(csocketfd) = args.console_socket {
|
||||
tty::setup_console(&csocketfd)?;
|
||||
}
|
||||
|
||||
@ -178,7 +197,7 @@ fn container_init(
|
||||
let _ = prctl::set_no_new_privileges(true);
|
||||
}
|
||||
|
||||
if init {
|
||||
if args.init {
|
||||
rootfs::prepare_rootfs(
|
||||
&spec,
|
||||
&rootfs,
|
||||
@ -190,14 +209,14 @@ fn container_init(
|
||||
|
||||
// change the root of filesystem of the process to the rootfs
|
||||
command
|
||||
.pivot_rootfs(&rootfs)
|
||||
.with_context(|| format!("Failed to pivot root to {:?}", &rootfs))?;
|
||||
.pivot_rootfs(rootfs)
|
||||
.with_context(|| format!("Failed to pivot root to {:?}", rootfs))?;
|
||||
}
|
||||
|
||||
command.set_id(Uid::from_raw(proc.user.uid), Gid::from_raw(proc.user.gid))?;
|
||||
capabilities::reset_effective(&command)?;
|
||||
capabilities::reset_effective(command)?;
|
||||
if let Some(caps) = &proc.capabilities {
|
||||
capabilities::drop_privileges(&caps, &command)?;
|
||||
capabilities::drop_privileges(&caps, command)?;
|
||||
}
|
||||
|
||||
// notify parents that the init process is ready to execute the payload.
|
||||
|
@ -72,7 +72,6 @@ impl InitContainerBuilder {
|
||||
pid_file: self.base.pid_file,
|
||||
console_socket: csocketfd,
|
||||
use_systemd: self.use_systemd,
|
||||
container_dir,
|
||||
spec,
|
||||
rootfs,
|
||||
rootless,
|
||||
|
@ -111,7 +111,6 @@ impl TenantContainerBuilder {
|
||||
pid_file: self.base.pid_file,
|
||||
console_socket: csocketfd,
|
||||
use_systemd,
|
||||
container_dir,
|
||||
spec,
|
||||
rootfs,
|
||||
rootless,
|
||||
|
@ -17,10 +17,12 @@ pub struct ChildProcess {
|
||||
poll: Option<Poll>,
|
||||
}
|
||||
|
||||
// Note : The original youki process first forks into 'parent' (P) and 'child' (C1) process
|
||||
// of which this represents the child (C1) process. The C1 then again forks into parent process which is C1,
|
||||
// and Child (C2) process. C2 is called as init process as it will run the command of the container. But form
|
||||
// a process point of view, init process is child of child process, which is child of original youki process.
|
||||
// Note: The original Youki process "forks" a child process using clone(2). The
|
||||
// child process will become the container init process, where it will set up
|
||||
// namespaces, device mounts, and etc. for the container process. Finally, the
|
||||
// container init process will run the actual container payload through exec
|
||||
// call. The ChildProcess will be used to synchronize between the Youki main
|
||||
// process and the child process (container init process).
|
||||
impl ChildProcess {
|
||||
/// create a new Child process structure
|
||||
pub fn new(parent_channel: ParentChannel) -> Result<Self> {
|
||||
|
@ -1,26 +1,23 @@
|
||||
use anyhow::bail;
|
||||
use anyhow::Context;
|
||||
use anyhow::Result;
|
||||
use libc::c_int;
|
||||
use libc::c_void;
|
||||
use nix::errno::Errno;
|
||||
use nix::sched;
|
||||
use nix::sys;
|
||||
use nix::sys::mman;
|
||||
use nix::unistd::Pid;
|
||||
use std::mem;
|
||||
use std::ptr;
|
||||
|
||||
// The clone callback is used in clone call. It is a boxed closure and it needs
|
||||
// to trasfer the ownership of related memory to the new process.
|
||||
type CloneCb = Box<dyn FnOnce() -> isize + Send>;
|
||||
|
||||
/// clone uses syscall clone(2) to create a new process for the container init
|
||||
/// process. Using clone syscall gives us better control over how to can create
|
||||
/// the new container process, where we can enter into namespaces directly instead
|
||||
/// of using unshare and fork. This call will only create one new process, instead
|
||||
/// of two using fork.
|
||||
pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result<Pid> {
|
||||
extern "C" fn callback(data: *mut sched::CloneCb) -> c_int {
|
||||
let cb: &mut sched::CloneCb = unsafe { &mut *data };
|
||||
(*cb)() as c_int
|
||||
}
|
||||
|
||||
pub fn clone(cb: CloneCb, clone_flags: sched::CloneFlags) -> Result<Pid> {
|
||||
// Use sysconf to find the page size. If there is an error, we assume
|
||||
// the default 4K page size.
|
||||
let page_size: usize = unsafe {
|
||||
@ -60,45 +57,70 @@ pub fn clone(mut cb: sched::CloneCb, clone_flags: sched::CloneFlags) -> Result<P
|
||||
0,
|
||||
)?
|
||||
};
|
||||
// Consistant with how pthread_create sets up the stack, we create a
|
||||
// guard page of 1 page, to protect the child stack collision. Note, for
|
||||
// clone call, the child stack will grow downward, so the bottom of the
|
||||
// child stack is in the beginning.
|
||||
unsafe {
|
||||
mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE)
|
||||
.with_context(|| "Failed to create guard page")?
|
||||
};
|
||||
|
||||
// Since the child stack for clone grows downward, we need to pass in
|
||||
// the top of the stack address.
|
||||
let child_stack_top = unsafe { child_stack.add(default_stack_size) };
|
||||
|
||||
// Adds SIGCHLD flag to mimic the same behavior as fork.
|
||||
let signal = sys::signal::Signal::SIGCHLD;
|
||||
let combined = clone_flags.bits() | signal as c_int;
|
||||
let res = unsafe {
|
||||
// Consistant with how pthread_create sets up the stack, we create a
|
||||
// guard page of 1 page, to protect the child stack collision. Note, for
|
||||
// clone call, the child stack will grow downward, so the bottom of the
|
||||
// child stack is in the beginning.
|
||||
mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE)
|
||||
.with_context(|| "Failed to create guard page")?;
|
||||
let combined = clone_flags.bits() | signal as libc::c_int;
|
||||
|
||||
// Since the child stack for clone grows downward, we need to pass in
|
||||
// the top of the stack address.
|
||||
let child_stack_top = child_stack.add(default_stack_size);
|
||||
// We are passing the boxed closure "cb" into the clone function as the a
|
||||
// function pointer in C. The box closure in Rust is both a function pointer
|
||||
// and a struct. However, when casting the box closure into libc::c_void,
|
||||
// the function pointer will be lost. Therefore, to work around the issue,
|
||||
// we double box the closure. This is consistant with how std::unix::thread
|
||||
// handles the closure.
|
||||
// Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs
|
||||
let data = Box::into_raw(Box::new(cb));
|
||||
// The main is a wrapper function passed into clone call below. The "data"
|
||||
// arg is actually a raw pointer to a Box closure. so here, we re-box the
|
||||
// pointer back into a box closure so the main takes ownership of the
|
||||
// memory. Then we can call the closure passed in.
|
||||
extern "C" fn main(data: *mut libc::c_void) -> libc::c_int {
|
||||
unsafe { Box::from_raw(data as *mut CloneCb)() as i32 }
|
||||
}
|
||||
|
||||
// Using the clone syscall is one of the rare cases where we don't want
|
||||
// rust to manage the child stack memory. Instead, we want to use
|
||||
// c_void directly here. The nix::sched::clone wrapper doesn't provide
|
||||
// the right interface and its interface can't be changed. Therefore,
|
||||
// here we are using libc::clone syscall directly for better control.
|
||||
// The child stack will be cleaned when exec is called or the child
|
||||
// process terminates.
|
||||
libc::clone(
|
||||
mem::transmute(callback as extern "C" fn(*mut Box<dyn FnMut() -> isize>) -> i32),
|
||||
child_stack_top,
|
||||
combined,
|
||||
&mut cb as *mut _ as *mut c_void,
|
||||
)
|
||||
};
|
||||
let pid = Errno::result(res).map(Pid::from_raw)?;
|
||||
|
||||
Ok(pid)
|
||||
// The nix::sched::clone wrapper doesn't provide the right interface. Using
|
||||
// the clone syscall is one of the rare cases where we don't want rust to
|
||||
// manage the child stack memory. Instead, we want to use c_void directly
|
||||
// here. Therefore, here we are using libc::clone syscall directly for
|
||||
// better control. The child stack will be cleaned when exec is called or
|
||||
// the child process terminates. The nix wrapper also does not treat the
|
||||
// closure memory correctly. The wrapper implementation fails to pass the
|
||||
// right ownership to the new child process.
|
||||
// Ref: https://github.com/nix-rust/nix/issues/919
|
||||
// Ref: https://github.com/nix-rust/nix/pull/920
|
||||
let res = unsafe { libc::clone(main, child_stack_top, combined, data as *mut libc::c_void) };
|
||||
match res {
|
||||
-1 => {
|
||||
// Since the clone call failed, the closure passed in didn't get
|
||||
// consumed. To complete the circle, we can safely box up the
|
||||
// closure again and let rust manage this memory for us.
|
||||
unsafe { drop(Box::from_raw(data)) };
|
||||
bail!(
|
||||
"Failed clone to create new process: {:?}",
|
||||
Errno::result(res)
|
||||
)
|
||||
}
|
||||
pid => Ok(Pid::from_raw(pid)),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use anyhow::bail;
|
||||
use nix::sys::wait;
|
||||
use nix::unistd;
|
||||
|
||||
#[test]
|
||||
@ -115,7 +137,7 @@ mod tests {
|
||||
// namespace is needed for the test to run without root
|
||||
let flags = sched::CloneFlags::CLONE_NEWPID | sched::CloneFlags::CLONE_NEWUSER;
|
||||
let pid = super::clone(
|
||||
Box::new(|| {
|
||||
Box::new(move || {
|
||||
if cb().is_err() {
|
||||
return -1;
|
||||
}
|
||||
@ -165,4 +187,27 @@ mod tests {
|
||||
|
||||
bail!("Process didn't exit correctly")
|
||||
}
|
||||
|
||||
fn clone_closure_ownership_test_payload() -> super::CloneCb {
|
||||
// The vec should not be deallocated after this function returns. The
|
||||
// ownership should correctly transfer to the closure returned, to be
|
||||
// passed to the clone and new child process.
|
||||
let numbers: Vec<i32> = (0..101).into_iter().collect();
|
||||
Box::new(move || {
|
||||
assert_eq!(numbers.iter().sum::<i32>(), 5050);
|
||||
0
|
||||
})
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_clone_closure_ownership() -> Result<()> {
|
||||
let flags = sched::CloneFlags::empty();
|
||||
|
||||
let pid = super::clone(clone_closure_ownership_test_payload(), flags)?;
|
||||
let exit_status =
|
||||
wait::waitpid(pid, Some(wait::WaitPidFlag::__WALL)).expect("Waiting for child");
|
||||
assert_eq!(exit_status, wait::WaitStatus::Exited(pid, 0));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user