1
0
Fork 0
mirror of https://github.com/containers/youki synced 2024-06-10 00:36:16 +02:00

Implement double fork

This commit is contained in:
yihuaf 2021-08-19 20:25:08 +02:00
parent 414fa3a448
commit 0f9490c68b
10 changed files with 540 additions and 742 deletions

View File

@ -60,7 +60,7 @@ jobs:
- name: Build
run: ./build.sh --release
- name: Run tests
run: cargo test
run: cargo test -- --nocapture
- name: Run doc tests
run: cargo test --doc
- name: Run cgroup tests

View File

@ -1,5 +1,4 @@
use anyhow::{Context, Result};
use nix::sched::CloneFlags;
use cgroups;
@ -8,8 +7,7 @@ use std::{fs, os::unix::prelude::RawFd, path::PathBuf};
use crate::{
hooks,
namespaces::Namespaces,
process::{child, fork, init, parent},
process::{channel, fork, init},
rootless::Rootless,
syscall::linux::LinuxSyscall,
utils,
@ -59,16 +57,16 @@ impl<'a> ContainerBuilderImpl<'a> {
let cgroups_path = utils::get_cgroup_path(&linux.cgroups_path, &self.container_id);
let cmanager = cgroups::common::create_cgroup_manager(&cgroups_path, self.use_systemd)?;
// create the parent and child process structure so the parent and child process can sync with each other
let (mut parent, parent_channel) = parent::ParentProcess::new(&self.rootless)?;
let child = child::ChildProcess::new(parent_channel)?;
if self.init {
if let Some(hooks) = self.spec.hooks.as_ref() {
hooks::run_hooks(hooks.create_runtime.as_ref(), self.container.as_ref())?
}
}
// We use a set of channels to communicate between parent and child process. Each channel is uni-directional.
let parent_to_child = &mut channel::Channel::new()?;
let child_to_parent = &mut channel::Channel::new()?;
// This init_args will be passed to the container init process,
// therefore we will have to move all the variable by value. Since self
// is a shared reference, we have to clone these variables here.
@ -82,30 +80,24 @@ impl<'a> ContainerBuilderImpl<'a> {
notify_path: self.notify_path.clone(),
preserve_fds: self.preserve_fds,
container: self.container.clone(),
child,
};
let intermediate_pid = fork::container_fork(|| {
init::container_intermidiate(init_args, parent_to_child, child_to_parent)
})?;
// If creating a rootless container, the intermediate process will ask
// the main process to set up uid and gid mapping, once the intermediate
// process enters into a new user namespace.
if self.rootless.is_some() {
child_to_parent.wait_for_mapping_request(
intermediate_pid,
self.rootless.as_ref(),
parent_to_child,
)?;
}
// We have to box up this closure to correctly pass to the init function
// of the new process.
let cb = Box::new(move || {
if let Err(error) = init::container_init(init_args) {
log::debug!("failed to run container_init: {:?}", error);
return -1;
}
0
});
let clone_flags = linux
.namespaces
.as_ref()
.map(|ns| Namespaces::from(ns).clone_flags)
.unwrap_or_else(CloneFlags::empty);
let init_pid = fork::clone(cb, clone_flags)?;
let init_pid = child_to_parent.wait_for_child_ready()?;
log::debug!("init pid is {:?}", init_pid);
parent.wait_for_child_ready(init_pid)?;
cmanager.add_task(init_pid)?;
if self.rootless.is_none() && linux.resources.is_some() && self.init {
cmanager.apply(linux.resources.as_ref().unwrap())?;

View File

@ -8,86 +8,85 @@
//! Cgroup (Resource limits, execution priority etc.)
use crate::syscall::{syscall::create_syscall, Syscall};
use anyhow::Result;
use nix::{
fcntl,
sched::{self, CloneFlags},
sys::stat,
unistd::{self, Gid, Uid},
};
use oci_spec::LinuxNamespace;
use anyhow::{Context, Result};
use nix::{fcntl, sched::CloneFlags, sys::stat, unistd};
use oci_spec::{LinuxNamespace, LinuxNamespaceType};
use std::collections;
/// Holds information about namespaces
pub struct Namespaces<'a> {
spaces: &'a Vec<LinuxNamespace>,
pub struct Namespaces {
command: Box<dyn Syscall>,
pub clone_flags: CloneFlags,
namespace_map: collections::HashMap<CloneFlags, LinuxNamespace>,
}
impl<'a> From<&'a Vec<LinuxNamespace>> for Namespaces<'a> {
fn from(namespaces: &'a Vec<LinuxNamespace>) -> Self {
let clone_flags = namespaces.iter().filter(|ns| ns.path.is_none()).fold(
CloneFlags::empty(),
|mut cf, ns| {
cf |= CloneFlags::from_bits_truncate(ns.typ as i32);
cf
},
);
let command: Box<dyn Syscall> = create_syscall();
Namespaces {
spaces: namespaces,
command,
clone_flags,
}
fn get_clone_flag(namespace_type: LinuxNamespaceType) -> CloneFlags {
match namespace_type {
LinuxNamespaceType::Pid => CloneFlags::CLONE_NEWPID,
LinuxNamespaceType::User => CloneFlags::CLONE_NEWUSER,
LinuxNamespaceType::Uts => CloneFlags::CLONE_NEWUTS,
LinuxNamespaceType::Cgroup => CloneFlags::CLONE_NEWCGROUP,
LinuxNamespaceType::Ipc => CloneFlags::CLONE_NEWIPC,
LinuxNamespaceType::Network => CloneFlags::CLONE_NEWNET,
LinuxNamespaceType::Mount => CloneFlags::CLONE_NEWNS,
}
}
impl<'a> Namespaces<'a> {
pub fn apply_setns(&self) -> Result<()> {
let to_enter: Vec<(CloneFlags, i32)> = self
.spaces
impl From<Option<&Vec<LinuxNamespace>>> for Namespaces {
fn from(namespaces: Option<&Vec<LinuxNamespace>>) -> Self {
let command: Box<dyn Syscall> = create_syscall();
let namespace_map: collections::HashMap<CloneFlags, LinuxNamespace> = namespaces
.unwrap_or(&vec![])
.iter()
.filter(|ns| ns.path.is_some()) // filter those which are actually present on the system
.map(|ns| {
let space = CloneFlags::from_bits_truncate(ns.typ as i32);
let fd = fcntl::open(
&*ns.path.as_ref().unwrap(),
fcntl::OFlag::empty(),
stat::Mode::empty(),
)
.unwrap();
(space, fd)
})
.map(|ns| (get_clone_flag(ns.typ), ns.clone()))
.collect();
for &(space, fd) in &to_enter {
// set the namespace
self.command.set_ns(fd, space)?;
unistd::close(fd)?;
// if namespace is cloned with newuser flag, then it creates a new user namespace,
// and we need to set the user and group id to 0
// see https://man7.org/linux/man-pages/man2/clone.2.html for more info
if space == sched::CloneFlags::CLONE_NEWUSER {
self.command.set_id(Uid::from_raw(0), Gid::from_raw(0))?;
}
Namespaces {
command,
namespace_map,
}
}
}
impl Namespaces {
/// Enters every configured namespace whose clone flag is accepted by `filter`.
///
/// Each selected namespace is handed to `unshare_or_setns`. The first
/// failure aborts the iteration and is returned with the namespace type and
/// definition attached as context.
///
/// NOTE(review): namespaces are visited in `HashMap` iteration order, which
/// is unspecified; callers that need a particular ordering must call
/// `unshare_or_setns` themselves.
pub fn apply_namespaces<F: Fn(CloneFlags) -> bool>(&self, filter: F) -> Result<()> {
    // Iterate the filtered view directly; there is no need to materialise
    // the selection into a second map just to walk it once.
    for (ns_type, ns) in self.namespace_map.iter().filter(|(k, _)| filter(**k)) {
        self.unshare_or_setns(ns)
            .with_context(|| format!("Failed to enter {:?} namespace: {:?}", ns_type, ns))?;
    }

    Ok(())
}
/// disassociate given parts context of calling process from other process
// see https://man7.org/linux/man-pages/man2/unshare.2.html for more info
pub fn apply_unshare(&self, without: CloneFlags) -> Result<()> {
self.command.unshare(self.clone_flags & !without)?;
/// Enters a single namespace.
///
/// * When the namespace has no `path`, a new namespace of that type is
///   created through `unshare`.
/// * When a `path` is given, the file is opened and the process joins the
///   namespace it refers to through `set_ns`.
///
/// # Errors
/// Returns an error when unsharing fails, when the namespace file cannot be
/// opened, when joining the namespace fails, or when the descriptor cannot
/// be closed.
pub fn unshare_or_setns(&self, namespace: &LinuxNamespace) -> Result<()> {
    let flag = get_clone_flag(namespace.typ);

    match namespace.path.as_ref() {
        None => {
            self.command.unshare(flag)?;
        }
        Some(ns_path) => {
            let fd = fcntl::open(ns_path, fcntl::OFlag::empty(), stat::Mode::empty())
                .with_context(|| format!("Failed to open namespace fd: {:?}", ns_path))?;
            let joined = self
                .command
                .set_ns(fd, flag)
                .with_context(|| "Failed to set namespace");
            // Always close the descriptor, even when joining failed, so a
            // failed setns does not leak the namespace fd.
            let closed = unistd::close(fd).with_context(|| "Failed to close namespace fd");
            // Report the setns failure first; it is the more useful error.
            joined?;
            closed?;
        }
    }

    Ok(())
}
/// Looks up the namespace definition registered for the given namespace
/// type, if the spec declared one.
pub fn get(&self, k: LinuxNamespaceType) -> Option<&LinuxNamespace> {
    let key = get_clone_flag(k);
    self.namespace_map.get(&key)
}
}
#[cfg(test)]
mod tests {
use oci_spec::LinuxNamespaceType;
use super::*;
use crate::syscall::test::TestHelperSyscall;
use oci_spec::LinuxNamespaceType;
fn gen_sample_linux_namespaces() -> Vec<LinuxNamespace> {
vec![
@ -115,11 +114,13 @@ mod tests {
}
#[test]
fn test_namespaces_set_ns() {
fn test_apply_namespaces() {
let sample_linux_namespaces = gen_sample_linux_namespaces();
let namespaces = Namespaces::from(&sample_linux_namespaces);
let namespaces = Namespaces::from(Some(&sample_linux_namespaces));
let test_command: &TestHelperSyscall = namespaces.command.as_any().downcast_ref().unwrap();
assert!(namespaces.apply_setns().is_ok());
assert!(namespaces
.apply_namespaces(|ns_type| { ns_type != CloneFlags::CLONE_NEWIPC })
.is_ok());
let mut setns_args: Vec<_> = test_command
.get_setns_args()
@ -130,18 +131,10 @@ mod tests {
let mut expect = vec![CloneFlags::CLONE_NEWNS, CloneFlags::CLONE_NEWNET];
expect.sort();
assert_eq!(setns_args, expect);
}
#[test]
fn test_namespaces_unshare() {
let sample_linux_namespaces = gen_sample_linux_namespaces();
let namespaces = Namespaces::from(&sample_linux_namespaces);
assert!(namespaces.apply_unshare(CloneFlags::CLONE_NEWIPC).is_ok());
let test_command: &TestHelperSyscall = namespaces.command.as_any().downcast_ref().unwrap();
let mut unshare_args = test_command.get_unshare_args();
unshare_args.sort();
let mut expect = vec![CloneFlags::CLONE_NEWUSER | CloneFlags::CLONE_NEWPID];
let mut expect = vec![CloneFlags::CLONE_NEWUSER, CloneFlags::CLONE_NEWPID];
expect.sort();
assert_eq!(unshare_args, expect)
}

295
src/process/channel.rs Normal file
View File

@ -0,0 +1,295 @@
use crate::process::message::Message;
use crate::rootless::Rootless;
use crate::utils;
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use mio::unix::pipe;
use mio::unix::pipe::{Receiver, Sender};
use mio::{Events, Interest, Poll, Token};
use nix::unistd::Pid;
use std::io::ErrorKind;
use std::io::Read;
use std::io::Write;
use std::path::Path;
use std::process::Command;
use std::time::Duration;
/// Maximum event capacity of polling
const MAX_EVENTS: usize = 128;
/// Time to wait when polling for message from child process
const WAIT_FOR_CHILD: Duration = Duration::from_secs(5);
/// Time to wait when polling for mapping ack from parent
const WAIT_FOR_MAPPING: Duration = Duration::from_secs(3);
// Token is used to identify which socket generated an event
const PARENT: Token = Token(0);
/// A uni-directional message channel between two processes, built on a pipe.
pub struct Channel {
    /// Writing end of the pipe; all `send_*` methods write to it.
    sender: Sender,
    /// Reading end of the pipe; registered with `poll` under the `PARENT` token.
    receiver: Receiver,
    /// Poller used to wait for `receiver` to become readable.
    poll: Poll,
}
impl Channel {
    /// Creates a new channel: a pipe whose reading end is registered for
    /// readable events with a fresh poller under the `PARENT` token.
    pub fn new() -> Result<Self> {
        let poll = Poll::new()?;
        let (sender, mut receiver) = pipe::new()?;
        poll.registry()
            .register(&mut receiver, PARENT, Interest::READABLE)?;
        Ok(Self {
            sender,
            receiver,
            poll,
        })
    }

    /// Sends `Message::ChildReady` followed by `pid` as four big-endian bytes.
    pub fn send_child_ready(&mut self, pid: Pid) -> Result<()> {
        log::debug!("sending init pid ({:?})", pid);
        // Tag and pid go out in a single write so the reader is far less
        // likely to observe the tag without its payload.
        let mut frame = [0u8; 5];
        frame[0] = Message::ChildReady as u8;
        frame[1..].copy_from_slice(&pid.as_raw().to_be_bytes());
        self.sender.write_all(&frame)?;
        Ok(())
    }

    /// Requests the parent to write the id mappings for the child process.
    ///
    /// This needs to be done from the parent, see
    /// https://man7.org/linux/man-pages/man7/user_namespaces.7.html
    pub fn send_identifier_mapping_request(&mut self) -> Result<()> {
        log::debug!("send identifier mapping request");
        self.write_message(Message::WriteMapping)?;
        Ok(())
    }

    /// Tells the peer that the id mappings have been written.
    pub fn send_mapping_written(&mut self) -> Result<()> {
        log::debug!("identifier mapping written");
        self.write_message(Message::MappingWritten)?;
        Ok(())
    }

    /// Waits until the parent process has finished writing the id mappings.
    ///
    /// # Errors
    /// Fails when no message arrives within `WAIT_FOR_MAPPING`, when reading
    /// from the pipe fails, or when a message other than
    /// `Message::MappingWritten` is received.
    pub fn wait_for_mapping_ack(&mut self) -> Result<()> {
        log::debug!("waiting for mapping ack");
        let mut buf = [0; 1];
        // A timeout is an ordinary runtime condition here (e.g. the parent
        // died), so it is reported as an error instead of panicking.
        self.recv_exact(&mut buf, WAIT_FOR_MAPPING, true)
            .with_context(|| "failed while waiting for mapping ack")?;

        match Message::from(u8::from_be_bytes(buf)) {
            Message::MappingWritten => Ok(()),
            msg => bail!(
                "receive unexpected message {:?} in waiting for mapping ack",
                msg
            ),
        }
    }

    /// Waits for a `Message::WriteMapping` request from the child, writes
    /// the `setgroups`, uid and gid mapping files of `child_pid`, and then
    /// acknowledges through `callback`.
    ///
    /// The wait is retried in `WAIT_FOR_CHILD` slices until a message
    /// arrives; there is no overall deadline.
    ///
    /// # Errors
    /// Fails when reading from the pipe fails, when an unexpected message is
    /// received, when a mapping file cannot be written, or when the
    /// acknowledgement cannot be sent.
    pub fn wait_for_mapping_request(
        &mut self,
        child_pid: Pid,
        rootless: Option<&Rootless>,
        callback: &mut Channel,
    ) -> Result<()> {
        let mut buf = [0; 1];
        self.recv_exact(&mut buf, WAIT_FOR_CHILD, false)?;

        // convert to Message wrapper
        match Message::from(u8::from_be_bytes(buf)) {
            Message::WriteMapping => {
                log::debug!("write mapping for pid {:?}", child_pid);
                utils::write_file(format!("/proc/{}/setgroups", child_pid), "deny")?;
                write_uid_mapping(child_pid, rootless)?;
                write_gid_mapping(child_pid, rootless)?;
                callback.send_mapping_written()?;
                Ok(())
            }
            msg => bail!(
                "receive unexpected message {:?} waiting for mapping request",
                msg
            ),
        }
    }

    /// Waits for the associated child process to send its ready message and
    /// returns the pid of the init process carried in that message.
    ///
    /// The wait is retried in `WAIT_FOR_CHILD` slices until a message
    /// arrives; there is no overall deadline.
    ///
    /// # Errors
    /// Fails when reading from the pipe fails or when a message other than
    /// `Message::ChildReady` is received.
    pub fn wait_for_child_ready(&mut self) -> Result<Pid> {
        let mut buf = [0; 1];
        self.recv_exact(&mut buf, WAIT_FOR_CHILD, false)?;

        // convert to Message wrapper
        match Message::from(u8::from_be_bytes(buf)) {
            Message::ChildReady => {
                log::debug!("received child ready message");
                // The pid payload may not have arrived yet; keep waiting for
                // it rather than decoding a zeroed buffer.
                let mut buf = [0; 4];
                self.recv_exact(&mut buf, WAIT_FOR_CHILD, false)?;
                Ok(Pid::from_raw(i32::from_be_bytes(buf)))
            }
            msg => bail!(
                "receive unexpected message {:?} waiting for child ready",
                msg
            ),
        }
    }

    /// Fills `buf` completely from the receiving end of the pipe.
    ///
    /// The read is attempted first; only when the pipe reports `WouldBlock`
    /// does this wait on the poller for up to `timeout`. When a wait elapses
    /// without any event, the call fails if `fail_on_timeout` is set and
    /// keeps waiting otherwise.
    fn recv_exact(
        &mut self,
        buf: &mut [u8],
        timeout: Duration,
        fail_on_timeout: bool,
    ) -> Result<()> {
        let mut events = Events::with_capacity(MAX_EVENTS);
        let mut filled = 0;

        while filled < buf.len() {
            match self.receiver.read(&mut buf[filled..]) {
                Ok(0) => bail!(
                    "Failed to receive a message from the child process. channel closed"
                ),
                Ok(n) => filled += n,
                Err(e) if e.kind() == ErrorKind::Interrupted => {}
                Err(e) if e.kind() == ErrorKind::WouldBlock => {
                    // Nothing to read yet: wait for the pipe to become
                    // readable, then retry the read.
                    self.poll.poll(&mut events, Some(timeout))?;
                    if fail_on_timeout && events.is_empty() {
                        bail!("timed out after {:?} waiting for a message", timeout);
                    }
                }
                Err(e) => bail!(
                    "Failed to receive a message from the child process. {:?}",
                    e
                ),
            }
        }

        Ok(())
    }

    /// Writes the one-byte tag of `msg` to the sending end of the pipe.
    #[inline]
    fn write_message(&mut self, msg: Message) -> Result<()> {
        self.sender.write_all(&(msg as u8).to_be_bytes())?;
        Ok(())
    }
}
/// Writes the uid mappings of the rootless configuration to
/// `/proc/<target_pid>/uid_map`.
///
/// Does nothing when `rootless` is `None` or carries no uid mappings.
fn write_uid_mapping(target_pid: Pid, rootless: Option<&Rootless>) -> Result<()> {
    if let Some(rootless) = rootless {
        // The uid map must be built from the uid mappings, not the gid ones.
        if let Some(uid_mappings) = rootless.uid_mappings {
            return write_id_mapping(
                &format!("/proc/{}/uid_map", target_pid),
                uid_mappings,
                rootless.newuidmap.as_deref(),
            );
        }
    }
    Ok(())
}
/// Writes the gid mappings of the rootless configuration to
/// `/proc/<target_pid>/gid_map`.
///
/// Does nothing when `rootless` is `None` or carries no gid mappings.
fn write_gid_mapping(target_pid: Pid, rootless: Option<&Rootless>) -> Result<()> {
    let rootless = match rootless {
        Some(config) => config,
        None => return Ok(()),
    };
    let gid_mappings = match rootless.gid_mappings {
        Some(mappings) => mappings,
        None => return Ok(()),
    };

    let map_file = format!("/proc/{}/gid_map", target_pid);
    write_id_mapping(&map_file, gid_mappings, rootless.newgidmap.as_deref())
}
fn write_id_mapping(
map_file: &str,
mappings: &[oci_spec::LinuxIdMapping],
map_binary: Option<&Path>,
) -> Result<()> {
let mappings: Vec<String> = mappings
.iter()
.map(|m| format!("{} {} {}", m.container_id, m.host_id, m.size))
.collect();
if mappings.len() == 1 {
utils::write_file(map_file, mappings.first().unwrap())?;
} else {
Command::new(map_binary.unwrap())
.args(mappings)
.output()
.with_context(|| format!("failed to execute {:?}", map_binary))?;
}
Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;
    use nix::sys::wait;
    use nix::unistd;

    /// The child announces its own pid; the parent must read back exactly
    /// that pid.
    #[test]
    fn test_channel_child_ready() -> Result<()> {
        let ch = &mut Channel::new()?;
        match unsafe { unistd::fork()? } {
            unistd::ForkResult::Parent { child } => {
                // Checking the exit status first makes a failed send show up
                // as a test failure instead of an endless wait below.
                let status = wait::waitpid(child, None)?;
                assert_eq!(status, wait::WaitStatus::Exited(child, 0));
                let pid = ch.wait_for_child_ready()?;
                assert_eq!(pid, child);
            }
            unistd::ForkResult::Child => {
                let pid = unistd::getpid();
                // The child must never return into the test harness, so a
                // failure is turned into an exit code instead of using `?`.
                let code = if ch.send_child_ready(pid).is_ok() { 0 } else { 1 };
                std::process::exit(code);
            }
        };

        Ok(())
    }

    /// The child sends the mapping acknowledgement; the parent must accept it.
    #[test]
    fn test_channel_id_mapping() -> Result<()> {
        let ch = &mut Channel::new()?;
        match unsafe { unistd::fork()? } {
            unistd::ForkResult::Parent { child } => {
                ch.wait_for_mapping_ack()?;
                let status = wait::waitpid(child, None)?;
                assert_eq!(status, wait::WaitStatus::Exited(child, 0));
            }
            unistd::ForkResult::Child => {
                // See above: always exit, never unwind into the harness.
                let code = if ch.send_mapping_written().is_ok() { 0 } else { 1 };
                std::process::exit(code);
            }
        };

        Ok(())
    }
}

View File

@ -1,67 +0,0 @@
use anyhow::Result;
use mio::unix::pipe;
use mio::unix::pipe::Receiver;
use mio::unix::pipe::Sender;
use mio::{Interest, Poll, Token};
use super::parent::ParentChannel;
// Token is used to identify which socket generated an event
const CHILD: Token = Token(1);
/// Contains sending end of pipe for parent process, receiving end of pipe
/// for the init process and poller for that
pub struct ChildProcess {
parent_channel: ParentChannel,
receiver: Option<Receiver>,
poll: Option<Poll>,
}
// Note: The original Youki process "forks" a child process using clone(2). The
// child process will become the container init process, where it will set up
// namespaces, device mounts, and etc. for the container process. Finally, the
// container init process will run the actual container payload through exec
// call. The ChildProcess will be used to synchronize between the Youki main
// process and the child process (container init process).
impl ChildProcess {
/// create a new Child process structure
pub fn new(parent_channel: ParentChannel) -> Result<Self> {
Ok(Self {
parent_channel,
receiver: None,
poll: None,
})
}
/// sets up sockets for init process
pub fn setup_pipe(&mut self) -> Result<Sender> {
// create a new pipe
let (sender, mut receiver) = pipe::new()?;
// create a new poll, and register the receiving end of pipe to it
// This will poll for the read events, so when data is written to sending end of the pipe,
// the receiving end will be readable and poll wil notify
let poll = Poll::new()?;
poll.registry()
.register(&mut receiver, CHILD, Interest::READABLE)?;
self.receiver = Some(receiver);
self.poll = Some(poll);
Ok(sender)
}
/// Indicate that child process has forked the init process to parent process
pub fn notify_parent(&mut self) -> Result<()> {
self.parent_channel.send_child_ready()?;
Ok(())
}
pub fn request_identifier_mapping(&mut self) -> Result<()> {
self.parent_channel.request_identifier_mapping()?;
Ok(())
}
pub fn wait_for_mapping_ack(&mut self) -> Result<()> {
self.parent_channel.wait_for_mapping_ack()?;
Ok(())
}
}

View File

@ -1,222 +1,24 @@
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use nix::errno::Errno;
use nix::sched;
use nix::sys;
use nix::sys::mman;
use nix::unistd;
use nix::unistd::Pid;
use std::ptr;
// The clone callback is used in clone call. It is a boxed closure and it needs
// to trasfer the ownership of related memory to the new process.
type CloneCb = Box<dyn FnOnce() -> isize + Send>;
const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8M
/// clone uses syscall clone(2) to create a new process for the container init
/// process. Using clone syscall gives us better control over how to can create
/// the new container process, where we can enter into namespaces directly instead
/// of using unshare and fork. This call will only create one new process, instead
/// of two using fork.
pub fn clone(cb: CloneCb, clone_flags: sched::CloneFlags) -> Result<Pid> {
// Use sysconf to find the page size. If there is an error, we assume
// the default 4K page size.
let page_size: usize = unsafe {
match libc::sysconf(libc::_SC_PAGE_SIZE) {
-1 => 4 * 1024, // default to 4K page size
x => x as usize,
// Execute the cb in another process. This makes fork work more like thread_spawn
// or clone, so it is easier to reason about. Compared to the clone call, fork is
// easier to use since fork will magically take care of all the variable copying.
// If using clone, we would have to manually make sure all the variables are
// correctly sent to the new process, and satisfying the Rust borrow checker in
// every detail would be a lot of hassle.
pub fn container_fork<F: FnOnce() -> Result<()>>(cb: F) -> Result<Pid> {
match unsafe { unistd::fork()? } {
unistd::ForkResult::Parent { child } => Ok(child),
unistd::ForkResult::Child => {
let ret = if let Err(error) = cb() {
log::debug!("failed to run fork: {:?}", error);
-1
} else {
0
};
std::process::exit(ret);
}
};
// Find out the default stack max size through getrlimit.
let mut rlimit = libc::rlimit {
rlim_cur: 0,
rlim_max: 0,
};
unsafe { Errno::result(libc::getrlimit(libc::RLIMIT_STACK, &mut rlimit))? };
// mmap will return ENOMEM if stack size is unlimited
let default_stack_size = if rlimit.rlim_cur != u64::MAX {
rlimit.rlim_cur as usize
} else {
log::info!("stack size returned by getrlimit() is unlimited, use DEFAULT_STACK_SIZE(8MB)");
DEFAULT_STACK_SIZE
};
// Using the clone syscall requires us to create the stack space for the
// child process instead of taken cared for us like fork call. We use mmap
// here to create the stack. Instead of guessing how much space the child
// process needs, we allocate through mmap to the system default limit,
// which is 8MB on most of the linux system today. This is OK since mmap
// will only researve the address space upfront, instead of allocating
// physical memory upfront. The stack will grow as needed, up to the size
// researved, so no wasted memory here. Lastly, the child stack only needs
// to support the container init process set up code in Youki. When Youki
// calls exec into the container payload, exec will reset the stack. Note,
// do not use MAP_GROWSDOWN since it is not well supported.
// Ref: https://man7.org/linux/man-pages/man2/mmap.2.html
let child_stack = unsafe {
mman::mmap(
ptr::null_mut(),
default_stack_size,
mman::ProtFlags::PROT_READ | mman::ProtFlags::PROT_WRITE,
mman::MapFlags::MAP_PRIVATE | mman::MapFlags::MAP_ANONYMOUS | mman::MapFlags::MAP_STACK,
-1,
0,
)?
};
// Consistant with how pthread_create sets up the stack, we create a
// guard page of 1 page, to protect the child stack collision. Note, for
// clone call, the child stack will grow downward, so the bottom of the
// child stack is in the beginning.
unsafe {
mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE)
.with_context(|| "Failed to create guard page")?
};
// Since the child stack for clone grows downward, we need to pass in
// the top of the stack address.
let child_stack_top = unsafe { child_stack.add(default_stack_size) };
// Adds SIGCHLD flag to mimic the same behavior as fork.
let signal = sys::signal::Signal::SIGCHLD;
let combined = clone_flags.bits() | signal as libc::c_int;
// We are passing the boxed closure "cb" into the clone function as the a
// function pointer in C. The box closure in Rust is both a function pointer
// and a struct. However, when casting the box closure into libc::c_void,
// the function pointer will be lost. Therefore, to work around the issue,
// we double box the closure. This is consistant with how std::unix::thread
// handles the closure.
// Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs
let data = Box::into_raw(Box::new(cb));
// The main is a wrapper function passed into clone call below. The "data"
// arg is actually a raw pointer to a Box closure. so here, we re-box the
// pointer back into a box closure so the main takes ownership of the
// memory. Then we can call the closure passed in.
extern "C" fn main(data: *mut libc::c_void) -> libc::c_int {
unsafe { Box::from_raw(data as *mut CloneCb)() as i32 }
}
// The nix::sched::clone wrapper doesn't provide the right interface. Using
// the clone syscall is one of the rare cases where we don't want rust to
// manage the child stack memory. Instead, we want to use c_void directly
// here. Therefore, here we are using libc::clone syscall directly for
// better control. The child stack will be cleaned when exec is called or
// the child process terminates. The nix wrapper also does not treat the
// closure memory correctly. The wrapper implementation fails to pass the
// right ownership to the new child process.
// Ref: https://github.com/nix-rust/nix/issues/919
// Ref: https://github.com/nix-rust/nix/pull/920
let res = unsafe { libc::clone(main, child_stack_top, combined, data as *mut libc::c_void) };
match res {
-1 => {
// Since the clone call failed, the closure passed in didn't get
// consumed. To complete the circle, we can safely box up the
// closure again and let rust manage this memory for us.
unsafe { drop(Box::from_raw(data)) };
bail!(
"Failed clone to create new process: {:?}",
Errno::result(res)
)
}
pid => Ok(Pid::from_raw(pid)),
}
}
#[cfg(test)]
mod tests {
use super::*;
use anyhow::bail;
use nix::sys::wait;
use nix::unistd;
#[test]
fn test_fork_clone() -> Result<()> {
let cb = || -> Result<()> {
// In a new pid namespace, pid of this process should be 1
let pid = unistd::getpid();
assert_eq!(unistd::Pid::from_raw(1), pid, "PID should set to 1");
Ok(())
};
// For now, we test clone with new pid and user namespace. user
// namespace is needed for the test to run without root
let flags = sched::CloneFlags::CLONE_NEWPID | sched::CloneFlags::CLONE_NEWUSER;
let pid = super::clone(
Box::new(move || {
if cb().is_err() {
return -1;
}
0
}),
flags,
)?;
let status = nix::sys::wait::waitpid(pid, None)?;
if let nix::sys::wait::WaitStatus::Exited(_, exit_code) = status {
assert_eq!(
0, exit_code,
"Process didn't exit correctly {:?}",
exit_code
);
return Ok(());
}
bail!("Process didn't exit correctly")
}
#[test]
fn test_clone_stack_allocation() -> Result<()> {
let flags = sched::CloneFlags::empty();
let pid = super::clone(
Box::new(|| {
let mut array_on_stack = [0u8; 4096];
array_on_stack.iter_mut().for_each(|x| *x = 0);
0
}),
flags,
)?;
let status = nix::sys::wait::waitpid(pid, None)?;
if let nix::sys::wait::WaitStatus::Exited(_, exit_code) = status {
assert_eq!(
0, exit_code,
"Process didn't exit correctly {:?}",
exit_code
);
return Ok(());
}
bail!("Process didn't exit correctly")
}
fn clone_closure_ownership_test_payload() -> super::CloneCb {
// The vec should not be deallocated after this function returns. The
// ownership should correctly transfer to the closure returned, to be
// passed to the clone and new child process.
let numbers: Vec<i32> = (0..101).into_iter().collect();
Box::new(move || {
assert_eq!(numbers.iter().sum::<i32>(), 5050);
0
})
}
#[test]
fn test_clone_closure_ownership() -> Result<()> {
let flags = sched::CloneFlags::empty();
let pid = super::clone(clone_closure_ownership_test_payload(), flags)?;
let exit_status =
wait::waitpid(pid, Some(wait::WaitPidFlag::__WALL)).expect("Waiting for child");
assert_eq!(exit_status, wait::WaitStatus::Exited(pid, 0));
Ok(())
}
}

View File

@ -1,12 +1,13 @@
use anyhow::{bail, Context, Result};
use nix::mount::mount as nix_mount;
use nix::mount::MsFlags;
use nix::sched::CloneFlags;
use nix::{
fcntl, sched,
sys::statfs,
unistd::{self, Gid, Uid},
unistd::{self, Gid, Pid, Uid},
};
use oci_spec::Spec;
use oci_spec::{LinuxNamespaceType, Spec};
use std::collections::HashMap;
use std::{
env,
@ -20,7 +21,8 @@ use crate::{
hooks,
namespaces::Namespaces,
notify_socket::NotifyListener,
process::child,
process::channel,
process::fork,
rootfs,
syscall::{linux::LinuxSyscall, Syscall},
tty, utils,
@ -91,6 +93,59 @@ fn cleanup_file_descriptors(preserve_fds: i32) -> Result<()> {
Ok(())
}
/// Applies kernel parameters by writing each value to its file under
/// `/proc/sys`, with the dots of the parameter name turned into path
/// separators. Stops at the first write that fails.
fn sysctl(kernel_params: &HashMap<String, String>) -> Result<()> {
    let proc_sys = PathBuf::from("/proc/sys");

    kernel_params.iter().try_for_each(|(name, setting)| {
        let target = proc_sys.join(name.replace(".", "/"));
        log::debug!("apply value {} to kernel parameter {}.", setting, name);
        fs::write(target, setting.as_bytes())
            .with_context(|| format!("failed to set sysctl {}={}", name, setting))
    })
}
// Makes `path` read-only by bind mounting it onto itself.
// On the initial bind mount the other flags are ignored, so the path is
// bind mounted first and then remounted with the restrictive flags.
// https://man7.org/linux/man-pages/man2/mount.2.html
fn readonly_path(path: &str) -> Result<()> {
    let bind_result = nix_mount::<str, str, str, str>(
        Some(path),
        path,
        None::<&str>,
        MsFlags::MS_BIND | MsFlags::MS_REC,
        None::<&str>,
    );

    if let Err(errno) = bind_result {
        // A path that does not exist is skipped rather than treated as an error.
        if matches!(errno, nix::errno::Errno::ENOENT) {
            log::warn!("readonly path {:?} not exist", path);
            return Ok(());
        }
        bail!(errno);
    }

    let remount_flags = MsFlags::MS_NOSUID
        | MsFlags::MS_NODEV
        | MsFlags::MS_NOEXEC
        | MsFlags::MS_BIND
        | MsFlags::MS_REMOUNT
        | MsFlags::MS_RDONLY;
    nix_mount::<str, str, str, str>(Some(path), path, None::<&str>, remount_flags, None::<&str>)?;

    log::debug!("readonly path {:?} mounted", path);
    Ok(())
}
pub struct ContainerInitArgs {
/// Flag indicating if an init or a tenant container should be created
pub init: bool,
@ -110,23 +165,16 @@ pub struct ContainerInitArgs {
pub preserve_fds: i32,
/// Container state
pub container: Option<Container>,
/// Pipe used to communicate with the child process
pub child: child::ChildProcess,
}
pub fn container_init(args: ContainerInitArgs) -> Result<()> {
pub fn container_intermidiate(
args: ContainerInitArgs,
main_to_intermediate: &mut channel::Channel,
intermediate_to_main: &mut channel::Channel,
) -> Result<()> {
let command = &args.syscall;
let spec = &args.spec;
let linux = spec.linux.as_ref().context("no linux in spec")?;
// need to create the notify socket before we pivot root, since the unix
// domain socket used here is outside of the rootfs of container
let mut notify_socket: NotifyListener = NotifyListener::new(&args.notify_path)?;
let proc = spec.process.as_ref().context("no process in spec")?;
let mut envs: Vec<String> = proc.env.as_ref().unwrap_or(&vec![]).clone();
let rootfs = &args.rootfs;
let hooks = spec.hooks.as_ref();
let container = args.container.as_ref();
let mut child = args.child;
// if Out-of-memory score adjustment is set in specification. set the score
// value for the current process check
@ -144,15 +192,18 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
// https://man7.org/linux/man-pages/man7/user_namespaces.7.html for more
// information
if args.is_rootless {
log::debug!("creating new user namespace");
sched::unshare(sched::CloneFlags::CLONE_NEWUSER)?;
// child needs to be dumpable, otherwise the non root parent is not
// allowed to write the uid/gid maps
prctl::set_dumpable(true).unwrap();
child.request_identifier_mapping()?;
child.wait_for_mapping_ack()?;
intermediate_to_main.send_identifier_mapping_request()?;
main_to_intermediate.wait_for_mapping_ack()?;
prctl::set_dumpable(false).unwrap();
}
// set limits and namespaces to the process
let proc = spec.process.as_ref().context("no process in spec")?;
if let Some(rlimits) = proc.rlimits.as_ref() {
for rlimit in rlimits.iter() {
command.set_rlimit(rlimit).context("failed to set rlimit")?;
@ -163,21 +214,72 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
.set_id(Uid::from_raw(0), Gid::from_raw(0))
.context("failed to become root")?;
// set up tty if specified
if let Some(csocketfd) = args.console_socket {
tty::setup_console(&csocketfd)?;
// Pid namespace requires an extra fork to enter, so we enter pid namespace now.
let namespaces = Namespaces::from(linux.namespaces.as_ref());
if let Some(pid_namespace) = namespaces.get(LinuxNamespaceType::Pid) {
namespaces
.unshare_or_setns(pid_namespace)
.with_context(|| format!("Failed to enter pid namespace: {:?}", pid_namespace))?;
}
// join existing namespaces
let bind_service = if let Some(ns) = linux.namespaces.as_ref() {
let namespaces = Namespaces::from(ns);
namespaces.apply_setns()?;
// We only need for init process to send us the ChildReady.
let child_to_parent = &mut channel::Channel::new()?;
// We reuse the args passed in, but replace with a new set of channels.
let init_args = ContainerInitArgs { ..args };
// We have to record the pid of the child (container init process), since
// the child will be inside the pid namespace. We can't rely on child_ready
// to send us the correct pid.
let pid = fork::container_fork(|| container_init(init_args, child_to_parent))?;
// There is no point using the pid returned here, since the child will be
// inside the pid namespace already.
child_to_parent.wait_for_child_ready()?;
// After the child (the container init process) becomes ready, we can signal
// the parent (the main process) that we are ready.
intermediate_to_main.send_child_ready(pid)?;
Ok(())
}
pub fn container_init(
args: ContainerInitArgs,
init_to_intermediate: &mut channel::Channel,
) -> Result<()> {
let command = &args.syscall;
let spec = &args.spec;
let linux = spec.linux.as_ref().context("no linux in spec")?;
// Need to create the notify socket before we pivot root, since the unix
// domain socket used here is outside of the rootfs of container. During
// exec, need to create the socket before we enter into existing mount
// namespace.
let mut notify_socket: NotifyListener = NotifyListener::new(&args.notify_path)?;
let proc = spec.process.as_ref().context("no process in spec")?;
let mut envs: Vec<String> = proc.env.as_ref().unwrap_or(&vec![]).clone();
let rootfs = &args.rootfs;
let hooks = spec.hooks.as_ref();
let container = args.container.as_ref();
let namespaces = Namespaces::from(linux.namespaces.as_ref());
// set up tty if specified
if let Some(csocketfd) = args.console_socket {
tty::setup_console(&csocketfd).with_context(|| "Failed to set up tty")?;
}
// Enter into the rest of the namespaces. Note, we already entered into the user and pid
// namespaces. We also have to enter into the mount namespace last, since
// namespaces may be bound to /proc paths. The /proc paths will need to be
// accessed before pivot_root.
namespaces
.apply_namespaces(|ns_type| -> bool {
ns_type != CloneFlags::CLONE_NEWUSER
&& ns_type != CloneFlags::CLONE_NEWPID
&& ns_type != CloneFlags::CLONE_NEWNS
})
.with_context(|| "Failed to apply namespaces")?;
if let Some(mount_namespace) = namespaces.get(LinuxNamespaceType::Mount) {
namespaces
.clone_flags
.contains(sched::CloneFlags::CLONE_NEWUSER)
} else {
false
};
.unshare_or_setns(mount_namespace)
.with_context(|| format!("Failed to enter mount namespace: {:?}", mount_namespace))?;
}
if let Some(hostname) = spec.hostname.as_ref() {
command.set_hostname(hostname)?;
@ -193,6 +295,8 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
if let Some(hooks) = hooks {
hooks::run_hooks(hooks.create_container.as_ref(), container)?
}
let bind_service = namespaces.get(LinuxNamespaceType::User).is_some();
rootfs::prepare_rootfs(spec, rootfs, bind_service)
.with_context(|| "Failed to prepare rootfs")?;
@ -202,7 +306,8 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
.with_context(|| format!("Failed to pivot root to {:?}", rootfs))?;
if let Some(kernel_params) = &linux.sysctl {
sysctl(kernel_params)?;
sysctl(kernel_params)
.with_context(|| format!("Failed to sysctl: {:?}", kernel_params))?;
}
}
@ -273,9 +378,7 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
};
// clean up and handle preserved fds.
if args.init {
cleanup_file_descriptors(preserve_fds).with_context(|| "Failed to clean up extra fds")?;
}
cleanup_file_descriptors(preserve_fds).with_context(|| "Failed to clean up extra fds")?;
// change directory to process.cwd if process.cwd is not empty
if do_chdir {
@ -289,7 +392,10 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
.for_each(|(key, value)| env::set_var(key, value));
// notify parents that the init process is ready to execute the payload.
child.notify_parent()?;
// Note, we pass -1 here because we are already inside the pid namespace.
// The pid outside the pid namespace should be recorded by the intermediate
// process.
init_to_intermediate.send_child_ready(Pid::from_raw(-1))?;
// listening on the notify socket for container start command
notify_socket.wait_for_container_start()?;
@ -313,59 +419,6 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
unreachable!();
}
/// Writes every entry of `kernel_params` to its file under `/proc/sys`.
///
/// A parameter name is turned into a path by replacing each `.` with `/` and
/// joining the result onto `/proc/sys`; the value is written as raw bytes.
/// Stops at the first failed write and returns that error with the offending
/// `name=value` pair attached as context.
fn sysctl(kernel_params: &HashMap<String, String>) -> Result<()> {
    let proc_sys = PathBuf::from("/proc/sys");
    kernel_params.iter().try_for_each(|(name, setting)| {
        let target = proc_sys.join(name.replace(".", "/"));
        log::debug!(
            "apply value {} to kernel parameter {}.",
            setting,
            name
        );
        fs::write(target, setting.as_bytes())
            .with_context(|| format!("failed to set sysctl {}={}", name, setting))
    })
}
// Makes `path` read-only by bind mounting it onto itself.
// Flags other than the bind flags are ignored on the first bind mount, so the
// path is bind mounted once and then remounted with the restrictive flags.
// https://man7.org/linux/man-pages/man2/mount.2.html
fn readonly_path(path: &str) -> Result<()> {
    let first_pass = nix_mount::<str, str, str, str>(
        Some(path),
        path,
        None::<&str>,
        MsFlags::MS_BIND | MsFlags::MS_REC,
        None::<&str>,
    );
    if let Err(err) = first_pass {
        // a path that does not exist is tolerated: log it and report success
        if matches!(err, nix::errno::Errno::ENOENT) {
            log::warn!("readonly path {:?} not exist", path);
            return Ok(());
        }
        bail!(err);
    }

    // second pass: remount the bind mount with the read-only flag set
    let readonly_flags = MsFlags::MS_NOSUID
        | MsFlags::MS_NODEV
        | MsFlags::MS_NOEXEC
        | MsFlags::MS_BIND
        | MsFlags::MS_REMOUNT
        | MsFlags::MS_RDONLY;
    nix_mount::<str, str, str, str>(
        Some(path),
        path,
        None::<&str>,
        readonly_flags,
        None::<&str>,
    )?;

    log::debug!("readonly path {:?} mounted", path);
    Ok(())
}
#[cfg(test)]
mod tests {
use super::*;

View File

@ -1,24 +1,7 @@
//! Provides a thin wrapper around fork syscall,
//! with enums and functions specific to youki implemented
use std::time::Duration;
pub mod child;
pub mod channel;
pub mod fork;
pub mod init;
pub mod message;
pub mod parent;
/// Used to describe type of process after fork.
/// Parent and child processes mean the same thing as in a normal fork call;
/// the variants wrap `parent::ParentProcess` and `child::ChildProcess` respectively.
/// NOTE(review): this comment used to describe an `InitProcess` variant, but the
/// enum declares only `Parent` and `Child` — confirm nothing still relies on it.
pub enum Process<'a> {
Parent(parent::ParentProcess<'a>),
Child(child::ChildProcess),
}
/// Maximum event capacity of polling, i.e. the capacity given to the `Events` buffer
const MAX_EVENTS: usize = 128;
/// Time to wait (5 seconds) in a single poll for a message from the child process
const WAIT_FOR_CHILD: Duration = Duration::from_secs(5);
/// Time to wait (3 seconds) when polling for the mapping ack from the parent
const WAIT_FOR_MAPPING: Duration = Duration::from_secs(3);

View File

@ -1,246 +0,0 @@
use std::io::ErrorKind;
use std::io::Read;
use std::io::Write;
use std::path::Path;
use std::process::Command;
use super::{MAX_EVENTS, WAIT_FOR_CHILD};
use crate::process::message::Message;
use crate::process::WAIT_FOR_MAPPING;
use crate::rootless::Rootless;
use crate::utils;
use anyhow::Context;
use anyhow::{bail, Result};
use mio::unix::pipe;
use mio::unix::pipe::{Receiver, Sender};
use mio::{Events, Interest, Poll, Token};
use nix::unistd::Pid;
use oci_spec::LinuxIdMapping;
// Token is used to identify which source generated an event. Both channels in
// this file register the receiving end of their pipe with this single token,
// asking for readable events only.
const PARENT: Token = Token(0);
/// Parent-side handle that owns the channel used to talk to the child process.
pub struct ParentProcess<'a> {
    child_channel: ChildChannel<'a>,
}

// Both channels built here register the receiving end of their pipe with a
// `Poll`, which is later used to wait for messages from the other side.
impl<'a> ParentProcess<'a> {
    /// Builds the parent structure and returns it together with the
    /// `ParentChannel` that forms the other half of the pipe pair.
    pub fn new(rootless: &'a Option<Rootless>) -> Result<(Self, ParentChannel)> {
        Self::setup_pipes(rootless)
            .map(|(parent_channel, child_channel)| (Self { child_channel }, parent_channel))
    }

    /// Creates one pipe per direction and wraps the four ends into the two channels.
    fn setup_pipes(rootless: &'a Option<Rootless>) -> Result<(ParentChannel, ChildChannel<'a>)> {
        let (to_parent_tx, from_child_rx) = pipe::new()?;
        let (to_child_tx, from_parent_rx) = pipe::new()?;

        let channel_to_parent = ParentChannel::new(to_parent_tx, from_parent_rx)?;
        let channel_to_child = ChildChannel::new(to_child_tx, from_child_rx, rootless)?;

        Ok((channel_to_parent, channel_to_child))
    }

    /// Waits for the associated child process to send its ready message.
    /// `child_pid` is forwarded to the channel, which uses it when the child
    /// asks for its id mappings to be written.
    pub fn wait_for_child_ready(&mut self, child_pid: Pid) -> Result<()> {
        self.child_channel.wait_for_child_ready(child_pid)
    }
}
/// Channel for communicating with the parent.
///
/// Holds the sending end of one pipe, the receiving end of the other, and the
/// `Poll` on which the receiver is registered for readable events.
pub struct ParentChannel {
    sender: Sender,
    receiver: Receiver,
    poll: Poll,
}

impl ParentChannel {
    /// Wraps the given pipe ends and registers `receiver` with a fresh `Poll`.
    fn new(sender: Sender, mut receiver: Receiver) -> Result<Self> {
        let poll = Poll::new()?;
        poll.registry()
            .register(&mut receiver, PARENT, Interest::READABLE)?;
        Ok(Self {
            sender,
            receiver,
            poll,
        })
    }

    /// Tells the parent that the child is ready.
    pub fn send_child_ready(&mut self) -> Result<()> {
        // write ChildReady message to the pipe to parent
        log::debug!("[child to parent] sending child ready");
        self.write_message(Message::ChildReady)?;
        Ok(())
    }

    /// Requests the parent to write the id mappings for the child process.
    /// This needs to be done from the parent, see
    /// https://man7.org/linux/man-pages/man7/user_namespaces.7.html
    pub fn request_identifier_mapping(&mut self) -> Result<()> {
        log::debug!("[child to parent] request identifier mapping");
        self.write_message(Message::WriteMapping)?;
        Ok(())
    }

    /// Waits until the parent process has finished writing the id mappings.
    ///
    /// Polls once for at most `WAIT_FOR_MAPPING`. Returns `Ok(())` when
    /// `MappingWritten` is received, and an error when a different message
    /// arrives, when reading fails, or when the poll returns without a
    /// matching event (i.e. the wait timed out).
    pub fn wait_for_mapping_ack(&mut self) -> Result<()> {
        let mut events = Events::with_capacity(MAX_EVENTS);
        log::debug!("waiting for ack from parent");

        self.poll.poll(&mut events, Some(WAIT_FOR_MAPPING))?;
        for event in events.iter() {
            if event.token() == PARENT {
                let mut buf = [0; 1];
                match self.receiver.read_exact(&mut buf) {
                    // NOTE(review): on WouldBlock `buf` is still zeroed and is
                    // decoded below as if it had been received — confirm intended.
                    Err(ref e) if e.kind() == ErrorKind::WouldBlock => (),
                    // this end of the channel reads from the parent, not the child
                    Err(e) => bail!(
                        "Failed to receive a message from the parent process. {:?}",
                        e
                    ),
                    _ => (),
                }
                match Message::from(u8::from_be_bytes(buf)) {
                    Message::MappingWritten => return Ok(()),
                    msg => bail!("receive unexpected message {:?} in child process", msg),
                }
            }
        }

        // The poll has a timeout, so getting here without an event is an
        // ordinary runtime condition; report it as an error instead of panicking.
        bail!("timed out waiting for mapping ack from parent")
    }

    /// Sends a single message to the parent as one byte.
    #[inline]
    fn write_message(&mut self, msg: Message) -> Result<()> {
        self.sender.write_all(&(msg as u8).to_be_bytes())?;
        Ok(())
    }
}
/// Channel used by the parent to communicate with the child.
///
/// Besides the two pipe ends and the `Poll` the receiver is registered on, it
/// keeps a reference to the rootless configuration from which the id mappings
/// and the mapping helper binaries are read.
struct ChildChannel<'a> {
    sender: Sender,
    receiver: Receiver,
    poll: Poll,
    rootless: &'a Option<Rootless<'a>>,
}

impl<'a> ChildChannel<'a> {
    /// Wraps the given pipe ends and registers `receiver` with a fresh `Poll`.
    fn new(sender: Sender, mut receiver: Receiver, rootless: &'a Option<Rootless>) -> Result<Self> {
        let poll = Poll::new()?;
        poll.registry()
            .register(&mut receiver, PARENT, Interest::READABLE)?;
        Ok(Self {
            sender,
            receiver,
            poll,
            rootless,
        })
    }

    /// Waits for the associated child process to send its ready message.
    ///
    /// While waiting, a `WriteMapping` request is served by writing "deny" to
    /// the `setgroups` file of `child_pid`, writing its uid and gid mappings
    /// and answering with `MappingWritten`. Any other message is an error.
    pub fn wait_for_child_ready(&mut self, child_pid: Pid) -> Result<()> {
        // Create collection with capacity to store up to MAX_EVENTS events
        let mut events = Events::with_capacity(MAX_EVENTS);
        loop {
            // poll the receiving end of pipe created for WAIT_FOR_CHILD duration for an event
            self.poll.poll(&mut events, Some(WAIT_FOR_CHILD))?;
            for event in events.iter() {
                // check if the event token is PARENT
                // note that this does not assign anything to PARENT, but instead compares PARENT and event.token()
                // check http://patshaughnessy.net/2018/1/18/learning-rust-if-let-vs--match for a bit more detailed explanation
                if let PARENT = event.token() {
                    // read data from pipe
                    let mut buf = [0; 1];
                    match self.receiver.read_exact(&mut buf) {
                        // Nothing to read right now: stop handling this batch
                        // of events and go back to polling.
                        Err(ref e) if e.kind() == ErrorKind::WouldBlock => {
                            break;
                        }
                        Err(e) => bail!(
                            "Failed to receive a message from the child process. {:?}",
                            e
                        ),
                        _ => (),
                    };
                    // convert to Message wrapper
                    match Message::from(u8::from_be_bytes(buf)) {
                        Message::ChildReady => {
                            log::debug!("received child ready message");
                            return Ok(());
                        }
                        Message::WriteMapping => {
                            log::debug!("write mapping for pid {:?}", child_pid);
                            utils::write_file(format!("/proc/{}/setgroups", child_pid), "deny")?;
                            self.write_uid_mapping(child_pid)?;
                            self.write_gid_mapping(child_pid)?;
                            self.notify_mapping_written()?;
                        }
                        msg => bail!("receive unexpected message {:?} in parent process", msg),
                    }
                } else {
                    // as the poll is registered with only parent token
                    unreachable!()
                }
            }
        }
    }

    /// Tells the child that its id mappings have been written.
    fn notify_mapping_written(&mut self) -> Result<()> {
        self.sender
            .write_all(&(Message::MappingWritten as u8).to_be_bytes())?;
        Ok(())
    }

    /// Writes the configured uid mappings to `/proc/<target_pid>/uid_map`.
    /// Does nothing when there is no rootless configuration or no uid mappings.
    fn write_uid_mapping(&self, target_pid: Pid) -> Result<()> {
        if let Some(rootless) = self.rootless.as_ref() {
            // must read the *uid* mappings here; the gid mappings belong to gid_map
            if let Some(uid_mappings) = rootless.uid_mappings {
                return write_id_mapping(
                    &format!("/proc/{}/uid_map", target_pid),
                    uid_mappings,
                    rootless.newuidmap.as_deref(),
                );
            }
        }
        Ok(())
    }

    /// Writes the configured gid mappings to `/proc/<target_pid>/gid_map`.
    /// Does nothing when there is no rootless configuration or no gid mappings.
    fn write_gid_mapping(&self, target_pid: Pid) -> Result<()> {
        if let Some(rootless) = self.rootless.as_ref() {
            if let Some(gid_mappings) = rootless.gid_mappings {
                return write_id_mapping(
                    &format!("/proc/{}/gid_map", target_pid),
                    gid_mappings,
                    rootless.newgidmap.as_deref(),
                );
            }
        }
        Ok(())
    }
}
fn write_id_mapping(
map_file: &str,
mappings: &[LinuxIdMapping],
map_binary: Option<&Path>,
) -> Result<()> {
let mappings: Vec<String> = mappings
.iter()
.map(|m| format!("{} {} {}", m.container_id, m.host_id, m.size))
.collect();
if mappings.len() == 1 {
utils::write_file(map_file, mappings.first().unwrap())?;
} else {
Command::new(map_binary.unwrap())
.args(mappings)
.output()
.with_context(|| format!("failed to execute {:?}", map_binary))?;
}
Ok(())
}

View File

@ -1,8 +1,7 @@
use std::{env, path::PathBuf};
use anyhow::{bail, Context, Result};
use nix::sched::CloneFlags;
use oci_spec::{Linux, LinuxIdMapping, Mount, Spec};
use oci_spec::{Linux, LinuxIdMapping, LinuxNamespaceType, Mount, Spec};
use crate::namespaces::Namespaces;
@ -84,14 +83,8 @@ pub fn validate(spec: &Spec) -> Result<()> {
bail!("rootless containers require at least one gid mapping")
}
let namespaces = Namespaces::from(
linux
.namespaces
.as_ref()
.context("rootless containers require the namespaces.")?,
);
if !namespaces.clone_flags.contains(CloneFlags::CLONE_NEWUSER) {
let namespaces = Namespaces::from(linux.namespaces.as_ref());
if namespaces.get(LinuxNamespaceType::User).is_none() {
bail!("rootless containers require the specification of a user namespace");
}