mirror of
https://github.com/containers/youki
synced 2024-06-10 00:36:16 +02:00
Implement double fork
This commit is contained in:
parent
414fa3a448
commit
0f9490c68b
|
@ -60,7 +60,7 @@ jobs:
|
|||
- name: Build
|
||||
run: ./build.sh --release
|
||||
- name: Run tests
|
||||
run: cargo test
|
||||
run: cargo test -- --nocapture
|
||||
- name: Run doc tests
|
||||
run: cargo test --doc
|
||||
- name: Run cgroup tests
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
use anyhow::{Context, Result};
|
||||
use nix::sched::CloneFlags;
|
||||
|
||||
use cgroups;
|
||||
|
||||
|
@ -8,8 +7,7 @@ use std::{fs, os::unix::prelude::RawFd, path::PathBuf};
|
|||
|
||||
use crate::{
|
||||
hooks,
|
||||
namespaces::Namespaces,
|
||||
process::{child, fork, init, parent},
|
||||
process::{channel, fork, init},
|
||||
rootless::Rootless,
|
||||
syscall::linux::LinuxSyscall,
|
||||
utils,
|
||||
|
@ -59,16 +57,16 @@ impl<'a> ContainerBuilderImpl<'a> {
|
|||
let cgroups_path = utils::get_cgroup_path(&linux.cgroups_path, &self.container_id);
|
||||
let cmanager = cgroups::common::create_cgroup_manager(&cgroups_path, self.use_systemd)?;
|
||||
|
||||
// create the parent and child process structure so the parent and child process can sync with each other
|
||||
let (mut parent, parent_channel) = parent::ParentProcess::new(&self.rootless)?;
|
||||
let child = child::ChildProcess::new(parent_channel)?;
|
||||
|
||||
if self.init {
|
||||
if let Some(hooks) = self.spec.hooks.as_ref() {
|
||||
hooks::run_hooks(hooks.create_runtime.as_ref(), self.container.as_ref())?
|
||||
}
|
||||
}
|
||||
|
||||
// We use a set of channels to communicate between parent and child process. Each channel is uni-directional.
|
||||
let parent_to_child = &mut channel::Channel::new()?;
|
||||
let child_to_parent = &mut channel::Channel::new()?;
|
||||
|
||||
// This init_args will be passed to the container init process,
|
||||
// therefore we will have to move all the variable by value. Since self
|
||||
// is a shared reference, we have to clone these variables here.
|
||||
|
@ -82,30 +80,24 @@ impl<'a> ContainerBuilderImpl<'a> {
|
|||
notify_path: self.notify_path.clone(),
|
||||
preserve_fds: self.preserve_fds,
|
||||
container: self.container.clone(),
|
||||
child,
|
||||
};
|
||||
let intermediate_pid = fork::container_fork(|| {
|
||||
init::container_intermidiate(init_args, parent_to_child, child_to_parent)
|
||||
})?;
|
||||
// If creating a rootless container, the intermediate process will ask
|
||||
// the main process to set up uid and gid mapping, once the intermediate
|
||||
// process enters into a new user namespace.
|
||||
if self.rootless.is_some() {
|
||||
child_to_parent.wait_for_mapping_request(
|
||||
intermediate_pid,
|
||||
self.rootless.as_ref(),
|
||||
parent_to_child,
|
||||
)?;
|
||||
}
|
||||
|
||||
// We have to box up this closure to correctly pass to the init function
|
||||
// of the new process.
|
||||
let cb = Box::new(move || {
|
||||
if let Err(error) = init::container_init(init_args) {
|
||||
log::debug!("failed to run container_init: {:?}", error);
|
||||
return -1;
|
||||
}
|
||||
|
||||
0
|
||||
});
|
||||
|
||||
let clone_flags = linux
|
||||
.namespaces
|
||||
.as_ref()
|
||||
.map(|ns| Namespaces::from(ns).clone_flags)
|
||||
.unwrap_or_else(CloneFlags::empty);
|
||||
let init_pid = fork::clone(cb, clone_flags)?;
|
||||
let init_pid = child_to_parent.wait_for_child_ready()?;
|
||||
log::debug!("init pid is {:?}", init_pid);
|
||||
|
||||
parent.wait_for_child_ready(init_pid)?;
|
||||
|
||||
cmanager.add_task(init_pid)?;
|
||||
if self.rootless.is_none() && linux.resources.is_some() && self.init {
|
||||
cmanager.apply(linux.resources.as_ref().unwrap())?;
|
||||
|
|
|
@ -8,86 +8,85 @@
|
|||
//! Cgroup (Resource limits, execution priority etc.)
|
||||
|
||||
use crate::syscall::{syscall::create_syscall, Syscall};
|
||||
use anyhow::Result;
|
||||
use nix::{
|
||||
fcntl,
|
||||
sched::{self, CloneFlags},
|
||||
sys::stat,
|
||||
unistd::{self, Gid, Uid},
|
||||
};
|
||||
use oci_spec::LinuxNamespace;
|
||||
use anyhow::{Context, Result};
|
||||
use nix::{fcntl, sched::CloneFlags, sys::stat, unistd};
|
||||
use oci_spec::{LinuxNamespace, LinuxNamespaceType};
|
||||
use std::collections;
|
||||
|
||||
/// Holds information about namespaces
|
||||
pub struct Namespaces<'a> {
|
||||
spaces: &'a Vec<LinuxNamespace>,
|
||||
pub struct Namespaces {
|
||||
command: Box<dyn Syscall>,
|
||||
pub clone_flags: CloneFlags,
|
||||
namespace_map: collections::HashMap<CloneFlags, LinuxNamespace>,
|
||||
}
|
||||
|
||||
impl<'a> From<&'a Vec<LinuxNamespace>> for Namespaces<'a> {
|
||||
fn from(namespaces: &'a Vec<LinuxNamespace>) -> Self {
|
||||
let clone_flags = namespaces.iter().filter(|ns| ns.path.is_none()).fold(
|
||||
CloneFlags::empty(),
|
||||
|mut cf, ns| {
|
||||
cf |= CloneFlags::from_bits_truncate(ns.typ as i32);
|
||||
cf
|
||||
},
|
||||
);
|
||||
let command: Box<dyn Syscall> = create_syscall();
|
||||
|
||||
Namespaces {
|
||||
spaces: namespaces,
|
||||
command,
|
||||
clone_flags,
|
||||
}
|
||||
fn get_clone_flag(namespace_type: LinuxNamespaceType) -> CloneFlags {
|
||||
match namespace_type {
|
||||
LinuxNamespaceType::Pid => CloneFlags::CLONE_NEWPID,
|
||||
LinuxNamespaceType::User => CloneFlags::CLONE_NEWUSER,
|
||||
LinuxNamespaceType::Uts => CloneFlags::CLONE_NEWUTS,
|
||||
LinuxNamespaceType::Cgroup => CloneFlags::CLONE_NEWCGROUP,
|
||||
LinuxNamespaceType::Ipc => CloneFlags::CLONE_NEWIPC,
|
||||
LinuxNamespaceType::Network => CloneFlags::CLONE_NEWNET,
|
||||
LinuxNamespaceType::Mount => CloneFlags::CLONE_NEWNS,
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Namespaces<'a> {
|
||||
pub fn apply_setns(&self) -> Result<()> {
|
||||
let to_enter: Vec<(CloneFlags, i32)> = self
|
||||
.spaces
|
||||
impl From<Option<&Vec<LinuxNamespace>>> for Namespaces {
|
||||
fn from(namespaces: Option<&Vec<LinuxNamespace>>) -> Self {
|
||||
let command: Box<dyn Syscall> = create_syscall();
|
||||
let namespace_map: collections::HashMap<CloneFlags, LinuxNamespace> = namespaces
|
||||
.unwrap_or(&vec![])
|
||||
.iter()
|
||||
.filter(|ns| ns.path.is_some()) // filter those which are actually present on the system
|
||||
.map(|ns| {
|
||||
let space = CloneFlags::from_bits_truncate(ns.typ as i32);
|
||||
let fd = fcntl::open(
|
||||
&*ns.path.as_ref().unwrap(),
|
||||
fcntl::OFlag::empty(),
|
||||
stat::Mode::empty(),
|
||||
)
|
||||
.unwrap();
|
||||
(space, fd)
|
||||
})
|
||||
.map(|ns| (get_clone_flag(ns.typ), ns.clone()))
|
||||
.collect();
|
||||
|
||||
for &(space, fd) in &to_enter {
|
||||
// set the namespace
|
||||
self.command.set_ns(fd, space)?;
|
||||
unistd::close(fd)?;
|
||||
// if namespace is cloned with newuser flag, then it creates a new user namespace,
|
||||
// and we need to set the user and group id to 0
|
||||
// see https://man7.org/linux/man-pages/man2/clone.2.html for more info
|
||||
if space == sched::CloneFlags::CLONE_NEWUSER {
|
||||
self.command.set_id(Uid::from_raw(0), Gid::from_raw(0))?;
|
||||
}
|
||||
Namespaces {
|
||||
command,
|
||||
namespace_map,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Namespaces {
|
||||
/// Enters every stored namespace whose clone flag is accepted by `filter`.
///
/// Each selected entry is handed to `unshare_or_setns`. The first failure
/// stops the walk and is returned with the namespace type and entry attached
/// as context. Entries are visited in the iteration order of the underlying
/// `HashMap`, which is unspecified.
pub fn apply_namespaces<F: Fn(CloneFlags) -> bool>(&self, filter: F) -> Result<()> {
    // Iterate the filtered view directly: collecting the selection into a
    // second HashMap only to loop over it once is a wasted allocation.
    let selected = self.namespace_map.iter().filter(|(flag, _)| filter(**flag));
    for (ns_type, ns) in selected {
        self.unshare_or_setns(ns)
            .with_context(|| format!("Failed to enter {:?} namespace: {:?}", ns_type, ns))?;
    }

    Ok(())
}
|
||||
|
||||
/// disassociate given parts context of calling process from other process
|
||||
// see https://man7.org/linux/man-pages/man2/unshare.2.html for more info
|
||||
pub fn apply_unshare(&self, without: CloneFlags) -> Result<()> {
|
||||
self.command.unshare(self.clone_flags & !without)?;
|
||||
pub fn unshare_or_setns(&self, namespace: &LinuxNamespace) -> Result<()> {
|
||||
if namespace.path.is_none() {
|
||||
self.command.unshare(get_clone_flag(namespace.typ))?;
|
||||
} else {
|
||||
let ns_path = namespace.path.as_ref().unwrap();
|
||||
let fd = fcntl::open(ns_path, fcntl::OFlag::empty(), stat::Mode::empty())
|
||||
.with_context(|| format!("Failed to open namespace fd: {:?}", ns_path))?;
|
||||
self.command
|
||||
.set_ns(fd, get_clone_flag(namespace.typ))
|
||||
.with_context(|| "Failed to set namespace")?;
|
||||
unistd::close(fd).with_context(|| "Failed to close namespace fd")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get(&self, k: LinuxNamespaceType) -> Option<&LinuxNamespace> {
|
||||
self.namespace_map.get(&get_clone_flag(k))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use oci_spec::LinuxNamespaceType;
|
||||
|
||||
use super::*;
|
||||
use crate::syscall::test::TestHelperSyscall;
|
||||
use oci_spec::LinuxNamespaceType;
|
||||
|
||||
fn gen_sample_linux_namespaces() -> Vec<LinuxNamespace> {
|
||||
vec![
|
||||
|
@ -115,11 +114,13 @@ mod tests {
|
|||
}
|
||||
|
||||
#[test]
|
||||
fn test_namespaces_set_ns() {
|
||||
fn test_apply_namespaces() {
|
||||
let sample_linux_namespaces = gen_sample_linux_namespaces();
|
||||
let namespaces = Namespaces::from(&sample_linux_namespaces);
|
||||
let namespaces = Namespaces::from(Some(&sample_linux_namespaces));
|
||||
let test_command: &TestHelperSyscall = namespaces.command.as_any().downcast_ref().unwrap();
|
||||
assert!(namespaces.apply_setns().is_ok());
|
||||
assert!(namespaces
|
||||
.apply_namespaces(|ns_type| { ns_type != CloneFlags::CLONE_NEWIPC })
|
||||
.is_ok());
|
||||
|
||||
let mut setns_args: Vec<_> = test_command
|
||||
.get_setns_args()
|
||||
|
@ -130,18 +131,10 @@ mod tests {
|
|||
let mut expect = vec![CloneFlags::CLONE_NEWNS, CloneFlags::CLONE_NEWNET];
|
||||
expect.sort();
|
||||
assert_eq!(setns_args, expect);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_namespaces_unshare() {
|
||||
let sample_linux_namespaces = gen_sample_linux_namespaces();
|
||||
let namespaces = Namespaces::from(&sample_linux_namespaces);
|
||||
assert!(namespaces.apply_unshare(CloneFlags::CLONE_NEWIPC).is_ok());
|
||||
|
||||
let test_command: &TestHelperSyscall = namespaces.command.as_any().downcast_ref().unwrap();
|
||||
let mut unshare_args = test_command.get_unshare_args();
|
||||
unshare_args.sort();
|
||||
let mut expect = vec![CloneFlags::CLONE_NEWUSER | CloneFlags::CLONE_NEWPID];
|
||||
let mut expect = vec![CloneFlags::CLONE_NEWUSER, CloneFlags::CLONE_NEWPID];
|
||||
expect.sort();
|
||||
assert_eq!(unshare_args, expect)
|
||||
}
|
||||
|
|
|
@ -0,0 +1,295 @@
|
|||
use crate::process::message::Message;
|
||||
use crate::rootless::Rootless;
|
||||
use crate::utils;
|
||||
use anyhow::bail;
|
||||
use anyhow::Context;
|
||||
use anyhow::Result;
|
||||
use mio::unix::pipe;
|
||||
use mio::unix::pipe::{Receiver, Sender};
|
||||
use mio::{Events, Interest, Poll, Token};
|
||||
use nix::unistd::Pid;
|
||||
use std::io::ErrorKind;
|
||||
use std::io::Read;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use std::process::Command;
|
||||
use std::time::Duration;
|
||||
|
||||
/// Maximum event capacity of polling
|
||||
const MAX_EVENTS: usize = 128;
|
||||
/// Time to wait when polling for message from child process
|
||||
const WAIT_FOR_CHILD: Duration = Duration::from_secs(5);
|
||||
/// Time to wait when polling for mapping ack from parent
|
||||
const WAIT_FOR_MAPPING: Duration = Duration::from_secs(3);
|
||||
// Token is used to identify which socket generated an event
|
||||
const PARENT: Token = Token(0);
|
||||
|
||||
pub struct Channel {
|
||||
sender: Sender,
|
||||
receiver: Receiver,
|
||||
poll: Poll,
|
||||
}
|
||||
|
||||
impl Channel {
|
||||
pub fn new() -> Result<Self> {
|
||||
let poll = Poll::new()?;
|
||||
let (sender, mut receiver) = pipe::new()?;
|
||||
poll.registry()
|
||||
.register(&mut receiver, PARENT, Interest::READABLE)?;
|
||||
|
||||
Ok(Self {
|
||||
sender,
|
||||
receiver,
|
||||
poll,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn send_child_ready(&mut self, pid: Pid) -> Result<()> {
|
||||
// Send over the ChildReady follow by the pid.
|
||||
log::debug!("sending init pid ({:?})", pid);
|
||||
self.write_message(Message::ChildReady)?;
|
||||
self.sender.write_all(&(pid.as_raw()).to_be_bytes())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// requests the parent to write the id mappings for the child process
|
||||
// this needs to be done from the parent see https://man7.org/linux/man-pages/man7/user_namespaces.7.html
|
||||
pub fn send_identifier_mapping_request(&mut self) -> Result<()> {
|
||||
log::debug!("send identifier mapping request");
|
||||
self.write_message(Message::WriteMapping)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn send_mapping_written(&mut self) -> Result<()> {
|
||||
log::debug!("identifier mapping written");
|
||||
self.sender
|
||||
.write_all(&(Message::MappingWritten as u8).to_be_bytes())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// wait until the parent process has finished writing the id mappings
|
||||
pub fn wait_for_mapping_ack(&mut self) -> Result<()> {
|
||||
let mut events = Events::with_capacity(MAX_EVENTS);
|
||||
log::debug!("waiting for mapping ack");
|
||||
|
||||
self.poll.poll(&mut events, Some(WAIT_FOR_MAPPING))?;
|
||||
for event in events.iter() {
|
||||
if event.token() == PARENT {
|
||||
let mut buf = [0; 1];
|
||||
if let Err(e) = self.receiver.read_exact(&mut buf) {
|
||||
if e.kind() != ErrorKind::WouldBlock {
|
||||
bail!(
|
||||
"Failed to receive a message from the child process. {:?}",
|
||||
e
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
match Message::from(u8::from_be_bytes(buf)) {
|
||||
Message::MappingWritten => return Ok(()),
|
||||
msg => bail!(
|
||||
"receive unexpected message {:?} in waiting for mapping ack",
|
||||
msg
|
||||
),
|
||||
}
|
||||
} else {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
|
||||
unreachable!("timed out waiting for mapping ack")
|
||||
}
|
||||
|
||||
pub fn wait_for_mapping_request(
|
||||
&mut self,
|
||||
child_pid: Pid,
|
||||
rootless: Option<&Rootless>,
|
||||
callback: &mut Channel,
|
||||
) -> Result<()> {
|
||||
// Create collection with capacity to store up to MAX_EVENTS events
|
||||
let mut events = Events::with_capacity(MAX_EVENTS);
|
||||
loop {
|
||||
// poll the receiving end of pipe created for WAIT_FOR_CHILD duration for an event
|
||||
self.poll.poll(&mut events, Some(WAIT_FOR_CHILD))?;
|
||||
for event in events.iter() {
|
||||
if event.token() == PARENT {
|
||||
// read data from pipe
|
||||
let mut buf = [0; 1];
|
||||
if let Err(e) = self.receiver.read_exact(&mut buf) {
|
||||
if e.kind() != ErrorKind::WouldBlock {
|
||||
bail!(
|
||||
"Failed to receive a message from the child process. {:?}",
|
||||
e
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// convert to Message wrapper
|
||||
match Message::from(u8::from_be_bytes(buf)) {
|
||||
Message::WriteMapping => {
|
||||
log::debug!("write mapping for pid {:?}", child_pid);
|
||||
utils::write_file(format!("/proc/{}/setgroups", child_pid), "deny")?;
|
||||
write_uid_mapping(child_pid, rootless)?;
|
||||
write_gid_mapping(child_pid, rootless)?;
|
||||
callback.send_mapping_written()?;
|
||||
return Ok(());
|
||||
}
|
||||
msg => bail!(
|
||||
"receive unexpected message {:?} waiting for mapping request",
|
||||
msg
|
||||
),
|
||||
}
|
||||
} else {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Waits for associated child process to send ready message
|
||||
/// and return the pid of init process which is forked by child process
|
||||
pub fn wait_for_child_ready(&mut self) -> Result<Pid> {
|
||||
// Create collection with capacity to store up to MAX_EVENTS events
|
||||
let mut events = Events::with_capacity(MAX_EVENTS);
|
||||
loop {
|
||||
// poll the receiving end of pipe created for WAIT_FOR_CHILD duration for an event
|
||||
self.poll.poll(&mut events, Some(WAIT_FOR_CHILD))?;
|
||||
for event in events.iter() {
|
||||
if event.token() == PARENT {
|
||||
// read data from pipe
|
||||
let mut buf = [0; 1];
|
||||
if let Err(e) = self.receiver.read_exact(&mut buf) {
|
||||
if e.kind() != ErrorKind::WouldBlock {
|
||||
bail!(
|
||||
"Failed to receive a message from the child process. {:?}",
|
||||
e
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// convert to Message wrapper
|
||||
match Message::from(u8::from_be_bytes(buf)) {
|
||||
Message::ChildReady => {
|
||||
log::debug!("received child ready message");
|
||||
let mut buf = [0; 4];
|
||||
if let Err(e) = self.receiver.read_exact(&mut buf) {
|
||||
if e.kind() != ErrorKind::WouldBlock {
|
||||
bail!(
|
||||
"Failed to receive a message from the child process. {:?}",
|
||||
e
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
return Ok(Pid::from_raw(i32::from_be_bytes(buf)));
|
||||
}
|
||||
msg => bail!(
|
||||
"receive unexpected message {:?} waiting for child ready",
|
||||
msg
|
||||
),
|
||||
}
|
||||
} else {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn write_message(&mut self, msg: Message) -> Result<()> {
|
||||
self.sender.write_all(&(msg as u8).to_be_bytes())?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn write_uid_mapping(target_pid: Pid, rootless: Option<&Rootless>) -> Result<()> {
|
||||
if let Some(rootless) = rootless {
|
||||
if let Some(uid_mappings) = rootless.gid_mappings {
|
||||
return write_id_mapping(
|
||||
&format!("/proc/{}/uid_map", target_pid),
|
||||
uid_mappings,
|
||||
rootless.newuidmap.as_deref(),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_gid_mapping(target_pid: Pid, rootless: Option<&Rootless>) -> Result<()> {
|
||||
if let Some(rootless) = rootless {
|
||||
if let Some(gid_mappings) = rootless.gid_mappings {
|
||||
return write_id_mapping(
|
||||
&format!("/proc/{}/gid_map", target_pid),
|
||||
gid_mappings,
|
||||
rootless.newgidmap.as_deref(),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_id_mapping(
|
||||
map_file: &str,
|
||||
mappings: &[oci_spec::LinuxIdMapping],
|
||||
map_binary: Option<&Path>,
|
||||
) -> Result<()> {
|
||||
let mappings: Vec<String> = mappings
|
||||
.iter()
|
||||
.map(|m| format!("{} {} {}", m.container_id, m.host_id, m.size))
|
||||
.collect();
|
||||
if mappings.len() == 1 {
|
||||
utils::write_file(map_file, mappings.first().unwrap())?;
|
||||
} else {
|
||||
Command::new(map_binary.unwrap())
|
||||
.args(mappings)
|
||||
.output()
|
||||
.with_context(|| format!("failed to execute {:?}", map_binary))?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use nix::sys::wait;
|
||||
use nix::unistd;
|
||||
|
||||
#[test]
|
||||
fn test_channel_child_ready() -> Result<()> {
|
||||
let ch = &mut Channel::new()?;
|
||||
match unsafe { unistd::fork()? } {
|
||||
unistd::ForkResult::Parent { child } => {
|
||||
wait::waitpid(child, None)?;
|
||||
let pid = ch.wait_for_child_ready()?;
|
||||
assert_eq!(pid, child);
|
||||
}
|
||||
unistd::ForkResult::Child => {
|
||||
let pid = unistd::getpid();
|
||||
ch.send_child_ready(pid)?;
|
||||
std::process::exit(0);
|
||||
}
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_channel_id_mapping() -> Result<()> {
|
||||
let ch = &mut Channel::new()?;
|
||||
match unsafe { unistd::fork()? } {
|
||||
unistd::ForkResult::Parent { child } => {
|
||||
ch.wait_for_mapping_ack()?;
|
||||
wait::waitpid(child, None)?;
|
||||
}
|
||||
unistd::ForkResult::Child => {
|
||||
ch.send_mapping_written()?;
|
||||
std::process::exit(0);
|
||||
}
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
|
@ -1,67 +0,0 @@
|
|||
use anyhow::Result;
|
||||
use mio::unix::pipe;
|
||||
use mio::unix::pipe::Receiver;
|
||||
use mio::unix::pipe::Sender;
|
||||
use mio::{Interest, Poll, Token};
|
||||
|
||||
use super::parent::ParentChannel;
|
||||
|
||||
// Token is used to identify which socket generated an event
|
||||
const CHILD: Token = Token(1);
|
||||
|
||||
/// Contains sending end of pipe for parent process, receiving end of pipe
|
||||
/// for the init process and poller for that
|
||||
pub struct ChildProcess {
|
||||
parent_channel: ParentChannel,
|
||||
receiver: Option<Receiver>,
|
||||
poll: Option<Poll>,
|
||||
}
|
||||
|
||||
// Note: The original Youki process "forks" a child process using clone(2). The
|
||||
// child process will become the container init process, where it will set up
|
||||
// namespaces, device mounts, and etc. for the container process. Finally, the
|
||||
// container init process will run the actual container payload through exec
|
||||
// call. The ChildProcess will be used to synchronize between the Youki main
|
||||
// process and the child process (container init process).
|
||||
impl ChildProcess {
|
||||
/// create a new Child process structure
|
||||
pub fn new(parent_channel: ParentChannel) -> Result<Self> {
|
||||
Ok(Self {
|
||||
parent_channel,
|
||||
receiver: None,
|
||||
poll: None,
|
||||
})
|
||||
}
|
||||
|
||||
/// sets up sockets for init process
|
||||
pub fn setup_pipe(&mut self) -> Result<Sender> {
|
||||
// create a new pipe
|
||||
let (sender, mut receiver) = pipe::new()?;
|
||||
// create a new poll, and register the receiving end of pipe to it
|
||||
// This will poll for the read events, so when data is written to sending end of the pipe,
|
||||
// the receiving end will be readable and poll wil notify
|
||||
let poll = Poll::new()?;
|
||||
poll.registry()
|
||||
.register(&mut receiver, CHILD, Interest::READABLE)?;
|
||||
|
||||
self.receiver = Some(receiver);
|
||||
self.poll = Some(poll);
|
||||
Ok(sender)
|
||||
}
|
||||
|
||||
/// Indicate that child process has forked the init process to parent process
|
||||
pub fn notify_parent(&mut self) -> Result<()> {
|
||||
self.parent_channel.send_child_ready()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn request_identifier_mapping(&mut self) -> Result<()> {
|
||||
self.parent_channel.request_identifier_mapping()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn wait_for_mapping_ack(&mut self) -> Result<()> {
|
||||
self.parent_channel.wait_for_mapping_ack()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
|
@ -1,222 +1,24 @@
|
|||
use anyhow::bail;
|
||||
use anyhow::Context;
|
||||
use anyhow::Result;
|
||||
use nix::errno::Errno;
|
||||
use nix::sched;
|
||||
use nix::sys;
|
||||
use nix::sys::mman;
|
||||
use nix::unistd;
|
||||
use nix::unistd::Pid;
|
||||
use std::ptr;
|
||||
|
||||
// The clone callback is used in clone call. It is a boxed closure and it needs
|
||||
// to transfer the ownership of related memory to the new process.
|
||||
type CloneCb = Box<dyn FnOnce() -> isize + Send>;
|
||||
|
||||
const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8M
|
||||
|
||||
/// clone uses syscall clone(2) to create a new process for the container init
|
||||
/// process. Using clone syscall gives us better control over how to can create
|
||||
/// the new container process, where we can enter into namespaces directly instead
|
||||
/// of using unshare and fork. This call will only create one new process, instead
|
||||
/// of two using fork.
|
||||
pub fn clone(cb: CloneCb, clone_flags: sched::CloneFlags) -> Result<Pid> {
|
||||
// Use sysconf to find the page size. If there is an error, we assume
|
||||
// the default 4K page size.
|
||||
let page_size: usize = unsafe {
|
||||
match libc::sysconf(libc::_SC_PAGE_SIZE) {
|
||||
-1 => 4 * 1024, // default to 4K page size
|
||||
x => x as usize,
|
||||
// Execute the cb in another process. Make the fork work more like thread_spawn
|
||||
// or clone, so it is easier to reason. Compared to clone call, fork is easier
|
||||
// to use since fork will magically take care of all the variable copying. If
|
||||
// using clone, we would have to manually make sure all the variables are
|
||||
// correctly sent to the new process; in particular, the Rust borrow checker would be a
|
||||
// lot of hassle to deal with in every detail.
|
||||
pub fn container_fork<F: FnOnce() -> Result<()>>(cb: F) -> Result<Pid> {
|
||||
match unsafe { unistd::fork()? } {
|
||||
unistd::ForkResult::Parent { child } => Ok(child),
|
||||
unistd::ForkResult::Child => {
|
||||
let ret = if let Err(error) = cb() {
|
||||
log::debug!("failed to run fork: {:?}", error);
|
||||
-1
|
||||
} else {
|
||||
0
|
||||
};
|
||||
std::process::exit(ret);
|
||||
}
|
||||
};
|
||||
|
||||
// Find out the default stack max size through getrlimit.
|
||||
let mut rlimit = libc::rlimit {
|
||||
rlim_cur: 0,
|
||||
rlim_max: 0,
|
||||
};
|
||||
unsafe { Errno::result(libc::getrlimit(libc::RLIMIT_STACK, &mut rlimit))? };
|
||||
|
||||
// mmap will return ENOMEM if stack size is unlimited
|
||||
let default_stack_size = if rlimit.rlim_cur != u64::MAX {
|
||||
rlimit.rlim_cur as usize
|
||||
} else {
|
||||
log::info!("stack size returned by getrlimit() is unlimited, use DEFAULT_STACK_SIZE(8MB)");
|
||||
DEFAULT_STACK_SIZE
|
||||
};
|
||||
|
||||
// Using the clone syscall requires us to create the stack space for the
|
||||
// child process instead of taken cared for us like fork call. We use mmap
|
||||
// here to create the stack. Instead of guessing how much space the child
|
||||
// process needs, we allocate through mmap to the system default limit,
|
||||
// which is 8MB on most of the linux system today. This is OK since mmap
|
||||
// will only researve the address space upfront, instead of allocating
|
||||
// physical memory upfront. The stack will grow as needed, up to the size
|
||||
// researved, so no wasted memory here. Lastly, the child stack only needs
|
||||
// to support the container init process set up code in Youki. When Youki
|
||||
// calls exec into the container payload, exec will reset the stack. Note,
|
||||
// do not use MAP_GROWSDOWN since it is not well supported.
|
||||
// Ref: https://man7.org/linux/man-pages/man2/mmap.2.html
|
||||
let child_stack = unsafe {
|
||||
mman::mmap(
|
||||
ptr::null_mut(),
|
||||
default_stack_size,
|
||||
mman::ProtFlags::PROT_READ | mman::ProtFlags::PROT_WRITE,
|
||||
mman::MapFlags::MAP_PRIVATE | mman::MapFlags::MAP_ANONYMOUS | mman::MapFlags::MAP_STACK,
|
||||
-1,
|
||||
0,
|
||||
)?
|
||||
};
|
||||
// Consistant with how pthread_create sets up the stack, we create a
|
||||
// guard page of 1 page, to protect the child stack collision. Note, for
|
||||
// clone call, the child stack will grow downward, so the bottom of the
|
||||
// child stack is in the beginning.
|
||||
unsafe {
|
||||
mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE)
|
||||
.with_context(|| "Failed to create guard page")?
|
||||
};
|
||||
|
||||
// Since the child stack for clone grows downward, we need to pass in
|
||||
// the top of the stack address.
|
||||
let child_stack_top = unsafe { child_stack.add(default_stack_size) };
|
||||
|
||||
// Adds SIGCHLD flag to mimic the same behavior as fork.
|
||||
let signal = sys::signal::Signal::SIGCHLD;
|
||||
let combined = clone_flags.bits() | signal as libc::c_int;
|
||||
|
||||
// We are passing the boxed closure "cb" into the clone function as the a
|
||||
// function pointer in C. The box closure in Rust is both a function pointer
|
||||
// and a struct. However, when casting the box closure into libc::c_void,
|
||||
// the function pointer will be lost. Therefore, to work around the issue,
|
||||
// we double box the closure. This is consistant with how std::unix::thread
|
||||
// handles the closure.
|
||||
// Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs
|
||||
let data = Box::into_raw(Box::new(cb));
|
||||
// The main is a wrapper function passed into clone call below. The "data"
|
||||
// arg is actually a raw pointer to a Box closure. so here, we re-box the
|
||||
// pointer back into a box closure so the main takes ownership of the
|
||||
// memory. Then we can call the closure passed in.
|
||||
extern "C" fn main(data: *mut libc::c_void) -> libc::c_int {
|
||||
unsafe { Box::from_raw(data as *mut CloneCb)() as i32 }
|
||||
}
|
||||
|
||||
// The nix::sched::clone wrapper doesn't provide the right interface. Using
|
||||
// the clone syscall is one of the rare cases where we don't want rust to
|
||||
// manage the child stack memory. Instead, we want to use c_void directly
|
||||
// here. Therefore, here we are using libc::clone syscall directly for
|
||||
// better control. The child stack will be cleaned when exec is called or
|
||||
// the child process terminates. The nix wrapper also does not treat the
|
||||
// closure memory correctly. The wrapper implementation fails to pass the
|
||||
// right ownership to the new child process.
|
||||
// Ref: https://github.com/nix-rust/nix/issues/919
|
||||
// Ref: https://github.com/nix-rust/nix/pull/920
|
||||
let res = unsafe { libc::clone(main, child_stack_top, combined, data as *mut libc::c_void) };
|
||||
match res {
|
||||
-1 => {
|
||||
// Since the clone call failed, the closure passed in didn't get
|
||||
// consumed. To complete the circle, we can safely box up the
|
||||
// closure again and let rust manage this memory for us.
|
||||
unsafe { drop(Box::from_raw(data)) };
|
||||
bail!(
|
||||
"Failed clone to create new process: {:?}",
|
||||
Errno::result(res)
|
||||
)
|
||||
}
|
||||
pid => Ok(Pid::from_raw(pid)),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use anyhow::bail;
|
||||
use nix::sys::wait;
|
||||
use nix::unistd;
|
||||
|
||||
#[test]
|
||||
fn test_fork_clone() -> Result<()> {
|
||||
let cb = || -> Result<()> {
|
||||
// In a new pid namespace, pid of this process should be 1
|
||||
let pid = unistd::getpid();
|
||||
assert_eq!(unistd::Pid::from_raw(1), pid, "PID should set to 1");
|
||||
|
||||
Ok(())
|
||||
};
|
||||
|
||||
// For now, we test clone with new pid and user namespace. user
|
||||
// namespace is needed for the test to run without root
|
||||
let flags = sched::CloneFlags::CLONE_NEWPID | sched::CloneFlags::CLONE_NEWUSER;
|
||||
let pid = super::clone(
|
||||
Box::new(move || {
|
||||
if cb().is_err() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
0
|
||||
}),
|
||||
flags,
|
||||
)?;
|
||||
|
||||
let status = nix::sys::wait::waitpid(pid, None)?;
|
||||
if let nix::sys::wait::WaitStatus::Exited(_, exit_code) = status {
|
||||
assert_eq!(
|
||||
0, exit_code,
|
||||
"Process didn't exit correctly {:?}",
|
||||
exit_code
|
||||
);
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
bail!("Process didn't exit correctly")
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_clone_stack_allocation() -> Result<()> {
|
||||
let flags = sched::CloneFlags::empty();
|
||||
let pid = super::clone(
|
||||
Box::new(|| {
|
||||
let mut array_on_stack = [0u8; 4096];
|
||||
array_on_stack.iter_mut().for_each(|x| *x = 0);
|
||||
|
||||
0
|
||||
}),
|
||||
flags,
|
||||
)?;
|
||||
|
||||
let status = nix::sys::wait::waitpid(pid, None)?;
|
||||
if let nix::sys::wait::WaitStatus::Exited(_, exit_code) = status {
|
||||
assert_eq!(
|
||||
0, exit_code,
|
||||
"Process didn't exit correctly {:?}",
|
||||
exit_code
|
||||
);
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
bail!("Process didn't exit correctly")
|
||||
}
|
||||
|
||||
fn clone_closure_ownership_test_payload() -> super::CloneCb {
|
||||
// The vec should not be deallocated after this function returns. The
|
||||
// ownership should correctly transfer to the closure returned, to be
|
||||
// passed to the clone and new child process.
|
||||
let numbers: Vec<i32> = (0..101).into_iter().collect();
|
||||
Box::new(move || {
|
||||
assert_eq!(numbers.iter().sum::<i32>(), 5050);
|
||||
0
|
||||
})
|
||||
}
|
||||
|
||||
/// Runs the ownership payload in a cloned child and expects a clean exit,
/// which shows the data captured by the closure was still valid in the child.
#[test]
fn test_clone_closure_ownership() -> Result<()> {
    let payload = clone_closure_ownership_test_payload();
    let pid = super::clone(payload, sched::CloneFlags::empty())?;

    let wait_flags = Some(wait::WaitPidFlag::__WALL);
    let exit_status = wait::waitpid(pid, wait_flags).expect("Waiting for child");
    assert_eq!(exit_status, wait::WaitStatus::Exited(pid, 0));

    Ok(())
}
|
||||
}
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
use anyhow::{bail, Context, Result};
|
||||
use nix::mount::mount as nix_mount;
|
||||
use nix::mount::MsFlags;
|
||||
use nix::sched::CloneFlags;
|
||||
use nix::{
|
||||
fcntl, sched,
|
||||
sys::statfs,
|
||||
unistd::{self, Gid, Uid},
|
||||
unistd::{self, Gid, Pid, Uid},
|
||||
};
|
||||
use oci_spec::Spec;
|
||||
use oci_spec::{LinuxNamespaceType, Spec};
|
||||
use std::collections::HashMap;
|
||||
use std::{
|
||||
env,
|
||||
|
@ -20,7 +21,8 @@ use crate::{
|
|||
hooks,
|
||||
namespaces::Namespaces,
|
||||
notify_socket::NotifyListener,
|
||||
process::child,
|
||||
process::channel,
|
||||
process::fork,
|
||||
rootfs,
|
||||
syscall::{linux::LinuxSyscall, Syscall},
|
||||
tty, utils,
|
||||
|
@ -91,6 +93,59 @@ fn cleanup_file_descriptors(preserve_fds: i32) -> Result<()> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn sysctl(kernel_params: &HashMap<String, String>) -> Result<()> {
|
||||
let sys = PathBuf::from("/proc/sys");
|
||||
for (kernel_param, value) in kernel_params {
|
||||
let path = sys.join(kernel_param.replace(".", "/"));
|
||||
log::debug!(
|
||||
"apply value {} to kernel parameter {}.",
|
||||
value,
|
||||
kernel_param
|
||||
);
|
||||
fs::write(path, value.as_bytes())
|
||||
.with_context(|| format!("failed to set sysctl {}={}", kernel_param, value))?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// make a read only path
|
||||
// The first time we bind mount, other flags are ignored,
|
||||
// so we need to mount it once and then remount it with the necessary flags specified.
|
||||
// https://man7.org/linux/man-pages/man2/mount.2.html
|
||||
fn readonly_path(path: &str) -> Result<()> {
|
||||
match nix_mount::<str, str, str, str>(
|
||||
Some(path),
|
||||
path,
|
||||
None::<&str>,
|
||||
MsFlags::MS_BIND | MsFlags::MS_REC,
|
||||
None::<&str>,
|
||||
) {
|
||||
// ignore error if path is not exist.
|
||||
Err(nix::errno::Errno::ENOENT) => {
|
||||
log::warn!("readonly path {:?} not exist", path);
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => bail!(err),
|
||||
Ok(_) => {}
|
||||
}
|
||||
|
||||
nix_mount::<str, str, str, str>(
|
||||
Some(path),
|
||||
path,
|
||||
None::<&str>,
|
||||
MsFlags::MS_NOSUID
|
||||
| MsFlags::MS_NODEV
|
||||
| MsFlags::MS_NOEXEC
|
||||
| MsFlags::MS_BIND
|
||||
| MsFlags::MS_REMOUNT
|
||||
| MsFlags::MS_RDONLY,
|
||||
None::<&str>,
|
||||
)?;
|
||||
log::debug!("readonly path {:?} mounted", path);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub struct ContainerInitArgs {
|
||||
/// Flag indicating if an init or a tenant container should be created
|
||||
pub init: bool,
|
||||
|
@ -110,23 +165,16 @@ pub struct ContainerInitArgs {
|
|||
pub preserve_fds: i32,
|
||||
/// Container state
|
||||
pub container: Option<Container>,
|
||||
/// Pipe used to communicate with the child process
|
||||
pub child: child::ChildProcess,
|
||||
}
|
||||
|
||||
pub fn container_init(args: ContainerInitArgs) -> Result<()> {
|
||||
pub fn container_intermidiate(
|
||||
args: ContainerInitArgs,
|
||||
main_to_intermediate: &mut channel::Channel,
|
||||
intermediate_to_main: &mut channel::Channel,
|
||||
) -> Result<()> {
|
||||
let command = &args.syscall;
|
||||
let spec = &args.spec;
|
||||
let linux = spec.linux.as_ref().context("no linux in spec")?;
|
||||
// need to create the notify socket before we pivot root, since the unix
|
||||
// domain socket used here is outside of the rootfs of container
|
||||
let mut notify_socket: NotifyListener = NotifyListener::new(&args.notify_path)?;
|
||||
let proc = spec.process.as_ref().context("no process in spec")?;
|
||||
let mut envs: Vec<String> = proc.env.as_ref().unwrap_or(&vec![]).clone();
|
||||
let rootfs = &args.rootfs;
|
||||
let hooks = spec.hooks.as_ref();
|
||||
let container = args.container.as_ref();
|
||||
let mut child = args.child;
|
||||
|
||||
// if Out-of-memory score adjustment is set in specification. set the score
|
||||
// value for the current process check
|
||||
|
@ -144,15 +192,18 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
|
|||
// https://man7.org/linux/man-pages/man7/user_namespaces.7.html for more
|
||||
// information
|
||||
if args.is_rootless {
|
||||
log::debug!("creating new user namespace");
|
||||
sched::unshare(sched::CloneFlags::CLONE_NEWUSER)?;
|
||||
// child needs to be dumpable, otherwise the non root parent is not
|
||||
// allowed to write the uid/gid maps
|
||||
prctl::set_dumpable(true).unwrap();
|
||||
child.request_identifier_mapping()?;
|
||||
child.wait_for_mapping_ack()?;
|
||||
intermediate_to_main.send_identifier_mapping_request()?;
|
||||
main_to_intermediate.wait_for_mapping_ack()?;
|
||||
prctl::set_dumpable(false).unwrap();
|
||||
}
|
||||
|
||||
// set limits and namespaces to the process
|
||||
let proc = spec.process.as_ref().context("no process in spec")?;
|
||||
if let Some(rlimits) = proc.rlimits.as_ref() {
|
||||
for rlimit in rlimits.iter() {
|
||||
command.set_rlimit(rlimit).context("failed to set rlimit")?;
|
||||
|
@ -163,21 +214,72 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
|
|||
.set_id(Uid::from_raw(0), Gid::from_raw(0))
|
||||
.context("failed to become root")?;
|
||||
|
||||
// set up tty if specified
|
||||
if let Some(csocketfd) = args.console_socket {
|
||||
tty::setup_console(&csocketfd)?;
|
||||
// Pid namespace requires an extra fork to enter, so we enter pid namespace now.
|
||||
let namespaces = Namespaces::from(linux.namespaces.as_ref());
|
||||
if let Some(pid_namespace) = namespaces.get(LinuxNamespaceType::Pid) {
|
||||
namespaces
|
||||
.unshare_or_setns(pid_namespace)
|
||||
.with_context(|| format!("Failed to enter pid namespace: {:?}", pid_namespace))?;
|
||||
}
|
||||
|
||||
// join existing namespaces
|
||||
let bind_service = if let Some(ns) = linux.namespaces.as_ref() {
|
||||
let namespaces = Namespaces::from(ns);
|
||||
namespaces.apply_setns()?;
|
||||
// We only need the init process to send us ChildReady.
|
||||
let child_to_parent = &mut channel::Channel::new()?;
|
||||
// We reuse the args passed in, but replace them with a new set of channels.
|
||||
let init_args = ContainerInitArgs { ..args };
|
||||
// We have to record the pid of the child (container init process), since
|
||||
// the child will be inside the pid namespace. We can't rely on child_ready
|
||||
// to send us the correct pid.
|
||||
let pid = fork::container_fork(|| container_init(init_args, child_to_parent))?;
|
||||
// There is no point using the pid returned here, since the child will be
|
||||
// inside the pid namespace already.
|
||||
child_to_parent.wait_for_child_ready()?;
|
||||
// After the child (the container init process) becomes ready, we can signal
|
||||
// the parent (the main process) that we are ready.
|
||||
intermediate_to_main.send_child_ready(pid)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn container_init(
|
||||
args: ContainerInitArgs,
|
||||
init_to_intermediate: &mut channel::Channel,
|
||||
) -> Result<()> {
|
||||
let command = &args.syscall;
|
||||
let spec = &args.spec;
|
||||
let linux = spec.linux.as_ref().context("no linux in spec")?;
|
||||
// Need to create the notify socket before we pivot root, since the unix
|
||||
// domain socket used here is outside of the rootfs of container. During
|
||||
// exec, we need to create the socket before we enter the existing mount
|
||||
// namespace.
|
||||
let mut notify_socket: NotifyListener = NotifyListener::new(&args.notify_path)?;
|
||||
let proc = spec.process.as_ref().context("no process in spec")?;
|
||||
let mut envs: Vec<String> = proc.env.as_ref().unwrap_or(&vec![]).clone();
|
||||
let rootfs = &args.rootfs;
|
||||
let hooks = spec.hooks.as_ref();
|
||||
let container = args.container.as_ref();
|
||||
let namespaces = Namespaces::from(linux.namespaces.as_ref());
|
||||
|
||||
// set up tty if specified
|
||||
if let Some(csocketfd) = args.console_socket {
|
||||
tty::setup_console(&csocketfd).with_context(|| "Failed to set up tty")?;
|
||||
}
|
||||
|
||||
// Enter into rest of namespace. Note, we already entered into user and pid
|
||||
// namespace. We also have to enter into mount namespace last since
|
||||
// namespace may be bound to a /proc path. The /proc path will need to be
|
||||
// accessed before pivot_root.
|
||||
namespaces
|
||||
.apply_namespaces(|ns_type| -> bool {
|
||||
ns_type != CloneFlags::CLONE_NEWUSER
|
||||
&& ns_type != CloneFlags::CLONE_NEWPID
|
||||
&& ns_type != CloneFlags::CLONE_NEWNS
|
||||
})
|
||||
.with_context(|| "Failed to apply namespaces")?;
|
||||
if let Some(mount_namespace) = namespaces.get(LinuxNamespaceType::Mount) {
|
||||
namespaces
|
||||
.clone_flags
|
||||
.contains(sched::CloneFlags::CLONE_NEWUSER)
|
||||
} else {
|
||||
false
|
||||
};
|
||||
.unshare_or_setns(mount_namespace)
|
||||
.with_context(|| format!("Failed to enter mount namespace: {:?}", mount_namespace))?;
|
||||
}
|
||||
|
||||
if let Some(hostname) = spec.hostname.as_ref() {
|
||||
command.set_hostname(hostname)?;
|
||||
|
@ -193,6 +295,8 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
|
|||
if let Some(hooks) = hooks {
|
||||
hooks::run_hooks(hooks.create_container.as_ref(), container)?
|
||||
}
|
||||
|
||||
let bind_service = namespaces.get(LinuxNamespaceType::User).is_some();
|
||||
rootfs::prepare_rootfs(spec, rootfs, bind_service)
|
||||
.with_context(|| "Failed to prepare rootfs")?;
|
||||
|
||||
|
@ -202,7 +306,8 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
|
|||
.with_context(|| format!("Failed to pivot root to {:?}", rootfs))?;
|
||||
|
||||
if let Some(kernel_params) = &linux.sysctl {
|
||||
sysctl(kernel_params)?;
|
||||
sysctl(kernel_params)
|
||||
.with_context(|| format!("Failed to sysctl: {:?}", kernel_params))?;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -273,9 +378,7 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
|
|||
};
|
||||
|
||||
// clean up and handle preserved fds.
|
||||
if args.init {
|
||||
cleanup_file_descriptors(preserve_fds).with_context(|| "Failed to clean up extra fds")?;
|
||||
}
|
||||
cleanup_file_descriptors(preserve_fds).with_context(|| "Failed to clean up extra fds")?;
|
||||
|
||||
// change directory to process.cwd if process.cwd is not empty
|
||||
if do_chdir {
|
||||
|
@ -289,7 +392,10 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
|
|||
.for_each(|(key, value)| env::set_var(key, value));
|
||||
|
||||
// notify parents that the init process is ready to execute the payload.
|
||||
child.notify_parent()?;
|
||||
// Note, we pass -1 here because we are already inside the pid namespace.
|
||||
// The pid outside the pid namespace should be recorded by the intermediate
|
||||
// process.
|
||||
init_to_intermediate.send_child_ready(Pid::from_raw(-1))?;
|
||||
|
||||
// listen on the notify socket for the container start command
|
||||
notify_socket.wait_for_container_start()?;
|
||||
|
@ -313,59 +419,6 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
|
|||
unreachable!();
|
||||
}
|
||||
|
||||
fn sysctl(kernel_params: &HashMap<String, String>) -> Result<()> {
|
||||
let sys = PathBuf::from("/proc/sys");
|
||||
for (kernel_param, value) in kernel_params {
|
||||
let path = sys.join(kernel_param.replace(".", "/"));
|
||||
log::debug!(
|
||||
"apply value {} to kernel parameter {}.",
|
||||
value,
|
||||
kernel_param
|
||||
);
|
||||
fs::write(path, value.as_bytes())
|
||||
.with_context(|| format!("failed to set sysctl {}={}", kernel_param, value))?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// make a read only path
|
||||
// The first time we bind mount, other flags are ignored,
|
||||
// so we need to mount it once and then remount it with the necessary flags specified.
|
||||
// https://man7.org/linux/man-pages/man2/mount.2.html
|
||||
fn readonly_path(path: &str) -> Result<()> {
|
||||
match nix_mount::<str, str, str, str>(
|
||||
Some(path),
|
||||
path,
|
||||
None::<&str>,
|
||||
MsFlags::MS_BIND | MsFlags::MS_REC,
|
||||
None::<&str>,
|
||||
) {
|
||||
// ignore error if path is not exist.
|
||||
Err(nix::errno::Errno::ENOENT) => {
|
||||
log::warn!("readonly path {:?} not exist", path);
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => bail!(err),
|
||||
Ok(_) => {}
|
||||
}
|
||||
|
||||
nix_mount::<str, str, str, str>(
|
||||
Some(path),
|
||||
path,
|
||||
None::<&str>,
|
||||
MsFlags::MS_NOSUID
|
||||
| MsFlags::MS_NODEV
|
||||
| MsFlags::MS_NOEXEC
|
||||
| MsFlags::MS_BIND
|
||||
| MsFlags::MS_REMOUNT
|
||||
| MsFlags::MS_RDONLY,
|
||||
None::<&str>,
|
||||
)?;
|
||||
log::debug!("readonly path {:?} mounted", path);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
|
|
@ -1,24 +1,7 @@
|
|||
//! Provides a thin wrapper around fork syscall,
|
||||
//! with enums and functions specific to youki implemented
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
pub mod child;
|
||||
pub mod channel;
|
||||
pub mod fork;
|
||||
pub mod init;
|
||||
pub mod message;
|
||||
pub mod parent;
|
||||
|
||||
/// Describes which side of a fork the current process is on.
|
||||
/// Parent and child mean the same thing as in a normal fork call.
|
||||
/// NOTE(review): earlier wording mentioned an `InitProcess` variant; no such variant is declared here.
|
||||
pub enum Process<'a> {
|
||||
/// The parent side, carrying its `ParentProcess` handle.
Parent(parent::ParentProcess<'a>),
|
||||
/// The child side, carrying its `ChildProcess` handle.
Child(child::ChildProcess),
|
||||
}
|
||||
/// Maximum event capacity of polling
|
||||
const MAX_EVENTS: usize = 128;
|
||||
/// Time to wait when polling for message from child process
|
||||
const WAIT_FOR_CHILD: Duration = Duration::from_secs(5);
|
||||
/// Time to wait when polling for mapping ack from parent
|
||||
const WAIT_FOR_MAPPING: Duration = Duration::from_secs(3);
|
||||
|
|
|
@ -1,246 +0,0 @@
|
|||
use std::io::ErrorKind;
|
||||
use std::io::Read;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use std::process::Command;
|
||||
|
||||
use super::{MAX_EVENTS, WAIT_FOR_CHILD};
|
||||
use crate::process::message::Message;
|
||||
use crate::process::WAIT_FOR_MAPPING;
|
||||
use crate::rootless::Rootless;
|
||||
use crate::utils;
|
||||
use anyhow::Context;
|
||||
use anyhow::{bail, Result};
|
||||
use mio::unix::pipe;
|
||||
use mio::unix::pipe::{Receiver, Sender};
|
||||
use mio::{Events, Interest, Poll, Token};
|
||||
use nix::unistd::Pid;
|
||||
use oci_spec::LinuxIdMapping;
|
||||
|
||||
// Token is used to identify which socket generated an event
|
||||
const PARENT: Token = Token(0);
|
||||
|
||||
/// Contains receiving end of pipe to child process and a poller for that.
|
||||
pub struct ParentProcess<'a> {
|
||||
child_channel: ChildChannel<'a>,
|
||||
}
|
||||
|
||||
// Poll is used to register and listen for various events
|
||||
// by registering it with an event source such as receiving end of a pipe
|
||||
impl<'a> ParentProcess<'a> {
|
||||
/// Create new Parent process structure
|
||||
pub fn new(rootless: &'a Option<Rootless>) -> Result<(Self, ParentChannel)> {
|
||||
let (parent_channel, child_channel) = Self::setup_pipes(rootless)?;
|
||||
let parent = Self { child_channel };
|
||||
|
||||
Ok((parent, parent_channel))
|
||||
}
|
||||
|
||||
fn setup_pipes(rootless: &'a Option<Rootless>) -> Result<(ParentChannel, ChildChannel<'a>)> {
|
||||
let (send_to_parent, receive_from_child) = pipe::new()?;
|
||||
let (send_to_child, receive_from_parent) = pipe::new()?;
|
||||
|
||||
let parent_channel = ParentChannel::new(send_to_parent, receive_from_parent)?;
|
||||
let child_channel = ChildChannel::new(send_to_child, receive_from_child, rootless)?;
|
||||
|
||||
Ok((parent_channel, child_channel))
|
||||
}
|
||||
|
||||
/// Waits for associated child process to send ready message
|
||||
/// and return the pid of init process which is forked by child process
|
||||
pub fn wait_for_child_ready(&mut self, child_pid: Pid) -> Result<()> {
|
||||
self.child_channel.wait_for_child_ready(child_pid)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// Channel for communicating with the parent
|
||||
pub struct ParentChannel {
|
||||
sender: Sender,
|
||||
receiver: Receiver,
|
||||
poll: Poll,
|
||||
}
|
||||
|
||||
impl ParentChannel {
|
||||
fn new(sender: Sender, mut receiver: Receiver) -> Result<Self> {
|
||||
let poll = Poll::new()?;
|
||||
poll.registry()
|
||||
.register(&mut receiver, PARENT, Interest::READABLE)?;
|
||||
Ok(Self {
|
||||
sender,
|
||||
receiver,
|
||||
poll,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn send_child_ready(&mut self) -> Result<()> {
|
||||
// write ChildReady message to the pipe to parent
|
||||
log::debug!("[child to parent] sending child ready");
|
||||
self.write_message(Message::ChildReady)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// requests the parent to write the id mappings for the child process
|
||||
// this needs to be done from the parent see https://man7.org/linux/man-pages/man7/user_namespaces.7.html
|
||||
pub fn request_identifier_mapping(&mut self) -> Result<()> {
|
||||
log::debug!("[child to parent] request identifier mapping");
|
||||
self.write_message(Message::WriteMapping)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// wait until the parent process has finished writing the id mappings
|
||||
pub fn wait_for_mapping_ack(&mut self) -> Result<()> {
|
||||
let mut events = Events::with_capacity(MAX_EVENTS);
|
||||
log::debug!("waiting for ack from parent");
|
||||
|
||||
self.poll.poll(&mut events, Some(WAIT_FOR_MAPPING))?;
|
||||
for event in events.iter() {
|
||||
if event.token() == PARENT {
|
||||
let mut buf = [0; 1];
|
||||
match self.receiver.read_exact(&mut buf) {
|
||||
Err(ref e) if e.kind() == ErrorKind::WouldBlock => (),
|
||||
Err(e) => bail!(
|
||||
"Failed to receive a message from the child process. {:?}",
|
||||
e
|
||||
),
|
||||
_ => (),
|
||||
}
|
||||
|
||||
match Message::from(u8::from_be_bytes(buf)) {
|
||||
Message::MappingWritten => return Ok(()),
|
||||
msg => bail!("receive unexpected message {:?} in child process", msg),
|
||||
}
|
||||
}
|
||||
}
|
||||
unreachable!("timed out waiting for mapping ack from parent")
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn write_message(&mut self, msg: Message) -> Result<()> {
|
||||
self.sender.write_all(&(msg as u8).to_be_bytes())?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
struct ChildChannel<'a> {
|
||||
sender: Sender,
|
||||
receiver: Receiver,
|
||||
poll: Poll,
|
||||
rootless: &'a Option<Rootless<'a>>,
|
||||
}
|
||||
|
||||
impl<'a> ChildChannel<'a> {
|
||||
fn new(sender: Sender, mut receiver: Receiver, rootless: &'a Option<Rootless>) -> Result<Self> {
|
||||
let poll = Poll::new()?;
|
||||
poll.registry()
|
||||
.register(&mut receiver, PARENT, Interest::READABLE)?;
|
||||
Ok(Self {
|
||||
sender,
|
||||
receiver,
|
||||
poll,
|
||||
rootless,
|
||||
})
|
||||
}
|
||||
|
||||
/// Waits for associated child process to send ready message
|
||||
/// and return the pid of init process which is forked by child process
|
||||
pub fn wait_for_child_ready(&mut self, child_pid: Pid) -> Result<()> {
|
||||
// Create collection with capacity to store up to MAX_EVENTS events
|
||||
let mut events = Events::with_capacity(MAX_EVENTS);
|
||||
loop {
|
||||
// poll the receiving end of pipe created for WAIT_FOR_CHILD duration for an event
|
||||
self.poll.poll(&mut events, Some(WAIT_FOR_CHILD))?;
|
||||
for event in events.iter() {
|
||||
// check if the event token in PARENT
|
||||
// note that this does not assign anything to PARENT, but instead compares PARENT and event.token()
|
||||
// check http://patshaughnessy.net/2018/1/18/learning-rust-if-let-vs--match for a bit more detailed explanation
|
||||
if let PARENT = event.token() {
|
||||
// read data from pipe
|
||||
let mut buf = [0; 1];
|
||||
match self.receiver.read_exact(&mut buf) {
|
||||
// This error simply means that there are no more incoming connections waiting to be accepted at this point.
|
||||
Err(ref e) if e.kind() == ErrorKind::WouldBlock => {
|
||||
break;
|
||||
}
|
||||
Err(e) => bail!(
|
||||
"Failed to receive a message from the child process. {:?}",
|
||||
e
|
||||
),
|
||||
_ => (),
|
||||
};
|
||||
// convert to Message wrapper
|
||||
match Message::from(u8::from_be_bytes(buf)) {
|
||||
Message::ChildReady => {
|
||||
log::debug!("received child ready message");
|
||||
return Ok(());
|
||||
}
|
||||
Message::WriteMapping => {
|
||||
log::debug!("write mapping for pid {:?}", child_pid);
|
||||
utils::write_file(format!("/proc/{}/setgroups", child_pid), "deny")?;
|
||||
self.write_uid_mapping(child_pid)?;
|
||||
self.write_gid_mapping(child_pid)?;
|
||||
self.notify_mapping_written()?;
|
||||
}
|
||||
msg => bail!("receive unexpected message {:?} in parent process", msg),
|
||||
}
|
||||
} else {
|
||||
// as the poll is registered with only parent token
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn notify_mapping_written(&mut self) -> Result<()> {
|
||||
self.sender
|
||||
.write_all(&(Message::MappingWritten as u8).to_be_bytes())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_uid_mapping(&self, target_pid: Pid) -> Result<()> {
|
||||
if let Some(rootless) = self.rootless.as_ref() {
|
||||
if let Some(uid_mappings) = rootless.gid_mappings {
|
||||
return write_id_mapping(
|
||||
&format!("/proc/{}/uid_map", target_pid),
|
||||
uid_mappings,
|
||||
rootless.newuidmap.as_deref(),
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_gid_mapping(&self, target_pid: Pid) -> Result<()> {
|
||||
if let Some(rootless) = self.rootless.as_ref() {
|
||||
if let Some(gid_mappings) = rootless.gid_mappings {
|
||||
return write_id_mapping(
|
||||
&format!("/proc/{}/gid_map", target_pid),
|
||||
gid_mappings,
|
||||
rootless.newgidmap.as_deref(),
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn write_id_mapping(
|
||||
map_file: &str,
|
||||
mappings: &[LinuxIdMapping],
|
||||
map_binary: Option<&Path>,
|
||||
) -> Result<()> {
|
||||
let mappings: Vec<String> = mappings
|
||||
.iter()
|
||||
.map(|m| format!("{} {} {}", m.container_id, m.host_id, m.size))
|
||||
.collect();
|
||||
if mappings.len() == 1 {
|
||||
utils::write_file(map_file, mappings.first().unwrap())?;
|
||||
} else {
|
||||
Command::new(map_binary.unwrap())
|
||||
.args(mappings)
|
||||
.output()
|
||||
.with_context(|| format!("failed to execute {:?}", map_binary))?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
|
@ -1,8 +1,7 @@
|
|||
use std::{env, path::PathBuf};
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
use nix::sched::CloneFlags;
|
||||
use oci_spec::{Linux, LinuxIdMapping, Mount, Spec};
|
||||
use oci_spec::{Linux, LinuxIdMapping, LinuxNamespaceType, Mount, Spec};
|
||||
|
||||
use crate::namespaces::Namespaces;
|
||||
|
||||
|
@ -84,14 +83,8 @@ pub fn validate(spec: &Spec) -> Result<()> {
|
|||
bail!("rootless containers require at least one gid mapping")
|
||||
}
|
||||
|
||||
let namespaces = Namespaces::from(
|
||||
linux
|
||||
.namespaces
|
||||
.as_ref()
|
||||
.context("rootless containers require the namespaces.")?,
|
||||
);
|
||||
|
||||
if !namespaces.clone_flags.contains(CloneFlags::CLONE_NEWUSER) {
|
||||
let namespaces = Namespaces::from(linux.namespaces.as_ref());
|
||||
if namespaces.get(LinuxNamespaceType::User).is_none() {
|
||||
bail!("rootless containers require the specification of a user namespace");
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue