1
0
mirror of https://github.com/containers/youki synced 2024-11-23 09:21:57 +01:00

add framework for seccomp notify

This commit is contained in:
yihuaf 2021-10-12 10:21:41 +02:00
parent 8fbe51cd5f
commit eca790070d
4 changed files with 114 additions and 24 deletions

@ -37,6 +37,12 @@ impl MainSender {
Ok(())
}
/// Writes a `Message::SeccompNotify` message through this sender.
///
/// # Errors
///
/// Propagates the error if writing the message fails.
pub fn seccomp_notify_request(&mut self) -> Result<()> {
self.sender.write_message(Message::SeccompNotify)?;
Ok(())
}
pub fn intermediate_ready(&mut self, pid: Pid) -> Result<()> {
// Send over the IntermediateReady followed by the pid.
log::debug!("sending init pid ({:?})", pid);
@ -86,6 +92,22 @@ impl MainReceiver {
}
}
/// Waits for the seccomp notify request message.
///
/// Reads exactly one byte from the receiver, decodes it with
/// `Message::from`, and succeeds only when the decoded message is
/// `Message::SeccompNotify`.
///
/// # Errors
///
/// Fails when the byte cannot be read from the receiver, or when any message
/// other than `Message::SeccompNotify` is received.
pub fn wait_for_seccomp_request(&mut self) -> Result<()> {
    let mut buf = [0; 1];
    self.receiver
        .read_exact(&mut buf)
        .with_context(|| "failed to receive a message from the child process")?;

    // convert to Message wrapper
    match Message::from(u8::from_be_bytes(buf)) {
        Message::SeccompNotify => Ok(()),
        // Name the request we were actually waiting for, so the error is not
        // mistaken for a failure of the id-mapping handshake.
        msg => bail!(
            "receive unexpected message {:?} waiting for seccomp request",
            msg
        ),
    }
}
/// Closes the underlying receiver and returns the result of that close call.
pub fn close(&self) -> Result<()> {
self.receiver.close()
}
@ -174,6 +196,12 @@ pub struct InitSender {
}
impl InitSender {
/// Writes a `Message::SeccompNotifyDone` message through this sender.
///
/// # Errors
///
/// Propagates the error if writing the message fails.
pub fn seccomp_notify_done(&mut self) -> Result<()> {
self.sender.write_message(Message::SeccompNotifyDone)?;
Ok(())
}
/// Closes the underlying sender and returns the result of that close call.
pub fn close(&self) -> Result<()> {
self.sender.close()
}
@ -184,6 +212,22 @@ pub struct InitReceiver {
}
impl InitReceiver {
/// Waits for the `Message::SeccompNotifyDone` acknowledgement.
///
/// A single byte is read from the receiver and decoded with `Message::from`.
///
/// # Errors
///
/// Fails when the byte cannot be read, or when the decoded message is
/// anything other than `Message::SeccompNotifyDone`.
pub fn wait_for_seccomp_request_done(&mut self) -> Result<()> {
    let mut raw = [0u8; 1];
    self.receiver
        .read_exact(&mut raw)
        .with_context(|| "failed to receive a message")?;

    // Decode the wire byte into its Message wrapper.
    let received = Message::from(u8::from_be_bytes(raw));

    // Anything but the expected acknowledgement is a protocol error.
    if !matches!(received, Message::SeccompNotifyDone) {
        bail!(
            "receive unexpected message {:?} waiting for seccomp done request",
            received
        );
    }

    Ok(())
}
/// Closes the underlying receiver and returns the result of that close call.
pub fn close(&self) -> Result<()> {
self.receiver.close()
}

@ -13,7 +13,7 @@ use nix::{
fcntl,
unistd::{self, Gid, Uid},
};
use oci_spec::runtime::{LinuxNamespaceType, Spec, User};
use oci_spec::runtime::{LinuxNamespaceType, LinuxSeccomp, Spec, User};
use std::collections::HashMap;
use std::{
env, fs,
@ -188,7 +188,8 @@ fn apply_rest_namespaces(
pub fn container_init_process(
args: &ContainerArgs,
intermediate_sender: &mut channel::IntermediateSender,
_init_receiver: &mut channel::InitReceiver,
main_sender: &mut channel::MainSender,
init_receiver: &mut channel::InitReceiver,
) -> Result<()> {
let syscall = args.syscall;
let spec = &args.spec;
@ -294,7 +295,7 @@ pub fn container_init_process(
match unistd::chdir(proc.cwd()) {
Ok(_) => false,
Err(nix::Error::EPERM) => true,
Err(e) => bail!("Failed to chdir: {}", e),
Err(e) => bail!("failed to chdir: {}", e),
}
};
@ -306,14 +307,16 @@ pub fn container_init_process(
Uid::from_raw(proc.user().uid()),
Gid::from_raw(proc.user().gid()),
)
.context("Failed to configure uid and gid")?;
.context("failed to configure uid and gid")?;
// Without no new privileges, seccomp is a privileged operation. We have to
// do this before dropping capabilities. Otherwise, we should do it later,
// as close to exec as possible.
if linux.seccomp().is_some() && proc.no_new_privileges().is_none() {
seccomp::initialize_seccomp(linux.seccomp().as_ref().unwrap())
.context("Failed to execute seccomp")?;
if let Some(seccomp) = linux.seccomp() {
seccomp::initialize_seccomp(seccomp).context("failed to execute seccomp")?;
sync_seccomp(seccomp, main_sender, init_receiver).context("failed to sync seccomp")?;
}
}
capabilities::reset_effective(syscall).context("Failed to reset effective capabilities")?;
@ -361,10 +364,11 @@ pub fn container_init_process(
}
};
// clean up and handle perserved fds.
// Clean up and handle preserved fds. We only mark the fds as CLOEXEC, so we
// don't have to worry about when the fds will be closed.
cleanup_file_descriptors(preserve_fds).with_context(|| "Failed to clean up extra fds")?;
// change directory to process.cwd if process.cwd is not empty
// Change directory to process.cwd if process.cwd is not empty
if do_chdir {
unistd::chdir(proc.cwd()).with_context(|| format!("failed to chdir {:?}", proc.cwd()))?;
}
@ -375,6 +379,16 @@ pub fn container_init_process(
.iter()
.for_each(|(key, value)| env::set_var(key, value));
// Initialize seccomp profile right before we are ready to execute the
// payload so as few syscalls will happen between here and payload exec. The
// notify socket will still need network related syscalls.
if let Some(seccomp) = linux.seccomp() {
if proc.no_new_privileges().is_some() {
seccomp::initialize_seccomp(seccomp).context("failed to execute seccomp")?;
sync_seccomp(seccomp, main_sender, init_receiver).context("failed to sync seccomp")?;
}
}
// notify parents that the init process is ready to execute the payload.
// Note, we pass -1 here because we are already inside the pid namespace.
// The pid outside the pid namespace should be recorded by the intermediate
@ -392,14 +406,6 @@ pub fn container_init_process(
}
}
if let Some(seccomp) = linux.seccomp() {
if proc.no_new_privileges().is_some() {
// Initialize seccomp profile right before we are ready to execute the
// payload. The notify socket will still need network related syscalls.
seccomp::initialize_seccomp(seccomp).context("Failed to execute seccomp")?;
}
}
if let Some(args) = proc.args() {
utils::do_exec(&args[0], args)?;
} else {
@ -469,6 +475,19 @@ fn set_supplementary_gids(user: &User, rootless: &Option<Rootless>) -> Result<()
Ok(())
}
/// Runs the seccomp notify handshake with the main process.
///
/// Nothing is sent unless `seccomp::is_notify` reports true for the profile.
/// In that case a seccomp notify request is written to `main_sender`, and
/// the call then waits on `init_receiver` for the matching "done" message.
///
/// # Errors
///
/// Propagates any failure from sending the request or from waiting for the
/// reply.
fn sync_seccomp(
    seccomp: &LinuxSeccomp,
    main_sender: &mut channel::MainSender,
    init_receiver: &mut channel::InitReceiver,
) -> Result<()> {
    // No notify in the profile: there is nothing to synchronize.
    if !seccomp::is_notify(seccomp) {
        return Ok(());
    }

    main_sender.seccomp_notify_request()?;
    init_receiver.wait_for_seccomp_request_done()?;

    Ok(())
}
#[cfg(test)]
mod tests {
use super::*;

@ -13,6 +13,8 @@ pub fn container_intermediate_process(
args: &ContainerArgs,
intermediate_sender: &mut channel::IntermediateSender,
intermediate_receiver: &mut channel::IntermediateReceiver,
init_sender: &mut channel::InitSender,
init_receiver: &mut channel::InitReceiver,
main_sender: &mut channel::MainSender,
) -> Result<()> {
let command = &args.syscall;
@ -75,9 +77,6 @@ pub fn container_intermediate_process(
.context("failed to apply cgroups")?
}
// We only need for init process to send us the ChildReady.
let (init_sender, init_receiver) = &mut channel::init_channel()?;
// We have to record the pid of the child (container init process), since
// the child will be inside the pid namespace. We can't rely on child_ready
// to send us the correct pid.
@ -86,10 +85,7 @@ pub fn container_intermediate_process(
init_sender
.close()
.context("failed to close receiver in init process")?;
main_sender
.close()
.context("failed to close unused sender")?;
container_init_process(args, intermediate_sender, init_receiver)
container_init_process(args, intermediate_sender, main_sender, init_receiver)
})?;
// Close unused fds in the parent process.
intermediate_sender

@ -1,15 +1,17 @@
use crate::{
process::{args::ContainerArgs, channel, container_intermediate_process, fork},
rootless::Rootless,
utils,
seccomp, utils,
};
use anyhow::{Context, Result};
use nix::unistd::Pid;
use oci_spec::runtime::LinuxSeccomp;
pub fn container_main_process(container_args: &ContainerArgs) -> Result<Pid> {
// We use a set of channels to communicate between parent and child process. Each channel is uni-directional.
let (main_sender, main_receiver) = &mut channel::main_channel()?;
let (intermediate_sender, intermediate_receiver) = &mut channel::intermediate_channel()?;
let (init_sender, init_receiver) = &mut channel::init_channel()?;
let intermediate_pid = fork::container_fork(|| {
// The fds in the channel are duplicated during fork, so we first close
@ -22,6 +24,8 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<Pid> {
container_args,
intermediate_sender,
intermediate_receiver,
init_sender,
init_receiver,
main_sender,
)
})?;
@ -44,12 +48,38 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<Pid> {
.close()
.context("failed to close unused sender")?;
if let Some(linux) = container_args.spec.linux() {
if let Some(seccomp) = linux.seccomp() {
sync_seccomp(seccomp, init_sender, main_receiver)
.context("failed to sync seccomp with init")?;
}
}
init_sender
.close()
.context("failed to close unused init sender")?;
let init_pid = main_receiver.wait_for_intermediate_ready()?;
log::debug!("init pid is {:?}", init_pid);
Ok(init_pid)
}
/// Runs the main-process side of the seccomp notify handshake.
///
/// Nothing happens unless `seccomp::is_notify` reports true for the profile.
/// In that case the call waits on `main_receiver` for the seccomp request and
/// then writes the "done" message to `init_sender`.
///
/// # Errors
///
/// Propagates any failure from waiting for the request or from sending the
/// reply.
fn sync_seccomp(
    seccomp: &LinuxSeccomp,
    init_sender: &mut channel::InitSender,
    main_receiver: &mut channel::MainReceiver,
) -> Result<()> {
    // No notify in the profile: there is nothing to synchronize.
    if !seccomp::is_notify(seccomp) {
        return Ok(());
    }

    log::debug!("main process waiting for sync seccomp");
    main_receiver.wait_for_seccomp_request()?;

    // process seccomp notify
    init_sender.seccomp_notify_done()?;

    Ok(())
}
fn setup_mapping(rootless: &Rootless, pid: Pid) -> Result<()> {
log::debug!("write mapping for pid {:?}", pid);
if !rootless.privileged {
@ -57,6 +87,7 @@ fn setup_mapping(rootless: &Rootless, pid: Pid) -> Result<()> {
// until "deny" has been written to setgroups. See CVE-2014-8989.
utils::write_file(format!("/proc/{}/setgroups", pid), "deny")?;
}
rootless
.write_uid_mapping(pid)
.context(format!("failed to map uid of pid {}", pid))?;