mirror of
https://github.com/containers/youki
synced 2024-11-23 09:21:57 +01:00
add framework for seccomp notify
This commit is contained in:
parent
8fbe51cd5f
commit
eca790070d
@ -37,6 +37,12 @@ impl MainSender {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn seccomp_notify_request(&mut self) -> Result<()> {
|
||||
self.sender.write_message(Message::SeccompNotify)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn intermediate_ready(&mut self, pid: Pid) -> Result<()> {
|
||||
// Send over the IntermediateReady followed by the pid.
|
||||
log::debug!("sending init pid ({:?})", pid);
|
||||
@ -86,6 +92,22 @@ impl MainReceiver {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn wait_for_seccomp_request(&mut self) -> Result<()> {
|
||||
let mut buf = [0; 1];
|
||||
self.receiver
|
||||
.read_exact(&mut buf)
|
||||
.with_context(|| "failed to receive a message from the child process")?;
|
||||
|
||||
// convert to Message wrapper
|
||||
match Message::from(u8::from_be_bytes(buf)) {
|
||||
Message::SeccompNotify => Ok(()),
|
||||
msg => bail!(
|
||||
"receive unexpected message {:?} waiting for mapping request",
|
||||
msg
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
/// Closes the receiving end held by this value by delegating to the
/// wrapped receiver's `close`.
pub fn close(&self) -> Result<()> {
    let receiver = &self.receiver;
    receiver.close()
}
|
||||
@ -174,6 +196,12 @@ pub struct InitSender {
|
||||
}
|
||||
|
||||
impl InitSender {
|
||||
pub fn seccomp_notify_done(&mut self) -> Result<()> {
|
||||
self.sender.write_message(Message::SeccompNotifyDone)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Closes the sending end held by this value by delegating to the wrapped
/// sender's `close`.
pub fn close(&self) -> Result<()> {
    let sender = &self.sender;
    sender.close()
}
|
||||
@ -184,6 +212,22 @@ pub struct InitReceiver {
|
||||
}
|
||||
|
||||
impl InitReceiver {
|
||||
pub fn wait_for_seccomp_request_done(&mut self) -> Result<()> {
|
||||
let mut buf = [0; 1];
|
||||
self.receiver
|
||||
.read_exact(&mut buf)
|
||||
.with_context(|| "failed to receive a message")?;
|
||||
|
||||
// convert to Message wrapper
|
||||
match Message::from(u8::from_be_bytes(buf)) {
|
||||
Message::SeccompNotifyDone => Ok(()),
|
||||
msg => bail!(
|
||||
"receive unexpected message {:?} waiting for seccomp done request",
|
||||
msg
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
/// Closes the receiving end held by this value by delegating to the
/// wrapped receiver's `close`.
pub fn close(&self) -> Result<()> {
    let receiver = &self.receiver;
    receiver.close()
}
|
||||
|
@ -13,7 +13,7 @@ use nix::{
|
||||
fcntl,
|
||||
unistd::{self, Gid, Uid},
|
||||
};
|
||||
use oci_spec::runtime::{LinuxNamespaceType, Spec, User};
|
||||
use oci_spec::runtime::{LinuxNamespaceType, LinuxSeccomp, Spec, User};
|
||||
use std::collections::HashMap;
|
||||
use std::{
|
||||
env, fs,
|
||||
@ -188,7 +188,8 @@ fn apply_rest_namespaces(
|
||||
pub fn container_init_process(
|
||||
args: &ContainerArgs,
|
||||
intermediate_sender: &mut channel::IntermediateSender,
|
||||
_init_receiver: &mut channel::InitReceiver,
|
||||
main_sender: &mut channel::MainSender,
|
||||
init_receiver: &mut channel::InitReceiver,
|
||||
) -> Result<()> {
|
||||
let syscall = args.syscall;
|
||||
let spec = &args.spec;
|
||||
@ -294,7 +295,7 @@ pub fn container_init_process(
|
||||
match unistd::chdir(proc.cwd()) {
|
||||
Ok(_) => false,
|
||||
Err(nix::Error::EPERM) => true,
|
||||
Err(e) => bail!("Failed to chdir: {}", e),
|
||||
Err(e) => bail!("failed to chdir: {}", e),
|
||||
}
|
||||
};
|
||||
|
||||
@ -306,14 +307,16 @@ pub fn container_init_process(
|
||||
Uid::from_raw(proc.user().uid()),
|
||||
Gid::from_raw(proc.user().gid()),
|
||||
)
|
||||
.context("Failed to configure uid and gid")?;
|
||||
.context("failed to configure uid and gid")?;
|
||||
|
||||
// Without no new privileges, seccomp is a privileged operation. We have to
|
||||
// do this before dropping capabilities. Otherwise, we should do it later,
|
||||
// as close to exec as possible.
|
||||
if linux.seccomp().is_some() && proc.no_new_privileges().is_none() {
|
||||
seccomp::initialize_seccomp(linux.seccomp().as_ref().unwrap())
|
||||
.context("Failed to execute seccomp")?;
|
||||
if let Some(seccomp) = linux.seccomp() {
|
||||
seccomp::initialize_seccomp(seccomp).context("failed to execute seccomp")?;
|
||||
sync_seccomp(seccomp, main_sender, init_receiver).context("failed to sync seccomp")?;
|
||||
}
|
||||
}
|
||||
|
||||
capabilities::reset_effective(syscall).context("Failed to reset effective capabilities")?;
|
||||
@ -361,10 +364,11 @@ pub fn container_init_process(
|
||||
}
|
||||
};
|
||||
|
||||
// clean up and handle perserved fds.
|
||||
// Clean up and handle preserved fds. We only mark the fd as CLOSEXEC, so we
|
||||
// don't have to worry about when the fd will be closed.
|
||||
cleanup_file_descriptors(preserve_fds).with_context(|| "Failed to clean up extra fds")?;
|
||||
|
||||
// change directory to process.cwd if process.cwd is not empty
|
||||
// Change directory to process.cwd if process.cwd is not empty
|
||||
if do_chdir {
|
||||
unistd::chdir(proc.cwd()).with_context(|| format!("failed to chdir {:?}", proc.cwd()))?;
|
||||
}
|
||||
@ -375,6 +379,16 @@ pub fn container_init_process(
|
||||
.iter()
|
||||
.for_each(|(key, value)| env::set_var(key, value));
|
||||
|
||||
// Initialize seccomp profile right before we are ready to execute the
|
||||
// payload so as few syscalls will happen between here and payload exec. The
|
||||
// notify socket will still need network related syscalls.
|
||||
if let Some(seccomp) = linux.seccomp() {
|
||||
if proc.no_new_privileges().is_some() {
|
||||
seccomp::initialize_seccomp(seccomp).context("failed to execute seccomp")?;
|
||||
sync_seccomp(seccomp, main_sender, init_receiver).context("failed to sync seccomp")?;
|
||||
}
|
||||
}
|
||||
|
||||
// notify parents that the init process is ready to execute the payload.
|
||||
// Note, we pass -1 here because we are already inside the pid namespace.
|
||||
// The pid outside the pid namespace should be recorded by the intermediate
|
||||
@ -392,14 +406,6 @@ pub fn container_init_process(
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(seccomp) = linux.seccomp() {
|
||||
if proc.no_new_privileges().is_some() {
|
||||
// Initialize seccomp profile right before we are ready to execute the
|
||||
// payload. The notify socket will still need network related syscalls.
|
||||
seccomp::initialize_seccomp(seccomp).context("Failed to execute seccomp")?;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(args) = proc.args() {
|
||||
utils::do_exec(&args[0], args)?;
|
||||
} else {
|
||||
@ -469,6 +475,19 @@ fn set_supplementary_gids(user: &User, rootless: &Option<Rootless>) -> Result<()
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn sync_seccomp(
|
||||
seccomp: &LinuxSeccomp,
|
||||
main_sender: &mut channel::MainSender,
|
||||
init_receiver: &mut channel::InitReceiver,
|
||||
) -> Result<()> {
|
||||
if seccomp::is_notify(seccomp) {
|
||||
main_sender.seccomp_notify_request()?;
|
||||
init_receiver.wait_for_seccomp_request_done()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
@ -13,6 +13,8 @@ pub fn container_intermediate_process(
|
||||
args: &ContainerArgs,
|
||||
intermediate_sender: &mut channel::IntermediateSender,
|
||||
intermediate_receiver: &mut channel::IntermediateReceiver,
|
||||
init_sender: &mut channel::InitSender,
|
||||
init_receiver: &mut channel::InitReceiver,
|
||||
main_sender: &mut channel::MainSender,
|
||||
) -> Result<()> {
|
||||
let command = &args.syscall;
|
||||
@ -75,9 +77,6 @@ pub fn container_intermediate_process(
|
||||
.context("failed to apply cgroups")?
|
||||
}
|
||||
|
||||
// We only need for init process to send us the ChildReady.
|
||||
let (init_sender, init_receiver) = &mut channel::init_channel()?;
|
||||
|
||||
// We have to record the pid of the child (container init process), since
|
||||
// the child will be inside the pid namespace. We can't rely on child_ready
|
||||
// to send us the correct pid.
|
||||
@ -86,10 +85,7 @@ pub fn container_intermediate_process(
|
||||
init_sender
|
||||
.close()
|
||||
.context("failed to close receiver in init process")?;
|
||||
main_sender
|
||||
.close()
|
||||
.context("failed to close unused sender")?;
|
||||
container_init_process(args, intermediate_sender, init_receiver)
|
||||
container_init_process(args, intermediate_sender, main_sender, init_receiver)
|
||||
})?;
|
||||
// Close unused fds in the parent process.
|
||||
intermediate_sender
|
||||
|
@ -1,15 +1,17 @@
|
||||
use crate::{
|
||||
process::{args::ContainerArgs, channel, container_intermediate_process, fork},
|
||||
rootless::Rootless,
|
||||
utils,
|
||||
seccomp, utils,
|
||||
};
|
||||
use anyhow::{Context, Result};
|
||||
use nix::unistd::Pid;
|
||||
use oci_spec::runtime::LinuxSeccomp;
|
||||
|
||||
pub fn container_main_process(container_args: &ContainerArgs) -> Result<Pid> {
|
||||
// We use a set of channels to communicate between parent and child process. Each channel is uni-directional.
|
||||
let (main_sender, main_receiver) = &mut channel::main_channel()?;
|
||||
let (intermediate_sender, intermediate_receiver) = &mut channel::intermediate_channel()?;
|
||||
let (init_sender, init_receiver) = &mut channel::init_channel()?;
|
||||
|
||||
let intermediate_pid = fork::container_fork(|| {
|
||||
// The fds in the channel are duplicated during fork, so we first close
|
||||
@ -22,6 +24,8 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<Pid> {
|
||||
container_args,
|
||||
intermediate_sender,
|
||||
intermediate_receiver,
|
||||
init_sender,
|
||||
init_receiver,
|
||||
main_sender,
|
||||
)
|
||||
})?;
|
||||
@ -44,12 +48,38 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result<Pid> {
|
||||
.close()
|
||||
.context("failed to close unused sender")?;
|
||||
|
||||
if let Some(linux) = container_args.spec.linux() {
|
||||
if let Some(seccomp) = linux.seccomp() {
|
||||
sync_seccomp(seccomp, init_sender, main_receiver)
|
||||
.context("failed to sync seccomp with init")?;
|
||||
}
|
||||
}
|
||||
|
||||
init_sender
|
||||
.close()
|
||||
.context("failed to close unused init sender")?;
|
||||
|
||||
let init_pid = main_receiver.wait_for_intermediate_ready()?;
|
||||
log::debug!("init pid is {:?}", init_pid);
|
||||
|
||||
Ok(init_pid)
|
||||
}
|
||||
|
||||
fn sync_seccomp(
|
||||
seccomp: &LinuxSeccomp,
|
||||
init_sender: &mut channel::InitSender,
|
||||
main_receiver: &mut channel::MainReceiver,
|
||||
) -> Result<()> {
|
||||
if seccomp::is_notify(seccomp) {
|
||||
log::debug!("main process waiting for sync seccomp");
|
||||
main_receiver.wait_for_seccomp_request()?;
|
||||
// process seccomp notify
|
||||
init_sender.seccomp_notify_done()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn setup_mapping(rootless: &Rootless, pid: Pid) -> Result<()> {
|
||||
log::debug!("write mapping for pid {:?}", pid);
|
||||
if !rootless.privileged {
|
||||
@ -57,6 +87,7 @@ fn setup_mapping(rootless: &Rootless, pid: Pid) -> Result<()> {
|
||||
// until "deny" has been written to setgroups. See CVE-2014-8989.
|
||||
utils::write_file(format!("/proc/{}/setgroups", pid), "deny")?;
|
||||
}
|
||||
|
||||
rootless
|
||||
.write_uid_mapping(pid)
|
||||
.context(format!("failed to map uid of pid {}", pid))?;
|
||||
|
Loading…
Reference in New Issue
Block a user