diff --git a/src/process/channel.rs b/src/process/channel.rs index 48f8e340..c7105d10 100644 --- a/src/process/channel.rs +++ b/src/process/channel.rs @@ -5,7 +5,10 @@ use nix::{ unistd::{self, Pid}, }; use serde::{Deserialize, Serialize}; -use std::{marker::PhantomData, os::unix::prelude::RawFd}; +use std::{ + marker::PhantomData, + os::unix::prelude::{AsRawFd, RawFd}, +}; /// Channel Design /// @@ -37,8 +40,9 @@ impl MainSender { Ok(()) } - pub fn seccomp_notify_request(&mut self) -> Result<()> { - self.sender.send(Message::SeccompNotify)?; + pub fn seccomp_notify_request(&mut self, fd: RawFd) -> Result<()> { + self.sender + .send_fds(Message::SeccompNotify, &[fd.as_raw_fd()])?; Ok(()) } @@ -51,6 +55,12 @@ impl MainSender { Ok(()) } + pub fn init_ready(&mut self) -> Result<()> { + self.sender.send(Message::InitReady)?; + + Ok(()) + } + pub fn close(&self) -> Result<()> { self.sender.close() } @@ -92,14 +102,20 @@ impl MainReceiver { } } - pub fn wait_for_seccomp_request(&mut self) -> Result<()> { - let msg = self + pub fn wait_for_seccomp_request(&mut self) -> Result { + let (msg, fds) = self .receiver - .recv() + .recv_with_fds::<[RawFd; 1]>() .context("failed to wait for seccomp request")?; match msg { - Message::SeccompNotify => Ok(()), + Message::SeccompNotify => { + let fd = match fds { + Some(fds) => fds[0], + None => bail!("expecting fds from seccomp request"), + }; + Ok(fd) + } msg => bail!( "receive unexpected message {:?} waiting for seccomp request", msg @@ -107,6 +123,22 @@ impl MainReceiver { } } + /// Waits for associated init process to send ready message + /// and return the pid of init process which is forked by init process + pub fn wait_for_init_ready(&mut self) -> Result<()> { + let msg = self + .receiver + .recv() + .context("failed to wait for init ready")?; + match msg { + Message::InitReady => Ok(()), + msg => bail!( + "receive unexpected message {:?} waiting for init ready", + msg + ), + } + } + pub fn close(&self) -> Result<()> { self.receiver.close() } @@ -132,12 +164,6 @@ impl IntermediateSender { Ok(()) } - pub fn init_ready(&mut self) -> Result<()> { - self.sender.send(Message::InitReady)?; - - Ok(()) - } - pub fn close(&self) -> Result<()> { self.sender.close() } @@ -164,22 +190,6 @@ impl IntermediateReceiver { } } - /// Waits for associated init process to send ready message - /// and return the pid of init process which is forked by init process - pub fn wait_for_init_ready(&mut self) -> Result<()> { - let msg = self - .receiver - .recv() - .context("failed to wait for init ready")?; - match msg { - Message::InitReady => Ok(()), - msg => bail!( - "receive unexpected message {:?} waiting for init ready", - msg - ), - } - } - pub fn close(&self) -> Result<()> { self.receiver.close() } @@ -502,7 +512,7 @@ mod tests { #[test] #[serial] fn test_channel_init_ready() -> Result<()> { - let (sender, receiver) = &mut intermediate_channel()?; + let (sender, receiver) = &mut main_channel()?; match unsafe { unistd::fork()? } { unistd::ForkResult::Parent { child } => { wait::waitpid(child, None)?; @@ -547,7 +557,7 @@ mod tests { #[test] #[serial] fn test_channel_intermediate_graceful_exit() -> Result<()> { - let (sender, receiver) = &mut intermediate_channel()?; + let (sender, receiver) = &mut main_channel()?; match unsafe { unistd::fork()? } { unistd::ForkResult::Parent { child } => { sender.close().context("failed to close sender")?; diff --git a/src/process/container_init_process.rs b/src/process/container_init_process.rs index b5c8ad5d..2249c2c3 100644 --- a/src/process/container_init_process.rs +++ b/src/process/container_init_process.rs @@ -13,7 +13,7 @@ use nix::{ fcntl, unistd::{self, Gid, Uid}, }; -use oci_spec::runtime::{LinuxNamespaceType, LinuxSeccomp, Spec, User}; +use oci_spec::runtime::{LinuxNamespaceType, Spec, User}; use std::collections::HashMap; use std::{ env, fs, @@ -187,7 +187,6 @@ fn apply_rest_namespaces( pub fn container_init_process( args: &ContainerArgs, - intermediate_sender: &mut channel::IntermediateSender, main_sender: &mut channel::MainSender, init_receiver: &mut channel::InitReceiver, ) -> Result<()> { @@ -314,8 +313,10 @@ pub fn container_init_process( // as close to exec as possible. if linux.seccomp().is_some() && proc.no_new_privileges().is_none() { if let Some(seccomp) = linux.seccomp() { - seccomp::initialize_seccomp(seccomp).context("failed to execute seccomp")?; - sync_seccomp(seccomp, main_sender, init_receiver).context("failed to sync seccomp")?; + let notify_fd = + seccomp::initialize_seccomp(seccomp).context("failed to execute seccomp")?; + sync_seccomp(notify_fd, main_sender, init_receiver) + .context("failed to sync seccomp")?; } } @@ -384,16 +385,18 @@ pub fn container_init_process( // notify socket will still need network related syscalls. if let Some(seccomp) = linux.seccomp() { if proc.no_new_privileges().is_some() { - seccomp::initialize_seccomp(seccomp).context("failed to execute seccomp")?; - sync_seccomp(seccomp, main_sender, init_receiver).context("failed to sync seccomp")?; + let notify_fd = + seccomp::initialize_seccomp(seccomp).context("failed to execute seccomp")?; + sync_seccomp(notify_fd, main_sender, init_receiver) + .context("failed to sync seccomp")?; } } - // notify parents that the init process is ready to execute the payload. + // Notify main process that the init process is ready to execute the payload. // Note, we pass -1 here because we are already inside the pid namespace. // The pid outside the pid namespace should be recorded by the intermediate // process. - intermediate_sender.init_ready()?; + main_sender.init_ready()?; // listing on the notify socket for container start command args.notify_socket.wait_for_container_start()?; @@ -476,12 +479,13 @@ fn set_supplementary_gids(user: &User, rootless: &Option) -> Result<() } fn sync_seccomp( - seccomp: &LinuxSeccomp, + fd: Option, main_sender: &mut channel::MainSender, init_receiver: &mut channel::InitReceiver, ) -> Result<()> { - if seccomp::is_notify(seccomp) { - main_sender.seccomp_notify_request()?; + if let Some(fd) = fd { + log::debug!("init process sync seccomp, notify fd: {}", fd); + main_sender.seccomp_notify_request(fd)?; init_receiver.wait_for_seccomp_request_done()?; } diff --git a/src/process/container_intermediate_process.rs b/src/process/container_intermediate_process.rs index 2dcfbb85..3db47410 100644 --- a/src/process/container_intermediate_process.rs +++ b/src/process/container_intermediate_process.rs @@ -85,7 +85,10 @@ pub fn container_intermediate_process( init_sender .close() .context("failed to close receiver in init process")?; - container_init_process(args, intermediate_sender, main_sender, init_receiver) + intermediate_sender + .close() + .context("failed to close sender in the intermediate process")?; + container_init_process(args, main_sender, init_receiver) })?; // Close unused fds in the parent process. intermediate_sender @@ -96,9 +99,7 @@ pub fn container_intermediate_process( .context("failed to close unused init sender")?; // There is no point using the pid returned here, since the child will be // inside the pid namespace already. - intermediate_receiver - .wait_for_init_ready() - .context("failed to wait for the child")?; + // After the child (the container init process) becomes ready, we can signal // the parent (the main process) that we are ready. main_sender diff --git a/src/process/container_main_process.rs b/src/process/container_main_process.rs index b05f12ea..2cf54e9c 100644 --- a/src/process/container_main_process.rs +++ b/src/process/container_main_process.rs @@ -1,11 +1,16 @@ use crate::{ + container::ContainerProcessState, process::{args::ContainerArgs, channel, container_intermediate_process, fork}, rootless::Rootless, seccomp, utils, }; use anyhow::{Context, Result}; -use nix::unistd::Pid; -use oci_spec::runtime::LinuxSeccomp; +use nix::{ + sys::{socket, uio}, + unistd::{self, Pid}, +}; +use oci_spec::runtime; +use std::path::Path; pub fn container_main_process(container_args: &ContainerArgs) -> Result { // We use a set of channels to communicate between parent and child process. Each channel is uni-directional. @@ -44,13 +49,35 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result { intermediate_sender.mapping_written()?; } + // The intermediate process will send the init pid once it forks the init + // process. The intermediate process should exit after this point. + let init_pid = main_receiver.wait_for_intermediate_ready()?; + intermediate_sender .close() .context("failed to close unused sender")?; if let Some(linux) = container_args.spec.linux() { if let Some(seccomp) = linux.seccomp() { - sync_seccomp(seccomp, init_sender, main_receiver) + let seccomp_metadata = if let Some(metadata) = seccomp.listener_metadata() { + metadata.to_owned() + } else { + String::new() + }; + let state = ContainerProcessState { + oci_version: container_args.spec.version().to_string(), + // runc hardcode the `seccompFd` name for fds. + fds: vec![String::from("seccompFd")], + pid: init_pid.as_raw(), + metadata: seccomp_metadata, + state: container_args + .container + .as_ref() + .context("container state is required")? + .state + .clone(), + }; + sync_seccomp(seccomp, &state, init_sender, main_receiver) .context("failed to sync seccomp with init")?; } } @@ -59,27 +86,71 @@ pub fn container_main_process(container_args: &ContainerArgs) -> Result { .close() .context("failed to close unused init sender")?; - let init_pid = main_receiver.wait_for_intermediate_ready()?; + main_receiver + .wait_for_init_ready() + .context("failed to wait for init ready")?; + log::debug!("init pid is {:?}", init_pid); Ok(init_pid) } fn sync_seccomp( - seccomp: &LinuxSeccomp, + seccomp: &runtime::LinuxSeccomp, + state: &ContainerProcessState, init_sender: &mut channel::InitSender, main_receiver: &mut channel::MainReceiver, ) -> Result<()> { if seccomp::is_notify(seccomp) { log::debug!("main process waiting for sync seccomp"); - main_receiver.wait_for_seccomp_request()?; - // process seccomp notify + let seccomp_fd = main_receiver.wait_for_seccomp_request()?; + let listener_path = seccomp + .listener_path() + .as_ref() + .context("notify will require seccomp listener path to be set")?; + let encoded_state = + serde_json::to_vec(state).context("failed to encode container process state")?; + sync_seccomp_send_msg(listener_path, &encoded_state, seccomp_fd) + .context("failed to send msg to seccomp listener")?; init_sender.seccomp_notify_done()?; } Ok(()) } +fn sync_seccomp_send_msg(listener_path: &Path, msg: &[u8], fd: i32) -> Result<()> { + // The seccomp listener has specific instructions on how to transmit the + // information through seccomp listener. Therefore, we have to use + // libc/nix APIs instead of Rust std lib APIs to maintain flexibility. + let socket = socket::socket( + socket::AddressFamily::Unix, + socket::SockType::Stream, + socket::SockFlag::empty(), + None, + ) + .context("failed to create unix domain socket for seccomp listener")?; + let unix_addr = + socket::SockAddr::new_unix(listener_path).context("failed to create unix addr")?; + socket::connect(socket, &unix_addr).with_context(|| { + format!( + "failed to connect to seccomp notify listerner path: {:?}", + listener_path + ) + })?; + // We have to use sendmsg here because the spec requires us to send seccomp notify fds through + // SCM_RIGHTS message. + // Ref: https://man7.org/linux/man-pages/man3/sendmsg.3p.html + // Ref: https://man7.org/linux/man-pages/man3/cmsg.3.html + let iov = [uio::IoVec::from_slice(msg)]; + let fds = [fd]; + let cmsgs = socket::ControlMessage::ScmRights(&fds); + socket::sendmsg(socket, &iov, &[cmsgs], socket::MsgFlags::empty(), None) + .context("failed to write container state to seccomp listener")?; + let _ = unistd::close(socket); + + Ok(()) +} + fn setup_mapping(rootless: &Rootless, pid: Pid) -> Result<()> { log::debug!("write mapping for pid {:?}", pid); if !rootless.privileged {