2021-10-08 21:09:48 +02:00
|
|
|
use super::{Container, ContainerStatus};
|
2021-07-04 22:44:07 +02:00
|
|
|
use crate::{
|
2023-05-16 20:40:48 +02:00
|
|
|
error::{LibcontainerError, MissingSpecError},
|
2021-08-14 02:58:52 +02:00
|
|
|
hooks,
|
2021-08-27 20:55:03 +02:00
|
|
|
notify_socket::NotifyListener,
|
2022-10-07 08:29:17 +02:00
|
|
|
process::{
|
|
|
|
self,
|
|
|
|
args::{ContainerArgs, ContainerType},
|
2023-04-17 13:53:58 +02:00
|
|
|
intel_rdt::delete_resctrl_subdirectory,
|
2022-10-07 08:29:17 +02:00
|
|
|
},
|
2023-07-03 23:48:47 +02:00
|
|
|
syscall::syscall::SyscallType,
|
2023-08-12 15:58:25 +02:00
|
|
|
user_ns::UserNamespaceConfig,
|
2021-08-02 02:34:31 +02:00
|
|
|
utils,
|
2023-07-21 14:12:21 +02:00
|
|
|
workload::Executor,
|
2021-07-04 22:44:07 +02:00
|
|
|
};
|
2023-05-03 07:59:55 +02:00
|
|
|
use libcgroups::common::CgroupManager;
|
2022-06-29 14:22:46 +02:00
|
|
|
use nix::unistd::Pid;
|
2021-09-28 20:20:39 +02:00
|
|
|
use oci_spec::runtime::Spec;
|
2023-07-23 14:35:50 +02:00
|
|
|
use std::{fs, io::Write, os::unix::prelude::RawFd, path::PathBuf, rc::Rc};
|
2021-07-03 00:45:38 +02:00
|
|
|
|
2023-07-23 14:35:50 +02:00
|
|
|
pub(super) struct ContainerBuilderImpl {
|
2021-07-17 15:55:38 +02:00
|
|
|
/// Flag indicating if an init or a tenant container should be created
|
2022-10-07 08:29:17 +02:00
|
|
|
pub container_type: ContainerType,
|
2021-07-17 15:55:38 +02:00
|
|
|
/// Interface to operating system primitives
|
2023-07-03 23:48:47 +02:00
|
|
|
pub syscall: SyscallType,
|
2021-07-17 15:55:38 +02:00
|
|
|
/// Flag indicating if systemd should be used for cgroup management
|
2021-07-03 00:45:38 +02:00
|
|
|
pub use_systemd: bool,
|
2021-07-17 15:55:38 +02:00
|
|
|
/// Id of the container
|
2021-07-03 00:45:38 +02:00
|
|
|
pub container_id: String,
|
2023-06-08 06:49:17 +02:00
|
|
|
/// OCI compliant runtime spec
|
2023-07-23 14:35:50 +02:00
|
|
|
pub spec: Rc<Spec>,
|
2021-07-17 15:55:38 +02:00
|
|
|
/// Root filesystem of the container
|
2021-07-03 00:45:38 +02:00
|
|
|
pub rootfs: PathBuf,
|
2021-07-17 15:55:38 +02:00
|
|
|
/// File which will be used to communicate the pid of the
|
|
|
|
/// container process to the higher level runtime
|
2021-07-03 00:45:38 +02:00
|
|
|
pub pid_file: Option<PathBuf>,
|
2021-07-17 15:55:38 +02:00
|
|
|
/// Socket to communicate the file descriptor of the ptty
|
2021-08-09 08:05:31 +02:00
|
|
|
pub console_socket: Option<RawFd>,
|
2023-08-12 15:58:25 +02:00
|
|
|
/// Options for new user namespace
|
|
|
|
pub user_ns_config: Option<UserNamespaceConfig>,
|
2021-07-19 08:22:47 +02:00
|
|
|
/// Path to the Unix Domain Socket to communicate container start
|
|
|
|
pub notify_path: PathBuf,
|
2021-07-17 15:55:38 +02:00
|
|
|
/// Container state
|
2021-07-03 00:45:38 +02:00
|
|
|
pub container: Option<Container>,
|
2021-08-02 02:23:56 +02:00
|
|
|
/// File descriptos preserved/passed to the container init process.
|
|
|
|
pub preserve_fds: i32,
|
2022-10-20 15:16:22 +02:00
|
|
|
/// If the container is to be run in detached mode
|
|
|
|
pub detached: bool,
|
2023-03-11 02:32:03 +01:00
|
|
|
/// Default executes the specified execution of a generic command
|
2023-08-06 09:44:31 +02:00
|
|
|
pub executor: Box<dyn Executor>,
|
2021-07-03 00:45:38 +02:00
|
|
|
}
|
|
|
|
|
2023-07-23 14:35:50 +02:00
|
|
|
impl ContainerBuilderImpl {
|
2023-05-16 20:40:48 +02:00
|
|
|
pub(super) fn create(&mut self) -> Result<Pid, LibcontainerError> {
|
|
|
|
match self.run_container() {
|
2022-06-29 14:22:46 +02:00
|
|
|
Ok(pid) => Ok(pid),
|
|
|
|
Err(outer) => {
|
2023-04-14 17:02:56 +02:00
|
|
|
// Only the init container should be cleaned up in the case of
|
|
|
|
// an error.
|
|
|
|
if matches!(self.container_type, ContainerType::InitContainer) {
|
2023-05-16 20:40:48 +02:00
|
|
|
self.cleanup_container()?;
|
2022-06-29 14:22:46 +02:00
|
|
|
}
|
2023-04-14 17:02:56 +02:00
|
|
|
|
2022-06-29 14:22:46 +02:00
|
|
|
Err(outer)
|
2021-09-25 22:52:07 +02:00
|
|
|
}
|
|
|
|
}
|
2021-07-03 00:45:38 +02:00
|
|
|
}
|
|
|
|
|
2023-05-16 20:40:48 +02:00
|
|
|
fn run_container(&mut self) -> Result<Pid, LibcontainerError> {
|
|
|
|
let linux = self.spec.linux().as_ref().ok_or(MissingSpecError::Linux)?;
|
2024-04-27 14:49:58 +02:00
|
|
|
let cgroups_path = utils::get_cgroup_path(linux.cgroups_path(), &self.container_id);
|
2023-07-19 15:09:43 +02:00
|
|
|
let cgroup_config = libcgroups::common::CgroupConfig {
|
|
|
|
cgroup_path: cgroups_path,
|
2023-08-12 15:58:25 +02:00
|
|
|
systemd_cgroup: self.use_systemd || self.user_ns_config.is_some(),
|
2023-07-19 15:09:43 +02:00
|
|
|
container_name: self.container_id.to_owned(),
|
|
|
|
};
|
2023-05-16 20:40:48 +02:00
|
|
|
let process = self
|
|
|
|
.spec
|
|
|
|
.process()
|
|
|
|
.as_ref()
|
|
|
|
.ok_or(MissingSpecError::Process)?;
|
2021-07-19 07:06:27 +02:00
|
|
|
|
2022-10-07 08:29:17 +02:00
|
|
|
if matches!(self.container_type, ContainerType::InitContainer) {
|
2021-09-28 00:46:57 +02:00
|
|
|
if let Some(hooks) = self.spec.hooks() {
|
2021-09-27 03:08:56 +02:00
|
|
|
hooks::run_hooks(hooks.create_runtime().as_ref(), self.container.as_ref())?
|
2021-08-05 10:21:15 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-08-27 20:55:03 +02:00
|
|
|
// Need to create the notify socket before we pivot root, since the unix
|
|
|
|
// domain socket used here is outside of the rootfs of container. During
|
2021-09-05 17:41:02 +02:00
|
|
|
// exec, need to create the socket before we enter into existing mount
|
2023-07-19 15:09:43 +02:00
|
|
|
// namespace. We also need to create to socket before entering into the
|
|
|
|
// user namespace in the case that the path is located in paths only
|
|
|
|
// root can access.
|
|
|
|
let notify_listener = NotifyListener::new(&self.notify_path)?;
|
2021-08-27 20:55:03 +02:00
|
|
|
|
2021-08-31 07:45:45 +02:00
|
|
|
// If Out-of-memory score adjustment is set in specification. set the score
|
|
|
|
// value for the current process check
|
|
|
|
// https://dev.to/rrampage/surviving-the-linux-oom-killer-2ki9 for some more
|
|
|
|
// information.
|
|
|
|
//
|
|
|
|
// This has to be done before !dumpable because /proc/self/oom_score_adj
|
|
|
|
// is not writeable unless you're an privileged user (if !dumpable is
|
|
|
|
// set). All children inherit their parent's oom_score_adj value on
|
|
|
|
// fork(2) so this will always be propagated properly.
|
2021-09-27 03:08:56 +02:00
|
|
|
if let Some(oom_score_adj) = process.oom_score_adj() {
|
2023-05-12 13:47:05 +02:00
|
|
|
tracing::debug!("Set OOM score to {}", oom_score_adj);
|
2023-05-16 20:40:48 +02:00
|
|
|
let mut f = fs::File::create("/proc/self/oom_score_adj").map_err(|err| {
|
|
|
|
tracing::error!("failed to open /proc/self/oom_score_adj: {}", err);
|
|
|
|
LibcontainerError::OtherIO(err)
|
|
|
|
})?;
|
|
|
|
f.write_all(oom_score_adj.to_string().as_bytes())
|
|
|
|
.map_err(|err| {
|
|
|
|
tracing::error!("failed to write to /proc/self/oom_score_adj: {}", err);
|
|
|
|
LibcontainerError::OtherIO(err)
|
|
|
|
})?;
|
2021-08-31 07:45:45 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// Make the process non-dumpable, to avoid various race conditions that
|
|
|
|
// could cause processes in namespaces we're joining to access host
|
|
|
|
// resources (or potentially execute code).
|
|
|
|
//
|
|
|
|
// However, if the number of namespaces we are joining is 0, we are not
|
|
|
|
// going to be switching to a different security context. Thus setting
|
|
|
|
// ourselves to be non-dumpable only breaks things (like rootless
|
|
|
|
// containers), which is the recommendation from the kernel folks.
|
2021-09-27 03:08:56 +02:00
|
|
|
if linux.namespaces().is_some() {
|
2024-02-22 07:38:26 +01:00
|
|
|
prctl::set_dumpable(false).map_err(|e| {
|
|
|
|
LibcontainerError::Other(format!(
|
|
|
|
"error in setting dumpable to false : {}",
|
|
|
|
nix::errno::from_i32(e)
|
|
|
|
))
|
|
|
|
})?;
|
2021-08-31 07:45:45 +02:00
|
|
|
}
|
|
|
|
|
2022-07-10 03:54:32 +02:00
|
|
|
// This container_args will be passed to the container processes,
|
2021-07-30 03:52:45 +02:00
|
|
|
// therefore we will have to move all the variable by value. Since self
|
|
|
|
// is a shared reference, we have to clone these variables here.
|
2021-10-08 21:09:48 +02:00
|
|
|
let container_args = ContainerArgs {
|
2022-10-07 08:29:17 +02:00
|
|
|
container_type: self.container_type,
|
2021-09-18 20:32:15 +02:00
|
|
|
syscall: self.syscall,
|
2023-07-23 14:35:50 +02:00
|
|
|
spec: Rc::clone(&self.spec),
|
|
|
|
rootfs: self.rootfs.to_owned(),
|
2021-08-09 08:05:31 +02:00
|
|
|
console_socket: self.console_socket,
|
2023-07-19 15:09:43 +02:00
|
|
|
notify_listener,
|
2021-08-02 02:23:56 +02:00
|
|
|
preserve_fds: self.preserve_fds,
|
2023-07-23 14:35:50 +02:00
|
|
|
container: self.container.to_owned(),
|
2023-08-12 15:58:25 +02:00
|
|
|
user_ns_config: self.user_ns_config.to_owned(),
|
2023-07-19 15:09:43 +02:00
|
|
|
cgroup_config,
|
2022-10-20 15:16:22 +02:00
|
|
|
detached: self.detached,
|
2023-07-21 14:12:21 +02:00
|
|
|
executor: self.executor.clone(),
|
2021-07-30 03:52:45 +02:00
|
|
|
};
|
2021-10-08 09:58:54 +02:00
|
|
|
|
2023-04-17 13:53:58 +02:00
|
|
|
let (init_pid, need_to_clean_up_intel_rdt_dir) =
|
2023-05-16 20:40:48 +02:00
|
|
|
process::container_main_process::container_main_process(&container_args).map_err(
|
|
|
|
|err| {
|
|
|
|
tracing::error!(?err, "failed to run container process");
|
|
|
|
LibcontainerError::MainProcess(err)
|
|
|
|
},
|
|
|
|
)?;
|
2021-07-19 07:06:27 +02:00
|
|
|
|
|
|
|
// if file to write the pid to is specified, write pid of the child
|
|
|
|
if let Some(pid_file) = &self.pid_file {
|
2023-05-16 20:40:48 +02:00
|
|
|
fs::write(pid_file, format!("{init_pid}")).map_err(|err| {
|
|
|
|
tracing::error!("failed to write pid to file: {}", err);
|
|
|
|
LibcontainerError::OtherIO(err)
|
|
|
|
})?;
|
2021-07-19 07:06:27 +02:00
|
|
|
}
|
|
|
|
|
2021-09-23 23:05:35 +02:00
|
|
|
if let Some(container) = &mut self.container {
|
2021-07-19 07:06:27 +02:00
|
|
|
// update status and pid of the container process
|
|
|
|
container
|
2021-09-23 23:05:35 +02:00
|
|
|
.set_status(ContainerStatus::Created)
|
2021-07-19 07:06:27 +02:00
|
|
|
.set_creator(nix::unistd::geteuid().as_raw())
|
|
|
|
.set_pid(init_pid.as_raw())
|
2023-04-17 13:53:58 +02:00
|
|
|
.set_clean_up_intel_rdt_directory(need_to_clean_up_intel_rdt_dir)
|
2023-05-16 20:40:48 +02:00
|
|
|
.save()?;
|
2021-07-19 07:06:27 +02:00
|
|
|
}
|
|
|
|
|
2023-03-08 06:30:38 +01:00
|
|
|
Ok(init_pid)
|
2021-07-19 07:06:27 +02:00
|
|
|
}
|
2021-09-25 22:52:07 +02:00
|
|
|
|
2023-05-16 20:40:48 +02:00
|
|
|
fn cleanup_container(&self) -> Result<(), LibcontainerError> {
|
|
|
|
let linux = self.spec.linux().as_ref().ok_or(MissingSpecError::Linux)?;
|
2024-04-27 14:49:58 +02:00
|
|
|
let cgroups_path = utils::get_cgroup_path(linux.cgroups_path(), &self.container_id);
|
2023-07-19 15:09:43 +02:00
|
|
|
let cmanager =
|
|
|
|
libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig {
|
|
|
|
cgroup_path: cgroups_path,
|
2023-08-12 15:58:25 +02:00
|
|
|
systemd_cgroup: self.use_systemd || self.user_ns_config.is_some(),
|
2023-07-19 15:09:43 +02:00
|
|
|
container_name: self.container_id.to_string(),
|
|
|
|
})?;
|
2021-09-25 22:52:07 +02:00
|
|
|
|
|
|
|
let mut errors = Vec::new();
|
2023-04-17 13:53:58 +02:00
|
|
|
|
2023-05-16 20:40:48 +02:00
|
|
|
if let Err(e) = cmanager.remove() {
|
|
|
|
tracing::error!(error = ?e, "failed to remove cgroup manager");
|
2021-09-25 22:52:07 +02:00
|
|
|
errors.push(e.to_string());
|
|
|
|
}
|
|
|
|
|
|
|
|
if let Some(container) = &self.container {
|
2023-04-17 13:53:58 +02:00
|
|
|
if let Some(true) = container.clean_up_intel_rdt_subdirectory() {
|
2023-05-16 20:40:48 +02:00
|
|
|
if let Err(e) = delete_resctrl_subdirectory(container.id()) {
|
|
|
|
tracing::error!(id = ?container.id(), error = ?e, "failed to delete resctrl subdirectory");
|
2023-04-17 13:53:58 +02:00
|
|
|
errors.push(e.to_string());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-09-25 22:52:07 +02:00
|
|
|
if container.root.exists() {
|
2023-05-16 20:40:48 +02:00
|
|
|
if let Err(e) = fs::remove_dir_all(&container.root) {
|
|
|
|
tracing::error!(container_root = ?container.root, error = ?e, "failed to delete container root");
|
2021-09-25 22:52:07 +02:00
|
|
|
errors.push(e.to_string());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if !errors.is_empty() {
|
2023-05-16 20:40:48 +02:00
|
|
|
return Err(LibcontainerError::Other(format!(
|
|
|
|
"failed to cleanup container: {}",
|
|
|
|
errors.join(";")
|
|
|
|
)));
|
2021-09-25 22:52:07 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|
2021-07-19 07:06:27 +02:00
|
|
|
}
|