From 96d13fb5b19caead97a4545fac6e7047c3bb3b51 Mon Sep 17 00:00:00 2001 From: Furisto <24721048+Furisto@users.noreply.github.com> Date: Wed, 6 Oct 2021 23:09:02 +0200 Subject: [PATCH] Emulate cgroup namespaces with bind mounts --- src/rootfs/mount.rs | 133 +++++++++++++++++++++++++++++++------------- src/utils.rs | 13 +++-- 2 files changed, 101 insertions(+), 45 deletions(-) diff --git a/src/rootfs/mount.rs b/src/rootfs/mount.rs index b38edabf..a684f135 100644 --- a/src/rootfs/mount.rs +++ b/src/rootfs/mount.rs @@ -5,11 +5,11 @@ use super::{ use crate::syscall::{syscall::create_syscall, Syscall}; use crate::utils::PathBufExt; use anyhow::{bail, Context, Result}; -use cgroups::common::CgroupSetup::{Hybrid, Legacy, Unified}; +use cgroups::common::{CgroupSetup::{Hybrid, Legacy, Unified}, DEFAULT_CGROUP_ROOT}; use nix::{errno::Errno, mount::MsFlags}; use oci_spec::runtime::{Mount as SpecMount, MountBuilder as SpecMountBuilder}; use procfs::process::{MountOptFields, Process}; -use std::fs::{canonicalize, create_dir_all, OpenOptions}; +use std::{collections::HashMap, fs::{canonicalize, create_dir_all, OpenOptions}}; use std::path::{Path, PathBuf}; #[derive(Debug)] @@ -73,12 +73,12 @@ impl Mount { Ok(()) } - fn mount_cgroup_v1(&self, mount: &SpecMount, options: &MountOptions) -> Result<()> { + fn mount_cgroup_v1(&self, cgroup_mount: &SpecMount, options: &MountOptions) -> Result<()> { // create tmpfs into which the cgroup subsystems will be mounted let tmpfs = SpecMountBuilder::default() .source("tmpfs") .typ("tmpfs") - .destination(mount.destination()) + .destination(cgroup_mount.destination()) .options( ["noexec", "nosuid", "nodev", "mode=755"] .iter() @@ -87,73 +87,128 @@ impl Mount { ) .build() .context("failed to build tmpfs for cgroup")?; - - self.setup_mount(&tmpfs, options) - .context("failed to mount tmpfs for cgroup")?; - + self.setup_mount(&tmpfs, options).context("failed to mount tmpfs for cgroup")?; + // get all cgroup mounts on the host system - let mount_points: Vec = cgroups::v1::util::list_subsystem_mount_points() + let host_mounts: Vec = cgroups::v1::util::list_subsystem_mount_points() .context("failed to get subsystem mount points")? .into_iter() - .filter(|p| p.as_path().starts_with("/sys/fs")) + .filter(|p| p.as_path().starts_with(DEFAULT_CGROUP_ROOT)) .collect(); - log::debug!("{:?}", mount_points); - - // setup cgroup mounts for container + let cgroup_root = options .root - .join_safely(mount.destination()) - .context("could not join rootfs with cgroup destination")?; - for mount_point in mount_points { - if let Some(subsystem_name) = mount_point.file_name().and_then(|n| n.to_str()) { - let cgroup_mount = SpecMountBuilder::default() - .source("cgroup") - .typ("cgroup") - .destination(mount.destination().join(subsystem_name)) - .options( - ["noexec", "nosuid", "nodev"] - .iter() - .map(|o| o.to_string()) - .collect::>(), - ) - .build() - .with_context(|| format!("failed to build {}", subsystem_name))?; + .join_safely(cgroup_mount.destination()) + .context("could not join rootfs path with cgroup mount destination")?; + let symlink = Symlink::new(); + + // setup cgroup mounts for container + for host_mount in &host_mounts { + if let Some(subsystem_name) = host_mount.file_name().and_then(|n| n.to_str()) { if subsystem_name == "systemd" { continue; } - + if options.cgroup_ns { - self.setup_namespaced_hierarchy(&cgroup_mount, options, subsystem_name)?; - Symlink::new().setup_comount_symlinks(&cgroup_root, subsystem_name)?; + self.setup_namespaced_subsystem(cgroup_mount, options, subsystem_name)?; } else { - log::warn!("cgroup mounts are currently only suported with cgroup namespaces") + self.setup_emulated_subsystem(cgroup_mount, options, host_mount, subsystem_name)?; } + + symlink.setup_comount_symlinks(&cgroup_root, subsystem_name)?; } else { - log::warn!("could not get subsystem name from {:?}", mount_point); + log::warn!("could not get subsystem name from {:?}", host_mount); } } - + Ok(()) } // On some distros cgroup subsystems are comounted e.g. cpu,cpuacct or net_cls,net_prio. These systems // have to be comounted in the container as well as the kernel will reject trying to mount them separately. - fn setup_namespaced_hierarchy( - &self, + fn setup_namespaced_subsystem(&self, cgroup_mount: &SpecMount, options: &MountOptions, subsystem_name: &str, ) -> Result<()> { + let subsystem_mount = SpecMountBuilder::default() + .source("cgroup") + .typ("cgroup") + .destination(cgroup_mount.destination().join(subsystem_name)) + .options( + ["noexec", "nosuid", "nodev"] + .iter() + .map(|o| o.to_string()) + .collect::>(), + ) + .build() + .with_context(|| format!("failed to build {}", subsystem_name))?; + log::debug!("Mounting cgroup subsystem: {:?}", subsystem_name); self.mount_to_container( - cgroup_mount, + &subsystem_mount, options.root, MsFlags::MS_NOEXEC | MsFlags::MS_NOSUID | MsFlags::MS_NODEV, subsystem_name, options.label, ) - .with_context(|| format!("failed to mount {:?}", cgroup_mount)) + .with_context(|| format!("failed to mount {:?}", subsystem_mount)) + } + + fn setup_emulated_subsystem( + &self, + mount: &SpecMount, + options: &MountOptions, + host_mount: &Path, + subsystem_name: &str, + ) -> Result<()> { + let process_cgroups: HashMap = Process::myself()? + .cgroups() + .context("failed to get process cgroups")? + .into_iter() + .map(|c| (c.controllers.join(","), c.pathname)) + .collect(); + + log::debug!("{:?}", process_cgroups); + if let Some(proc_path) = process_cgroups.get(subsystem_name) { + let emulated = SpecMountBuilder::default() + .source( + host_mount + .join_safely(proc_path.as_str()) + .with_context(|| { + format!( + "failed to join mount source for {} subsystem", + subsystem_name + ) + })?, + ) + .destination( + mount + .destination() + .join_safely(subsystem_name) + .with_context(|| { + format!( + "failed to join mount destination for {} subsystem", + subsystem_name + ) + })?, + ) + .typ("bind") + .options( + ["rw", "rbind"] + .iter() + .map(|o| o.to_string()) + .collect::>(), + ) + .build()?; + log::debug!("Mounting emulated cgroup subsystem: {:?}", emulated); + + self.setup_mount(&emulated, options) + .with_context(|| format!("failed to mount {} cgroup hierarchy", subsystem_name))?; + } + + Ok(()) } fn mount_cgroup_v2(&self, _: &SpecMount, _: &MountOptions, _: MsFlags, _: &str) -> Result<()> { diff --git a/src/utils.rs b/src/utils.rs index 89eef7d9..d715d5d4 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -18,7 +18,7 @@ use std::path::{Path, PathBuf}; pub trait PathBufExt { fn as_in_container(&self) -> Result; fn join_absolute_path(&self, p: &Path) -> Result; - fn join_safely(&self, p: &Path) -> Result; + fn join_safely>(&self, p: P) -> Result; } impl PathBufExt for Path { @@ -41,14 +41,15 @@ impl PathBufExt for Path { Ok(PathBuf::from(format!("{}{}", self.display(), p.display()))) } - fn join_safely(&self, p: &Path) -> Result { - if p.is_relative() { - return Ok(self.join(p)); + fn join_safely>(&self, path: P) -> Result { + let path = path.as_ref(); + if path.is_relative() { + return Ok(self.join(path)); } - let stripped = p + let stripped = path .strip_prefix("/") - .with_context(|| format!("failed to strip prefix from {}", p.display()))?; + .with_context(|| format!("failed to strip prefix from {}", path.display()))?; Ok(self.join(stripped)) } }