1
0
Fork 0
mirror of https://github.com/containers/youki synced 2024-05-06 07:36:17 +02:00
youki/src/rootfs.rs
2021-09-27 15:46:57 -07:00

488 lines
16 KiB
Rust

//! During kernel initialization, a minimal replica of the ramfs filesystem, called rootfs, is loaded.
//! Most systems mount another filesystem over it.
use crate::utils::PathBufExt;
use anyhow::{anyhow, bail, Context, Result};
use nix::errno::Errno;
use nix::fcntl::{open, OFlag};
use nix::mount::mount as nix_mount;
use nix::mount::MsFlags;
use nix::sys::stat::{mknod, umask};
use nix::sys::stat::{Mode, SFlag};
use nix::unistd::{chown, close};
use nix::unistd::{Gid, Uid};
use nix::NixPath;
use oci_spec::runtime::{Linux, LinuxDevice, LinuxDeviceBuilder, LinuxDeviceType, Mount, Spec};
use procfs::process::{MountInfo, MountOptFields, Process};
use std::fs::OpenOptions;
use std::fs::{canonicalize, create_dir_all, remove_file};
use std::os::unix::fs::symlink;
use std::path::{Path, PathBuf};
pub fn prepare_rootfs(spec: &Spec, rootfs: &Path, bind_devices: bool) -> Result<()> {
log::debug!("Prepare rootfs: {:?}", rootfs);
let mut flags = MsFlags::MS_REC;
let linux = spec.linux().as_ref().context("no linux in spec")?;
match linux.rootfs_propagation().as_deref() {
Some("shared") => flags |= MsFlags::MS_SHARED,
Some("private") => flags |= MsFlags::MS_PRIVATE,
Some("slave" | "unbindable") | None => flags |= MsFlags::MS_SLAVE,
Some(uknown) => bail!("unknown rootfs_propagation: {}", uknown),
}
nix_mount(None::<&str>, "/", None::<&str>, flags, None::<&str>)
.context("Failed to mount rootfs")?;
make_parent_mount_private(rootfs).context("Failed to change parent mount of rootfs private")?;
log::debug!("mount root fs {:?}", rootfs);
nix_mount::<Path, Path, str, str>(
Some(rootfs),
rootfs,
None::<&str>,
MsFlags::MS_BIND | MsFlags::MS_REC,
None::<&str>,
)?;
if let Some(mounts) = spec.mounts() {
for mount in mounts {
log::debug!("Mount... {:?}", mount);
let (flags, data) = parse_mount(mount);
let mount_label = linux.mount_label().as_ref();
if *mount.typ() == Some("cgroup".to_string()) {
// skip
log::warn!("A feature of cgroup is unimplemented.");
} else if *mount.destination() == PathBuf::from("/dev") {
mount_to_container(
mount,
rootfs,
flags & !MsFlags::MS_RDONLY,
&data,
mount_label,
)
.with_context(|| format!("Failed to mount /dev: {:?}", mount))?;
} else {
mount_to_container(mount, rootfs, flags, &data, mount_label)
.with_context(|| format!("Failed to mount: {:?}", mount))?;
}
}
}
setup_default_symlinks(rootfs).context("Failed to setup default symlinks")?;
if let Some(added_devices) = linux.devices() {
create_devices(
rootfs,
default_devices().iter().chain(added_devices),
bind_devices,
)
} else {
create_devices(rootfs, &default_devices(), bind_devices)
}?;
setup_ptmx(rootfs)?;
Ok(())
}
fn setup_ptmx(rootfs: &Path) -> Result<()> {
if let Err(e) = remove_file(rootfs.join("dev/ptmx")) {
if e.kind() != ::std::io::ErrorKind::NotFound {
bail!("could not delete /dev/ptmx")
}
}
symlink("pts/ptmx", rootfs.join("dev/ptmx")).context("failed to symlink ptmx")?;
Ok(())
}
/// Creates the default symlinks below `<rootfs>/dev`.
///
/// `dev/kcore` is linked to `/proc/kcore` only when `/proc/kcore` exists;
/// `dev/fd`, `dev/stdin`, `dev/stdout` and `dev/stderr` are always linked to
/// their `/proc/self/fd` counterparts.
///
/// # Errors
/// Fails on the first symlink that cannot be created.
fn setup_default_symlinks(rootfs: &Path) -> Result<()> {
    // (link target, link location relative to rootfs)
    const FD_LINKS: [(&str, &str); 4] = [
        ("/proc/self/fd", "dev/fd"),
        ("/proc/self/fd/0", "dev/stdin"),
        ("/proc/self/fd/1", "dev/stdout"),
        ("/proc/self/fd/2", "dev/stderr"),
    ];

    let kcore = Path::new("/proc/kcore");
    if kcore.exists() {
        symlink(kcore, rootfs.join("dev/kcore")).context("Failed to symlink kcore")?;
    }

    FD_LINKS.iter().try_for_each(|&(target, link)| {
        symlink(target, rootfs.join(link)).context("Fail to symlink defaults")
    })
}
/// Returns the devices that are created in every container:
/// `/dev/null`, `/dev/zero`, `/dev/full`, `/dev/tty`, `/dev/urandom` and `/dev/random`.
///
/// All of them are character devices with file mode `0o666`.
pub fn default_devices() -> Vec<LinuxDevice> {
    // Read/write for owner, group and others. With the owner bits cleared
    // (0o066) the owning user would be denied access unless it can bypass
    // permission checks.
    const DEFAULT_FILE_MODE: u32 = 0o666;
    // (path, major, minor)
    const CHAR_DEVICES: [(&str, i64, i64); 6] = [
        ("/dev/null", 1, 3),
        ("/dev/zero", 1, 5),
        ("/dev/full", 1, 7),
        ("/dev/tty", 5, 0),
        ("/dev/urandom", 1, 9),
        ("/dev/random", 1, 8),
    ];

    CHAR_DEVICES
        .iter()
        .map(|&(path, major, minor)| {
            LinuxDeviceBuilder::default()
                .path(PathBuf::from(path))
                .typ(LinuxDeviceType::C)
                .major(major)
                .minor(minor)
                .file_mode(DEFAULT_FILE_MODE)
                .build()
                .expect("default device definitions are statically valid")
        })
        .collect()
}
fn create_devices<'a, I>(rootfs: &Path, devices: I, bind: bool) -> Result<()>
where
I: IntoIterator<Item = &'a LinuxDevice>,
{
let old_mode = umask(Mode::from_bits_truncate(0o000));
if bind {
let _ = devices
.into_iter()
.map(|dev| {
if !dev.path().starts_with("/dev") {
panic!("{} is not a valid device path", dev.path().display());
}
bind_dev(rootfs, dev)
})
.collect::<Result<Vec<_>>>()?;
} else {
devices
.into_iter()
.map(|dev| {
if !dev.path().starts_with("/dev") {
panic!("{} is not a valid device path", dev.path().display());
}
mknod_dev(rootfs, dev)
})
.collect::<Result<Vec<_>>>()?;
}
umask(old_mode);
Ok(())
}
/// Bind mounts the device `dev` to its location inside `rootfs`.
///
/// The mount target is opened with `O_CREAT` (mode `0o644`) and closed again
/// before the device path is bind mounted on top of it.
///
/// # Errors
/// Fails if the in-container path cannot be derived, the target cannot be
/// opened or closed, or the mount fails.
fn bind_dev(rootfs: &Path, dev: &LinuxDevice) -> Result<()> {
    let target = rootfs.join(dev.path().as_in_container()?);

    let create_flags = OFlag::O_RDWR | OFlag::O_CREAT;
    let target_fd = open(&target, create_flags, Mode::from_bits_truncate(0o644))?;
    close(target_fd)?;

    nix_mount(
        Some(dev.path()),
        &target,
        Some("bind"),
        MsFlags::MS_BIND,
        None::<&str>,
    )?;
    Ok(())
}
fn to_sflag(dev_type: LinuxDeviceType) -> SFlag {
match dev_type {
LinuxDeviceType::A => SFlag::S_IFBLK | SFlag::S_IFCHR | SFlag::S_IFIFO,
LinuxDeviceType::B => SFlag::S_IFBLK,
LinuxDeviceType::C | LinuxDeviceType::U => SFlag::S_IFCHR,
LinuxDeviceType::P => SFlag::S_IFIFO,
}
}
/// Creates the device node for `dev` inside `rootfs` with `mknod` and then
/// changes its owner to the uid/gid given in the spec (if any).
///
/// A missing file mode is treated as `0`.
///
/// # Errors
/// Fails if the in-container path cannot be derived, or if `mknod` or `chown` fails.
fn mknod_dev(rootfs: &Path, dev: &LinuxDevice) -> Result<()> {
    /// Packs a major/minor pair into a single device number.
    fn makedev(major: i64, minor: i64) -> u64 {
        let minor_low = minor & 0xff;
        let major_low = (major & 0xfff) << 8;
        let minor_high = (minor & !0xff) << 12;
        let major_high = (major & !0xfff) << 32;
        (minor_low | major_low | minor_high | major_high) as u64
    }

    let node_path = rootfs.join(dev.path().as_in_container()?);

    let kind = to_sflag(dev.typ());
    let permissions = Mode::from_bits_truncate(dev.file_mode().unwrap_or(0));
    let device_number = makedev(dev.major(), dev.minor());
    mknod(&node_path, kind, permissions, device_number)?;

    let owner = dev.uid().map(Uid::from_raw);
    let group = dev.gid().map(Gid::from_raw);
    chown(&node_path, owner, group)?;

    Ok(())
}
fn mount_to_container(
m: &Mount,
rootfs: &Path,
flags: MsFlags,
data: &str,
label: Option<&String>,
) -> Result<()> {
let typ = m.typ().as_deref();
let d = if let Some(l) = label {
if typ != Some("proc") && typ != Some("sysfs") {
if data.is_empty() {
format!("context=\"{}\"", l)
} else {
format!("{},context=\"{}\"", data, l)
}
} else {
data.to_string()
}
} else {
data.to_string()
};
let dest_for_host = format!(
"{}{}",
rootfs.to_string_lossy().into_owned(),
m.destination().display()
);
let dest = Path::new(&dest_for_host);
let source = m.source().as_ref().context("no source in mount spec")?;
let src = if typ == Some("bind") {
let src = canonicalize(source)?;
let dir = if src.is_file() {
Path::new(&dest).parent().unwrap()
} else {
Path::new(&dest)
};
create_dir_all(&dir)
.with_context(|| format!("Failed to create dir for bind mount: {:?}", dir))?;
if src.is_file() {
OpenOptions::new()
.create(true)
.write(true)
.open(&dest)
.unwrap();
}
src
} else {
create_dir_all(&dest).with_context(|| format!("Failed to create device: {:?}", dest))?;
PathBuf::from(source)
};
if let Err(errno) = nix_mount(Some(&*src), dest, typ, flags, Some(&*d)) {
if !matches!(errno, Errno::EINVAL) {
bail!("mount of {:?} failed", m.destination());
}
nix_mount(Some(&*src), dest, typ, flags, Some(data))?;
}
if flags.contains(MsFlags::MS_BIND)
&& flags.intersects(
!(MsFlags::MS_REC
| MsFlags::MS_REMOUNT
| MsFlags::MS_BIND
| MsFlags::MS_PRIVATE
| MsFlags::MS_SHARED
| MsFlags::MS_SLAVE),
)
{
nix_mount(
Some(dest),
dest,
None::<&str>,
flags | MsFlags::MS_REMOUNT,
None::<&str>,
)?;
}
Ok(())
}
/// Splits the options of the mount `m` into mount flags and mount data.
///
/// Known option names are translated to `MsFlags`: an option either sets its
/// flag or clears it (for example `ro` sets `MS_RDONLY`, `rw` clears it).
/// Options are applied in the order they are listed. Every unknown option is
/// kept verbatim, and the unknown options are joined with `,` to form the
/// returned data string.
fn parse_mount(m: &Mount) -> (MsFlags, String) {
    let mut flags = MsFlags::empty();
    let mut data = Vec::new();

    if let Some(options) = m.options() {
        for option in options {
            // (clear, flag): `clear == true` removes the flag, `false` adds it.
            let parsed = match option.as_str() {
                "defaults" => Some((false, MsFlags::empty())),
                "ro" => Some((false, MsFlags::MS_RDONLY)),
                "rw" => Some((true, MsFlags::MS_RDONLY)),
                "suid" => Some((true, MsFlags::MS_NOSUID)),
                "nosuid" => Some((false, MsFlags::MS_NOSUID)),
                "dev" => Some((true, MsFlags::MS_NODEV)),
                "nodev" => Some((false, MsFlags::MS_NODEV)),
                "exec" => Some((true, MsFlags::MS_NOEXEC)),
                "noexec" => Some((false, MsFlags::MS_NOEXEC)),
                "sync" => Some((false, MsFlags::MS_SYNCHRONOUS)),
                "async" => Some((true, MsFlags::MS_SYNCHRONOUS)),
                "dirsync" => Some((false, MsFlags::MS_DIRSYNC)),
                "remount" => Some((false, MsFlags::MS_REMOUNT)),
                "mand" => Some((false, MsFlags::MS_MANDLOCK)),
                "nomand" => Some((true, MsFlags::MS_MANDLOCK)),
                "atime" => Some((true, MsFlags::MS_NOATIME)),
                "noatime" => Some((false, MsFlags::MS_NOATIME)),
                "diratime" => Some((true, MsFlags::MS_NODIRATIME)),
                "nodiratime" => Some((false, MsFlags::MS_NODIRATIME)),
                "bind" => Some((false, MsFlags::MS_BIND)),
                "rbind" => Some((false, MsFlags::MS_BIND | MsFlags::MS_REC)),
                "unbindable" => Some((false, MsFlags::MS_UNBINDABLE)),
                "runbindable" => Some((false, MsFlags::MS_UNBINDABLE | MsFlags::MS_REC)),
                // NOTE(review): the propagation options below are mapped to
                // "clear", so on their own they never set a flag - confirm
                // whether that is intentional before changing it.
                "private" => Some((true, MsFlags::MS_PRIVATE)),
                "rprivate" => Some((true, MsFlags::MS_PRIVATE | MsFlags::MS_REC)),
                "shared" => Some((true, MsFlags::MS_SHARED)),
                "rshared" => Some((true, MsFlags::MS_SHARED | MsFlags::MS_REC)),
                "slave" => Some((true, MsFlags::MS_SLAVE)),
                "rslave" => Some((true, MsFlags::MS_SLAVE | MsFlags::MS_REC)),
                // The positive forms set the flag; only the `no*` forms clear it.
                "relatime" => Some((false, MsFlags::MS_RELATIME)),
                "norelatime" => Some((true, MsFlags::MS_RELATIME)),
                "strictatime" => Some((false, MsFlags::MS_STRICTATIME)),
                "nostrictatime" => Some((true, MsFlags::MS_STRICTATIME)),
                _ => None,
            };

            match parsed {
                Some((true, flag)) => flags &= !flag,
                Some((false, flag)) => flags |= flag,
                None => data.push(option.as_str()),
            }
        }
    }

    (flags, data.join(","))
}
/// Finds the parent mount of `rootfs` among `mount_infos`.
///
/// The parent is the entry with the longest mount point that is a prefix of
/// `rootfs`.
///
/// # Errors
/// Fails if no mount point in `mount_infos` is a prefix of `rootfs`.
fn find_parent_mount<'a>(rootfs: &Path, mount_infos: &'a [MountInfo]) -> Result<&'a MountInfo> {
    mount_infos
        .iter()
        .filter(|info| rootfs.starts_with(&info.mount_point))
        .max_by_key(|info| info.mount_point.len())
        .ok_or_else(|| anyhow!("couldn't find parent mount of {}", rootfs.display()))
}
/// Makes the parent mount of `rootfs` private if it is currently shared,
/// which is required by pivot_root.
/// This also ensures that the following bind mount does not propagate into other namespaces.
///
/// # Errors
/// Fails if the mount information of the current process cannot be read, no
/// parent mount is found, or the mount call fails.
fn make_parent_mount_private(rootfs: &Path) -> Result<()> {
    let mount_infos = Process::myself()?.mountinfo()?;
    let parent = find_parent_mount(rootfs, &mount_infos)?;

    // Nothing to do unless the parent mount has 'shared' propagation.
    let is_shared = parent
        .opt_fields
        .iter()
        .any(|field| matches!(field, MountOptFields::Shared(_)));
    if !is_shared {
        return Ok(());
    }

    nix_mount(
        None::<&str>,
        &parent.mount_point,
        None::<&str>,
        MsFlags::MS_PRIVATE,
        None::<&str>,
    )?;
    Ok(())
}
/// Changes the propagation type of `/` as requested by the spec.
///
/// Only `shared` and `unbindable` trigger a mount call; every other value
/// (including an absent one) leaves the root mount untouched.
///
/// # Errors
/// Fails if the mount call fails.
pub fn adjust_root_mount_propagation(linux: &Linux) -> Result<()> {
    let flags = match linux.rootfs_propagation().as_deref() {
        Some("shared") => MsFlags::MS_SHARED,
        Some("unbindable") => MsFlags::MS_UNBINDABLE,
        _ => return Ok(()),
    };

    log::debug!("make root mount {:?}", flags);
    nix_mount(None::<&str>, "/", None::<&str>, flags, None::<&str>)?;
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::find_parent_mount;
    use anyhow::{Context, Result};
    use procfs::process::MountInfo;
    use std::path::{Path, PathBuf};

    /// The root mount is the only mount point that is a prefix of the rootfs
    /// path, so it must be reported as the parent.
    #[test]
    fn test_find_parent_mount() -> Result<()> {
        let root_mount = MountInfo {
            mnt_id: 11,
            pid: 10,
            majmin: String::new(),
            root: String::from("/"),
            mount_point: PathBuf::from("/"),
            mount_options: Default::default(),
            opt_fields: Vec::new(),
            fs_type: String::from("ext4"),
            mount_source: Some(String::from("/dev/sda1")),
            super_options: Default::default(),
        };
        let proc_mount = MountInfo {
            mnt_id: 12,
            pid: 11,
            majmin: String::new(),
            root: String::from("/"),
            mount_point: PathBuf::from("/proc"),
            mount_options: Default::default(),
            opt_fields: Vec::new(),
            fs_type: String::from("proc"),
            mount_source: Some(String::from("proc")),
            super_options: Default::default(),
        };
        let mount_infos = [root_mount, proc_mount];

        let parent = find_parent_mount(Path::new("/path/to/rootfs"), &mount_infos)
            .context("Failed to get parent mount")?;

        assert_eq!(parent.mnt_id, 11);
        Ok(())
    }

    /// Without any mount information there is no parent mount to find.
    #[test]
    fn test_find_parent_mount_with_empty_mount_infos() {
        let mount_infos: Vec<MountInfo> = Vec::new();

        let outcome = find_parent_mount(Path::new("/path/to/rootfs"), &mount_infos);

        assert!(outcome.is_err());
    }
}