1
0
Fork 0
mirror of https://github.com/containers/youki synced 2024-05-10 01:26:14 +02:00

Implement the seccomp profile

This commit is contained in:
yihuaf 2021-09-14 08:54:35 +02:00
parent c0a344e863
commit 052ba25769
9 changed files with 1407 additions and 14 deletions

View File

@ -43,7 +43,7 @@ jobs:
working-directory: ./cgroups
- run: rustup component add rustfmt clippy
- run: sudo apt-get -y update
- run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev
- run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev libseccomp-dev
- name: Check formatting
run: cargo fmt --all -- --check
working-directory: ${{matrix.dirs}}
@ -68,7 +68,7 @@ jobs:
with:
working-directory: ./cgroups
- run: sudo apt-get -y update
- run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev
- run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev libseccomp-dev
- name: Run tests
run: cargo test --all --all-features --no-fail-fast
coverage:
@ -98,7 +98,7 @@ jobs:
- name: Update System Libraries
run: sudo apt-get -y update
- name: Install System Libraries
run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev
run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev libseccomp-dev
- name: Run Test Coverage for youki
run: |
cargo llvm-cov clean --workspace
@ -143,7 +143,7 @@ jobs:
with:
working-directory: ./cgroups
- run: sudo apt-get -y update
- run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev
- run: sudo apt-get install -y pkg-config libsystemd-dev libdbus-glib-1-dev libelf-dev libseccomp-dev
- name: Build
run: ./build.sh --release
- uses: actions/setup-go@v2

10
Cargo.lock generated
View File

@ -890,6 +890,15 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "seccomp-sys"
version = "0.1.3"
source = "git+https://github.com/polachok/seccomp-sys.git?rev=9d89b10f9faa19e8f4e952663697ec126f2e2121#9d89b10f9faa19e8f4e952663697ec126f2e2121"
dependencies = [
"libc",
"pkg-config",
]
[[package]]
name = "serde"
version = "1.0.130"
@ -1151,6 +1160,7 @@ dependencies = [
"prctl",
"procfs",
"quickcheck",
"seccomp-sys",
"serde",
"serde_json",
"serial_test",

View File

@ -41,6 +41,7 @@ dbus = "0.9.2"
tabwriter = "1"
fastrand = "1.4.1"
crossbeam-channel = "0.5"
seccomp-sys = { git = "https://github.com/polachok/seccomp-sys.git", rev = "9d89b10f9faa19e8f4e952663697ec126f2e2121"}
[dev-dependencies]
oci-spec = { git = "https://github.com/utam0k/oci-spec-rs/", tag = "v0.4.0-with-bugfix", features = ["proptests"] }

View File

@ -75,7 +75,8 @@ $ sudo apt-get install \
libsystemd-dev \
libdbus-glib-1-dev \
build-essential \
libelf-dev
libelf-dev \
libseccomp-dev
```
### Fedora, Centos, RHEL and related distributions
@ -86,6 +87,7 @@ $ sudo dnf install \
systemd-devel \
dbus-devel \
elfutils-libelf-devel \
libseccomp-devel
```
## Build

View File

@ -9,6 +9,7 @@ pub mod notify_socket;
pub mod process;
pub mod rootfs;
pub mod rootless;
pub mod seccomp;
pub mod signal;
pub mod syscall;
pub mod tty;

View File

@ -1,3 +1,8 @@
use super::args::ContainerArgs;
use crate::{
capabilities, hooks, namespaces::Namespaces, process::channel, rootfs, rootless::Rootless,
seccomp, syscall::Syscall, tty, utils,
};
use anyhow::{bail, Context, Result};
use nix::mount::mount as nix_mount;
use nix::mount::MsFlags;
@ -9,17 +14,12 @@ use nix::{
};
use oci_spec::runtime::{LinuxNamespaceType, User};
use std::collections::HashMap;
use std::{env, os::unix::io::AsRawFd};
use std::{fs, path::Path, path::PathBuf};
use crate::rootless::Rootless;
use crate::{
capabilities, hooks, namespaces::Namespaces, process::channel, rootfs, syscall::Syscall, tty,
utils,
use std::{
env, fs,
os::unix::io::AsRawFd,
path::{Path, PathBuf},
};
use super::args::ContainerArgs;
// Make sure a given path is on procfs. This is to avoid the security risk that
// /proc path is mounted over. Ref: CVE-2019-16884
fn ensure_procfs(path: &Path) -> Result<()> {
@ -377,6 +377,10 @@ pub fn container_init(
}
}
// Initialize seccomp profile right before we are ready to execute the
// payload. The notify socket will still need network related syscalls.
seccomp::initialize_seccomp(linux.seccomp.as_ref()).context("Failed to execute seccomp")?;
if let Some(args) = proc.args.as_ref() {
utils::do_exec(&args[0], args)?;
} else {

View File

@ -0,0 +1,972 @@
{
"ociVersion": "1.0.1-dev",
"process": {
"terminal": false,
"user": {
"uid": 0,
"gid": 0
},
"args": [
"helloworld"
],
"env": [
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"TERM=xterm"
],
"cwd": "/",
"capabilities": {
"bounding": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"effective": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"inheritable": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"permitted": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"ambient": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
]
},
"rlimits": [
{
"type": "RLIMIT_NOFILE",
"hard": 1024,
"soft": 1024
}
],
"noNewPrivileges": true
},
"root": {
"path": "tests/assets/oci/helloworld/rootfs"
},
"hostname": "runc",
"mounts": [
{
"destination": "/proc",
"type": "proc",
"source": "proc"
},
{
"destination": "/dev",
"type": "tmpfs",
"source": "tmpfs",
"options": [
"nosuid",
"strictatime",
"mode=755",
"size=65536k"
]
},
{
"destination": "/dev/pts",
"type": "devpts",
"source": "devpts",
"options": [
"nosuid",
"noexec",
"newinstance",
"ptmxmode=0666",
"mode=0620",
"gid=5"
]
},
{
"destination": "/dev/shm",
"type": "tmpfs",
"source": "shm",
"options": [
"nosuid",
"noexec",
"nodev",
"mode=1777",
"size=65536k"
]
},
{
"destination": "/dev/mqueue",
"type": "mqueue",
"source": "mqueue",
"options": [
"nosuid",
"noexec",
"nodev"
]
},
{
"destination": "/sys",
"type": "sysfs",
"source": "sysfs",
"options": [
"nosuid",
"noexec",
"nodev",
"ro"
]
},
{
"destination": "/sys/fs/cgroup",
"type": "cgroup",
"source": "cgroup",
"options": [
"nosuid",
"noexec",
"nodev",
"relatime",
"ro"
]
}
],
"linux": {
"devices": [
{
"path": "/dev/kvm",
"type": "c",
"major": 10,
"minor": 232,
"fileMode": 666,
"uid": 0,
"gid": 36
}
],
"seccomp": {
"defaultAction": "SCMP_ACT_ERRNO",
"defaultErrnoRet": 1,
"archMap": [
{
"architecture": "SCMP_ARCH_X86_64",
"subArchitectures": [
"SCMP_ARCH_X86",
"SCMP_ARCH_X32"
]
},
{
"architecture": "SCMP_ARCH_AARCH64",
"subArchitectures": [
"SCMP_ARCH_ARM"
]
},
{
"architecture": "SCMP_ARCH_MIPS64",
"subArchitectures": [
"SCMP_ARCH_MIPS",
"SCMP_ARCH_MIPS64N32"
]
},
{
"architecture": "SCMP_ARCH_MIPS64N32",
"subArchitectures": [
"SCMP_ARCH_MIPS",
"SCMP_ARCH_MIPS64"
]
},
{
"architecture": "SCMP_ARCH_MIPSEL64",
"subArchitectures": [
"SCMP_ARCH_MIPSEL",
"SCMP_ARCH_MIPSEL64N32"
]
},
{
"architecture": "SCMP_ARCH_MIPSEL64N32",
"subArchitectures": [
"SCMP_ARCH_MIPSEL",
"SCMP_ARCH_MIPSEL64"
]
},
{
"architecture": "SCMP_ARCH_S390X",
"subArchitectures": [
"SCMP_ARCH_S390"
]
}
],
"syscalls": [
{
"names": [
"accept",
"accept4",
"access",
"adjtimex",
"alarm",
"bind",
"brk",
"capget",
"capset",
"chdir",
"chmod",
"chown",
"chown32",
"clock_adjtime",
"clock_adjtime64",
"clock_getres",
"clock_getres_time64",
"clock_gettime",
"clock_gettime64",
"clock_nanosleep",
"clock_nanosleep_time64",
"close",
"close_range",
"connect",
"copy_file_range",
"creat",
"dup",
"dup2",
"dup3",
"epoll_create",
"epoll_create1",
"epoll_ctl",
"epoll_ctl_old",
"epoll_pwait",
"epoll_pwait2",
"epoll_wait",
"epoll_wait_old",
"eventfd",
"eventfd2",
"execve",
"execveat",
"exit",
"exit_group",
"faccessat",
"faccessat2",
"fadvise64",
"fadvise64_64",
"fallocate",
"fanotify_mark",
"fchdir",
"fchmod",
"fchmodat",
"fchown",
"fchown32",
"fchownat",
"fcntl",
"fcntl64",
"fdatasync",
"fgetxattr",
"flistxattr",
"flock",
"fork",
"fremovexattr",
"fsetxattr",
"fstat",
"fstat64",
"fstatat64",
"fstatfs",
"fstatfs64",
"fsync",
"ftruncate",
"ftruncate64",
"futex",
"futex_time64",
"futimesat",
"getcpu",
"getcwd",
"getdents",
"getdents64",
"getegid",
"getegid32",
"geteuid",
"geteuid32",
"getgid",
"getgid32",
"getgroups",
"getgroups32",
"getitimer",
"getpeername",
"getpgid",
"getpgrp",
"getpid",
"getppid",
"getpriority",
"getrandom",
"getresgid",
"getresgid32",
"getresuid",
"getresuid32",
"getrlimit",
"get_robust_list",
"getrusage",
"getsid",
"getsockname",
"getsockopt",
"get_thread_area",
"gettid",
"gettimeofday",
"getuid",
"getuid32",
"getxattr",
"inotify_add_watch",
"inotify_init",
"inotify_init1",
"inotify_rm_watch",
"io_cancel",
"ioctl",
"io_destroy",
"io_getevents",
"io_pgetevents",
"io_pgetevents_time64",
"ioprio_get",
"ioprio_set",
"io_setup",
"io_submit",
"io_uring_enter",
"io_uring_register",
"io_uring_setup",
"ipc",
"kill",
"lchown",
"lchown32",
"lgetxattr",
"link",
"linkat",
"listen",
"listxattr",
"llistxattr",
"_llseek",
"lremovexattr",
"lseek",
"lsetxattr",
"lstat",
"lstat64",
"madvise",
"membarrier",
"memfd_create",
"mincore",
"mkdir",
"mkdirat",
"mknod",
"mknodat",
"mlock",
"mlock2",
"mlockall",
"mmap",
"mmap2",
"mprotect",
"mq_getsetattr",
"mq_notify",
"mq_open",
"mq_timedreceive",
"mq_timedreceive_time64",
"mq_timedsend",
"mq_timedsend_time64",
"mq_unlink",
"mremap",
"msgctl",
"msgget",
"msgrcv",
"msgsnd",
"msync",
"munlock",
"munlockall",
"munmap",
"nanosleep",
"newfstatat",
"_newselect",
"open",
"openat",
"openat2",
"pause",
"pidfd_open",
"pidfd_send_signal",
"pipe",
"pipe2",
"poll",
"ppoll",
"ppoll_time64",
"prctl",
"pread64",
"preadv",
"preadv2",
"prlimit64",
"pselect6",
"pselect6_time64",
"pwrite64",
"pwritev",
"pwritev2",
"read",
"readahead",
"readlink",
"readlinkat",
"readv",
"recv",
"recvfrom",
"recvmmsg",
"recvmmsg_time64",
"recvmsg",
"remap_file_pages",
"removexattr",
"rename",
"renameat",
"renameat2",
"restart_syscall",
"rmdir",
"rseq",
"rt_sigaction",
"rt_sigpending",
"rt_sigprocmask",
"rt_sigqueueinfo",
"rt_sigreturn",
"rt_sigsuspend",
"rt_sigtimedwait",
"rt_sigtimedwait_time64",
"rt_tgsigqueueinfo",
"sched_getaffinity",
"sched_getattr",
"sched_getparam",
"sched_get_priority_max",
"sched_get_priority_min",
"sched_getscheduler",
"sched_rr_get_interval",
"sched_rr_get_interval_time64",
"sched_setaffinity",
"sched_setattr",
"sched_setparam",
"sched_setscheduler",
"sched_yield",
"seccomp",
"select",
"semctl",
"semget",
"semop",
"semtimedop",
"semtimedop_time64",
"send",
"sendfile",
"sendfile64",
"sendmmsg",
"sendmsg",
"sendto",
"setfsgid",
"setfsgid32",
"setfsuid",
"setfsuid32",
"setgid",
"setgid32",
"setgroups",
"setgroups32",
"setitimer",
"setpgid",
"setpriority",
"setregid",
"setregid32",
"setresgid",
"setresgid32",
"setresuid",
"setresuid32",
"setreuid",
"setreuid32",
"setrlimit",
"set_robust_list",
"setsid",
"setsockopt",
"set_thread_area",
"set_tid_address",
"setuid",
"setuid32",
"setxattr",
"shmat",
"shmctl",
"shmdt",
"shmget",
"shutdown",
"sigaltstack",
"signalfd",
"signalfd4",
"sigprocmask",
"sigreturn",
"socket",
"socketcall",
"socketpair",
"splice",
"stat",
"stat64",
"statfs",
"statfs64",
"statx",
"symlink",
"symlinkat",
"sync",
"sync_file_range",
"syncfs",
"sysinfo",
"tee",
"tgkill",
"time",
"timer_create",
"timer_delete",
"timer_getoverrun",
"timer_gettime",
"timer_gettime64",
"timer_settime",
"timer_settime64",
"timerfd_create",
"timerfd_gettime",
"timerfd_gettime64",
"timerfd_settime",
"timerfd_settime64",
"times",
"tkill",
"truncate",
"truncate64",
"ugetrlimit",
"umask",
"uname",
"unlink",
"unlinkat",
"utime",
"utimensat",
"utimensat_time64",
"utimes",
"vfork",
"vmsplice",
"wait4",
"waitid",
"waitpid",
"write",
"writev"
],
"action": "SCMP_ACT_ALLOW"
},
{
"names": [
"process_vm_readv",
"process_vm_writev",
"ptrace"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"minKernel": "4.8"
}
},
{
"names": [
"personality"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 0,
"value": 0,
"op": "SCMP_CMP_EQ"
}
]
},
{
"names": [
"personality"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 0,
"value": 8,
"op": "SCMP_CMP_EQ"
}
]
},
{
"names": [
"personality"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 0,
"value": 131072,
"op": "SCMP_CMP_EQ"
}
]
},
{
"names": [
"personality"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 0,
"value": 131080,
"op": "SCMP_CMP_EQ"
}
]
},
{
"names": [
"personality"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 0,
"value": 4294967295,
"op": "SCMP_CMP_EQ"
}
]
},
{
"names": [
"sync_file_range2"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"arches": [
"ppc64le"
]
}
},
{
"names": [
"arm_fadvise64_64",
"arm_sync_file_range",
"sync_file_range2",
"breakpoint",
"cacheflush",
"set_tls"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"arches": [
"arm",
"arm64"
]
}
},
{
"names": [
"arch_prctl"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"arches": [
"amd64",
"x32"
]
}
},
{
"names": [
"modify_ldt"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"arches": [
"amd64",
"x32",
"x86"
]
}
},
{
"names": [
"s390_pci_mmio_read",
"s390_pci_mmio_write",
"s390_runtime_instr"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"arches": [
"s390",
"s390x"
]
}
},
{
"names": [
"open_by_handle_at"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_DAC_READ_SEARCH"
]
}
},
{
"names": [
"bpf",
"clone",
"clone3",
"fanotify_init",
"fsconfig",
"fsmount",
"fsopen",
"fspick",
"lookup_dcookie",
"mount",
"move_mount",
"name_to_handle_at",
"open_tree",
"perf_event_open",
"quotactl",
"setdomainname",
"sethostname",
"setns",
"syslog",
"umount",
"umount2",
"unshare"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_ADMIN"
]
}
},
{
"names": [
"clone"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 0,
"value": 2114060288,
"op": "SCMP_CMP_MASKED_EQ"
}
],
"excludes": {
"caps": [
"CAP_SYS_ADMIN"
],
"arches": [
"s390",
"s390x"
]
}
},
{
"names": [
"clone"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 1,
"value": 2114060288,
"op": "SCMP_CMP_MASKED_EQ"
}
],
"comment": "s390 parameter ordering for clone is different",
"includes": {
"arches": [
"s390",
"s390x"
]
},
"excludes": {
"caps": [
"CAP_SYS_ADMIN"
]
}
},
{
"names": [
"clone3"
],
"action": "SCMP_ACT_ERRNO",
"errnoRet": 38,
"excludes": {
"caps": [
"CAP_SYS_ADMIN"
]
}
},
{
"names": [
"reboot"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_BOOT"
]
}
},
{
"names": [
"chroot"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_CHROOT"
]
}
},
{
"names": [
"delete_module",
"init_module",
"finit_module"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_MODULE"
]
}
},
{
"names": [
"acct"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_PACCT"
]
}
},
{
"names": [
"kcmp",
"pidfd_getfd",
"process_madvise",
"process_vm_readv",
"process_vm_writev",
"ptrace"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_PTRACE"
]
}
},
{
"names": [
"iopl",
"ioperm"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_RAWIO"
]
}
},
{
"names": [
"settimeofday",
"stime",
"clock_settime"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_TIME"
]
}
},
{
"names": [
"vhangup"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_TTY_CONFIG"
]
}
},
{
"names": [
"get_mempolicy",
"mbind",
"set_mempolicy"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_NICE"
]
}
},
{
"names": [
"syslog"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYSLOG"
]
}
}
]
},
"resources": {
"devices": [
{
"allow": true,
"access": "rwm"
}
]
},
"uidMappings": [
{
"containerID": 0,
"hostID": 1000,
"size": 1
}
],
"gidMappings": [
{
"containerID": 0,
"hostID": 1000,
"size": 1
}
],
"namespaces": [
{
"type": "pid"
},
{
"type": "network"
},
{
"type": "user"
},
{
"type": "ipc"
},
{
"type": "uts"
},
{
"type": "mount"
}
],
"maskedPaths": [
"/proc/kcore",
"/proc/latency_stats",
"/proc/timer_list",
"/proc/timer_stats",
"/proc/sched_debug",
"/sys/firmware",
"/proc/scsi"
],
"readonlyPaths": [
"/proc/asound",
"/proc/bus",
"/proc/fs",
"/proc/irq",
"/proc/sys",
"/proc/sysrq-trigger"
]
}
}

2
src/seccomp/mod.rs Normal file
View File

@ -0,0 +1,2 @@
pub mod seccomp;
pub use seccomp::initialize_seccomp;

401
src/seccomp/seccomp.rs Normal file
View File

@ -0,0 +1,401 @@
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use oci_spec::runtime::Arch;
use oci_spec::runtime::LinuxSeccomp;
use oci_spec::runtime::LinuxSeccompAction;
use oci_spec::runtime::LinuxSeccompOperator;
use seccomp_sys::scmp_arch::*;
use seccomp_sys::scmp_compare::*;
use seccomp_sys::*;
use std::ffi::CString;
/// Builder for one libseccomp syscall-argument comparison (`scmp_arg_cmp`).
///
/// The operator and the first datum are mandatory and are checked when
/// `build` is called; the second datum is optional.
#[derive(Debug)]
pub struct Compare {
    // The zero-indexed index of the syscall argument.
    arg: libc::c_uint,
    // Comparison operator; `build` fails while this is unset.
    op: Option<scmp_compare>,
    // First comparison value; `build` fails while this is unset.
    datum_a: Option<scmp_datum_t>,
    // Second comparison value; `build` substitutes 0 when this is unset.
    datum_b: Option<scmp_datum_t>,
}

impl Compare {
    /// Starts a comparison for the syscall argument at index `args`, with the
    /// operator and both data values still unset.
    pub fn new(args: u32) -> Self {
        Self {
            op: None,
            datum_a: None,
            datum_b: None,
            arg: args as libc::c_uint,
        }
    }

    /// Sets the comparison operator.
    pub fn op(self, op: scmp_compare) -> Self {
        Self {
            op: Some(op),
            ..self
        }
    }

    /// Sets the first comparison value.
    pub fn datum_a(self, datum: scmp_datum_t) -> Self {
        Self {
            datum_a: Some(datum),
            ..self
        }
    }

    /// Sets the second comparison value.
    pub fn datum_b(self, datum: scmp_datum_t) -> Self {
        Self {
            datum_b: Some(datum),
            ..self
        }
    }

    /// Produces the `scmp_arg_cmp`, or an error when the operator or the
    /// first datum was never provided.
    pub fn build(self) -> Result<scmp_arg_cmp> {
        match self {
            Compare {
                arg,
                op: Some(op),
                datum_a: Some(datum_a),
                datum_b,
            } => Ok(scmp_arg_cmp {
                arg,
                op: op.into(),
                datum_a,
                datum_b: datum_b.unwrap_or(0),
            }),
            incomplete => bail!("op and datum_a is required: {:?}", incomplete),
        }
    }
}
/// One seccomp rule: the action to apply to a syscall, optionally narrowed by
/// argument comparators.
#[derive(Debug)]
pub struct Rule {
    // Encoded libseccomp action value.
    action: u32,
    // Syscall number the rule applies to.
    syscall_nr: i32,
    // Argument comparisons attached to the rule; empty when none were added.
    comparators: Vec<scmp_arg_cmp>,
}

impl Rule {
    /// Creates a rule for `syscall_number` with the given `action` and no
    /// argument comparators.
    pub fn new(action: u32, syscall_number: i32) -> Self {
        Self {
            comparators: Vec::new(),
            syscall_nr: syscall_number,
            action,
        }
    }

    /// Appends an argument comparison to this rule.
    pub fn add_comparator(&mut self, cmp: scmp_arg_cmp) {
        self.comparators.push(cmp)
    }
}
#[derive(Debug)]
struct FilterContext {
ctx: *mut scmp_filter_ctx,
}
impl FilterContext {
pub fn default(default_action: u32) -> Result<FilterContext> {
let filter_ctx = unsafe { seccomp_init(default_action) };
if filter_ctx.is_null() {
bail!("Failed to initialized seccomp profile")
}
Ok(FilterContext { ctx: filter_ctx })
}
pub fn add_rule(&mut self, rule: &Rule) -> Result<()> {
let res = match rule.comparators.len() {
0 => unsafe { seccomp_rule_add(self.ctx, rule.action, rule.syscall_nr, 0) },
_ => unsafe {
seccomp_rule_add_array(
self.ctx,
rule.action,
rule.syscall_nr,
rule.comparators.len() as u32,
rule.comparators.as_slice().as_ptr(),
)
},
};
if res != 0 {
bail!("Failed to add rule. Errno: {}, Rule: {:?}", res, rule);
}
Ok(())
}
pub fn add_arch(&mut self, arch: u32) -> Result<()> {
let res = unsafe { seccomp_arch_add(self.ctx, arch) };
if res != 0 {
if nix::Error::from_i32(res.abs()) != nix::Error::EEXIST {
// The architecture already existed in the profile, so we can
// safely ignore the error here. Otherwise, error out.
bail!("Failed to add architecture {}. Errno: {}", arch, res);
}
}
Ok(())
}
pub fn load(&self) -> Result<()> {
let res = unsafe { seccomp_load(self.ctx) };
if res != 0 {
bail!("Failed to load seccomp profile: {}", res);
}
Ok(())
}
}
/// Resolves a syscall name to its number using libseccomp.
///
/// # Errors
/// Fails when the name cannot be converted to a C string or when libseccomp
/// reports `__NR_SCMP_ERROR` for it.
fn translate_syscall(syscall_name: String) -> Result<i32> {
    let name_cstr = CString::new(syscall_name.as_str())
        .with_context(|| format!("Failed to convert syscall {:?} to cstring", syscall_name))?;

    // SAFETY: `name_cstr` is a valid NUL-terminated string that lives until
    // the end of this function.
    let number = unsafe { seccomp_syscall_resolve_name(name_cstr.as_ptr()) };
    if number != __NR_SCMP_ERROR {
        return Ok(number);
    }

    bail!("Failed to resolve syscall from name: {:?}", syscall_name);
}
/// Converts an OCI seccomp action into the libseccomp action value.
///
/// `errno` is the payload for the `Errno` and `Trace` actions and falls back
/// to `EPERM` when absent; the other actions ignore it.
fn translate_action(action: &LinuxSeccompAction, errno: Option<u32>) -> u32 {
    let payload = match errno {
        Some(code) => code,
        None => libc::EPERM as u32,
    };

    match action {
        // Actions that carry a payload.
        LinuxSeccompAction::ScmpActErrno => SCMP_ACT_ERRNO(payload),
        LinuxSeccompAction::ScmpActTrace => SCMP_ACT_TRACE(payload),
        // Actions without a payload.
        LinuxSeccompAction::ScmpActAllow => SCMP_ACT_ALLOW,
        LinuxSeccompAction::ScmpActLog => SCMP_ACT_LOG,
        LinuxSeccompAction::ScmpActNotify => SCMP_ACT_NOTIFY,
        LinuxSeccompAction::ScmpActTrap => SCMP_ACT_TRAP,
        LinuxSeccompAction::ScmpActKill => SCMP_ACT_KILL,
        LinuxSeccompAction::ScmpActKillProcess => SCMP_ACT_KILL_PROCESS,
    }
}
/// Converts an OCI seccomp comparison operator into the libseccomp one.
fn translate_op(op: &LinuxSeccompOperator) -> scmp_compare {
    match op {
        // Equality family.
        LinuxSeccompOperator::ScmpCmpEq => SCMP_CMP_EQ,
        LinuxSeccompOperator::ScmpCmpNe => SCMP_CMP_NE,
        LinuxSeccompOperator::ScmpCmpMaskedEq => SCMP_CMP_MASKED_EQ,
        // Ordering family.
        LinuxSeccompOperator::ScmpCmpLt => SCMP_CMP_LT,
        LinuxSeccompOperator::ScmpCmpLe => SCMP_CMP_LE,
        LinuxSeccompOperator::ScmpCmpGe => SCMP_CMP_GE,
        LinuxSeccompOperator::ScmpCmpGt => SCMP_CMP_GT,
    }
}
/// Converts an OCI seccomp architecture into the libseccomp architecture token.
fn translate_arch(arch: &Arch) -> scmp_arch {
    match arch {
        Arch::ScmpArchNative => SCMP_ARCH_NATIVE,
        // x86 family
        Arch::ScmpArchX86_64 => SCMP_ARCH_X86_64,
        Arch::ScmpArchX86 => SCMP_ARCH_X86,
        Arch::ScmpArchX32 => SCMP_ARCH_X32,
        // ARM family
        Arch::ScmpArchAarch64 => SCMP_ARCH_AARCH64,
        Arch::ScmpArchArm => SCMP_ARCH_ARM,
        // MIPS family
        Arch::ScmpArchMips64 => SCMP_ARCH_MIPS64,
        Arch::ScmpArchMips64n32 => SCMP_ARCH_MIPS64N32,
        Arch::ScmpArchMips => SCMP_ARCH_MIPS,
        Arch::ScmpArchMipsel64 => SCMP_ARCH_MIPSEL64,
        Arch::ScmpArchMipsel64n32 => SCMP_ARCH_MIPSEL64N32,
        Arch::ScmpArchMipsel => SCMP_ARCH_MIPSEL,
        // PowerPC family
        Arch::ScmpArchPpc64le => SCMP_ARCH_PPC64LE,
        Arch::ScmpArchPpc64 => SCMP_ARCH_PPC64,
        Arch::ScmpArchPpc => SCMP_ARCH_PPC,
        // s390 family
        Arch::ScmpArchS390x => SCMP_ARCH_S390X,
        Arch::ScmpArchS390 => SCMP_ARCH_S390,
    }
}
/// Builds a libseccomp filter from the OCI seccomp configuration and loads it
/// for the calling process.
///
/// Does nothing and returns `Ok(())` when `seccomp` is `None`.
///
/// Syscall entries whose action equals the default action are skipped with a
/// warning, as are syscall names that libseccomp cannot resolve. An entry with
/// no `args`, or with an empty `args` list, produces one unconditional rule per
/// syscall name; otherwise one rule is added per argument comparison.
///
/// # Errors
/// Fails when seccomp flags are present (unsupported), or when creating the
/// filter, adding an architecture, building a comparison, adding a rule, or
/// loading the filter fails.
pub fn initialize_seccomp(seccomp: Option<&LinuxSeccomp>) -> Result<()> {
    let seccomp = match seccomp {
        Some(seccomp) => seccomp,
        None => return Ok(()),
    };

    if seccomp.flags.is_some() {
        // runc did not support this, so let's skip it for now.
        bail!("seccomp flags are not yet supported");
    }

    // TODO: fix default action error number. The spec repo doesn't have it yet.
    let default_action = translate_action(&seccomp.default_action, None);
    let mut ctx = FilterContext::default(default_action)?;

    for arch in seccomp.architectures.iter().flatten() {
        let arch_token = translate_arch(arch);
        ctx.add_arch(arch_token as u32)
            .context("Failed to add arch to seccomp")?;
    }

    for syscall in seccomp.syscalls.iter().flatten() {
        let action = translate_action(&syscall.action, syscall.errno_ret);
        if action == default_action {
            // When the action is the same as the default action, the rule is
            // redundant. We skip it here to avoid failing when adding rules.
            log::warn!(
                "Detect a seccomp action that is the same as the default action: {:?}",
                syscall
            );
            continue;
        }

        // A missing `args` and an empty `args` list both mean the syscall is
        // matched unconditionally. Without this, an explicit empty list would
        // add no rule at all and the syscall entry would be silently dropped.
        let args = syscall.args.as_deref().unwrap_or(&[]);

        for name in &syscall.names {
            let syscall_number = match translate_syscall(name.clone()) {
                Ok(number) => number,
                Err(_) => {
                    // If we failed to resolve the syscall by name, likely the
                    // kernel doesn't support this syscall, so skip it.
                    log::warn!(
                        "Failed to resolve syscall, likely kernel doesn't support this. {:?}",
                        name
                    );
                    continue;
                }
            };

            if args.is_empty() {
                let rule = Rule::new(action, syscall_number);
                ctx.add_rule(&rule).with_context(|| {
                    format!(
                        "Failed to add seccomp rule: {:?}. Syscall: {:?}",
                        &rule, name,
                    )
                })?;
                continue;
            }

            // Not clear why, but if there are multiple args attached to one
            // syscall rule, we have to add them separately; otherwise
            // add_rule returns EINVAL. runc does the same without explaining.
            for arg in args {
                let mut rule = Rule::new(action, syscall_number);
                let cmp = Compare::new(arg.index as u32)
                    .op(translate_op(&arg.op))
                    .datum_a(arg.value)
                    .datum_b(arg.value_two.unwrap_or(0))
                    .build()
                    .context("Failed to build a seccomp compare rule")?;
                rule.add_comparator(cmp);
                ctx.add_rule(&rule).with_context(|| {
                    format!(
                        "Failed to add seccomp rule: {:?}. Syscall: {:?}",
                        &rule, name,
                    )
                })?;
            }
        }
    }

    // Best effort: a failure here is not fatal and loading is still attempted,
    // but it is logged so that a later load failure can be diagnosed.
    if let Err(err) = prctl::set_no_new_privileges(true) {
        log::warn!(
            "Failed to set no_new_privileges before loading seccomp: {:?}",
            err
        );
    }

    ctx.load().context("Failed to load seccomp context")?;

    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;
    use anyhow::Result;
    use mio::unix::pipe;
    use nix::sys::wait;
    use oci_spec::runtime::{Arch, LinuxSeccomp, LinuxSyscall};
    use serial_test::serial;
    use std::io::Read;
    use std::io::Write;
    use std::path;

    /// Sanity check: a profile that maps `getcwd` to a chosen errno makes
    /// `getcwd` fail with exactly that errno in a forked child.
    #[test]
    #[serial]
    fn test_basic() -> Result<()> {
        // Note: seccomp profile is really hard to write unit test for. First,
        // we can't really test default error or kill action, since rust test
        // actually relies on certain syscalls. Second, some of the syscalls
        // will not return an errno. These syscalls will just send an abort
        // signal or even just segfault. Here we choose to use the `getcwd`
        // syscall for testing. This is more of a sanity check.
        let mut seccomp_profile = LinuxSeccomp::default();
        seccomp_profile.default_action = LinuxSeccompAction::ScmpActAllow;
        seccomp_profile.architectures = Some(vec![Arch::ScmpArchNative]);
        let mut seccomp_syscall = LinuxSyscall::default();
        // Here, we choose an error that getcwd call would never return on its own, so
        // we can make sure that getcwd failed because of seccomp rule.
        let expect_error = libc::EAGAIN;
        seccomp_syscall.names = vec![String::from("getcwd"), String::from("setuid")];
        seccomp_syscall.action = LinuxSeccompAction::ScmpActErrno;
        seccomp_syscall.errno_ret = Some(expect_error as u32);
        seccomp_profile.syscalls = Some(vec![seccomp_syscall]);

        // Since Rust cargo test uses a single process to execute all tests, it
        // is a good idea to fork a child process to test the seccomp profile,
        // and then kill the process. This way, the main test process is
        // unaffected. The child process will pass the returned error code
        // to the parent for assert and checking.
        let (mut sender, mut receiver) = pipe::new()?;
        receiver
            .set_nonblocking(false)
            .with_context(|| "Failed to set channel receiver to blocking")?;
        match unsafe { nix::unistd::fork()? } {
            nix::unistd::ForkResult::Parent { child } => {
                // Drop the unused write end instead of closing its raw fd:
                // the pipe end owns the descriptor and closes it on drop, so
                // a manual close would result in the fd being closed twice.
                drop(sender);
                let mut buf = [0; 4];
                let read_result = receiver.read_exact(&mut buf);
                // Reap the child before reporting anything so it is never
                // left behind when the read or the assertion fails.
                wait::waitpid(child, None)?;
                read_result.context("Failed to wait from child")?;
                assert_eq!(i32::from_be_bytes(buf), expect_error);
            }
            nix::unistd::ForkResult::Child => {
                // Same as above: release the unused read end by dropping it.
                drop(receiver);
                initialize_seccomp(Some(&seccomp_profile))?;
                let errno: i32 = match nix::unistd::getcwd() {
                    Ok(_) => 0,
                    Err(err) => err as i32,
                };
                sender.write_all(&errno.to_be_bytes())?;
                std::process::exit(errno);
            }
        }

        Ok(())
    }

    /// Loads the fixture OCI config and checks that its seccomp profile can
    /// be initialized in a forked child.
    #[test]
    #[serial]
    fn test_moby() -> Result<()> {
        let fixture_path =
            path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src/seccomp/fixture/config.json");
        let spec = oci_spec::runtime::Spec::load(fixture_path)
            .context("Failed to load test spec for seccomp")?;

        // We know linux and seccomp exist, so let's just unwrap.
        let seccomp_profile = spec.linux.unwrap().seccomp.unwrap();
        match unsafe { nix::unistd::fork()? } {
            nix::unistd::ForkResult::Parent { child } => {
                let status = wait::waitpid(child, None)?;
                match status {
                    wait::WaitStatus::Exited(_, exit_code) => {
                        assert_eq!(
                            exit_code, 0,
                            "Child process didn't configure seccomp profile correctly"
                        );
                    }
                    _ => {
                        bail!("Child process failed to exit correctly: {:?}", status);
                    }
                }
            }
            nix::unistd::ForkResult::Child => {
                // Report the outcome through the exit code; the parent
                // asserts on it.
                let ret = initialize_seccomp(Some(&seccomp_profile));
                let exit_code = if ret.is_ok() { 0 } else { -1 };
                std::process::exit(exit_code);
            }
        }

        Ok(())
    }
}