Implement double fork

2024-06-10 00:36:16 +02:00 · 2021-08-19 20:25:08 +02:00 · 2021-08-19 20:25:08 +02:00 · 0f9490c68b
parent 414fa3a448
commit 0f9490c68b
10 changed files with 540 additions and 742 deletions
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@ -60,7 +60,7 @@ jobs:
      - name: Build
        run: ./build.sh --release
      - name: Run tests
-        run: cargo test
+        run: cargo test -- --nocapture
      - name: Run doc tests
        run: cargo test --doc
      - name: Run cgroup tests
--- a/src/container/builder_impl.rs
+++ b/src/container/builder_impl.rs
@ -1,5 +1,4 @@
 use anyhow::{Context, Result};
-use nix::sched::CloneFlags;

 use cgroups;

@ -8,8 +7,7 @@ use std::{fs, os::unix::prelude::RawFd, path::PathBuf};

 use crate::{
    hooks,
-    namespaces::Namespaces,
-    process::{child, fork, init, parent},
+    process::{channel, fork, init},
    rootless::Rootless,
    syscall::linux::LinuxSyscall,
    utils,
@ -59,16 +57,16 @@ impl<'a> ContainerBuilderImpl<'a> {
        let cgroups_path = utils::get_cgroup_path(&linux.cgroups_path, &self.container_id);
        let cmanager = cgroups::common::create_cgroup_manager(&cgroups_path, self.use_systemd)?;

-        // create the parent and child process structure so the parent and child process can sync with each other
-        let (mut parent, parent_channel) = parent::ParentProcess::new(&self.rootless)?;
-        let child = child::ChildProcess::new(parent_channel)?;
-
        if self.init {
            if let Some(hooks) = self.spec.hooks.as_ref() {
                hooks::run_hooks(hooks.create_runtime.as_ref(), self.container.as_ref())?
            }
        }

+        // We use a set of channels to communicate between parent and child process. Each channel is uni-directional.
+        let parent_to_child = &mut channel::Channel::new()?;
+        let child_to_parent = &mut channel::Channel::new()?;
+
        // This init_args will be passed to the container init process,
        // therefore we will have to move all the variable by value. Since self
        // is a shared reference, we have to clone these variables here.
@ -82,30 +80,24 @@ impl<'a> ContainerBuilderImpl<'a> {
            notify_path: self.notify_path.clone(),
            preserve_fds: self.preserve_fds,
            container: self.container.clone(),
-            child,
        };
+        let intermediate_pid = fork::container_fork(|| {
+            init::container_intermidiate(init_args, parent_to_child, child_to_parent)
+        })?;
+        // If creating a rootless container, the intermediate process will ask
+        // the main process to set up uid and gid mapping, once the intermediate
+        // process enters into a new user namespace.
+        if self.rootless.is_some() {
+            child_to_parent.wait_for_mapping_request(
+                intermediate_pid,
+                self.rootless.as_ref(),
+                parent_to_child,
+            )?;
+        }

-        // We have to box up this closure to correctly pass to the init function
-        // of the new process.
-        let cb = Box::new(move || {
-            if let Err(error) = init::container_init(init_args) {
-                log::debug!("failed to run container_init: {:?}", error);
-                return -1;
-            }
-
-            0
-        });
-
-        let clone_flags = linux
-            .namespaces
-            .as_ref()
-            .map(|ns| Namespaces::from(ns).clone_flags)
-            .unwrap_or_else(CloneFlags::empty);
-        let init_pid = fork::clone(cb, clone_flags)?;
+        let init_pid = child_to_parent.wait_for_child_ready()?;
        log::debug!("init pid is {:?}", init_pid);

-        parent.wait_for_child_ready(init_pid)?;
-
        cmanager.add_task(init_pid)?;
        if self.rootless.is_none() && linux.resources.is_some() && self.init {
            cmanager.apply(linux.resources.as_ref().unwrap())?;
--- a/src/namespaces.rs
+++ b/src/namespaces.rs
@ -8,86 +8,85 @@
 //! Cgroup (Resource limits, execution priority etc.)

 use crate::syscall::{syscall::create_syscall, Syscall};
-use anyhow::Result;
-use nix::{
-    fcntl,
-    sched::{self, CloneFlags},
-    sys::stat,
-    unistd::{self, Gid, Uid},
-};
-use oci_spec::LinuxNamespace;
+use anyhow::{Context, Result};
+use nix::{fcntl, sched::CloneFlags, sys::stat, unistd};
+use oci_spec::{LinuxNamespace, LinuxNamespaceType};
+use std::collections;

 /// Holds information about namespaces
-pub struct Namespaces<'a> {
-    spaces: &'a Vec<LinuxNamespace>,
+pub struct Namespaces {
+    /// Syscall implementation used to issue the `unshare` and `setns` calls.
    command: Box<dyn Syscall>,
-    pub clone_flags: CloneFlags,
+    /// Configured namespaces, keyed by the clone flag of their namespace type.
+    namespace_map: collections::HashMap<CloneFlags, LinuxNamespace>,
 }

-impl<'a> From<&'a Vec<LinuxNamespace>> for Namespaces<'a> {
-    fn from(namespaces: &'a Vec<LinuxNamespace>) -> Self {
-        let clone_flags = namespaces.iter().filter(|ns| ns.path.is_none()).fold(
-            CloneFlags::empty(),
-            |mut cf, ns| {
-                cf |= CloneFlags::from_bits_truncate(ns.typ as i32);
-                cf
-            },
-        );
-        let command: Box<dyn Syscall> = create_syscall();
-
-        Namespaces {
-            spaces: namespaces,
-            command,
-            clone_flags,
-        }
+/// Maps an OCI namespace type to the matching `CLONE_NEW*` flag.
+///
+/// The match is exhaustive on purpose: adding a variant to
+/// `LinuxNamespaceType` becomes a compile error here rather than a silently
+/// unhandled namespace.
+fn get_clone_flag(namespace_type: LinuxNamespaceType) -> CloneFlags {
+    match namespace_type {
+        LinuxNamespaceType::Pid => CloneFlags::CLONE_NEWPID,
+        LinuxNamespaceType::User => CloneFlags::CLONE_NEWUSER,
+        LinuxNamespaceType::Uts => CloneFlags::CLONE_NEWUTS,
+        LinuxNamespaceType::Cgroup => CloneFlags::CLONE_NEWCGROUP,
+        LinuxNamespaceType::Ipc => CloneFlags::CLONE_NEWIPC,
+        LinuxNamespaceType::Network => CloneFlags::CLONE_NEWNET,
+        LinuxNamespaceType::Mount => CloneFlags::CLONE_NEWNS,
    }
 }

-impl<'a> Namespaces<'a> {
-    pub fn apply_setns(&self) -> Result<()> {
-        let to_enter: Vec<(CloneFlags, i32)> = self
-            .spaces
+impl From<Option<&Vec<LinuxNamespace>>> for Namespaces {
+    /// Builds the namespace table from the optional namespace list of a spec.
+    ///
+    /// A missing list yields an empty table. When two entries share a
+    /// namespace type, the later entry replaces the earlier one.
+    fn from(namespaces: Option<&Vec<LinuxNamespace>>) -> Self {
+        let namespace_map = namespaces
+            .map(|list| list.as_slice())
+            .unwrap_or(&[])
            .iter()
-            .filter(|ns| ns.path.is_some()) // filter those which are actually present on the system
-            .map(|ns| {
-                let space = CloneFlags::from_bits_truncate(ns.typ as i32);
-                let fd = fcntl::open(
-                    &*ns.path.as_ref().unwrap(),
-                    fcntl::OFlag::empty(),
-                    stat::Mode::empty(),
-                )
-                .unwrap();
-                (space, fd)
-            })
+            .map(|ns| (get_clone_flag(ns.typ), ns.clone()))
            .collect();

-        for &(space, fd) in &to_enter {
-            // set the namespace
-            self.command.set_ns(fd, space)?;
-            unistd::close(fd)?;
-            // if namespace is cloned with newuser flag, then it creates a new user namespace,
-            // and we need to set the user and group id to 0
-            // see https://man7.org/linux/man-pages/man2/clone.2.html for more info
-            if space == sched::CloneFlags::CLONE_NEWUSER {
-                self.command.set_id(Uid::from_raw(0), Gid::from_raw(0))?;
-            }
+        Self {
+            command: create_syscall(),
+            namespace_map,
+        }
+    }
+}
+
+impl Namespaces {
+    /// Enters every configured namespace whose clone flag is accepted by
+    /// `filter`, stopping at the first failure.
+    ///
+    /// Namespaces are visited in `HashMap` iteration order, which is
+    /// unspecified. NOTE(review): callers that need a particular order
+    /// presumably select one namespace type per call — confirm at call sites.
+    pub fn apply_namespaces<F: Fn(CloneFlags) -> bool>(&self, filter: F) -> Result<()> {
+        let selected = self
+            .namespace_map
+            .iter()
+            .filter(|(ns_type, _)| filter(**ns_type));
+        for (ns_type, ns) in selected {
+            self.unshare_or_setns(ns)
+                .with_context(|| format!("Failed to enter {:?} namespace: {:?}", ns_type, ns))?;
        }
        Ok(())
    }

-    /// disassociate given parts context of calling process from other process
-    // see https://man7.org/linux/man-pages/man2/unshare.2.html for more info
-    pub fn apply_unshare(&self, without: CloneFlags) -> Result<()> {
-        self.command.unshare(self.clone_flags & !without)?;
+    /// Enters a single namespace: `unshare` when the namespace has no path,
+    /// otherwise opens the path and joins it with `setns`.
+    ///
+    /// # Errors
+    /// Fails when the namespace path cannot be opened or when the
+    /// unshare/setns/close call fails.
+    pub fn unshare_or_setns(&self, namespace: &LinuxNamespace) -> Result<()> {
+        let flag = get_clone_flag(namespace.typ);
+        match namespace.path.as_ref() {
+            None => self.command.unshare(flag)?,
+            Some(ns_path) => {
+                let fd = fcntl::open(ns_path, fcntl::OFlag::empty(), stat::Mode::empty())
+                    .with_context(|| format!("Failed to open namespace fd: {:?}", ns_path))?;
+                // Close the fd whether or not setns succeeded, so a failed
+                // setns no longer leaks it. The setns error is reported first.
+                let entered = self
+                    .command
+                    .set_ns(fd, flag)
+                    .with_context(|| "Failed to set namespace");
+                let closed = unistd::close(fd).with_context(|| "Failed to close namespace fd");
+                entered?;
+                closed?;
+            }
+        }
+
        Ok(())
    }
+
+    /// Looks up the configured namespace of type `k`, if any.
+    pub fn get(&self, k: LinuxNamespaceType) -> Option<&LinuxNamespace> {
+        self.namespace_map.get(&get_clone_flag(k))
+    }
 }
+
 #[cfg(test)]
 mod tests {
-    use oci_spec::LinuxNamespaceType;
-
    use super::*;
    use crate::syscall::test::TestHelperSyscall;
+    use oci_spec::LinuxNamespaceType;

    fn gen_sample_linux_namespaces() -> Vec<LinuxNamespace> {
        vec![
@ -115,11 +114,13 @@ mod tests {
    }

    #[test]
-    fn test_namespaces_set_ns() {
+    fn test_apply_namespaces() {
        let sample_linux_namespaces = gen_sample_linux_namespaces();
-        let namespaces = Namespaces::from(&sample_linux_namespaces);
+        let namespaces = Namespaces::from(Some(&sample_linux_namespaces));
        let test_command: &TestHelperSyscall = namespaces.command.as_any().downcast_ref().unwrap();
-        assert!(namespaces.apply_setns().is_ok());
+        assert!(namespaces
+            .apply_namespaces(|ns_type| { ns_type != CloneFlags::CLONE_NEWIPC })
+            .is_ok());

        let mut setns_args: Vec<_> = test_command
            .get_setns_args()
@ -130,18 +131,10 @@ mod tests {
        let mut expect = vec![CloneFlags::CLONE_NEWNS, CloneFlags::CLONE_NEWNET];
        expect.sort();
        assert_eq!(setns_args, expect);
-    }

-    #[test]
-    fn test_namespaces_unshare() {
-        let sample_linux_namespaces = gen_sample_linux_namespaces();
-        let namespaces = Namespaces::from(&sample_linux_namespaces);
-        assert!(namespaces.apply_unshare(CloneFlags::CLONE_NEWIPC).is_ok());
-
-        let test_command: &TestHelperSyscall = namespaces.command.as_any().downcast_ref().unwrap();
        let mut unshare_args = test_command.get_unshare_args();
        unshare_args.sort();
-        let mut expect = vec![CloneFlags::CLONE_NEWUSER | CloneFlags::CLONE_NEWPID];
+        let mut expect = vec![CloneFlags::CLONE_NEWUSER, CloneFlags::CLONE_NEWPID];
        expect.sort();
        assert_eq!(unshare_args, expect)
    }
--- a/src/process/channel.rs
+++ b/src/process/channel.rs
@ -0,0 +1,295 @@
+use crate::process::message::Message;
+use crate::rootless::Rootless;
+use crate::utils;
+use anyhow::bail;
+use anyhow::Context;
+use anyhow::Result;
+use mio::unix::pipe;
+use mio::unix::pipe::{Receiver, Sender};
+use mio::{Events, Interest, Poll, Token};
+use nix::unistd::Pid;
+use std::io::ErrorKind;
+use std::io::Read;
+use std::io::Write;
+use std::path::Path;
+use std::process::Command;
+use std::time::Duration;
+
+/// Maximum number of events collected by a single poll call
+const MAX_EVENTS: usize = 128;
+/// Timeout of one poll call while waiting for a message from the child process
+const WAIT_FOR_CHILD: Duration = Duration::from_secs(5);
+/// Timeout of the poll call while waiting for the mapping ack from the parent
+const WAIT_FOR_MAPPING: Duration = Duration::from_secs(3);
+/// Token under which the receiving end of the channel's pipe is registered with the poll
+const PARENT: Token = Token(0);
+
+/// One pipe together with the poll instance watching its receiving end.
+///
+/// Both ends live in the same struct, so after a fork each process holds
+/// both the sender and the receiver of every channel.
+pub struct Channel {
+    /// Writing end of the pipe.
+    sender: Sender,
+    /// Reading end of the pipe, registered with `poll` for readability.
+    receiver: Receiver,
+    /// Poll instance used to wait for data on `receiver`.
+    poll: Poll,
+}
+
+impl Channel {
+    /// Creates a pipe and registers its receiving end with a new poll
+    /// instance under the `PARENT` token, interested in readability.
+    pub fn new() -> Result<Self> {
+        let poll = Poll::new()?;
+        let (sender, mut receiver) = pipe::new()?;
+        poll.registry()
+            .register(&mut receiver, PARENT, Interest::READABLE)?;
+
+        Ok(Self {
+            sender,
+            receiver,
+            poll,
+        })
+    }
+
+    /// Sends `Message::ChildReady` followed by the raw pid in big-endian order.
+    pub fn send_child_ready(&mut self, pid: Pid) -> Result<()> {
+        log::debug!("sending init pid ({:?})", pid);
+        // Tag and pid go out in one write. The reading end is non-blocking, so
+        // with two separate writes the reader could consume the tag before the
+        // pid had been written. A 5-byte pipe write is far below PIPE_BUF and
+        // therefore atomic.
+        let mut payload = [0u8; 5];
+        payload[0] = Message::ChildReady as u8;
+        payload[1..].copy_from_slice(&pid.as_raw().to_be_bytes());
+        self.sender.write_all(&payload)?;
+        Ok(())
+    }
+
+    // requests the parent to write the id mappings for the child process
+    // this needs to be done from the parent see https://man7.org/linux/man-pages/man7/user_namespaces.7.html
+    pub fn send_identifier_mapping_request(&mut self) -> Result<()> {
+        log::debug!("send identifier mapping request");
+        self.write_message(Message::WriteMapping)?;
+        Ok(())
+    }
+
+    /// Tells the peer that the id mappings have been written.
+    pub fn send_mapping_written(&mut self) -> Result<()> {
+        log::debug!("identifier mapping written");
+        self.write_message(Message::MappingWritten)?;
+        Ok(())
+    }
+
+    /// Waits until the parent process has finished writing the id mappings.
+    ///
+    /// # Errors
+    /// Fails when an unexpected message arrives, when reading fails, or when
+    /// no ack arrives within `WAIT_FOR_MAPPING` (previously a panic).
+    pub fn wait_for_mapping_ack(&mut self) -> Result<()> {
+        let mut events = Events::with_capacity(MAX_EVENTS);
+        log::debug!("waiting for mapping ack");
+
+        self.poll.poll(&mut events, Some(WAIT_FOR_MAPPING))?;
+        for event in events.iter() {
+            if event.token() != PARENT {
+                unreachable!();
+            }
+
+            match self.read_message()? {
+                Some(Message::MappingWritten) => return Ok(()),
+                Some(msg) => bail!(
+                    "receive unexpected message {:?} in waiting for mapping ack",
+                    msg
+                ),
+                // Readiness event without data: nothing to decode.
+                None => continue,
+            }
+        }
+
+        bail!("timed out waiting for mapping ack")
+    }
+
+    /// Waits for the child's mapping request, writes `setgroups`, `uid_map`
+    /// and `gid_map` for `child_pid`, then acknowledges through `callback`.
+    ///
+    /// A poll timeout only starts another poll round, so this keeps waiting
+    /// until a message arrives or an error occurs.
+    pub fn wait_for_mapping_request(
+        &mut self,
+        child_pid: Pid,
+        rootless: Option<&Rootless>,
+        callback: &mut Channel,
+    ) -> Result<()> {
+        // Create collection with capacity to store up to MAX_EVENTS events
+        let mut events = Events::with_capacity(MAX_EVENTS);
+        loop {
+            // poll the receiving end of pipe created for WAIT_FOR_CHILD duration for an event
+            self.poll.poll(&mut events, Some(WAIT_FOR_CHILD))?;
+            for event in events.iter() {
+                if event.token() != PARENT {
+                    unreachable!();
+                }
+
+                match self.read_message()? {
+                    Some(Message::WriteMapping) => {
+                        log::debug!("write mapping for pid {:?}", child_pid);
+                        utils::write_file(format!("/proc/{}/setgroups", child_pid), "deny")?;
+                        write_uid_mapping(child_pid, rootless)?;
+                        write_gid_mapping(child_pid, rootless)?;
+                        callback.send_mapping_written()?;
+                        return Ok(());
+                    }
+                    Some(msg) => bail!(
+                        "receive unexpected message {:?} waiting for mapping request",
+                        msg
+                    ),
+                    // Readiness event without data: keep polling.
+                    None => continue,
+                }
+            }
+        }
+    }
+
+    /// Waits for associated child process to send ready message
+    /// and return the pid of init process which is forked by child process
+    ///
+    /// A poll timeout only starts another poll round, so this keeps waiting
+    /// until a message arrives or an error occurs.
+    pub fn wait_for_child_ready(&mut self) -> Result<Pid> {
+        // Create collection with capacity to store up to MAX_EVENTS events
+        let mut events = Events::with_capacity(MAX_EVENTS);
+        loop {
+            // poll the receiving end of pipe created for WAIT_FOR_CHILD duration for an event
+            self.poll.poll(&mut events, Some(WAIT_FOR_CHILD))?;
+            for event in events.iter() {
+                if event.token() != PARENT {
+                    unreachable!();
+                }
+
+                match self.read_message()? {
+                    Some(Message::ChildReady) => {
+                        log::debug!("received child ready message");
+                        // A failed pid read is an error; it used to fall
+                        // through and return pid 0 on WouldBlock.
+                        let mut buf = [0; 4];
+                        self.receiver.read_exact(&mut buf).with_context(|| {
+                            "Failed to receive the init pid from the child process"
+                        })?;
+
+                        return Ok(Pid::from_raw(i32::from_be_bytes(buf)));
+                    }
+                    Some(msg) => bail!(
+                        "receive unexpected message {:?} waiting for child ready",
+                        msg
+                    ),
+                    // Readiness event without data: keep polling.
+                    None => continue,
+                }
+            }
+        }
+    }
+
+    /// Reads the one-byte message tag from the receiving end.
+    ///
+    /// Returns `Ok(None)` when the non-blocking read reports `WouldBlock`, so
+    /// callers never decode a buffer that was not filled.
+    fn read_message(&mut self) -> Result<Option<Message>> {
+        let mut buf = [0; 1];
+        match self.receiver.read_exact(&mut buf) {
+            Ok(()) => Ok(Some(Message::from(u8::from_be_bytes(buf)))),
+            Err(e) if e.kind() == ErrorKind::WouldBlock => Ok(None),
+            Err(e) => bail!(
+                "Failed to receive a message from the child process. {:?}",
+                e
+            ),
+        }
+    }
+
+    /// Writes the one-byte tag of `msg` to the sending end.
+    #[inline]
+    fn write_message(&mut self, msg: Message) -> Result<()> {
+        self.sender.write_all(&(msg as u8).to_be_bytes())?;
+        Ok(())
+    }
+}
+
+/// Writes the uid mappings of `rootless` to `/proc/<target_pid>/uid_map`.
+///
+/// Does nothing when there is no rootless configuration or it carries no uid
+/// mappings.
+fn write_uid_mapping(target_pid: Pid, rootless: Option<&Rootless>) -> Result<()> {
+    if let Some(rootless) = rootless {
+        // The uid map must be built from the uid mappings; this used to read
+        // `gid_mappings`, so the gid ranges were written into uid_map.
+        if let Some(uid_mappings) = rootless.uid_mappings {
+            return write_id_mapping(
+                &format!("/proc/{}/uid_map", target_pid),
+                uid_mappings,
+                rootless.newuidmap.as_deref(),
+            );
+        }
+    }
+
+    Ok(())
+}
+
+/// Writes the gid mappings of `rootless` to `/proc/<target_pid>/gid_map`.
+///
+/// Does nothing when there is no rootless configuration or it carries no gid
+/// mappings.
+fn write_gid_mapping(target_pid: Pid, rootless: Option<&Rootless>) -> Result<()> {
+    let rootless = match rootless {
+        Some(rootless) => rootless,
+        None => return Ok(()),
+    };
+
+    match rootless.gid_mappings {
+        Some(gid_mappings) => write_id_mapping(
+            &format!("/proc/{}/gid_map", target_pid),
+            gid_mappings,
+            rootless.newgidmap.as_deref(),
+        ),
+        None => Ok(()),
+    }
+}
+
+/// Writes id mappings either directly to `map_file` (exactly one mapping) or
+/// by running the helper `map_binary` (any other number of mappings).
+///
+/// # Errors
+/// Fails when the map file cannot be written, when the helper is needed but
+/// not configured (previously a panic), when it cannot be started, or when it
+/// exits unsuccessfully (previously ignored).
+fn write_id_mapping(
+    map_file: &str,
+    mappings: &[oci_spec::LinuxIdMapping],
+    map_binary: Option<&Path>,
+) -> Result<()> {
+    let mappings: Vec<String> = mappings
+        .iter()
+        .map(|m| format!("{} {} {}", m.container_id, m.host_id, m.size))
+        .collect();
+    if let [single] = mappings.as_slice() {
+        utils::write_file(map_file, single)?;
+        return Ok(());
+    }
+
+    let map_binary = map_binary.with_context(|| {
+        format!(
+            "no id mapping binary available to write {} mappings to {}",
+            mappings.len(),
+            map_file
+        )
+    })?;
+    // NOTE(review): newuidmap(1)/newgidmap(1) expect the target pid first and
+    // each id as its own argument; here neither the pid nor split fields are
+    // passed. Fixing that needs the pid as a parameter — confirm and follow up.
+    let output = Command::new(map_binary)
+        .args(&mappings)
+        .output()
+        .with_context(|| format!("failed to execute {:?}", map_binary))?;
+    if !output.status.success() {
+        bail!(
+            "{:?} failed with {}: {}",
+            map_binary,
+            output.status,
+            String::from_utf8_lossy(&output.stderr)
+        );
+    }
+
+    Ok(())
+}
+
+// Each test forks so that the sending and receiving side of a channel run in
+// different processes.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use nix::sys::wait;
+    use nix::unistd;
+
+    // The child reports its own pid; the parent must read back that same pid.
+    #[test]
+    fn test_channel_child_ready() -> Result<()> {
+        let ch = &mut Channel::new()?;
+        match unsafe { unistd::fork()? } {
+            unistd::ForkResult::Parent { child } => {
+                // Reap the child first, so it has already exited by the time
+                // the parent reads from the channel.
+                wait::waitpid(child, None)?;
+                let pid = ch.wait_for_child_ready()?;
+                assert_eq!(pid, child);
+            }
+            unistd::ForkResult::Child => {
+                let pid = unistd::getpid();
+                ch.send_child_ready(pid)?;
+                // Exit here so the child never returns into the test harness.
+                std::process::exit(0);
+            }
+        };
+
+        Ok(())
+    }
+
+    // The child sends the mapping ack; the parent must receive it before the
+    // ack wait times out.
+    #[test]
+    fn test_channel_id_mapping() -> Result<()> {
+        let ch = &mut Channel::new()?;
+        match unsafe { unistd::fork()? } {
+            unistd::ForkResult::Parent { child } => {
+                ch.wait_for_mapping_ack()?;
+                wait::waitpid(child, None)?;
+            }
+            unistd::ForkResult::Child => {
+                ch.send_mapping_written()?;
+                // Exit here so the child never returns into the test harness.
+                std::process::exit(0);
+            }
+        };
+
+        Ok(())
+    }
+}
--- a/src/process/child.rs
+++ b/src/process/child.rs
@ -1,67 +0,0 @@
-use anyhow::Result;
-use mio::unix::pipe;
-use mio::unix::pipe::Receiver;
-use mio::unix::pipe::Sender;
-use mio::{Interest, Poll, Token};
-
-use super::parent::ParentChannel;
-
-// Token is used to identify which socket generated an event
-const CHILD: Token = Token(1);
-
-/// Contains sending end of pipe for parent process, receiving end of pipe
-/// for the init process and poller for that
-pub struct ChildProcess {
-    parent_channel: ParentChannel,
-    receiver: Option<Receiver>,
-    poll: Option<Poll>,
-}
-
-// Note: The original Youki process "forks" a child process using clone(2). The
-// child process will become the container init process, where it will set up
-// namespaces, device mounts, and etc. for the container process.  Finally, the
-// container init process will run the actual container payload through exec
-// call. The ChildProcess will be used to synchronize between the Youki main
-// process and the child process (container init process).
-impl ChildProcess {
-    /// create a new Child process structure
-    pub fn new(parent_channel: ParentChannel) -> Result<Self> {
-        Ok(Self {
-            parent_channel,
-            receiver: None,
-            poll: None,
-        })
-    }
-
-    /// sets up sockets for init process
-    pub fn setup_pipe(&mut self) -> Result<Sender> {
-        // create a new pipe
-        let (sender, mut receiver) = pipe::new()?;
-        // create a new poll, and register the receiving end of pipe to it
-        // This will poll for the read events, so when data is written to sending end of the pipe,
-        // the receiving end will be readable and poll wil notify
-        let poll = Poll::new()?;
-        poll.registry()
-            .register(&mut receiver, CHILD, Interest::READABLE)?;
-
-        self.receiver = Some(receiver);
-        self.poll = Some(poll);
-        Ok(sender)
-    }
-
-    /// Indicate that child process has forked the init process to parent process
-    pub fn notify_parent(&mut self) -> Result<()> {
-        self.parent_channel.send_child_ready()?;
-        Ok(())
-    }
-
-    pub fn request_identifier_mapping(&mut self) -> Result<()> {
-        self.parent_channel.request_identifier_mapping()?;
-        Ok(())
-    }
-
-    pub fn wait_for_mapping_ack(&mut self) -> Result<()> {
-        self.parent_channel.wait_for_mapping_ack()?;
-        Ok(())
-    }
-}
--- a/src/process/fork.rs
+++ b/src/process/fork.rs
@ -1,222 +1,24 @@
-use anyhow::bail;
-use anyhow::Context;
 use anyhow::Result;
-use nix::errno::Errno;
-use nix::sched;
-use nix::sys;
-use nix::sys::mman;
+use nix::unistd;
 use nix::unistd::Pid;
-use std::ptr;

-// The clone callback is used in clone call. It is a boxed closure and it needs
-// to trasfer the ownership of related memory to the new process.
-type CloneCb = Box<dyn FnOnce() -> isize + Send>;
-
-const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8M
-
-/// clone uses syscall clone(2) to create a new process for the container init
-/// process. Using clone syscall gives us better control over how to can create
-/// the new container process, where we can enter into namespaces directly instead
-/// of using unshare and fork. This call will only create one new process, instead
-/// of two using fork.
-pub fn clone(cb: CloneCb, clone_flags: sched::CloneFlags) -> Result<Pid> {
-    // Use sysconf to find the page size. If there is an error, we assume
-    // the default 4K page size.
-    let page_size: usize = unsafe {
-        match libc::sysconf(libc::_SC_PAGE_SIZE) {
-            -1 => 4 * 1024, // default to 4K page size
-            x => x as usize,
+// Execute the cb in another process. This makes fork behave more like
+// thread_spawn or clone, so it is easier to reason about. Compared to the clone
+// call, fork is easier to use since fork takes care of all the variable
+// copying. If using clone, we would have to manually make sure all the
+// variables are correctly sent to the new process, and satisfying the Rust
+// borrow checker in every detail would be a lot of hassle.
+pub fn container_fork<F: FnOnce() -> Result<()>>(cb: F) -> Result<Pid> {
+    match unsafe { unistd::fork()? } {
+        unistd::ForkResult::Parent { child } => Ok(child),
+        unistd::ForkResult::Child => {
+            let ret = if let Err(error) = cb() {
+                log::debug!("failed to run fork: {:?}", error);
+                -1
+            } else {
+                0
+            };
+            std::process::exit(ret);
        }
-    };
-
-    // Find out the default stack max size through getrlimit.
-    let mut rlimit = libc::rlimit {
-        rlim_cur: 0,
-        rlim_max: 0,
-    };
-    unsafe { Errno::result(libc::getrlimit(libc::RLIMIT_STACK, &mut rlimit))? };
-
-    // mmap will return ENOMEM if stack size is unlimited
-    let default_stack_size = if rlimit.rlim_cur != u64::MAX {
-        rlimit.rlim_cur as usize
-    } else {
-        log::info!("stack size returned by getrlimit() is unlimited, use DEFAULT_STACK_SIZE(8MB)");
-        DEFAULT_STACK_SIZE
-    };
-
-    // Using the clone syscall requires us to create the stack space for the
-    // child process instead of taken cared for us like fork call. We use mmap
-    // here to create the stack.  Instead of guessing how much space the child
-    // process needs, we allocate through mmap to the system default limit,
-    // which is 8MB on most of the linux system today. This is OK since mmap
-    // will only researve the address space upfront, instead of allocating
-    // physical memory upfront.  The stack will grow as needed, up to the size
-    // researved, so no wasted memory here. Lastly, the child stack only needs
-    // to support the container init process set up code in Youki. When Youki
-    // calls exec into the container payload, exec will reset the stack.  Note,
-    // do not use MAP_GROWSDOWN since it is not well supported.
-    // Ref: https://man7.org/linux/man-pages/man2/mmap.2.html
-    let child_stack = unsafe {
-        mman::mmap(
-            ptr::null_mut(),
-            default_stack_size,
-            mman::ProtFlags::PROT_READ | mman::ProtFlags::PROT_WRITE,
-            mman::MapFlags::MAP_PRIVATE | mman::MapFlags::MAP_ANONYMOUS | mman::MapFlags::MAP_STACK,
-            -1,
-            0,
-        )?
-    };
-    // Consistant with how pthread_create sets up the stack, we create a
-    // guard page of 1 page, to protect the child stack collision. Note, for
-    // clone call, the child stack will grow downward, so the bottom of the
-    // child stack is in the beginning.
-    unsafe {
-        mman::mprotect(child_stack, page_size, mman::ProtFlags::PROT_NONE)
-            .with_context(|| "Failed to create guard page")?
-    };
-
-    // Since the child stack for clone grows downward, we need to pass in
-    // the top of the stack address.
-    let child_stack_top = unsafe { child_stack.add(default_stack_size) };
-
-    // Adds SIGCHLD flag to mimic the same behavior as fork.
-    let signal = sys::signal::Signal::SIGCHLD;
-    let combined = clone_flags.bits() | signal as libc::c_int;
-
-    // We are passing the boxed closure "cb" into the clone function as the a
-    // function pointer in C. The box closure in Rust is both a function pointer
-    // and a struct. However, when casting the box closure into libc::c_void,
-    // the function pointer will be lost. Therefore, to work around the issue,
-    // we double box the closure. This is consistant with how std::unix::thread
-    // handles the closure.
-    // Ref: https://github.com/rust-lang/rust/blob/master/library/std/src/sys/unix/thread.rs
-    let data = Box::into_raw(Box::new(cb));
-    // The main is a wrapper function passed into clone call below. The "data"
-    // arg is actually a raw pointer to a Box closure. so here, we re-box the
-    // pointer back into a box closure so the main takes ownership of the
-    // memory. Then we can call the closure passed in.
-    extern "C" fn main(data: *mut libc::c_void) -> libc::c_int {
-        unsafe { Box::from_raw(data as *mut CloneCb)() as i32 }
-    }
-
-    // The nix::sched::clone wrapper doesn't provide the right interface.  Using
-    // the clone syscall is one of the rare cases where we don't want rust to
-    // manage the child stack memory. Instead, we want to use c_void directly
-    // here.  Therefore, here we are using libc::clone syscall directly for
-    // better control.  The child stack will be cleaned when exec is called or
-    // the child process terminates. The nix wrapper also does not treat the
-    // closure memory correctly. The wrapper implementation fails to pass the
-    // right ownership to the new child process.
-    // Ref: https://github.com/nix-rust/nix/issues/919
-    // Ref: https://github.com/nix-rust/nix/pull/920
-    let res = unsafe { libc::clone(main, child_stack_top, combined, data as *mut libc::c_void) };
-    match res {
-        -1 => {
-            // Since the clone call failed, the closure passed in didn't get
-            // consumed. To complete the circle, we can safely box up the
-            // closure again and let rust manage this memory for us.
-            unsafe { drop(Box::from_raw(data)) };
-            bail!(
-                "Failed clone to create new process: {:?}",
-                Errno::result(res)
-            )
-        }
-        pid => Ok(Pid::from_raw(pid)),
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use anyhow::bail;
-    use nix::sys::wait;
-    use nix::unistd;
-
-    #[test]
-    fn test_fork_clone() -> Result<()> {
-        let cb = || -> Result<()> {
-            // In a new pid namespace, pid of this process should be 1
-            let pid = unistd::getpid();
-            assert_eq!(unistd::Pid::from_raw(1), pid, "PID should set to 1");
-
-            Ok(())
-        };
-
-        // For now, we test clone with new pid and user namespace. user
-        // namespace is needed for the test to run without root
-        let flags = sched::CloneFlags::CLONE_NEWPID | sched::CloneFlags::CLONE_NEWUSER;
-        let pid = super::clone(
-            Box::new(move || {
-                if cb().is_err() {
-                    return -1;
-                }
-
-                0
-            }),
-            flags,
-        )?;
-
-        let status = nix::sys::wait::waitpid(pid, None)?;
-        if let nix::sys::wait::WaitStatus::Exited(_, exit_code) = status {
-            assert_eq!(
-                0, exit_code,
-                "Process didn't exit correctly {:?}",
-                exit_code
-            );
-
-            return Ok(());
-        }
-
-        bail!("Process didn't exit correctly")
-    }
-
-    #[test]
-    fn test_clone_stack_allocation() -> Result<()> {
-        let flags = sched::CloneFlags::empty();
-        let pid = super::clone(
-            Box::new(|| {
-                let mut array_on_stack = [0u8; 4096];
-                array_on_stack.iter_mut().for_each(|x| *x = 0);
-
-                0
-            }),
-            flags,
-        )?;
-
-        let status = nix::sys::wait::waitpid(pid, None)?;
-        if let nix::sys::wait::WaitStatus::Exited(_, exit_code) = status {
-            assert_eq!(
-                0, exit_code,
-                "Process didn't exit correctly {:?}",
-                exit_code
-            );
-
-            return Ok(());
-        }
-
-        bail!("Process didn't exit correctly")
-    }
-
-    fn clone_closure_ownership_test_payload() -> super::CloneCb {
-        // The vec should not be deallocated after this function returns. The
-        // ownership should correctly transfer to the closure returned, to be
-        // passed to the clone and new child process.
-        let numbers: Vec<i32> = (0..101).into_iter().collect();
-        Box::new(move || {
-            assert_eq!(numbers.iter().sum::<i32>(), 5050);
-            0
-        })
-    }
-
-    #[test]
-    fn test_clone_closure_ownership() -> Result<()> {
-        let flags = sched::CloneFlags::empty();
-
-        let pid = super::clone(clone_closure_ownership_test_payload(), flags)?;
-        let exit_status =
-            wait::waitpid(pid, Some(wait::WaitPidFlag::__WALL)).expect("Waiting for child");
-        assert_eq!(exit_status, wait::WaitStatus::Exited(pid, 0));
-
-        Ok(())
    }
 }
--- a/src/process/init.rs
+++ b/src/process/init.rs
@ -1,12 +1,13 @@
 use anyhow::{bail, Context, Result};
 use nix::mount::mount as nix_mount;
 use nix::mount::MsFlags;
+use nix::sched::CloneFlags;
 use nix::{
    fcntl, sched,
    sys::statfs,
-    unistd::{self, Gid, Uid},
+    unistd::{self, Gid, Pid, Uid},
 };
-use oci_spec::Spec;
+use oci_spec::{LinuxNamespaceType, Spec};
 use std::collections::HashMap;
 use std::{
    env,
@ -20,7 +21,8 @@ use crate::{
    hooks,
    namespaces::Namespaces,
    notify_socket::NotifyListener,
-    process::child,
+    process::channel,
+    process::fork,
    rootfs,
    syscall::{linux::LinuxSyscall, Syscall},
    tty, utils,
@ -91,6 +93,59 @@ fn cleanup_file_descriptors(preserve_fds: i32) -> Result<()> {
    Ok(())
 }

+fn sysctl(kernel_params: &HashMap<String, String>) -> Result<()> {
+    let sys = PathBuf::from("/proc/sys");
+    for (kernel_param, value) in kernel_params {
+        let path = sys.join(kernel_param.replace(".", "/"));
+        log::debug!(
+            "apply value {} to kernel parameter {}.",
+            value,
+            kernel_param
+        );
+        fs::write(path, value.as_bytes())
+            .with_context(|| format!("failed to set sysctl {}={}", kernel_param, value))?;
+    }
+
+    Ok(())
+}
+
+// make a read only path
+// The first time we bind mount, other flags are ignored,
+// so we need to mount it once and then remount it with the necessary flags specified.
+// https://man7.org/linux/man-pages/man2/mount.2.html
+fn readonly_path(path: &str) -> Result<()> {
+    match nix_mount::<str, str, str, str>(
+        Some(path),
+        path,
+        None::<&str>,
+        MsFlags::MS_BIND | MsFlags::MS_REC,
+        None::<&str>,
+    ) {
+        // ignore error if path is not exist.
+        Err(nix::errno::Errno::ENOENT) => {
+            log::warn!("readonly path {:?} not exist", path);
+            return Ok(());
+        }
+        Err(err) => bail!(err),
+        Ok(_) => {}
+    }
+
+    nix_mount::<str, str, str, str>(
+        Some(path),
+        path,
+        None::<&str>,
+        MsFlags::MS_NOSUID
+            | MsFlags::MS_NODEV
+            | MsFlags::MS_NOEXEC
+            | MsFlags::MS_BIND
+            | MsFlags::MS_REMOUNT
+            | MsFlags::MS_RDONLY,
+        None::<&str>,
+    )?;
+    log::debug!("readonly path {:?} mounted", path);
+    Ok(())
+}
+
 pub struct ContainerInitArgs {
    /// Flag indicating if an init or a tenant container should be created
    pub init: bool,
@ -110,23 +165,16 @@ pub struct ContainerInitArgs {
    pub preserve_fds: i32,
    /// Container state
    pub container: Option<Container>,
-    /// Pipe used to communicate with the child process
-    pub child: child::ChildProcess,
 }

-pub fn container_init(args: ContainerInitArgs) -> Result<()> {
+pub fn container_intermidiate(
+    args: ContainerInitArgs,
+    main_to_intermediate: &mut channel::Channel,
+    intermediate_to_main: &mut channel::Channel,
+) -> Result<()> {
    let command = &args.syscall;
    let spec = &args.spec;
    let linux = spec.linux.as_ref().context("no linux in spec")?;
-    // need to create the notify socket before we pivot root, since the unix
-    // domain socket used here is outside of the rootfs of container
-    let mut notify_socket: NotifyListener = NotifyListener::new(&args.notify_path)?;
-    let proc = spec.process.as_ref().context("no process in spec")?;
-    let mut envs: Vec<String> = proc.env.as_ref().unwrap_or(&vec![]).clone();
-    let rootfs = &args.rootfs;
-    let hooks = spec.hooks.as_ref();
-    let container = args.container.as_ref();
-    let mut child = args.child;

    // if Out-of-memory score adjustment is set in specification.  set the score
    // value for the current process check
@ -144,15 +192,18 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
    // https://man7.org/linux/man-pages/man7/user_namespaces.7.html for more
    // information
    if args.is_rootless {
+        log::debug!("creating new user namespace");
+        sched::unshare(sched::CloneFlags::CLONE_NEWUSER)?;
        // child needs to be dumpable, otherwise the non root parent is not
        // allowed to write the uid/gid maps
        prctl::set_dumpable(true).unwrap();
-        child.request_identifier_mapping()?;
-        child.wait_for_mapping_ack()?;
+        intermediate_to_main.send_identifier_mapping_request()?;
+        main_to_intermediate.wait_for_mapping_ack()?;
        prctl::set_dumpable(false).unwrap();
    }

    // set limits and namespaces to the process
+    let proc = spec.process.as_ref().context("no process in spec")?;
    if let Some(rlimits) = proc.rlimits.as_ref() {
        for rlimit in rlimits.iter() {
            command.set_rlimit(rlimit).context("failed to set rlimit")?;
@ -163,21 +214,72 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
        .set_id(Uid::from_raw(0), Gid::from_raw(0))
        .context("failed to become root")?;

-    // set up tty if specified
-    if let Some(csocketfd) = args.console_socket {
-        tty::setup_console(&csocketfd)?;
+    // Pid namespace requires an extra fork to enter, so we enter pid namespace now.
+    let namespaces = Namespaces::from(linux.namespaces.as_ref());
+    if let Some(pid_namespace) = namespaces.get(LinuxNamespaceType::Pid) {
+        namespaces
+            .unshare_or_setns(pid_namespace)
+            .with_context(|| format!("Failed to enter pid namespace: {:?}", pid_namespace))?;
    }

-    // join existing namespaces
-    let bind_service = if let Some(ns) = linux.namespaces.as_ref() {
-        let namespaces = Namespaces::from(ns);
-        namespaces.apply_setns()?;
+    // We only need the init process to send us the ChildReady message.
+    let child_to_parent = &mut channel::Channel::new()?;
+    // We reuse the args passed in, but talk to the child over a new channel.
+    let init_args = ContainerInitArgs { ..args };
+    // We have to record the pid of the child (container init process), since
+    // the child will be inside the pid namespace. We can't rely on child_ready
+    // to send us the correct pid.
+    let pid = fork::container_fork(|| container_init(init_args, child_to_parent))?;
+    // There is no point using the pid carried by the child ready message,
+    // since the child sends it from inside the pid namespace already.
+    child_to_parent.wait_for_child_ready()?;
+    // After the child (the container init process) becomes ready, we can signal
+    // the parent (the main process) that we are ready.
+    intermediate_to_main.send_child_ready(pid)?;
+
+    Ok(())
+}
+
+pub fn container_init(
+    args: ContainerInitArgs,
+    init_to_intermediate: &mut channel::Channel,
+) -> Result<()> {
+    let command = &args.syscall;
+    let spec = &args.spec;
+    let linux = spec.linux.as_ref().context("no linux in spec")?;
+    // Need to create the notify socket before we pivot root, since the unix
+    // domain socket used here is outside of the rootfs of container. During
+    // exec, need to create the socket before we enter into existing mount
+    // namespace.
+    let mut notify_socket: NotifyListener = NotifyListener::new(&args.notify_path)?;
+    let proc = spec.process.as_ref().context("no process in spec")?;
+    let mut envs: Vec<String> = proc.env.as_ref().unwrap_or(&vec![]).clone();
+    let rootfs = &args.rootfs;
+    let hooks = spec.hooks.as_ref();
+    let container = args.container.as_ref();
+    let namespaces = Namespaces::from(linux.namespaces.as_ref());
+
+    // set up tty if specified
+    if let Some(csocketfd) = args.console_socket {
+        tty::setup_console(&csocketfd).with_context(|| "Failed to set up tty")?;
+    }
+
+    // Enter the rest of the namespaces. Note, we already entered the user and
+    // pid namespaces. We also have to enter the mount namespace last, since a
+    // namespace may be bound to a /proc path. The /proc path will need to be
+    // accessed before pivot_root.
+    namespaces
+        .apply_namespaces(|ns_type| -> bool {
+            ns_type != CloneFlags::CLONE_NEWUSER
+                && ns_type != CloneFlags::CLONE_NEWPID
+                && ns_type != CloneFlags::CLONE_NEWNS
+        })
+        .with_context(|| "Failed to apply namespaces")?;
+    if let Some(mount_namespace) = namespaces.get(LinuxNamespaceType::Mount) {
        namespaces
-            .clone_flags
-            .contains(sched::CloneFlags::CLONE_NEWUSER)
-    } else {
-        false
-    };
+            .unshare_or_setns(mount_namespace)
+            .with_context(|| format!("Failed to enter mount namespace: {:?}", mount_namespace))?;
+    }

    if let Some(hostname) = spec.hostname.as_ref() {
        command.set_hostname(hostname)?;
@ -193,6 +295,8 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
        if let Some(hooks) = hooks {
            hooks::run_hooks(hooks.create_container.as_ref(), container)?
        }
+
+        let bind_service = namespaces.get(LinuxNamespaceType::User).is_some();
        rootfs::prepare_rootfs(spec, rootfs, bind_service)
            .with_context(|| "Failed to prepare rootfs")?;

@ -202,7 +306,8 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
            .with_context(|| format!("Failed to pivot root to {:?}", rootfs))?;

        if let Some(kernel_params) = &linux.sysctl {
-            sysctl(kernel_params)?;
+            sysctl(kernel_params)
+                .with_context(|| format!("Failed to sysctl: {:?}", kernel_params))?;
        }
    }

@ -273,9 +378,7 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
    };

    // clean up and handle perserved fds.
-    if args.init {
-        cleanup_file_descriptors(preserve_fds).with_context(|| "Failed to clean up extra fds")?;
-    }
+    cleanup_file_descriptors(preserve_fds).with_context(|| "Failed to clean up extra fds")?;

    // change directory to process.cwd if process.cwd is not empty
    if do_chdir {
@ -289,7 +392,10 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
        .for_each(|(key, value)| env::set_var(key, value));

    // notify parents that the init process is ready to execute the payload.
-    child.notify_parent()?;
+    // Note, we pass -1 here because we are already inside the pid namespace.
+    // The pid outside the pid namespace should be recorded by the intermediate
+    // process.
+    init_to_intermediate.send_child_ready(Pid::from_raw(-1))?;

    // listing on the notify socket for container start command
    notify_socket.wait_for_container_start()?;
@ -313,59 +419,6 @@ pub fn container_init(args: ContainerInitArgs) -> Result<()> {
    unreachable!();
 }

-fn sysctl(kernel_params: &HashMap<String, String>) -> Result<()> {
-    let sys = PathBuf::from("/proc/sys");
-    for (kernel_param, value) in kernel_params {
-        let path = sys.join(kernel_param.replace(".", "/"));
-        log::debug!(
-            "apply value {} to kernel parameter {}.",
-            value,
-            kernel_param
-        );
-        fs::write(path, value.as_bytes())
-            .with_context(|| format!("failed to set sysctl {}={}", kernel_param, value))?;
-    }
-
-    Ok(())
-}
-
-// make a read only path
-// The first time we bind mount, other flags are ignored,
-// so we need to mount it once and then remount it with the necessary flags specified.
-// https://man7.org/linux/man-pages/man2/mount.2.html
-fn readonly_path(path: &str) -> Result<()> {
-    match nix_mount::<str, str, str, str>(
-        Some(path),
-        path,
-        None::<&str>,
-        MsFlags::MS_BIND | MsFlags::MS_REC,
-        None::<&str>,
-    ) {
-        // ignore error if path is not exist.
-        Err(nix::errno::Errno::ENOENT) => {
-            log::warn!("readonly path {:?} not exist", path);
-            return Ok(());
-        }
-        Err(err) => bail!(err),
-        Ok(_) => {}
-    }
-
-    nix_mount::<str, str, str, str>(
-        Some(path),
-        path,
-        None::<&str>,
-        MsFlags::MS_NOSUID
-            | MsFlags::MS_NODEV
-            | MsFlags::MS_NOEXEC
-            | MsFlags::MS_BIND
-            | MsFlags::MS_REMOUNT
-            | MsFlags::MS_RDONLY,
-        None::<&str>,
-    )?;
-    log::debug!("readonly path {:?} mounted", path);
-    Ok(())
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/src/process/mod.rs
+++ b/src/process/mod.rs
@ -1,24 +1,7 @@
 //! Provides a thin wrapper around fork syscall,
 //! with enums and functions specific to youki implemented

-use std::time::Duration;
-
-pub mod child;
+pub mod channel;
 pub mod fork;
 pub mod init;
 pub mod message;
-pub mod parent;
-
-/// Used to describe type of process after fork.
-/// Parent and child processes mean the same thing as in a normal fork call
-/// InitProcess is specifically used to indicate the process which will run the command of container
-pub enum Process<'a> {
-    Parent(parent::ParentProcess<'a>),
-    Child(child::ChildProcess),
-}
-/// Maximum event capacity of polling
-const MAX_EVENTS: usize = 128;
-/// Time to wait when polling for message from child process
-const WAIT_FOR_CHILD: Duration = Duration::from_secs(5);
-/// Time to wait when polling for mapping ack from parent
-const WAIT_FOR_MAPPING: Duration = Duration::from_secs(3);
--- a/src/process/parent.rs
+++ b/src/process/parent.rs
@ -1,246 +0,0 @@
-use std::io::ErrorKind;
-use std::io::Read;
-use std::io::Write;
-use std::path::Path;
-use std::process::Command;
-
-use super::{MAX_EVENTS, WAIT_FOR_CHILD};
-use crate::process::message::Message;
-use crate::process::WAIT_FOR_MAPPING;
-use crate::rootless::Rootless;
-use crate::utils;
-use anyhow::Context;
-use anyhow::{bail, Result};
-use mio::unix::pipe;
-use mio::unix::pipe::{Receiver, Sender};
-use mio::{Events, Interest, Poll, Token};
-use nix::unistd::Pid;
-use oci_spec::LinuxIdMapping;
-
-// Token is used to identify which socket generated an event
-const PARENT: Token = Token(0);
-
-/// Contains receiving end of pipe to child process and a poller for that.
-pub struct ParentProcess<'a> {
-    child_channel: ChildChannel<'a>,
-}
-
-// Poll is used to register and listen for various events
-// by registering it with an event source such as receiving end of a pipe
-impl<'a> ParentProcess<'a> {
-    /// Create new Parent process structure
-    pub fn new(rootless: &'a Option<Rootless>) -> Result<(Self, ParentChannel)> {
-        let (parent_channel, child_channel) = Self::setup_pipes(rootless)?;
-        let parent = Self { child_channel };
-
-        Ok((parent, parent_channel))
-    }
-
-    fn setup_pipes(rootless: &'a Option<Rootless>) -> Result<(ParentChannel, ChildChannel<'a>)> {
-        let (send_to_parent, receive_from_child) = pipe::new()?;
-        let (send_to_child, receive_from_parent) = pipe::new()?;
-
-        let parent_channel = ParentChannel::new(send_to_parent, receive_from_parent)?;
-        let child_channel = ChildChannel::new(send_to_child, receive_from_child, rootless)?;
-
-        Ok((parent_channel, child_channel))
-    }
-
-    /// Waits for associated child process to send ready message
-    /// and return the pid of init process which is forked by child process
-    pub fn wait_for_child_ready(&mut self, child_pid: Pid) -> Result<()> {
-        self.child_channel.wait_for_child_ready(child_pid)?;
-        Ok(())
-    }
-}
-
-// Channel for communicating with the parent
-pub struct ParentChannel {
-    sender: Sender,
-    receiver: Receiver,
-    poll: Poll,
-}
-
-impl ParentChannel {
-    fn new(sender: Sender, mut receiver: Receiver) -> Result<Self> {
-        let poll = Poll::new()?;
-        poll.registry()
-            .register(&mut receiver, PARENT, Interest::READABLE)?;
-        Ok(Self {
-            sender,
-            receiver,
-            poll,
-        })
-    }
-
-    pub fn send_child_ready(&mut self) -> Result<()> {
-        // write ChildReady message to the pipe to parent
-        log::debug!("[child to parent] sending child ready");
-        self.write_message(Message::ChildReady)?;
-        Ok(())
-    }
-
-    // requests the parent to write the id mappings for the child process
-    // this needs to be done from the parent see https://man7.org/linux/man-pages/man7/user_namespaces.7.html
-    pub fn request_identifier_mapping(&mut self) -> Result<()> {
-        log::debug!("[child to parent] request identifier mapping");
-        self.write_message(Message::WriteMapping)?;
-        Ok(())
-    }
-
-    // wait until the parent process has finished writing the id mappings
-    pub fn wait_for_mapping_ack(&mut self) -> Result<()> {
-        let mut events = Events::with_capacity(MAX_EVENTS);
-        log::debug!("waiting for ack from parent");
-
-        self.poll.poll(&mut events, Some(WAIT_FOR_MAPPING))?;
-        for event in events.iter() {
-            if event.token() == PARENT {
-                let mut buf = [0; 1];
-                match self.receiver.read_exact(&mut buf) {
-                    Err(ref e) if e.kind() == ErrorKind::WouldBlock => (),
-                    Err(e) => bail!(
-                        "Failed to receive a message from the child process. {:?}",
-                        e
-                    ),
-                    _ => (),
-                }
-
-                match Message::from(u8::from_be_bytes(buf)) {
-                    Message::MappingWritten => return Ok(()),
-                    msg => bail!("receive unexpected message {:?} in child process", msg),
-                }
-            }
-        }
-        unreachable!("timed out waiting for mapping ack from parent")
-    }
-
-    #[inline]
-    fn write_message(&mut self, msg: Message) -> Result<()> {
-        self.sender.write_all(&(msg as u8).to_be_bytes())?;
-        Ok(())
-    }
-}
-
-struct ChildChannel<'a> {
-    sender: Sender,
-    receiver: Receiver,
-    poll: Poll,
-    rootless: &'a Option<Rootless<'a>>,
-}
-
-impl<'a> ChildChannel<'a> {
-    fn new(sender: Sender, mut receiver: Receiver, rootless: &'a Option<Rootless>) -> Result<Self> {
-        let poll = Poll::new()?;
-        poll.registry()
-            .register(&mut receiver, PARENT, Interest::READABLE)?;
-        Ok(Self {
-            sender,
-            receiver,
-            poll,
-            rootless,
-        })
-    }
-
-    /// Waits for associated child process to send ready message
-    /// and return the pid of init process which is forked by child process
-    pub fn wait_for_child_ready(&mut self, child_pid: Pid) -> Result<()> {
-        // Create collection with capacity to store up to MAX_EVENTS events
-        let mut events = Events::with_capacity(MAX_EVENTS);
-        loop {
-            // poll the receiving end of pipe created for WAIT_FOR_CHILD duration for an event
-            self.poll.poll(&mut events, Some(WAIT_FOR_CHILD))?;
-            for event in events.iter() {
-                // check if the event token in PARENT
-                // note that this does not assign anything to PARENT, but instead compares PARENT and event.token()
-                // check http://patshaughnessy.net/2018/1/18/learning-rust-if-let-vs--match for a bit more detailed explanation
-                if let PARENT = event.token() {
-                    // read data from pipe
-                    let mut buf = [0; 1];
-                    match self.receiver.read_exact(&mut buf) {
-                        // This error simply means that there are no more incoming connections waiting to be accepted at this point.
-                        Err(ref e) if e.kind() == ErrorKind::WouldBlock => {
-                            break;
-                        }
-                        Err(e) => bail!(
-                            "Failed to receive a message from the child process. {:?}",
-                            e
-                        ),
-                        _ => (),
-                    };
-                    // convert to Message wrapper
-                    match Message::from(u8::from_be_bytes(buf)) {
-                        Message::ChildReady => {
-                            log::debug!("received child ready message");
-                            return Ok(());
-                        }
-                        Message::WriteMapping => {
-                            log::debug!("write mapping for pid {:?}", child_pid);
-                            utils::write_file(format!("/proc/{}/setgroups", child_pid), "deny")?;
-                            self.write_uid_mapping(child_pid)?;
-                            self.write_gid_mapping(child_pid)?;
-                            self.notify_mapping_written()?;
-                        }
-                        msg => bail!("receive unexpected message {:?} in parent process", msg),
-                    }
-                } else {
-                    // as the poll is registered with only parent token
-                    unreachable!()
-                }
-            }
-        }
-    }
-
-    fn notify_mapping_written(&mut self) -> Result<()> {
-        self.sender
-            .write_all(&(Message::MappingWritten as u8).to_be_bytes())?;
-        Ok(())
-    }
-
-    fn write_uid_mapping(&self, target_pid: Pid) -> Result<()> {
-        if let Some(rootless) = self.rootless.as_ref() {
-            if let Some(uid_mappings) = rootless.gid_mappings {
-                return write_id_mapping(
-                    &format!("/proc/{}/uid_map", target_pid),
-                    uid_mappings,
-                    rootless.newuidmap.as_deref(),
-                );
-            }
-        }
-        Ok(())
-    }
-
-    fn write_gid_mapping(&self, target_pid: Pid) -> Result<()> {
-        if let Some(rootless) = self.rootless.as_ref() {
-            if let Some(gid_mappings) = rootless.gid_mappings {
-                return write_id_mapping(
-                    &format!("/proc/{}/gid_map", target_pid),
-                    gid_mappings,
-                    rootless.newgidmap.as_deref(),
-                );
-            }
-        }
-        Ok(())
-    }
-}
-
-fn write_id_mapping(
-    map_file: &str,
-    mappings: &[LinuxIdMapping],
-    map_binary: Option<&Path>,
-) -> Result<()> {
-    let mappings: Vec<String> = mappings
-        .iter()
-        .map(|m| format!("{} {} {}", m.container_id, m.host_id, m.size))
-        .collect();
-    if mappings.len() == 1 {
-        utils::write_file(map_file, mappings.first().unwrap())?;
-    } else {
-        Command::new(map_binary.unwrap())
-            .args(mappings)
-            .output()
-            .with_context(|| format!("failed to execute {:?}", map_binary))?;
-    }
-
-    Ok(())
-}
--- a/src/rootless.rs
+++ b/src/rootless.rs
@ -1,8 +1,7 @@
 use std::{env, path::PathBuf};

 use anyhow::{bail, Context, Result};
-use nix::sched::CloneFlags;
-use oci_spec::{Linux, LinuxIdMapping, Mount, Spec};
+use oci_spec::{Linux, LinuxIdMapping, LinuxNamespaceType, Mount, Spec};

 use crate::namespaces::Namespaces;

@ -84,14 +83,8 @@ pub fn validate(spec: &Spec) -> Result<()> {
        bail!("rootless containers require at least one gid mapping")
    }

-    let namespaces = Namespaces::from(
-        linux
-            .namespaces
-            .as_ref()
-            .context("rootless containers require the namespaces.")?,
-    );
-
-    if !namespaces.clone_flags.contains(CloneFlags::CLONE_NEWUSER) {
+    let namespaces = Namespaces::from(linux.namespaces.as_ref());
+    if namespaces.get(LinuxNamespaceType::User).is_none() {
        bail!("rootless containers require the specification of a user namespace");
    }