From 7af4e4deb7a85c1dea728653f8ded50e6ec5e3a8 Mon Sep 17 00:00:00 2001 From: Gary Guo Date: Sun, 3 Mar 2024 16:52:45 +0000 Subject: [PATCH] Add cgroup v2 support cgroup v1 uses two special files to determine access, where cgroup v2 uses eBPF programs to control access. The code will attach a custom eBPF program which allows run-time reconfiguration and detach docker's default. eBPF programs will be detached when the attaching program dies, which can be dangerous if container-hotplug exits unexpectedly while the program is running, so we instead pin it (so it stays when the program exits) and unpin it after the docker container is down. In this case we might have garbage eBPF programs pinned when container-hotplug exits unexpectedly but it is safe. --- Cargo.lock | 51 ++++++++++++++++++++ Cargo.toml | 1 + README.md | 5 +- src/docker/cgroup.rs | 101 +++++++++++++++++++++++++++++++++++++++- src/docker/container.rs | 24 ++++++++-- src/main.rs | 8 +--- 6 files changed, 174 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d7e7ac2..e633e99 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -116,6 +116,12 @@ dependencies = [ "backtrace", ] +[[package]] +name = "assert_matches" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9" + [[package]] name = "async-stream" version = "0.3.5" @@ -144,6 +150,35 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "aya" +version = "0.12.0" +source = "git+https://github.com/nbdd0121/aya.git#764b6d58b2570f43b908cb02c06c7333b3567009" +dependencies = [ + "assert_matches", + "aya-obj", + "bitflags 2.4.2", + "bytes", + "lazy_static", + "libc", + "log", + "object", + "thiserror", +] + +[[package]] +name = "aya-obj" +version = "0.1.0" +source = 
"git+https://github.com/nbdd0121/aya.git#764b6d58b2570f43b908cb02c06c7333b3567009" +dependencies = [ + "bytes", + "core-error", + "hashbrown 0.14.3", + "log", + "object", + "thiserror", +] + [[package]] name = "backtrace" version = "0.3.69" @@ -323,6 +358,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-stream", + "aya", "bitflags 2.4.2", "bollard", "bytes", @@ -342,6 +378,15 @@ dependencies = [ "walkdir", ] +[[package]] +name = "core-error" +version = "0.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efcdb2972eb64230b4c50646d8498ff73f5128d196a90c7236eec4cbe8619b8f" +dependencies = [ + "version_check", +] + [[package]] name = "core-foundation-sys" version = "0.8.6" @@ -737,6 +782,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + [[package]] name = "libc" version = "0.2.153" diff --git a/Cargo.toml b/Cargo.toml index 455b114..e96b842 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,6 +30,7 @@ bollard = "0.16" futures = "0.3" rustix = { version = "0.38", features = ["fs", "stdio", "termios"] } bitflags = "2" +aya = { git = "https://github.com/nbdd0121/aya.git" } [build-dependencies] anyhow = { version = "1", features = ["backtrace"] } diff --git a/README.md b/README.md index 661b085..03baa33 100644 --- a/README.md +++ b/README.md @@ -25,10 +25,7 @@ Another concern is providing a container with well known paths for the devices. On bare-metal systems this would usually be achieved with a `SYMLINK` directive in a udev rule. This program tries to provide a similar functionality for containers, allowing you to specify symlinks for certain devices. -## Limitations - -`container-hotplug` needs to be run as root and relies on `cgroup v1`. It does not support `cgroup v2`. 
-On distributions with `cgroup v2`, you can switch back to `cgroup v1` by setting the [kernel parameter](https://wiki.ubuntu.com/Kernel/KernelBootParameters) `systemd.unified_cgroup_hierarchy=0`. +This tool supports both cgroup v1 and v2. ## Example diff --git a/src/docker/cgroup.rs b/src/docker/cgroup.rs index 026d6ae..bf8580a 100644 --- a/src/docker/cgroup.rs +++ b/src/docker/cgroup.rs @@ -1,4 +1,8 @@ -use anyhow::{ensure, Result}; +use anyhow::{ensure, Context, Result}; +use aya::maps::{HashMap, MapData}; +use aya::programs::{CgroupDevice, Link}; +use std::fs::File; +use std::mem::ManuallyDrop; use std::path::PathBuf; // The numerical representation below needs to match BPF_DEVCG constants. @@ -26,6 +30,10 @@ pub trait DeviceAccessController { minor: u32, access: Access, ) -> Result<()>; + + /// Stop performing access control. This may allow all accesses, so should only be used when + /// the cgroup is shut down. + fn stop(self: Box<Self>) -> Result<()>; } pub struct DeviceAccessControllerV1 { @@ -96,4 +104,98 @@ impl DeviceAccessController for DeviceAccessControllerV1 { Ok(()) } + + fn stop(self: Box<Self>) -> Result<()> { + Ok(()) + } +} + +/// Key of the `DEVICE_PERM` BPF map, identifying a device by type and major/minor number. +#[repr(C)] // This is read as POD by the BPF program, so the field layout must be fixed. +#[derive(Clone, Copy)] +struct Device { + device_type: u32, + major: u32, + minor: u32, +} + +// SAFETY: Device is `repr(C)`, contains only `u32` fields and has no padding. +unsafe impl aya::Pod for Device {} + +/// Device access controller for cgroup v2, backed by an eBPF device filter pinned at `pin`. +pub struct DeviceAccessControllerV2 { + map: HashMap<MapData, Device, u32>, + pin: PathBuf, +} + +impl DeviceAccessControllerV2 { + /// Attach our device filter to the cgroup of docker container `id` and detach docker's own. + pub fn new(id: &str) -> Result<Self> { + // We want to take control of the device cgroup filtering from docker. To do this, we attach our own + // filter program and detach the one by docker. 
+ let cgroup_path = format!("/sys/fs/cgroup/system.slice/docker-{id}.scope"); + let cgroup = File::open(&cgroup_path).with_context(|| format!("cannot open {cgroup_path}"))?; + + let mut bpf = aya::Bpf::load(include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/cgroup_device_filter/target/bpfel-unknown-none/release/cgroup_device_filter" + )))?; + + let program: &mut CgroupDevice = bpf + .program_mut("check_device") + .context("cannot find check_device program")? + .try_into()?; + + program.load()?; + + // Iterate existing programs. We'll need to detach them later. + // Wrap this inside `ManuallyDrop` to prevent accidental detaching. + let existing_programs = ManuallyDrop::new(CgroupDevice::query(&cgroup)?); + + program.attach(&cgroup)?; + + // Pin the program so that if container-hotplug accidentally exits, the filter won't be removed from the docker + // container. + let pin: PathBuf = format!("/sys/fs/bpf/docker-{id}-device-filter").into(); + program.pin(&pin)?; + + // Now our new filter is attached, detach all docker filters. + for existing_program in ManuallyDrop::into_inner(existing_programs) { + existing_program.detach()?; + } + + let map: HashMap<_, Device, u32> = bpf + .take_map("DEVICE_PERM") + .context("cannot find DEVICE_PERM map")? 
+ .try_into()?; + + Ok(Self { map, pin }) + } +} + +impl DeviceAccessController for DeviceAccessControllerV2 { + fn set_permission( + &mut self, + ty: DeviceType, + major: u32, + minor: u32, + access: Access, + ) -> Result<()> { + let device = Device { + device_type: ty as u32, + major, + minor, + }; + if access.is_empty() { + self.map.remove(&device)?; + } else { + self.map.insert(device, access.bits(), 0)?; + } + Ok(()) + } + + fn stop(self: Box<Self>) -> Result<()> { + CgroupDevice::from_pin(&self.pin)?.unpin()?; + Ok(()) + } } diff --git a/src/docker/container.rs b/src/docker/container.rs index e7a9525..7821c95 100644 --- a/src/docker/container.rs +++ b/src/docker/container.rs @@ -10,7 +10,7 @@ use tokio::signal::unix::{signal, SignalKind}; use tokio::task::{spawn, JoinHandle}; use tokio_stream::StreamExt; -use super::cgroup::{Access, DeviceAccessController, DeviceAccessControllerV1, DeviceType}; +use super::cgroup::{Access, DeviceAccessController, DeviceAccessControllerV1, DeviceAccessControllerV2, DeviceType}; use super::{IoStream, IoStreamSource}; #[derive(Clone)] @@ -18,7 +18,7 @@ pub struct Container { id: String, docker: bollard::Docker, remove_event: Shared>>, - cgroup_device_filter: Arc>>, + cgroup_device_filter: Arc>>>, } impl Container { @@ -40,13 +40,19 @@ impl Container { .shared(); let cgroup_device_filter: Box = - Box::new(DeviceAccessControllerV1::new(id)?); + match DeviceAccessControllerV2::new(id) { + Ok(v) => Box::new(v), + Err(err) => match DeviceAccessControllerV1::new(id) { + Ok(v) => Box::new(v), + Err(_) => Err(err).context("neither cgroup v1 and cgroup v2 works")?, + }, + }; Ok(Self { id: id.to_owned(), docker: docker.clone(), remove_event: remove_evevnt, - cgroup_device_filter: Arc::new(Mutex::new(cgroup_device_filter)), + cgroup_device_filter: Arc::new(Mutex::new(Some(cgroup_device_filter))), }) } @@ -83,6 +89,14 @@ impl Container { .context("no destroy event")?; } + // Stop the cgroup device filter. 
Only do so once we're sure that the container is removed. + self.cgroup_device_filter + .lock() + .unwrap() + .take() + .unwrap() + .stop()?; + Ok(()) } @@ -229,7 +243,7 @@ impl Container { let controller = self.cgroup_device_filter.clone(); tokio::task::spawn_blocking(move || -> Result<()> { let mut controller = controller.lock().unwrap(); - controller.set_permission( + controller.as_mut().unwrap().set_permission( DeviceType::Character, major, minor, diff --git a/src/main.rs b/src/main.rs index b0418d8..0bf6372 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,11 +7,11 @@ use cli::{Action, Device, Symlink}; use docker::{Container, Docker}; use hotplug::{Event as HotPlugEvent, HotPlug, PluggedDevice}; +use std::fmt::Display; use std::pin::pin; -use std::{fmt::Display, path::Path}; use tokio_stream::StreamExt; -use anyhow::{bail, Context, Result}; +use anyhow::{Context, Result}; use clap::Parser; use clap_verbosity_flag::{InfoLevel, LogLevel, Verbosity}; use log::info; @@ -98,10 +98,6 @@ fn run_hotplug( async fn run(param: cli::Run, verbosity: Verbosity) -> Result { let mut status = 0; - if !Path::new("/sys/fs/cgroup/devices/").is_dir() { - bail!("Could not find cgroup v1"); - } - let docker = Docker::connect_with_defaults()?; let container = docker.run(param.docker_args).await?; drop(container.pipe_signals());