Skip to content

Commit

Permalink
Add cgroup v2 support
Browse files Browse the repository at this point in the history
cgroup v1 uses two special files to determine access, whereas cgroup v2
uses eBPF programs to control access. The code attaches a custom eBPF
program that allows run-time reconfiguration, and detaches docker's default one.

eBPF programs are detached when the attaching process dies, which
can be dangerous if container-hotplug exits unexpectedly while the container
is still running, so we instead pin the program (so it stays attached when
container-hotplug exits) and unpin it after the docker container is down.
In this case we might leave garbage eBPF programs pinned when container-hotplug
exits unexpectedly, but that is safe.
  • Loading branch information
nbdd0121 committed Mar 4, 2024
1 parent 14e6a71 commit e0de0ed
Show file tree
Hide file tree
Showing 6 changed files with 174 additions and 16 deletions.
51 changes: 51 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ bollard = "0.16"
futures = "0.3"
rustix = { version = "0.38", features = ["fs", "stdio", "termios"] }
bitflags = "2"
aya = { git = "https://github.com/nbdd0121/aya.git" }

[build-dependencies]
anyhow = { version = "1", features = ["backtrace"] }
Expand Down
5 changes: 1 addition & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,7 @@ Another concern is providing a container with well known paths for the devices.
On bare-metal systems this would usually be achieved with a `SYMLINK` directive in a udev rule.
This program tries to provide a similar functionality for containers, allowing you to specify symlinks for certain devices.

## Limitations

`container-hotplug` needs to be run as root and relies on `cgroup v1`. It does not support `cgroup v2`.
On distributions with `cgroup v2`, you can switch back to `cgroup v1` by setting the [kernel parameter](https://wiki.ubuntu.com/Kernel/KernelBootParameters) `systemd.unified_cgroup_hierarchy=0`.
This tool supports both cgroup v1 and v2.

## Example

Expand Down
101 changes: 100 additions & 1 deletion src/docker/cgroup.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
use anyhow::{ensure, Result};
use anyhow::{ensure, Context, Result};
use aya::maps::{HashMap, MapData};
use aya::programs::{CgroupDevice, Link};
use std::fs::File;
use std::mem::ManuallyDrop;
use std::path::PathBuf;

// The numerical representation below needs to match BPF_DEVCG constants.
Expand Down Expand Up @@ -26,6 +30,10 @@ pub trait DeviceAccessController {
minor: u32,
access: Access,
) -> Result<()>;

/// Stop performing access control. This may allow all accesses, so it should only be used
/// once the cgroup has been shut down.
fn stop(self: Box<Self>) -> Result<()>;
}

pub struct DeviceAccessControllerV1 {
Expand Down Expand Up @@ -96,4 +104,95 @@ impl DeviceAccessController for DeviceAccessControllerV1 {

Ok(())
}

// Consumes the controller and reports success without doing any work.
fn stop(self: Box<Self>) -> Result<()> {
Ok(())
}
}

/// Key type of the `DEVICE_PERM` BPF hash map: one device node identified by its type and
/// major/minor numbers.
///
/// The value is handed to the kernel as raw bytes, so its layout must be fixed. `#[repr(C)]`
/// guarantees the fields are laid out in declaration order; with three `u32` fields that means
/// offsets 0, 4 and 8, a size of 12 bytes and no padding. The default Rust representation makes
/// no such guarantee.
#[repr(C)]
#[allow(unused)] // This is read as POD by the BPF program.
#[derive(Clone, Copy)]
struct Device {
    /// Numeric device type; expected to match the `BPF_DEVCG` device-type constants.
    device_type: u32,
    /// Device major number.
    major: u32,
    /// Device minor number.
    minor: u32,
}

// SAFETY: this relies on `Device` being `#[repr(C)]` with no padding. It consists of exactly
// three `u32` fields, so every byte of the value is initialised field data.
unsafe impl aya::Pod for Device {}

/// Device access controller backed by an eBPF device filter and its permission map.
pub struct DeviceAccessControllerV2 {
// BPF hash map from `Device` to a `u32` of access bits; `set_permission` inserts into and
// removes from it.
map: HashMap<MapData, Device, u32>,
// Path at which the filter program is pinned; `stop` loads the program from here to unpin it.
pin: PathBuf,
}

impl DeviceAccessControllerV2 {
pub fn new(id: &str) -> Result<Self> {
// We want to take control of the device cgroup filtering from docker. To do this, we attach our own
// filter program and detach the one by docker.
let cgroup_path = format!("/sys/fs/cgroup/system.slice/docker-{id}.scope");
let cgroup = File::open(cgroup_path)?;

let mut bpf = aya::Bpf::load(include_bytes!(concat!(
env!("CARGO_MANIFEST_DIR"),
"/cgroup_device_filter/target/bpfel-unknown-none/release/cgroup_device_filter"
)))?;

let program: &mut CgroupDevice = bpf
.program_mut("check_device")
.context("cannot find check_device program")?
.try_into()?;

program.load()?;

// Iterate existing programs. We'll need to detach them later.
// Wrap this inside `ManuallyDrop` to prevent accidental detaching.
let existing_programs = ManuallyDrop::new(CgroupDevice::query(&cgroup)?);

program.attach(&cgroup)?;

// Pin the program so that if container-hotplug accidentally exits, the filter won't be removed from the docker
// container.
let pin: PathBuf = format!("/sys/fs/bpf/docker-{id}-device-filter").into();
program.pin(&pin)?;

// Now our new filter is attached, detach all docker filters.
for existing_program in ManuallyDrop::into_inner(existing_programs) {
existing_program.detach()?;
}

let map: HashMap<_, Device, u32> = bpf
.take_map("DEVICE_PERM")
.context("cannot find DEVICE_PERM map")?
.try_into()?;

Ok(Self { map, pin })
}
}

impl DeviceAccessController for DeviceAccessControllerV2 {
    /// Records `access` for the device `(ty, major, minor)` in the permission map.
    ///
    /// A non-empty `access` is stored as its raw bits under the device key; an empty `access`
    /// removes the device's entry instead.
    fn set_permission(
        &mut self,
        ty: DeviceType,
        major: u32,
        minor: u32,
        access: Access,
    ) -> Result<()> {
        let key = Device {
            device_type: ty as u32,
            major,
            minor,
        };

        if !access.is_empty() {
            self.map.insert(key, access.bits(), 0)?;
            return Ok(());
        }

        // No access bits left: drop the entry rather than storing a zero value.
        self.map.remove(&key)?;
        Ok(())
    }

    /// Loads the filter program back from its pin path and unpins it.
    fn stop(self: Box<Self>) -> Result<()> {
        let pinned_program = CgroupDevice::from_pin(&self.pin)?;
        pinned_program.unpin()?;
        Ok(())
    }
}
24 changes: 19 additions & 5 deletions src/docker/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@ use tokio::signal::unix::{signal, SignalKind};
use tokio::task::{spawn, JoinHandle};
use tokio_stream::StreamExt;

use super::cgroup::{Access, DeviceAccessController, DeviceAccessControllerV1, DeviceType};
use super::cgroup::{Access, DeviceAccessController, DeviceAccessControllerV1, DeviceAccessControllerV2, DeviceType};
use super::{IoStream, IoStreamSource};

#[derive(Clone)]
pub struct Container {
id: String,
docker: bollard::Docker,
remove_event: Shared<BoxFuture<'static, Option<EventMessage>>>,
cgroup_device_filter: Arc<Mutex<Box<dyn DeviceAccessController + Send>>>,
cgroup_device_filter: Arc<Mutex<Option<Box<dyn DeviceAccessController + Send>>>>,
}

impl Container {
Expand All @@ -40,13 +40,19 @@ impl Container {
.shared();

let cgroup_device_filter: Box<dyn DeviceAccessController + Send> =
Box::new(DeviceAccessControllerV1::new(id)?);
match DeviceAccessControllerV2::new(id) {
Ok(v) => Box::new(v),
Err(err) => match DeviceAccessControllerV1::new(id) {
Ok(v) => Box::new(v),
Err(_) => Err(err).context("neither cgroup v1 and cgroup v2 works")?,
},
};

Ok(Self {
id: id.to_owned(),
docker: docker.clone(),
remove_event: remove_evevnt,
cgroup_device_filter: Arc::new(Mutex::new(cgroup_device_filter)),
cgroup_device_filter: Arc::new(Mutex::new(Some(cgroup_device_filter))),
})
}

Expand Down Expand Up @@ -83,6 +89,14 @@ impl Container {
.context("no destroy event")?;
}

// Stop the cgroup device filter. Only do so once we're sure that the container is removed.
self.cgroup_device_filter
.lock()
.unwrap()
.take()
.unwrap()
.stop()?;

Ok(())
}

Expand Down Expand Up @@ -229,7 +243,7 @@ impl Container {
let controller = self.cgroup_device_filter.clone();
tokio::task::spawn_blocking(move || -> Result<()> {
let mut controller = controller.lock().unwrap();
controller.set_permission(
controller.as_mut().unwrap().set_permission(
DeviceType::Character,
major,
minor,
Expand Down
8 changes: 2 additions & 6 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ use cli::{Action, Device, Symlink};
use docker::{Container, Docker};
use hotplug::{Event as HotPlugEvent, HotPlug, PluggedDevice};

use std::fmt::Display;
use std::pin::pin;
use std::{fmt::Display, path::Path};
use tokio_stream::StreamExt;

use anyhow::{bail, Context, Result};
use anyhow::{Context, Result};
use clap::Parser;
use clap_verbosity_flag::{InfoLevel, LogLevel, Verbosity};
use log::info;
Expand Down Expand Up @@ -98,10 +98,6 @@ fn run_hotplug(
async fn run(param: cli::Run, verbosity: Verbosity<InfoLevel>) -> Result<u8> {
let mut status = 0;

if !Path::new("/sys/fs/cgroup/devices/").is_dir() {
bail!("Could not find cgroup v1");
}

let docker = Docker::connect_with_defaults()?;
let container = docker.run(param.docker_args).await?;
drop(container.pipe_signals());
Expand Down

0 comments on commit e0de0ed

Please sign in to comment.