Skip to content

Commit 0d5b5a9

Browse files
committed
manager: Introduce SystemdManager
Systemd manager takes cgroups path in the format of "parent:scope_prefix:name" to create and manipulate cgroups through systemd. It does value conversions for resources defined in the Linux resources from the OCI spec, such as CPU quota, period, etc. Signed-off-by: Xuewei Niu <[email protected]>
1 parent 2daa80e commit 0d5b5a9

File tree

3 files changed

+339
-0
lines changed

3 files changed

+339
-0
lines changed

src/manager/error.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
//
55

66
use crate::fs::error::Error as CgroupfsError;
7+
use crate::systemd::error::Error as SystemdError;
78

89
pub type Result<T> = std::result::Result<T, Error>;
910

@@ -17,4 +18,7 @@ pub enum Error {
1718

1819
#[error("cgroupfs error: {0}")]
1920
Cgroupfs(#[from] CgroupfsError),
21+
22+
#[error("systemd error: {0}")]
23+
Systemd(#[from] SystemdError),
2024
}

src/manager/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ mod error;
77
pub use error::{Error, Result};
88
mod fs;
99
pub use fs::FsManager;
10+
mod systemd;
11+
pub use systemd::SystemdManager;
1012
mod conv;
1113

1214
use oci_spec::runtime::LinuxResources;

src/manager/systemd.rs

Lines changed: 333 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,333 @@
1+
// Copyright (c) 2025 Ant Group
2+
//
3+
// SPDX-License-Identifier: Apache-2.0 or MIT
4+
//
5+
6+
use std::collections::HashMap;
7+
8+
use oci_spec::runtime::{LinuxCpu, LinuxMemory, LinuxPids, LinuxResources};
9+
10+
use crate::manager::conv;
11+
use crate::manager::error::{Error, Result};
12+
use crate::manager::fs::{join_path, FsManager};
13+
use crate::systemd::{
14+
cpu, cpuset, memory, pids, Property, SystemdCgroup, DEFAULT_SLICE, SCOPE_SUFFIX, SLICE_SUFFIX,
15+
};
16+
use crate::{CgroupPid, FreezerState, Manager, Stats};
17+
18+
/// Default kernel value for cpu quota period is 100000 us (100 ms), same
19+
/// for v1 [1] and v2 [2].
20+
///
21+
/// 1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html
22+
/// 2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
23+
const DEFAULT_CPU_QUOTA_PERIOD: u64 = 100_000; // 100ms
24+
25+
pub struct SystemdManager {
26+
/// The name of slice
27+
slice: String,
28+
/// The name of unit
29+
unit: String,
30+
/// Systemd cgroup
31+
cgroup: SystemdCgroup,
32+
/// Cgroupfs manager
33+
fs_manager: FsManager,
34+
}
35+
36+
impl SystemdManager {
37+
/// Create a new `SystemdManager` from a cgroup path.
38+
///
39+
/// # Arguments
40+
///
41+
/// * `path` - A string slice that holds the cgroup path in the format
42+
/// "parent:scope_prefix:name".
43+
pub fn new(path: &str) -> Result<Self> {
44+
let parts: Vec<&str> = path.split(':').collect();
45+
if parts.len() != 3 {
46+
return Err(Error::InvalidArgument);
47+
}
48+
49+
let slice = if parts[0].is_empty() {
50+
DEFAULT_SLICE.to_string()
51+
} else {
52+
parts[0].to_string()
53+
};
54+
55+
let slice_base = expand_slice(&slice)?;
56+
let unit = new_unit_name(parts[1], parts[2]);
57+
58+
let fs_base = join_path(&slice_base, &unit);
59+
let fs_manager = FsManager::load(&fs_base)?;
60+
61+
let cgroup = SystemdCgroup::new(&slice, &unit)?;
62+
63+
Ok(Self {
64+
slice,
65+
unit,
66+
fs_manager,
67+
cgroup,
68+
})
69+
}
70+
}
71+
72+
impl SystemdManager {
73+
/// Get the slice name.
74+
pub fn slice(&self) -> &str {
75+
&self.slice
76+
}
77+
78+
/// Get the unit name.
79+
pub fn unit(&self) -> &str {
80+
&self.unit
81+
}
82+
83+
/// Get the cgroup path, see `FsManager::paths()`.
84+
pub fn paths(&self) -> &HashMap<String, String> {
85+
self.fs_manager.paths()
86+
}
87+
88+
/// Get the cgroup mountpoints, see `FsManager::mounts()`.
89+
pub fn mounts(&self) -> &HashMap<String, String> {
90+
self.fs_manager.mounts()
91+
}
92+
93+
fn set_cpuset(
94+
&self,
95+
props: &mut Vec<Property>,
96+
linux_cpu: &LinuxCpu,
97+
systemd_version: usize,
98+
) -> Result<()> {
99+
if let Some(cpus) = linux_cpu.cpus().as_ref() {
100+
let (id, value) = cpuset::cpuset_cpus(cpus, systemd_version)?;
101+
props.push((id, value.into()));
102+
}
103+
104+
if let Some(mems) = linux_cpu.mems().as_ref() {
105+
let (id, value) = cpuset::cpuset_mems(mems, systemd_version)?;
106+
props.push((id, value.into()));
107+
}
108+
109+
Ok(())
110+
}
111+
112+
fn set_cpu(
113+
&self,
114+
props: &mut Vec<Property>,
115+
linux_cpu: &LinuxCpu,
116+
systemd_version: usize,
117+
) -> Result<()> {
118+
if let Some(shares) = linux_cpu.shares() {
119+
let shares = if self.v2() {
120+
conv::cpu_shares_to_cgroup_v2(shares)
121+
} else {
122+
shares
123+
};
124+
let (id, value) = cpu::shares(shares, self.v2())?;
125+
props.push((id, value.into()));
126+
}
127+
128+
let period = linux_cpu.period().unwrap_or(0);
129+
let quota = linux_cpu.quota().unwrap_or(0);
130+
131+
if period != 0 {
132+
let (id, value) = cpu::period(period, systemd_version)?;
133+
props.push((id, value.into()));
134+
}
135+
136+
if period != 0 || quota != 0 {
137+
// Corresponds to USEC_INFINITY in systemd
138+
let mut cpu_quota_per_sec_usec = u64::MAX;
139+
let mut period = period;
140+
if quota > 0 {
141+
if period == 0 {
142+
period = DEFAULT_CPU_QUOTA_PERIOD;
143+
}
144+
// systemd converts CPUQuotaPerSecUSec (microseconds per
145+
// CPU second) to CPUQuota (integer percentage of CPU)
146+
// internally. This means that if a fractional percent of
147+
// CPU is indicated by Resources.CpuQuota, we need to round
148+
// up to the nearest 10ms (1% of a second) such that child
149+
// cgroups can set the cpu.cfs_quota_us they expect.
150+
cpu_quota_per_sec_usec = ((quota as u64) * 1_000_000) / period;
151+
if cpu_quota_per_sec_usec % 10_000 != 0 {
152+
cpu_quota_per_sec_usec = (cpu_quota_per_sec_usec / 10_000 + 1) * 10_000;
153+
}
154+
}
155+
let (id, value) = cpu::quota(cpu_quota_per_sec_usec)?;
156+
props.push((id, value.into()));
157+
}
158+
159+
Ok(())
160+
}
161+
162+
fn set_memory(&self, props: &mut Vec<Property>, linux_memory: &LinuxMemory) -> Result<()> {
163+
let v2 = self.v2();
164+
165+
let mem_limit = linux_memory.limit().unwrap_or(0);
166+
if mem_limit != 0 {
167+
let (id, value) = memory::limit(mem_limit, v2)?;
168+
props.push((id, value.into()));
169+
}
170+
171+
let reservation = linux_memory.reservation().unwrap_or(0);
172+
if reservation != 0 && v2 {
173+
let (id, value) = memory::low(reservation, v2)?;
174+
props.push((id, value.into()));
175+
}
176+
177+
let memswap_limit = linux_memory.swap().unwrap_or(0);
178+
if memswap_limit != 0 && v2 {
179+
let memswap_limit = conv::memory_swap_to_cgroup_v2(memswap_limit, mem_limit)?;
180+
let (id, value) = memory::swap(memswap_limit, v2)?;
181+
props.push((id, value.into()));
182+
}
183+
184+
Ok(())
185+
}
186+
187+
fn set_pids(&self, props: &mut Vec<Property>, linux_pids: &LinuxPids) -> Result<()> {
188+
let limit = linux_pids.limit();
189+
if limit == -1 || limit > 0 {
190+
let (id, value) = pids::max(limit)?;
191+
props.push((id, value.into()));
192+
}
193+
194+
Ok(())
195+
}
196+
}
197+
198+
impl Manager for SystemdManager {
199+
fn apply(&self, pid: CgroupPid) -> Result<()> {
200+
if self.cgroup.exists()? {
201+
let subcgroup = self.fs_manager.subcgroup();
202+
self.cgroup.add_process(pid, subcgroup)?;
203+
204+
return Ok(());
205+
}
206+
207+
self.cgroup.start(pid)?;
208+
// The fs_manager was created in load mode, which doesn't create
209+
// the cgroups. So we create them here.
210+
self.fs_manager.cgroup.create()?;
211+
212+
Ok(())
213+
}
214+
215+
fn cgroup_path(&self, subsystem: Option<&str>) -> Result<String> {
216+
self.fs_manager.cgroup_path(subsystem)
217+
}
218+
219+
fn destroy(&mut self) -> Result<()> {
220+
self.cgroup.kill()?;
221+
self.fs_manager.destroy()?;
222+
223+
Ok(())
224+
}
225+
226+
fn enable_cpus_topdown(&self, cpus: &str) -> Result<()> {
227+
self.fs_manager.enable_cpus_topdown(cpus)
228+
}
229+
230+
fn freeze(&self, state: FreezerState) -> Result<()> {
231+
match state {
232+
FreezerState::Thawed => self.cgroup.thaw()?,
233+
FreezerState::Frozen => self.cgroup.freeze()?,
234+
FreezerState::Freezing => return Err(Error::InvalidArgument),
235+
}
236+
237+
Ok(())
238+
}
239+
240+
fn pids(&self) -> Result<Vec<CgroupPid>> {
241+
self.fs_manager.pids()
242+
}
243+
244+
fn set(&self, resources: &LinuxResources) -> Result<()> {
245+
let mut props = vec![];
246+
247+
let systemd_version = self.cgroup.systemd_version()?;
248+
249+
if let Some(linux_cpu) = resources.cpu() {
250+
self.set_cpuset(&mut props, linux_cpu, systemd_version)?;
251+
self.set_cpu(&mut props, linux_cpu, systemd_version)?;
252+
}
253+
254+
if let Some(linux_memory) = resources.memory() {
255+
self.set_memory(&mut props, linux_memory)?;
256+
}
257+
258+
if let Some(linux_pids) = resources.pids() {
259+
self.set_pids(&mut props, linux_pids)?;
260+
}
261+
262+
Ok(())
263+
}
264+
265+
fn stats(&self) -> Result<Stats> {
266+
self.fs_manager.stats()
267+
}
268+
269+
fn systemd(&self) -> bool {
270+
true
271+
}
272+
273+
fn v2(&self) -> bool {
274+
self.fs_manager.v2()
275+
}
276+
}
277+
278+
/// Expand a slice name to a full path in the filesystem.
279+
///
280+
/// # Arguments
281+
///
282+
/// * `slice` - A string slice that holds the slice name in the format
283+
/// "xxx-yyy-zzz.slice".
284+
///
285+
/// # Returns
286+
///
287+
/// A string that represents the full path of the slice in the filesystem.
288+
/// In the above case, the value would be "xxx/xxx-yyy/xxx-yyy-zzz.slice".
289+
fn expand_slice(slice: &str) -> Result<String> {
290+
// Name has to end with ".slice", but can't be just ".slice".
291+
if !slice.ends_with(SLICE_SUFFIX) || slice.len() < SLICE_SUFFIX.len() {
292+
return Err(Error::InvalidArgument);
293+
}
294+
295+
// Path-separators are not allowed.
296+
if slice.contains('/') {
297+
return Err(Error::InvalidArgument);
298+
}
299+
300+
let name = slice.trim_end_matches(SLICE_SUFFIX);
301+
302+
// If input was -.slice, we should just return root now
303+
if name == "-" {
304+
return Ok("".to_string());
305+
}
306+
307+
let mut slice_path = String::new();
308+
let mut prefix = String::new();
309+
for sub_slice in name.split('-') {
310+
if sub_slice.is_empty() {
311+
return Err(Error::InvalidArgument);
312+
}
313+
314+
slice_path = format!("{}/{}{}{}", slice_path, prefix, sub_slice, SLICE_SUFFIX);
315+
prefix = format!("{}{}-", prefix, sub_slice);
316+
}
317+
318+
// We need a relative path, so remove the first slash.
319+
slice_path.remove(0);
320+
321+
Ok(slice_path)
322+
}
323+
324+
fn new_unit_name(scope_prefix: &str, name: &str) -> String {
325+
// By default, we create a scope unless the user explicitly asks
326+
// for a slice.
327+
if !name.ends_with(SLICE_SUFFIX) {
328+
// {scope_prefix}-{name}.scope
329+
return format!("{}-{}{}", scope_prefix, name, SCOPE_SUFFIX);
330+
}
331+
332+
name.to_string()
333+
}

0 commit comments

Comments
 (0)