|
| 1 | +// Copyright (c) 2025 Ant Group |
| 2 | +// |
| 3 | +// SPDX-License-Identifier: Apache-2.0 or MIT |
| 4 | +// |
| 5 | + |
| 6 | +use std::collections::HashMap; |
| 7 | + |
| 8 | +use oci_spec::runtime::{LinuxCpu, LinuxMemory, LinuxPids, LinuxResources}; |
| 9 | + |
| 10 | +use crate::manager::conv; |
| 11 | +use crate::manager::error::{Error, Result}; |
| 12 | +use crate::manager::fs::{join_path, FsManager}; |
| 13 | +use crate::systemd::{ |
| 14 | + cpu, cpuset, memory, pids, Property, SystemdCgroup, DEFAULT_SLICE, SCOPE_SUFFIX, SLICE_SUFFIX, |
| 15 | +}; |
| 16 | +use crate::{CgroupPid, FreezerState, Manager, Stats}; |
| 17 | + |
/// Kernel default for the CPU quota period: 100000 us (100 ms). The value
/// is the same for cgroup v1 [1] and cgroup v2 [2].
///
/// 1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html
/// 2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
const DEFAULT_CPU_QUOTA_PERIOD: u64 = 100 * 1_000; // 100 ms, in microseconds
| 24 | + |
| 25 | +pub struct SystemdManager { |
| 26 | + /// The name of slice |
| 27 | + slice: String, |
| 28 | + /// The name of unit |
| 29 | + unit: String, |
| 30 | + /// Systemd cgroup |
| 31 | + cgroup: SystemdCgroup, |
| 32 | + /// Cgroupfs manager |
| 33 | + fs_manager: FsManager, |
| 34 | +} |
| 35 | + |
| 36 | +impl SystemdManager { |
| 37 | + /// Create a new `SystemdManager` from a cgroup path. |
| 38 | + /// |
| 39 | + /// # Arguments |
| 40 | + /// |
| 41 | + /// * `path` - A string slice that holds the cgroup path in the format |
| 42 | + /// "parent:scope_prefix:name". |
| 43 | + pub fn new(path: &str) -> Result<Self> { |
| 44 | + let parts: Vec<&str> = path.split(':').collect(); |
| 45 | + if parts.len() != 3 { |
| 46 | + return Err(Error::InvalidArgument); |
| 47 | + } |
| 48 | + |
| 49 | + let slice = if parts[0].is_empty() { |
| 50 | + DEFAULT_SLICE.to_string() |
| 51 | + } else { |
| 52 | + parts[0].to_string() |
| 53 | + }; |
| 54 | + |
| 55 | + let slice_base = expand_slice(&slice)?; |
| 56 | + let unit = new_unit_name(parts[1], parts[2]); |
| 57 | + |
| 58 | + let fs_base = join_path(&slice_base, &unit); |
| 59 | + let fs_manager = FsManager::load(&fs_base)?; |
| 60 | + |
| 61 | + let cgroup = SystemdCgroup::new(&slice, &unit)?; |
| 62 | + |
| 63 | + Ok(Self { |
| 64 | + slice, |
| 65 | + unit, |
| 66 | + fs_manager, |
| 67 | + cgroup, |
| 68 | + }) |
| 69 | + } |
| 70 | +} |
| 71 | + |
| 72 | +impl SystemdManager { |
| 73 | + /// Get the slice name. |
| 74 | + pub fn slice(&self) -> &str { |
| 75 | + &self.slice |
| 76 | + } |
| 77 | + |
| 78 | + /// Get the unit name. |
| 79 | + pub fn unit(&self) -> &str { |
| 80 | + &self.unit |
| 81 | + } |
| 82 | + |
| 83 | + /// Get the cgroup path, see `FsManager::paths()`. |
| 84 | + pub fn paths(&self) -> &HashMap<String, String> { |
| 85 | + self.fs_manager.paths() |
| 86 | + } |
| 87 | + |
| 88 | + /// Get the cgroup mountpoints, see `FsManager::mounts()`. |
| 89 | + pub fn mounts(&self) -> &HashMap<String, String> { |
| 90 | + self.fs_manager.mounts() |
| 91 | + } |
| 92 | + |
| 93 | + fn set_cpuset( |
| 94 | + &self, |
| 95 | + props: &mut Vec<Property>, |
| 96 | + linux_cpu: &LinuxCpu, |
| 97 | + systemd_version: usize, |
| 98 | + ) -> Result<()> { |
| 99 | + if let Some(cpus) = linux_cpu.cpus().as_ref() { |
| 100 | + let (id, value) = cpuset::cpuset_cpus(cpus, systemd_version)?; |
| 101 | + props.push((id, value.into())); |
| 102 | + } |
| 103 | + |
| 104 | + if let Some(mems) = linux_cpu.mems().as_ref() { |
| 105 | + let (id, value) = cpuset::cpuset_mems(mems, systemd_version)?; |
| 106 | + props.push((id, value.into())); |
| 107 | + } |
| 108 | + |
| 109 | + Ok(()) |
| 110 | + } |
| 111 | + |
| 112 | + fn set_cpu( |
| 113 | + &self, |
| 114 | + props: &mut Vec<Property>, |
| 115 | + linux_cpu: &LinuxCpu, |
| 116 | + systemd_version: usize, |
| 117 | + ) -> Result<()> { |
| 118 | + if let Some(shares) = linux_cpu.shares() { |
| 119 | + let shares = if self.v2() { |
| 120 | + conv::cpu_shares_to_cgroup_v2(shares) |
| 121 | + } else { |
| 122 | + shares |
| 123 | + }; |
| 124 | + let (id, value) = cpu::shares(shares, self.v2())?; |
| 125 | + props.push((id, value.into())); |
| 126 | + } |
| 127 | + |
| 128 | + let period = linux_cpu.period().unwrap_or(0); |
| 129 | + let quota = linux_cpu.quota().unwrap_or(0); |
| 130 | + |
| 131 | + if period != 0 { |
| 132 | + let (id, value) = cpu::period(period, systemd_version)?; |
| 133 | + props.push((id, value.into())); |
| 134 | + } |
| 135 | + |
| 136 | + if period != 0 || quota != 0 { |
| 137 | + // Corresponds to USEC_INFINITY in systemd |
| 138 | + let mut cpu_quota_per_sec_usec = u64::MAX; |
| 139 | + let mut period = period; |
| 140 | + if quota > 0 { |
| 141 | + if period == 0 { |
| 142 | + period = DEFAULT_CPU_QUOTA_PERIOD; |
| 143 | + } |
| 144 | + // systemd converts CPUQuotaPerSecUSec (microseconds per |
| 145 | + // CPU second) to CPUQuota (integer percentage of CPU) |
| 146 | + // internally. This means that if a fractional percent of |
| 147 | + // CPU is indicated by Resources.CpuQuota, we need to round |
| 148 | + // up to the nearest 10ms (1% of a second) such that child |
| 149 | + // cgroups can set the cpu.cfs_quota_us they expect. |
| 150 | + cpu_quota_per_sec_usec = ((quota as u64) * 1_000_000) / period; |
| 151 | + if cpu_quota_per_sec_usec % 10_000 != 0 { |
| 152 | + cpu_quota_per_sec_usec = (cpu_quota_per_sec_usec / 10_000 + 1) * 10_000; |
| 153 | + } |
| 154 | + } |
| 155 | + let (id, value) = cpu::quota(cpu_quota_per_sec_usec)?; |
| 156 | + props.push((id, value.into())); |
| 157 | + } |
| 158 | + |
| 159 | + Ok(()) |
| 160 | + } |
| 161 | + |
| 162 | + fn set_memory(&self, props: &mut Vec<Property>, linux_memory: &LinuxMemory) -> Result<()> { |
| 163 | + let v2 = self.v2(); |
| 164 | + |
| 165 | + let mem_limit = linux_memory.limit().unwrap_or(0); |
| 166 | + if mem_limit != 0 { |
| 167 | + let (id, value) = memory::limit(mem_limit, v2)?; |
| 168 | + props.push((id, value.into())); |
| 169 | + } |
| 170 | + |
| 171 | + let reservation = linux_memory.reservation().unwrap_or(0); |
| 172 | + if reservation != 0 && v2 { |
| 173 | + let (id, value) = memory::low(reservation, v2)?; |
| 174 | + props.push((id, value.into())); |
| 175 | + } |
| 176 | + |
| 177 | + let memswap_limit = linux_memory.swap().unwrap_or(0); |
| 178 | + if memswap_limit != 0 && v2 { |
| 179 | + let memswap_limit = conv::memory_swap_to_cgroup_v2(memswap_limit, mem_limit)?; |
| 180 | + let (id, value) = memory::swap(memswap_limit, v2)?; |
| 181 | + props.push((id, value.into())); |
| 182 | + } |
| 183 | + |
| 184 | + Ok(()) |
| 185 | + } |
| 186 | + |
| 187 | + fn set_pids(&self, props: &mut Vec<Property>, linux_pids: &LinuxPids) -> Result<()> { |
| 188 | + let limit = linux_pids.limit(); |
| 189 | + if limit == -1 || limit > 0 { |
| 190 | + let (id, value) = pids::max(limit)?; |
| 191 | + props.push((id, value.into())); |
| 192 | + } |
| 193 | + |
| 194 | + Ok(()) |
| 195 | + } |
| 196 | +} |
| 197 | + |
| 198 | +impl Manager for SystemdManager { |
| 199 | + fn apply(&self, pid: CgroupPid) -> Result<()> { |
| 200 | + if self.cgroup.exists()? { |
| 201 | + let subcgroup = self.fs_manager.subcgroup(); |
| 202 | + self.cgroup.add_process(pid, subcgroup)?; |
| 203 | + |
| 204 | + return Ok(()); |
| 205 | + } |
| 206 | + |
| 207 | + self.cgroup.start(pid)?; |
| 208 | + // The fs_manager was created in load mode, which doesn't create |
| 209 | + // the cgroups. So we create them here. |
| 210 | + self.fs_manager.cgroup.create()?; |
| 211 | + |
| 212 | + Ok(()) |
| 213 | + } |
| 214 | + |
| 215 | + fn cgroup_path(&self, subsystem: Option<&str>) -> Result<String> { |
| 216 | + self.fs_manager.cgroup_path(subsystem) |
| 217 | + } |
| 218 | + |
| 219 | + fn destroy(&mut self) -> Result<()> { |
| 220 | + self.cgroup.kill()?; |
| 221 | + self.fs_manager.destroy()?; |
| 222 | + |
| 223 | + Ok(()) |
| 224 | + } |
| 225 | + |
| 226 | + fn enable_cpus_topdown(&self, cpus: &str) -> Result<()> { |
| 227 | + self.fs_manager.enable_cpus_topdown(cpus) |
| 228 | + } |
| 229 | + |
| 230 | + fn freeze(&self, state: FreezerState) -> Result<()> { |
| 231 | + match state { |
| 232 | + FreezerState::Thawed => self.cgroup.thaw()?, |
| 233 | + FreezerState::Frozen => self.cgroup.freeze()?, |
| 234 | + FreezerState::Freezing => return Err(Error::InvalidArgument), |
| 235 | + } |
| 236 | + |
| 237 | + Ok(()) |
| 238 | + } |
| 239 | + |
| 240 | + fn pids(&self) -> Result<Vec<CgroupPid>> { |
| 241 | + self.fs_manager.pids() |
| 242 | + } |
| 243 | + |
| 244 | + fn set(&self, resources: &LinuxResources) -> Result<()> { |
| 245 | + let mut props = vec![]; |
| 246 | + |
| 247 | + let systemd_version = self.cgroup.systemd_version()?; |
| 248 | + |
| 249 | + if let Some(linux_cpu) = resources.cpu() { |
| 250 | + self.set_cpuset(&mut props, linux_cpu, systemd_version)?; |
| 251 | + self.set_cpu(&mut props, linux_cpu, systemd_version)?; |
| 252 | + } |
| 253 | + |
| 254 | + if let Some(linux_memory) = resources.memory() { |
| 255 | + self.set_memory(&mut props, linux_memory)?; |
| 256 | + } |
| 257 | + |
| 258 | + if let Some(linux_pids) = resources.pids() { |
| 259 | + self.set_pids(&mut props, linux_pids)?; |
| 260 | + } |
| 261 | + |
| 262 | + Ok(()) |
| 263 | + } |
| 264 | + |
| 265 | + fn stats(&self) -> Result<Stats> { |
| 266 | + self.fs_manager.stats() |
| 267 | + } |
| 268 | + |
| 269 | + fn systemd(&self) -> bool { |
| 270 | + true |
| 271 | + } |
| 272 | + |
| 273 | + fn v2(&self) -> bool { |
| 274 | + self.fs_manager.v2() |
| 275 | + } |
| 276 | +} |
| 277 | + |
| 278 | +/// Expand a slice name to a full path in the filesystem. |
| 279 | +/// |
| 280 | +/// # Arguments |
| 281 | +/// |
| 282 | +/// * `slice` - A string slice that holds the slice name in the format |
| 283 | +/// "xxx-yyy-zzz.slice". |
| 284 | +/// |
| 285 | +/// # Returns |
| 286 | +/// |
| 287 | +/// A string that represents the full path of the slice in the filesystem. |
| 288 | +/// In the above case, the value would be "xxx/xxx-yyy/xxx-yyy-zzz.slice". |
| 289 | +fn expand_slice(slice: &str) -> Result<String> { |
| 290 | + // Name has to end with ".slice", but can't be just ".slice". |
| 291 | + if !slice.ends_with(SLICE_SUFFIX) || slice.len() < SLICE_SUFFIX.len() { |
| 292 | + return Err(Error::InvalidArgument); |
| 293 | + } |
| 294 | + |
| 295 | + // Path-separators are not allowed. |
| 296 | + if slice.contains('/') { |
| 297 | + return Err(Error::InvalidArgument); |
| 298 | + } |
| 299 | + |
| 300 | + let name = slice.trim_end_matches(SLICE_SUFFIX); |
| 301 | + |
| 302 | + // If input was -.slice, we should just return root now |
| 303 | + if name == "-" { |
| 304 | + return Ok("".to_string()); |
| 305 | + } |
| 306 | + |
| 307 | + let mut slice_path = String::new(); |
| 308 | + let mut prefix = String::new(); |
| 309 | + for sub_slice in name.split('-') { |
| 310 | + if sub_slice.is_empty() { |
| 311 | + return Err(Error::InvalidArgument); |
| 312 | + } |
| 313 | + |
| 314 | + slice_path = format!("{}/{}{}{}", slice_path, prefix, sub_slice, SLICE_SUFFIX); |
| 315 | + prefix = format!("{}{}-", prefix, sub_slice); |
| 316 | + } |
| 317 | + |
| 318 | + // We need a relative path, so remove the first slash. |
| 319 | + slice_path.remove(0); |
| 320 | + |
| 321 | + Ok(slice_path) |
| 322 | +} |
| 323 | + |
| 324 | +fn new_unit_name(scope_prefix: &str, name: &str) -> String { |
| 325 | + // By default, we create a scope unless the user explicitly asks |
| 326 | + // for a slice. |
| 327 | + if !name.ends_with(SLICE_SUFFIX) { |
| 328 | + // {scope_prefix}-{name}.scope |
| 329 | + return format!("{}-{}{}", scope_prefix, name, SCOPE_SUFFIX); |
| 330 | + } |
| 331 | + |
| 332 | + name.to_string() |
| 333 | +} |
0 commit comments