Skip to content

Commit 2073e39

Browse files
committed
slabinfo: catch errors due to freelist corruption
Freelist corruption causes the "slabinfo" module to crash. Since this is a common symptom of use-after-free bugs, we'd rather give useful information about this case. So don't crash the module: catch the error and report corruption issues at the end. This also helps in certain cases where we are running against a live kernel and the freelist is not corrupt, but it changed by the time we decoded the pointer. As a result, we print different messages depending on the target: for live systems, we say this may be transient, but for core dumps, we say it indicates a potential use-after-free bug. To do this, we have to implement a rather sketchy workaround in order to use the _SlabCacheHelperSlub from drgn. This is manually verified to work on 0.0.25 through 0.0.27, which are the only supported drgn versions. However, we need to work on upstream tweaks to improve the slab helpers, so that we no longer need to rely on the hack. Signed-off-by: Stephen Brennan <[email protected]>
1 parent 80e126e commit 2073e39

File tree

1 file changed

+69
-8
lines changed

1 file changed

+69
-8
lines changed

drgn_tools/slabinfo.py

Lines changed: 69 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,16 @@
44
Helper to view slabinfo data
55
"""
66
import argparse
7+
from typing import List
78
from typing import NamedTuple
89
from typing import Set
910
from typing import Tuple
1011

1112
from drgn import cast
13+
from drgn import FaultError
1214
from drgn import Object
1315
from drgn import Program
16+
from drgn import ProgramFlags
1417
from drgn import Type
1518
from drgn.helpers.linux.cpumask import for_each_present_cpu
1619
from drgn.helpers.linux.list import list_for_each_entry
@@ -40,6 +43,8 @@ class SlabCacheInfo(NamedTuple):
4043
"""Slab size"""
4144
name: str
4245
"""Name of the slab cache"""
46+
freelist_corrupt_cpus: List[int]
47+
"""A list of CPUs for which the freelist was found to be corrupt"""
4348

4449

4550
def _slab_type(prog: Program) -> Type:
@@ -204,19 +209,41 @@ def slub_per_cpu_partial_free(cpu_partial: Object) -> int:
204209
return partial_free
205210

206211

207-
def kmem_cache_slub_info(cache: Object) -> Tuple[int, int]:
212+
class _CpuSlubWrapper:
213+
def __init__(self, obj):
214+
self._obj = obj
215+
216+
def __getattr__(self, key):
217+
if key == "cpu_slab":
218+
raise AttributeError("CpuSlubWrapper!")
219+
return self._obj.__getattribute__(key)
220+
221+
222+
def kmem_cache_slub_info(cache: Object) -> Tuple[int, int, List[int]]:
208223
"""
209224
For given kmem_cache object, parse through each cpu
210225
and get number of total slabs and free objects
211226
227+
If the CPU freelist was corrupt, then we do our best effort to count free
228+
objects, but we may undercount them. We set the corruption flag when this
229+
happens.
230+
212231
:param: ``struct kmem_cache`` drgn object
213-
:returns: total slabs, free objects
232+
:returns: total slabs, free objects, corruption instances
214233
"""
215234
prog = cache.prog_
216235
use_slab = _has_struct_slab(prog)
217236

218237
total_slabs = objects = free_objects = 0
219-
slub_helper = _get_slab_cache_helper(cache)
238+
239+
# The "cpu_slab" variable is used by the slab helper to preload the percpu
240+
# freelists. Not only does this duplicate work we're about to do, but also
241+
# corrupt slab caches will crash this function before we can detect which
242+
# CPU is corrupt. Pretend we have no "cpu_slab" variable when getting the
243+
# helper. This depends on implementation details: we will improve the helper
244+
# upstream to avoid this for the future.
245+
slub_helper = _get_slab_cache_helper(_CpuSlubWrapper(cache))
246+
corrupt = []
220247

221248
for cpuid in for_each_present_cpu(prog):
222249
per_cpu_slab = per_cpu_ptr(cache.cpu_slab, cpuid)
@@ -237,15 +264,25 @@ def kmem_cache_slub_info(cache: Object) -> Tuple[int, int]:
237264
objects = 0
238265

239266
free_objects += objects - page_inuse
240-
cpu_free_objects = slub_get_cpu_freelist_cnt(cpu_freelist, slub_helper)
241-
free_objects += cpu_free_objects
267+
268+
# Easily the most common form of corruption in the slab allocator comes
269+
# from use after free, which overwrites the freelist pointer and causes
270+
# a fault error. Catch this and report it for later.
271+
try:
272+
cpu_free_objects = slub_get_cpu_freelist_cnt(
273+
cpu_freelist, slub_helper
274+
)
275+
except FaultError:
276+
corrupt.append(cpuid)
277+
else:
278+
free_objects += cpu_free_objects
242279

243280
partial_frees = slub_per_cpu_partial_free(cpu_partial)
244281
free_objects += partial_frees
245282

246283
total_slabs += 1
247284

248-
return total_slabs, free_objects
285+
return total_slabs, free_objects, corrupt
249286

250287

251288
def get_kmem_cache_slub_info(cache: Object) -> SlabCacheInfo:
@@ -255,7 +292,7 @@ def get_kmem_cache_slub_info(cache: Object) -> SlabCacheInfo:
255292
:param cache: ``struct kmem_cache`` drgn object
256293
:returns: a :class:`SlabCacheInfo` with statistics about the cache
257294
"""
258-
total_slabs, free_objects = kmem_cache_slub_info(cache)
295+
total_slabs, free_objects, corrupt = kmem_cache_slub_info(cache)
259296
(
260297
nr_slabs,
261298
nr_total_objs,
@@ -280,6 +317,7 @@ def get_kmem_cache_slub_info(cache: Object) -> SlabCacheInfo:
280317
total_slabs,
281318
ssize,
282319
cache.name.string_().decode("utf-8"),
320+
corrupt,
283321
)
284322

285323

@@ -296,19 +334,42 @@ def print_slab_info(prog: Program) -> None:
296334
"NAME",
297335
]
298336
)
337+
corruption = []
299338
for cache in for_each_slab_cache(prog):
300339
slabinfo = get_kmem_cache_slub_info(cache)
340+
maybe_asterisk = ""
341+
if slabinfo.freelist_corrupt_cpus:
342+
maybe_asterisk = "*"
343+
corruption.append(slabinfo)
301344
table.row(
302345
slabinfo.cache.value_(),
303346
slabinfo.objsize,
304-
slabinfo.allocated,
347+
f"{slabinfo.allocated}{maybe_asterisk}",
305348
slabinfo.total,
306349
slabinfo.nr_slabs,
307350
f"{int(slabinfo.ssize / 1024)}k",
308351
slabinfo.name,
309352
)
310353
table.write()
311354

355+
if corruption:
356+
if prog.flags & ProgramFlags.IS_LIVE:
357+
print(
358+
"NOTE: freelist corruption was detected. This is not "
359+
"necessarily an error, as live systems may encounter race "
360+
"conditions."
361+
)
362+
else:
363+
print(
364+
"WARNING: freelist corruption was detected. It is likely that "
365+
"a use-after-free bug occurred."
366+
)
367+
table = FixedTable(["CACHE:<24s", "CORRUPT CPUS"])
368+
for slabinfo in corruption:
369+
cpus = ", ".join(map(str, slabinfo.freelist_corrupt_cpus))
370+
table.row(slabinfo.name, cpus)
371+
table.write()
372+
312373

313374
class SlabInfo(CorelensModule):
314375
"""Print info about each slab cache"""

0 commit comments

Comments
 (0)