Commit 6ec8679

ggml : add CPU backend reference implementation (wip)
This commit introduces a CPU reference implementation for GGML, designed primarily for testing and validation purposes. The new backend is implemented in pure C, ensuring compatibility across a wide range of systems without relying on specific CPU features or optimizations. It is based on the ggml-cpu implementation, with the architecture-specific optimizations removed. Threading support has been kept, including support for OpenMP, but NUMA support has been stripped out. The ggml/src/ggml-cpu-ref/README.md file contains build instructions and usage examples.
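
Since threading is retained, a minimal sketch of the OpenMP work-splitting pattern used by ggml's CPU backends may help orient readers; this is illustrative only — the array, loop body, and variable names are not the backend's actual code:

```cpp
// Minimal sketch, assuming OpenMP is available (GGML_USE_OPENMP).
// Each thread claims a contiguous slice of rows and works on it
// independently, mirroring how the CPU backends split work per op.
#include <omp.h>
#include <cstdio>
#include <vector>

int main() {
    const int n_rows = 100;
    std::vector<float> rows(n_rows, 1.0f);

    #pragma omp parallel
    {
        const int ith = omp_get_thread_num();   // this thread's index
        const int nth = omp_get_num_threads();  // total threads

        const int dr  = (n_rows + nth - 1) / nth;                // rows per thread (ceil)
        const int ir0 = dr * ith;                                // first row for this thread
        const int ir1 = ir0 + dr < n_rows ? ir0 + dr : n_rows;   // one past the last row

        for (int ir = ir0; ir < ir1; ++ir) {
            rows[ir] *= 2.0f; // placeholder for the per-row kernel
        }
    }

    printf("rows[0] = %f\n", rows[0]);
    return 0;
}
```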
1 parent a68f31e commit 6ec8679

29 files changed, +19030 −2 lines changed

ggml/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -220,6 +220,7 @@ set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file
 
 # extra artifacts
 option(GGML_BUILD_TESTS     "ggml: build tests"    ${GGML_STANDALONE})
+option(GGML_CPU_REF_BACKEND "ggml: build reference CPU backend for testing" OFF)
 option(GGML_BUILD_EXAMPLES  "ggml: build examples" ${GGML_STANDALONE})
 
 #

ggml/src/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
@@ -385,6 +385,14 @@ ggml_add_backend(WebGPU)
 ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)
 
+if (GGML_CPU_REF_BACKEND)
+    if (NOT GGML_BACKEND_DL)
+        message(FATAL_ERROR "GGML_CPU_REF_BACKEND requires GGML_BACKEND_DL")
+    endif()
+    add_subdirectory(ggml-cpu-ref)
+    target_compile_definitions(ggml PRIVATE GGML_USE_CPU_REF)
+endif()
+
 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
     target_compile_features   (${target} PRIVATE c_std_11 cxx_std_17) # don't bump

ggml/src/ggml-backend-reg.cpp

Lines changed: 3 additions & 0 deletions
@@ -596,4 +596,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
     if (backend_path) {
         ggml_backend_load(backend_path);
    }
+#ifdef GGML_USE_CPU_REF
+    ggml_backend_load_best("cpu-ref", silent, dir_path);
+#endif
 }
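
For orientation, a minimal sketch of how an application could confirm the reference backend was picked up after loading, using the public registry API from ggml-backend.h; whether the backend registers under a name matching "cpu-ref" is an assumption inferred from the load call above:

```cpp
// Minimal sketch: load all dynamic backends (including cpu-ref when
// GGML_USE_CPU_REF is defined) and enumerate what the registry knows about.
#include "ggml-backend.h"
#include <cstdio>

int main() {
    ggml_backend_load_all();

    for (size_t i = 0; i < ggml_backend_reg_count(); ++i) {
        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
        printf("backend registration: %s\n", ggml_backend_reg_name(reg));
    }
    return 0;
}
```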
ggml/src/ggml-cpu-ref/CMakeLists.txt

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
+# Baseline C-only CPU backend for testing.
+ggml_add_backend_library(ggml-cpu-ref)
+
+target_sources(ggml-cpu-ref PRIVATE
+    ggml-cpu-ref.cpp
+    ggml-cpu-ref.c
+    quants.h
+    quants.c
+    vec.h
+    vec.cpp
+    ops.h
+    ops.cpp
+    repack.h
+    repack.cpp
+    traits.h
+    traits.cpp
+    binary-ops.h
+    binary-ops.cpp
+    unary-ops.h
+    unary-ops.cpp
+    ggml-cpu-impl.h
+    common.h
+)
+
+if (GGML_OPENMP)
+    find_package(OpenMP)
+    if (OpenMP_FOUND)
+        set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
+        target_compile_definitions(ggml-cpu-ref PRIVATE GGML_USE_OPENMP)
+
+        target_link_libraries(ggml-cpu-ref PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+    else()
+        set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
+        message(WARNING "OpenMP not found")
+    endif()
+endif()
+
+target_compile_features(ggml-cpu-ref PRIVATE cxx_std_17)
+target_include_directories(ggml-cpu-ref PRIVATE . ../include)
+
+# Explicitly disable all optimizations and SIMD to ensure pure C
+set_target_properties(ggml-cpu-ref PROPERTIES COMPILE_FLAGS "-O0 -fno-tree-vectorize")
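
As an aside, `-O0 -fno-tree-vectorize` only disables auto-vectorization; a hypothetical compile-time guard (not part of this commit) could additionally make the no-SIMD requirement explicit if target SIMD extensions were ever enabled via `-march`:

```cpp
// Hypothetical sketch (not in the commit): fail the build loudly if the
// compiler was invoked with architecture SIMD extensions enabled anyway.
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || \
    defined(__ARM_NEON) || defined(__riscv_v_intrinsic)
#error "ggml-cpu-ref is meant to be compiled without SIMD extensions"
#endif
```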

ggml/src/ggml-cpu-ref/README.md

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+## GGML CPU Reference Backend
+This is a C-only CPU reference backend implementation intended to be used to
+test CPU backend variants in GGML.
+
+The goal of this backend is to make it possible to test CPU backend variants,
+which are CPU backends that contain optimizations for a specific CPU
+architecture, against a plain C-only implementation that does not use any
+architecture-specific optimizations.
+
+### Building
+This backend must be explicitly enabled when building GGML, which can be done
+using the following CMake options:
+```console
+$ cmake -B build \
+    -DGGML_CPU_REF_BACKEND=ON \
+    -DGGML_BACKEND_DL=ON \
+    -DGGML_CPU_ALL_VARIANTS=ON
+```
+
+### Listing variants
+To list the available variants:
+```console
+$ ./build/bin/test-backend-ops cpu-variants --list
+CPU variants:
+  CPU-alderlake - 12th Gen Intel(R) Core(TM) i7-1260P
+```
+
+### Testing a variant
+To test a variant against the reference backend use the following command:
+```console
+$ ./build/bin/test-backend-ops cpu-variants --variant CPU-alderlake -o ADD
+Testing CPU variant 'CPU-alderlake' against cpu-ref backend...
+
+ADD(type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1): OK
+ADD(type=f16,ne=[1,1,1,1],nr=[32,1,1,1],nf=1): OK
+ADD(type=f16,ne=[1,1,320,320],nr=[1,1,1,1],nf=1): OK
+...
+14471/14471 tests passed
+```
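
The OK verdicts above come from comparing each op's output on the variant backend against the reference backend. A minimal sketch of such a numeric comparison using normalized mean squared error; the tolerance value here is illustrative, not the actual threshold used by test-backend-ops:

```cpp
#include <cstdio>
#include <vector>

// normalized mean squared error between a reference output and a variant output
static double nmse(const std::vector<float> & ref, const std::vector<float> & out) {
    double err = 0.0, norm = 0.0;
    for (size_t i = 0; i < ref.size(); ++i) {
        const double d = out[i] - ref[i];
        err  += d * d;
        norm += (double) ref[i] * ref[i];
    }
    return norm > 0.0 ? err / norm : err;
}

int main() {
    std::vector<float> ref = {1.0f, 2.0f, 3.0f};       // cpu-ref output
    std::vector<float> out = {1.0f, 2.0000005f, 3.0f}; // variant output
    const double max_nmse = 1e-7;                      // illustrative tolerance

    const double e = nmse(ref, out);
    printf("%s (nmse = %g)\n", e <= max_nmse ? "OK" : "FAIL", e);
    return 0;
}
```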
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+#pragma once
+
+// quants.c
+#define quantize_row_q8_0_generic quantize_row_q8_0
+#define quantize_row_q8_1_generic quantize_row_q8_1
+#define quantize_row_q8_K_generic quantize_row_q8_K
+#define ggml_vec_dot_q4_0_q8_0_generic ggml_vec_dot_q4_0_q8_0
+#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
+#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
+#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
+#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
+#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
+#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
+#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
+#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
+#define ggml_vec_dot_q3_K_q8_K_generic ggml_vec_dot_q3_K_q8_K
+#define ggml_vec_dot_q4_K_q8_K_generic ggml_vec_dot_q4_K_q8_K
+#define ggml_vec_dot_q5_K_q8_K_generic ggml_vec_dot_q5_K_q8_K
+#define ggml_vec_dot_q6_K_q8_K_generic ggml_vec_dot_q6_K_q8_K
+#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
+#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
+#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
+#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
+#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
+#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
+#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
+#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
+#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
+
+// repack.cpp
+#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
+#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
+#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
+#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
+#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
+#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
+#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
+#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
+#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
+#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
+#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
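
These defines follow the fallback pattern of the main CPU backend: each portable routine is written with a `_generic` suffix, and the macro renames it to the dispatch name that the rest of the code calls, so no architecture-specific override is ever selected. A toy sketch of the mechanism, with made-up names:

```cpp
#include <cstdio>

// the portable implementation carries a _generic suffix ...
#define toy_scale_generic toy_scale

// ... so this definition actually defines the exported name toy_scale
static void toy_scale_generic(float * x, int n, float s) {
    for (int i = 0; i < n; i++) {
        x[i] *= s;
    }
}

int main() {
    float v[3] = {1.0f, 2.0f, 3.0f};
    toy_scale(v, 3, 2.0f); // callers use the unsuffixed dispatch name
    printf("%f %f %f\n", v[0], v[1], v[2]);
    return 0;
}
```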
ggml/src/ggml-cpu-ref/binary-ops.cpp

Lines changed: 128 additions & 0 deletions

@@ -0,0 +1,128 @@
+#include "binary-ops.h"
+
+static inline float op_add(float a, float b) {
+    return a + b;
+}
+
+static inline float op_sub(float a, float b) {
+    return a - b;
+}
+
+static inline float op_mul(float a, float b) {
+    return a * b;
+}
+
+static inline float op_div(float a, float b) {
+    return a / b;
+}
+
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
+static inline void vec_binary_op_contiguous(const int64_t n, dst_t * z, const src0_t * x, const src1_t * y) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    for (int i = 0; i < n; i++) {
+        z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(y[i])));
+    }
+}
+
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
+static inline void vec_binary_op_non_contiguous(const int64_t n, const int64_t ne10, const int64_t nb10, dst_t * z, const src0_t * x, const src1_t * y) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    for (int i = 0; i < n; i++) {
+        int i10 = i % ne10;
+        const src1_t * y_ptr = (const src1_t *)((const char *)y + i10*nb10);
+        z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(*y_ptr)));
+    }
+}
+
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
+static void apply_binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(dst_t));
+    GGML_ASSERT(nb00 == sizeof(src0_t));
+
+    const auto [ir0, ir1] = get_thread_range(params, src0);
+    const bool is_src1_contiguous = (nb10 == sizeof(src1_t));
+
+    if (!is_src1_contiguous) { // broadcast not implemented yet for non-contiguous
+        GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    }
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne02*ne01);
+        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        const int64_t i13 = i03 % ne13;
+        const int64_t i12 = i02 % ne12;
+        const int64_t i11 = i01 % ne11;
+
+        dst_t        * dst_ptr  = (dst_t        *) ((char       *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+        const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+        const src1_t * src1_ptr = (const src1_t *) ((const char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+        if (is_src1_contiguous) {
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t nr0 = ne00 / ne10;
+
+            for (int64_t r = 0; r < nr0; ++r) {
+                vec_binary_op_contiguous<op>(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+            }
+        } else {
+            vec_binary_op_non_contiguous<op>(ne0, ne10, nb10, dst_ptr, src0_ptr, src1_ptr);
+        }
+    }
+}
+
+// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
+template <float (*op)(float, float)>
+static void binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    /*  */ if (src0->type == GGML_TYPE_F32  && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) { // all f32
+        apply_binary_op<op, float, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && src1->type == GGML_TYPE_F16  && dst->type == GGML_TYPE_F16) { // all f16
+        apply_binary_op<op, ggml_fp16_t, ggml_fp16_t, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
+        apply_binary_op<op, ggml_bf16_t, ggml_bf16_t, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_BF16) {
+        apply_binary_op<op, ggml_bf16_t, float, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) {
+        apply_binary_op<op, ggml_bf16_t, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F16) {
+        apply_binary_op<op, ggml_fp16_t, float, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) {
+        apply_binary_op<op, ggml_fp16_t, float, float>(params, dst);
+    } else {
+        GGML_ABORT("%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
+    }
+}
+
+void ggml_compute_forward_add_non_quantized(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_add>(params, dst);
+}
+
+void ggml_compute_forward_sub(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_sub>(params, dst);
+}
+
+void ggml_compute_forward_mul(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_mul>(params, dst);
+}
+
+void ggml_compute_forward_div(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_div>(params, dst);
+}
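
The modulo indexing in apply_binary_op (i13 = i03 % ne13, and the ne00 / ne10 repeats within a row) is what implements ggml's repeat-style broadcasting: the smaller src1 is reused cyclically across src0. A standalone one-dimensional sketch of the same idea:

```cpp
#include <cstdio>

int main() {
    // src0 has 4 "rows", src1 has 2; src1 wraps around across src0
    const int ne01 = 4, ne11 = 2;
    const float src0[ne01] = {1, 2, 3, 4};
    const float src1[ne11] = {10, 20};
    float dst[ne01];

    for (int i01 = 0; i01 < ne01; ++i01) {
        const int i11 = i01 % ne11; // same wrap-around as in apply_binary_op
        dst[i01] = src0[i01] + src1[i11];
    }

    for (int i = 0; i < ne01; ++i) {
        printf("%g ", dst[i]); // prints: 11 22 13 24
    }
    printf("\n");
    return 0;
}
```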

ggml/src/ggml-cpu-ref/binary-ops.h

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ggml_compute_forward_add_non_quantized(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sub(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_mul(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_div(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+
+#ifdef __cplusplus
+}
+#endif

ggml/src/ggml-cpu-ref/common.h

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+#pragma once
+
+#include "ggml.h"
+#include "traits.h"
+#include "ggml-cpu-impl.h"
+#include "ggml-impl.h"
+#include "simd-mappings.h"
+
+#ifdef __cplusplus
+
+#include <utility>
+
+// convenience functions/macros for use in template calls
+// note: these won't be required after the 'traits' lookup table is used.
+static inline ggml_fp16_t f32_to_f16(float x) {
+    return GGML_CPU_FP32_TO_FP16(x);
+}
+
+static inline float f16_to_f32(ggml_fp16_t x) {
+    return GGML_CPU_FP16_TO_FP32(x);
+}
+
+static inline ggml_bf16_t f32_to_bf16(float x) {
+    return GGML_FP32_TO_BF16(x);
+}
+
+static inline float bf16_to_f32(ggml_bf16_t x) {
+    return GGML_BF16_TO_FP32(x);
+}
+
+static inline float f32_to_f32(float x) {
+    return x;
+}
+
+// TODO - merge this into the traits table, after using row-based conversions
+template <class T>
+struct type_conversion_table;
+
+template <>
+struct type_conversion_table<ggml_fp16_t> {
+    static constexpr float (*to_f32)(ggml_fp16_t) = f16_to_f32;
+    static constexpr ggml_fp16_t (*from_f32)(float) = f32_to_f16;
+};
+
+template <>
+struct type_conversion_table<float> {
+    static constexpr float (*to_f32)(float) = f32_to_f32;
+    static constexpr float (*from_f32)(float) = f32_to_f32;
+};
+
+template <>
+struct type_conversion_table<ggml_bf16_t> {
+    static constexpr float (*to_f32)(ggml_bf16_t) = bf16_to_f32;
+    static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
+};
+
+static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
+    const int64_t ith = params->ith;
+    const int64_t nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr);
+
+    return {ir0, ir1};
+}
+
+#endif
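
get_thread_range splits rows with a ceiling division, so every thread except possibly the last gets dr rows and the final range is clamped to nr. A small standalone check of the arithmetic (not part of the commit):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t nr  = 10; // total rows, e.g. ggml_nrows(src0)
    const int64_t nth = 4;  // number of threads

    for (int64_t ith = 0; ith < nth; ++ith) {
        const int64_t dr  = (nr + nth - 1) / nth;   // ceil(nr / nth) = 3
        const int64_t ir0 = dr * ith;               // first row for this thread
        const int64_t ir1 = std::min(ir0 + dr, nr); // clamp the last range

        printf("thread %lld: rows [%lld, %lld)\n",
               (long long) ith, (long long) ir0, (long long) ir1);
    }
    // output: [0,3) [3,6) [6,9) [9,10)
    return 0;
}
```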
