Commit 6ec8679

ggml : add CPU backend reference implementation (wip)
This commit introduces a CPU reference implementation for GGML, designed primarily for testing and validation purposes. The new backend is implemented in pure C, ensuring compatibility across a wide range of systems without relying on specific CPU features or optimizations. It is based on the ggml-cpu implementation, with the architecture-specific optimizations removed. Threading support has been kept, including support for OpenMP, but NUMA support has been stripped out. The ggml/src/ggml-cpu-ref/README.md file contains build instructions and usage examples.
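
Since threading is retained, a minimal sketch of the OpenMP work-splitting pattern used by ggml's CPU backends may help orient readers; this is illustrative only — the array, loop body, and variable names are not the backend's actual code:

```cpp
// Minimal sketch, assuming OpenMP is available (GGML_USE_OPENMP).
// Each thread claims a contiguous slice of rows and works on it
// independently, mirroring how the CPU backends split work per op.
#include <omp.h>
#include <cstdio>
#include <vector>

int main() {
    const int n_rows = 100;
    std::vector<float> rows(n_rows, 1.0f);

    #pragma omp parallel
    {
        const int ith = omp_get_thread_num();   // this thread's index
        const int nth = omp_get_num_threads();  // total threads

        const int dr  = (n_rows + nth - 1) / nth;                // rows per thread (ceil)
        const int ir0 = dr * ith;                                // first row for this thread
        const int ir1 = ir0 + dr < n_rows ? ir0 + dr : n_rows;   // one past the last row

        for (int ir = ir0; ir < ir1; ++ir) {
            rows[ir] *= 2.0f; // placeholder for the per-row kernel
        }
    }

    printf("rows[0] = %f\n", rows[0]);
    return 0;
}
```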
1 parent a68f31e commit 6ec8679

29 files changed, +19030 −2 lines changed

ggml/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -220,6 +220,7 @@ set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file
 
 # extra artifacts
 option(GGML_BUILD_TESTS     "ggml: build tests"    ${GGML_STANDALONE})
+option(GGML_CPU_REF_BACKEND "ggml: build reference CPU backend for testing" OFF)
 option(GGML_BUILD_EXAMPLES  "ggml: build examples" ${GGML_STANDALONE})
 
 #

ggml/src/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
@@ -385,6 +385,14 @@ ggml_add_backend(WebGPU)
 ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)
 
+if (GGML_CPU_REF_BACKEND)
+    if (NOT GGML_BACKEND_DL)
+        message(FATAL_ERROR "GGML_CPU_REF_BACKEND requires GGML_BACKEND_DL")
+    endif()
+    add_subdirectory(ggml-cpu-ref)
+    target_compile_definitions(ggml PRIVATE GGML_USE_CPU_REF)
+endif()
+
 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
     target_compile_features   (${target} PRIVATE c_std_11 cxx_std_17) # don't bump

ggml/src/ggml-backend-reg.cpp

Lines changed: 3 additions & 0 deletions
@@ -596,4 +596,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
     if (backend_path) {
         ggml_backend_load(backend_path);
    }
+#ifdef GGML_USE_CPU_REF
+    ggml_backend_load_best("cpu-ref", silent, dir_path);
+#endif
 }
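
For orientation, a minimal sketch of how an application could confirm the reference backend was picked up after loading, using the public registry API from ggml-backend.h; whether the backend registers under a name matching "cpu-ref" is an assumption inferred from the load call above:

```cpp
// Minimal sketch: load all dynamic backends (including cpu-ref when
// GGML_USE_CPU_REF is defined) and enumerate what the registry knows about.
#include "ggml-backend.h"
#include <cstdio>

int main() {
    ggml_backend_load_all();

    for (size_t i = 0; i < ggml_backend_reg_count(); ++i) {
        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
        printf("backend registration: %s\n", ggml_backend_reg_name(reg));
    }
    return 0;
}
```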
ggml/src/ggml-cpu-ref/CMakeLists.txt

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
+# Baseline C-only CPU backend for testing.
+ggml_add_backend_library(ggml-cpu-ref)
+
+target_sources(ggml-cpu-ref PRIVATE
+    ggml-cpu-ref.cpp
+    ggml-cpu-ref.c
+    quants.h
+    quants.c
+    vec.h
+    vec.cpp
+    ops.h
+    ops.cpp
+    repack.h
+    repack.cpp
+    traits.h
+    traits.cpp
+    binary-ops.h
+    binary-ops.cpp
+    unary-ops.h
+    unary-ops.cpp
+    ggml-cpu-impl.h
+    common.h
+)
+
+if (GGML_OPENMP)
+    find_package(OpenMP)
+    if (OpenMP_FOUND)
+        set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
+        target_compile_definitions(ggml-cpu-ref PRIVATE GGML_USE_OPENMP)
+
+        target_link_libraries(ggml-cpu-ref PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+    else()
+        set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
+        message(WARNING "OpenMP not found")
+    endif()
+endif()
+
+target_compile_features(ggml-cpu-ref PRIVATE cxx_std_17)
+target_include_directories(ggml-cpu-ref PRIVATE . ../include)
+
+# Explicitly disable all optimizations and SIMD to ensure pure C
+set_target_properties(ggml-cpu-ref PROPERTIES COMPILE_FLAGS "-O0 -fno-tree-vectorize")
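
As an aside, `-O0 -fno-tree-vectorize` only disables auto-vectorization; a hypothetical compile-time guard (not part of this commit) could additionally make the no-SIMD requirement explicit if target SIMD extensions were ever enabled via `-march`:

```cpp
// Hypothetical sketch (not in the commit): fail the build loudly if the
// compiler was invoked with architecture SIMD extensions enabled anyway.
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || \
    defined(__ARM_NEON) || defined(__riscv_v_intrinsic)
#error "ggml-cpu-ref is meant to be compiled without SIMD extensions"
#endif
```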

ggml/src/ggml-cpu-ref/README.md

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+## GGML CPU Reference Backend
+This is a C-only CPU reference backend implementation intended to be used to
+test CPU backend variants in GGML.
+
+The goal of this backend is to make it possible to test CPU backend variants,
+which are CPU backends that contain optimizations for a specific CPU
+architecture, against a plain C-only implementation that does not use any
+architecture-specific optimizations.
+
+### Building
+This backend must be explicitly enabled when building GGML, which can be done
+using the following CMake options:
+```console
+$ cmake -B build \
+    -DGGML_CPU_REF_BACKEND=ON \
+    -DGGML_BACKEND_DL=ON \
+    -DGGML_CPU_ALL_VARIANTS=ON
+```
+
+### Listing variants
+To list the available variants:
+```console
+$ ./build/bin/test-backend-ops cpu-variants --list
+CPU variants:
+  CPU-alderlake - 12th Gen Intel(R) Core(TM) i7-1260P
+```
+
+### Testing a variant
+To test a variant against the reference backend use the following command:
+```console
+$ ./build/bin/test-backend-ops cpu-variants --variant CPU-alderlake -o ADD
+Testing CPU variant 'CPU-alderlake' against cpu-ref backend...
+
+ADD(type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1): OK
+ADD(type=f16,ne=[1,1,1,1],nr=[32,1,1,1],nf=1): OK
+ADD(type=f16,ne=[1,1,320,320],nr=[1,1,1,1],nf=1): OK
+...
+14471/14471 tests passed
+```
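
The OK verdicts above come from comparing each op's output on the variant backend against the reference backend. A minimal sketch of such a numeric comparison using normalized mean squared error; the tolerance value here is illustrative, not the actual threshold used by test-backend-ops:

```cpp
#include <cstdio>
#include <vector>

// normalized mean squared error between a reference output and a variant output
static double nmse(const std::vector<float> & ref, const std::vector<float> & out) {
    double err = 0.0, norm = 0.0;
    for (size_t i = 0; i < ref.size(); ++i) {
        const double d = out[i] - ref[i];
        err  += d * d;
        norm += (double) ref[i] * ref[i];
    }
    return norm > 0.0 ? err / norm : err;
}

int main() {
    std::vector<float> ref = {1.0f, 2.0f, 3.0f};       // cpu-ref output
    std::vector<float> out = {1.0f, 2.0000005f, 3.0f}; // variant output
    const double max_nmse = 1e-7;                      // illustrative tolerance

    const double e = nmse(ref, out);
    printf("%s (nmse = %g)\n", e <= max_nmse ? "OK" : "FAIL", e);
    return 0;
}
```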
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+#pragma once
+
+// quants.c
+#define quantize_row_q8_0_generic quantize_row_q8_0
+#define quantize_row_q8_1_generic quantize_row_q8_1
+#define quantize_row_q8_K_generic quantize_row_q8_K
+#define ggml_vec_dot_q4_0_q8_0_generic ggml_vec_dot_q4_0_q8_0
+#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
+#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
+#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
+#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
+#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
+#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
+#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
+#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
+#define ggml_vec_dot_q3_K_q8_K_generic ggml_vec_dot_q3_K_q8_K
+#define ggml_vec_dot_q4_K_q8_K_generic ggml_vec_dot_q4_K_q8_K
+#define ggml_vec_dot_q5_K_q8_K_generic ggml_vec_dot_q5_K_q8_K
+#define ggml_vec_dot_q6_K_q8_K_generic ggml_vec_dot_q6_K_q8_K
+#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
+#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
+#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
+#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
+#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
+#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
+#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
+#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
+#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
+
+// repack.cpp
+#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
+#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
+#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
+#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
+#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
+#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
+#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
+#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
+#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
+#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
+#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
+#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
+#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
+#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
+#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
+#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
+#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
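
These defines follow the fallback pattern of the main CPU backend: each portable routine is written with a `_generic` suffix, and the macro renames it to the dispatch name that the rest of the code calls, so no architecture-specific override is ever selected. A toy sketch of the mechanism, with made-up names:

```cpp
#include <cstdio>

// the portable implementation carries a _generic suffix ...
#define toy_scale_generic toy_scale

// ... so this definition actually defines the exported name toy_scale
static void toy_scale_generic(float * x, int n, float s) {
    for (int i = 0; i < n; i++) {
        x[i] *= s;
    }
}

int main() {
    float v[3] = {1.0f, 2.0f, 3.0f};
    toy_scale(v, 3, 2.0f); // callers use the unsuffixed dispatch name
    printf("%f %f %f\n", v[0], v[1], v[2]);
    return 0;
}
```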
ggml/src/ggml-cpu-ref/binary-ops.cpp

Lines changed: 128 additions & 0 deletions

@@ -0,0 +1,128 @@
+#include "binary-ops.h"
+
+static inline float op_add(float a, float b) {
+    return a + b;
+}
+
+static inline float op_sub(float a, float b) {
+    return a - b;
+}
+
+static inline float op_mul(float a, float b) {
+    return a * b;
+}
+
+static inline float op_div(float a, float b) {
+    return a / b;
+}
+
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
+static inline void vec_binary_op_contiguous(const int64_t n, dst_t * z, const src0_t * x, const src1_t * y) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    for (int i = 0; i < n; i++) {
+        z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(y[i])));
+    }
+}
+
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
+static inline void vec_binary_op_non_contiguous(const int64_t n, const int64_t ne10, const int64_t nb10, dst_t * z, const src0_t * x, const src1_t * y) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    for (int i = 0; i < n; i++) {
+        int i10 = i % ne10;
+        const src1_t * y_ptr = (const src1_t *)((const char *)y + i10*nb10);
+        z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(*y_ptr)));
+    }
+}
+
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
+static void apply_binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(dst_t));
+    GGML_ASSERT(nb00 == sizeof(src0_t));
+
+    const auto [ir0, ir1] = get_thread_range(params, src0);
+    const bool is_src1_contiguous = (nb10 == sizeof(src1_t));
+
+    if (!is_src1_contiguous) { // broadcast not implemented yet for non-contiguous
+        GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    }
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne02*ne01);
+        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        const int64_t i13 = i03 % ne13;
+        const int64_t i12 = i02 % ne12;
+        const int64_t i11 = i01 % ne11;
+
+        dst_t        * dst_ptr  = (dst_t        *) ((char       *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+        const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+        const src1_t * src1_ptr = (const src1_t *) ((const char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+        if (is_src1_contiguous) {
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t nr0 = ne00 / ne10;
+
+            for (int64_t r = 0; r < nr0; ++r) {
+                vec_binary_op_contiguous<op>(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+            }
+        } else {
+            vec_binary_op_non_contiguous<op>(ne0, ne10, nb10, dst_ptr, src0_ptr, src1_ptr);
+        }
+    }
+}
+
+// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
+template <float (*op)(float, float)>
+static void binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    /*  */ if (src0->type == GGML_TYPE_F32  && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) { // all f32
+        apply_binary_op<op, float, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && src1->type == GGML_TYPE_F16  && dst->type == GGML_TYPE_F16) { // all f16
+        apply_binary_op<op, ggml_fp16_t, ggml_fp16_t, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
+        apply_binary_op<op, ggml_bf16_t, ggml_bf16_t, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_BF16) {
+        apply_binary_op<op, ggml_bf16_t, float, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) {
+        apply_binary_op<op, ggml_bf16_t, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F16) {
+        apply_binary_op<op, ggml_fp16_t, float, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) {
+        apply_binary_op<op, ggml_fp16_t, float, float>(params, dst);
+    } else {
+        GGML_ABORT("%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
+    }
+}
+
+void ggml_compute_forward_add_non_quantized(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_add>(params, dst);
+}
+
+void ggml_compute_forward_sub(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_sub>(params, dst);
+}
+
+void ggml_compute_forward_mul(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_mul>(params, dst);
+}
+
+void ggml_compute_forward_div(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_div>(params, dst);
+}
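
The modulo indexing in apply_binary_op (i13 = i03 % ne13, and the ne00 / ne10 repeats within a row) is what implements ggml's repeat-style broadcasting: the smaller src1 is reused cyclically across src0. A standalone one-dimensional sketch of the same idea:

```cpp
#include <cstdio>

int main() {
    // src0 has 4 "rows", src1 has 2; src1 wraps around across src0
    const int ne01 = 4, ne11 = 2;
    const float src0[ne01] = {1, 2, 3, 4};
    const float src1[ne11] = {10, 20};
    float dst[ne01];

    for (int i01 = 0; i01 < ne01; ++i01) {
        const int i11 = i01 % ne11; // same wrap-around as in apply_binary_op
        dst[i01] = src0[i01] + src1[i11];
    }

    for (int i = 0; i < ne01; ++i) {
        printf("%g ", dst[i]); // prints: 11 22 13 24
    }
    printf("\n");
    return 0;
}
```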

ggml/src/ggml-cpu-ref/binary-ops.h

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ggml_compute_forward_add_non_quantized(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sub(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_mul(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_div(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+
+#ifdef __cplusplus
+}
+#endif

ggml/src/ggml-cpu-ref/common.h

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+#pragma once
+
+#include "ggml.h"
+#include "traits.h"
+#include "ggml-cpu-impl.h"
+#include "ggml-impl.h"
+#include "simd-mappings.h"
+
+#ifdef __cplusplus
+
+#include <utility>
+
+// convenience functions/macros for use in template calls
+// note: these won't be required after the 'traits' lookup table is used.
+static inline ggml_fp16_t f32_to_f16(float x) {
+    return GGML_CPU_FP32_TO_FP16(x);
+}
+
+static inline float f16_to_f32(ggml_fp16_t x) {
+    return GGML_CPU_FP16_TO_FP32(x);
+}
+
+static inline ggml_bf16_t f32_to_bf16(float x) {
+    return GGML_FP32_TO_BF16(x);
+}
+
+static inline float bf16_to_f32(ggml_bf16_t x) {
+    return GGML_BF16_TO_FP32(x);
+}
+
+static inline float f32_to_f32(float x) {
+    return x;
+}
+
+// TODO - merge this into the traits table, after using row-based conversions
+template <class T>
+struct type_conversion_table;
+
+template <>
+struct type_conversion_table<ggml_fp16_t> {
+    static constexpr float (*to_f32)(ggml_fp16_t) = f16_to_f32;
+    static constexpr ggml_fp16_t (*from_f32)(float) = f32_to_f16;
+};
+
+template <>
+struct type_conversion_table<float> {
+    static constexpr float (*to_f32)(float) = f32_to_f32;
+    static constexpr float (*from_f32)(float) = f32_to_f32;
+};
+
+template <>
+struct type_conversion_table<ggml_bf16_t> {
+    static constexpr float (*to_f32)(ggml_bf16_t) = bf16_to_f32;
+    static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
+};
+
+static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
+    const int64_t ith = params->ith;
+    const int64_t nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr);
+
+    return {ir0, ir1};
+}
+
+#endif
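
get_thread_range splits rows with a ceiling division, so every thread except possibly the last gets dr rows and the final range is clamped to nr. A small standalone check of the arithmetic (not part of the commit):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t nr  = 10; // total rows, e.g. ggml_nrows(src0)
    const int64_t nth = 4;  // number of threads

    for (int64_t ith = 0; ith < nth; ++ith) {
        const int64_t dr  = (nr + nth - 1) / nth;   // ceil(nr / nth) = 3
        const int64_t ir0 = dr * ith;               // first row for this thread
        const int64_t ir1 = std::min(ir0 + dr, nr); // clamp the last range

        printf("thread %lld: rows [%lld, %lld)\n",
               (long long) ith, (long long) ir0, (long long) ir1);
    }
    // output: [0,3) [3,6) [6,9) [9,10)
    return 0;
}
```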
