Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ if(PYBIND11_MASTER_PROJECT)
endif()

set(PYBIND11_HEADERS
include/pybind11/detail/argument_vector.h
include/pybind11/detail/class.h
include/pybind11/detail/common.h
include/pybind11/detail/cpp_conduit.h
Expand Down
9 changes: 7 additions & 2 deletions include/pybind11/cast.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#pragma once

#include "detail/argument_vector.h"
#include "detail/common.h"
#include "detail/descr.h"
#include "detail/native_enum_data.h"
Expand Down Expand Up @@ -2037,6 +2038,10 @@ using is_pos_only = std::is_same<intrinsic_t<T>, pos_only>;
// forward declaration (definition in attr.h)
struct function_record;

/// (Inline size chosen mostly arbitrarily; 6 should pad function_call out to two cache lines
/// (16 pointers) in size.)
constexpr std::size_t arg_vector_small_size = 6;

/// Internal data associated with a single function call
struct function_call {
function_call(const function_record &f, handle p); // Implementation in attr.h
Expand All @@ -2045,10 +2050,10 @@ struct function_call {
const function_record &func;

/// Arguments passed to the function:
std::vector<handle> args;
argument_vector<arg_vector_small_size> args;

/// The `convert` value the arguments should be loaded with
std::vector<bool> args_convert;
args_convert_vector<arg_vector_small_size> args_convert;

/// Extra references for the optional `py::args` and/or `py::kwargs` arguments (which, if
/// present, are also in `args` but without a reference).
Expand Down
330 changes: 330 additions & 0 deletions include/pybind11/detail/argument_vector.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,330 @@
/*
pybind11/detail/argument_vector.h: small_vector-like containers to
avoid heap allocation of arguments during function call dispatch.

Copyright (c) Meta Platforms, Inc. and affiliates.

All rights reserved. Use of this source code is governed by a
BSD-style license that can be found in the LICENSE file.
*/

#pragma once

#include <pybind11/pytypes.h>

#include "common.h"

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <type_traits>
#include <utility>
#include <vector>

PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)

PYBIND11_WARNING_DISABLE_MSVC(4127)

PYBIND11_NAMESPACE_BEGIN(detail)

// Shared implementation utility for our small_vector-like containers.
// We support C++11 and C++14, so we cannot use
// std::variant. Union with the tag packed next to the inline
// array's size is smaller anyway, allowing 1 extra handle of
// inline storage for free. Compare the layouts (1 line per
// size_t/void*, assuming a 64-bit machine):
// With variant, total is N + 2 for N >= 2:
// - variant tag (cannot be packed with the array size)
// - array size (or first pointer of 3 in std::vector)
// - N pointers of inline storage (or 2 remaining pointers of std::vector)
// Custom union, total is N + 1 for N >= 3:
// - variant tag & array size if applicable
// - N pointers of inline storage (or 3 pointers of std::vector)
//
// NOTE: this is a low-level representational convenience; the two
// use cases of this union are materially different and in particular
// have different semantics for inline_array::size. All that is being
// shared is the memory management behavior.
template <typename ArrayT, std::size_t InlineSize, typename VectorT = ArrayT>
union inline_array_or_vector {
struct inline_array {
bool is_inline = true;
std::uint32_t size = 0;
std::array<ArrayT, InlineSize> arr;
};
struct heap_vector {
bool is_inline = false;
std::vector<VectorT> vec;

heap_vector() = default;
heap_vector(std::size_t count, VectorT value) : vec(count, value) {}
};

inline_array iarray;
heap_vector hvector;

static_assert(std::is_trivially_move_constructible<ArrayT>::value,
"ArrayT must be trivially move constructible");
static_assert(std::is_trivially_destructible<ArrayT>::value,
"ArrayT must be trivially destructible");

inline_array_or_vector() : iarray() {}
~inline_array_or_vector() {
if (!is_inline()) {
hvector.~heap_vector();
}
}
// Disable copy ctor and assignment.
inline_array_or_vector(const inline_array_or_vector &) = delete;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A comment for easier reading?

E.g. // Disable copy ctor and assignment

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I generally avoid comments that repeat the code, but if you specifically want one here then sure

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Generally I'm totally with you ... just some minimal mercy for sleepy or rushed eyes scanning this code.

inline_array_or_vector &operator=(const inline_array_or_vector &) = delete;

inline_array_or_vector(inline_array_or_vector &&rhs) noexcept {
if (rhs.is_inline()) {
std::memcpy(&iarray, &rhs.iarray, sizeof(iarray));
} else {
new (&hvector) heap_vector(std::move(rhs.hvector));
}
assert(is_inline() == rhs.is_inline());
}

inline_array_or_vector &operator=(inline_array_or_vector &&rhs) noexcept {
if (this == &rhs) {
return *this;
}

if (rhs.is_inline()) {
if (!is_inline()) {
hvector.~heap_vector();
}
std::memcpy(&iarray, &rhs.iarray, sizeof(iarray));
} else {
if (is_inline()) {
new (&hvector) heap_vector(std::move(rhs.hvector));
} else {
hvector = std::move(rhs.hvector);
}
}
return *this;
}

bool is_inline() const {
// It is undefined behavior to access the inactive member of a
// union directly. However, it is well-defined to reinterpret_cast any
// pointer into a pointer to char and examine it as an array
// of bytes. See
// https://dev-discuss.pytorch.org/t/unionizing-for-profit-how-to-exploit-the-power-of-unions-in-c/444#the-memcpy-loophole-4
bool result = false;
static_assert(offsetof(inline_array, is_inline) == 0,
"untagged union implementation relies on this");
static_assert(offsetof(heap_vector, is_inline) == 0,
"untagged union implementation relies on this");
std::memcpy(&result, reinterpret_cast<const char *>(this), sizeof(bool));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I got here I had to consult ChatGPT:

https://chatgpt.com/share/68cb906b-941c-8008-9ac2-acbda6c78ca8

ChatGPT is happy with this implementation.

TBH, I'm not the best person to review code manually optimized to this intensity.

If you added the static_asserts suggested by ChatGPT, it'd look safer to me.

@oremanj, is there a chance you could help out with a full review of this PR?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW, the linked article (written by me) tries to explain this technique as well. I recommend reading from the top (and skipping sections if they are boring review to you; it starts slowly) if you have time.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What you have here is surely acceptable, but I can't help feeling uneasy about reaching so deeply into the bag of tricks. So I asked ChatGPT another question in the same conversation, same URL:

https://chatgpt.com/share/68cb906b-941c-8008-9ac2-acbda6c78ca8

It came back with this (copy-pasted here for easy reference):

template <typename ArrayT, std::size_t InlineSize, typename VectorT = ArrayT>
union inline_array_or_vector {
    struct tag_view {
        bool is_inline;
    };

    struct inline_array {
        bool is_inline;                // must be first
        std::uint32_t size = 0;
        std::array<ArrayT, InlineSize> arr;
    };

    struct heap_vector {
        bool is_inline;                // must be first
        std::vector<VectorT> vec;

        heap_vector() = default;
        heap_vector(std::size_t count, VectorT value) : vec(count, value) {}
    };

    // Union members
    tag_view     tag;    // “discriminator view”
    inline_array array;
    heap_vector  vector;

    inline_array_or_vector() : array{true, 0, {}} {}

    ~inline_array_or_vector() {
        if (!is_inline()) {
            vector.~heap_vector();
        }
    }

    bool is_inline() const {
        // Well-defined: all three arms share the same first member layout,
        // so the common-initial-sequence rule lets us read tag.is_inline.
        return tag.is_inline;
    }

    // ... move ctor/assign as in the PR, using array/vector arms ...
};

WDYT?

Copy link
Contributor Author

@swolchok swolchok Sep 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ChatGPT is wrong. You can't read from an inactive member of a union because it's undefined behavior, full stop. C may allow it, but C++ does not.

(EDIT: see correction below; ChatGPT is only wrong because heap_vector isn't guaranteed standard-layout)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

C++: "It is undefined behavior to read from the member of the union that wasn't most recently written. Many compilers implement, as a non-standard language extension, the ability to read inactive members of a union." -- https://en.cppreference.com/w/cpp/language/union.html

C: "If the member used to access the contents of a union is not the same as the member last used to store a value, the object representation of the value that was stored is reinterpreted as an object representation of the new type (this is known as type punning)" -- https://en.cppreference.com/w/c/language/union.html

Copy link
Contributor Author

@swolchok swolchok Sep 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I stand partially corrected. I asked your ChatGPT conversation for a citation about the common initial sequence rule, and sure enough it does exist in C++ (which I did not realize) and it implies that this codeyour proposed tag_view change is fine, if inline_array and heap_vector are both standard-layout types. Sadly, heap_vector is not guaranteed to be standard-layout because std::vector is not guaranteed to be standard-layout, as we unfortunately ran into below. Good to know though!

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the explanation! (I posted my other comment a minute ago before seeing this. I updated that comment with a strikethrough.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A non-standardese citation for the common initial sequence rule (which, again, I had previously missed) is just "If two union members are standard-layout types, it's well-defined to examine their common subsequence on any compiler." -- https://en.cppreference.com/w/cpp/language/union.html

return result;
}
};

// small_vector-like container to avoid heap allocation for N or fewer
// arguments.
template <std::size_t N>
struct argument_vector {
public:
argument_vector() = default;

// Disable copy ctor and assignment.
argument_vector(const argument_vector &) = delete;
argument_vector &operator=(const argument_vector &) = delete;
argument_vector(argument_vector &&) noexcept = default;
argument_vector &operator=(argument_vector &&) noexcept = default;

std::size_t size() const {
if (is_inline()) {
return m_repr.iarray.size;
}
return m_repr.hvector.vec.size();
}

handle &operator[](std::size_t idx) {
assert(idx < size());
if (is_inline()) {
return m_repr.iarray.arr[idx];
}
return m_repr.hvector.vec[idx];
}

handle operator[](std::size_t idx) const {
assert(idx < size());
if (is_inline()) {
return m_repr.iarray.arr[idx];
}
return m_repr.hvector.vec[idx];
}

void push_back(handle x) {
if (is_inline()) {
auto &ha = m_repr.iarray;
if (ha.size == N) {
move_to_heap_vector_with_reserved_size(N + 1);
push_back_slow_path(x);
} else {
ha.arr[ha.size++] = x;
}
} else {
push_back_slow_path(x);
}
}

template <typename Arg>
void emplace_back(Arg &&x) {
push_back(handle(x));
}

void reserve(std::size_t sz) {
if (is_inline()) {
if (sz > N) {
move_to_heap_vector_with_reserved_size(sz);
}
} else {
reserve_slow_path(sz);
}
}

private:
using repr_type = inline_array_or_vector<handle, N>;
repr_type m_repr;

PYBIND11_NOINLINE void move_to_heap_vector_with_reserved_size(std::size_t reserved_size) {
assert(is_inline());
auto &ha = m_repr.iarray;
using heap_vector = typename repr_type::heap_vector;
heap_vector hv;
hv.vec.reserve(reserved_size);
std::copy(ha.arr.begin(), ha.arr.begin() + ha.size, std::back_inserter(hv.vec));
new (&m_repr.hvector) heap_vector(std::move(hv));
}

PYBIND11_NOINLINE void push_back_slow_path(handle x) { m_repr.hvector.vec.push_back(x); }

PYBIND11_NOINLINE void reserve_slow_path(std::size_t sz) { m_repr.hvector.vec.reserve(sz); }

bool is_inline() const { return m_repr.is_inline(); }
};

// small_vector-like container to avoid heap allocation for N or fewer
// arguments.
template <std::size_t kRequestedInlineSize>
struct args_convert_vector {
private:
public:
args_convert_vector() = default;

// Disable copy ctor and assignment.
args_convert_vector(const args_convert_vector &) = delete;
args_convert_vector &operator=(const args_convert_vector &) = delete;
args_convert_vector(args_convert_vector &&) noexcept = default;
args_convert_vector &operator=(args_convert_vector &&) noexcept = default;

args_convert_vector(std::size_t count, bool value) {
if (count > kInlineSize) {
new (&m_repr.hvector) typename repr_type::heap_vector(count, value);
} else {
auto &inline_arr = m_repr.iarray;
inline_arr.arr.fill(value ? std::size_t(-1) : 0);
inline_arr.size = static_cast<decltype(inline_arr.size)>(count);
}
}

std::size_t size() const {
if (is_inline()) {
return m_repr.iarray.size;
}
return m_repr.hvector.vec.size();
}

void reserve(std::size_t sz) {
if (is_inline()) {
if (sz > kInlineSize) {
move_to_heap_vector_with_reserved_size(sz);
}
} else {
m_repr.hvector.vec.reserve(sz);
}
}

bool operator[](std::size_t idx) const {
if (is_inline()) {
return inline_index(idx);
}
assert(idx < m_repr.hvector.vec.size());
return m_repr.hvector.vec[idx];
}

void push_back(bool b) {
if (is_inline()) {
auto &ha = m_repr.iarray;
if (ha.size == kInlineSize) {
move_to_heap_vector_with_reserved_size(kInlineSize + 1);
push_back_slow_path(b);
} else {
assert(ha.size < kInlineSize);
const auto wbi = word_and_bit_index(ha.size++);
assert(wbi.word < kWords);
assert(wbi.bit < kBitsPerWord);
if (b) {
ha.arr[wbi.word] |= (std::size_t(1) << wbi.bit);
} else {
ha.arr[wbi.word] &= ~(std::size_t(1) << wbi.bit);
}
assert(operator[](ha.size - 1) == b);
}
} else {
push_back_slow_path(b);
}
}

void swap(args_convert_vector &rhs) noexcept { std::swap(m_repr, rhs.m_repr); }

private:
struct WordAndBitIndex {
std::size_t word;
std::size_t bit;
};

static WordAndBitIndex word_and_bit_index(std::size_t idx) {
return WordAndBitIndex{idx / kBitsPerWord, idx % kBitsPerWord};
}

bool inline_index(std::size_t idx) const {
const auto wbi = word_and_bit_index(idx);
assert(wbi.word < kWords);
assert(wbi.bit < kBitsPerWord);
return m_repr.iarray.arr[wbi.word] & (std::size_t(1) << wbi.bit);
}

PYBIND11_NOINLINE void move_to_heap_vector_with_reserved_size(std::size_t reserved_size) {
auto &inline_arr = m_repr.iarray;
using heap_vector = typename repr_type::heap_vector;
heap_vector hv;
hv.vec.reserve(reserved_size);
for (std::size_t ii = 0; ii < inline_arr.size; ++ii) {
hv.vec.push_back(inline_index(ii));
}
new (&m_repr.hvector) heap_vector(std::move(hv));
}

PYBIND11_NOINLINE void push_back_slow_path(bool b) { m_repr.hvector.vec.push_back(b); }

static constexpr auto kBitsPerWord = 8 * sizeof(std::size_t);
static constexpr auto kWords = (kRequestedInlineSize + kBitsPerWord - 1) / kBitsPerWord;
static constexpr auto kInlineSize = kWords * kBitsPerWord;

using repr_type = inline_array_or_vector<std::size_t, kWords, bool>;
repr_type m_repr;

bool is_inline() const { return m_repr.is_inline(); }
};

PYBIND11_NAMESPACE_END(detail)
PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
7 changes: 4 additions & 3 deletions include/pybind11/pybind11.h
Original file line number Diff line number Diff line change
Expand Up @@ -1048,13 +1048,14 @@ class cpp_function : public function {
}
#endif

std::vector<bool> second_pass_convert;
args_convert_vector<arg_vector_small_size> second_pass_convert;
if (overloaded) {
// We're in the first no-convert pass, so swap out the conversion flags for a
// set of all-false flags. If the call fails, we'll swap the flags back in for
// the conversion-allowed call below.
second_pass_convert.resize(func.nargs, false);
call.args_convert.swap(second_pass_convert);
second_pass_convert = std::move(call.args_convert);
call.args_convert
= args_convert_vector<arg_vector_small_size>(func.nargs, false);
}

// 6. Call the function.
Expand Down
4 changes: 2 additions & 2 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -647,8 +647,8 @@ if(NOT PYBIND11_CUDA_TESTS)
# Test pure C++ code (not depending on Python). Provides the `test_pure_cpp` target.
add_subdirectory(pure_cpp)

# Test embedding the interpreter. Provides the `cpptest` target.
add_subdirectory(test_embed)
# Test C++ code that depends on Python, such as embedding the interpreter. Provides the `cpptest` target.
add_subdirectory(test_with_catch)

# Test CMake build using functions and targets from subdirectory or installed location
add_subdirectory(test_cmake_build)
Expand Down
Loading
Loading