Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CITATIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
author = {Leyang Xue and
Yao Fu and
Zhan Lu and
Chuanhao Sun and
Luo Mai and
Mahesh Marina},
title = {MoE-Infinity: Efficient MoE Inference on Personal Machines with Sparsity-Aware Expert Cache},
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ If you use MoE-Infinity for your research, please cite our [paper](https://arxiv.
author = {Leyang Xue and
Yao Fu and
Zhan Lu and
Chuanhao Sun and
Luo Mai and
Mahesh Marina},
title = {MoE{-}Infinity: Efficient MoE Inference on Personal Machines with Sparsity-Aware Expert Cache},
Expand Down
9 changes: 9 additions & 0 deletions core/common/constant.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#pragma once

#include <stdint.h>

// Binary size units expressed in bytes (powers of 1024).
constexpr int64_t KB = int64_t{1} << 10;
constexpr int64_t MB = int64_t{1} << 20;
constexpr int64_t GB = int64_t{1} << 30;

// NOTE(review): presumably the GPU warp width ("Wrap" looks like a typo for
// "Warp") -- confirm against the kernels that use it before renaming.
constexpr int kWrapSize = 32;
69 changes: 69 additions & 0 deletions core/common/context.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Copyright (c) EfficientMoE.
// SPDX-License-Identifier: Apache-2.0

// EfficientMoE Team

#pragma once

#include <cstdint>
#include <mutex>
#include <stdexcept>
#include <string>
#include <unordered_map>

// Element types selectable through Context::dtype. The numeric values are the
// integer codes accepted by Context::SetFromDict under the "dtype" key.
enum class DataType {
  BFLOAT16 = 0,
  FLOAT32 = 1,
  FLOAT16 = 2,
  FP8_E4M3FN = 3,
};

// Plain bag of configuration values with built-in defaults.
struct Context {
  int64_t max_expert_tokens = 128;     // Default maximum expert tokens
  int64_t max_tokens = 4096;           // Default maximum tokens
  int num_experts = 8;                 // Default number of experts
  int topk = 2;                        // Default top-k value
  int64_t hidden_dim = 1024;           // Default hidden dimension
  int64_t intermediate_dim = 4096;     // Default intermediate dimension for experts
  DataType dtype = DataType::FLOAT32;  // Default data type

  // Overrides fields from `dict`. Recognised keys: "max_expert_tokens",
  // "max_tokens", "num_experts", "topk", "hidden_dim", "intermediate_dim" and
  // "dtype". Keys that are absent leave the corresponding field unchanged;
  // unrecognised keys are ignored.
  //
  // "dtype" must be one of the DataType codes (0..3).
  //
  // Throws std::invalid_argument("Invalid dtype value") for any other dtype
  // code. Fields are applied in the order listed above, so when the dtype is
  // rejected the other keys present in `dict` have already been applied.
  void SetFromDict(const std::unordered_map<std::string, int64_t>& dict) {
    // Single lookup per key: returns a pointer to the value, or nullptr when
    // the key is not present.
    const auto lookup = [&dict](const char* key) -> const int64_t* {
      const auto it = dict.find(key);
      return it != dict.end() ? &it->second : nullptr;
    };

    if (const int64_t* v = lookup("max_expert_tokens")) max_expert_tokens = *v;
    if (const int64_t* v = lookup("max_tokens")) max_tokens = *v;
    // num_experts / topk are stored as int; the narrowing is made explicit.
    if (const int64_t* v = lookup("num_experts")) {
      num_experts = static_cast<int>(*v);
    }
    if (const int64_t* v = lookup("topk")) topk = static_cast<int>(*v);
    if (const int64_t* v = lookup("hidden_dim")) hidden_dim = *v;
    if (const int64_t* v = lookup("intermediate_dim")) intermediate_dim = *v;

    if (const int64_t* v = lookup("dtype")) {
      // Switch on the full 64-bit value. Narrowing to int first would let an
      // out-of-range code (e.g. 2^32 + 1) alias a valid one after truncation.
      switch (*v) {
        case 0:
          dtype = DataType::BFLOAT16;
          break;
        case 1:
          dtype = DataType::FLOAT32;
          break;
        case 2:
          dtype = DataType::FLOAT16;
          break;
        case 3:
          dtype = DataType::FP8_E4M3FN;
          break;
        default:
          throw std::invalid_argument("Invalid dtype value");
      }
    }
  }
};

// Returns a reference to the single shared Context object.
//
// The object is a function-local static, so it is constructed on the first
// call and every later call returns the same instance. Nothing here
// synchronises access to its fields.
//
// Declared `inline` because this definition lives in a header: without it,
// every translation unit that includes the header would emit its own external
// definition and the link would fail with duplicate symbols.
inline Context& getContext() {
  static Context instance;
  return instance;
}
88 changes: 88 additions & 0 deletions core/common/generator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// Copyright (c) EfficientMoE.
// SPDX-License-Identifier: Apache-2.0

// EfficientMoE Team

#pragma once

#include <uuid/uuid.h>

#include <bitset>
#include <chrono>
#include <iomanip>
#include <mutex>
#include <random>
#include <sstream>
#include <string>
#include <atomic>

// Process-wide generators for small numeric identifiers.
class NumGenerator {
 public:
  // Returns the next context id.
  //
  // 0 is a special id, so this must never return 0: if the counter wraps
  // around to 0 that value is skipped and the next one is handed out.
  static uint32_t ctx_id() {
    std::lock_guard<std::mutex> guard(mutex_);
    uint32_t id = ctx_id_++;
    if (id == 0) id = ctx_id_++;
    return id;
  }

  // Returns the next flow number. Numbering starts at 1024 and is backed by
  // an atomic counter, so no lock is taken.
  static uint32_t flowno() {
    static std::atomic<uint32_t> next_flowno{1024};
    return next_flowno++;
  }

 private:
  // C++17 inline static members: defined here, once, no matter how many
  // translation units include this header. The previous out-of-class
  // definitions in the header produced duplicate symbols at link time.
  static inline std::mutex mutex_;
  static inline uint32_t ctx_id_ = 1;  // Start from 1 to avoid 0
};

inline std::string GenUUID() {
uuid_t uuid;
uuid_generate(uuid);
char uuid_str[37];
uuid_unparse(uuid, uuid_str);
return std::string(uuid_str);
}

// Returns a 64-bit pseudo-unique id: the current high_resolution_clock tick
// count XOR-ed with a 64-bit pseudo-random number.
//
// The engine and distribution are thread_local. They are mutated on every
// call, so sharing one static instance between threads would be a data race.
// Each thread seeds its own engine from std::random_device on first use.
inline uint64_t GenUUID64() {
  thread_local std::mt19937_64 engine(std::random_device{}());
  thread_local std::uniform_int_distribution<uint64_t> dist;

  const auto ticks = static_cast<uint64_t>(
      std::chrono::high_resolution_clock::now().time_since_epoch().count());
  return ticks ^ dist(engine);
}

// Formats the current local wall-clock time as "YYYY-MM-DD HH:MM:SS.mmm".
//
// NOTE(review): std::localtime hands back a pointer to storage shared across
// calls; the result is copied immediately, but confirm that concurrent callers
// are acceptable here.
inline std::string CurrentTimeString() {
  using std::chrono::system_clock;

  const auto stamp = system_clock::now();

  // Whole-second part, broken down into local calendar fields.
  const auto secs = system_clock::to_time_t(stamp);
  const auto calendar = *std::localtime(&secs);

  // Sub-second part: milliseconds within the current second.
  const auto total_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
                            stamp.time_since_epoch())
                            .count();
  const auto millis = total_ms % 1000;

  std::ostringstream out;
  out << std::put_time(&calendar, "%Y-%m-%d %H:%M:%S") << '.'
      << std::setfill('0') << std::setw(3) << millis;
  return out.str();
}

// Microseconds elapsed since the std::chrono::system_clock epoch, read at
// call time.
inline uint64_t CurrentTimeMicros() {
  using namespace std::chrono;
  const auto since_epoch = system_clock::now().time_since_epoch();
  return duration_cast<microseconds>(since_epoch).count();
}
10 changes: 10 additions & 0 deletions core/common/pytorch.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <torch/extension.h>
#include "aio/archer_prio_aio_handle.h"
#include "types.h"
#include "base/noncopyable.h"

// Expands to a torch::Device constructed from torch::kCPU.
#define CPU_DEVICE torch::Device(torch::kCPU)
Expand All @@ -27,6 +28,15 @@
// Shorthands that forward a fixed scalar type (kInt64 / kBFloat16) together
// with `target` to TENSOR_OPTIONS, which is not visible in this excerpt.
#define INT64_TENSOR_OPTIONS(target) TENSOR_OPTIONS(torch::kInt64, target)
#define BFLOAT16_TENSOR_OPTIONS(target) TENSOR_OPTIONS(torch::kBFloat16, target)

// Builds a tensor over an existing buffer via torch::from_blob.
//   blob   - pointer to the existing buffer
//   shape  - tensor sizes, forwarded unchanged to torch::from_blob
//   dtype  - torch scalar type, forwarded to TENSOR_OPTIONS
//   target - forwarded to TENSOR_OPTIONS (presumably the device -- confirm)
// The deleter passed is DoNothingDeleter<void>{}, whose call operator is
// empty, so the resulting tensor never frees `blob`.
// NOTE(review): this implies the caller must keep `blob` alive for as long as
// the tensor is used -- confirm at call sites.
#define TENSOR_FROM_BLOB(blob, shape, dtype, target) \
torch::from_blob(blob, shape, DoNothingDeleter<void>{}, \
TENSOR_OPTIONS(dtype, target))

// Variant of TENSOR_FROM_BLOB for when `dtype` is not already a torch dtype:
// the argument is wrapped as torch::ScalarType(dtype) before being handed to
// TENSOR_OPTIONS. As above, DoNothingDeleter<void>{} means the tensor does not
// free `blob`.
// NOTE(review): the original note says a type trait maps a C++ type to the
// torch dtype, but the expansion is a plain conversion to torch::ScalarType --
// confirm what callers actually pass as `dtype`.
#define TENSOR_FROM_BLOB_CPP(blob, shape, dtype, target) \
torch::from_blob(blob, shape, DoNothingDeleter<void>{}, \
TENSOR_OPTIONS(torch::ScalarType(dtype), target))

// One-dimensional shape {1}, presumably for placeholder tensors -- confirm at
// the use sites.
#define FAKE_TENSOR_SIZES torch::IntArrayRef({1})

inline std::vector<uint32_t> list_to_vector(py::list list) {
Expand Down
99 changes: 99 additions & 0 deletions core/common/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
#pragma once

#include <cstdint>
#include <utility>
#include <string>
#include <type_traits>

// Identifier types shared across the core library.
using TensorID = std::uint32_t;
using HashID = std::size_t;
Expand Down Expand Up @@ -35,3 +38,99 @@ template <typename T>
// Deleter functor whose call operator does nothing, for smart pointers or
// APIs that require a deleter but must not free the pointee.
struct DoNothingDeleter {
// Intentionally empty: `ptr` is neither freed nor otherwise touched.
void operator()(T* ptr) const {}
};

// GetNthType<N, Ts...>::type is the N-th (zero-based) type of the pack Ts...
// The primary template is only declared, so an index past the end of the pack
// is a compile-time error.
template <size_t N, typename... Args>
struct GetNthType;

// Base case: index 0 selects the head of the pack.
template <typename Head, typename... Tail>
struct GetNthType<0, Head, Tail...> {
  using type = Head;
};

// Recursive case: drop the head and look up index N - 1 in the tail; `type`
// is inherited from that lookup.
template <size_t N, typename Head, typename... Tail>
struct GetNthType<N, Head, Tail...> : GetNthType<N - 1, Tail...> {};

// Convenience alias for GetNthType<N, Args...>::type.
template <size_t N, typename... Args>
using GetNthType_t = typename GetNthType<N, Args...>::type;

// Compile-time integer square root: ConstexprSqrt<N>::value is the largest
// integer r with r * r <= N (i.e. floor(sqrt(N))). N must be non-negative.
template <int N>
struct ConstexprSqrt {
  static_assert(N >= 0, "ConstexprSqrt requires a non-negative argument");

  // Binary search for the largest r in [low, high] with r * r <= N.
  //
  // An empty range (high < low) yields `high`. With the default arguments that
  // only happens for N == 0, where the answer is 0; the recursive form this
  // replaces never terminated in that case.
  //
  // The search is done in 64-bit arithmetic so that neither the midpoint sum
  // nor mid * mid can overflow for N close to INT_MAX.
  static constexpr int compute(int low = 1, int high = N) {
    if (high < low) return high;

    long long lo = low;
    long long hi = high;
    while (lo < hi) {
      // Round the midpoint up so that `lo = mid` always makes progress.
      const long long mid = (lo + hi + 1) / 2;
      if (mid * mid > N) {
        hi = mid - 1;
      } else {
        lo = mid;
      }
    }
    return static_cast<int>(lo);
  }

  static constexpr int value = compute();
};

// RoundToMultiple<N, Multiple>::value is N rounded up to the next multiple of
// Multiple (N itself when it already is one).
template <int N, int Multiple>
struct RoundToMultiple {
 private:
  // Number of Multiple-sized chunks needed to cover N.
  static constexpr int kChunks = (N + Multiple - 1) / Multiple;

 public:
  static constexpr int value = kChunks * Multiple;
};

// Converts a `const T*` to a plain `void*` pointing at the same address.
//
// Note that this strips the const qualifier from the pointee; it does not
// preserve it. Writing through the result to an object that was actually
// defined const is undefined behaviour.
//
// The conversion to `const void*` uses static_cast rather than
// reinterpret_cast: reinterpret_cast is not permitted in a constant
// expression, so with it the function could never be evaluated at compile
// time despite being declared constexpr.
template <typename T>
constexpr void* pointer_to_void(const T* ptr) {
  return const_cast<void*>(static_cast<const void*>(ptr));
}

// Helper macros to generate enum and string mappings.
//
// Each helper takes (value, EnumType) and produces one fragment of generated
// code. They are meant to be passed as the callback argument of a
// user-supplied list macro that applies the callback to every enumerator.

// Emits `value,` -- one enumerator inside an enum body.
#define ENUM_ENTRY_COMMA(value, EnumType) value,
// Emits a `case` label that returns the enumerator's name as a string literal.
#define ENUM_CASE(value, EnumType) \
case EnumType::value: \
return #value;
// Emits a comparison of a variable named `s` against the enumerator's name,
// returning the enumerator on a match. The surrounding code must provide `s`.
#define STRING_CASE(value, EnumType) \
if (s == #value) return EnumType::value;

// Generic enum -> string conversion.
//
// Only participates in overload resolution when E is an enum type. This
// primary template is the fallback and always yields "Unknown"; individual
// enums are expected to provide an explicit specialization.
template <typename E>
constexpr auto enum_to_string(E /*value*/) noexcept
    -> std::enable_if_t<std::is_enum_v<E>, const char*> {
  return "Unknown";
}

// Generic string -> enum conversion.
//
// Only participates in overload resolution when E is an enum type. This
// primary template is the fallback: it ignores the text and yields the
// enumerator whose underlying value is 0. Individual enums are expected to
// provide an explicit specialization.
template <typename E>
constexpr auto string_to_enum(const std::string& /*text*/) noexcept
    -> std::enable_if_t<std::is_enum_v<E>, E> {
  return E{};  // value-initialised enum == static_cast<E>(0)
}

// Macro to define an enum class together with its string conversions.
//
// Usage:
//   #define COLOR_VALUES(X, T) X(Red, T) X(Green, T) X(Blue, T)
//   DEFINE_ENUM_CLASS(Color, COLOR_VALUES)
//
// ENUM_VALUES must be a list macro taking (callback, EnumType) that applies
// the callback to every enumerator. The expansion generates:
//   - `enum class EnumType { <values...>, Unknown }` -- `Unknown` is always
//     appended as the last enumerator, so the list must not contain it;
//   - `EnumType##ToString(v)`: the enumerator's name, or "Unknown" for any
//     value that is not in the list;
//   - `StringTo##EnumType(s)`: the enumerator whose name equals `s`, or
//     `EnumType::Unknown` when nothing matches;
//   - explicit specializations of enum_to_string / string_to_enum that
//     forward to the two functions above.
//
// The expansion contains function definitions and explicit template
// specializations, so it must be used at namespace scope, in a scope where
// enum_to_string / string_to_enum may be specialized.
//
// The non-constexpr functions are marked `inline`: the macro is intended for
// headers, and a plain function or explicit specialization defined in a
// header yields duplicate symbols once two translation units include it.
#define DEFINE_ENUM_CLASS(EnumType, ENUM_VALUES)                            \
  enum class EnumType { ENUM_VALUES(ENUM_ENTRY_COMMA, EnumType) Unknown };  \
                                                                            \
  /* Enum to string function */                                             \
  constexpr const char* EnumType##ToString(EnumType v) {                    \
    switch (v) {                                                            \
      ENUM_VALUES(ENUM_CASE, EnumType)                                      \
      default:                                                              \
        return "Unknown";                                                   \
    }                                                                       \
  }                                                                         \
                                                                            \
  /* String to enum function */                                             \
  inline EnumType StringTo##EnumType(const std::string& s) {                \
    ENUM_VALUES(STRING_CASE, EnumType)                                      \
    return EnumType::Unknown;                                               \
  }                                                                         \
                                                                            \
  /* Specialize generic template functions for this enum type */            \
  template <>                                                               \
  constexpr auto enum_to_string<EnumType>(EnumType e) noexcept              \
      -> const char* {                                                      \
    return EnumType##ToString(e);                                           \
  }                                                                         \
                                                                            \
  template <>                                                               \
  inline auto string_to_enum<EnumType>(const std::string& s) noexcept       \
      -> EnumType {                                                         \
    return StringTo##EnumType(s);                                           \
  }
Loading