Skip to content

Commit a32acb1

Browse files
committed
Optimize polynomial out of domain evaluation for the same point on wide traces.
- Tested - Added relevant benchmarks
1 parent 0d37a99 commit a32acb1

File tree

9 files changed

+520
-23
lines changed

9 files changed

+520
-23
lines changed

cuda/include/poly/eval_at_point.cuh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,15 @@
22
#define POLY_EVAL_AT_POINT_H
33

44
#include "../fields.cuh"
5+
#include "../utils.cuh"
56

67
extern "C"
78
qm31 eval_at_point(m31 *coeffs, int coeffs_size, qm31 point_x, qm31 point_y);
89

10+
extern "C"
11+
void evaluate_polynomials_out_of_domain(
12+
qm31 **result, m31 **polynomials, int *log_polynomial_sizes, int number_of_polynomials,
13+
qm31 **out_of_domain_points_x, qm31 **out_of_domain_points_y, int *sample_sizes
14+
);
15+
916
#endif // POLY_EVAL_AT_POINT_H

cuda/src/poly/eval_at_point.cu

Lines changed: 208 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ __global__ void eval_at_point_first_pass(m31 *g_coeffs, qm31 *temp, qm31 *factor
4848
}
4949
factor_idx -= 1;
5050
level_size >>= 1;
51-
5251
}
5352

5453
if (idx == 0) {
@@ -154,3 +153,211 @@ qm31 eval_at_point(m31 *coeffs, int coeffs_size, qm31 point_x, qm31 point_y) {
154153
return result;
155154
}
156155

156+
/* Many polynomials */
157+
158+
159+
/**
 * First reduction pass of the batched point evaluation.
 *
 * Each block loads up to `2 * blockDim.x` consecutive coefficients of one polynomial,
 * folds them pairwise with `factors` (starting at `factors[factors_size - 1]` and walking
 * downwards) and writes a single partial result to `temp[output_offset + blockIdx.x]`.
 *
 * Launch requirements:
 *  - 1D grid and 1D block; `blockIdx.x / blocks_in_poly` selects the polynomial, so blocks
 *    belonging to the same polynomial must be contiguous.
 *  - `coeffs_size` and `blockDim.x` are powers of two (the block-in-polynomial index is
 *    computed with a mask), and `coeffs_size >= 2`.
 *  - Dynamic shared memory: `blockDim.x * sizeof(qm31) + 2 * blockDim.x * sizeof(m31)` bytes.
 *
 * @param g_coeffs       device array of device pointers, one per polynomial
 * @param temp           scratch buffer receiving one partial result per block
 * @param factors        folding factors, indexed from `factors_size - 1` downwards
 * @param coeffs_size    number of coefficients of every polynomial
 * @param factors_size   number of entries in `factors`
 * @param output_offset  index in `temp` where this pass starts writing
 */
__global__ void eval_many_at_point_first_pass(m31 **g_coeffs, qm31 *temp, qm31 *factors, int coeffs_size, int factors_size,
                                              int output_offset) {
    const int idx = threadIdx.x;

    qm31 *output = &temp[output_offset];

    const int coeffs_per_block = 2 * blockDim.x;
    const int blocks_in_poly = max(1, coeffs_size / coeffs_per_block);
    // Thread syncing happens within a block.
    // Split the problem to feed them to multiple blocks.
    if (coeffs_size >= coeffs_per_block) {
        coeffs_size = coeffs_per_block;
    }

    // Every `extern __shared__` array starts at the same address, so declaring two of them
    // makes them alias. Carve both regions out of one buffer instead: the folded levels come
    // first, the raw coefficients follow them.
    extern __shared__ qm31 s_level[];
    m31 *s_coeffs = reinterpret_cast<m31 *>(&s_level[blockDim.x]);

    const m31 *polynomial = g_coeffs[blockIdx.x / blocks_in_poly];

    // A % X == A & (X-1) when X is a power of two
    const int chunk_start = (blockIdx.x & (blocks_in_poly - 1)) * coeffs_size;

    // Only load coefficients that exist: when the polynomial is shorter than
    // 2 * blockDim.x the upper half of the tile lies outside of it.
    if (idx < coeffs_size) {
        s_coeffs[idx] = polynomial[chunk_start + idx];
    }
    if (idx + blockDim.x < coeffs_size) {
        s_coeffs[idx + blockDim.x] = polynomial[chunk_start + idx + blockDim.x];
    }
    __syncthreads();

    int level_size = coeffs_size >> 1;
    int factor_idx = factors_size - 1;

    // First fold: base-field coefficients are combined directly into a qm31 value.
    if (idx < level_size) {
        m31 alpha = s_coeffs[2 * idx];
        m31 beta = s_coeffs[2 * idx + 1];
        qm31 factor = factors[factor_idx];

        qm31 result = {
            {add(mul(beta, factor.a.a), alpha), mul(factor.a.b, beta)},
            {mul(beta, factor.b.a), mul(beta, factor.b.b)}
        };
        s_level[idx] = result;
    }
    factor_idx -= 1;
    level_size >>= 1;

    // Remaining folds. `level_size` is uniform across the block, so every thread executes
    // the same number of barriers; only the loads and stores are predicated.
    while (level_size > 0) {
        const bool active = idx < level_size;

        // Make the previous level's stores visible before anyone reads them.
        __syncthreads();
        qm31 folded;
        if (active) {
            qm31 a = s_level[2 * idx];
            qm31 b = s_level[2 * idx + 1];
            folded = add(a, mul(b, factors[factor_idx]));
        }
        // All reads of the previous level are done before it gets overwritten.
        __syncthreads();
        if (active) {
            s_level[idx] = folded;
        }
        factor_idx -= 1;
        level_size >>= 1;
    }

    // Thread 0 performed the last store to s_level[0] itself, so no extra barrier is needed.
    if (idx == 0) {
        output[blockIdx.x] = s_level[0];
    }
}
216+
217+
/**
 * Follow-up reduction pass of the batched point evaluation.
 *
 * Each block loads up to `2 * blockDim.x` partial results from
 * `temp[level_offset ...]`, keeps folding them pairwise with `factors` (starting at
 * `factors[factor_offset]` and walking downwards) while the halved level still holds at
 * least `results_per_block` entries, and then writes `results_per_block` values to
 * `temp[output_offset + blockIdx.x * results_per_block ...]`.
 *
 * Launch requirements:
 *  - 1D grid and 1D block.
 *  - Dynamic shared memory: `2 * blockDim.x * sizeof(qm31)` bytes.
 *  - The input and output regions of `temp` must not overlap; blocks are not synchronized
 *    with each other.
 *
 * @param temp               scratch buffer holding the input level and receiving the output
 * @param factors            folding factors
 * @param level_size         total number of entries in the input level (all blocks)
 * @param factor_offset      index of the first factor to use
 * @param level_offset       index in `temp` where the input level starts
 * @param output_offset      index in `temp` where the output starts
 * @param results_per_block  number of values each block leaves behind
 */
__global__
void eval_many_at_point_second_pass(qm31 *temp, qm31 *factors, int level_size, int factor_offset, int level_offset,
                                    int output_offset, int results_per_block) {
    const int idx = threadIdx.x;

    qm31 *level = &temp[level_offset];
    qm31 *output = &temp[output_offset];

    // Number of valid entries in the whole input level; used to keep the loads in bounds.
    const int total_level_size = level_size;

    // Thread syncing happens within a block.
    // Split the problem to feed them to multiple blocks.
    if (level_size >= 2 * blockDim.x) {
        level_size = 2 * blockDim.x;
    }

    extern __shared__ qm31 s_level[];

    // Only load entries that belong to the level: a short level must not make the block
    // read past its end.
    const int lower = 2 * blockIdx.x * blockDim.x + idx;
    const int upper = lower + blockDim.x;
    if (lower < total_level_size) {
        s_level[idx] = level[lower];
    }
    if (upper < total_level_size) {
        s_level[idx + blockDim.x] = level[upper];
    }

    level_size >>= 1;

    int factor_idx = factor_offset;

    // `level_size` and `results_per_block` are uniform across the block, so every thread
    // executes the same number of barriers; only the loads and stores are predicated.
    while (level_size >= results_per_block) {
        const bool active = idx < level_size;

        // Make the previous level's stores visible before anyone reads them.
        __syncthreads();
        qm31 folded;
        if (active) {
            qm31 a = s_level[2 * idx];
            qm31 b = s_level[2 * idx + 1];
            folded = add(a, mul(b, factors[factor_idx]));
        }
        // All reads of the previous level are done before it gets overwritten.
        __syncthreads();
        if (active) {
            s_level[idx] = folded;
        }
        factor_idx -= 1;
        level_size >>= 1;
    }

    // Each of these threads reads back the slot it last wrote (or loaded) itself.
    if (idx < results_per_block) {
        output[blockIdx.x * results_per_block + idx] = s_level[idx];
    }
}
256+
257+
/**
 * Scatters the final per-polynomial values into the caller's result buffers:
 * thread `i` copies `temp[i]` into element 0 of `result[i]`.
 *
 * Expects a 1D launch with at least `number_of_polynomials` threads in total;
 * surplus threads exit without touching memory.
 */
__global__
void copy_result_for_polynomial(qm31 **result, qm31 *temp, int number_of_polynomials) {
    const int polynomial_index = threadIdx.x + blockDim.x * blockIdx.x;

    if (polynomial_index >= number_of_polynomials) {
        return;
    }

    qm31 *destination = result[polynomial_index];
    destination[0] = temp[polynomial_index];
}
265+
266+
/**
 * Evaluates `2^log_number_of_polynomials` polynomials, each with `2^log_coeffs_size`
 * coefficients, at the single point (`point_x`, `point_y`) and stores the value of
 * polynomial `i` in `result[i][0]`.
 *
 * The work is done by one launch of `eval_many_at_point_first_pass`, followed by as many
 * launches of `eval_many_at_point_second_pass` as are needed to get down to one value per
 * polynomial, and finally `copy_result_for_polynomial`.
 *
 * @param result                     device array of device pointers, one output slot per polynomial
 * @param polynomials                device array of device pointers to the coefficients
 * @param log_number_of_polynomials  log2 of the number of polynomials
 * @param log_coeffs_size            log2 of the number of coefficients per polynomial
 * @param point_x                    x coordinate of the evaluation point
 * @param point_y                    y coordinate of the evaluation point
 */
void eval_polys_at_point(
    qm31 **result, m31 **polynomials, int log_number_of_polynomials, int log_coeffs_size, qm31 point_x, qm31 point_y
) {
    const int coeffs_size = 1 << log_coeffs_size;
    const int block_dim = min(256, coeffs_size);
    const int coeffs_per_block = block_dim * 2;

    // Folding factors: y, x, then the repeated doubling map 2 * x^2 - 1 applied to x,
    // stored from the back of the array towards the front.
    qm31 *host_mappings = (qm31 *) malloc(sizeof(qm31) * log_coeffs_size);
    // Guard the two fixed slots so tiny polynomials do not write before the allocation.
    if (log_coeffs_size >= 1) {
        host_mappings[log_coeffs_size - 1] = point_y;
    }
    if (log_coeffs_size >= 2) {
        host_mappings[log_coeffs_size - 2] = point_x;
    }
    qm31 x = point_x;
    for (int i = 2; i < log_coeffs_size; i += 1) {
        x = sub(mul(qm31{cm31{2, 0}, cm31{0, 0}}, mul(x, x)), qm31{cm31{1, 0}, cm31{0, 0}});
        host_mappings[log_coeffs_size - 1 - i] = x;
    }

    const int number_of_polynomials = 1 << log_number_of_polynomials;
    const int total_number_of_coeffs = coeffs_size * number_of_polynomials;

    // Number of blocks (and therefore partial results) produced by the first pass.
    int num_blocks = max(number_of_polynomials, ((total_number_of_coeffs >> 1) + block_dim - 1) / block_dim);

    // Size the scratch buffer by replaying the launch plan below: every pass gets its own
    // region, laid out from the end of the buffer towards the front, so that no pass ever
    // writes into the level that other blocks of the same launch are still reading.
    int temp_memory_size = num_blocks;
    for (int level = num_blocks; level > number_of_polynomials;) {
        const int blocks = ((level >> 1) + block_dim - 1) / block_dim;
        temp_memory_size += max(number_of_polynomials, blocks);
        level = blocks;
    }

    qm31 *temp = cuda_malloc<qm31>(temp_memory_size);
    qm31 *device_mappings = clone_to_device<qm31>(host_mappings, log_coeffs_size);

    free(host_mappings);

    // First pass
    // Shared memory holds 2 * block_dim coefficients (4 bytes each) plus block_dim folded values.
    int shared_memory_bytes = coeffs_per_block * 4 + coeffs_per_block * 8;
    int output_offset = temp_memory_size - num_blocks;

    eval_many_at_point_first_pass<<<num_blocks, block_dim, shared_memory_bytes>>>(polynomials, temp, device_mappings, coeffs_size,
                                                                                  log_coeffs_size, output_offset);

    // Second pass
    int mappings_offset = log_coeffs_size - 1;
    int level_offset = output_offset;
    while (num_blocks > number_of_polynomials) {
        // Every preceding pass folded 512 values into one, i.e. consumed 9 factors. This loop
        // only runs when a polynomial spans several blocks, which implies block_dim == 256.
        mappings_offset -= 9;
        const int new_num_blocks = ((num_blocks >> 1) + block_dim - 1) / block_dim;
        const int number_of_results = max(number_of_polynomials, new_num_blocks);
        const int results_per_block = number_of_results / new_num_blocks;
        shared_memory_bytes = coeffs_per_block * 4 * 4;
        // The pass writes `number_of_results` values (not `new_num_blocks`), so reserve that
        // many slots in front of the input level.
        output_offset = level_offset - number_of_results;
        eval_many_at_point_second_pass<<<new_num_blocks, block_dim, shared_memory_bytes>>>(temp, device_mappings, num_blocks,
                                                                                           mappings_offset, level_offset,
                                                                                           output_offset, results_per_block);
        num_blocks = new_num_blocks;
        level_offset = output_offset;
    }

    cudaDeviceSynchronize();

    // By construction the last pass wrote its results at the very start of `temp`.
    num_blocks = (number_of_polynomials + block_dim - 1) / block_dim;
    copy_result_for_polynomial<<<num_blocks, block_dim>>>(
        result, temp, number_of_polynomials
    );

    cuda_free_memory(temp);
    cuda_free_memory(device_mappings);
}
333+
334+
/**
 * Runs `eval_polys_at_point` once for each of the `sample_size` points given by
 * `points_x` / `points_y`, in order.
 *
 * `points_x` and `points_y` are dereferenced here, on the host. Every call receives the
 * same `result` array, so the output of a later point replaces that of an earlier one.
 */
void eval_polys_at_points(
    qm31 **result, m31 **polynomials, int log_polynomial_size, int log_number_of_polynomials,
    qm31 *points_x, qm31 *points_y, int sample_size
) {
    int point_index = 0;
    while (point_index < sample_size) {
        eval_polys_at_point(
            result, polynomials, log_number_of_polynomials, log_polynomial_size,
            points_x[point_index], points_y[point_index]
        );
        ++point_index;
    }
}
344+
345+
/**
 * Entry point for evaluating a batch of polynomials at out-of-domain points.
 *
 * Only the first entry of `log_polynomial_sizes`, `out_of_domain_points_x`,
 * `out_of_domain_points_y` and `sample_sizes` is used: all polynomials are assumed to have
 * the same size and to be evaluated at the same point(s).
 *
 * NOTE(review): the polynomial count is passed on as `log_2(number_of_polynomials)`, so a
 * count that is not a power of two is presumably not handled — confirm against `log_2`.
 *
 * @param result                  host array of `number_of_polynomials` output pointers
 *                                (copied to the device before launching the kernels)
 * @param polynomials             host array of `number_of_polynomials` coefficient pointers
 *                                (copied to the device before launching the kernels)
 * @param log_polynomial_sizes    log2 sizes of the polynomials; entry 0 is used for all
 * @param number_of_polynomials   number of polynomials; nothing is done when it is <= 0
 * @param out_of_domain_points_x  per-polynomial x coordinates; entry 0 is used for all
 * @param out_of_domain_points_y  per-polynomial y coordinates; entry 0 is used for all
 * @param sample_sizes            per-polynomial point counts; entry 0 is used for all
 */
void evaluate_polynomials_out_of_domain(
    qm31 **result, m31 **polynomials, int *log_polynomial_sizes, int number_of_polynomials,
    qm31 **out_of_domain_points_x, qm31 **out_of_domain_points_y, int *sample_sizes
) {
    // With no polynomials the per-polynomial arrays have no element 0 to read.
    if (number_of_polynomials <= 0) {
        return;
    }

    // In this iteration, we assume all polynomials are of equal size and will be evaluated in the same single point

    qm31 **device_result = clone_to_device<qm31*>(result, number_of_polynomials);
    m31 **device_polynomials = clone_to_device<m31*>(polynomials, number_of_polynomials);
    int log_polynomial_size = log_polynomial_sizes[0];
    int log_number_of_polynomials = log_2(number_of_polynomials);

    eval_polys_at_points(
        device_result, device_polynomials, log_polynomial_size, log_number_of_polynomials,
        out_of_domain_points_x[0], out_of_domain_points_y[0], sample_sizes[0]
    );

    cuda_free_memory(device_result);
    cuda_free_memory(device_polynomials);
}

stwo_gpu_backend/Cargo.toml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ edition = "2021"
55

66
[dependencies]
77
cc = "1.0"
8-
stwo-prover = { git = "https://github.com/starkware-libs/stwo", rev = "ce5b975" }
8+
stwo-prover = { git = "https://github.com/jarnesino/stwo", rev = "fa16181" }
99
itertools = "0.10.5"
1010
rand = "0.8.5"
1111
criterion = "0.4"
@@ -35,5 +35,9 @@ name = "interpolate_columns"
3535
harness = false
3636

3737
[[bench]]
38-
name = "evaluate_columns"
38+
name = "evaluate_polynomials"
39+
harness = false
40+
41+
[[bench]]
42+
name = "evaluate_polynomials_out_of_domain"
3943
harness = false

stwo_gpu_backend/benches/evaluate_columns.rs renamed to stwo_gpu_backend/benches/evaluate_polynomials.rs

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,12 @@ use stwo_prover::core::poly::circle::{CanonicCoset, CircleEvaluation, PolyOps};
88
use stwo_prover::core::poly::BitReversedOrder;
99
use stwo_prover::core::ColumnVec;
1010

11-
const LOG_COLUMN_SIZE: u32 = 10;
12-
const LOG_NUMBER_OF_COLUMNS: usize = 16;
11+
const LOG_COLUMN_SIZE: u32 = 16;
12+
const LOG_NUMBER_OF_COLUMNS: usize = 10;
1313
const LOG_BLOWUP_FACTOR: u32 = 2;
1414

15-
pub fn simd_evaluate_columns(c: &mut Criterion) {
16-
let mut group = c.benchmark_group("evaluate_columns");
15+
pub fn simd_evaluate_polynomials(c: &mut Criterion) {
16+
let mut group = c.benchmark_group("evaluate_polynomials");
1717

1818
let coset = CanonicCoset::new(LOG_COLUMN_SIZE);
1919
let values = (0..coset.size()).map(BaseField::from).collect();
@@ -37,8 +37,8 @@ pub fn simd_evaluate_columns(c: &mut Criterion) {
3737
});
3838
}
3939

40-
pub fn gpu_evaluate_columns(c: &mut Criterion) {
41-
let mut group = c.benchmark_group("evaluate_columns");
40+
pub fn gpu_evaluate_polynomials(c: &mut Criterion) {
41+
let mut group = c.benchmark_group("evaluate_polynomials");
4242

4343
let coset = CanonicCoset::new(LOG_COLUMN_SIZE);
4444
let values = BaseFieldVec::from_vec((0..coset.size()).map(BaseField::from).collect_vec());
@@ -63,7 +63,7 @@ pub fn gpu_evaluate_columns(c: &mut Criterion) {
6363
}
6464

6565
criterion_group!(
66-
name = interpolate_columns;
66+
name = evaluate_polynomials;
6767
config = Criterion::default().sample_size(10);
68-
targets = simd_evaluate_columns, gpu_evaluate_columns);
69-
criterion_main!(interpolate_columns);
68+
targets = simd_evaluate_polynomials, gpu_evaluate_polynomials);
69+
criterion_main!(evaluate_polynomials);
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
use criterion::{criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion};
2+
use itertools::Itertools;
3+
use rand::rngs::SmallRng;
4+
use rand::{Rng, SeedableRng};
5+
use stwo_gpu_backend::cuda::BaseFieldVec;
6+
use stwo_gpu_backend::CudaBackend;
7+
use stwo_prover::core::air::mask::fixed_mask_points;
8+
use stwo_prover::core::backend::simd::SimdBackend;
9+
use stwo_prover::core::backend::{Backend, ColumnOps};
10+
use stwo_prover::core::circle::CirclePoint;
11+
use stwo_prover::core::fields::m31::BaseField;
12+
use stwo_prover::core::fields::qm31::SecureField;
13+
use stwo_prover::core::pcs::TreeVec;
14+
use stwo_prover::core::poly::circle::{CanonicCoset, CircleEvaluation};
15+
use stwo_prover::core::poly::BitReversedOrder;
16+
use stwo_prover::core::ColumnVec;
17+
18+
const LOG_COLUMN_SIZE: u32 = 16;
19+
const LOG_NUMBER_OF_COLUMNS: usize = 10;
20+
const LOG_BLOWUP_FACTOR: u32 = 2;
21+
22+
fn generate_random_point() -> CirclePoint<SecureField> {
23+
let mut rng = SmallRng::seed_from_u64(0);
24+
let x = rng.gen();
25+
let y = rng.gen();
26+
CirclePoint { x, y }
27+
}
28+
29+
fn mask_points(
30+
point: CirclePoint<SecureField>,
31+
number_of_columns: usize,
32+
) -> TreeVec<ColumnVec<Vec<CirclePoint<SecureField>>>> {
33+
TreeVec(vec![fixed_mask_points(
34+
&vec![vec![0_usize]; number_of_columns],
35+
point,
36+
)])
37+
}
38+
39+
fn benchmark_evaluate_polynomials_out_of_domain<B: Backend>(
40+
criterion: &mut Criterion,
41+
coset: CanonicCoset,
42+
values: <B as ColumnOps<BaseField>>::Column,
43+
benchmark_id: &str,
44+
) {
45+
let mut group = criterion.benchmark_group("evaluate_polynomials_out_of_domain");
46+
let number_of_columns = 1 << LOG_NUMBER_OF_COLUMNS;
47+
48+
let circle_evaluation: CircleEvaluation<B, BaseField, BitReversedOrder> =
49+
B::new_canonical_ordered(coset, values);
50+
51+
let interpolation_coset = CanonicCoset::new(LOG_COLUMN_SIZE + LOG_BLOWUP_FACTOR);
52+
let twiddle_tree = B::precompute_twiddles(interpolation_coset.half_coset());
53+
54+
let polynomial = B::interpolate(circle_evaluation, &twiddle_tree);
55+
let polynomials = ColumnVec::from(vec![&polynomial; number_of_columns]);
56+
57+
let point = generate_random_point();
58+
let sample_points = mask_points(point, number_of_columns);
59+
60+
group.bench_function(BenchmarkId::new(benchmark_id, LOG_COLUMN_SIZE), |b| {
61+
b.iter_batched(
62+
|| (polynomials.clone(), sample_points.clone()),
63+
|(polys, sample)| {
64+
B::evaluate_polynomials_out_of_domain(TreeVec::new(vec![polys]), sample)
65+
},
66+
BatchSize::LargeInput,
67+
)
68+
});
69+
}
70+
71+
pub fn simd_evaluate_polynomials_out_of_domain(c: &mut Criterion) {
72+
let coset = CanonicCoset::new(LOG_COLUMN_SIZE);
73+
let values = (0..coset.size()).map(BaseField::from).collect();
74+
75+
benchmark_evaluate_polynomials_out_of_domain::<SimdBackend>(
76+
c,
77+
coset,
78+
values,
79+
"simd evaluate polynomials out of domain",
80+
);
81+
}
82+
83+
pub fn gpu_evaluate_polynomials_out_of_domain(c: &mut Criterion) {
84+
let coset = CanonicCoset::new(LOG_COLUMN_SIZE);
85+
let values = BaseFieldVec::from_vec((0..coset.size()).map(BaseField::from).collect_vec());
86+
87+
benchmark_evaluate_polynomials_out_of_domain::<CudaBackend>(
88+
c,
89+
coset,
90+
values,
91+
"gpu evaluate polynomials out of domain",
92+
);
93+
}
94+
95+
criterion_group!(
96+
name = evaluate_polynomials_out_of_domain;
97+
config = Criterion::default().sample_size(10);
98+
targets = simd_evaluate_polynomials_out_of_domain, gpu_evaluate_polynomials_out_of_domain
99+
);
100+
criterion_main!(evaluate_polynomials_out_of_domain);

0 commit comments

Comments
 (0)