@@ -48,7 +48,6 @@ __global__ void eval_at_point_first_pass(m31 *g_coeffs, qm31 *temp, qm31 *factor
4848 }
4949 factor_idx -= 1 ;
5050 level_size >>= 1 ;
51-
5251 }
5352
5453 if (idx == 0 ) {
@@ -154,3 +153,211 @@ qm31 eval_at_point(m31 *coeffs, int coeffs_size, qm31 point_x, qm31 point_y) {
154153 return result;
155154}
156155
156+ /* Many polynomials */
157+
158+
159+ __global__ void eval_many_at_point_first_pass (m31 **g_coeffs, qm31 *temp, qm31 *factors, int coeffs_size, int factors_size,
160+ int output_offset) {
161+ int idx = threadIdx .x ;
162+
163+ qm31 *output = &temp[output_offset];
164+
165+ int coeffs_per_block = 2 * blockDim .x ;
166+ int blocks_in_poly = max (1 , coeffs_size / coeffs_per_block);
167+ // Thread syncing happens within a block.
168+ // Split the problem to feed them to multiple blocks.
169+ if (coeffs_size >= coeffs_per_block) {
170+ coeffs_size = coeffs_per_block;
171+ }
172+
173+ extern __shared__ m31 s_coeffs[];
174+ extern __shared__ qm31 s_level[];
175+
176+ int poly_index = blockIdx .x / blocks_in_poly;
177+
178+ // A % X == A & (X-1) when X is a power of two
179+ s_coeffs[idx] = g_coeffs[poly_index][(blockIdx .x & (blocks_in_poly - 1 )) * coeffs_size + idx];
180+ s_coeffs[idx + blockDim .x ] = g_coeffs[poly_index][(blockIdx .x & (blocks_in_poly - 1 )) * coeffs_size + idx + blockDim .x ];
181+ __syncthreads ();
182+
183+ int level_size = coeffs_size >> 1 ;
184+ int factor_idx = factors_size - 1 ;
185+
186+ if (idx < level_size) {
187+ m31 alpha = s_coeffs[2 * idx];
188+ m31 beta = s_coeffs[2 * idx + 1 ];
189+ qm31 factor = factors[factor_idx];
190+
191+ qm31 result = {
192+ {add (mul (beta, factor.a .a ), alpha), mul (factor.a .b , beta)},
193+ {mul (beta, factor.b .a ), mul (beta, factor.b .b )}
194+ };
195+ s_level[idx] = result;
196+ }
197+ factor_idx -= 1 ;
198+ level_size >>= 1 ;
199+
200+ while (level_size > 0 ) {
201+ if (idx < level_size) {
202+ __syncthreads ();
203+ qm31 a = s_level[2 * idx];
204+ qm31 b = s_level[2 * idx + 1 ];
205+ __syncthreads ();
206+ s_level[idx] = add (a, mul (b, factors[factor_idx]));
207+ }
208+ factor_idx -= 1 ;
209+ level_size >>= 1 ;
210+ }
211+
212+ if (idx == 0 ) {
213+ output[blockIdx .x ] = s_level[0 ];
214+ }
215+ }
216+
217+ __global__
218+ void eval_many_at_point_second_pass (qm31 *temp, qm31 *factors, int level_size, int factor_offset, int level_offset,
219+ int output_offset, int results_per_block) {
220+ int idx = threadIdx .x ;
221+
222+ qm31 *level = &temp[level_offset];
223+ qm31 *output = &temp[output_offset];
224+
225+ // Thread syncing happens within a block.
226+ // Split the problem to feed them to multiple blocks.
227+ if (level_size >= 2 * blockDim .x ) {
228+ level_size = 2 * blockDim .x ;
229+ }
230+
231+ extern __shared__ qm31 s_level[];
232+
233+ s_level[idx] = level[2 * blockIdx .x * blockDim .x + idx];
234+ s_level[idx + blockDim .x ] = level[2 * blockIdx .x * blockDim .x + idx + blockDim .x ];
235+
236+ level_size >>= 1 ;
237+
238+ int factor_idx = factor_offset;
239+
240+ while (level_size >= results_per_block) {
241+ if (idx < level_size) {
242+ __syncthreads ();
243+ qm31 a = s_level[2 * idx];
244+ qm31 b = s_level[2 * idx + 1 ];
245+ __syncthreads ();
246+ s_level[idx] = add (a, mul (b, factors[factor_idx]));
247+ }
248+ factor_idx -= 1 ;
249+ level_size >>= 1 ;
250+ }
251+
252+ if (idx < results_per_block) {
253+ output[blockIdx .x * results_per_block + idx] = s_level[idx];
254+ }
255+ }
256+
257+ __global__
258+ void copy_result_for_polynomial (qm31 **result, qm31 *temp, int number_of_polynomials) {
259+ int global_thread_index = blockIdx .x * blockDim .x + threadIdx .x ;
260+
261+ if (global_thread_index < number_of_polynomials) {
262+ result[global_thread_index][0 ] = temp[global_thread_index];
263+ }
264+ }
265+
266+ void eval_polys_at_point (
267+ qm31 **result, m31 **polynomials, int log_number_of_polynomials, int log_coeffs_size, qm31 point_x, qm31 point_y
268+ ) {
269+ int coeffs_size = 1 << log_coeffs_size;
270+ int block_dim = min (256 , coeffs_size);
271+ int coeffs_per_block = block_dim * 2 ;
272+
273+ qm31 *host_mappings = (qm31 *) malloc (sizeof (qm31) * log_coeffs_size);
274+ host_mappings[log_coeffs_size - 1 ] = point_y;
275+ host_mappings[log_coeffs_size - 2 ] = point_x;
276+ qm31 x = point_x;
277+ for (int i = 2 ; i < log_coeffs_size; i += 1 ) {
278+ x = sub (mul (qm31{cm31{2 , 0 }, cm31{0 , 0 }}, mul (x, x)), qm31{cm31{1 , 0 }, cm31{0 , 0 }});
279+ host_mappings[log_coeffs_size - 1 - i] = x;
280+ }
281+
282+ int number_of_polynomials = 1 << log_number_of_polynomials;
283+ int total_number_of_coeffs = coeffs_size * number_of_polynomials;
284+ int temp_memory_size = 0 ;
285+ int size = total_number_of_coeffs;
286+ while (size > number_of_polynomials) {
287+ size = (size + coeffs_per_block - 1 ) / coeffs_per_block;
288+ temp_memory_size += size;
289+ }
290+
291+ temp_memory_size = max (temp_memory_size, number_of_polynomials);
292+
293+ qm31 *temp = cuda_malloc<qm31>(temp_memory_size);
294+ qm31 *device_mappings = clone_to_device<qm31>(host_mappings, log_coeffs_size);
295+
296+ free (host_mappings);
297+
298+ // First pass
299+ int num_blocks = max (number_of_polynomials, ((total_number_of_coeffs >> 1 ) + block_dim - 1 ) / block_dim);
300+ int shared_memory_bytes = coeffs_per_block * 4 + coeffs_per_block * 8 ;
301+ int output_offset = temp_memory_size - num_blocks;
302+
303+ eval_many_at_point_first_pass<<<num_blocks, block_dim, shared_memory_bytes>>> (polynomials, temp, device_mappings, coeffs_size,
304+ log_coeffs_size, output_offset);
305+
306+ // Second pass
307+ int mappings_offset = log_coeffs_size - 1 ;
308+ int level_offset = output_offset;
309+ while (num_blocks > number_of_polynomials) {
310+ mappings_offset -= 9 ;
311+ int new_num_blocks = ((num_blocks >> 1 ) + block_dim - 1 ) / block_dim;
312+ int number_of_results = max (number_of_polynomials, new_num_blocks);
313+ int results_per_block = number_of_results / new_num_blocks;
314+ shared_memory_bytes = coeffs_per_block * 4 * 4 ;
315+ output_offset = level_offset - new_num_blocks;
316+ eval_many_at_point_second_pass<<<new_num_blocks, block_dim, shared_memory_bytes>>> (temp, device_mappings, num_blocks,
317+ mappings_offset, level_offset,
318+ output_offset, results_per_block);
319+ num_blocks = new_num_blocks;
320+ level_offset = output_offset;
321+ }
322+
323+ cudaDeviceSynchronize ();
324+
325+ num_blocks = (number_of_polynomials + block_dim - 1 ) / block_dim;
326+ copy_result_for_polynomial<<<num_blocks, block_dim>>> (
327+ result, temp, number_of_polynomials
328+ );
329+
330+ cuda_free_memory (temp);
331+ cuda_free_memory (device_mappings);
332+ }
333+
334+ void eval_polys_at_points (
335+ qm31 **result, m31 **polynomials, int log_polynomial_size, int log_number_of_polynomials,
336+ qm31 *points_x, qm31 *points_y, int sample_size
337+ ) {
338+ for (int point_index = 0 ; point_index < sample_size; point_index++) {
339+ qm31 point_x = points_x[point_index];
340+ qm31 point_y = points_y[point_index];
341+ eval_polys_at_point (result, polynomials, log_number_of_polynomials, log_polynomial_size, point_x, point_y);
342+ }
343+ }
344+
345+ void evaluate_polynomials_out_of_domain (
346+ qm31 **result, m31 **polynomials, int *log_polynomial_sizes, int number_of_polynomials,
347+ qm31 **out_of_domain_points_x, qm31 **out_of_domain_points_y, int *sample_sizes
348+ ) {
349+ // In this iteration, we assume all polynomials are of equal size and will be evaluated in the same single point
350+
351+ qm31 **device_result = clone_to_device<qm31*>(result, number_of_polynomials);
352+ m31 **device_polynomials = clone_to_device<m31*>(polynomials, number_of_polynomials);
353+ int log_polynomial_size = log_polynomial_sizes[0 ];
354+ int log_number_of_polynomials = log_2 (number_of_polynomials);
355+
356+ eval_polys_at_points (
357+ device_result, device_polynomials, log_polynomial_size, log_number_of_polynomials,
358+ out_of_domain_points_x[0 ], out_of_domain_points_y[0 ], sample_sizes[0 ]
359+ );
360+
361+ cuda_free_memory (device_result);
362+ cuda_free_memory (device_polynomials);
363+ };
0 commit comments