Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
polynomial_arithmetic.cpp
Go to the documentation of this file.
1// === AUDIT STATUS ===
2// internal: { status: not started, auditors: [], date: YYYY-MM-DD }
3// external_1: { status: not started, auditors: [], date: YYYY-MM-DD }
4// external_2: { status: not started, auditors: [], date: YYYY-MM-DD }
5// =====================
6
#include <math.h>
#include <memory.h>

#include <memory>
#include <vector>
16
18
namespace {

// Returns a process-wide scratch buffer holding at least `num_elements` elements of Fr.
// The buffer is cached in a function-local static and only ever grows: a request larger than the
// current capacity reallocates, smaller requests reuse the existing allocation. Callers receive a
// shared_ptr so the buffer stays alive even if a later call reallocates the cached pointer.
// NOTE(review): the static cache is mutated without synchronization, so concurrent calls that
// trigger a resize race on `working_memory`/`current_size` — confirm single-threaded acquisition.
template <typename Fr> std::shared_ptr<Fr[]> get_scratch_space(const size_t num_elements)
{
    static std::shared_ptr<Fr[]> working_memory = nullptr;
    static size_t current_size = 0;
    if (num_elements > current_size) {
        working_memory = std::make_shared<Fr[]>(num_elements);
        current_size = num_elements;
    }
    return working_memory;
}

} // namespace
33
// Reverses the lowest `bit_length` bits of `x` (bit_length must be in [1, 32]).
// Uses the classic butterfly network: swap adjacent bits, then 2-bit pairs, nibbles, bytes and
// 16-bit halves to reverse the full word, then shift the reversed value down so that only the
// low `bit_length` bits remain.
inline uint32_t reverse_bits(uint32_t x, uint32_t bit_length)
{
    uint32_t v = x;
    v = ((v >> 1) & 0x55555555U) | ((v & 0x55555555U) << 1);
    v = ((v >> 2) & 0x33333333U) | ((v & 0x33333333U) << 2);
    v = ((v >> 4) & 0x0f0f0f0fU) | ((v & 0x0f0f0f0fU) << 4);
    v = ((v >> 8) & 0x00ff00ffU) | ((v & 0x00ff00ffU) << 8);
    v = (v >> 16) | (v << 16);
    return v >> (32 - bit_length);
}
42
// True iff `x` has exactly one set bit (i.e. x is a non-zero power of two).
// Clearing the lowest set bit of a power of two yields zero; zero itself is rejected explicitly.
inline bool is_power_of_two(uint64_t x)
{
    return (x != 0) && ((x & (x - 1)) == 0);
}
47
48template <typename Fr>
50 Fr* target,
51 const EvaluationDomain<Fr>& domain,
52 const Fr& generator_start,
53 const Fr& generator_shift,
54 const size_t generator_size)
55{
56 parallel_for(domain.num_threads, [&](size_t j) {
57 Fr thread_shift = generator_shift.pow(static_cast<uint64_t>(j * (generator_size / domain.num_threads)));
58 Fr work_generator = generator_start * thread_shift;
59 const size_t offset = j * (generator_size / domain.num_threads);
60 const size_t end = offset + (generator_size / domain.num_threads);
61 for (size_t i = offset; i < end; ++i) {
62 target[i] = coeffs[i] * work_generator;
63 work_generator *= generator_shift;
64 }
65 });
66}
/**
 * @brief Fill `subgroup_roots` with the 2^log2_subgroup_size elements (g.w^i)^n, where g is the
 *        coset generator of `src_domain`, n = src_domain.size, and w is a primitive
 *        2^log2_subgroup_size'th root of unity.
 *
 * @param subgroup_roots output array; must have room for 2^log2_subgroup_size elements.
 */
template <typename Fr>
    requires SupportsFFT<Fr>
void compute_multiplicative_subgroup(const size_t log2_subgroup_size,
                                     const EvaluationDomain<Fr>& src_domain,
                                     Fr* subgroup_roots)
{
    size_t subgroup_size = 1UL << log2_subgroup_size;
    // Step 1: get the primitive 2^log2_subgroup_size'th root of unity
    Fr subgroup_root = Fr::get_root_of_unity(log2_subgroup_size);

    // Step 2: compute the cofactor term g^n (square g log2_size times: g^(2^log2_size) = g^n)
    Fr accumulator = src_domain.generator;
    for (size_t i = 0; i < src_domain.log2_size; ++i) {
        accumulator.self_sqr();
    }

    // Step 3: fill array with subgroup_size values of (g.X)^n, scaled by the cofactor
    // ((g.w^i)^n = g^n * (w^n)^i, and the running product below multiplies by subgroup_root each
    // step — note this relies on w^n = w, i.e. n ≡ 1 mod subgroup_size — TODO confirm invariant)
    subgroup_roots[0] = accumulator;
    for (size_t i = 1; i < subgroup_size; ++i) {
        subgroup_roots[i] = subgroup_roots[i - 1] * subgroup_root;
    }
}
99
/**
 * @brief Parallel radix-2 Cooley-Tukey FFT over `domain`, where the size-domain.size coefficient
 *        buffer is split into coeffs.size() equally-sized contiguous shards. Results are written
 *        back into the shards (in-place from the caller's perspective).
 *
 * @param coeffs     the shards; coeffs.size() and domain.size / coeffs.size() must be powers of two
 * @param domain     evaluation domain (size, log2_size, thread partitioning)
 * @param root_table per-round root-of-unity lookup tables (round r holds the 2^(r+1)'th roots);
 *                   the unnamed Fr parameter is unused, kept for interface compatibility
 */
template <typename Fr>
    requires SupportsFFT<Fr>
void fft_inner_parallel(std::vector<Fr*> coeffs,
                        const EvaluationDomain<Fr>& domain,
                        const Fr&,
                        const std::vector<Fr*>& root_table)
{
    // Shared scratch buffer holds intermediate butterfly results for all rounds but the last.
    auto scratch_space_ptr = get_scratch_space<Fr>(domain.size);
    auto scratch_space = scratch_space_ptr.get();

    const size_t num_polys = coeffs.size();
    BB_ASSERT(is_power_of_two(num_polys));
    const size_t poly_size = domain.size / num_polys;
    BB_ASSERT(is_power_of_two(poly_size));
    const size_t poly_mask = poly_size - 1;
    const size_t log2_poly_size = (size_t)numeric::get_msb(poly_size);

    // First round: fuse the bit-reversal permutation with the first butterfly layer. The first
    // layer's twiddle factors are all 1, so each pair reduces to (a + b, a - b).
    parallel_for(domain.num_threads, [&](size_t j) {
        Fr temp_1;
        Fr temp_2;
        for (size_t i = (j * domain.thread_size); i < ((j + 1) * domain.thread_size); i += 2) {
            uint32_t next_index_1 = (uint32_t)reverse_bits((uint32_t)i + 2, (uint32_t)domain.log2_size);
            uint32_t next_index_2 = (uint32_t)reverse_bits((uint32_t)i + 3, (uint32_t)domain.log2_size);
            // NOTE(review): these prefetches index the vector of shard *pointers* with an element
            // index in [0, domain.size) — this looks copied from the flat-array variant of this
            // function, where `coeffs` is the element buffer itself. Verify intended target.
            __builtin_prefetch(&coeffs[next_index_1]);
            __builtin_prefetch(&coeffs[next_index_2]);

            uint32_t swap_index_1 = (uint32_t)reverse_bits((uint32_t)i, (uint32_t)domain.log2_size);
            uint32_t swap_index_2 = (uint32_t)reverse_bits((uint32_t)i + 1, (uint32_t)domain.log2_size);

            // Split a global element index into (shard index, index within shard).
            size_t poly_idx_1 = swap_index_1 >> log2_poly_size;
            size_t elem_idx_1 = swap_index_1 & poly_mask;
            size_t poly_idx_2 = swap_index_2 >> log2_poly_size;
            size_t elem_idx_2 = swap_index_2 & poly_mask;

            Fr::__copy(coeffs[poly_idx_1][elem_idx_1], temp_1);
            Fr::__copy(coeffs[poly_idx_2][elem_idx_2], temp_2);
            scratch_space[i + 1] = temp_1 - temp_2;
            scratch_space[i] = temp_1 + temp_2;
        }
    });

    // hard code exception for when the domain size is tiny - we won't execute the next loop, so need to manually
    // reduce + copy
    // NOTE(review): this writes coeffs[0][1], which assumes a single shard (num_polys == 1) when
    // domain.size == 2 — with two size-1 shards the result belongs in coeffs[1][0]. Confirm callers
    // only reach this path with one shard.
    if (domain.size <= 2) {
        coeffs[0][0] = scratch_space[0];
        coeffs[0][1] = scratch_space[1];
    }

    // outer FFT loop: one pass per remaining butterfly round, doubling the block size each time
    for (size_t m = 2; m < (domain.size); m <<= 1) {
        parallel_for(domain.num_threads, [&](size_t j) {
            Fr temp;

            // Ok! So, what's going on here? This is the inner loop of the FFT algorithm, and we want to break it
            // out into multiple independent threads. For `num_threads`, each thread will evaluate `domain.size /
            // num_threads` of the polynomial. The actual iteration length will be half of this, because we leverage
            // the fact that \omega^{n/2} = -\omega (where \omega is a root of unity)

            // Here, `start` and `end` are used as our iterator limits, so that we can use our iterator `i` to
            // directly access the roots of unity lookup table
            const size_t start = j * (domain.thread_size >> 1);
            const size_t end = (j + 1) * (domain.thread_size >> 1);

            // For all but the last round of our FFT, the roots of unity that we need, will be a subset of our
            // lookup table. e.g. for a size 2^n FFT, the 2^n'th roots create a multiplicative subgroup of order 2^n
            // the 1st round will use the roots from the multiplicative subgroup of order 2 : the 2'th roots of
            // unity the 2nd round will use the roots from the multiplicative subgroup of order 4 : the 4'th
            // roots of unity
            // i.e. each successive FFT round will double the set of roots that we need to index.
            // We have already laid out the `root_table` container so that each FFT round's roots are linearly
            // ordered in memory. For all FFT rounds, the number of elements we're iterating over is greater than
            // the size of our lookup table. We need to access this table in a cyclical fashion - i.e. for a subgroup
            // of size x, the first x iterations will index the subgroup elements in order, then for the next x
            // iterations, we loop back to the start.

            // We could implement the algorithm by having 2 nested loops (where the inner loop iterates over the
            // root table), but we want to flatten this out - as for the first few rounds, the inner loop will be
            // tiny and we'll have quite a bit of unnecessary branch checks. For each iteration of our flattened
            // loop, indexed by `i`, the element of the root table we need to access will be `i % (current round
            // subgroup size)`. Given that each round subgroup size is `m`, which is a power of 2, we can index the
            // root table with a very cheap `i & (m - 1)`. Which is why we have this odd `block_mask` variable
            const size_t block_mask = m - 1;

            // The next problem to tackle, is we now need to efficiently index the polynomial element in
            // `scratch_space` in our flattened loop. If we used nested loops, the outer loop (e.g. `y`) iterates
            // from 0 to 'domain size', in steps of 2 * m, with the inner loop (e.g. `z`) iterating from 0 to m. We
            // have our inner loop indexer with `i & (m - 1)`. We need to add to this our outer loop indexer, which
            // is equivalent to taking our indexer `i`, masking out the bits used in the 'inner loop', and doubling
            // the result. i.e. polynomial indexer = (i & (m - 1)) + ((i & ~(m - 1)) >> 1). To simplify this, we
            // cache index_mask = ~block_mask, meaning that our indexer is just `((i & index_mask) << 1 + (i &
            // block_mask)`
            const size_t index_mask = ~block_mask;

            // `round_roots` fetches the pointer to this round's lookup table. We use `numeric::get_msb(m) - 1` as
            // our indexer, because we don't store the precomputed root values for the 1st round (because they're
            // all 1).
            const Fr* round_roots = root_table[static_cast<size_t>(numeric::get_msb(m)) - 1];

            // Finally, we want to treat the final round differently from the others,
            // so that we can reduce out of our 'coarse' reduction and store the output in `coeffs` instead of
            // `scratch_space`
            if (m != (domain.size >> 1)) {
                // intermediate round: butterflies stay in scratch_space
                for (size_t i = start; i < end; ++i) {
                    size_t k1 = (i & index_mask) << 1;
                    size_t j1 = i & block_mask;
                    temp = round_roots[j1] * scratch_space[k1 + j1 + m];
                    scratch_space[k1 + j1 + m] = scratch_space[k1 + j1] - temp;
                    scratch_space[k1 + j1] += temp;
                }
            } else {
                // final round: write butterfly outputs directly into the caller's shards
                for (size_t i = start; i < end; ++i) {
                    size_t k1 = (i & index_mask) << 1;
                    size_t j1 = i & block_mask;

                    size_t poly_idx_1 = (k1 + j1) >> log2_poly_size;
                    size_t elem_idx_1 = (k1 + j1) & poly_mask;
                    size_t poly_idx_2 = (k1 + j1 + m) >> log2_poly_size;
                    size_t elem_idx_2 = (k1 + j1 + m) & poly_mask;

                    temp = round_roots[j1] * scratch_space[k1 + j1 + m];
                    coeffs[poly_idx_2][elem_idx_2] = scratch_space[k1 + j1] - temp;
                    coeffs[poly_idx_1][elem_idx_1] = scratch_space[k1 + j1] + temp;
                }
            }
        });
    }
}
227
228template <typename Fr>
229 requires SupportsFFT<Fr>
231 Fr* coeffs, Fr* target, const EvaluationDomain<Fr>& domain, const Fr&, const std::vector<Fr*>& root_table)
232{
233 parallel_for(domain.num_threads, [&](size_t j) {
234 Fr temp_1;
235 Fr temp_2;
236 for (size_t i = (j * domain.thread_size); i < ((j + 1) * domain.thread_size); i += 2) {
237 uint32_t next_index_1 = (uint32_t)reverse_bits((uint32_t)i + 2, (uint32_t)domain.log2_size);
238 uint32_t next_index_2 = (uint32_t)reverse_bits((uint32_t)i + 3, (uint32_t)domain.log2_size);
239 __builtin_prefetch(&coeffs[next_index_1]);
240 __builtin_prefetch(&coeffs[next_index_2]);
241
242 uint32_t swap_index_1 = (uint32_t)reverse_bits((uint32_t)i, (uint32_t)domain.log2_size);
243 uint32_t swap_index_2 = (uint32_t)reverse_bits((uint32_t)i + 1, (uint32_t)domain.log2_size);
244
245 Fr::__copy(coeffs[swap_index_1], temp_1);
246 Fr::__copy(coeffs[swap_index_2], temp_2);
247 target[i + 1] = temp_1 - temp_2;
248 target[i] = temp_1 + temp_2;
249 }
250 });
251
252 // hard code exception for when the domain size is tiny - we won't execute the next loop, so need to manually
253 // reduce + copy
254 if (domain.size <= 2) {
255 coeffs[0] = target[0];
256 coeffs[1] = target[1];
257 }
258
259 // outer FFT loop
260 for (size_t m = 2; m < (domain.size); m <<= 1) {
261 parallel_for(domain.num_threads, [&](size_t j) {
262 Fr temp;
263
264 // Ok! So, what's going on here? This is the inner loop of the FFT algorithm, and we want to break it
265 // out into multiple independent threads. For `num_threads`, each thread will evaluation `domain.size /
266 // num_threads` of the polynomial. The actual iteration length will be half of this, because we leverage
267 // the fact that \omega^{n/2} = -\omega (where \omega is a root of unity)
268
269 // Here, `start` and `end` are used as our iterator limits, so that we can use our iterator `i` to
270 // directly access the roots of unity lookup table
271 const size_t start = j * (domain.thread_size >> 1);
272 const size_t end = (j + 1) * (domain.thread_size >> 1);
273
274 // For all but the last round of our FFT, the roots of unity that we need, will be a subset of our
275 // lookup table. e.g. for a size 2^n FFT, the 2^n'th roots create a multiplicative subgroup of order 2^n
276 // the 1st round will use the roots from the multiplicative subgroup of order 2 : the 2'th roots of
277 // unity the 2nd round will use the roots from the multiplicative subgroup of order 4 : the 4'th
278 // roots of unity
279 // i.e. each successive FFT round will double the set of roots that we need to index.
280 // We have already laid out the `root_table` container so that each FFT round's roots are linearly
281 // ordered in memory. For all FFT rounds, the number of elements we're iterating over is greater than
282 // the size of our lookup table. We need to access this table in a cyclical fasion - i.e. for a subgroup
283 // of size x, the first x iterations will index the subgroup elements in order, then for the next x
284 // iterations, we loop back to the start.
285
286 // We could implement the algorithm by having 2 nested loops (where the inner loop iterates over the
287 // root table), but we want to flatten this out - as for the first few rounds, the inner loop will be
288 // tiny and we'll have quite a bit of unneccesary branch checks For each iteration of our flattened
289 // loop, indexed by `i`, the element of the root table we need to access will be `i % (current round
290 // subgroup size)` Given that each round subgroup size is `m`, which is a power of 2, we can index the
291 // root table with a very cheap `i & (m - 1)` Which is why we have this odd `block_mask` variable
292 const size_t block_mask = m - 1;
293
294 // The next problem to tackle, is we now need to efficiently index the polynomial element in
295 // `scratch_space` in our flattened loop If we used nested loops, the outer loop (e.g. `y`) iterates
296 // from 0 to 'domain size', in steps of 2 * m, with the inner loop (e.g. `z`) iterating from 0 to m. We
297 // have our inner loop indexer with `i & (m - 1)`. We need to add to this our outer loop indexer, which
298 // is equivalent to taking our indexer `i`, masking out the bits used in the 'inner loop', and doubling
299 // the result. i.e. polynomial indexer = (i & (m - 1)) + ((i & ~(m - 1)) >> 1) To simplify this, we
300 // cache index_mask = ~block_mask, meaning that our indexer is just `((i & index_mask) << 1 + (i &
301 // block_mask)`
302 const size_t index_mask = ~block_mask;
303
304 // `round_roots` fetches the pointer to this round's lookup table. We use `numeric::get_msb(m) - 1` as
305 // our indexer, because we don't store the precomputed root values for the 1st round (because they're
306 // all 1).
307 const Fr* round_roots = root_table[static_cast<size_t>(numeric::get_msb(m)) - 1];
308
309 // Finally, we want to treat the final round differently from the others,
310 // so that we can reduce out of our 'coarse' reduction and store the output in `coeffs` instead of
311 // `scratch_space`
312 for (size_t i = start; i < end; ++i) {
313 size_t k1 = (i & index_mask) << 1;
314 size_t j1 = i & block_mask;
315 temp = round_roots[j1] * target[k1 + j1 + m];
316 target[k1 + j1 + m] = target[k1 + j1] - temp;
317 target[k1 + j1] += temp;
318 }
319 });
320 }
321}
322
323template <typename Fr>
324 requires SupportsFFT<Fr>
325void fft(Fr* coeffs, const EvaluationDomain<Fr>& domain)
326{
327 fft_inner_parallel({ coeffs }, domain, domain.root, domain.get_round_roots());
328}
329
/**
 * @brief FFT of `coeffs` over `domain`, with the evaluations written into `target`.
 *        `target` is also used as the working buffer (see fft_inner_parallel; for
 *        domain.size <= 2 the input buffer is additionally written).
 */
template <typename Fr>
    requires SupportsFFT<Fr>
void fft(Fr* coeffs, Fr* target, const EvaluationDomain<Fr>& domain)
{
    fft_inner_parallel(coeffs, target, domain, domain.root, domain.get_round_roots());
}
336
337template <typename Fr>
338 requires SupportsFFT<Fr>
339void fft(std::vector<Fr*> coeffs, const EvaluationDomain<Fr>& domain)
340{
341 fft_inner_parallel<Fr>(coeffs, domain.size, domain.root, domain.get_round_roots());
342}
343
344template <typename Fr>
345 requires SupportsFFT<Fr>
346void ifft(Fr* coeffs, const EvaluationDomain<Fr>& domain)
347{
348 fft_inner_parallel({ coeffs }, domain, domain.root_inverse, domain.get_inverse_round_roots());
350 coeffs[i] *= domain.domain_inverse;
352}
353
354template <typename Fr>
355 requires SupportsFFT<Fr>
356void ifft(Fr* coeffs, Fr* target, const EvaluationDomain<Fr>& domain)
357{
358 fft_inner_parallel(coeffs, target, domain, domain.root_inverse, domain.get_inverse_round_roots());
360 target[i] *= domain.domain_inverse;
362}
363
364template <typename Fr>
365 requires SupportsFFT<Fr>
366void ifft(std::vector<Fr*> coeffs, const EvaluationDomain<Fr>& domain)
367{
368 fft_inner_parallel(coeffs, domain, domain.root_inverse, domain.get_inverse_round_roots());
369
370 const size_t num_polys = coeffs.size();
371 BB_ASSERT(is_power_of_two(num_polys));
372 const size_t poly_size = domain.size / num_polys;
373 BB_ASSERT(is_power_of_two(poly_size));
374 const size_t poly_mask = poly_size - 1;
375 const size_t log2_poly_size = (size_t)numeric::get_msb(poly_size);
376
378 coeffs[i >> log2_poly_size][i & poly_mask] *= domain.domain_inverse;
380}
381
/**
 * @brief In-place coset FFT: scale coefficient i by generator^i, then FFT, yielding evaluations
 *        over the coset g.H instead of the subgroup H.
 */
template <typename Fr>
    requires SupportsFFT<Fr>
void coset_fft(Fr* coeffs, const EvaluationDomain<Fr>& domain)
{
    scale_by_generator(coeffs, coeffs, domain, Fr::one(), domain.generator, domain.generator_size);
    fft(coeffs, domain);
}
389
390template <typename Fr>
391 requires SupportsFFT<Fr>
392void coset_fft(Fr* coeffs, Fr* target, const EvaluationDomain<Fr>& domain)
393{
394 scale_by_generator(coeffs, target, domain, Fr::one(), domain.generator, domain.generator_size);
395 fft(coeffs, target, domain);
396}
397
398template <typename Fr>
399 requires SupportsFFT<Fr>
400void coset_fft(std::vector<Fr*> coeffs, const EvaluationDomain<Fr>& domain)
401{
402 const size_t num_polys = coeffs.size();
403 BB_ASSERT(is_power_of_two(num_polys));
404 const size_t poly_size = domain.size / num_polys;
405 const Fr generator_pow_n = domain.generator.pow(poly_size);
406 Fr generator_start = 1;
407
408 for (size_t i = 0; i < num_polys; i++) {
409 scale_by_generator(coeffs[i], coeffs[i], domain, generator_start, domain.generator, poly_size);
410 generator_start *= generator_pow_n;
411 }
412 fft(coeffs, domain);
413}
414
415template <typename Fr>
416 requires SupportsFFT<Fr>
417void coset_fft(Fr* coeffs,
418 const EvaluationDomain<Fr>& domain,
420 const size_t domain_extension)
421{
422 const size_t log2_domain_extension = static_cast<size_t>(numeric::get_msb(domain_extension));
423 Fr primitive_root = Fr::get_root_of_unity(domain.log2_size + log2_domain_extension);
424
425 // Fr work_root = domain.generator.sqr();
426 // work_root = domain.generator.sqr();
427 auto scratch_space_ptr = get_scratch_space<Fr>(domain.size * domain_extension);
428 auto scratch_space = scratch_space_ptr.get();
429
430 // Fr* temp_memory = static_cast<Fr*>(aligned_alloc(64, sizeof(Fr) * domain.size *
431 // domain_extension));
432
433 std::vector<Fr> coset_generators(domain_extension);
434 coset_generators[0] = domain.generator;
435 for (size_t i = 1; i < domain_extension; ++i) {
436 coset_generators[i] = coset_generators[i - 1] * primitive_root;
437 }
438 for (size_t i = domain_extension - 1; i < domain_extension; --i) {
439 scale_by_generator(coeffs, coeffs + (i * domain.size), domain, Fr::one(), coset_generators[i], domain.size);
440 }
441
442 for (size_t i = 0; i < domain_extension; ++i) {
443 fft_inner_parallel(coeffs + (i * domain.size),
444 scratch_space + (i * domain.size),
445 domain,
446 domain.root,
447 domain.get_round_roots());
448 }
449
450 if (domain_extension == 4) {
451 parallel_for(domain.num_threads, [&](size_t j) {
452 const size_t start = j * domain.thread_size;
453 const size_t end = (j + 1) * domain.thread_size;
454 for (size_t i = start; i < end; ++i) {
455 Fr::__copy(scratch_space[i], coeffs[(i << 2UL)]);
456 Fr::__copy(scratch_space[i + (1UL << domain.log2_size)], coeffs[(i << 2UL) + 1UL]);
457 Fr::__copy(scratch_space[i + (2UL << domain.log2_size)], coeffs[(i << 2UL) + 2UL]);
458 Fr::__copy(scratch_space[i + (3UL << domain.log2_size)], coeffs[(i << 2UL) + 3UL]);
459 }
460 });
461
462 for (size_t i = 0; i < domain.size; ++i) {
463 for (size_t j = 0; j < domain_extension; ++j) {
464 Fr::__copy(scratch_space[i + (j << domain.log2_size)], coeffs[(i << log2_domain_extension) + j]);
465 }
466 }
467 } else {
468 for (size_t i = 0; i < domain.size; ++i) {
469 for (size_t j = 0; j < domain_extension; ++j) {
470 Fr::__copy(scratch_space[i + (j << domain.log2_size)], coeffs[(i << log2_domain_extension) + j]);
471 }
472 }
473 }
474}
475
/**
 * @brief In-place inverse of coset_fft: inverse FFT, then strip the generator powers by scaling
 *        coefficient i with generator_inverse^i.
 */
template <typename Fr>
    requires SupportsFFT<Fr>
void coset_ifft(Fr* coeffs, const EvaluationDomain<Fr>& domain)
{
    ifft(coeffs, domain);
    scale_by_generator(coeffs, coeffs, domain, Fr::one(), domain.generator_inverse, domain.size);
}
483
484template <typename Fr>
485 requires SupportsFFT<Fr>
486void coset_ifft(std::vector<Fr*> coeffs, const EvaluationDomain<Fr>& domain)
487{
488 ifft(coeffs, domain);
489
490 const size_t num_polys = coeffs.size();
491 BB_ASSERT(is_power_of_two(num_polys));
492 const size_t poly_size = domain.size / num_polys;
493 const Fr generator_inv_pow_n = domain.generator_inverse.pow(poly_size);
494 Fr generator_start = 1;
495
496 for (size_t i = 0; i < num_polys; i++) {
497 scale_by_generator(coeffs[i], coeffs[i], domain, generator_start, domain.generator_inverse, poly_size);
498 generator_start *= generator_inv_pow_n;
499 }
500}
501
502template <typename Fr> Fr evaluate(const Fr* coeffs, const Fr& z, const size_t n)
503{
504 const size_t num_threads = get_num_cpus();
505 std::vector<Fr> evaluations(num_threads, Fr::zero());
506 parallel_for([&](const ThreadChunk& chunk) {
507 // parallel_for with ThreadChunk uses get_num_cpus() threads
508 BB_ASSERT_EQ(chunk.total_threads, evaluations.size());
509 auto range = chunk.range(n);
510 if (range.empty()) {
511 return;
512 }
513 size_t start = *range.begin();
514 Fr z_acc = z.pow(static_cast<uint64_t>(start));
515 for (size_t i : range) {
516 Fr work_var = z_acc * coeffs[i];
517 evaluations[chunk.thread_index] += work_var;
518 z_acc *= z;
519 }
520 });
521
522 Fr r = Fr::zero();
523 for (const auto& eval : evaluations) {
524 r += eval;
525 }
526 return r;
527}
528
529template <typename Fr> Fr evaluate(const std::vector<Fr*> coeffs, const Fr& z, const size_t large_n)
530{
531 const size_t num_polys = coeffs.size();
532 const size_t poly_size = large_n / num_polys;
533 BB_ASSERT(is_power_of_two(poly_size));
534 const size_t log2_poly_size = (size_t)numeric::get_msb(poly_size);
535 const size_t num_threads = get_num_cpus();
536 std::vector<Fr> evaluations(num_threads, Fr::zero());
537 parallel_for([&](const ThreadChunk& chunk) {
538 // parallel_for with ThreadChunk uses get_num_cpus() threads
539 BB_ASSERT_EQ(chunk.total_threads, evaluations.size());
540 auto range = chunk.range(large_n);
541 if (range.empty()) {
542 return;
543 }
544 size_t start = *range.begin();
545 Fr z_acc = z.pow(static_cast<uint64_t>(start));
546 for (size_t i : range) {
547 Fr work_var = z_acc * coeffs[i >> log2_poly_size][i & (poly_size - 1)];
548 evaluations[chunk.thread_index] += work_var;
549 z_acc *= z;
550 }
551 });
552
553 Fr r = Fr::zero();
554 for (const auto& eval : evaluations) {
555 r += eval;
556 }
557 return r;
558}
559
560// Computes r = \sum_{i=0}^{num_coeffs-1} (L_{i+1}(ʓ).f_i)
561//
562// (ʓ^n - 1)
563// Start with L_1(ʓ) = ---------
564// n.(ʓ - 1)
565//
566// ʓ^n - 1
567// L_i(z) = L_1(ʓ.ω^{1-i}) = ------------------
568// n.(ʓ.ω^{1-i)} - 1)
569//
571 const size_t num_coeffs,
572 const fr& z,
573 const EvaluationDomain<fr>& domain)
574{
575 fr* denominators = static_cast<fr*>(aligned_alloc(64, sizeof(fr) * num_coeffs));
576
577 fr numerator = z;
578 for (size_t i = 0; i < domain.log2_size; ++i) {
579 numerator.self_sqr();
580 }
581 numerator -= fr::one();
582 numerator *= domain.domain_inverse; // (ʓ^n - 1) / n
583
584 denominators[0] = z - fr::one();
585 fr work_root = domain.root_inverse; // ω^{-1}
586 for (size_t i = 1; i < num_coeffs; ++i) {
587 denominators[i] =
588 work_root * z; // denominators[i] will correspond to L_[i+1] (since our 'commented maths' notation indexes
589 // L_i from 1). So ʓ.ω^{-i} = ʓ.ω^{1-(i+1)} is correct for L_{i+1}.
590 denominators[i] -= fr::one(); // ʓ.ω^{-i} - 1
591 work_root *= domain.root_inverse;
592 }
593
594 fr::batch_invert(denominators, num_coeffs);
595
596 fr result = fr::zero();
597
598 for (size_t i = 0; i < num_coeffs; ++i) {
599 fr temp = coeffs[i] * denominators[i]; // f_i * 1/(ʓ.ω^{-i} - 1)
600 result = result + temp;
601 }
602
603 result = result *
604 numerator; // \sum_{i=0}^{num_coeffs-1} f_i * [ʓ^n - 1]/[n.(ʓ.ω^{-i} - 1)]
605 // = \sum_{i=0}^{num_coeffs-1} f_i * L_{i+1}
606 // (with our somewhat messy 'commented maths' convention that L_1 corresponds to the 0th coeff).
607
608 aligned_free(denominators);
609
610 return result;
611}
612
// Returns the sum of the first n elements of `src` (0 when n == 0).
template <typename Fr> Fr compute_sum(const Fr* src, const size_t n)
{
    Fr total = 0;
    for (const Fr* element = src; element != src + n; ++element) {
        total += *element;
    }
    return total;
}
622
// This function computes the polynomial (x - a)(x - b)(x - c)... given n distinct roots (a, b, c, ...).
// `dest` must have room for n + 1 coefficients: the result is monic of degree n (dest[n] = 1),
// with dest[n - 1] = -(sum of roots) and lower coefficients built by the nested loops below.
// NOTE(review): uses the shared static scratch buffer from get_scratch_space, so concurrent calls
// (or concurrent use of that buffer elsewhere) are unsafe — confirm single-threaded usage.
template <typename Fr> void compute_linear_polynomial_product(const Fr* roots, Fr* dest, const size_t n)
{

    auto scratch_space_ptr = get_scratch_space<Fr>(n);
    auto scratch_space = scratch_space_ptr.get();
    // raw memcpy assumes Fr is trivially copyable — TODO confirm for all instantiated field types
    memcpy((void*)scratch_space, (void*)roots, n * sizeof(Fr));

    dest[n] = 1;
    dest[n - 1] = -compute_sum(scratch_space, n);

    Fr temp;
    Fr constant = 1;
    // Each outer pass folds partial suffix sums of root products into the next-lower coefficient,
    // with `constant` alternating sign (+1, -1, +1, ...) between passes.
    for (size_t i = 0; i < n - 1; ++i) {
        temp = 0;
        for (size_t j = 0; j < n - 1 - i; ++j) {
            scratch_space[j] = roots[j] * compute_sum(&scratch_space[j + 1], n - 1 - i - j);
            temp += scratch_space[j];
        }
        dest[n - 2 - i] = temp * constant;
        constant *= Fr::neg_one();
    }
}
646
// Evaluates ∏_{i=0}^{n-1} (z - roots[i]) directly; returns 1 for n == 0 (empty product).
template <typename Fr> Fr compute_linear_polynomial_product_evaluation(const Fr* roots, const Fr z, const size_t n)
{
    Fr product = 1;
    for (size_t idx = 0; idx < n; ++idx) {
        const Fr factor = z - roots[idx];
        product *= factor;
    }
    return product;
}
655
656template <typename Fr>
657void compute_efficient_interpolation(const Fr* src, Fr* dest, const Fr* evaluation_points, const size_t n)
658{
659 /*
660 We use Lagrange technique to compute polynomial interpolation.
661 Given: (x_i, y_i) for i ∈ {0, 1, ..., n} =: [n]
662 Compute function f(X) such that f(x_i) = y_i for all i ∈ [n].
663 (X - x1)(X - x2)...(X - xn) (X - x0)(X - x2)...(X - xn)
664 F(X) = y0-------------------------------- + y1---------------------------------- + ...
665 (x0 - x_1)(x0 - x_2)...(x0 - xn) (x1 - x_0)(x1 - x_2)...(x1 - xn)
666 We write this as:
667 [ yi ]
668 F(X) = N(X) * |∑_i --------------- |
669 [ (X - xi) * di ]
670 where:
671 N(X) = ∏_{i \in [n]} (X - xi),
672 di = ∏_{j != i} (xi - xj)
673 For division of N(X) by (X - xi), we use the same trick that was used in compute_opening_polynomial()
674 function in the Kate commitment scheme.
675 We denote
676 q_{x_i} = N(X)/(X-x_i) * y_i * (d_i)^{-1} = q_{x_i,0}*1 + ... + q_{x_i,n-1} * X^{n-1} for i=0,..., n-1.
677
678 The computation of F(X) is split into two cases:
679
680 - if 0 is not in the interpolation domain, then the numerator polynomial N(X) has a non-zero constant term
681 that is used to initialize the division algorithm mentioned above; the monomial coefficients q_{x_i, j} of
682 q_{x_i} are accumulated into dest[j] for j=0,..., n-1
683
684 - if 0 is in the domain at index i_0, the constant term of N(X) is 0 and the division algorithm computing
685 q_{x_i} for i != i_0 is initialized with the constant term of N(X)/X. Note that its coefficients are given
686 by numerator_polynomial[j] for j=1,...,n. The monomial coefficients of q_{x_i} are then accumuluated in
687 dest[j] for j=1,..., n-1. Whereas the coefficients of
688 q_{0} = N(X)/X * f(0) * (d_{i_0})^{-1}
689 are added to dest[j] for j=0,..., n-1. Note that these coefficients do not require performing the division
690 algorithm used in Kate commitment scheme, as the coefficients of N(X)/X are given by numerator_polynomial[j]
691 for j=1,...,n.
692 */
693 Fr numerator_polynomial[n + 1];
694 polynomial_arithmetic::compute_linear_polynomial_product(evaluation_points, numerator_polynomial, n);
695 // First half contains roots, second half contains denominators (to be inverted)
696 Fr roots_and_denominators[2 * n];
697 Fr temp_src[n];
698 for (size_t i = 0; i < n; ++i) {
699 roots_and_denominators[i] = -evaluation_points[i];
700 temp_src[i] = src[i];
701 dest[i] = 0;
702 // compute constant denominators
703 roots_and_denominators[n + i] = 1;
704 for (size_t j = 0; j < n; ++j) {
705 if (j == i) {
706 continue;
707 }
708 roots_and_denominators[n + i] *= (evaluation_points[i] - evaluation_points[j]);
709 }
710 }
711 // at this point roots_and_denominators is populated as follows
712 // (x_0,\ldots, x_{n-1}, d_0, \ldots, d_{n-1})
713 Fr::batch_invert(roots_and_denominators, 2 * n);
714
715 Fr z, multiplier;
716 Fr temp_dest[n];
717 size_t idx_zero = 0;
718 bool interpolation_domain_contains_zero = false;
719 // if the constant term of the numerator polynomial N(X) is 0, then the interpolation domain contains 0
720 // we find the index i_0, such that x_{i_0} = 0
721 if (numerator_polynomial[0] == Fr(0)) {
722 for (size_t i = 0; i < n; ++i) {
723 if (evaluation_points[i] == Fr(0)) {
724 idx_zero = i;
725 interpolation_domain_contains_zero = true;
726 break;
727 }
728 }
729 };
730
731 if (!interpolation_domain_contains_zero) {
732 for (size_t i = 0; i < n; ++i) {
733 // set z = - 1/x_i for x_i <> 0
734 z = roots_and_denominators[i];
735 // temp_src[i] is y_i, it gets multiplied by 1/d_i
736 multiplier = temp_src[i] * roots_and_denominators[n + i];
737 temp_dest[0] = multiplier * numerator_polynomial[0];
738 temp_dest[0] *= z;
739 dest[0] += temp_dest[0];
740 for (size_t j = 1; j < n; ++j) {
741 temp_dest[j] = multiplier * numerator_polynomial[j] - temp_dest[j - 1];
742 temp_dest[j] *= z;
743 dest[j] += temp_dest[j];
744 }
745 }
746 } else {
747 for (size_t i = 0; i < n; ++i) {
748 if (i == idx_zero) {
749 // the contribution from the term corresponding to i_0 is computed separately
750 continue;
751 }
752 // get the next inverted root
753 z = roots_and_denominators[i];
754 // compute f(x_i) * d_{x_i}^{-1}
755 multiplier = temp_src[i] * roots_and_denominators[n + i];
756 // get x_i^{-1} * f(x_i) * d_{x_i}^{-1} into the "free" term
757 temp_dest[1] = multiplier * numerator_polynomial[1];
758 temp_dest[1] *= z;
759 // correct the first coefficient as it is now accumulating free terms from
760 // f(x_i) d_i^{-1} prod_(X-x_i, x_i != 0) (X-x_i) * 1/(X-x_i)
761 dest[1] += temp_dest[1];
762 // compute the quotient N(X)/(X-x_i) f(x_i)/d_{x_i} and its contribution to the target coefficients
763 for (size_t j = 2; j < n; ++j) {
764 temp_dest[j] = multiplier * numerator_polynomial[j] - temp_dest[j - 1];
765 temp_dest[j] *= z;
766 dest[j] += temp_dest[j];
767 };
768 }
769 // correct the target coefficients by the contribution from q_{0} = N(X)/X * d_{i_0}^{-1} * f(0)
770 for (size_t i = 0; i < n; ++i) {
771 dest[i] += temp_src[idx_zero] * roots_and_denominators[n + idx_zero] * numerator_polynomial[i + 1];
772 }
773 }
774}
775
// Explicit template instantiations for the bn254 scalar field `fr` (the FFT-capable field).
template fr evaluate<fr>(const fr*, const fr&, const size_t);
template fr evaluate<fr>(const std::vector<fr*>, const fr&, const size_t);
template void fft_inner_parallel<fr>(std::vector<fr*>, const EvaluationDomain<fr>&, const fr&, const std::vector<fr*>&);
template void fft<fr>(fr*, const EvaluationDomain<fr>&);
template void fft<fr>(fr*, fr*, const EvaluationDomain<fr>&);
template void fft<fr>(std::vector<fr*>, const EvaluationDomain<fr>&);
template void coset_fft<fr>(fr*, const EvaluationDomain<fr>&);
template void coset_fft<fr>(fr*, fr*, const EvaluationDomain<fr>&);
template void coset_fft<fr>(std::vector<fr*>, const EvaluationDomain<fr>&);
template void coset_fft<fr>(fr*, const EvaluationDomain<fr>&, const EvaluationDomain<fr>&, const size_t);
template void ifft<fr>(fr*, const EvaluationDomain<fr>&);
template void ifft<fr>(fr*, fr*, const EvaluationDomain<fr>&);
template void ifft<fr>(std::vector<fr*>, const EvaluationDomain<fr>&);
template void coset_ifft<fr>(fr*, const EvaluationDomain<fr>&);
template void coset_ifft<fr>(std::vector<fr*>, const EvaluationDomain<fr>&);
template fr compute_sum<fr>(const fr*, const size_t);
template void compute_linear_polynomial_product<fr>(const fr*, fr*, const size_t);
template void compute_efficient_interpolation<fr>(const fr*, fr*, const fr*, const size_t);
794
795template grumpkin::fr evaluate<grumpkin::fr>(const grumpkin::fr*, const grumpkin::fr&, const size_t);
796template grumpkin::fr evaluate<grumpkin::fr>(const std::vector<grumpkin::fr*>, const grumpkin::fr&, const size_t);
797template grumpkin::fr compute_sum<grumpkin::fr>(const grumpkin::fr*, const size_t);
798template void compute_linear_polynomial_product<grumpkin::fr>(const grumpkin::fr*, grumpkin::fr*, const size_t);
799template void compute_efficient_interpolation<grumpkin::fr>(const grumpkin::fr*,
801 const grumpkin::fr*,
802 const size_t);
803
804} // namespace bb::polynomial_arithmetic
#define BB_ASSERT(expression,...)
Definition assert.hpp:67
#define BB_ASSERT_EQ(actual, expected,...)
Definition assert.hpp:77
const std::vector< FF * > & get_round_roots() const
const std::vector< FF * > & get_inverse_round_roots() const
#define ITERATE_OVER_DOMAIN_START(domain)
#define ITERATE_OVER_DOMAIN_END
constexpr bool is_power_of_two(uint64_t x)
Definition pow.hpp:35
void ifft(Fr *coeffs, const EvaluationDomain< Fr > &domain)
Fr compute_linear_polynomial_product_evaluation(const Fr *roots, const Fr z, const size_t n)
uint32_t reverse_bits(uint32_t x, uint32_t bit_length)
void fft_inner_parallel(std::vector< Fr * > coeffs, const EvaluationDomain< Fr > &domain, const Fr &, const std::vector< Fr * > &root_table)
void coset_fft(Fr *coeffs, const EvaluationDomain< Fr > &domain)
void compute_linear_polynomial_product(const Fr *roots, Fr *dest, const size_t n)
void compute_multiplicative_subgroup(const size_t log2_subgroup_size, const EvaluationDomain< Fr > &src_domain, Fr *subgroup_roots)
Fr evaluate(const Fr *coeffs, const Fr &z, const size_t n)
void fft(Fr *coeffs, const EvaluationDomain< Fr > &domain)
fr compute_barycentric_evaluation(const fr *coeffs, const size_t num_coeffs, const fr &z, const EvaluationDomain< fr > &domain)
void coset_ifft(Fr *coeffs, const EvaluationDomain< Fr > &domain)
void compute_efficient_interpolation(const Fr *src, Fr *dest, const Fr *evaluation_points, const size_t n)
void scale_by_generator(Fr *coeffs, Fr *target, const EvaluationDomain< Fr > &domain, const Fr &generator_start, const Fr &generator_shift, const size_t generator_size)
Fr compute_sum(const Fr *src, const size_t n)
size_t get_num_cpus()
Definition thread.cpp:33
void parallel_for(size_t num_iterations, const std::function< void(size_t)> &func)
Definition thread.cpp:111
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13
Curve::ScalarField Fr
size_t total_threads
Definition thread.hpp:160
size_t thread_index
Definition thread.hpp:159
auto range(size_t size, size_t offset=0) const
Definition thread.hpp:161
static constexpr field get_root_of_unity(size_t subgroup_size) noexcept
static constexpr field neg_one()
static constexpr field one()
BB_INLINE constexpr field pow(const uint256_t &exponent) const noexcept
BB_INLINE constexpr void self_sqr() &noexcept
static BB_INLINE void __copy(const field &a, field &r) noexcept
static void batch_invert(C &coeffs) noexcept
static constexpr field zero()