Barretenberg
The ZK-SNARK library at the core of Aztec
scalar_multiplication.cpp
1// === AUDIT STATUS ===
2// internal: { status: not started, auditors: [], date: YYYY-MM-DD }
3// external_1: { status: not started, auditors: [], date: YYYY-MM-DD }
4// external_2: { status: not started, auditors: [], date: YYYY-MM-DD }
5// =====================
9
10#include "./process_buckets.hpp"
18
21
23
33template <typename Curve>
34typename Curve::Element small_mul(std::span<const typename Curve::ScalarField>& scalars,
35 std::span<const typename Curve::AffineElement>& points,
36 std::span<const uint32_t> scalar_indices,
37 size_t range) noexcept
38{
39 typename Curve::Element r = Curve::Group::point_at_infinity;
40 for (size_t i = 0; i < range; ++i) {
41 typename Curve::Element f = points[scalar_indices[i]];
42 r += f * scalars[scalar_indices[i]].to_montgomery_form();
43 }
44 return r;
45}
46
54template <typename Curve>
55void MSM<Curve>::transform_scalar_and_get_nonzero_scalar_indices(std::span<typename Curve::ScalarField> scalars,
56 std::vector<uint32_t>& consolidated_indices) noexcept
57{
58 std::vector<std::vector<uint32_t>> thread_indices(get_num_cpus());
59
60 parallel_for([&](const ThreadChunk& chunk) {
61 // parallel_for with ThreadChunk uses get_num_cpus() threads
62 BB_ASSERT_EQ(chunk.total_threads, thread_indices.size());
63 auto range = chunk.range(scalars.size());
64 if (range.empty()) {
65 return;
66 }
67 std::vector<uint32_t>& thread_scalar_indices = thread_indices[chunk.thread_index];
68 thread_scalar_indices.reserve(range.size());
69 for (size_t i : range) {
70 BB_ASSERT_DEBUG(i < scalars.size());
71 auto& scalar = scalars[i];
72 scalar.self_from_montgomery_form();
73
74 bool is_zero =
75 (scalar.data[0] == 0) && (scalar.data[1] == 0) && (scalar.data[2] == 0) && (scalar.data[3] == 0);
76 if (!is_zero) {
77 thread_scalar_indices.push_back(static_cast<uint32_t>(i));
78 }
79 }
80 });
81
82 size_t num_entries = 0;
83 for (const auto& indices : thread_indices) {
84 num_entries += indices.size();
85 }
86 consolidated_indices.resize(num_entries);
87
88 parallel_for([&](const ThreadChunk& chunk) {
89 // parallel_for with ThreadChunk uses get_num_cpus() threads
90 BB_ASSERT_EQ(chunk.total_threads, thread_indices.size());
91 size_t offset = 0;
92 for (size_t i = 0; i < chunk.thread_index; ++i) {
93 offset += thread_indices[i].size();
94 }
95 for (size_t i = offset; i < offset + thread_indices[chunk.thread_index].size(); ++i) {
96 consolidated_indices[i] = thread_indices[chunk.thread_index][i - offset];
97 }
98 });
99}
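
The function above follows a two-pass pattern: each thread filters its own chunk of scalars into a private index vector, then a second parallel pass copies every thread's output into one consolidated array at an offset equal to the combined size of all preceding threads' outputs. Below is a sequential, self-contained sketch of that pattern; plain uint64_t values stand in for field elements and the function name is illustrative, not part of the library.

#include <algorithm>
#include <cstdint>
#include <vector>

// Sequential stand-in: the per-"thread" loops run one after another instead of inside parallel_for.
std::vector<uint32_t> consolidate_nonzero_indices(const std::vector<uint64_t>& scalars, size_t num_threads)
{
    std::vector<std::vector<uint32_t>> thread_indices(num_threads);
    const size_t chunk = (scalars.size() + num_threads - 1) / num_threads;
    // Pass 1: each "thread" filters its own chunk into a private vector.
    for (size_t t = 0; t < num_threads; ++t) {
        const size_t start = t * chunk;
        const size_t end = std::min(scalars.size(), start + chunk);
        for (size_t i = start; i < end; ++i) {
            if (scalars[i] != 0) {
                thread_indices[t].push_back(static_cast<uint32_t>(i));
            }
        }
    }
    // Pass 2: copy each thread's output into one array; a thread's offset is the combined
    // size of all preceding threads' outputs, so the result stays sorted by index.
    size_t total = 0;
    for (const auto& v : thread_indices) {
        total += v.size();
    }
    std::vector<uint32_t> consolidated(total);
    size_t offset = 0;
    for (const auto& v : thread_indices) {
        std::copy(v.begin(), v.end(), consolidated.begin() + static_cast<std::ptrdiff_t>(offset));
        offset += v.size();
    }
    return consolidated;
}
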
100
112template <typename Curve>
113std::vector<typename MSM<Curve>::ThreadWorkUnits> MSM<Curve>::get_work_units(
114 std::span<std::span<ScalarField>> scalars, std::vector<std::vector<uint32_t>>& msm_scalar_indices) noexcept
115{
116
117 const size_t num_msms = scalars.size();
118 msm_scalar_indices.resize(num_msms);
119 for (size_t i = 0; i < num_msms; ++i) {
120 BB_ASSERT_LT(i, scalars.size());
121 transform_scalar_and_get_nonzero_scalar_indices(scalars[i], msm_scalar_indices[i]);
122 }
123
124 size_t total_work = 0;
125 for (const auto& indices : msm_scalar_indices) {
126 total_work += indices.size();
127 }
128
129 const size_t num_threads = get_num_cpus();
130 std::vector<ThreadWorkUnits> work_units(num_threads);
131
132 const size_t work_per_thread = numeric::ceil_div(total_work, num_threads);
133 size_t work_of_last_thread = total_work - (work_per_thread * (num_threads - 1));
134
135 // Note: because of ceil_div, work_per_thread * (num_threads - 1) can exceed total_work:
136 // e.g. with T = 192 threads and M = 193 work items, ceil(193 / 192) * 191 = 382 > 193.
137 // In that case `work_of_last_thread` wraps, but it is only read once all earlier threads are saturated.
138 // Only use a single work unit if we don't have enough work for every thread.
139 if (num_threads > total_work) {
140 for (size_t i = 0; i < num_msms; ++i) {
141 work_units[0].push_back(MSMWorkUnit{
142 .batch_msm_index = i,
143 .start_index = 0,
144 .size = msm_scalar_indices[i].size(),
145 });
146 }
147 return work_units;
148 }
149
150 size_t thread_accumulated_work = 0;
151 size_t current_thread_idx = 0;
152 for (size_t i = 0; i < num_msms; ++i) {
153 BB_ASSERT_DEBUG(i < msm_scalar_indices.size());
154 size_t msm_work = msm_scalar_indices[i].size();
155 size_t msm_size = msm_work;
156 while (msm_work > 0) {
157 const size_t total_thread_work =
158 (current_thread_idx == num_threads - 1) ? work_of_last_thread : work_per_thread;
159 const size_t available_thread_work = total_thread_work - thread_accumulated_work;
160
161 if (available_thread_work >= msm_work) {
162 BB_ASSERT_LT(current_thread_idx, work_units.size());
163 work_units[current_thread_idx].push_back(MSMWorkUnit{
164 .batch_msm_index = i,
165 .start_index = msm_size - msm_work,
166 .size = msm_work,
167 });
168 thread_accumulated_work += msm_work;
169 msm_work = 0;
170 } else {
171 BB_ASSERT_LT(current_thread_idx, work_units.size());
172 work_units[current_thread_idx].push_back(MSMWorkUnit{
173 .batch_msm_index = i,
174 .start_index = msm_size - msm_work,
175 .size = available_thread_work,
176 });
177 msm_work -= available_thread_work;
178 current_thread_idx++;
179 thread_accumulated_work = 0;
180 }
181 }
182 }
183 return work_units;
184}
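
As a worked illustration of the splitting loop above (numbers chosen here, not taken from the library): two MSMs with 70 and 30 nonzero scalars split across 4 threads give total_work = 100 and work_per_thread = 25, so thread 0 receives MSM 0 [0, 25), thread 1 receives MSM 0 [25, 50), thread 2 receives MSM 0 [50, 70) plus MSM 1 [0, 5), and thread 3 receives MSM 1 [5, 30). A condensed, self-contained sketch of the same greedy splitter follows; WorkUnit and split_work are hypothetical simplifications of MSMWorkUnit / get_work_units.

#include <cstddef>
#include <vector>

// Hypothetical, simplified analogue of MSMWorkUnit, for illustration only.
struct WorkUnit {
    size_t msm_index;
    size_t start;
    size_t size;
};

// Greedy splitter: hand each thread roughly total/num_threads scalar-muls, slicing an MSM
// across threads where a boundary falls inside it.
std::vector<std::vector<WorkUnit>> split_work(const std::vector<size_t>& msm_sizes, size_t num_threads)
{
    size_t total = 0;
    for (size_t s : msm_sizes) {
        total += s;
    }
    std::vector<std::vector<WorkUnit>> units(num_threads);
    const size_t per_thread = (total + num_threads - 1) / num_threads; // ceil_div
    size_t thread = 0;
    size_t used = 0; // work already assigned to `thread`
    for (size_t i = 0; i < msm_sizes.size(); ++i) {
        size_t remaining = msm_sizes[i];
        while (remaining > 0) {
            const size_t available = per_thread - used;
            const size_t take = (available >= remaining) ? remaining : available;
            units[thread].push_back({ i, msm_sizes[i] - remaining, take });
            remaining -= take;
            used += take;
            if (used == per_thread && thread + 1 < num_threads) {
                ++thread;
                used = 0;
            }
        }
    }
    return units;
}
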
185
196template <typename Curve>
197uint32_t MSM<Curve>::get_scalar_slice(const typename Curve::ScalarField& scalar,
198 size_t round,
199 size_t slice_size) noexcept
200{
201 size_t hi_bit = NUM_BITS_IN_FIELD - (round * slice_size);
202 // todo remove
203 bool last_slice = hi_bit < slice_size;
204 size_t target_slice_size = last_slice ? hi_bit : slice_size;
205 size_t lo_bit = last_slice ? 0 : hi_bit - slice_size;
206 size_t start_limb = lo_bit / 64;
207 size_t end_limb = hi_bit / 64;
208 size_t lo_slice_offset = lo_bit & 63;
209 size_t lo_slice_bits = std::min(target_slice_size, 64 - lo_slice_offset);
210 size_t hi_slice_bits = target_slice_size - lo_slice_bits;
211 size_t lo_slice = (scalar.data[start_limb] >> lo_slice_offset) & ((static_cast<size_t>(1) << lo_slice_bits) - 1);
212 size_t hi_slice = (scalar.data[end_limb] & ((static_cast<size_t>(1) << hi_slice_bits) - 1));
213
214 uint32_t lo = static_cast<uint32_t>(lo_slice);
215 uint32_t hi = static_cast<uint32_t>(hi_slice);
216
217 uint32_t result = lo + (hi << lo_slice_bits);
218 return result;
219}
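
A self-contained restatement of the slice extraction above, operating on a 4x64-bit little-endian limb array and checked on a small value. A 254-bit scalar width is assumed (as for BN254's scalar field), and the bit-by-bit loop trades the limb arithmetic above for clarity.

#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

// Extract the `slice_size`-bit window of `scalar` used in round `round`, counting windows
// down from the most significant bit, as get_scalar_slice does.
uint32_t slice(const std::array<uint64_t, 4>& scalar, size_t round, size_t slice_size, size_t num_bits = 254)
{
    const size_t hi_bit = num_bits - round * slice_size;
    const size_t lo_bit = (hi_bit < slice_size) ? 0 : hi_bit - slice_size;
    uint32_t out = 0;
    for (size_t bit = lo_bit; bit < hi_bit; ++bit) {
        const uint64_t b = (scalar[bit / 64] >> (bit % 64)) & 1;
        out |= static_cast<uint32_t>(b) << (bit - lo_bit);
    }
    return out;
}

int main()
{
    const std::array<uint64_t, 4> scalar{ 22, 0, 0, 0 }; // 22 = 0b10110
    // With 4-bit slices over 254 bits there are 64 rounds; the final round covers only 2 bits.
    assert(slice(scalar, 63, 4) == 2); // bits [0, 2) -> 0b10
    assert(slice(scalar, 62, 4) == 5); // bits [2, 6) -> 0b0101
    return 0;
}
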
220
228template <typename Curve> size_t MSM<Curve>::get_optimal_log_num_buckets(const size_t num_points) noexcept
229{
230 // We do 2 group operations per bucket, and they are full 3D Jacobian adds which are ~2x more than an affine add
231 constexpr size_t COST_OF_BUCKET_OP_RELATIVE_TO_POINT = 5;
232 size_t cached_cost = static_cast<size_t>(-1);
233 size_t target_bit_slice = 0;
234 for (size_t bit_slice = 1; bit_slice < 20; ++bit_slice) {
235 const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bit_slice);
236 const size_t num_buckets = 1 << bit_slice;
237 const size_t addition_cost = num_rounds * num_points;
238 const size_t bucket_cost = num_rounds * num_buckets * COST_OF_BUCKET_OP_RELATIVE_TO_POINT;
239 const size_t total_cost = addition_cost + bucket_cost;
240 if (total_cost < cached_cost) {
241 cached_cost = total_cost;
242 target_bit_slice = bit_slice;
243 }
244 }
245 return target_bit_slice;
246}
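
To make the cost model concrete (numbers computed here, not quoted from the source): for 2^20 points, a 15-bit slice gives ceil(254/15) = 17 rounds and cost 17*(2^20 + 5*2^15) ~ 20.6M, while 16 bits gives ~22.0M and 12 bits ~23.5M, so the sweep settles on 15-bit buckets. A standalone copy of the sweep for experimentation (254-bit field assumed):

#include <cstddef>
#include <cstdio>

// Stand-alone copy of the cost sweep above.
size_t optimal_log_num_buckets(size_t num_points)
{
    constexpr size_t NUM_BITS = 254;
    constexpr size_t BUCKET_OP_COST = 5;
    size_t best_cost = static_cast<size_t>(-1);
    size_t best_bits = 0;
    for (size_t bits = 1; bits < 20; ++bits) {
        const size_t rounds = (NUM_BITS + bits - 1) / bits;
        const size_t cost = rounds * num_points + rounds * (size_t(1) << bits) * BUCKET_OP_COST;
        if (cost < best_cost) {
            best_cost = cost;
            best_bits = bits;
        }
    }
    return best_bits;
}

int main()
{
    for (size_t log_n : { 10, 14, 16, 20 }) {
        std::printf("2^%zu points -> %zu-bit buckets\n", log_n, optimal_log_num_buckets(size_t(1) << log_n));
    }
    return 0;
}
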
247
257template <typename Curve> bool MSM<Curve>::use_affine_trick(const size_t num_points, const size_t num_buckets) noexcept
258{
259 if (num_points < 128) {
260 return false;
261 }
262
263 // Affine trick requires log(N) modular inversions per Pippenger round.
264 // It saves NUM_POINTS * COST_SAVING_OF_AFFINE_TRICK_PER_GROUP_OPERATION field muls
265 // It also saves NUM_BUCKETS * EXTRA_COST_OF_JACOBIAN_GROUP_OPERATION_IF_Z2_IS_NOT_1 field muls
266 // due to all our buckets having Z=1 if we use the affine trick
267
268 // COST_OF_INVERSION cost:
269 // Requires NUM_BITS_IN_FIELD squarings
270 // We use 4-bit windows = ((NUM_BITS_IN_FIELD + 3) / 4) multiplications
271 // Computing 4-bit window table requires 14 muls
272 constexpr size_t COST_OF_INVERSION = NUM_BITS_IN_FIELD + ((NUM_BITS_IN_FIELD + 3) / 4) + 14;
273 constexpr size_t COST_SAVING_OF_AFFINE_TRICK_PER_GROUP_OPERATION = 5;
274 constexpr size_t EXTRA_COST_OF_JACOBIAN_GROUP_OPERATION_IF_Z2_IS_NOT_1 = 5;
275
276 double num_points_f = static_cast<double>(num_points);
277 double log2_num_points_f = log2(num_points_f);
278
279 size_t group_op_cost_saving_per_round = (num_points * COST_SAVING_OF_AFFINE_TRICK_PER_GROUP_OPERATION) +
280 (num_buckets * EXTRA_COST_OF_JACOBIAN_GROUP_OPERATION_IF_Z2_IS_NOT_1);
281 double inversion_cost_per_round = log2_num_points_f * static_cast<double>(COST_OF_INVERSION);
282
283 return static_cast<double>(group_op_cost_saving_per_round) > inversion_cost_per_round;
284}
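
Plugging representative numbers into the heuristic (254-bit field assumed): COST_OF_INVERSION = 254 + 64 + 14 = 332 field muls, so a round over 2^10 points with 2^10 buckets saves about 1024*5 + 1024*5 = 10240 muls against log2(1024)*332 = 3320 muls of inversion cost and the trick wins; at 128 points with 128 buckets it does not (and the function hard-codes a floor of 128 points anyway). A tiny standalone check:

#include <cmath>
#include <cstddef>
#include <cstdio>

// Re-evaluate the heuristic above for a few sizes; num_buckets is set equal to num_points
// purely for illustration.
int main()
{
    constexpr double COST_OF_INVERSION = 254 + (254 + 3) / 4 + 14; // = 332
    for (size_t n : { size_t(1) << 7, size_t(1) << 10, size_t(1) << 16 }) {
        const double saving = static_cast<double>(n) * 5 + static_cast<double>(n) * 5;
        const double inversion_cost = std::log2(static_cast<double>(n)) * COST_OF_INVERSION;
        std::printf("n=%zu: saving %.0f vs inversion cost %.0f -> %s\n",
                    n, saving, inversion_cost, saving > inversion_cost ? "use affine trick" : "skip");
    }
    return 0;
}
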
285
323template <typename Curve>
324void MSM<Curve>::add_affine_points(typename Curve::AffineElement* points,
325 const size_t num_points,
326 typename Curve::BaseField* scratch_space) noexcept
327{
328 using Fq = typename Curve::BaseField;
329 Fq batch_inversion_accumulator = Fq::one();
330
331 for (size_t i = 0; i < num_points; i += 2) {
332 scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x2 + x1
333 points[i + 1].x -= points[i].x; // x2 - x1
334 points[i + 1].y -= points[i].y; // y2 - y1
335 points[i + 1].y *= batch_inversion_accumulator; // (y2 - y1)*accumulator_old
336 batch_inversion_accumulator *= (points[i + 1].x);
337 }
338 if (batch_inversion_accumulator == 0) {
339 // prefer abort to throw for code that might emit from multiple threads
340 throw_or_abort("attempted to invert zero in add_affine_points");
341 } else {
342 batch_inversion_accumulator = batch_inversion_accumulator.invert();
343 }
344
345 // Iterate backwards through the points, computing pairwise affine additions; addition results are stored in the
346 // latter half of the array
347 for (size_t i = (num_points)-2; i < num_points; i -= 2) {
348 points[i + 1].y *= batch_inversion_accumulator; // update accumulator
349 batch_inversion_accumulator *= points[i + 1].x;
350 points[i + 1].x = points[i + 1].y.sqr();
351 points[(i + num_points) >> 1].x = points[i + 1].x - (scratch_space[i >> 1]); // x3 = lambda_squared - x2
352 // - x1
353 // Memory bandwidth is a bit of a bottleneck here.
354 // There's probably a more elegant way of structuring our data so we don't need to do all of this
355 // prefetching
356 if (i >= 2) {
357 __builtin_prefetch(points + i - 2);
358 __builtin_prefetch(points + i - 1);
359 __builtin_prefetch(points + ((i + num_points - 2) >> 1));
360 __builtin_prefetch(scratch_space + ((i - 2) >> 1));
361 }
362 points[i].x -= points[(i + num_points) >> 1].x;
363 points[i].x *= points[i + 1].y;
364 points[(i + num_points) >> 1].y = points[i].x - points[i].y;
365 }
366}
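
The routine above is the batched-affine addition trick: for each independent pair it evaluates lambda = (y2 - y1)/(x2 - x1), x3 = lambda^2 - x1 - x2, y3 = lambda*(x1 - x3) - y1, but defers every division so that a single field inversion (plus the forward and backward accumulator passes) serves all pairs. The core of that trick, Montgomery batch inversion, in a self-contained sketch over a toy prime field (p = 65537 is for illustration only, not a field used by Barretenberg):

#include <cassert>
#include <cstdint>
#include <vector>

constexpr uint64_t P = 65537;

uint64_t mul(uint64_t a, uint64_t b) { return (a * b) % P; }

// Fermat inversion: a^(p-2) mod p, the one "real" inversion paid per batch.
uint64_t invert(uint64_t a)
{
    uint64_t result = 1, base = a % P, e = P - 2;
    while (e > 0) {
        if (e & 1) {
            result = mul(result, base);
        }
        base = mul(base, base);
        e >>= 1;
    }
    return result;
}

// Montgomery batch inversion: invert n nonzero elements with a single field inversion,
// via a forward prefix-product pass and a backward unwinding pass (the same accumulator
// structure as the two loops in add_affine_points).
void batch_invert(std::vector<uint64_t>& values)
{
    std::vector<uint64_t> prefix(values.size());
    uint64_t acc = 1;
    for (size_t i = 0; i < values.size(); ++i) { // forward: prefix[i] = v0 * ... * v_{i-1}
        prefix[i] = acc;
        acc = mul(acc, values[i]);
    }
    acc = invert(acc); // single inversion of the full product
    for (size_t i = values.size(); i-- > 0;) { // backward: peel off one inverse per element
        const uint64_t inv_i = mul(acc, prefix[i]);
        acc = mul(acc, values[i]);
        values[i] = inv_i;
    }
}

int main()
{
    std::vector<uint64_t> v{ 3, 1234, 999, 65536 };
    const std::vector<uint64_t> original = v;
    batch_invert(v);
    for (size_t i = 0; i < v.size(); ++i) {
        assert(mul(v[i], original[i]) == 1);
    }
    return 0;
}
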
367
375template <typename Curve>
376typename Curve::Element MSM<Curve>::small_pippenger_low_memory_with_transformed_scalars(MSMData& msm_data) noexcept
377{
378 std::span<const uint32_t>& nonzero_scalar_indices = msm_data.scalar_indices;
379 const size_t size = nonzero_scalar_indices.size();
380 const size_t bits_per_slice = get_optimal_log_num_buckets(size);
381 const size_t num_buckets = 1 << bits_per_slice;
382 JacobianBucketAccumulators bucket_data = JacobianBucketAccumulators(num_buckets);
383 Element round_output = Curve::Group::point_at_infinity;
384
385 const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice);
386
387 for (size_t i = 0; i < num_rounds; ++i) {
388 round_output = evaluate_small_pippenger_round(msm_data, i, bucket_data, round_output, bits_per_slice);
389 }
390 return round_output;
391}
392
400template <typename Curve>
401typename Curve::Element MSM<Curve>::pippenger_low_memory_with_transformed_scalars(MSMData& msm_data) noexcept
402{
403 const size_t msm_size = msm_data.scalar_indices.size();
404 const size_t bits_per_slice = get_optimal_log_num_buckets(msm_size);
405 const size_t num_buckets = 1 << bits_per_slice;
406
407 if (!use_affine_trick(msm_size, num_buckets)) {
408 return small_pippenger_low_memory_with_transformed_scalars(msm_data);
409 }
410 // TODO(https://github.com/AztecProtocol/barretenberg/issues/1452): Consider allowing this memory to persist rather
411 // than allocating/deallocating on every execution.
412 AffineAdditionData affine_data = AffineAdditionData();
413 BucketAccumulators bucket_data = BucketAccumulators(num_buckets);
414
415 Element round_output = Curve::Group::point_at_infinity;
416
417 const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice);
418 for (size_t i = 0; i < num_rounds; ++i) {
419 round_output = evaluate_pippenger_round(msm_data, i, affine_data, bucket_data, round_output, bits_per_slice);
420 }
421
422 return (round_output);
423}
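
For orientation, both top-level routines reduce to the textbook bucket (Pippenger) method: for each scalar window, starting from the most significant, add every point into the bucket selected by that window, reduce the buckets with a running sum, then shift the accumulated result left by the window width before adding the round output. A compact reference version over a toy additive group follows (integers modulo a prime stand in for curve points, so the result can be checked against a naive sum; all names here are illustrative):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

constexpr uint64_t Q = 1000003;

// Reference bucket-method MSM: slice scalars into `bits`-bit windows, most significant first.
uint64_t pippenger_reference(const std::vector<uint64_t>& scalars,
                             const std::vector<uint64_t>& points,
                             size_t bits,
                             size_t scalar_bits = 64)
{
    const size_t rounds = (scalar_bits + bits - 1) / bits;
    uint64_t result = 0;
    for (size_t round = 0; round < rounds; ++round) {
        const size_t hi = scalar_bits - round * bits;
        const size_t lo = (hi < bits) ? 0 : hi - bits;
        std::vector<uint64_t> buckets(size_t(1) << bits, 0);
        for (size_t i = 0; i < scalars.size(); ++i) {
            const uint64_t window = (scalars[i] >> lo) & ((uint64_t(1) << (hi - lo)) - 1);
            buckets[window] = (buckets[window] + points[i]) % Q; // "add point into bucket"
        }
        // Reduce buckets: sum_j j * B_j via the running-sum trick (what accumulate_buckets does).
        uint64_t running = 0;
        uint64_t round_output = 0;
        for (size_t j = buckets.size() - 1; j > 0; --j) {
            running = (running + buckets[j]) % Q;
            round_output = (round_output + running) % Q;
        }
        for (size_t d = 0; d < hi - lo; ++d) { // "doublings" between rounds
            result = (result * 2) % Q;
        }
        result = (result + round_output) % Q;
    }
    return result;
}

int main()
{
    const std::vector<uint64_t> s{ 17, 99999, 3, 123456789 };
    const std::vector<uint64_t> p{ 5, 7, 11, 13 };
    uint64_t naive = 0;
    for (size_t i = 0; i < s.size(); ++i) {
        naive = (naive + (s[i] % Q) * p[i]) % Q;
    }
    assert(pippenger_reference(s, p, 4) == naive);
    return 0;
}
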
424
436template <typename Curve>
437typename Curve::Element MSM<Curve>::evaluate_small_pippenger_round(MSMData& msm_data,
438 const size_t round_index,
439 JacobianBucketAccumulators& bucket_data,
440 typename Curve::Element previous_round_output,
441 const size_t bits_per_slice) noexcept
442{
443 std::span<const uint32_t>& nonzero_scalar_indices = msm_data.scalar_indices;
444 std::span<const ScalarField>& scalars = msm_data.scalars;
445 std::span<const AffineElement>& points = msm_data.points;
446
447 const size_t size = nonzero_scalar_indices.size();
448 for (size_t i = 0; i < size; ++i) {
449 BB_ASSERT_DEBUG(nonzero_scalar_indices[i] < scalars.size());
450 uint32_t bucket_index = get_scalar_slice(scalars[nonzero_scalar_indices[i]], round_index, bits_per_slice);
451 BB_ASSERT_DEBUG(bucket_index < static_cast<uint32_t>(1 << bits_per_slice));
452 if (bucket_index > 0) {
453 // do this check because we do not reset bucket_data.buckets after each round
454 // (i.e. they are not necessarily at infinity)
455 if (bucket_data.bucket_exists.get(bucket_index)) {
456 bucket_data.buckets[bucket_index] += points[nonzero_scalar_indices[i]];
457 } else {
458 bucket_data.buckets[bucket_index] = points[nonzero_scalar_indices[i]];
459 bucket_data.bucket_exists.set(bucket_index, true);
460 }
461 }
462 }
463 Element round_output;
464 round_output.self_set_infinity();
465 round_output = accumulate_buckets(bucket_data);
466 bucket_data.bucket_exists.clear();
467 Element result = previous_round_output;
468 const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice);
469 size_t num_doublings = ((round_index == num_rounds - 1) && (NUM_BITS_IN_FIELD % bits_per_slice != 0))
470 ? NUM_BITS_IN_FIELD % bits_per_slice
471 : bits_per_slice;
472 for (size_t i = 0; i < num_doublings; ++i) {
473 result.self_dbl();
474 }
475
476 result += round_output;
477 return result;
478}
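
One detail worth spelling out: the final round usually covers fewer bits than the others (NUM_BITS_IN_FIELD % bits_per_slice), and the per-round doubling counts chosen above must sum to exactly the scalar width or the result ends up shifted. A quick self-contained check of that invariant (254-bit width assumed):

#include <cassert>
#include <cstddef>

// Verify that the doubling counts used per round always sum to the full scalar width,
// for every slice width the bucket-size sweep can choose.
int main()
{
    constexpr size_t NUM_BITS = 254;
    for (size_t bits = 1; bits < 20; ++bits) {
        const size_t rounds = (NUM_BITS + bits - 1) / bits;
        size_t total = 0;
        for (size_t round = 0; round < rounds; ++round) {
            const bool short_last_round = (round == rounds - 1) && (NUM_BITS % bits != 0);
            total += short_last_round ? NUM_BITS % bits : bits;
        }
        assert(total == NUM_BITS);
    }
    return 0;
}
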
479
492template <typename Curve>
493typename Curve::Element MSM<Curve>::evaluate_pippenger_round(MSMData& msm_data,
494 const size_t round_index,
495 AffineAdditionData& affine_data,
496 BucketAccumulators& bucket_data,
497 typename Curve::Element previous_round_output,
498 const size_t bits_per_slice) noexcept
499{
500 std::span<const uint32_t>& scalar_indices = msm_data.scalar_indices; // indices of nonzero scalars
501 std::span<const ScalarField>& scalars = msm_data.scalars;
502 std::span<const AffineElement>& points = msm_data.points;
503 std::span<uint64_t>& round_schedule = msm_data.point_schedule;
504 const size_t size = scalar_indices.size();
505
506 // Construct a "round schedule". Each entry describes:
507 // 1. low 32 bits: which bucket index do we add the point into? (bucket index = slice value)
508 // 2. high 32 bits: which point index do we source the point from?
509 for (size_t i = 0; i < size; ++i) {
510 BB_ASSERT_DEBUG(scalar_indices[i] < scalars.size());
511 round_schedule[i] = get_scalar_slice(scalars[scalar_indices[i]], round_index, bits_per_slice);
512 round_schedule[i] += (static_cast<uint64_t>(scalar_indices[i]) << 32ULL);
513 }
514 // Sort the point schedule by bucket value; this improves memory locality in the next step of the algorithm
515 const size_t num_zero_entries = scalar_multiplication::process_buckets_count_zero_entries(
516 &round_schedule[0], size, static_cast<uint32_t>(bits_per_slice));
517 BB_ASSERT_DEBUG(num_zero_entries <= size);
518 const size_t round_size = size - num_zero_entries;
519
520 Element round_output;
521 round_output.self_set_infinity();
522
523 if (round_size > 0) {
524 std::span<uint64_t> point_schedule(&round_schedule[num_zero_entries], round_size);
525 // Iterate through our point schedule and add points into corresponding buckets
526 consume_point_schedule(point_schedule, points, affine_data, bucket_data, 0, 0);
527 round_output = accumulate_buckets(bucket_data);
528 bucket_data.bucket_exists.clear();
529 }
530
531 Element result = previous_round_output;
532 const size_t num_rounds = numeric::ceil_div(NUM_BITS_IN_FIELD, bits_per_slice);
533 size_t num_doublings = ((round_index == num_rounds - 1) && (NUM_BITS_IN_FIELD % bits_per_slice != 0))
534 ? NUM_BITS_IN_FIELD % bits_per_slice
535 : bits_per_slice;
536 for (size_t i = 0; i < num_doublings; ++i) {
537 result.self_dbl();
538 }
539
540 result += round_output;
541 return result;
542}
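
The 64-bit schedule entries built above pack the bucket index into the low 32 bits and the source point index into the high 32 bits, so grouping entries by their low word puts all additions aimed at the same bucket next to each other. The library does this with process_buckets_count_zero_entries, which also moves zero-slice entries to the front; the comparison sort below is only for brevity, and the types are illustrative.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct ScheduleEntry {
    uint32_t bucket;      // low 32 bits: destination bucket (the scalar slice)
    uint32_t point_index; // high 32 bits: which point to add
};

uint64_t encode(uint32_t bucket, uint32_t point_index)
{
    return uint64_t(bucket) | (uint64_t(point_index) << 32);
}

ScheduleEntry decode(uint64_t entry)
{
    return { static_cast<uint32_t>(entry & 0xFFFFFFFFULL), static_cast<uint32_t>(entry >> 32) };
}

// Group schedule entries by destination bucket; zero-slice entries end up at the front,
// where they can be counted and skipped.
size_t sort_schedule_and_count_zero_entries(std::vector<uint64_t>& schedule)
{
    std::sort(schedule.begin(), schedule.end(),
              [](uint64_t a, uint64_t b) { return (a & 0xFFFFFFFFULL) < (b & 0xFFFFFFFFULL); });
    size_t zero_entries = 0;
    while (zero_entries < schedule.size() && (schedule[zero_entries] & 0xFFFFFFFFULL) == 0) {
        ++zero_entries;
    }
    return zero_entries;
}
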
543
556template <typename Curve>
557void MSM<Curve>::consume_point_schedule(std::span<const uint64_t> point_schedule,
558 std::span<const AffineElement> points,
559 AffineAdditionData& affine_data,
560 BucketAccumulators& bucket_data,
561 size_t num_input_points_processed,
562 size_t num_queued_affine_points) noexcept
563{
564
565 size_t point_it = num_input_points_processed;
566 size_t affine_input_it = num_queued_affine_points;
567 // N.B. points and point_schedule MAY HAVE DIFFERENT SIZES
568 // We source the number of actual points we work on from the point schedule
569 size_t num_points = point_schedule.size();
570 auto& bucket_accumulator_exists = bucket_data.bucket_exists;
571 auto& affine_addition_scratch_space = affine_data.points_to_add;
572 auto& bucket_accumulators = bucket_data.buckets;
573 auto& affine_addition_output_bucket_destinations = affine_data.addition_result_bucket_destinations;
574 auto& scalar_scratch_space = affine_data.scalar_scratch_space;
575 auto& output_point_schedule = affine_data.addition_result_bucket_destinations;
576 AffineElement null_location{};
577 // We do memory prefetching; `prefetch_max` ensures we do not overflow our containers
578 size_t prefetch_max = (num_points - 32);
579 if (num_points < 32) {
580 prefetch_max = 0;
581 }
582 size_t end = num_points - 1;
583 if (num_points == 0) {
584 end = 0;
585 }
586
587 // Step 1: Fill up `affine_addition_scratch_space` with up to AffineAdditionData::BATCH_SIZE/2 independent additions
588 while (((affine_input_it + 1) < AffineAdditionData::BATCH_SIZE) && (point_it < end)) {
589
590 // prefetch upcoming points into cache
591 if ((point_it < prefetch_max) && ((point_it & 0x0f) == 0)) {
592 for (size_t i = 16; i < 32; ++i) {
593 __builtin_prefetch(&points[(point_schedule[point_it + i] >> 32ULL)]);
594 }
595 }
596
597 // We do some branchless programming here to minimize instruction pipeline flushes
598 // TODO(@zac-williamson, cc @ludamad) check these ternary operators are not branching! -> (ludamad: they don't,
599 // but it's not clear that the conditional move is fundamentally less expensive)
600 // We are iterating through our points and
601 // can come across the following scenarios: 1: The next 2 points in `point_schedule` belong to the *same* bucket
602 // (happy path - can put both points into affine_addition_scratch_space)
603 // 2: The next 2 points have different bucket destinations AND point_schedule[point_it].bucket contains a point
604 // (happyish path - we can put points[lhs_schedule] and buckets[lhs_bucket] into
605 // affine_addition_scratch_space)
606 // 3: The next 2 points have different bucket destinations AND point_schedule[point_it].bucket is empty
607 // We cache points[lhs_schedule] into buckets[lhs_bucket]
608 // We iterate `point_it` by 2 (case 1), or by 1 (case 2 or 3). The number of points we add into
609 // `affine_addition_scratch_space` is 2 (case 1 or 2) or 0 (case 3).
610 uint64_t lhs_schedule = point_schedule[point_it];
611 uint64_t rhs_schedule = point_schedule[point_it + 1];
612 size_t lhs_bucket = static_cast<size_t>(lhs_schedule) & 0xFFFFFFFF;
613 size_t rhs_bucket = static_cast<size_t>(rhs_schedule) & 0xFFFFFFFF;
614 size_t lhs_point = static_cast<size_t>(lhs_schedule >> 32);
615 size_t rhs_point = static_cast<size_t>(rhs_schedule >> 32);
616
617 bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
618 bool buckets_match = lhs_bucket == rhs_bucket;
619 bool do_affine_add = buckets_match || has_bucket_accumulator;
620
621 const AffineElement* lhs_source = &points[lhs_point];
622 const AffineElement* rhs_source = buckets_match ? &points[rhs_point] : &bucket_accumulators[lhs_bucket];
623
624 // either two points are set to be added (point to point or point into bucket accumulator), or lhs is stored in
625 // the bucket and rhs is temporarily ignored
626 AffineElement* lhs_destination =
627 do_affine_add ? &affine_addition_scratch_space[affine_input_it] : &bucket_accumulators[lhs_bucket];
628 AffineElement* rhs_destination =
629 do_affine_add ? &affine_addition_scratch_space[affine_input_it + 1] : &null_location;
630
631 // if performing an affine add, set the destination bucket corresponding to the addition result
632 uint64_t& source_bucket_destination = affine_addition_output_bucket_destinations[affine_input_it >> 1];
633 source_bucket_destination = do_affine_add ? lhs_bucket : source_bucket_destination;
634
635 // unconditional copies. No if statements here.
636 *lhs_destination = *lhs_source;
637 *rhs_destination = *rhs_source;
638
639 // indicate whether bucket_accumulators[lhs_bucket] will contain a point after this iteration
640 bucket_accumulator_exists.set(
641 lhs_bucket,
642 (has_bucket_accumulator && buckets_match) || /* bucket has an accum and it's not being used in current add */
643 !do_affine_add); /* lhs point is cached into the bucket */
644
645 affine_input_it += do_affine_add ? 2 : 0;
646 point_it += (do_affine_add && buckets_match) ? 2 : 1;
647 }
648 // We have to handle the last point as an edge case so that we don't overflow the bounds of `point_schedule`. If the
649 // bucket accumulator exists, we add the point to it, otherwise the point simply becomes the bucket accumulator.
650 if (point_it == num_points - 1) {
651 uint64_t lhs_schedule = point_schedule[point_it];
652 size_t lhs_bucket = static_cast<size_t>(lhs_schedule) & 0xFFFFFFFF;
653 size_t lhs_point = static_cast<size_t>(lhs_schedule >> 32);
654 bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
655
656 if (has_bucket_accumulator) { // point is added to its bucket accumulator
657 affine_addition_scratch_space[affine_input_it] = points[lhs_point];
658 affine_addition_scratch_space[affine_input_it + 1] = bucket_accumulators[lhs_bucket];
659 bucket_accumulator_exists.set(lhs_bucket, false);
660 affine_addition_output_bucket_destinations[affine_input_it >> 1] = lhs_bucket;
661 affine_input_it += 2;
662 point_it += 1;
663 } else { // otherwise, cache the point into the bucket
664 BB_ASSERT_DEBUG(lhs_point < points.size());
665 bucket_accumulators[lhs_bucket] = points[lhs_point];
666 bucket_accumulator_exists.set(lhs_bucket, true);
667 point_it += 1;
668 }
669 }
670
671 // Now that we have populated `affine_addition_scratch_space`,
672 // compute `num_affine_points_to_add` independent additions using the Affine trick
673 size_t num_affine_points_to_add = affine_input_it;
674 if (num_affine_points_to_add >= 2) {
675 add_affine_points(&affine_addition_scratch_space[0], num_affine_points_to_add, &scalar_scratch_space[0]);
676 }
677 // `add_affine_points` stores the result in the top-half of the used scratch space
678 G1* affine_output = &affine_addition_scratch_space[0] + (num_affine_points_to_add / 2);
679
680 // Process the addition outputs.
681 // We either feed the addition outputs back into affine_addition_scratch_space for further additions,
682 // or, if a bucket needs no more additions, store the addition output in its bucket accumulator.
683 size_t new_scratch_space_it = 0;
684 size_t affine_output_it = 0;
685 size_t num_affine_output_points = num_affine_points_to_add / 2;
686 // This algorithm is equivalent to the one we used to populate `affine_addition_scratch_space` from the point
687 // schedule, however here we source points from a different location (the addition results)
688 while ((affine_output_it < (num_affine_output_points - 1)) && (num_affine_output_points > 0)) {
689 size_t lhs_bucket = static_cast<size_t>(affine_addition_output_bucket_destinations[affine_output_it]);
690 size_t rhs_bucket = static_cast<size_t>(affine_addition_output_bucket_destinations[affine_output_it + 1]);
691 BB_ASSERT_DEBUG(lhs_bucket < bucket_accumulator_exists.size());
692
693 bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
694 bool buckets_match = (lhs_bucket == rhs_bucket);
695 bool do_affine_add = buckets_match || has_bucket_accumulator;
696
697 const AffineElement* lhs_source = &affine_output[affine_output_it];
698 const AffineElement* rhs_source =
699 buckets_match ? &affine_output[affine_output_it + 1] : &bucket_accumulators[lhs_bucket];
700
701 AffineElement* lhs_destination =
702 do_affine_add ? &affine_addition_scratch_space[new_scratch_space_it] : &bucket_accumulators[lhs_bucket];
703 AffineElement* rhs_destination =
704 do_affine_add ? &affine_addition_scratch_space[new_scratch_space_it + 1] : &null_location;
705
706 uint64_t& source_bucket_destination = output_point_schedule[new_scratch_space_it >> 1];
707 source_bucket_destination = do_affine_add ? lhs_bucket : source_bucket_destination;
708
709 *lhs_destination = *lhs_source;
710 *rhs_destination = *rhs_source;
711
712 bucket_accumulator_exists.set(lhs_bucket, (has_bucket_accumulator && buckets_match) || !do_affine_add);
713 new_scratch_space_it += do_affine_add ? 2 : 0;
714 affine_output_it += (do_affine_add && buckets_match) ? 2 : 1;
715 }
716 // perform final iteration as edge case so we don't overflow `affine_addition_output_bucket_destinations`
717 if (affine_output_it == (num_affine_output_points - 1)) {
718
719 size_t lhs_bucket = static_cast<size_t>(affine_addition_output_bucket_destinations[affine_output_it]);
720
721 bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
722 if (has_bucket_accumulator) {
723 BB_ASSERT_DEBUG(new_scratch_space_it + 1 < affine_addition_scratch_space.size());
724 BB_ASSERT_DEBUG(lhs_bucket < bucket_accumulators.size());
725 BB_ASSERT_DEBUG((new_scratch_space_it >> 1) < output_point_schedule.size());
726 affine_addition_scratch_space[new_scratch_space_it] = affine_output[affine_output_it];
727 affine_addition_scratch_space[new_scratch_space_it + 1] = bucket_accumulators[lhs_bucket];
728 bucket_accumulator_exists.set(lhs_bucket, false);
729 output_point_schedule[new_scratch_space_it >> 1] = lhs_bucket;
730 new_scratch_space_it += 2;
731 affine_output_it += 1;
732 } else {
733 bucket_accumulators[lhs_bucket] = affine_output[affine_output_it];
734 bucket_accumulator_exists.set(lhs_bucket, true);
735 affine_output_it += 1;
736 }
737 }
738
739 // If we have not finished iterating over the point schedule,
740 // OR we have affine additions to perform in the scratch space, continue
741 if (point_it < num_points || new_scratch_space_it != 0) {
742 consume_point_schedule(point_schedule, points, affine_data, bucket_data, point_it, new_scratch_space_it);
743 }
744}
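
Stripped of the branchless pointer selection and prefetching, each iteration of the pairing loops above makes the same three-way decision described in the comments. A plain restatement of just that decision (types and names here are hypothetical):

#include <cstdint>

enum class PairAction {
    AddTwoPoints,      // case 1: consecutive schedule entries share a bucket -> add them together
    AddPointToBucket,  // case 2: buckets differ but the bucket already holds an accumulator
    CachePointInBucket // case 3: buckets differ and the bucket is empty -> stash the point there
};

PairAction classify(uint32_t lhs_bucket, uint32_t rhs_bucket, bool bucket_has_accumulator)
{
    if (lhs_bucket == rhs_bucket) {
        return PairAction::AddTwoPoints; // consumes 2 schedule entries, queues 2 points for addition
    }
    if (bucket_has_accumulator) {
        return PairAction::AddPointToBucket; // consumes 1 schedule entry, queues 2 points for addition
    }
    return PairAction::CachePointInBucket; // consumes 1 schedule entry, queues nothing
}
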
745
759template <typename Curve>
760std::vector<typename Curve::AffineElement> MSM<Curve>::batch_multi_scalar_mul(
761 std::span<std::span<const AffineElement>> points,
762 std::span<std::span<ScalarField>> scalars,
763 bool handle_edge_cases) noexcept
764{
765 BB_ASSERT_EQ(points.size(), scalars.size());
766 const size_t num_msms = points.size();
767
768 std::vector<std::vector<uint32_t>> msm_scalar_indices;
769 std::vector<ThreadWorkUnits> thread_work_units = get_work_units(scalars, msm_scalar_indices);
770 const size_t num_cpus = get_num_cpus();
771 std::vector<std::vector<std::pair<Element, size_t>>> thread_msm_results(num_cpus);
772 BB_ASSERT_EQ(thread_work_units.size(), num_cpus);
773
774 // Once we have our work units, each thread can independently evaluate its assigned msms
775 parallel_for(num_cpus, [&](size_t thread_idx) {
776 if (!thread_work_units[thread_idx].empty()) {
777 const std::vector<MSMWorkUnit>& msms = thread_work_units[thread_idx];
778 std::vector<std::pair<Element, size_t>>& msm_results = thread_msm_results[thread_idx];
779 for (const MSMWorkUnit& msm : msms) {
780 std::span<const ScalarField> work_scalars = scalars[msm.batch_msm_index];
781 std::span<const AffineElement> work_points = points[msm.batch_msm_index];
782 std::span<const uint32_t> work_indices =
783 std::span<const uint32_t>{ &msm_scalar_indices[msm.batch_msm_index][msm.start_index], msm.size };
784 std::vector<uint64_t> point_schedule(msm.size);
785 MSMData msm_data(work_scalars, work_points, work_indices, std::span<uint64_t>(point_schedule));
786 Element msm_result = Curve::Group::point_at_infinity;
787 constexpr size_t SINGLE_MUL_THRESHOLD = 16;
788 if (msm.size < SINGLE_MUL_THRESHOLD) {
789 msm_result = small_mul<Curve>(work_scalars, work_points, msm_data.scalar_indices, msm.size);
790 } else {
791 // Our non-affine method implicitly handles cases where Weierstrass edge cases may occur
792 // Note: not as fast! use unsafe version if you know all input base points are linearly independent
793 if (handle_edge_cases) {
794 msm_result = small_pippenger_low_memory_with_transformed_scalars(msm_data);
795 } else {
796 msm_result = pippenger_low_memory_with_transformed_scalars(msm_data);
797 }
798 }
799 msm_results.push_back(std::make_pair(msm_result, msm.batch_msm_index));
800 }
801 }
802 });
803
804 // Accumulate results. This part needs to be single threaded, but the amount of work done here should be small
805 // TODO(@zac-williamson) check this? E.g. if we are doing a 2^16 MSM with 256 threads this single-threaded part
806 // will be painful.
807 std::vector<Element> results(num_msms);
808 for (Element& ele : results) {
809 ele.self_set_infinity();
810 }
811 for (const auto& single_thread_msm_results : thread_msm_results) {
812 for (const std::pair<Element, size_t>& result : single_thread_msm_results) {
813 results[result.second] += result.first;
814 }
815 }
816 Element::batch_normalize(&results[0], num_msms);
817
818 std::vector<AffineElement> affine_results;
819 for (const auto& ele : results) {
820 affine_results.emplace_back(AffineElement(ele.x, ele.y));
821 }
822
823 // Convert our scalars back into Montgomery form so that the caller's scalars are left unchanged
824 for (auto& msm_scalars : scalars) {
825 parallel_for_range(msm_scalars.size(), [&](size_t start, size_t end) {
826 for (size_t i = start; i < end; ++i) {
827 msm_scalars[i].self_to_montgomery_form();
828 }
829 });
830 }
831 return affine_results;
832}
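
A usage sketch for the batch API (header paths, curve choice and surrounding setup are assumptions for illustration; note that the scalar spans are temporarily taken out of Montgomery form during the call and restored before it returns):

// Illustrative only: exact header paths are assumptions, not part of this file.
#include "barretenberg/ecc/curves/bn254/bn254.hpp"
#include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp"

#include <span>
#include <vector>

using Curve = bb::curve::BN254;
using BnMsm = bb::scalar_multiplication::MSM<Curve>;

// Evaluate two independent MSMs with one thread-pool dispatch; one affine result is returned
// per input MSM, in order. The scalar spans are mutated in place (converted out of Montgomery
// form and back), so they must be writable.
std::vector<Curve::AffineElement> two_msms(std::span<Curve::AffineElement> points_a,
                                           std::span<Curve::ScalarField> scalars_a,
                                           std::span<Curve::AffineElement> points_b,
                                           std::span<Curve::ScalarField> scalars_b)
{
    std::vector<std::span<const Curve::AffineElement>> points{ points_a, points_b };
    std::vector<std::span<Curve::ScalarField>> scalars{ scalars_a, scalars_b };
    return BnMsm::batch_multi_scalar_mul(points, scalars, /*handle_edge_cases=*/true);
}
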
833
842template <typename Curve>
843typename Curve::AffineElement MSM<Curve>::msm(std::span<const AffineElement> points,
844 PolynomialSpan<const ScalarField> _scalars,
845 bool handle_edge_cases) noexcept
846{
847 if (_scalars.size() == 0) {
848 return Curve::Group::affine_point_at_infinity;
849 }
850 BB_ASSERT_GTE(points.size(), _scalars.start_index + _scalars.size());
851
852 // Unfortunately we need to remove const on this data type to avoid duplicating _scalars (which is typically
853 // large). We need to convert `_scalars` out of Montgomery form for the MSM; we then convert the scalars back
854 // into Montgomery form at the end of the algorithm. NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
855 // TODO(https://github.com/AztecProtocol/barretenberg/issues/1449): handle const correctness.
856 ScalarField* scalars = const_cast<ScalarField*>(&_scalars[_scalars.start_index]);
857
858 std::vector<std::span<const AffineElement>> pp{ points.subspan(_scalars.start_index) };
859 std::vector<std::span<ScalarField>> ss{ std::span<ScalarField>(scalars, _scalars.size()) };
860 AffineElement result = batch_multi_scalar_mul(pp, ss, handle_edge_cases)[0];
861 return result;
862}
863
864template <typename Curve>
865typename Curve::Element pippenger(PolynomialSpan<const typename Curve::ScalarField> scalars,
866 std::span<const typename Curve::AffineElement> points,
867 [[maybe_unused]] bool handle_edge_cases) noexcept
868{
869 return MSM<Curve>::msm(points, scalars, handle_edge_cases);
870}
871
872template <typename Curve>
878
879template curve::Grumpkin::Element pippenger<curve::Grumpkin>(
880 PolynomialSpan<const curve::Grumpkin::ScalarField> scalars, std::span<const curve::Grumpkin::AffineElement> points,
881 bool handle_edge_cases = true) noexcept;
882
883template curve::Grumpkin::Element pippenger_unsafe<curve::Grumpkin>(
884 PolynomialSpan<const curve::Grumpkin::ScalarField> scalars, std::span<const curve::Grumpkin::AffineElement> points);
885
886template curve::BN254::Element pippenger<curve::BN254>(PolynomialSpan<const curve::BN254::ScalarField> scalars,
887 std::span<const curve::BN254::AffineElement> points,
888 bool handle_edge_cases = true);
889
890template curve::BN254::Element pippenger_unsafe<curve::BN254>(PolynomialSpan<const curve::BN254::ScalarField> scalars,
891 std::span<const curve::BN254::AffineElement> points);
892
893} // namespace bb::scalar_multiplication
894
895template class bb::scalar_multiplication::MSM<bb::curve::Grumpkin>;
896template class bb::scalar_multiplication::MSM<bb::curve::BN254>;
#define BB_ASSERT_GTE(left, right,...)
Definition assert.hpp:122
#define BB_ASSERT_DEBUG(expression,...)
Definition assert.hpp:54
#define BB_ASSERT_EQ(actual, expected,...)
Definition assert.hpp:77
#define BB_ASSERT_LT(left, right,...)
Definition assert.hpp:137
typename Group::element Element
Definition grumpkin.hpp:62
typename Group::affine_element AffineElement
Definition grumpkin.hpp:63
static Element evaluate_small_pippenger_round(MSMData &msm_data, const size_t round_index, JacobianBucketAccumulators &bucket_data, Element previous_round_output, const size_t bits_per_slice) noexcept
Evaluate a single Pippenger round when we do not use the Affine trick.
static Element small_pippenger_low_memory_with_transformed_scalars(MSMData &msm_data) noexcept
Top-level Pippenger algorithm where number of points is small and we are not using the Affine trick.
static Element pippenger_low_memory_with_transformed_scalars(MSMData &msm_data) noexcept
Top-level Pippenger algorithm.
static uint32_t get_scalar_slice(const ScalarField &scalar, size_t round, size_t normal_slice_size) noexcept
Given a scalar that is NOT in Montgomery form, extract a slice_size-bit chunk.
static bool use_affine_trick(const size_t num_points, const size_t num_buckets) noexcept
Given a number of points and an optimal bucket size, should we use the affine trick?
static void add_affine_points(AffineElement *points, const size_t num_points, typename Curve::BaseField *scratch_space) noexcept
static size_t get_optimal_log_num_buckets(const size_t num_points) noexcept
For a given number of points, compute the optimal Pippenger bucket size.
static std::vector< ThreadWorkUnits > get_work_units(std::span< std::span< ScalarField > > scalars, std::vector< std::vector< uint32_t > > &msm_scalar_indices) noexcept
Split a multiple multi-scalar-multiplication into equal units of work that can be processed by thread...
static void consume_point_schedule(std::span< const uint64_t > point_schedule, std::span< const AffineElement > points, AffineAdditionData &affine_data, BucketAccumulators &bucket_data, size_t num_input_points_processed, size_t num_queued_affine_points) noexcept
Given a list of points and target buckets to add into, perform required group operations.
static std::vector< AffineElement > batch_multi_scalar_mul(std::span< std::span< const AffineElement > > points, std::span< std::span< ScalarField > > scalars, bool handle_edge_cases=true) noexcept
Compute multiple multi-scalar multiplications.
static Element evaluate_pippenger_round(MSMData &msm_data, const size_t round_index, AffineAdditionData &affine_data, BucketAccumulators &bucket_data, Element previous_round_output, const size_t bits_per_slice) noexcept
Evaluate a single Pippenger round where we use the affine trick.
static void transform_scalar_and_get_nonzero_scalar_indices(std::span< typename Curve::ScalarField > scalars, std::vector< uint32_t > &consolidated_indices) noexcept
Convert scalar out of Montgomery form. Populate consolidated_indices with nonzero scalar indices.
typename Curve::ScalarField ScalarField
typename Curve::AffineElement AffineElement
ssize_t offset
Definition engine.cpp:36
constexpr T ceil_div(const T &numerator, const T &denominator)
Computes the ceiling of the division of two integral types.
Definition general.hpp:23
Curve::Element pippenger(PolynomialSpan< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points, bool handle_edge_cases) noexcept
Curve::Element small_mul(std::span< const typename Curve::ScalarField > &scalars, std::span< const typename Curve::AffineElement > &points, std::span< const uint32_t > scalar_indices, size_t range) noexcept
Fallback method for very small numbers of input points.
size_t process_buckets_count_zero_entries(uint64_t *wnaf_entries, const size_t num_entries, const uint32_t num_bits) noexcept
Curve::Element pippenger_unsafe(PolynomialSpan< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points) noexcept
Entry point for Barretenberg command-line interface.
Definition api.hpp:5
size_t get_num_cpus()
Definition thread.cpp:33
void parallel_for(size_t num_iterations, const std::function< void(size_t)> &func)
Definition thread.cpp:111
void parallel_for_range(size_t num_points, const std::function< void(size_t, size_t)> &func, size_t no_multhreading_if_less_or_equal)
Split a loop into several loops running in parallel.
Definition thread.cpp:141
STL namespace.
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13
Curve::Element Element
size_t total_threads
Definition thread.hpp:160
size_t thread_index
Definition thread.hpp:159
auto range(size_t size, size_t offset=0) const
Definition thread.hpp:161
static constexpr field one()
constexpr field invert() const noexcept
BB_INLINE constexpr field sqr() const noexcept
Temp data structure, one created per thread!
Temp data structure, one created per thread!
MSMWorkUnit describes an MSM that may be part of a larger MSM.
void throw_or_abort(std::string const &err)