                                                 std::span<std::span<ScalarField>> scalars,
                                                 std::vector<std::vector<uint32_t>>& msm_scalar_indices) noexcept
{
    const size_t num_msms = scalars.size();
    msm_scalar_indices.resize(num_msms);

    // Preprocess each scalar (taking it out of Montgomery form; it is converted back at the end
    // of the batch driver) and record the indices of the nonzero scalars.
    for (size_t i = 0; i < num_msms; ++i) {
        transform_scalar_and_get_nonzero_scalar_indices(scalars[i], msm_scalar_indices[i]);
    }
    // Total work = total number of nonzero scalar multiplications across the whole batch.
    size_t total_work = 0;
    for (const auto& indices : msm_scalar_indices) {
        total_work += indices.size();
    }

    // The work is split evenly: work_per_thread = total_work / num_threads, and the final
    // thread absorbs the rounding remainder.
    size_t work_of_last_thread = total_work - (work_per_thread * (num_threads - 1));
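    // Example: total_work = 10 and num_threads = 4 gives work_per_thread = 10 / 4 = 2, and the
    // last thread takes 10 - 2 * 3 = 4 items, i.e. a (2, 2, 2, 4) split.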
    // Degenerate case: more threads than work items. Each MSM becomes a single work unit
    // covering all of its nonzero scalars, so nothing needs to be split.
    if (num_threads > total_work) {
        for (size_t i = 0; i < num_msms; ++i) {
            work_units[i].push_back(MSMWorkUnit{
                .batch_msm_index = i,
                .start_index = 0,
                .size = msm_scalar_indices[i].size(),
            });
        }
        return work_units;
    }
    // Greedily pack work units: walk the MSMs in order, filling each thread up to its quota
    // before moving on to the next. A single large MSM may be split across several threads.
    size_t thread_accumulated_work = 0;
    size_t current_thread_idx = 0;
    for (size_t i = 0; i < num_msms; ++i) {
        size_t msm_work = msm_scalar_indices[i].size();
        size_t msm_size = msm_work;
        while (msm_work > 0) {
            const size_t total_thread_work =
                (current_thread_idx == num_threads - 1) ? work_of_last_thread : work_per_thread;
            const size_t available_thread_work = total_thread_work - thread_accumulated_work;

            if (available_thread_work >= msm_work) {
                // The remainder of this MSM fits into the current thread.
                work_units[current_thread_idx].push_back(MSMWorkUnit{
                    .batch_msm_index = i,
                    .start_index = msm_size - msm_work,
                    .size = msm_work,
                });
                thread_accumulated_work += msm_work;
                msm_work = 0;
            } else {
                // Fill the current thread to capacity and continue with the next one.
                work_units[current_thread_idx].push_back(MSMWorkUnit{
                    .batch_msm_index = i,
                    .start_index = msm_size - msm_work,
                    .size = available_thread_work,
                });
                msm_work -= available_thread_work;
                current_thread_idx++;
                thread_accumulated_work = 0;
            }
        }
    }
    return work_units;
}
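// add_affine_points: batched affine addition via Montgomery's batch-inversion trick. For each
// input pair P1 = (x1, y1), P2 = (x2, y2):
//   lambda = (y2 - y1) / (x2 - x1),  x3 = lambda^2 - x1 - x2,  y3 = lambda * (x1 - x3) - y1.
// All (x2 - x1) denominators are inverted with a single field inversion: the forward pass
// accumulates their running product, and the backward pass peels the individual inverses back
// off the inverted accumulator. Pair sums are written into the top half of the `points` array;
// `scratch_space` caches the (x1 + x2) values needed by the backward pass.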
void add_affine_points(AffineElement* points, const size_t num_points, Fq* scratch_space) noexcept
{
    Fq batch_inversion_accumulator = Fq::one();
    // Forward pass: cache (x1 + x2), overwrite P2 with (x2 - x1, y2 - y1), scale the y-difference
    // by the running product of all previous denominators, then fold (x2 - x1) into the product.
    for (size_t i = 0; i < num_points; i += 2) {
        scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x1 + x2
        points[i + 1].x -= points[i].x;                        // x2 - x1
        points[i + 1].y -= points[i].y;                        // y2 - y1
        points[i + 1].y *= batch_inversion_accumulator;        // (y2 - y1) * prod of earlier (x2 - x1)
        batch_inversion_accumulator *= (points[i + 1].x);
    }
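    // After the forward pass the accumulator holds A = prod_k (x2_k - x1_k). One inversion gives
    // A^{-1}; walking the pairs in reverse, multiplying the k-th stored value
    // (y2_k - y1_k) * prod_{j<k} (x2_j - x1_j) by the current inverse accumulator yields exactly
    // lambda_k = (y2_k - y1_k) / (x2_k - x1_k), and multiplying the inverse accumulator by
    // (x2_k - x1_k) strips that factor before moving to the previous pair.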
    // A zero accumulator means some pair had x1 == x2 (a doubling or repeated input point),
    // which batched affine addition cannot handle.
    if (batch_inversion_accumulator == 0) {
        throw_or_abort("attempted to invert zero in add_affine_points");
    }
    batch_inversion_accumulator = batch_inversion_accumulator.invert();
    // Backward pass: recover each lambda from the inverted accumulator and write the pair sums
    // into the top half of the points array. (i is unsigned, so the loop ends when it wraps.)
    for (size_t i = (num_points)-2; i < num_points; i -= 2) {
        points[i + 1].y *= batch_inversion_accumulator;                              // lambda = (y2 - y1) / (x2 - x1)
        batch_inversion_accumulator *= points[i + 1].x;                              // strip this pair's (x2 - x1)
        points[i + 1].x = points[i + 1].y.sqr();                                     // lambda^2
        points[(i + num_points) >> 1].x = points[i + 1].x - (scratch_space[i >> 1]); // x3 = lambda^2 - x1 - x2

        // Memory bandwidth is the bottleneck here; prefetch the operands of upcoming iterations.
        __builtin_prefetch(points + i - 2);
        __builtin_prefetch(points + i - 1);
        __builtin_prefetch(points + ((i + num_points - 2) >> 1));
        __builtin_prefetch(scratch_space + ((i - 2) >> 1));

        points[i].x -= points[(i + num_points) >> 1].x;              // x1 - x3
        points[i].x *= points[i + 1].y;                              // lambda * (x1 - x3)
        points[(i + num_points) >> 1].y = points[i].x - points[i].y; // y3 = lambda * (x1 - x3) - y1
    }
}
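// One round of the Pippenger bucket method (the simple variant used when edge cases must be
// handled). For round r, every nonzero scalar s_i contributes its point P_i to the bucket
// numbered slice_r(s_i), where slice_r extracts a bits_per_slice-bit window of s_i. The round
// output is sum_j j * B_j (computed by accumulate_buckets via running bucket sums), and rounds
// are combined most-significant-window first:
//   result = previous_round_output * 2^bits_per_slice + round_output,
// i.e. bits_per_slice doublings followed by one addition (the final round uses fewer doublings
// when bits_per_slice does not divide the scalar bit length).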
                                       const size_t round_index,
                                       const size_t bits_per_slice) noexcept
{
    std::span<const uint32_t>& nonzero_scalar_indices = msm_data.scalar_indices;
    std::span<const ScalarField>& scalars = msm_data.scalars;
    std::span<const AffineElement>& points = msm_data.points;

    const size_t size = nonzero_scalar_indices.size();
    for (size_t i = 0; i < size; ++i) {
        // This round's bits_per_slice-bit window of the scalar names the target bucket.
        uint32_t bucket_index = get_scalar_slice(scalars[nonzero_scalar_indices[i]], round_index, bits_per_slice);
        BB_ASSERT_DEBUG(bucket_index < static_cast<uint32_t>(1 << bits_per_slice));
        // Bucket 0 contributes nothing to sum_j j * B_j, so zero slices are skipped.
        if (bucket_index > 0) {
            if (bucket_data.bucket_exists.get(bucket_index)) {
                bucket_data.buckets[bucket_index] += points[nonzero_scalar_indices[i]];
            } else {
                bucket_data.buckets[bucket_index] = points[nonzero_scalar_indices[i]];
                bucket_data.bucket_exists.set(bucket_index, true);
            }
        }
    }
    // Reduce the buckets into this round's output, then reset the bucket occupancy flags.
    Element round_output;
    round_output.self_set_infinity();
    round_output = accumulate_buckets(bucket_data);
    bucket_data.bucket_exists.clear();

    // Fold the round into the running total: one doubling per slice bit, then a single addition.
    Element result = previous_round_output;
    size_t num_doublings = ((round_index == num_rounds - 1) && (NUM_BITS_IN_FIELD % bits_per_slice != 0))
                               ? NUM_BITS_IN_FIELD % bits_per_slice
                               : bits_per_slice;
    for (size_t i = 0; i < num_doublings; ++i) {
        result.self_dbl();
    }
    result += round_output;
    return result;
}
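// The faster round variant below builds a 64-bit "point schedule" instead of adding points into
// buckets one at a time: each entry packs the point's index into the high 32 bits and its bucket
// (this round's scalar slice) into the low 32 bits. Sorting the schedule by bucket groups points
// with the same destination next to each other, so they can be combined pairwise with the
// batched affine addition above rather than with per-point bucket additions.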
                                       const size_t round_index,
                                       const size_t bits_per_slice) noexcept
{
    std::span<const uint32_t>& scalar_indices = msm_data.scalar_indices;
    std::span<const ScalarField>& scalars = msm_data.scalars;
    std::span<const AffineElement>& points = msm_data.points;
    std::span<uint64_t>& round_schedule = msm_data.point_schedule;
    const size_t size = scalar_indices.size();

    // Build the point schedule: low 32 bits = bucket index (this round's scalar slice),
    // high 32 bits = index of the point/scalar the entry refers to.
    for (size_t i = 0; i < size; ++i) {
        round_schedule[i] = get_scalar_slice(scalars[scalar_indices[i]], round_index, bits_per_slice);
        round_schedule[i] += (static_cast<uint64_t>(scalar_indices[i]) << 32ULL);
    }
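    // e.g. scalar index 7 whose slice for this round is 0x1a3 is encoded as
    // (7ULL << 32) | 0x1a3 = 0x0000'0007'0000'01a3.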
    // Sort the schedule by bucket index so same-bucket points become adjacent, and count the
    // zero-slice entries (they contribute nothing to this round). The sort-and-count helper is
    // assumed to be process_buckets_count_zero_entries.
    const size_t num_zero_entries =
        process_buckets_count_zero_entries(&round_schedule[0], size, static_cast<uint32_t>(bits_per_slice));
    const size_t round_size = size - num_zero_entries;
    Element round_output;
    round_output.self_set_infinity();

    if (round_size > 0) {
        // Consume the nonzero tail of the sorted schedule, then reduce the buckets.
        std::span<uint64_t> point_schedule(&round_schedule[num_zero_entries], round_size);
        consume_point_schedule(point_schedule, points, affine_data, bucket_data, 0, 0);
        round_output = accumulate_buckets(bucket_data);
        bucket_data.bucket_exists.clear();
    }
    // Fold the round into the running total: one doubling per slice bit, then a single addition.
    Element result = previous_round_output;
    size_t num_doublings = ((round_index == num_rounds - 1) && (NUM_BITS_IN_FIELD % bits_per_slice != 0))
                               ? NUM_BITS_IN_FIELD % bits_per_slice
                               : bits_per_slice;
    for (size_t i = 0; i < num_doublings; ++i) {
        result.self_dbl();
    }
    result += round_output;
    return result;
}
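// consume_point_schedule: greedy pairwise reduction of a sorted point schedule into bucket
// accumulators. Walking adjacent schedule entries (lhs, rhs) there are four cases:
//   1. same bucket, no accumulator: queue (lhs point, rhs point) for a batched affine addition
//   2. same bucket, accumulator:    queue the two points; the accumulator stays where it is
//   3. diff bucket, accumulator:    queue (lhs point, existing accumulator); the slot empties
//   4. diff bucket, no accumulator: move the lhs point straight into the empty accumulator slot
// Queued pairs are summed with one batched affine addition; each output is itself a point that
// still has to reach its bucket, so outputs are re-paired by the same rule and the function
// recurses until the schedule is exhausted and no queued outputs remain.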
                                    size_t num_input_points_processed,
                                    size_t num_queued_affine_points) noexcept
{
    size_t point_it = num_input_points_processed;
    size_t affine_input_it = num_queued_affine_points;

    size_t num_points = point_schedule.size();
    auto& bucket_accumulator_exists = bucket_data.bucket_exists;
    auto& affine_addition_scratch_space = affine_data.points_to_add;
    auto& bucket_accumulators = bucket_data.buckets;
    auto& affine_addition_output_bucket_destinations = affine_data.addition_result_bucket_destinations;
    auto& scalar_scratch_space = affine_data.scalar_scratch_space;
    // Aliases the same buffer as affine_addition_output_bucket_destinations; the re-pairing pass
    // below reads it at or ahead of the index it writes, so the in-place reuse is safe.
    auto& output_point_schedule = affine_data.addition_result_bucket_destinations;

    // Only prefetch while at least 32 schedule entries remain; clamp to avoid unsigned underflow.
    size_t prefetch_max = (num_points - 32);
    if (num_points < 32) {
        prefetch_max = 0;
    }
    size_t end = num_points - 1;
    if (num_points == 0) {
        end = 0;
    }
    // Pair up schedule entries until the affine-addition batch is full or the schedule (bar a
    // possible final entry) has been consumed.
    while (((affine_input_it + 1) < AffineAdditionData::BATCH_SIZE) && (point_it < end)) {
        // Every 16 entries, prefetch the points referenced 16-31 entries ahead.
        if ((point_it < prefetch_max) && ((point_it & 0x0f) == 0)) {
            for (size_t i = 16; i < 32; ++i) {
                __builtin_prefetch(&points[(point_schedule[point_it + i] >> 32ULL)]);
            }
        }

        uint64_t lhs_schedule = point_schedule[point_it];
        uint64_t rhs_schedule = point_schedule[point_it + 1];
        size_t lhs_bucket = static_cast<size_t>(lhs_schedule) & 0xFFFFFFFF;
        size_t rhs_bucket = static_cast<size_t>(rhs_schedule) & 0xFFFFFFFF;
        size_t lhs_point = static_cast<size_t>(lhs_schedule >> 32);
        size_t rhs_point = static_cast<size_t>(rhs_schedule >> 32);

        bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
        bool buckets_match = lhs_bucket == rhs_bucket;
        bool do_affine_add = buckets_match || has_bucket_accumulator;

        // If an addition is possible, pair the lhs point with either the rhs point (same bucket)
        // or the existing bucket accumulator; otherwise the lhs point moves into its empty bucket.
        const AffineElement* lhs_source = &points[lhs_point];
        const AffineElement* rhs_source = buckets_match ? &points[rhs_point] : &bucket_accumulators[lhs_bucket];

        AffineElement* lhs_destination =
            do_affine_add ? &affine_addition_scratch_space[affine_input_it] : &bucket_accumulators[lhs_bucket];
        AffineElement* rhs_destination =
            do_affine_add ? &affine_addition_scratch_space[affine_input_it + 1] : &null_location; // discard slot

        // Record which bucket the queued pair's sum must eventually be routed to.
        uint64_t& source_bucket_destination = affine_addition_output_bucket_destinations[affine_input_it >> 1];
        source_bucket_destination = do_affine_add ? lhs_bucket : source_bucket_destination;

        *lhs_destination = *lhs_source;
        *rhs_destination = *rhs_source;

        // The bucket keeps its accumulator only if the pair left it untouched, or gains one when
        // the lhs point was written straight into it.
        bucket_accumulator_exists.set(lhs_bucket,
                                      (has_bucket_accumulator && buckets_match) || !do_affine_add);

        affine_input_it += do_affine_add ? 2 : 0;
        point_it += (do_affine_add && buckets_match) ? 2 : 1;
    }
    // Handle a final unpaired schedule entry: either pair it with an existing bucket accumulator
    // or drop it straight into the empty bucket.
    if (point_it == num_points - 1) {
        uint64_t lhs_schedule = point_schedule[point_it];
        size_t lhs_bucket = static_cast<size_t>(lhs_schedule) & 0xFFFFFFFF;
        size_t lhs_point = static_cast<size_t>(lhs_schedule >> 32);
        bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);

        if (has_bucket_accumulator) {
            affine_addition_scratch_space[affine_input_it] = points[lhs_point];
            affine_addition_scratch_space[affine_input_it + 1] = bucket_accumulators[lhs_bucket];
            bucket_accumulator_exists.set(lhs_bucket, false);
            affine_addition_output_bucket_destinations[affine_input_it >> 1] = lhs_bucket;
            affine_input_it += 2;
            point_it += 1;
        } else {
            bucket_accumulators[lhs_bucket] = points[lhs_point];
            bucket_accumulator_exists.set(lhs_bucket, true);
            point_it += 1;
        }
    }
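    // Everything queued so far is summed with a single batched affine addition. add_affine_points
    // writes the pair sums into the top half of the scratch array, so `affine_output` points at
    // num_affine_points_to_add / 2 results; each result still has to reach the bucket recorded for
    // it, so the results are re-paired below using the same four-case rule as above.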
    size_t num_affine_points_to_add = affine_input_it;
    if (num_affine_points_to_add >= 2) {
        add_affine_points(&affine_addition_scratch_space[0], num_affine_points_to_add, &scalar_scratch_space[0]);
    }
    G1* affine_output = &affine_addition_scratch_space[0] + (num_affine_points_to_add / 2);

    size_t new_scratch_space_it = 0;
    size_t affine_output_it = 0;
    size_t num_affine_output_points = num_affine_points_to_add / 2;
    // Re-pair the freshly produced sums (the > 0 check keeps the loop from running when there
    // are no outputs, since the first comparison wraps for a zero count).
    while ((affine_output_it < (num_affine_output_points - 1)) && (num_affine_output_points > 0)) {
        size_t lhs_bucket = static_cast<size_t>(affine_addition_output_bucket_destinations[affine_output_it]);
        size_t rhs_bucket = static_cast<size_t>(affine_addition_output_bucket_destinations[affine_output_it + 1]);

        bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
        bool buckets_match = (lhs_bucket == rhs_bucket);
        bool do_affine_add = buckets_match || has_bucket_accumulator;

        const AffineElement* lhs_source = &affine_output[affine_output_it];
        const AffineElement* rhs_source =
            buckets_match ? &affine_output[affine_output_it + 1] : &bucket_accumulators[lhs_bucket];

        AffineElement* lhs_destination =
            do_affine_add ? &affine_addition_scratch_space[new_scratch_space_it] : &bucket_accumulators[lhs_bucket];
        AffineElement* rhs_destination =
            do_affine_add ? &affine_addition_scratch_space[new_scratch_space_it + 1] : &null_location;

        uint64_t& source_bucket_destination = output_point_schedule[new_scratch_space_it >> 1];
        source_bucket_destination = do_affine_add ? lhs_bucket : source_bucket_destination;

        *lhs_destination = *lhs_source;
        *rhs_destination = *rhs_source;

        bucket_accumulator_exists.set(lhs_bucket, (has_bucket_accumulator && buckets_match) || !do_affine_add);
        new_scratch_space_it += do_affine_add ? 2 : 0;
        affine_output_it += (do_affine_add && buckets_match) ? 2 : 1;
    }
    // As before, a final unpaired output either joins an existing accumulator in the next batch
    // or becomes the bucket accumulator itself.
    if (affine_output_it == (num_affine_output_points - 1)) {
        size_t lhs_bucket = static_cast<size_t>(affine_addition_output_bucket_destinations[affine_output_it]);
        bool has_bucket_accumulator = bucket_accumulator_exists.get(lhs_bucket);
        if (has_bucket_accumulator) {
            BB_ASSERT_DEBUG(new_scratch_space_it + 1 < affine_addition_scratch_space.size());
            BB_ASSERT_DEBUG((new_scratch_space_it >> 1) < output_point_schedule.size());
            affine_addition_scratch_space[new_scratch_space_it] = affine_output[affine_output_it];
            affine_addition_scratch_space[new_scratch_space_it + 1] = bucket_accumulators[lhs_bucket];
            bucket_accumulator_exists.set(lhs_bucket, false);
            output_point_schedule[new_scratch_space_it >> 1] = lhs_bucket;
            new_scratch_space_it += 2;
            affine_output_it += 1;
        } else {
            bucket_accumulators[lhs_bucket] = affine_output[affine_output_it];
            bucket_accumulator_exists.set(lhs_bucket, true);
            affine_output_it += 1;
        }
    }

    // Recurse while any work remains: unconsumed schedule entries, or queued pair outputs that
    // have not yet reached their buckets.
    if (point_it < num_points || new_scratch_space_it != 0) {
        consume_point_schedule(point_schedule, points, affine_data, bucket_data, point_it, new_scratch_space_it);
    }
}
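// Batch driver: each thread processes its share of the work units computed above. A work unit is
// a contiguous slice of one MSM's nonzero scalars, so a large MSM may be evaluated by several
// threads while tiny MSMs fall back to a direct evaluation. Per-thread partial results are tagged
// with their batch index, summed per MSM, normalized to affine form in one batch, and the scalars
// (taken out of Montgomery form during preprocessing) are restored before returning.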
                                     bool handle_edge_cases) noexcept
{
    const size_t num_msms = points.size();

    // One result vector per thread; each entry pairs a partial MSM result with its batch index.
    const size_t num_threads = thread_work_units.size();
    std::vector<std::vector<std::pair<Element, size_t>>> thread_msm_results(num_threads);
    parallel_for(num_threads, [&](size_t thread_idx) {
        auto& msm_results = thread_msm_results[thread_idx];
        if (!thread_work_units[thread_idx].empty()) {
            for (const MSMWorkUnit& msm : thread_work_units[thread_idx]) {
                std::span<const ScalarField> work_scalars = scalars[msm.batch_msm_index];
                std::span<const AffineElement> work_points = points[msm.batch_msm_index];
                std::span<const uint32_t> work_indices =
                    std::span<const uint32_t>{ &msm_scalar_indices[msm.batch_msm_index][msm.start_index], msm.size };
                std::vector<uint64_t> point_schedule(msm.size);
                MSMData msm_data(work_scalars, work_points, work_indices, std::span<uint64_t>(point_schedule));

                Element msm_result = Curve::Group::point_at_infinity;
                // Tiny work units are handed to small_mul instead of the full Pippenger pipeline.
                constexpr size_t SINGLE_MUL_THRESHOLD = 16;
                if (msm.size < SINGLE_MUL_THRESHOLD) {
                    msm_result = small_mul<Curve>(work_scalars, work_points, msm_data.scalar_indices, msm.size);
                } else {
                    if (handle_edge_cases) {
                        msm_result = small_pippenger_low_memory_with_transformed_scalars(msm_data);
                    } else {
                        msm_result = pippenger_low_memory_with_transformed_scalars(msm_data);
                    }
                }
                msm_results.push_back(std::make_pair(msm_result, msm.batch_msm_index));
            }
        }
    });
    // Sum the per-thread partial results into one Element per MSM.
    std::vector<Element> results(num_msms);
    for (auto& ele : results) {
        ele.self_set_infinity();
    }
    for (const auto& single_thread_msm_results : thread_msm_results) {
        for (const auto& result : single_thread_msm_results) {
            results[result.second] += result.first;
        }
    }
    Element::batch_normalize(&results[0], num_msms);
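    // batch_normalize rescales every result to z = 1 in one pass (a single shared inversion
    // rather than one inversion per point), after which each result's (x, y) coordinates can be
    // read off directly as an affine point.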
    std::vector<AffineElement> affine_results;
    affine_results.reserve(num_msms);
    for (const auto& ele : results) {
        // Assumes the preceding batch_normalize left z == 1, so (x, y) are the affine coordinates.
        affine_results.emplace_back(ele.x, ele.y);
    }
    // The scalars were taken out of Montgomery form during preprocessing; convert them back so
    // the caller's inputs are left exactly as they were received.
    for (auto& msm_scalars : scalars) {
        parallel_for_range(msm_scalars.size(), [&](size_t start, size_t end) {
            for (size_t i = start; i < end; ++i) {
                msm_scalars[i].self_to_montgomery_form();
            }
        });
    }
    return affine_results;
}
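// Usage sketch (assumptions: the public entry point, here called batch_multi_scalar_mul, takes a
// span of point spans and a span of scalar spans, one pair per MSM in the batch):
//
//   std::vector<std::span<const AffineElement>> point_sets = { points_a, points_b };
//   std::vector<std::span<ScalarField>> scalar_sets = { scalars_a, scalars_b };
//   std::vector<AffineElement> results =
//       batch_multi_scalar_mul(point_sets, scalar_sets, /*handle_edge_cases=*/true);
//   // results[0] == sum_i scalars_a[i] * points_a[i]; results[1] likewise for the second set.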