13 const std::span<G1>& points,
const std::vector<size_t>& sequence_counts)
18 std::span<Fq> scratch_space(scratch_space_vector);
21 auto [addition_sequences_, sequence_tags] = construct_thread_data(points, sequence_counts, scratch_space);
22 auto& addition_sequences = addition_sequences_;
24 const size_t num_threads = addition_sequences.size();
25 parallel_for(num_threads, [&](
size_t thread_idx) { batched_affine_add_in_place(addition_sequences[thread_idx]); });
29 size_t prev_tag = std::numeric_limits<size_t>::max();
30 for (
auto [sequences, tags] :
zip_view(addition_sequences, sequence_tags)) {
32 for (
size_t i = 0; i < sequences.sequence_counts.size(); ++i) {
33 if (tags[i] == prev_tag) {
34 reduced_points.back() = reduced_points.back() + sequences.points[i];
36 reduced_points.emplace_back(sequences.points[i]);
42 return reduced_points;
47 const std::span<G1>& points,
const std::vector<size_t>& sequence_counts,
const std::span<Fq>& scratch_space)
50 std::vector<size_t> sequence_endpoints;
51 size_t total_count = 0;
52 for (
const auto& count : sequence_counts) {
54 sequence_endpoints.emplace_back(total_count);
57 if (points.size() != total_count) {
58 throw_or_abort(
"Number of input points does not match sequence counts!");
62 const size_t MIN_POINTS_PER_THREAD = 1 << 14;
63 const size_t total_num_points = points.size();
64 const size_t optimal_threads = total_num_points / MIN_POINTS_PER_THREAD;
67 const size_t base_thread_size = total_num_points / num_threads;
68 const size_t leftover_size = total_num_points % num_threads;
69 std::vector<size_t> thread_sizes(num_threads, base_thread_size);
70 for (
size_t i = 0; i < leftover_size; ++i) {
77 std::vector<size_t> thread_endpoints;
78 size_t point_index = 0;
79 for (
auto size : thread_sizes) {
80 thread_points.push_back(points.subspan(point_index, size));
81 thread_scratch_space.push_back(scratch_space.subspan(point_index, size));
83 thread_endpoints.emplace_back(point_index);
89 std::vector<size_t> all_endpoints;
90 all_endpoints.reserve(thread_endpoints.size() + sequence_endpoints.size());
91 all_endpoints.insert(all_endpoints.end(), thread_endpoints.begin(), thread_endpoints.end());
92 all_endpoints.insert(all_endpoints.end(), sequence_endpoints.begin(), sequence_endpoints.end());
93 std::sort(all_endpoints.begin(), all_endpoints.end());
94 auto last = std::unique(all_endpoints.begin(), all_endpoints.end());
95 all_endpoints.erase(last, all_endpoints.end());
98 size_t prev_endpoint = 0;
99 size_t thread_idx = 0;
100 size_t sequence_idx = 0;
103 for (
auto& endpoint : all_endpoints) {
104 size_t chunk_size = endpoint - prev_endpoint;
105 thread_sequence_counts[thread_idx].emplace_back(chunk_size);
106 thread_sequence_tags[thread_idx].emplace_back(sequence_idx);
107 if (endpoint == thread_endpoints[thread_idx]) {
110 if (endpoint == sequence_endpoints[sequence_idx]) {
113 prev_endpoint = endpoint;
116 if (thread_sequence_counts.size() != thread_points.size()) {
122 for (
size_t i = 0; i < num_threads; ++i) {
123 addition_sequences.push_back(
124 AdditionSequences{ thread_sequence_counts[i], thread_points[i], thread_scratch_space[i] });
127 return { addition_sequences, thread_sequence_tags };
134 auto points = add_sequences.
points;
138 size_t total_num_pairs{ 0 };
139 for (
auto& count : sequence_counts) {
140 total_num_pairs += count >> 1;
145 std::span<Fq> denominators = add_sequences.
scratch_space.subspan(0, total_num_pairs);
146 std::span<Fq> differences = add_sequences.
scratch_space.subspan(total_num_pairs, 2 * total_num_pairs);
150 size_t point_idx = 0;
152 for (
auto& count : sequence_counts) {
153 const auto num_pairs = count >> 1;
154 for (
size_t j = 0; j < num_pairs; ++j) {
156 const auto& x1 = points[point_idx++].x;
157 const auto& x2 = points[point_idx++].x;
163 differences[pair_idx] = diff;
166 denominators[pair_idx++] = accumulator;
170 point_idx += (count & 0x01ULL);
174 Fq inverse = accumulator.invert();
177 for (
size_t i = 0; i < total_num_pairs; ++i) {
178 size_t idx = total_num_pairs - 1 - i;
179 denominators[idx] *= inverse;
180 inverse *= differences[idx];
189 const size_t num_points = add_sequences.
points.size();
190 if (num_points == 0 || num_points == 1) {
195 std::span<Fq> denominators = batch_compute_point_addition_slope_inverses(add_sequences);
197 auto points = add_sequences.
points;
201 size_t point_idx = 0;
202 size_t result_point_idx = 0;
204 bool more_additions =
false;
205 for (
auto& count : sequence_counts) {
206 const auto num_pairs = count >> 1;
207 const bool overflow =
static_cast<bool>(count & 0x01ULL);
209 for (
size_t j = 0; j < num_pairs; ++j) {
210 const auto& point_1 = points[point_idx++];
211 const auto& point_2 = points[point_idx++];
212 const auto& denominator = denominators[pair_idx++];
213 auto& result = points[result_point_idx++];
215 result = affine_add_with_denominator(point_1, point_2, denominator);
219 points[result_point_idx++] = points[point_idx++];
223 const uint32_t updated_sequence_count =
static_cast<uint32_t
>(num_pairs) +
static_cast<uint32_t
>(overflow);
224 count = updated_sequence_count;
227 more_additions = more_additions || updated_sequence_count > 1;
231 if (more_additions) {
232 const size_t updated_point_count = result_point_idx;
233 std::span<G1> updated_points(&points[0], updated_point_count);
234 return batched_affine_add_in_place(