#include "benchmark.hpp"

#include <algorithm>
#include <array>
#include <bit>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <immintrin.h>
#include <limits>
#include <memory>
#include <numeric>
#include <random>
#include <stdexcept>
#include <type_traits>
#include <utility>
#include <vector>

#ifdef _MSC_VER
#include <intrin.h>
#endif

namespace cachebench {

namespace {

// Kid picture for this file:
//
// 1 lane:
// [a] + [b] -> [out]
//
// 8 lanes at once:
// [a a a a a a a a]
// [b b b b b b b b]
//        |
//        v
// [o o o o o o o o]
//
// AoS gather:
// [hot cold cold][hot cold cold][hot cold cold]
//   ^              ^              ^
//  we must pick the hot crumbs out of a messy lunchbox
//
// SoA packed:
// hot: [h][h][h][h][h][h][h][h]
//  now 8 kids can grab 8 hot values in one neat scoop
//
// The main story in this file is:
// 1. scalar baseline
// 2. same job with a friendlier layout
// 3. same layout but more lanes at once
// 4. friendly layout + more lanes at once

constexpr std::uint64_t kVectorTargetBytes = 128ULL * 1024ULL * 1024ULL;
constexpr std::uint64_t kGatherTargetBytes = 64ULL * 1024ULL * 1024ULL;
constexpr std::uint64_t kReductionTargetBytes = 128ULL * 1024ULL * 1024ULL;
constexpr std::uint64_t kFilterTargetBytes = 64ULL * 1024ULL * 1024ULL;
constexpr std::uint64_t kScanCompleteTargetBytes = 128ULL * 1024ULL * 1024ULL;
constexpr std::uint64_t kScanCompleteColdTargetBytes = 32ULL * 1024ULL * 1024ULL;
constexpr std::uint64_t kShuffleTargetBytes = 64ULL * 1024ULL * 1024ULL;
constexpr std::uint64_t kCaseStudyTargetBytes = 64ULL * 1024ULL * 1024ULL;
constexpr std::uint64_t kMatmulTargetBytes = 256ULL * 1024ULL;
constexpr std::uint64_t kSearchTargetBytes = 16ULL * 1024ULL * 1024ULL;
constexpr std::uint64_t kGraphTargetBytes = 32ULL * 1024ULL * 1024ULL;
constexpr std::size_t kAlignmentBytes = 64;
constexpr std::size_t kAvxWidthFloats = 8;
constexpr std::size_t kAvxWidthInts = 8;
constexpr std::size_t kMinMatrixDim = 32;
constexpr std::size_t kMaxMatrixDim = 4096;
constexpr std::size_t kSearchMinQueries = 2048;
constexpr std::size_t kSearchMaxQueries = 16384;
constexpr std::size_t kBTreeNodeKeys = 16;
constexpr std::size_t kLadderHistogramBins = 256;
constexpr std::size_t kGraphDegree = 8;

#if defined(_MSC_VER)
#define CACHEBENCH_NOINLINE __declspec(noinline)
#define CACHEBENCH_NO_VECTOR_LOOP __pragma(loop(no_vector))
#define CACHEBENCH_NOVECTOR
#elif defined(__clang__)
#define CACHEBENCH_NOINLINE __attribute__((noinline))
#define CACHEBENCH_NO_VECTOR_LOOP _Pragma("clang loop vectorize(disable)")
#define CACHEBENCH_NOVECTOR
#elif defined(__GNUC__)
#define CACHEBENCH_NOINLINE __attribute__((noinline))
#define CACHEBENCH_NO_VECTOR_LOOP
#define CACHEBENCH_NOVECTOR __attribute__((optimize("no-tree-vectorize")))
#else
#define CACHEBENCH_NOINLINE
#define CACHEBENCH_NO_VECTOR_LOOP
#define CACHEBENCH_NOVECTOR
#endif

template <typename T>
class AlignedBuffer {
public:
    explicit AlignedBuffer(std::size_t size, std::size_t alignment = kAlignmentBytes)
        : size_(size),
          data_(static_cast<T*>(_mm_malloc(sizeof(T) * size, alignment))) {
        if (data_ == nullptr) {
            throw std::bad_alloc();
        }
    }

    ~AlignedBuffer() {
        _mm_free(data_);
    }

    AlignedBuffer(const AlignedBuffer&) = delete;
    AlignedBuffer& operator=(const AlignedBuffer&) = delete;

    T* data() {
        return data_;
    }

    const T* data() const {
        return data_;
    }

    std::size_t size() const {
        return size_;
    }

    T& operator[](std::size_t index) {
        return data_[index];
    }

    const T& operator[](std::size_t index) const {
        return data_[index];
    }

private:
    std::size_t size_;
    T* data_ = nullptr;
};

std::size_t clamp_elements(std::size_t requested_size_bytes, std::size_t element_size) {
    const std::size_t safe_bytes = std::max<std::size_t>(requested_size_bytes, element_size);
    return std::max<std::size_t>(1, safe_bytes / element_size);
}

std::size_t round_down_multiple(std::size_t value, std::size_t multiple) {
    if (multiple == 0) {
        return value;
    }
    return value - (value % multiple);
}

std::uint64_t seeded_value(std::size_t index) {
    constexpr std::uint64_t kMix = 0x9E3779B97F4A7C15ULL;
    return (static_cast<std::uint64_t>(index) + 1ULL) * kMix;
}

float seeded_float(std::size_t index, float scale) {
    const std::uint32_t value = static_cast<std::uint32_t>((seeded_value(index) >> 16) & 0xFFFFULL);
    return static_cast<float>(value % 2048U) / scale;
}

template <typename T>
std::uint64_t sample_checksum(const T* data, std::size_t elements) {
    if (elements == 0) {
        return 0;
    }
    const std::size_t step = std::max<std::size_t>(1, elements / 8);
    std::uint64_t sum = 0;
    for (std::size_t index = 0; index < elements; index += step) {
        if constexpr (std::is_same_v<T, float>) {
            sum += static_cast<std::uint64_t>(std::bit_cast<std::uint32_t>(data[index]));
        } else if constexpr (std::is_same_v<T, std::uint8_t>) {
            sum += data[index];
        } else {
            sum += static_cast<std::uint64_t>(data[index]);
        }
    }
    if constexpr (std::is_same_v<T, float>) {
        sum += static_cast<std::uint64_t>(std::bit_cast<std::uint32_t>(data[elements - 1]));
    } else if constexpr (std::is_same_v<T, std::uint8_t>) {
        sum += data[elements - 1];
    } else {
        sum += static_cast<std::uint64_t>(data[elements - 1]);
    }
    return sum;
}

bool cpu_supports_avx2() {
#if defined(_MSC_VER) && defined(_M_X64)
    int info[4] = {};
    __cpuid(info, 1);
    const bool osxsave = (info[2] & (1 << 27)) != 0;
    const bool avx = (info[2] & (1 << 28)) != 0;
    if (!osxsave || !avx) {
        return false;
    }
    const unsigned long long xcr0 = _xgetbv(0);
    if ((xcr0 & 0x6ULL) != 0x6ULL) {
        return false;
    }

    __cpuidex(info, 7, 0);
    return (info[1] & (1 << 5)) != 0;
#elif (defined(__GNUC__) || defined(__clang__)) && defined(__x86_64__)
    return __builtin_cpu_supports("avx2");
#else
    return false;
#endif
}

CACHEBENCH_NOINLINE void scan_complete_compiler_barrier() {
#if defined(_MSC_VER)
    _ReadWriteBarrier();
#elif defined(__GNUC__) || defined(__clang__)
    asm volatile("" ::: "memory");
#endif
}

// Scalar baseline:
// [a] + [b] -> [out]
// one answer per trip, and vectorization is intentionally disabled.
CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void scalar_add_f32_no_vector(const float* a,
                                                                      const float* b,
                                                                      float* c,
                                                                      std::size_t elements) {
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t i = 0; i < elements; ++i) {
        c[i] = a[i] + b[i];
    }
}

// Compiler-helped version:
// source still looks scalar, but the compiler may widen it into SIMD for us.
CACHEBENCH_NOINLINE void auto_add_f32(const float* a,
                                      const float* b,
                                      float* c,
                                      std::size_t elements) {
    for (std::size_t i = 0; i < elements; ++i) {
        c[i] = a[i] + b[i];
    }
}

// Manual AVX2 aligned version:
// [a a a a a a a a] + [b b b b b b b b] -> [o o o o o o o o]
// Clean alignment lets the CPU grab one neat 256-bit chunk at a time.
CACHEBENCH_NOINLINE void avx2_add_f32_aligned(const float* a,
                                              const float* b,
                                              float* c,
                                              std::size_t elements) {
    std::size_t i = 0;
    for (; i + kAvxWidthFloats <= elements; i += kAvxWidthFloats) {
        const __m256 x = _mm256_load_ps(a + i);
        const __m256 y = _mm256_load_ps(b + i);
        _mm256_store_ps(c + i, _mm256_add_ps(x, y));
    }
    for (; i < elements; ++i) {
        c[i] = a[i] + b[i];
    }
}

// Manual AVX2 unaligned version:
// still 8-at-once, but the data starts at a messier byte position.
// This isolates alignment cost from pure SIMD width.
CACHEBENCH_NOINLINE void avx2_add_f32_unaligned(const float* a,
                                                const float* b,
                                                float* c,
                                                std::size_t elements) {
    std::size_t i = 0;
    for (; i + kAvxWidthFloats <= elements; i += kAvxWidthFloats) {
        const __m256 x = _mm256_loadu_ps(a + i);
        const __m256 y = _mm256_loadu_ps(b + i);
        _mm256_storeu_ps(c + i, _mm256_add_ps(x, y));
    }
    for (; i < elements; ++i) {
        c[i] = a[i] + b[i];
    }
}

enum class AddMode {
    kScalarNoVector,
    kAutoVectorized,
    kAvx2Aligned,
    kAvx2Unaligned,
};

BenchmarkInstance make_add_benchmark(std::size_t requested_size_bytes, AddMode mode) {
    const std::size_t elements = std::max<std::size_t>(
        kAvxWidthFloats,
        round_down_multiple(std::max<std::size_t>(kAvxWidthFloats,
                                                  requested_size_bytes / (sizeof(float) * 3)),
                            kAvxWidthFloats));
    const std::size_t slack = mode == AddMode::kAvx2Unaligned ? kAvxWidthFloats : 0;

    auto a = std::make_shared<AlignedBuffer<float>>(elements + slack + kAvxWidthFloats);
    auto b = std::make_shared<AlignedBuffer<float>>(elements + slack + kAvxWidthFloats);
    auto c = std::make_shared<AlignedBuffer<float>>(elements + slack + kAvxWidthFloats);

    const std::size_t offset = mode == AddMode::kAvx2Unaligned ? 1 : 0;
    float* pa = a->data() + offset;
    float* pb = b->data() + offset;
    float* pc = c->data() + offset;
    for (std::size_t i = 0; i < elements; ++i) {
        pa[i] = seeded_float(i, 31.0f);
        pb[i] = seeded_float(i + elements, 29.0f);
        pc[i] = 0.0f;
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = elements * sizeof(float) * 3;
    instance.elements = elements;
    instance.threads_used = 1;
    instance.bytes_per_pass = instance.actual_size_bytes;
    instance.target_bytes_per_trial = kVectorTargetBytes;
    instance.run = [a, b, c, pa, pb, pc, elements, mode](std::size_t passes) {
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            switch (mode) {
                case AddMode::kScalarNoVector:
                    scalar_add_f32_no_vector(pa, pb, pc, elements);
                    break;
                case AddMode::kAutoVectorized:
                    auto_add_f32(pa, pb, pc, elements);
                    break;
                case AddMode::kAvx2Aligned:
                    avx2_add_f32_aligned(pa, pb, pc, elements);
                    break;
                case AddMode::kAvx2Unaligned:
                    avx2_add_f32_unaligned(pa, pb, pc, elements);
                    break;
            }
            sum ^= sample_checksum(pc, elements);
        }
        return sum;
    };
    return instance;
}

struct ParticleHotAoS {
    float x = 0.0f;
    std::array<float, 15> cold{};
};

static_assert(sizeof(ParticleHotAoS) == 64, "ParticleHotAoS should stay one cache line wide");

float horizontal_sum_ps(__m256 value) {
    alignas(32) std::array<float, kAvxWidthFloats> lanes{};
    _mm256_store_ps(lanes.data(), value);
    return std::accumulate(lanes.begin(), lanes.end(), 0.0f);
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_sum_particle_x_aos(const ParticleHotAoS* data,
                                                                        std::size_t elements) {
    float sum = 0.0f;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t i = 0; i < elements; ++i) {
        sum += data[i].x;
    }
    return sum;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_sum_particle_x_soa(const float* x,
                                                                        std::size_t elements) {
    float sum = 0.0f;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t i = 0; i < elements; ++i) {
        sum += x[i];
    }
    return sum;
}

// SIMD-only gather:
// [x cold cold cold][x cold cold cold]
//  ^                ^
// x values are useful, but they live inside fat records, so AVX2 must gather.
CACHEBENCH_NOINLINE float avx2_sum_particle_x_aos_gather(const ParticleHotAoS* data, std::size_t elements) {
    const float* base = reinterpret_cast<const float*>(data);
    constexpr std::int32_t kStrideFloats = static_cast<std::int32_t>(sizeof(ParticleHotAoS) / sizeof(float));
    __m256 acc = _mm256_setzero_ps();
    std::size_t i = 0;
    for (; i + kAvxWidthFloats <= elements; i += kAvxWidthFloats) {
        const __m256i indices = _mm256_setr_epi32(static_cast<int>((i + 0) * kStrideFloats),
                                                  static_cast<int>((i + 1) * kStrideFloats),
                                                  static_cast<int>((i + 2) * kStrideFloats),
                                                  static_cast<int>((i + 3) * kStrideFloats),
                                                  static_cast<int>((i + 4) * kStrideFloats),
                                                  static_cast<int>((i + 5) * kStrideFloats),
                                                  static_cast<int>((i + 6) * kStrideFloats),
                                                  static_cast<int>((i + 7) * kStrideFloats));
        acc = _mm256_add_ps(acc, _mm256_i32gather_ps(base, indices, sizeof(float)));
    }

    float sum = horizontal_sum_ps(acc);
    for (; i < elements; ++i) {
        sum += data[i].x;
    }
    return sum;
}

// Cache-aware + SIMD:
// hot strip: [x][x][x][x][x][x][x][x]
// Now AVX2 can load one whole neat strip without fishing values out of records.
CACHEBENCH_NOINLINE float avx2_sum_particle_x_soa(const float* x, std::size_t elements) {
    __m256 acc = _mm256_setzero_ps();
    std::size_t i = 0;
    for (; i + kAvxWidthFloats <= elements; i += kAvxWidthFloats) {
        acc = _mm256_add_ps(acc, _mm256_load_ps(x + i));
    }

    float sum = horizontal_sum_ps(acc);
    for (; i < elements; ++i) {
        sum += x[i];
    }
    return sum;
}

enum class HotSumMode {
    kScalarAos,
    kScalarSoa,
    kAvx2AosGather,
    kAvx2Soa,
};

BenchmarkInstance make_particle_hot_sum_benchmark(std::size_t requested_size_bytes, HotSumMode mode) {
    const std::size_t elements = std::max<std::size_t>(
        kAvxWidthFloats,
        round_down_multiple(std::max<std::size_t>(kAvxWidthFloats,
                                                  requested_size_bytes / sizeof(ParticleHotAoS)),
                            kAvxWidthFloats));

    auto aos = std::make_shared<AlignedBuffer<ParticleHotAoS>>(elements);
    auto x = std::make_shared<AlignedBuffer<float>>(elements);
    auto cold = std::make_shared<std::array<std::vector<float>, 15>>();

    for (auto& lane : *cold) {
        lane.resize(elements);
    }

    for (std::size_t i = 0; i < elements; ++i) {
        (*aos)[i].x = seeded_float(i, 31.0f);
        (*x)[i] = (*aos)[i].x;
        for (std::size_t lane = 0; lane < cold->size(); ++lane) {
            const float value = seeded_float(i + (lane + 1) * elements, 27.0f + static_cast<float>(lane));
            (*aos)[i].cold[lane] = value;
            (*cold)[lane][i] = value;
        }
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = elements * sizeof(ParticleHotAoS);
    instance.elements = elements;
    instance.threads_used = 1;
    instance.bytes_per_pass = (mode == HotSumMode::kScalarSoa || mode == HotSumMode::kAvx2Soa)
                                  ? elements * sizeof(float)
                                  : elements * sizeof(ParticleHotAoS);
    instance.target_bytes_per_trial = kReductionTargetBytes;
    instance.run = [aos, x, cold, elements, mode](std::size_t passes) {
        float sum = 0.0f;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            switch (mode) {
                case HotSumMode::kScalarAos:
                    sum += scalar_sum_particle_x_aos(aos->data(), elements);
                    break;
                case HotSumMode::kScalarSoa:
                    sum += scalar_sum_particle_x_soa(x->data(), elements);
                    break;
                case HotSumMode::kAvx2AosGather:
                    sum += avx2_sum_particle_x_aos_gather(aos->data(), elements);
                    break;
                case HotSumMode::kAvx2Soa:
                    sum += avx2_sum_particle_x_soa(x->data(), elements);
                    break;
            }
        }
        return static_cast<std::uint64_t>(std::bit_cast<std::uint32_t>(sum));
    };
    return instance;
}

std::size_t choose_query_count(std::size_t elements) {
    return std::clamp<std::size_t>(elements / 32, kSearchMinQueries, kSearchMaxQueries);
}

std::shared_ptr<std::vector<int>> build_sorted_values_i32(std::size_t elements) {
    auto values = std::make_shared<std::vector<int>>(elements);
    for (std::size_t i = 0; i < elements; ++i) {
        (*values)[i] = static_cast<int>(i * 2);
    }
    return values;
}

std::shared_ptr<std::vector<int>> build_queries_i32(std::size_t elements, std::size_t query_count) {
    auto queries = std::make_shared<std::vector<int>>(query_count);
    const int max_value = static_cast<int>((elements - 1) * 2);
    std::mt19937 rng(0x13579BDFU);
    std::uniform_int_distribution<int> dist(0, max_value);
    for (int& query : *queries) {
        query = dist(rng);
    }
    return queries;
}

int scalar_lower_bound_value(const int* data, std::size_t size, int needle) {
    std::size_t left = 0;
    std::size_t right = size;
    while (left < right) {
        const std::size_t mid = left + (right - left) / 2;
        if (data[mid] < needle) {
            left = mid + 1;
        } else {
            right = mid;
        }
    }
    return data[std::min(left, size - 1)];
}

struct alignas(64) SimdBTreeBlock {
    std::array<int, kBTreeNodeKeys> values{};
};

std::size_t go_btree(std::size_t node_index, std::size_t child_index) {
    return node_index * (kBTreeNodeKeys + 1) + child_index + 1;
}

void build_btree_recursive(const std::vector<int>& sorted,
                           std::vector<SimdBTreeBlock>& blocks,
                           std::size_t block_count,
                           std::size_t node_index,
                           std::size_t& sorted_index) {
    if (node_index >= block_count) {
        return;
    }
    for (std::size_t key_index = 0; key_index < kBTreeNodeKeys; ++key_index) {
        build_btree_recursive(sorted, blocks, block_count, go_btree(node_index, key_index), sorted_index);
        blocks[node_index].values[key_index] =
            sorted_index < sorted.size() ? sorted[sorted_index++] : std::numeric_limits<int>::max();
    }
    build_btree_recursive(sorted, blocks, block_count, go_btree(node_index, kBTreeNodeKeys), sorted_index);
}

std::shared_ptr<std::vector<SimdBTreeBlock>> build_btree_layout_i32(const std::vector<int>& sorted) {
    const std::size_t block_count = (sorted.size() + kBTreeNodeKeys - 1) / kBTreeNodeKeys;
    auto blocks = std::make_shared<std::vector<SimdBTreeBlock>>(block_count);
    for (SimdBTreeBlock& block : *blocks) {
        block.values.fill(std::numeric_limits<int>::max());
    }
    std::size_t sorted_index = 0;
    build_btree_recursive(sorted, *blocks, block_count, 0, sorted_index);
    return blocks;
}

int scalar_btree_lower_bound_value(const std::vector<SimdBTreeBlock>& blocks, int needle) {
    int result = std::numeric_limits<int>::max();
    std::size_t node_index = 0;
    while (node_index < blocks.size()) {
        const SimdBTreeBlock& block = blocks[node_index];
        std::size_t child_index = 0;
        while (child_index < kBTreeNodeKeys && block.values[child_index] < needle) {
            ++child_index;
        }
        if (child_index < kBTreeNodeKeys) {
            result = block.values[child_index];
        }
        node_index = go_btree(node_index, child_index);
    }
    return result;
}

std::size_t avx2_btree_child_index(const SimdBTreeBlock& block, int needle) {
    const __m256i needle_vec = _mm256_set1_epi32(needle);
    const __m256i lower = _mm256_load_si256(reinterpret_cast<const __m256i*>(block.values.data()));
    const __m256i upper = _mm256_load_si256(reinterpret_cast<const __m256i*>(block.values.data() + 8));
    const unsigned mask_lower =
        static_cast<unsigned>(_mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpgt_epi32(needle_vec, lower))));
    if (mask_lower != 0xFFU) {
        return std::countr_one(mask_lower);
    }
    const unsigned mask_upper =
        static_cast<unsigned>(_mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpgt_epi32(needle_vec, upper))));
    return 8U + std::countr_one(mask_upper);
}

int avx2_btree_lower_bound_value(const std::vector<SimdBTreeBlock>& blocks, int needle) {
    int result = std::numeric_limits<int>::max();
    std::size_t node_index = 0;
    while (node_index < blocks.size()) {
        const SimdBTreeBlock& block = blocks[node_index];
        const std::size_t child_index = avx2_btree_child_index(block, needle);
        if (child_index < kBTreeNodeKeys) {
            result = block.values[child_index];
        }
        node_index = go_btree(node_index, child_index);
    }
    return result;
}

std::uint64_t avx2_array_lower_bound_sum(const int* data,
                                         std::size_t size,
                                         const int* queries,
                                         std::size_t query_count) {
    const int max_index = static_cast<int>(size - 1);
    int highest_step = 1;
    while (highest_step < max_index) {
        highest_step <<= 1;
    }
    highest_step >>= 1;

    std::uint64_t sum = 0;
    std::size_t i = 0;
    const __m256i one = _mm256_set1_epi32(1);
    const __m256i max_index_vec = _mm256_set1_epi32(max_index);

    for (; i + kAvxWidthInts <= query_count; i += kAvxWidthInts) {
        const __m256i needles = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(queries + i));
        __m256i index = _mm256_set1_epi32(-1);

        for (int step = highest_step; step > 0; step >>= 1) {
            const __m256i candidate =
                _mm256_min_epi32(_mm256_add_epi32(index, _mm256_set1_epi32(step)), max_index_vec);
            const __m256i gathered = _mm256_i32gather_epi32(data, candidate, sizeof(int));
            const __m256i mask = _mm256_cmpgt_epi32(needles, gathered);
            index = _mm256_blendv_epi8(index, candidate, mask);
        }

        const __m256i result_index = _mm256_min_epi32(_mm256_add_epi32(index, one), max_index_vec);
        const __m256i result = _mm256_i32gather_epi32(data, result_index, sizeof(int));
        alignas(32) std::array<int, kAvxWidthInts> lanes{};
        _mm256_store_si256(reinterpret_cast<__m256i*>(lanes.data()), result);
        for (int lane : lanes) {
            sum += static_cast<std::uint32_t>(lane);
        }
    }

    for (; i < query_count; ++i) {
        sum += static_cast<std::uint32_t>(scalar_lower_bound_value(data, size, queries[i]));
    }
    return sum;
}

enum class SearchLadderMode {
    kScalarArray,
    kScalarBTree,
    kAvx2Array,
    kAvx2BTree,
};

BenchmarkInstance make_search_ladder_benchmark(std::size_t requested_size_bytes, SearchLadderMode mode) {
    const std::size_t elements = std::max<std::size_t>(
        1024, round_down_multiple(clamp_elements(requested_size_bytes, sizeof(int)), kAvxWidthInts));
    const std::size_t query_count = round_down_multiple(choose_query_count(elements), kAvxWidthInts);
    auto values = build_sorted_values_i32(elements);
    auto queries = build_queries_i32(elements, query_count);
    auto btree = build_btree_layout_i32(*values);

    BenchmarkInstance instance;
    instance.actual_size_bytes = mode == SearchLadderMode::kScalarBTree || mode == SearchLadderMode::kAvx2BTree
                                     ? btree->size() * sizeof(SimdBTreeBlock)
                                     : values->size() * sizeof(int);
    instance.elements = queries->size();
    instance.threads_used = 1;
    instance.bytes_per_pass = std::max<std::uint64_t>(
        instance.actual_size_bytes, static_cast<std::uint64_t>(queries->size() * sizeof(int) * 64ULL));
    instance.target_bytes_per_trial = kSearchTargetBytes;
    instance.run = [values, btree, queries, mode](std::size_t passes) {
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            switch (mode) {
                case SearchLadderMode::kScalarArray:
                    for (int query : *queries) {
                        sum += static_cast<std::uint32_t>(
                            scalar_lower_bound_value(values->data(), values->size(), query));
                    }
                    break;
                case SearchLadderMode::kScalarBTree:
                    for (int query : *queries) {
                        sum += static_cast<std::uint32_t>(scalar_btree_lower_bound_value(*btree, query));
                    }
                    break;
                case SearchLadderMode::kAvx2Array:
                    sum += avx2_array_lower_bound_sum(values->data(), values->size(), queries->data(), queries->size());
                    break;
                case SearchLadderMode::kAvx2BTree:
                    for (int query : *queries) {
                        sum += static_cast<std::uint32_t>(avx2_btree_lower_bound_value(*btree, query));
                    }
                    break;
            }
        }
        return sum;
    };
    return instance;
}

struct KeyRecordAoS {
    std::uint32_t key = 0;
    std::array<std::uint32_t, 15> cold{};
};

static_assert(sizeof(KeyRecordAoS) == 64, "KeyRecordAoS should stay one cache line wide");

enum class HistogramLadderMode {
    kScalarAos,
    kScalarSoa,
    kAvx2Aos,
    kAvx2Soa,
};

BenchmarkInstance make_histogram_ladder_benchmark(std::size_t requested_size_bytes, HistogramLadderMode mode) {
    const std::size_t elements = std::max<std::size_t>(
        kAvxWidthInts,
        round_down_multiple(std::max<std::size_t>(kAvxWidthInts, requested_size_bytes / sizeof(KeyRecordAoS)),
                            kAvxWidthInts));
    auto aos = std::make_shared<AlignedBuffer<KeyRecordAoS>>(elements);
    auto keys = std::make_shared<AlignedBuffer<std::uint32_t>>(elements);
    auto bins = std::make_shared<std::array<std::uint32_t, kLadderHistogramBins>>();

    for (std::size_t i = 0; i < elements; ++i) {
        const std::uint32_t key = static_cast<std::uint32_t>(seeded_value(i) & (kLadderHistogramBins - 1));
        (*aos)[i].key = key;
        (*keys)[i] = key;
        for (std::size_t lane = 0; lane < (*aos)[i].cold.size(); ++lane) {
            (*aos)[i].cold[lane] = static_cast<std::uint32_t>(seeded_value(i + lane + 1));
        }
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = elements * sizeof(KeyRecordAoS);
    instance.elements = elements;
    instance.threads_used = 1;
    instance.bytes_per_pass = (mode == HistogramLadderMode::kScalarSoa || mode == HistogramLadderMode::kAvx2Soa)
                                  ? elements * sizeof(std::uint32_t)
                                  : elements * sizeof(KeyRecordAoS);
    instance.target_bytes_per_trial = kFilterTargetBytes;
    instance.run = [aos, keys, bins, elements, mode](std::size_t passes) {
        std::uint64_t sum = 0;
        alignas(32) std::array<std::uint32_t, kAvxWidthInts> lanes{};

        for (std::size_t pass = 0; pass < passes; ++pass) {
            bins->fill(0U);
            switch (mode) {
                case HistogramLadderMode::kScalarAos:
                    for (std::size_t i = 0; i < elements; ++i) {
                        ++(*bins)[(*aos)[i].key];
                    }
                    break;
                case HistogramLadderMode::kScalarSoa:
                    for (std::size_t i = 0; i < elements; ++i) {
                        ++(*bins)[(*keys)[i]];
                    }
                    break;
                case HistogramLadderMode::kAvx2Aos: {
                    const std::uint32_t* base = reinterpret_cast<const std::uint32_t*>(aos->data());
                    constexpr std::int32_t kStrideWords = static_cast<std::int32_t>(sizeof(KeyRecordAoS) / sizeof(std::uint32_t));
                    std::size_t i = 0;
                    for (; i + kAvxWidthInts <= elements; i += kAvxWidthInts) {
                        const __m256i indices = _mm256_setr_epi32(static_cast<int>((i + 0) * kStrideWords),
                                                                  static_cast<int>((i + 1) * kStrideWords),
                                                                  static_cast<int>((i + 2) * kStrideWords),
                                                                  static_cast<int>((i + 3) * kStrideWords),
                                                                  static_cast<int>((i + 4) * kStrideWords),
                                                                  static_cast<int>((i + 5) * kStrideWords),
                                                                  static_cast<int>((i + 6) * kStrideWords),
                                                                  static_cast<int>((i + 7) * kStrideWords));
                        const __m256i gathered =
                            _mm256_i32gather_epi32(reinterpret_cast<const int*>(base), indices, sizeof(std::uint32_t));
                        _mm256_store_si256(reinterpret_cast<__m256i*>(lanes.data()), gathered);
                        for (std::uint32_t key : lanes) {
                            ++(*bins)[key];
                        }
                    }
                    for (; i < elements; ++i) {
                        ++(*bins)[(*aos)[i].key];
                    }
                    break;
                }
                case HistogramLadderMode::kAvx2Soa: {
                    std::size_t i = 0;
                    for (; i + kAvxWidthInts <= elements; i += kAvxWidthInts) {
                        const __m256i loaded =
                            _mm256_load_si256(reinterpret_cast<const __m256i*>(keys->data() + i));
                        _mm256_store_si256(reinterpret_cast<__m256i*>(lanes.data()), loaded);
                        for (std::uint32_t key : lanes) {
                            ++(*bins)[key];
                        }
                    }
                    for (; i < elements; ++i) {
                        ++(*bins)[(*keys)[i]];
                    }
                    break;
                }
            }

            for (std::size_t bin = 0; bin < 32; ++bin) {
                sum += (*bins)[bin];
            }
        }
        return sum;
    };
    return instance;
}

__m256 prefix8_ps(__m256 x) {
    x = _mm256_add_ps(x, _mm256_castsi256_ps(_mm256_slli_si256(_mm256_castps_si256(x), 4)));
    x = _mm256_add_ps(x, _mm256_castsi256_ps(_mm256_slli_si256(_mm256_castps_si256(x), 8)));

    const __m128 low = _mm256_castps256_ps128(x);
    __m128 high = _mm256_extractf128_ps(x, 1);
    const __m128 carry = _mm_shuffle_ps(low, low, _MM_SHUFFLE(3, 3, 3, 3));
    high = _mm_add_ps(high, carry);

    __m256 result = _mm256_castps128_ps256(low);
    result = _mm256_insertf128_ps(result, high, 1);
    return result;
}

float last_lane_ps(__m256 value) {
    alignas(32) std::array<float, kAvxWidthFloats> lanes{};
    _mm256_store_ps(lanes.data(), value);
    return lanes.back();
}

enum class WidePrefixMode {
    kScalarAos,
    kScalarSoa,
    kAvx2Aos,
    kAvx2Soa,
};

BenchmarkInstance make_wide_prefix_benchmark(std::size_t requested_size_bytes, WidePrefixMode mode) {
    const std::size_t elements = std::max<std::size_t>(
        kAvxWidthFloats,
        round_down_multiple(std::max<std::size_t>(kAvxWidthFloats, requested_size_bytes / sizeof(ParticleHotAoS)),
                            kAvxWidthFloats));
    auto aos = std::make_shared<AlignedBuffer<ParticleHotAoS>>(elements);
    auto x = std::make_shared<AlignedBuffer<float>>(elements);
    auto dst = std::make_shared<AlignedBuffer<float>>(elements);
    for (std::size_t i = 0; i < elements; ++i) {
        const float value = seeded_float(i, 17.0f);
        (*aos)[i].x = value;
        (*x)[i] = value;
        (*dst)[i] = 0.0f;
        for (std::size_t lane = 0; lane < (*aos)[i].cold.size(); ++lane) {
            (*aos)[i].cold[lane] = seeded_float(i + lane + 1, 31.0f);
        }
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = elements * sizeof(ParticleHotAoS);
    instance.elements = elements;
    instance.threads_used = 1;
    instance.bytes_per_pass =
        ((mode == WidePrefixMode::kScalarSoa || mode == WidePrefixMode::kAvx2Soa)
             ? elements * sizeof(float)
             : elements * sizeof(ParticleHotAoS)) +
        elements * sizeof(float);
    instance.target_bytes_per_trial = kCaseStudyTargetBytes;
    instance.run = [aos, x, dst, elements, mode](std::size_t passes) {
        float checksum = 0.0f;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            switch (mode) {
                case WidePrefixMode::kScalarAos: {
                    float carry = 0.0f;
                    for (std::size_t i = 0; i < elements; ++i) {
                        carry += (*aos)[i].x;
                        (*dst)[i] = carry;
                    }
                    break;
                }
                case WidePrefixMode::kScalarSoa: {
                    float carry = 0.0f;
                    for (std::size_t i = 0; i < elements; ++i) {
                        carry += (*x)[i];
                        (*dst)[i] = carry;
                    }
                    break;
                }
                case WidePrefixMode::kAvx2Aos: {
                    const float* base = reinterpret_cast<const float*>(aos->data());
                    constexpr std::int32_t kStrideFloats = static_cast<std::int32_t>(sizeof(ParticleHotAoS) / sizeof(float));
                    float carry = 0.0f;
                    std::size_t i = 0;
                    for (; i + kAvxWidthFloats <= elements; i += kAvxWidthFloats) {
                        const __m256i indices = _mm256_setr_epi32(static_cast<int>((i + 0) * kStrideFloats),
                                                                  static_cast<int>((i + 1) * kStrideFloats),
                                                                  static_cast<int>((i + 2) * kStrideFloats),
                                                                  static_cast<int>((i + 3) * kStrideFloats),
                                                                  static_cast<int>((i + 4) * kStrideFloats),
                                                                  static_cast<int>((i + 5) * kStrideFloats),
                                                                  static_cast<int>((i + 6) * kStrideFloats),
                                                                  static_cast<int>((i + 7) * kStrideFloats));
                        __m256 block = _mm256_i32gather_ps(base, indices, sizeof(float));
                        block = prefix8_ps(block);
                        block = _mm256_add_ps(block, _mm256_set1_ps(carry));
                        _mm256_store_ps(dst->data() + i, block);
                        carry = last_lane_ps(block);
                    }
                    for (; i < elements; ++i) {
                        carry += (*aos)[i].x;
                        (*dst)[i] = carry;
                    }
                    break;
                }
                case WidePrefixMode::kAvx2Soa: {
                    float carry = 0.0f;
                    std::size_t i = 0;
                    for (; i + kAvxWidthFloats <= elements; i += kAvxWidthFloats) {
                        __m256 block = _mm256_load_ps(x->data() + i);
                        block = prefix8_ps(block);
                        block = _mm256_add_ps(block, _mm256_set1_ps(carry));
                        _mm256_store_ps(dst->data() + i, block);
                        carry = last_lane_ps(block);
                    }
                    for (; i < elements; ++i) {
                        carry += (*x)[i];
                        (*dst)[i] = carry;
                    }
                    break;
                }
            }
            checksum += (*dst)[elements - 1];
        }
        return static_cast<std::uint64_t>(std::bit_cast<std::uint32_t>(checksum));
    };
    return instance;
}

struct EdgeAoS {
    float weight = 0.0f;
    std::uint32_t dst = 0;
    std::uint32_t pad0 = 0;
    std::uint32_t pad1 = 0;
};

static_assert(sizeof(EdgeAoS) == 16, "EdgeAoS should stay compact");

struct GraphAdjList {
    std::vector<std::vector<EdgeAoS>> edges;
};

struct GraphCsr {
    std::vector<std::uint32_t> offsets;
    std::vector<std::uint32_t> dst;
    std::vector<float> weight;
};

std::shared_ptr<GraphAdjList> build_graph_adjlist(std::size_t vertices) {
    auto graph = std::make_shared<GraphAdjList>();
    graph->edges.resize(vertices);
    for (std::size_t vertex = 0; vertex < vertices; ++vertex) {
        auto& out = graph->edges[vertex];
        out.reserve(kGraphDegree);
        for (std::size_t edge = 0; edge < kGraphDegree; ++edge) {
            EdgeAoS value{};
            value.weight = seeded_float(vertex * kGraphDegree + edge, 23.0f);
            value.dst = static_cast<std::uint32_t>((vertex * 17 + edge * 31 + 1) % vertices);
            out.push_back(value);
        }
    }
    return graph;
}

std::shared_ptr<GraphCsr> build_graph_csr(const GraphAdjList& adjlist) {
    auto graph = std::make_shared<GraphCsr>();
    const std::size_t vertices = adjlist.edges.size();
    graph->offsets.resize(vertices + 1, 0U);
    graph->dst.reserve(vertices * kGraphDegree);
    graph->weight.reserve(vertices * kGraphDegree);
    for (std::size_t vertex = 0; vertex < vertices; ++vertex) {
        graph->offsets[vertex] = static_cast<std::uint32_t>(graph->dst.size());
        for (const EdgeAoS& edge : adjlist.edges[vertex]) {
            graph->dst.push_back(edge.dst);
            graph->weight.push_back(edge.weight);
        }
    }
    graph->offsets[vertices] = static_cast<std::uint32_t>(graph->dst.size());
    return graph;
}

float scalar_adjlist_edge_sum(const GraphAdjList& graph) {
    float sum = 0.0f;
    for (const auto& edges : graph.edges) {
        for (const EdgeAoS& edge : edges) {
            sum += edge.weight;
        }
    }
    return sum;
}

float scalar_csr_edge_sum(const GraphCsr& graph) {
    return std::accumulate(graph.weight.begin(), graph.weight.end(), 0.0f);
}

float avx2_adjlist_edge_sum(const GraphAdjList& graph) {
    float sum = 0.0f;
    for (const auto& edges : graph.edges) {
        const float* base = reinterpret_cast<const float*>(edges.data());
        constexpr std::int32_t kStrideFloats = static_cast<std::int32_t>(sizeof(EdgeAoS) / sizeof(float));
        __m256 acc = _mm256_setzero_ps();
        std::size_t i = 0;
        for (; i + kAvxWidthFloats <= edges.size(); i += kAvxWidthFloats) {
            const __m256i indices = _mm256_setr_epi32(static_cast<int>((i + 0) * kStrideFloats),
                                                      static_cast<int>((i + 1) * kStrideFloats),
                                                      static_cast<int>((i + 2) * kStrideFloats),
                                                      static_cast<int>((i + 3) * kStrideFloats),
                                                      static_cast<int>((i + 4) * kStrideFloats),
                                                      static_cast<int>((i + 5) * kStrideFloats),
                                                      static_cast<int>((i + 6) * kStrideFloats),
                                                      static_cast<int>((i + 7) * kStrideFloats));
            acc = _mm256_add_ps(acc, _mm256_i32gather_ps(base, indices, sizeof(float)));
        }
        sum += horizontal_sum_ps(acc);
        for (; i < edges.size(); ++i) {
            sum += edges[i].weight;
        }
    }
    return sum;
}

float avx2_csr_edge_sum(const GraphCsr& graph) {
    __m256 acc = _mm256_setzero_ps();
    std::size_t i = 0;
    for (; i + kAvxWidthFloats <= graph.weight.size(); i += kAvxWidthFloats) {
        acc = _mm256_add_ps(acc, _mm256_loadu_ps(graph.weight.data() + i));
    }
    float sum = horizontal_sum_ps(acc);
    for (; i < graph.weight.size(); ++i) {
        sum += graph.weight[i];
    }
    return sum;
}

enum class GraphEdgeMode {
    kScalarAdjList,
    kScalarCsr,
    kAvx2AdjList,
    kAvx2Csr,
};

BenchmarkInstance make_graph_edge_benchmark(std::size_t requested_size_bytes, GraphEdgeMode mode) {
    const std::size_t vertices = std::max<std::size_t>(
        256,
        round_down_multiple(
            std::max<std::size_t>(256, requested_size_bytes / (kGraphDegree * sizeof(EdgeAoS) * 2)),
            kAvxWidthInts));
    auto adjlist = build_graph_adjlist(vertices);
    auto csr = build_graph_csr(*adjlist);

    BenchmarkInstance instance;
    instance.actual_size_bytes = csr->weight.size() * sizeof(float) + csr->dst.size() * sizeof(std::uint32_t) +
                                 csr->offsets.size() * sizeof(std::uint32_t);
    instance.elements = csr->weight.size();
    instance.threads_used = 1;
    instance.bytes_per_pass = mode == GraphEdgeMode::kScalarCsr || mode == GraphEdgeMode::kAvx2Csr
                                  ? csr->weight.size() * sizeof(float)
                                  : adjlist->edges.size() * kGraphDegree * sizeof(EdgeAoS);
    instance.target_bytes_per_trial = kGraphTargetBytes;
    instance.run = [adjlist, csr, mode](std::size_t passes) {
        float sum = 0.0f;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            switch (mode) {
                case GraphEdgeMode::kScalarAdjList:
                    sum += scalar_adjlist_edge_sum(*adjlist);
                    break;
                case GraphEdgeMode::kScalarCsr:
                    sum += scalar_csr_edge_sum(*csr);
                    break;
                case GraphEdgeMode::kAvx2AdjList:
                    sum += avx2_adjlist_edge_sum(*adjlist);
                    break;
                case GraphEdgeMode::kAvx2Csr:
                    sum += avx2_csr_edge_sum(*csr);
                    break;
            }
        }
        return static_cast<std::uint64_t>(std::bit_cast<std::uint32_t>(sum));
    };
    return instance;
}

void insertion_sort_records(KeyRecordAoS* values, std::size_t count) {
    for (std::size_t i = 1; i < count; ++i) {
        KeyRecordAoS current = values[i];
        std::size_t j = i;
        while (j > 0 && values[j - 1].key > current.key) {
            values[j] = values[j - 1];
            --j;
        }
        values[j] = current;
    }
}

enum class BlockSortMode {
    kScalarAos,
    kScalarSoa,
    kAvx2Aos,
    kAvx2Soa,
};

BenchmarkInstance make_blocksort8_benchmark(std::size_t requested_size_bytes, BlockSortMode mode) {
    const std::size_t elements = std::max<std::size_t>(
        kAvxWidthInts,
        round_down_multiple(std::max<std::size_t>(kAvxWidthInts, requested_size_bytes / sizeof(KeyRecordAoS)),
                            kAvxWidthInts));
    auto aos = std::make_shared<AlignedBuffer<KeyRecordAoS>>(elements);
    auto keys = std::make_shared<AlignedBuffer<std::int32_t>>(elements);
    for (std::size_t i = 0; i < elements; ++i) {
        const std::int32_t key = static_cast<std::int32_t>(seeded_value(elements - i) & 0x7FFFFFFFULL);
        (*aos)[i].key = static_cast<std::uint32_t>(key);
        (*keys)[i] = key;
        for (std::size_t lane = 0; lane < (*aos)[i].cold.size(); ++lane) {
            (*aos)[i].cold[lane] = static_cast<std::uint32_t>(seeded_value(i + lane + 3));
        }
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = elements * sizeof(KeyRecordAoS);
    instance.elements = elements;
    instance.threads_used = 1;
    instance.bytes_per_pass = (mode == BlockSortMode::kScalarSoa || mode == BlockSortMode::kAvx2Soa)
                                  ? elements * sizeof(std::int32_t)
                                  : elements * sizeof(KeyRecordAoS);
    instance.target_bytes_per_trial = kCaseStudyTargetBytes;
    instance.run = [aos, keys, elements, mode](std::size_t passes) {
        std::uint64_t sum = 0;
        alignas(32) std::array<std::int32_t, kAvxWidthInts> lanes{};
        alignas(32) std::array<KeyRecordAoS, kAvxWidthInts> record_block{};
        alignas(32) std::array<std::int32_t, kAvxWidthInts> key_block{};

        for (std::size_t pass = 0; pass < passes; ++pass) {
            for (std::size_t i = 0; i < elements; i += kAvxWidthInts) {
                switch (mode) {
                    case BlockSortMode::kScalarAos:
                        for (std::size_t lane = 0; lane < kAvxWidthInts; ++lane) {
                            record_block[lane] = (*aos)[i + lane];
                        }
                        insertion_sort_records(record_block.data(), record_block.size());
                        sum += record_block.front().key + record_block.back().key;
                        break;
                    case BlockSortMode::kScalarSoa:
                        for (std::size_t lane = 0; lane < kAvxWidthInts; ++lane) {
                            key_block[lane] = (*keys)[i + lane];
                        }
                        std::sort(key_block.begin(), key_block.end());
                        sum += static_cast<std::uint32_t>(key_block.front()) +
                               static_cast<std::uint32_t>(key_block.back());
                        break;
                    case BlockSortMode::kAvx2Aos: {
                        const std::uint32_t* base = reinterpret_cast<const std::uint32_t*>(aos->data());
                        constexpr std::int32_t kStrideWords = static_cast<std::int32_t>(sizeof(KeyRecordAoS) / sizeof(std::uint32_t));
                        const __m256i indices = _mm256_setr_epi32(static_cast<int>((i + 0) * kStrideWords),
                                                                  static_cast<int>((i + 1) * kStrideWords),
                                                                  static_cast<int>((i + 2) * kStrideWords),
                                                                  static_cast<int>((i + 3) * kStrideWords),
                                                                  static_cast<int>((i + 4) * kStrideWords),
                                                                  static_cast<int>((i + 5) * kStrideWords),
                                                                  static_cast<int>((i + 6) * kStrideWords),
                                                                  static_cast<int>((i + 7) * kStrideWords));
                        const __m256i gathered =
                            _mm256_i32gather_epi32(reinterpret_cast<const int*>(base), indices, sizeof(std::uint32_t));
                        _mm256_store_si256(reinterpret_cast<__m256i*>(lanes.data()), gathered);
                        std::sort(lanes.begin(), lanes.end());
                        sum += static_cast<std::uint32_t>(lanes.front()) + static_cast<std::uint32_t>(lanes.back());
                        break;
                    }
                    case BlockSortMode::kAvx2Soa: {
                        const __m256i loaded =
                            _mm256_load_si256(reinterpret_cast<const __m256i*>(keys->data() + i));
                        _mm256_store_si256(reinterpret_cast<__m256i*>(lanes.data()), loaded);
                        std::sort(lanes.begin(), lanes.end());
                        sum += static_cast<std::uint32_t>(lanes.front()) + static_cast<std::uint32_t>(lanes.back());
                        break;
                    }
                }
            }
        }
        return sum;
    };
    return instance;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scalar_gather_sum_i32(
    const std::int32_t* data,
    const std::int32_t* indices,
    std::size_t query_count) {
    std::uint64_t sum = 0;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t i = 0; i < query_count; ++i) {
        sum += static_cast<std::uint32_t>(data[static_cast<std::size_t>(indices[i])]);
    }
    return sum;
}

CACHEBENCH_NOINLINE std::uint64_t avx2_gather_sum_i32(const std::int32_t* data,
                                                      const std::int32_t* indices,
                                                      std::size_t query_count) {
    __m256i acc = _mm256_setzero_si256();
    std::size_t i = 0;
    for (; i + kAvxWidthInts <= query_count; i += kAvxWidthInts) {
        const __m256i idx = _mm256_load_si256(reinterpret_cast<const __m256i*>(indices + i));
        const __m256i gathered = _mm256_i32gather_epi32(data, idx, sizeof(std::int32_t));
        acc = _mm256_add_epi32(acc, gathered);
    }

    alignas(32) std::array<std::uint32_t, kAvxWidthInts> lanes{};
    _mm256_store_si256(reinterpret_cast<__m256i*>(lanes.data()), acc);
    std::uint64_t sum = std::accumulate(lanes.begin(), lanes.end(), std::uint64_t{0});
    for (; i < query_count; ++i) {
        sum += static_cast<std::uint32_t>(data[static_cast<std::size_t>(indices[i])]);
    }
    return sum;
}

enum class GatherMode {
    kScalar,
    kAvx2,
};

BenchmarkInstance make_gather_benchmark(std::size_t requested_size_bytes, GatherMode mode) {
    const std::size_t data_elements =
        std::max<std::size_t>(4096, round_down_multiple(clamp_elements(requested_size_bytes, sizeof(std::int32_t)),
                                                        kAvxWidthInts));
    const std::size_t query_count = std::max<std::size_t>(
        4096,
        round_down_multiple(std::clamp<std::size_t>(data_elements / 2, 4096, 1ULL << 20), kAvxWidthInts));

    auto data = std::make_shared<AlignedBuffer<std::int32_t>>(data_elements);
    auto indices = std::make_shared<AlignedBuffer<std::int32_t>>(query_count);
    for (std::size_t i = 0; i < data_elements; ++i) {
        (*data)[i] = static_cast<std::int32_t>(seeded_value(i) & 0x3FFULL);
    }

    std::mt19937 rng(0xA11CEU);
    std::uniform_int_distribution<std::int32_t> dist(0, static_cast<std::int32_t>(data_elements - 1));
    for (std::size_t i = 0; i < query_count; ++i) {
        (*indices)[i] = dist(rng);
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = data_elements * sizeof(std::int32_t) + query_count * sizeof(std::int32_t);
    instance.elements = query_count;
    instance.threads_used = 1;
    instance.bytes_per_pass = query_count * sizeof(std::int32_t) * 2;
    instance.target_bytes_per_trial = kGatherTargetBytes;
    instance.run = [data, indices, query_count, mode](std::size_t passes) {
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            sum += mode == GatherMode::kScalar
                       ? scalar_gather_sum_i32(data->data(), indices->data(), query_count)
                       : avx2_gather_sum_i32(data->data(), indices->data(), query_count);
        }
        return sum;
    };
    return instance;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scalar_sum_i32(const std::int32_t* data,
                                                                     std::size_t elements) {
    std::uint64_t sum = 0;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t i = 0; i < elements; ++i) {
        sum += static_cast<std::uint32_t>(data[i]);
    }
    return sum;
}

CACHEBENCH_NOINLINE std::uint64_t avx2_sum_i32(const std::int32_t* data, std::size_t elements) {
    __m256i acc = _mm256_setzero_si256();
    std::size_t i = 0;
    for (; i + kAvxWidthInts <= elements; i += kAvxWidthInts) {
        acc = _mm256_add_epi32(acc, _mm256_load_si256(reinterpret_cast<const __m256i*>(data + i)));
    }

    alignas(32) std::array<std::uint32_t, kAvxWidthInts> lanes{};
    _mm256_store_si256(reinterpret_cast<__m256i*>(lanes.data()), acc);
    std::uint64_t sum = std::accumulate(lanes.begin(), lanes.end(), std::uint64_t{0});
    for (; i < elements; ++i) {
        sum += static_cast<std::uint32_t>(data[i]);
    }
    return sum;
}

CACHEBENCH_NOINLINE std::uint64_t avx2_sum_i32_4acc(const std::int32_t* data, std::size_t elements) {
    __m256i acc0 = _mm256_setzero_si256();
    __m256i acc1 = _mm256_setzero_si256();
    __m256i acc2 = _mm256_setzero_si256();
    __m256i acc3 = _mm256_setzero_si256();
    std::size_t i = 0;
    for (; i + 4 * kAvxWidthInts <= elements; i += 4 * kAvxWidthInts) {
        acc0 = _mm256_add_epi32(acc0, _mm256_load_si256(reinterpret_cast<const __m256i*>(data + i)));
        acc1 = _mm256_add_epi32(
            acc1, _mm256_load_si256(reinterpret_cast<const __m256i*>(data + i + kAvxWidthInts)));
        acc2 = _mm256_add_epi32(
            acc2, _mm256_load_si256(reinterpret_cast<const __m256i*>(data + i + 2 * kAvxWidthInts)));
        acc3 = _mm256_add_epi32(
            acc3, _mm256_load_si256(reinterpret_cast<const __m256i*>(data + i + 3 * kAvxWidthInts)));
    }

    const __m256i acc = _mm256_add_epi32(_mm256_add_epi32(acc0, acc1), _mm256_add_epi32(acc2, acc3));
    alignas(32) std::array<std::uint32_t, kAvxWidthInts> lanes{};
    _mm256_store_si256(reinterpret_cast<__m256i*>(lanes.data()), acc);
    std::uint64_t sum = std::accumulate(lanes.begin(), lanes.end(), std::uint64_t{0});
    for (; i < elements; ++i) {
        sum += static_cast<std::uint32_t>(data[i]);
    }
    return sum;
}

enum class ReductionMode {
    kScalar,
    kAvx2,
    kAvx2FourAccumulators,
};

BenchmarkInstance make_sum_benchmark(std::size_t requested_size_bytes, ReductionMode mode) {
    const std::size_t elements =
        std::max<std::size_t>(32, round_down_multiple(clamp_elements(requested_size_bytes, sizeof(std::int32_t)), 32));
    auto data = std::make_shared<AlignedBuffer<std::int32_t>>(elements);
    for (std::size_t i = 0; i < elements; ++i) {
        (*data)[i] = static_cast<std::int32_t>(seeded_value(i) & 0x3FFULL);
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = elements * sizeof(std::int32_t);
    instance.elements = elements;
    instance.threads_used = 1;
    instance.bytes_per_pass = instance.actual_size_bytes;
    instance.target_bytes_per_trial = kReductionTargetBytes;
    instance.run = [data, elements, mode](std::size_t passes) {
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            switch (mode) {
                case ReductionMode::kScalar:
                    sum += scalar_sum_i32(data->data(), elements);
                    break;
                case ReductionMode::kAvx2:
                    sum += avx2_sum_i32(data->data(), elements);
                    break;
                case ReductionMode::kAvx2FourAccumulators:
                    sum += avx2_sum_i32_4acc(data->data(), elements);
                    break;
            }
        }
        return sum;
    };
    return instance;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scalar_branch_filter_sum(const std::int32_t* data,
                                                                                std::size_t elements) {
    std::uint64_t sum = 0;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t i = 0; i < elements; ++i) {
        if (data[i] < 50) {
            sum += static_cast<std::uint32_t>(data[i]);
        }
    }
    return sum;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scalar_branchless_filter_sum(
    const std::int32_t* data,
    std::size_t elements) {
    std::uint64_t sum = 0;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t i = 0; i < elements; ++i) {
        sum += static_cast<std::uint32_t>(data[i] < 50 ? data[i] : 0);
    }
    return sum;
}

CACHEBENCH_NOINLINE std::uint64_t avx2_mask_filter_sum(const std::int32_t* data, std::size_t elements) {
    const __m256i limit = _mm256_set1_epi32(50);
    __m256i acc = _mm256_setzero_si256();
    std::size_t i = 0;
    for (; i + kAvxWidthInts <= elements; i += kAvxWidthInts) {
        const __m256i values = _mm256_load_si256(reinterpret_cast<const __m256i*>(data + i));
        const __m256i mask = _mm256_cmpgt_epi32(limit, values);
        acc = _mm256_add_epi32(acc, _mm256_and_si256(values, mask));
    }

    alignas(32) std::array<std::uint32_t, kAvxWidthInts> lanes{};
    _mm256_store_si256(reinterpret_cast<__m256i*>(lanes.data()), acc);
    std::uint64_t sum = std::accumulate(lanes.begin(), lanes.end(), std::uint64_t{0});
    for (; i < elements; ++i) {
        if (data[i] < 50) {
            sum += static_cast<std::uint32_t>(data[i]);
        }
    }
    return sum;
}

enum class FilterMode {
    kBranch,
    kBranchless,
    kAvx2Mask,
};

BenchmarkInstance make_filter_benchmark(std::size_t requested_size_bytes, FilterMode mode) {
    const std::size_t elements = std::max<std::size_t>(
        kAvxWidthInts, round_down_multiple(clamp_elements(requested_size_bytes, sizeof(std::int32_t)), kAvxWidthInts));
    auto data = std::make_shared<AlignedBuffer<std::int32_t>>(elements);

    std::mt19937 rng(0x51EEDU);
    std::uniform_int_distribution<std::int32_t> dist(0, 99);
    for (std::size_t i = 0; i < elements; ++i) {
        (*data)[i] = dist(rng);
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = elements * sizeof(std::int32_t);
    instance.elements = elements;
    instance.threads_used = 1;
    instance.bytes_per_pass = instance.actual_size_bytes;
    instance.target_bytes_per_trial = kFilterTargetBytes;
    instance.run = [data, elements, mode](std::size_t passes) {
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            switch (mode) {
                case FilterMode::kBranch:
                    sum += scalar_branch_filter_sum(data->data(), elements);
                    break;
                case FilterMode::kBranchless:
                    sum += scalar_branchless_filter_sum(data->data(), elements);
                    break;
                case FilterMode::kAvx2Mask:
                    sum += avx2_mask_filter_sum(data->data(), elements);
                    break;
            }
        }
        return sum;
    };
    return instance;
}

template <typename T>
T scan_complete_seed(std::size_t index) {
    if constexpr (std::is_floating_point_v<T>) {
        return static_cast<T>((index % 251U) + 1U) / static_cast<T>(7);
    } else if constexpr (sizeof(T) == 1) {
        return static_cast<T>(seeded_value(index) & 0x7FULL);
    } else if constexpr (sizeof(T) == 2) {
        return static_cast<T>(seeded_value(index) & 0x7FFFULL);
    } else if constexpr (sizeof(T) == 4) {
        return static_cast<T>(seeded_value(index) & 0x7FFFFFFFULL);
    } else {
        return static_cast<T>(seeded_value(index));
    }
}

template <typename T>
CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scan_complete_scalar_sum(const T* data,
                                                                                std::size_t elements) {
    if constexpr (std::is_floating_point_v<T>) {
        long double sum = 0.0;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t i = 0; i < elements; ++i) {
            sum += static_cast<long double>(data[i]);
        }
        return static_cast<std::uint64_t>(sum);
    } else {
        std::uint64_t sum = 0;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t i = 0; i < elements; ++i) {
            sum += static_cast<std::uint64_t>(data[i]);
        }
        return sum;
    }
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scan_complete_scalar_sum_u64_4acc(
    const std::uint64_t* data,
    std::size_t elements) {
    std::uint64_t acc0 = 0;
    std::uint64_t acc1 = 0;
    std::uint64_t acc2 = 0;
    std::uint64_t acc3 = 0;
    std::size_t i = 0;
    CACHEBENCH_NO_VECTOR_LOOP
    for (; i + 4 <= elements; i += 4) {
        acc0 += data[i];
        acc1 += data[i + 1];
        acc2 += data[i + 2];
        acc3 += data[i + 3];
    }
    std::uint64_t sum = acc0 + acc1 + acc2 + acc3;
    for (; i < elements; ++i) {
        sum += data[i];
    }
    return sum;
}

CACHEBENCH_NOINLINE std::uint64_t scan_complete_avx2_sum_u8(const std::uint8_t* data,
                                                             std::size_t elements) {
    const __m256i zero = _mm256_setzero_si256();
    __m256i acc = _mm256_setzero_si256();
    std::size_t i = 0;
    for (; i + 32 <= elements; i += 32) {
        const __m256i values = _mm256_load_si256(reinterpret_cast<const __m256i*>(data + i));
        acc = _mm256_add_epi64(acc, _mm256_sad_epu8(values, zero));
    }

    alignas(32) std::array<std::uint64_t, 4> lanes{};
    _mm256_store_si256(reinterpret_cast<__m256i*>(lanes.data()), acc);
    std::uint64_t sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];
    for (; i < elements; ++i) {
        sum += data[i];
    }
    return sum;
}

CACHEBENCH_NOINLINE std::uint64_t scan_complete_avx2_sum_u16(const std::uint16_t* data,
                                                              std::size_t elements) {
    __m256i acc = _mm256_setzero_si256();
    std::size_t i = 0;
    for (; i + 16 <= elements; i += 16) {
        const __m256i values = _mm256_load_si256(reinterpret_cast<const __m256i*>(data + i));
        const __m128i low128 = _mm256_castsi256_si128(values);
        const __m128i high128 = _mm256_extracti128_si256(values, 1);
        acc = _mm256_add_epi32(acc, _mm256_cvtepu16_epi32(low128));
        acc = _mm256_add_epi32(acc, _mm256_cvtepu16_epi32(high128));
    }

    alignas(32) std::array<std::uint32_t, 8> lanes{};
    _mm256_store_si256(reinterpret_cast<__m256i*>(lanes.data()), acc);
    std::uint64_t sum = std::accumulate(lanes.begin(), lanes.end(), std::uint64_t{0});
    for (; i < elements; ++i) {
        sum += data[i];
    }
    return sum;
}

CACHEBENCH_NOINLINE std::uint64_t scan_complete_avx2_sum_u32(const std::uint32_t* data,
                                                              std::size_t elements) {
    __m256i acc = _mm256_setzero_si256();
    std::size_t i = 0;
    for (; i + 8 <= elements; i += 8) {
        acc = _mm256_add_epi32(acc, _mm256_load_si256(reinterpret_cast<const __m256i*>(data + i)));
    }

    alignas(32) std::array<std::uint32_t, 8> lanes{};
    _mm256_store_si256(reinterpret_cast<__m256i*>(lanes.data()), acc);
    std::uint64_t sum = std::accumulate(lanes.begin(), lanes.end(), std::uint64_t{0});
    for (; i < elements; ++i) {
        sum += data[i];
    }
    return sum;
}

CACHEBENCH_NOINLINE std::uint64_t scan_complete_avx2_sum_u64(const std::uint64_t* data,
                                                              std::size_t elements) {
    __m256i acc = _mm256_setzero_si256();
    std::size_t i = 0;
    for (; i + 4 <= elements; i += 4) {
        acc = _mm256_add_epi64(acc, _mm256_load_si256(reinterpret_cast<const __m256i*>(data + i)));
    }

    alignas(32) std::array<std::uint64_t, 4> lanes{};
    _mm256_store_si256(reinterpret_cast<__m256i*>(lanes.data()), acc);
    std::uint64_t sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];
    for (; i < elements; ++i) {
        sum += data[i];
    }
    return sum;
}

CACHEBENCH_NOINLINE std::uint64_t scan_complete_avx2_sum_u64_4acc(const std::uint64_t* data,
                                                                   std::size_t elements) {
    __m256i acc0 = _mm256_setzero_si256();
    __m256i acc1 = _mm256_setzero_si256();
    __m256i acc2 = _mm256_setzero_si256();
    __m256i acc3 = _mm256_setzero_si256();
    std::size_t i = 0;
    for (; i + 16 <= elements; i += 16) {
        acc0 = _mm256_add_epi64(acc0, _mm256_load_si256(reinterpret_cast<const __m256i*>(data + i)));
        acc1 = _mm256_add_epi64(acc1, _mm256_load_si256(reinterpret_cast<const __m256i*>(data + i + 4)));
        acc2 = _mm256_add_epi64(acc2, _mm256_load_si256(reinterpret_cast<const __m256i*>(data + i + 8)));
        acc3 = _mm256_add_epi64(acc3, _mm256_load_si256(reinterpret_cast<const __m256i*>(data + i + 12)));
    }

    const __m256i acc = _mm256_add_epi64(_mm256_add_epi64(acc0, acc1), _mm256_add_epi64(acc2, acc3));
    alignas(32) std::array<std::uint64_t, 4> lanes{};
    _mm256_store_si256(reinterpret_cast<__m256i*>(lanes.data()), acc);
    std::uint64_t sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];
    for (; i < elements; ++i) {
        sum += data[i];
    }
    return sum;
}

CACHEBENCH_NOINLINE std::uint64_t scan_complete_avx2_sum_f32(const float* data,
                                                              std::size_t elements) {
    __m256 acc = _mm256_setzero_ps();
    std::size_t i = 0;
    for (; i + 8 <= elements; i += 8) {
        acc = _mm256_add_ps(acc, _mm256_load_ps(data + i));
    }

    alignas(32) std::array<float, 8> lanes{};
    _mm256_store_ps(lanes.data(), acc);
    double sum = std::accumulate(lanes.begin(), lanes.end(), 0.0);
    for (; i < elements; ++i) {
        sum += data[i];
    }
    return static_cast<std::uint64_t>(sum);
}

CACHEBENCH_NOINLINE std::uint64_t scan_complete_avx2_sum_f64(const double* data,
                                                              std::size_t elements) {
    __m256d acc = _mm256_setzero_pd();
    std::size_t i = 0;
    for (; i + 4 <= elements; i += 4) {
        acc = _mm256_add_pd(acc, _mm256_load_pd(data + i));
    }

    alignas(32) std::array<double, 4> lanes{};
    _mm256_store_pd(lanes.data(), acc);
    long double sum = std::accumulate(lanes.begin(), lanes.end(), 0.0L);
    for (; i < elements; ++i) {
        sum += data[i];
    }
    return static_cast<std::uint64_t>(sum);
}

CACHEBENCH_NOINLINE std::uint64_t scan_complete_avx2_sum_u64_unaligned_bytes(
    const std::uint8_t* bytes,
    std::size_t elements) {
    __m256i acc = _mm256_setzero_si256();
    std::size_t i = 0;
    for (; i + 4 <= elements; i += 4) {
        acc = _mm256_add_epi64(acc, _mm256_loadu_si256(reinterpret_cast<const __m256i*>(bytes + i * sizeof(std::uint64_t))));
    }

    alignas(32) std::array<std::uint64_t, 4> lanes{};
    _mm256_store_si256(reinterpret_cast<__m256i*>(lanes.data()), acc);
    std::uint64_t sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];
    for (; i < elements; ++i) {
        std::uint64_t value = 0;
        std::memcpy(&value, bytes + i * sizeof(value), sizeof(value));
        sum += value;
    }
    return sum;
}

template <typename T, std::uint64_t (*SumFn)(const T*, std::size_t)>
BenchmarkInstance make_scan_complete_typed_sum(std::size_t requested_size_bytes) {
    const std::size_t alignment_multiple = std::max<std::size_t>(1, 32 / sizeof(T));
    const std::size_t elements = std::max<std::size_t>(
        alignment_multiple,
        round_down_multiple(clamp_elements(requested_size_bytes, sizeof(T)), alignment_multiple));
    auto data = std::make_shared<AlignedBuffer<T>>(elements);
    for (std::size_t i = 0; i < elements; ++i) {
        (*data)[i] = scan_complete_seed<T>(i);
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = elements * sizeof(T);
    instance.elements = elements;
    instance.threads_used = 1;
    instance.bytes_per_pass = instance.actual_size_bytes;
    instance.target_bytes_per_trial = kScanCompleteTargetBytes;
    instance.run = [data, elements](std::size_t passes) {
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            scan_complete_compiler_barrier();
            sum += SumFn(data->data(), elements);
        }
        return sum;
    };
    return instance;
}

enum class ScanCompleteU64ReduceMode {
    kScalar1Acc,
    kScalar4Acc,
    kAvx2OneAcc,
    kAvx2FourAcc,
};

BenchmarkInstance make_scan_complete_u64_reduce(std::size_t requested_size_bytes,
                                                ScanCompleteU64ReduceMode mode) {
    const std::size_t elements =
        std::max<std::size_t>(16, round_down_multiple(clamp_elements(requested_size_bytes, sizeof(std::uint64_t)), 16));
    auto data = std::make_shared<AlignedBuffer<std::uint64_t>>(elements);
    for (std::size_t i = 0; i < elements; ++i) {
        (*data)[i] = scan_complete_seed<std::uint64_t>(i);
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = elements * sizeof(std::uint64_t);
    instance.elements = elements;
    instance.threads_used = 1;
    instance.bytes_per_pass = instance.actual_size_bytes;
    instance.target_bytes_per_trial = kScanCompleteTargetBytes;
    instance.run = [data, elements, mode](std::size_t passes) {
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            scan_complete_compiler_barrier();
            switch (mode) {
                case ScanCompleteU64ReduceMode::kScalar1Acc:
                    sum += scan_complete_scalar_sum<std::uint64_t>(data->data(), elements);
                    break;
                case ScanCompleteU64ReduceMode::kScalar4Acc:
                    sum += scan_complete_scalar_sum_u64_4acc(data->data(), elements);
                    break;
                case ScanCompleteU64ReduceMode::kAvx2OneAcc:
                    sum += scan_complete_avx2_sum_u64(data->data(), elements);
                    break;
                case ScanCompleteU64ReduceMode::kAvx2FourAcc:
                    sum += scan_complete_avx2_sum_u64_4acc(data->data(), elements);
                    break;
            }
        }
        return sum;
    };
    return instance;
}

BenchmarkInstance make_scan_complete_unaligned_u64(std::size_t requested_size_bytes,
                                                   std::size_t byte_offset) {
    const std::size_t elements =
        std::max<std::size_t>(16, round_down_multiple(clamp_elements(requested_size_bytes, sizeof(std::uint64_t)), 16));
    const std::size_t bytes = byte_offset + elements * sizeof(std::uint64_t) + 32;
    auto storage = std::make_shared<AlignedBuffer<std::uint8_t>>(bytes);
    std::fill(storage->data(), storage->data() + storage->size(), std::uint8_t{0});
    for (std::size_t i = 0; i < elements; ++i) {
        const std::uint64_t value = scan_complete_seed<std::uint64_t>(i);
        std::memcpy(storage->data() + byte_offset + i * sizeof(value), &value, sizeof(value));
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = elements * sizeof(std::uint64_t);
    instance.elements = elements;
    instance.threads_used = 1;
    instance.bytes_per_pass = instance.actual_size_bytes;
    instance.target_bytes_per_trial = kScanCompleteTargetBytes;
    instance.run = [storage, byte_offset, elements](std::size_t passes) {
        std::uint64_t sum = 0;
        const std::uint8_t* base = storage->data() + byte_offset;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            scan_complete_compiler_barrier();
            sum += scan_complete_avx2_sum_u64_unaligned_bytes(base, elements);
        }
        return sum;
    };
    return instance;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scan_complete_branchy_filter_u32(
    const std::uint32_t* data,
    std::size_t elements,
    std::uint32_t threshold) {
    std::uint64_t sum = 0;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t i = 0; i < elements; ++i) {
        if (data[i] < threshold) {
            sum += data[i];
        }
    }
    return sum;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scan_complete_branchless_filter_u32(
    const std::uint32_t* data,
    std::size_t elements,
    std::uint32_t threshold) {
    std::uint64_t sum = 0;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t i = 0; i < elements; ++i) {
        sum += (data[i] < threshold) ? data[i] : 0U;
    }
    return sum;
}

CACHEBENCH_NOINLINE std::uint64_t scan_complete_avx2_filter_u32(const std::uint32_t* data,
                                                                 std::size_t elements,
                                                                 std::uint32_t threshold) {
    const __m256i limit = _mm256_set1_epi32(static_cast<std::int32_t>(threshold));
    __m256i acc = _mm256_setzero_si256();
    std::size_t i = 0;
    for (; i + 8 <= elements; i += 8) {
        const __m256i values = _mm256_load_si256(reinterpret_cast<const __m256i*>(data + i));
        const __m256i mask = _mm256_cmpgt_epi32(limit, values);
        acc = _mm256_add_epi32(acc, _mm256_and_si256(values, mask));
    }

    alignas(32) std::array<std::uint32_t, 8> lanes{};
    _mm256_store_si256(reinterpret_cast<__m256i*>(lanes.data()), acc);
    std::uint64_t sum = std::accumulate(lanes.begin(), lanes.end(), std::uint64_t{0});
    for (; i < elements; ++i) {
        if (data[i] < threshold) {
            sum += data[i];
        }
    }
    return sum;
}

enum class ScanCompleteFilterMode {
    kBranchy,
    kBranchless,
    kAvx2Mask,
};

BenchmarkInstance make_scan_complete_filter(std::size_t requested_size_bytes,
                                            std::uint32_t selectivity_pct,
                                            ScanCompleteFilterMode mode) {
    const std::size_t elements = std::max<std::size_t>(
        32, round_down_multiple(clamp_elements(requested_size_bytes, sizeof(std::uint32_t)), 32));
    auto data = std::make_shared<AlignedBuffer<std::uint32_t>>(elements);
    std::mt19937 rng(0x5CA11U + selectivity_pct);
    std::uniform_int_distribution<std::uint32_t> dist(0, 999);
    for (std::size_t i = 0; i < elements; ++i) {
        (*data)[i] = dist(rng);
    }
    const std::uint32_t threshold = std::max<std::uint32_t>(1, selectivity_pct * 10U);

    BenchmarkInstance instance;
    instance.actual_size_bytes = elements * sizeof(std::uint32_t);
    instance.elements = elements;
    instance.threads_used = 1;
    instance.bytes_per_pass = instance.actual_size_bytes;
    instance.target_bytes_per_trial = kScanCompleteTargetBytes;
    instance.run = [data, elements, threshold, mode](std::size_t passes) {
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            scan_complete_compiler_barrier();
            switch (mode) {
                case ScanCompleteFilterMode::kBranchy:
                    sum += scan_complete_branchy_filter_u32(data->data(), elements, threshold);
                    break;
                case ScanCompleteFilterMode::kBranchless:
                    sum += scan_complete_branchless_filter_u32(data->data(), elements, threshold);
                    break;
                case ScanCompleteFilterMode::kAvx2Mask:
                    sum += scan_complete_avx2_filter_u32(data->data(), elements, threshold);
                    break;
            }
        }
        return sum;
    };
    return instance;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scan_complete_scalar_multistream_u64(
    const std::uint64_t* const* streams,
    std::size_t stream_count,
    std::size_t elements_per_stream) {
    std::uint64_t sum = 0;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t i = 0; i < elements_per_stream; ++i) {
        for (std::size_t stream = 0; stream < stream_count; ++stream) {
            sum += streams[stream][i];
        }
    }
    return sum;
}

CACHEBENCH_NOINLINE std::uint64_t scan_complete_avx2_multistream_f32(
    const float* const* streams,
    std::size_t stream_count,
    std::size_t elements_per_stream) {
    __m256 acc = _mm256_setzero_ps();
    std::size_t i = 0;
    for (; i + 8 <= elements_per_stream; i += 8) {
        for (std::size_t stream = 0; stream < stream_count; ++stream) {
            acc = _mm256_add_ps(acc, _mm256_load_ps(streams[stream] + i));
        }
    }

    alignas(32) std::array<float, 8> lanes{};
    _mm256_store_ps(lanes.data(), acc);
    double sum = std::accumulate(lanes.begin(), lanes.end(), 0.0);
    for (; i < elements_per_stream; ++i) {
        for (std::size_t stream = 0; stream < stream_count; ++stream) {
            sum += streams[stream][i];
        }
    }
    return static_cast<std::uint64_t>(sum);
}

BenchmarkInstance make_scan_complete_scalar_multistream_u64(std::size_t requested_size_bytes,
                                                            std::size_t stream_count) {
    const std::size_t elements_per_stream = std::max<std::size_t>(
        16, round_down_multiple(clamp_elements(requested_size_bytes / stream_count, sizeof(std::uint64_t)), 16));
    auto buffers = std::make_shared<std::vector<std::shared_ptr<AlignedBuffer<std::uint64_t>>>>();
    auto pointers = std::make_shared<std::vector<const std::uint64_t*>>();
    buffers->reserve(stream_count);
    pointers->reserve(stream_count);
    for (std::size_t stream = 0; stream < stream_count; ++stream) {
        auto buffer = std::make_shared<AlignedBuffer<std::uint64_t>>(elements_per_stream);
        for (std::size_t i = 0; i < elements_per_stream; ++i) {
            (*buffer)[i] = scan_complete_seed<std::uint64_t>(i + stream * 131U);
        }
        pointers->push_back(buffer->data());
        buffers->push_back(std::move(buffer));
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = elements_per_stream * stream_count * sizeof(std::uint64_t);
    instance.elements = elements_per_stream * stream_count;
    instance.threads_used = 1;
    instance.bytes_per_pass = instance.actual_size_bytes;
    instance.target_bytes_per_trial = kScanCompleteTargetBytes;
    instance.run = [buffers, pointers, stream_count, elements_per_stream](std::size_t passes) {
        (void)buffers;
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            scan_complete_compiler_barrier();
            sum += scan_complete_scalar_multistream_u64(pointers->data(), stream_count, elements_per_stream);
        }
        return sum;
    };
    return instance;
}

BenchmarkInstance make_scan_complete_avx2_multistream_f32(std::size_t requested_size_bytes,
                                                          std::size_t stream_count) {
    const std::size_t elements_per_stream = std::max<std::size_t>(
        32, round_down_multiple(clamp_elements(requested_size_bytes / stream_count, sizeof(float)), 32));
    auto buffers = std::make_shared<std::vector<std::shared_ptr<AlignedBuffer<float>>>>();
    auto pointers = std::make_shared<std::vector<const float*>>();
    buffers->reserve(stream_count);
    pointers->reserve(stream_count);
    for (std::size_t stream = 0; stream < stream_count; ++stream) {
        auto buffer = std::make_shared<AlignedBuffer<float>>(elements_per_stream);
        for (std::size_t i = 0; i < elements_per_stream; ++i) {
            (*buffer)[i] = scan_complete_seed<float>(i + stream * 131U);
        }
        pointers->push_back(buffer->data());
        buffers->push_back(std::move(buffer));
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = elements_per_stream * stream_count * sizeof(float);
    instance.elements = elements_per_stream * stream_count;
    instance.threads_used = 1;
    instance.bytes_per_pass = instance.actual_size_bytes;
    instance.target_bytes_per_trial = kScanCompleteTargetBytes;
    instance.run = [buffers, pointers, stream_count, elements_per_stream](std::size_t passes) {
        (void)buffers;
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            scan_complete_compiler_barrier();
            sum += scan_complete_avx2_multistream_f32(pointers->data(), stream_count, elements_per_stream);
        }
        return sum;
    };
    return instance;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scan_complete_scalar_stride_f32(const float* data,
                                                                                       std::size_t sampled_elements,
                                                                                       std::size_t stride) {
    long double sum = 0.0;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t i = 0; i < sampled_elements; ++i) {
        sum += data[i * stride];
    }
    return static_cast<std::uint64_t>(sum);
}

CACHEBENCH_NOINLINE std::uint64_t scan_complete_avx2_gather_stride_f32(const float* data,
                                                                        std::size_t sampled_elements,
                                                                        std::size_t stride) {
    const __m256i offsets =
        _mm256_setr_epi32(0,
                          static_cast<int>(stride),
                          static_cast<int>(2 * stride),
                          static_cast<int>(3 * stride),
                          static_cast<int>(4 * stride),
                          static_cast<int>(5 * stride),
                          static_cast<int>(6 * stride),
                          static_cast<int>(7 * stride));
    __m256 acc = _mm256_setzero_ps();
    std::size_t sampled = 0;
    std::size_t base = 0;
    for (; sampled + 8 <= sampled_elements; sampled += 8, base += 8 * stride) {
        acc = _mm256_add_ps(acc, _mm256_i32gather_ps(data + base, offsets, 4));
    }

    alignas(32) std::array<float, 8> lanes{};
    _mm256_store_ps(lanes.data(), acc);
    long double sum = std::accumulate(lanes.begin(), lanes.end(), 0.0L);
    for (; sampled < sampled_elements; ++sampled, base += stride) {
        sum += data[base];
    }
    return static_cast<std::uint64_t>(sum);
}

enum class ScanCompleteStrideMode {
    kScalar,
    kAvx2Gather,
};

BenchmarkInstance make_scan_complete_stride_f32(std::size_t requested_size_bytes,
                                                std::size_t stride,
                                                ScanCompleteStrideMode mode) {
    const std::size_t elements = std::max<std::size_t>(
        64 * stride, round_down_multiple(clamp_elements(requested_size_bytes, sizeof(float)), 64 * stride));
    const std::size_t sampled_elements = std::max<std::size_t>(1, elements / stride);
    auto data = std::make_shared<AlignedBuffer<float>>(elements);
    for (std::size_t i = 0; i < elements; ++i) {
        (*data)[i] = scan_complete_seed<float>(i);
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = elements * sizeof(float);
    instance.elements = sampled_elements;
    instance.threads_used = 1;
    instance.bytes_per_pass = instance.actual_size_bytes;
    instance.target_bytes_per_trial = kScanCompleteTargetBytes;
    instance.run = [data, sampled_elements, stride, mode](std::size_t passes) {
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            scan_complete_compiler_barrier();
            if (mode == ScanCompleteStrideMode::kScalar) {
                sum += scan_complete_scalar_stride_f32(data->data(), sampled_elements, stride);
            } else {
                sum += scan_complete_avx2_gather_stride_f32(data->data(), sampled_elements, stride);
            }
        }
        return sum;
    };
    return instance;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void scan_complete_scalar_copy_u64(const std::uint64_t* src,
                                                                            std::uint64_t* dst,
                                                                            std::size_t elements,
                                                                            bool transform) {
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t i = 0; i < elements; ++i) {
        dst[i] = transform ? (src[i] * 3ULL + 1ULL) : src[i];
    }
}

CACHEBENCH_NOINLINE void scan_complete_avx2_copy_u64(const std::uint64_t* src,
                                                      std::uint64_t* dst,
                                                      std::size_t elements,
                                                      bool transform,
                                                      bool non_temporal) {
    const __m256i three = _mm256_set1_epi64x(3);
    const __m256i one = _mm256_set1_epi64x(1);
    std::size_t i = 0;
    for (; i + 4 <= elements; i += 4) {
        __m256i values = _mm256_load_si256(reinterpret_cast<const __m256i*>(src + i));
        if (transform) {
            values = _mm256_add_epi64(_mm256_mul_epu32(values, three), one);
        }
        if (non_temporal) {
            _mm256_stream_si256(reinterpret_cast<__m256i*>(dst + i), values);
        } else {
            _mm256_store_si256(reinterpret_cast<__m256i*>(dst + i), values);
        }
    }
    if (non_temporal) {
        _mm_sfence();
    }
    for (; i < elements; ++i) {
        dst[i] = transform ? (src[i] * 3ULL + 1ULL) : src[i];
    }
}

enum class ScanCompleteCopyMode {
    kScalarCopy,
    kScalarTransform,
    kAvx2Copy,
    kAvx2Transform,
    kAvx2NonTemporalCopy,
};

BenchmarkInstance make_scan_complete_copy_u64(std::size_t requested_size_bytes,
                                              ScanCompleteCopyMode mode) {
    const std::size_t elements =
        std::max<std::size_t>(16, round_down_multiple(clamp_elements(requested_size_bytes, sizeof(std::uint64_t)), 16));
    auto src = std::make_shared<AlignedBuffer<std::uint64_t>>(elements);
    auto dst = std::make_shared<AlignedBuffer<std::uint64_t>>(elements);
    for (std::size_t i = 0; i < elements; ++i) {
        (*src)[i] = scan_complete_seed<std::uint64_t>(i);
        (*dst)[i] = 0;
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = elements * sizeof(std::uint64_t) * 2;
    instance.elements = elements;
    instance.threads_used = 1;
    instance.bytes_per_pass = instance.actual_size_bytes;
    instance.target_bytes_per_trial = kScanCompleteTargetBytes;
    instance.run = [src, dst, elements, mode](std::size_t passes) {
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            scan_complete_compiler_barrier();
            switch (mode) {
                case ScanCompleteCopyMode::kScalarCopy:
                    scan_complete_scalar_copy_u64(src->data(), dst->data(), elements, false);
                    break;
                case ScanCompleteCopyMode::kScalarTransform:
                    scan_complete_scalar_copy_u64(src->data(), dst->data(), elements, true);
                    break;
                case ScanCompleteCopyMode::kAvx2Copy:
                    scan_complete_avx2_copy_u64(src->data(), dst->data(), elements, false, false);
                    break;
                case ScanCompleteCopyMode::kAvx2Transform:
                    scan_complete_avx2_copy_u64(src->data(), dst->data(), elements, true, false);
                    break;
                case ScanCompleteCopyMode::kAvx2NonTemporalCopy:
                    scan_complete_avx2_copy_u64(src->data(), dst->data(), elements, false, true);
                    break;
            }
            sum ^= sample_checksum(dst->data(), dst->size());
        }
        return sum;
    };
    return instance;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scan_complete_touch_cache_lines(
    const std::uint64_t* data,
    std::size_t elements) {
    std::uint64_t sum = 0;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t i = 0; i < elements; i += 8) {
        sum += data[i];
    }
    return sum;
}

BenchmarkInstance make_scan_complete_cold_thrash_u64(std::size_t requested_size_bytes) {
    const std::size_t elements =
        std::max<std::size_t>(16, round_down_multiple(clamp_elements(requested_size_bytes, sizeof(std::uint64_t)), 16));
    const std::size_t thrash_elements = std::max<std::size_t>(
        elements * 2,
        static_cast<std::size_t>(64ULL * 1024ULL * 1024ULL / sizeof(std::uint64_t)));
    auto data = std::make_shared<AlignedBuffer<std::uint64_t>>(elements);
    auto thrash = std::make_shared<AlignedBuffer<std::uint64_t>>(thrash_elements);
    for (std::size_t i = 0; i < elements; ++i) {
        (*data)[i] = scan_complete_seed<std::uint64_t>(i);
    }
    for (std::size_t i = 0; i < thrash_elements; ++i) {
        (*thrash)[i] = scan_complete_seed<std::uint64_t>(i + elements);
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = elements * sizeof(std::uint64_t) + thrash_elements * sizeof(std::uint64_t);
    instance.elements = elements;
    instance.threads_used = 1;
    instance.bytes_per_pass = instance.actual_size_bytes;
    instance.target_bytes_per_trial = kScanCompleteColdTargetBytes;
    instance.run = [data, thrash, elements, thrash_elements](std::size_t passes) {
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            scan_complete_compiler_barrier();
            sum += scan_complete_touch_cache_lines(thrash->data(), thrash_elements);
            sum += scan_complete_scalar_sum<std::uint64_t>(data->data(), elements);
        }
        return sum;
    };
    return instance;
}

CACHEBENCH_NOINLINE std::uint64_t scalar_popcount_builtin(const std::uint8_t* data, std::size_t bytes) {
    std::uint64_t sum = 0;
    std::size_t i = 0;
    for (; i + sizeof(std::uint64_t) <= bytes; i += sizeof(std::uint64_t)) {
        std::uint64_t word = 0;
        std::memcpy(&word, data + i, sizeof(word));
        sum += std::popcount(word);
    }
    for (; i < bytes; ++i) {
        sum += std::popcount(static_cast<unsigned int>(data[i]));
    }
    return sum;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scalar_popcount_lut(const std::uint8_t* data,
                                                                           std::size_t bytes) {
    alignas(64) static constexpr std::array<std::uint8_t, 256> kLut = [] {
        std::array<std::uint8_t, 256> lut{};
        for (std::size_t value = 0; value < lut.size(); ++value) {
            lut[value] = static_cast<std::uint8_t>(std::popcount(static_cast<unsigned int>(value)));
        }
        return lut;
    }();

    std::uint64_t sum = 0;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t i = 0; i < bytes; ++i) {
        sum += kLut[data[i]];
    }
    return sum;
}

CACHEBENCH_NOINLINE std::uint64_t avx2_shuffle_popcount(const std::uint8_t* data, std::size_t bytes) {
    const __m256i low_mask = _mm256_set1_epi8(0x0F);
    const __m256i zero = _mm256_setzero_si256();
    const __m256i lut = _mm256_setr_epi8(0,
                                         1,
                                         1,
                                         2,
                                         1,
                                         2,
                                         2,
                                         3,
                                         1,
                                         2,
                                         2,
                                         3,
                                         2,
                                         3,
                                         3,
                                         4,
                                         0,
                                         1,
                                         1,
                                         2,
                                         1,
                                         2,
                                         2,
                                         3,
                                         1,
                                         2,
                                         2,
                                         3,
                                         2,
                                         3,
                                         3,
                                         4);

    __m256i acc = _mm256_setzero_si256();
    std::size_t i = 0;
    for (; i + 32 <= bytes; i += 32) {
        const __m256i x = _mm256_load_si256(reinterpret_cast<const __m256i*>(data + i));
        const __m256i lo = _mm256_and_si256(x, low_mask);
        const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(x, 4), low_mask);
        const __m256i counts =
            _mm256_add_epi8(_mm256_shuffle_epi8(lut, lo), _mm256_shuffle_epi8(lut, hi));
        acc = _mm256_add_epi64(acc, _mm256_sad_epu8(counts, zero));
    }

    alignas(32) std::array<std::uint64_t, 4> lanes{};
    _mm256_store_si256(reinterpret_cast<__m256i*>(lanes.data()), acc);
    std::uint64_t sum = std::accumulate(lanes.begin(), lanes.end(), std::uint64_t{0});
    for (; i < bytes; ++i) {
        sum += std::popcount(static_cast<unsigned int>(data[i]));
    }
    return sum;
}

enum class ShuffleMode {
    kBuiltinPopcount,
    kScalarLut,
    kAvx2Shuffle,
};

BenchmarkInstance make_popcount_benchmark(std::size_t requested_size_bytes, ShuffleMode mode) {
    const std::size_t bytes = std::max<std::size_t>(32, round_down_multiple(requested_size_bytes, 32));
    auto data = std::make_shared<AlignedBuffer<std::uint8_t>>(bytes);
    for (std::size_t i = 0; i < bytes; ++i) {
        (*data)[i] = static_cast<std::uint8_t>(seeded_value(i) & 0xFFULL);
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = bytes;
    instance.elements = bytes;
    instance.threads_used = 1;
    instance.bytes_per_pass = bytes;
    instance.target_bytes_per_trial = kShuffleTargetBytes;
    instance.run = [data, bytes, mode](std::size_t passes) {
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            switch (mode) {
                case ShuffleMode::kBuiltinPopcount:
                    sum += scalar_popcount_builtin(data->data(), bytes);
                    break;
                case ShuffleMode::kScalarLut:
                    sum += scalar_popcount_lut(data->data(), bytes);
                    break;
                case ShuffleMode::kAvx2Shuffle:
                    sum += avx2_shuffle_popcount(data->data(), bytes);
                    break;
            }
        }
        return sum;
    };
    return instance;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::size_t scalar_argmin_i32(const std::int32_t* data,
                                                                      std::size_t elements) {
    std::size_t best = 0;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t i = 1; i < elements; ++i) {
        if (data[i] < data[best]) {
            best = i;
        }
    }
    return best;
}

CACHEBENCH_NOINLINE std::size_t avx2_argmin_blend_i32(const std::int32_t* data, std::size_t elements) {
    const __m256i step = _mm256_set1_epi32(static_cast<int>(kAvxWidthInts));
    __m256i current = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i best_values = _mm256_set1_epi32(std::numeric_limits<int>::max());
    __m256i best_indices = _mm256_setzero_si256();

    std::size_t i = 0;
    for (; i + kAvxWidthInts <= elements; i += kAvxWidthInts) {
        const __m256i values = _mm256_load_si256(reinterpret_cast<const __m256i*>(data + i));
        const __m256i mask = _mm256_cmpgt_epi32(best_values, values);
        best_values = _mm256_blendv_epi8(best_values, values, mask);
        best_indices = _mm256_blendv_epi8(best_indices, current, mask);
        current = _mm256_add_epi32(current, step);
    }

    alignas(32) std::array<int, kAvxWidthInts> values{};
    alignas(32) std::array<int, kAvxWidthInts> indices{};
    _mm256_store_si256(reinterpret_cast<__m256i*>(values.data()), best_values);
    _mm256_store_si256(reinterpret_cast<__m256i*>(indices.data()), best_indices);

    std::size_t best = static_cast<std::size_t>(indices[0]);
    int best_value = values[0];
    for (std::size_t lane = 1; lane < kAvxWidthInts; ++lane) {
        const auto lane_index = static_cast<std::size_t>(indices[lane]);
        if (values[lane] < best_value || (values[lane] == best_value && lane_index < best)) {
            best_value = values[lane];
            best = lane_index;
        }
    }
    for (; i < elements; ++i) {
        if (data[i] < best_value) {
            best_value = data[i];
            best = i;
        }
    }
    return best;
}

CACHEBENCH_NOINLINE std::size_t avx2_argmin_rescan_i32(const std::int32_t* data, std::size_t elements) {
    int best_value = std::numeric_limits<int>::max();
    std::size_t best = 0;
    __m256i threshold = _mm256_set1_epi32(best_value);

    std::size_t i = 0;
    for (; i + kAvxWidthInts <= elements; i += kAvxWidthInts) {
        const __m256i values = _mm256_load_si256(reinterpret_cast<const __m256i*>(data + i));
        const __m256i mask = _mm256_cmpgt_epi32(threshold, values);
        if (!_mm256_testz_si256(mask, mask)) {
            for (std::size_t lane = 0; lane < kAvxWidthInts; ++lane) {
                const std::size_t index = i + lane;
                if (data[index] < best_value) {
                    best_value = data[index];
                    best = index;
                }
            }
            threshold = _mm256_set1_epi32(best_value);
        }
    }
    for (; i < elements; ++i) {
        if (data[i] < best_value) {
            best_value = data[i];
            best = i;
        }
    }
    return best;
}

enum class ArgminMode {
    kScalar,
    kAvx2Blend,
    kAvx2Rescan,
};

BenchmarkInstance make_argmin_benchmark(std::size_t requested_size_bytes, ArgminMode mode) {
    const std::size_t elements = std::max<std::size_t>(
        kAvxWidthInts, round_down_multiple(clamp_elements(requested_size_bytes, sizeof(std::int32_t)), kAvxWidthInts));
    auto data = std::make_shared<AlignedBuffer<std::int32_t>>(elements);

    std::mt19937 rng(0xBADC0DEU);
    std::uniform_int_distribution<std::int32_t> dist(0, std::numeric_limits<std::int32_t>::max());
    for (std::size_t i = 0; i < elements; ++i) {
        (*data)[i] = dist(rng);
    }
    (*data)[elements / 3] = -17;

    BenchmarkInstance instance;
    instance.actual_size_bytes = elements * sizeof(std::int32_t);
    instance.elements = elements;
    instance.threads_used = 1;
    instance.bytes_per_pass = instance.actual_size_bytes;
    instance.target_bytes_per_trial = kCaseStudyTargetBytes;
    instance.run = [data, elements, mode](std::size_t passes) {
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            switch (mode) {
                case ArgminMode::kScalar:
                    sum ^= scalar_argmin_i32(data->data(), elements);
                    break;
                case ArgminMode::kAvx2Blend:
                    sum ^= avx2_argmin_blend_i32(data->data(), elements);
                    break;
                case ArgminMode::kAvx2Rescan:
                    sum ^= avx2_argmin_rescan_i32(data->data(), elements);
                    break;
            }
        }
        return sum;
    };
    return instance;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void scalar_prefix_i32(const std::int32_t* src,
                                                               std::int32_t* dst,
                                                               std::size_t elements) {
    std::int32_t carry = 0;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t i = 0; i < elements; ++i) {
        carry += src[i];
        dst[i] = carry;
    }
}

CACHEBENCH_NOINLINE void avx2_prefix_i32(const std::int32_t* src,
                                         std::int32_t* dst,
                                         std::size_t elements) {
    std::int32_t carry = 0;
    std::size_t i = 0;
    for (; i + kAvxWidthInts <= elements; i += kAvxWidthInts) {
        __m256i x = _mm256_load_si256(reinterpret_cast<const __m256i*>(src + i));
        x = _mm256_add_epi32(x, _mm256_slli_si256(x, 4));
        x = _mm256_add_epi32(x, _mm256_slli_si256(x, 8));

        const __m128i low = _mm256_castsi256_si128(x);
        __m128i high = _mm256_extracti128_si256(x, 1);
        const __m128i low_carry = _mm_shuffle_epi32(low, _MM_SHUFFLE(3, 3, 3, 3));
        high = _mm_add_epi32(high, low_carry);

        __m256i y = _mm256_castsi128_si256(low);
        y = _mm256_inserti128_si256(y, high, 1);
        y = _mm256_add_epi32(y, _mm256_set1_epi32(carry));
        _mm256_store_si256(reinterpret_cast<__m256i*>(dst + i), y);

        const __m128i y_high = _mm256_extracti128_si256(y, 1);
        carry = _mm_extract_epi32(y_high, 3);
    }
    for (; i < elements; ++i) {
        carry += src[i];
        dst[i] = carry;
    }
}

enum class PrefixMode {
    kScalar,
    kAvx2,
};

BenchmarkInstance make_prefix_benchmark(std::size_t requested_size_bytes, PrefixMode mode) {
    const std::size_t elements = std::max<std::size_t>(
        kAvxWidthInts,
        round_down_multiple(std::max<std::size_t>(kAvxWidthInts,
                                                  requested_size_bytes / (sizeof(std::int32_t) * 2)),
                            kAvxWidthInts));
    auto src = std::make_shared<AlignedBuffer<std::int32_t>>(elements);
    auto dst = std::make_shared<AlignedBuffer<std::int32_t>>(elements);
    for (std::size_t i = 0; i < elements; ++i) {
        (*src)[i] = static_cast<std::int32_t>((seeded_value(i) & 0x3ULL) + 1ULL);
        (*dst)[i] = 0;
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = elements * sizeof(std::int32_t) * 2;
    instance.elements = elements;
    instance.threads_used = 1;
    instance.bytes_per_pass = instance.actual_size_bytes;
    instance.target_bytes_per_trial = kCaseStudyTargetBytes;
    instance.run = [src, dst, elements, mode](std::size_t passes) {
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            switch (mode) {
                case PrefixMode::kScalar:
                    scalar_prefix_i32(src->data(), dst->data(), elements);
                    break;
                case PrefixMode::kAvx2:
                    avx2_prefix_i32(src->data(), dst->data(), elements);
                    break;
            }
            sum ^= sample_checksum(dst->data(), elements);
        }
        return sum;
    };
    return instance;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void scalar_matmul_naive(const float* a,
                                                                 const float* b,
                                                                 float* c,
                                                                 std::size_t n) {
    std::fill_n(c, n * n, 0.0f);
    for (std::size_t i = 0; i < n; ++i) {
        for (std::size_t j = 0; j < n; ++j) {
            float sum = 0.0f;
            CACHEBENCH_NO_VECTOR_LOOP
            for (std::size_t k = 0; k < n; ++k) {
                sum += a[i * n + k] * b[k * n + j];
            }
            c[i * n + j] = sum;
        }
    }
}

void transpose_square_matrix(const float* src, float* dst, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i) {
        for (std::size_t j = 0; j < n; ++j) {
            dst[j * n + i] = src[i * n + j];
        }
    }
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void scalar_matmul_transposed(const float* a,
                                                                      const float* bt,
                                                                      float* c,
                                                                      std::size_t n) {
    std::fill_n(c, n * n, 0.0f);
    for (std::size_t i = 0; i < n; ++i) {
        const float* row_a = a + i * n;
        for (std::size_t j = 0; j < n; ++j) {
            const float* row_bt = bt + j * n;
            float sum = 0.0f;
            CACHEBENCH_NO_VECTOR_LOOP
            for (std::size_t k = 0; k < n; ++k) {
                sum += row_a[k] * row_bt[k];
            }
            c[i * n + j] = sum;
        }
    }
}

CACHEBENCH_NOINLINE void avx2_matmul_gather(const float* a,
                                            const float* b,
                                            float* c,
                                            std::size_t n) {
    std::fill_n(c, n * n, 0.0f);
    for (std::size_t i = 0; i < n; ++i) {
        const float* row_a = a + i * n;
        for (std::size_t j = 0; j < n; ++j) {
            __m256 acc = _mm256_setzero_ps();
            std::size_t k = 0;
            for (; k + kAvxWidthFloats <= n; k += kAvxWidthFloats) {
                const __m256 avec = _mm256_load_ps(row_a + k);
                const __m256i bindex = _mm256_setr_epi32(static_cast<int>((k + 0) * n + j),
                                                         static_cast<int>((k + 1) * n + j),
                                                         static_cast<int>((k + 2) * n + j),
                                                         static_cast<int>((k + 3) * n + j),
                                                         static_cast<int>((k + 4) * n + j),
                                                         static_cast<int>((k + 5) * n + j),
                                                         static_cast<int>((k + 6) * n + j),
                                                         static_cast<int>((k + 7) * n + j));
                const __m256 bcol = _mm256_i32gather_ps(b, bindex, sizeof(float));
                acc = _mm256_add_ps(acc, _mm256_mul_ps(avec, bcol));
            }

            float sum = horizontal_sum_ps(acc);
            for (; k < n; ++k) {
                sum += row_a[k] * b[k * n + j];
            }
            c[i * n + j] = sum;
        }
    }
}

CACHEBENCH_NOINLINE void avx2_matmul_blocked(const float* a,
                                             const float* b,
                                             float* c,
                                             std::size_t n) {
    constexpr std::size_t kBlockI = 32;
    constexpr std::size_t kBlockJ = 64;
    constexpr std::size_t kBlockK = 64;

    std::fill_n(c, n * n, 0.0f);
    for (std::size_t ii = 0; ii < n; ii += kBlockI) {
        const std::size_t i_end = std::min(ii + kBlockI, n);
        for (std::size_t kk = 0; kk < n; kk += kBlockK) {
            const std::size_t k_end = std::min(kk + kBlockK, n);
            for (std::size_t jj = 0; jj < n; jj += kBlockJ) {
                const std::size_t j_end = std::min(jj + kBlockJ, n);
                for (std::size_t i = ii; i < i_end; ++i) {
                    for (std::size_t k = kk; k < k_end; ++k) {
                        const __m256 aval = _mm256_set1_ps(a[i * n + k]);
                        std::size_t j = jj;
                        for (; j + kAvxWidthFloats <= j_end; j += kAvxWidthFloats) {
                            const __m256 cvec = _mm256_loadu_ps(c + i * n + j);
                            const __m256 bvec = _mm256_loadu_ps(b + k * n + j);
                            const __m256 updated =
                                _mm256_add_ps(cvec, _mm256_mul_ps(aval, bvec));
                            _mm256_storeu_ps(c + i * n + j, updated);
                        }
                        for (; j < j_end; ++j) {
                            c[i * n + j] += a[i * n + k] * b[k * n + j];
                        }
                    }
                }
            }
        }
    }
}

enum class MatmulMode {
    kScalarNaive,
    kScalarTransposed,
    kAvx2Gather,
    kAvx2Blocked,
};

std::size_t matrix_dim_from_bytes(std::size_t requested_size_bytes) {
    const long double per_matrix = static_cast<long double>(std::max<std::size_t>(requested_size_bytes, 1));
    const long double raw_dim = std::sqrt(per_matrix / (3.0L * sizeof(float)));
    std::size_t n = static_cast<std::size_t>(raw_dim);
    n = std::clamp<std::size_t>(n, kMinMatrixDim, kMaxMatrixDim);
    n = round_down_multiple(n, kAvxWidthFloats);
    return std::max<std::size_t>(kMinMatrixDim, n);
}

BenchmarkInstance make_matmul_benchmark(std::size_t requested_size_bytes, MatmulMode mode) {
    const std::size_t n = matrix_dim_from_bytes(requested_size_bytes);
    const std::size_t elements = n * n;
    auto a = std::make_shared<AlignedBuffer<float>>(elements);
    auto b = std::make_shared<AlignedBuffer<float>>(elements);
    auto c = std::make_shared<AlignedBuffer<float>>(elements);

    for (std::size_t i = 0; i < elements; ++i) {
        (*a)[i] = seeded_float(i, 257.0f);
        (*b)[i] = seeded_float(i + elements, 263.0f);
        (*c)[i] = 0.0f;
    }

    std::shared_ptr<AlignedBuffer<float>> bt;
    if (mode == MatmulMode::kScalarTransposed) {
        bt = std::make_shared<AlignedBuffer<float>>(elements);
        transpose_square_matrix(b->data(), bt->data(), n);
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = elements * sizeof(float) * 3;
    instance.elements = n;
    instance.threads_used = 1;
    instance.bytes_per_pass = instance.actual_size_bytes;
    instance.target_bytes_per_trial = kMatmulTargetBytes;
    instance.run = [a, b, bt, c, n, mode](std::size_t passes) {
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            switch (mode) {
                case MatmulMode::kScalarNaive:
                    scalar_matmul_naive(a->data(), b->data(), c->data(), n);
                    break;
                case MatmulMode::kScalarTransposed:
                    scalar_matmul_transposed(a->data(), bt->data(), c->data(), n);
                    break;
                case MatmulMode::kAvx2Gather:
                    avx2_matmul_gather(a->data(), b->data(), c->data(), n);
                    break;
                case MatmulMode::kAvx2Blocked:
                    avx2_matmul_blocked(a->data(), b->data(), c->data(), n);
                    break;
            }
            sum ^= sample_checksum(c->data(), c->size());
        }
        return sum;
    };
    return instance;
}

}  // namespace

void append_simd_benchmarks(std::vector<BenchmarkSpec>& specs) {
    if (!cpu_supports_avx2()) {
        return;
    }

    specs.push_back({"scan_complete_scalar_sum_u8",
                     "Scan-completeness: scalar no-vector sum over contiguous uint8_t data.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_typed_sum<std::uint8_t, scan_complete_scalar_sum<std::uint8_t>>(
                             size_bytes);
                     }});
    specs.push_back({"scan_complete_scalar_sum_u16",
                     "Scan-completeness: scalar no-vector sum over contiguous uint16_t data.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_typed_sum<std::uint16_t, scan_complete_scalar_sum<std::uint16_t>>(
                             size_bytes);
                     }});
    specs.push_back({"scan_complete_scalar_sum_u32",
                     "Scan-completeness: scalar no-vector sum over contiguous uint32_t data.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_typed_sum<std::uint32_t, scan_complete_scalar_sum<std::uint32_t>>(
                             size_bytes);
                     }});
    specs.push_back({"scan_complete_scalar_sum_u64",
                     "Scan-completeness: scalar no-vector sum over contiguous uint64_t data.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_typed_sum<std::uint64_t, scan_complete_scalar_sum<std::uint64_t>>(
                             size_bytes);
                     }});
    specs.push_back({"scan_complete_scalar_sum_f32",
                     "Scan-completeness: scalar no-vector sum over contiguous float data.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_typed_sum<float, scan_complete_scalar_sum<float>>(size_bytes);
                     }});
    specs.push_back({"scan_complete_scalar_sum_f64",
                     "Scan-completeness: scalar no-vector sum over contiguous double data.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_typed_sum<double, scan_complete_scalar_sum<double>>(size_bytes);
                     }});
    specs.push_back({"scan_complete_avx2_sum_u8",
                     "Scan-completeness: AVX2 sum over contiguous uint8_t data using byte SAD.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_typed_sum<std::uint8_t, scan_complete_avx2_sum_u8>(size_bytes);
                     }});
    specs.push_back({"scan_complete_avx2_sum_u16",
                     "Scan-completeness: AVX2 sum over contiguous uint16_t data with widening.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_typed_sum<std::uint16_t, scan_complete_avx2_sum_u16>(size_bytes);
                     }});
    specs.push_back({"scan_complete_avx2_sum_u32",
                     "Scan-completeness: AVX2 sum over contiguous uint32_t data.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_typed_sum<std::uint32_t, scan_complete_avx2_sum_u32>(size_bytes);
                     }});
    specs.push_back({"scan_complete_avx2_sum_u64",
                     "Scan-completeness: AVX2 sum over contiguous uint64_t data.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_typed_sum<std::uint64_t, scan_complete_avx2_sum_u64>(size_bytes);
                     }});
    specs.push_back({"scan_complete_avx2_sum_f32",
                     "Scan-completeness: AVX2 sum over contiguous float data.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_typed_sum<float, scan_complete_avx2_sum_f32>(size_bytes);
                     }});
    specs.push_back({"scan_complete_avx2_sum_f64",
                     "Scan-completeness: AVX2 sum over contiguous double data.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_typed_sum<double, scan_complete_avx2_sum_f64>(size_bytes);
                     }});
    specs.push_back({"scan_complete_scalar_sum_u64_1acc",
                     "Scan-completeness: scalar uint64_t reduction with one dependent accumulator.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_u64_reduce(size_bytes, ScanCompleteU64ReduceMode::kScalar1Acc);
                     }});
    specs.push_back({"scan_complete_scalar_sum_u64_4acc",
                     "Scan-completeness: scalar uint64_t reduction with four independent accumulators.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_u64_reduce(size_bytes, ScanCompleteU64ReduceMode::kScalar4Acc);
                     }});
    specs.push_back({"scan_complete_avx2_sum_u64_1acc",
                     "Scan-completeness: AVX2 uint64_t reduction with one vector accumulator.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_u64_reduce(size_bytes, ScanCompleteU64ReduceMode::kAvx2OneAcc);
                     }});
    specs.push_back({"scan_complete_avx2_sum_u64_4acc",
                     "Scan-completeness: AVX2 uint64_t reduction with four independent vector accumulators.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_u64_reduce(size_bytes, ScanCompleteU64ReduceMode::kAvx2FourAcc);
                     }});

    for (const std::size_t offset : std::array<std::size_t, 6>{0, 1, 4, 8, 16, 32}) {
        specs.push_back({"scan_complete_avx2_sum_u64_offset" + std::to_string(offset) + "b",
                         "Scan-completeness: AVX2 uint64_t sum using unaligned loads at byte offset " +
                             std::to_string(offset) + ".",
                         [offset](std::size_t size_bytes, std::size_t) {
                             return make_scan_complete_unaligned_u64(size_bytes, offset);
                         }});
    }

    for (const std::uint32_t selectivity_pct : std::array<std::uint32_t, 4>{1, 10, 50, 90}) {
        specs.push_back({"scan_complete_branchy_filter_" + std::to_string(selectivity_pct) + "pct_u32",
                         "Scan-completeness: randomized branchy threshold scan with " +
                             std::to_string(selectivity_pct) + "% selectivity.",
                         [selectivity_pct](std::size_t size_bytes, std::size_t) {
                             return make_scan_complete_filter(
                                 size_bytes, selectivity_pct, ScanCompleteFilterMode::kBranchy);
                         }});
        specs.push_back({"scan_complete_branchless_filter_" + std::to_string(selectivity_pct) + "pct_u32",
                         "Scan-completeness: randomized branchless threshold scan with " +
                             std::to_string(selectivity_pct) + "% selectivity.",
                         [selectivity_pct](std::size_t size_bytes, std::size_t) {
                             return make_scan_complete_filter(
                                 size_bytes, selectivity_pct, ScanCompleteFilterMode::kBranchless);
                         }});
        specs.push_back({"scan_complete_avx2_filter_" + std::to_string(selectivity_pct) + "pct_u32",
                         "Scan-completeness: AVX2 masked threshold scan with " +
                             std::to_string(selectivity_pct) + "% selectivity.",
                         [selectivity_pct](std::size_t size_bytes, std::size_t) {
                             return make_scan_complete_filter(
                                 size_bytes, selectivity_pct, ScanCompleteFilterMode::kAvx2Mask);
                         }});
    }

    for (const std::size_t stream_count : std::array<std::size_t, 5>{1, 2, 4, 8, 16}) {
        specs.push_back({"scan_complete_scalar_multistream_" + std::to_string(stream_count) + "_u64",
                         "Scan-completeness: scalar interleaved scan over " + std::to_string(stream_count) +
                             " independent uint64_t streams.",
                         [stream_count](std::size_t size_bytes, std::size_t) {
                             return make_scan_complete_scalar_multistream_u64(size_bytes, stream_count);
                         }});
        specs.push_back({"scan_complete_cache_simd_multistream_" + std::to_string(stream_count) + "_f32",
                         "Scan-completeness: cache and SIMD together, AVX2 scan over " +
                             std::to_string(stream_count) + " independent float streams.",
                         [stream_count](std::size_t size_bytes, std::size_t) {
                             return make_scan_complete_avx2_multistream_f32(size_bytes, stream_count);
                         }});
    }

    for (const std::size_t stride : std::array<std::size_t, 6>{1, 2, 4, 8, 16, 64}) {
        specs.push_back({"scan_complete_scalar_stride" + std::to_string(stride) + "_f32",
                         "Scan-completeness: scalar float scan sampling every " + std::to_string(stride) +
                             " element(s).",
                         [stride](std::size_t size_bytes, std::size_t) {
                             return make_scan_complete_stride_f32(size_bytes, stride, ScanCompleteStrideMode::kScalar);
                         }});
        specs.push_back({"scan_complete_cache_simd_gather_stride" + std::to_string(stride) + "_f32",
                         "Scan-completeness: cache and SIMD together, AVX2 gather scan sampling every " +
                             std::to_string(stride) + " element(s).",
                         [stride](std::size_t size_bytes, std::size_t) {
                             return make_scan_complete_stride_f32(
                                 size_bytes, stride, ScanCompleteStrideMode::kAvx2Gather);
                         }});
    }

    specs.push_back({"scan_complete_scalar_copy_u64",
                     "Scan-completeness: scalar copy scan from one contiguous uint64_t array to another.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_copy_u64(size_bytes, ScanCompleteCopyMode::kScalarCopy);
                     }});
    specs.push_back({"scan_complete_scalar_transform_u64",
                     "Scan-completeness: scalar read-transform-write scan over contiguous uint64_t arrays.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_copy_u64(size_bytes, ScanCompleteCopyMode::kScalarTransform);
                     }});
    specs.push_back({"scan_complete_avx2_copy_u64",
                     "Scan-completeness: AVX2 copy scan from one contiguous uint64_t array to another.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_copy_u64(size_bytes, ScanCompleteCopyMode::kAvx2Copy);
                     }});
    specs.push_back({"scan_complete_avx2_transform_u64",
                     "Scan-completeness: AVX2 read-transform-write scan over contiguous uint64_t arrays.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_copy_u64(size_bytes, ScanCompleteCopyMode::kAvx2Transform);
                     }});
    specs.push_back({"scan_complete_avx2_nt_copy_u64",
                     "Scan-completeness: AVX2 non-temporal store copy scan for write-heavy streaming output.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_copy_u64(size_bytes, ScanCompleteCopyMode::kAvx2NonTemporalCopy);
                     }});
    specs.push_back({"scan_complete_cold_thrash_sum_u64",
                     "Scan-completeness: scalar uint64_t sum after touching a large cache-thrashing buffer.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_scan_complete_cold_thrash_u64(size_bytes);
                     }});

    specs.push_back({"simd_scalar_add_f32",
                     "Scalar float vector add with vectorization disabled.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_add_benchmark(size_bytes, AddMode::kScalarNoVector);
                     }});
    specs.push_back({"simd_auto_add_f32",
                     "Plain float vector add for compiler auto-vectorization.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_add_benchmark(size_bytes, AddMode::kAutoVectorized);
                     }});
    specs.push_back({"simd_avx2_add_aligned_f32",
                     "Manual AVX2 float vector add with aligned loads and stores.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_add_benchmark(size_bytes, AddMode::kAvx2Aligned);
                     }});
    specs.push_back({"simd_avx2_add_unaligned_f32",
                     "Manual AVX2 float vector add with intentionally misaligned memory.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_add_benchmark(size_bytes, AddMode::kAvx2Unaligned);
                     }});
    specs.push_back({"simd_scalar_aos_hot_sum_f32",
                     "Scalar hot-field sum over 64-byte records stored as AoS.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_particle_hot_sum_benchmark(size_bytes, HotSumMode::kScalarAos);
                     }});
    specs.push_back({"simd_scalar_soa_hot_sum_f32",
                     "Scalar hot-field sum over the same logical records stored as SoA.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_particle_hot_sum_benchmark(size_bytes, HotSumMode::kScalarSoa);
                     }});
    specs.push_back({"simd_avx2_aos_gather_hot_sum_f32",
                     "AVX2 hot-field sum over AoS records using gather loads.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_particle_hot_sum_benchmark(size_bytes, HotSumMode::kAvx2AosGather);
                     }});
    specs.push_back({"simd_avx2_soa_hot_sum_f32",
                     "AVX2 hot-field sum over SoA records using contiguous loads.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_particle_hot_sum_benchmark(size_bytes, HotSumMode::kAvx2Soa);
                     }});
    specs.push_back({"simd_scalar_array_lower_bound_i32",
                     "Scalar lower_bound queries over a sorted contiguous int array.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_search_ladder_benchmark(size_bytes, SearchLadderMode::kScalarArray);
                     }});
    specs.push_back({"simd_scalar_btree16_lower_bound_i32",
                     "Scalar lower_bound queries over a cache-aware 16-way B-tree layout.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_search_ladder_benchmark(size_bytes, SearchLadderMode::kScalarBTree);
                     }});
    specs.push_back({"simd_avx2_array_lower_bound_i32",
                     "SIMD lower_bound queries over a sorted contiguous int array using batched gather steps.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_search_ladder_benchmark(size_bytes, SearchLadderMode::kAvx2Array);
                     }});
    specs.push_back({"simd_avx2_btree16_lower_bound_i32",
                     "SIMD-assisted lower_bound queries over a 16-way B-tree layout using vector compares inside nodes.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_search_ladder_benchmark(size_bytes, SearchLadderMode::kAvx2BTree);
                     }});
    specs.push_back({"simd_scalar_aos_histogram_u8",
                     "Scalar histogram over bounded keys embedded in 64-byte AoS records.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_histogram_ladder_benchmark(size_bytes, HistogramLadderMode::kScalarAos);
                     }});
    specs.push_back({"simd_scalar_soa_histogram_u8",
                     "Scalar histogram over the same keys stored contiguously.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_histogram_ladder_benchmark(size_bytes, HistogramLadderMode::kScalarSoa);
                     }});
    specs.push_back({"simd_avx2_aos_histogram_u8",
                     "SIMD-assisted histogram over AoS records using gathered key loads.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_histogram_ladder_benchmark(size_bytes, HistogramLadderMode::kAvx2Aos);
                     }});
    specs.push_back({"simd_avx2_soa_histogram_u8",
                     "SIMD-assisted histogram over contiguous keys using vector loads.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_histogram_ladder_benchmark(size_bytes, HistogramLadderMode::kAvx2Soa);
                     }});
    specs.push_back({"simd_scalar_aos_prefix_f32",
                     "Scalar prefix scan over a hot float field embedded in 64-byte AoS records.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_wide_prefix_benchmark(size_bytes, WidePrefixMode::kScalarAos);
                     }});
    specs.push_back({"simd_scalar_soa_prefix_f32",
                     "Scalar prefix scan over the same float field stored contiguously.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_wide_prefix_benchmark(size_bytes, WidePrefixMode::kScalarSoa);
                     }});
    specs.push_back({"simd_avx2_aos_prefix_f32",
                     "AVX2 prefix scan over AoS records using gathered hot-field loads.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_wide_prefix_benchmark(size_bytes, WidePrefixMode::kAvx2Aos);
                     }});
    specs.push_back({"simd_avx2_soa_prefix_f32",
                     "AVX2 prefix scan over contiguous hot-field storage.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_wide_prefix_benchmark(size_bytes, WidePrefixMode::kAvx2Soa);
                     }});
    specs.push_back({"simd_scalar_adjlist_edge_sum_f32",
                     "Scalar graph edge traversal over adjacency-list AoS edges.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_graph_edge_benchmark(size_bytes, GraphEdgeMode::kScalarAdjList);
                     }});
    specs.push_back({"simd_scalar_csr_edge_sum_f32",
                     "Scalar graph edge traversal over CSR edge arrays.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_graph_edge_benchmark(size_bytes, GraphEdgeMode::kScalarCsr);
                     }});
    specs.push_back({"simd_avx2_adjlist_edge_sum_f32",
                     "AVX2 graph edge traversal over adjacency-list AoS edges using gathered weights.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_graph_edge_benchmark(size_bytes, GraphEdgeMode::kAvx2AdjList);
                     }});
    specs.push_back({"simd_avx2_csr_edge_sum_f32",
                     "AVX2 graph edge traversal over CSR weights using contiguous loads.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_graph_edge_benchmark(size_bytes, GraphEdgeMode::kAvx2Csr);
                     }});
    specs.push_back({"simd_scalar_aos_blocksort8_u32",
                     "Scalar direct sort of 8-key blocks stored inside 64-byte AoS records.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_blocksort8_benchmark(size_bytes, BlockSortMode::kScalarAos);
                     }});
    specs.push_back({"simd_scalar_soa_blocksort8_u32",
                     "Scalar sort of 8-key blocks stored contiguously.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_blocksort8_benchmark(size_bytes, BlockSortMode::kScalarSoa);
                     }});
    specs.push_back({"simd_avx2_aos_blocksort8_u32",
                     "SIMD-assisted block sort over AoS records using gathered key loads.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_blocksort8_benchmark(size_bytes, BlockSortMode::kAvx2Aos);
                     }});
    specs.push_back({"simd_avx2_soa_blocksort8_u32",
                     "SIMD-assisted block sort over contiguous keys using vector loads.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_blocksort8_benchmark(size_bytes, BlockSortMode::kAvx2Soa);
                     }});
    specs.push_back({"simd_scalar_gather_sum_i32",
                     "Scalar sum over randomly gathered int32 elements.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_gather_benchmark(size_bytes, GatherMode::kScalar);
                     }});
    specs.push_back({"simd_avx2_gather_sum_i32",
                     "AVX2 gather sum over randomly indexed int32 elements.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_gather_benchmark(size_bytes, GatherMode::kAvx2);
                     }});
    specs.push_back({"simd_scalar_sum_i32",
                     "Scalar int32 reduction with vectorization disabled.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_sum_benchmark(size_bytes, ReductionMode::kScalar);
                     }});
    specs.push_back({"simd_avx2_sum_i32",
                     "AVX2 int32 reduction using one vector accumulator.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_sum_benchmark(size_bytes, ReductionMode::kAvx2);
                     }});
    specs.push_back({"simd_avx2_sum_i32_4acc",
                     "AVX2 int32 reduction with four independent accumulators.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_sum_benchmark(size_bytes, ReductionMode::kAvx2FourAccumulators);
                     }});
    specs.push_back({"simd_branch_filter_sum_i32",
                     "Scalar branch-based sum of values below a threshold.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_filter_benchmark(size_bytes, FilterMode::kBranch);
                     }});
    specs.push_back({"simd_branchless_filter_sum_i32",
                     "Scalar branchless thresholded sum using predication.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_filter_benchmark(size_bytes, FilterMode::kBranchless);
                     }});
    specs.push_back({"simd_avx2_mask_filter_sum_i32",
                     "AVX2 masked thresholded sum using compare-and-mask.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_filter_benchmark(size_bytes, FilterMode::kAvx2Mask);
                     }});
    specs.push_back({"simd_scalar_popcount_builtin",
                     "Scalar popcount using the CPU scalar popcount path.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_popcount_benchmark(size_bytes, ShuffleMode::kBuiltinPopcount);
                     }});
    specs.push_back({"simd_scalar_popcount_lut",
                     "Scalar popcount using a byte lookup table.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_popcount_benchmark(size_bytes, ShuffleMode::kScalarLut);
                     }});
    specs.push_back({"simd_avx2_shuffle_popcount",
                     "AVX2 popcount using nibble lookup via shuffle instructions.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_popcount_benchmark(size_bytes, ShuffleMode::kAvx2Shuffle);
                     }});
    specs.push_back({"simd_scalar_argmin_i32",
                     "Scalar argmin over random int32 values.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_argmin_benchmark(size_bytes, ArgminMode::kScalar);
                     }});
    specs.push_back({"simd_avx2_argmin_blend_i32",
                     "AVX2 argmin that tracks vector minima and indices with blending.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_argmin_benchmark(size_bytes, ArgminMode::kAvx2Blend);
                     }});
    specs.push_back({"simd_avx2_argmin_rescan_i32",
                     "AVX2 argmin that rescans only when a lane beats the current minimum.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_argmin_benchmark(size_bytes, ArgminMode::kAvx2Rescan);
                     }});
    specs.push_back({"simd_scalar_prefix_i32",
                     "Scalar inclusive prefix sum over int32 values.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_prefix_benchmark(size_bytes, PrefixMode::kScalar);
                     }});
    specs.push_back({"simd_avx2_prefix_i32",
                     "AVX2 inclusive prefix sum using in-register lane scans plus carry propagation.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_prefix_benchmark(size_bytes, PrefixMode::kAvx2);
                     }});
    specs.push_back({"simd_scalar_matmul_naive",
                     "Naive scalar matrix multiply with i-j-k loop order.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_matmul_benchmark(size_bytes, MatmulMode::kScalarNaive);
                     }});
    specs.push_back({"simd_scalar_matmul_transposed",
                     "Scalar matrix multiply against a transposed copy of B.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_matmul_benchmark(size_bytes, MatmulMode::kScalarTransposed);
                     }});
    specs.push_back({"simd_avx2_matmul_gather",
                     "Naive-layout AVX2 matrix multiply using gather loads from B columns.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_matmul_benchmark(size_bytes, MatmulMode::kAvx2Gather);
                     }});
    specs.push_back({"simd_avx2_matmul_blocked",
                     "Blocked AVX2 outer-product matrix multiply.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_matmul_benchmark(size_bytes, MatmulMode::kAvx2Blocked);
                     }});
}

}  // namespace cachebench
