#include "benchmark.hpp"

#include <algorithm>
#include <array>
#include <bit>
#include <cmath>
#include <cstddef>
#include <cstring>
#include <cstdint>
#include <deque>
#include <immintrin.h>
#include <limits>
#include <memory>
#include <numeric>
#include <stdexcept>
#include <vector>

#ifdef _MSC_VER
#include <intrin.h>
#endif

namespace cachebench {

namespace {

constexpr std::uint64_t kSequenceTargetBytes = 64ULL * 1024ULL * 1024ULL;
constexpr std::uint64_t kTransposeTargetBytes = 32ULL * 1024ULL * 1024ULL;
constexpr std::size_t kBatchLanes = 8;
constexpr std::size_t kColdFieldCount = 15;
constexpr std::size_t kAlignmentBytes = 64;
constexpr std::size_t kWindowWidth = 32;
constexpr std::size_t kTransposeBlock = 32;

#if defined(_MSC_VER)
#define CACHEBENCH_NOINLINE __declspec(noinline)
#define CACHEBENCH_NO_VECTOR_LOOP __pragma(loop(no_vector))
#define CACHEBENCH_NOVECTOR
#elif defined(__clang__)
#define CACHEBENCH_NOINLINE __attribute__((noinline))
#define CACHEBENCH_NO_VECTOR_LOOP _Pragma("clang loop vectorize(disable)")
#define CACHEBENCH_NOVECTOR
#elif defined(__GNUC__)
#define CACHEBENCH_NOINLINE __attribute__((noinline))
#define CACHEBENCH_NO_VECTOR_LOOP
#define CACHEBENCH_NOVECTOR __attribute__((optimize("no-tree-vectorize")))
#else
#define CACHEBENCH_NOINLINE
#define CACHEBENCH_NO_VECTOR_LOOP
#define CACHEBENCH_NOVECTOR
#endif

template <typename T>
class AlignedBuffer {
public:
    explicit AlignedBuffer(std::size_t size, std::size_t alignment = kAlignmentBytes)
        : size_(size),
          data_(static_cast<T*>(_mm_malloc(sizeof(T) * size, alignment))) {
        if (data_ == nullptr) {
            throw std::bad_alloc();
        }
    }

    ~AlignedBuffer() {
        _mm_free(data_);
    }

    AlignedBuffer(const AlignedBuffer&) = delete;
    AlignedBuffer& operator=(const AlignedBuffer&) = delete;

    T* data() {
        return data_;
    }

    const T* data() const {
        return data_;
    }

    T& operator[](std::size_t index) {
        return data_[index];
    }

    const T& operator[](std::size_t index) const {
        return data_[index];
    }

    std::size_t size() const {
        return size_;
    }

private:
    std::size_t size_ = 0;
    T* data_ = nullptr;
};

template <typename T>
struct WideRecord {
    T value{};
    std::array<T, kColdFieldCount> cold{};
};

static_assert(sizeof(WideRecord<float>) == 64, "Wide float record should occupy one cache line");
static_assert(sizeof(WideRecord<std::uint32_t>) == 64, "Wide uint32 record should occupy one cache line");

template <typename T>
struct SequenceDataset {
    std::size_t length = 0;
    std::size_t total_values = 0;
    std::size_t logical_size_bytes = 0;
    std::shared_ptr<AlignedBuffer<WideRecord<T>>> aos;
    std::shared_ptr<AlignedBuffer<T>> lane_major_hot;
    std::shared_ptr<AlignedBuffer<T>> lane_major_cold;
    std::shared_ptr<AlignedBuffer<T>> interleaved_hot;
    std::shared_ptr<AlignedBuffer<T>> interleaved_cold;
};

enum class StockMode {
    kScalarAos,
    kScalarSoa,
    kSimdAos,
    kSimdSoa,
};

enum class MaxSubarrayMode {
    kScalarAos,
    kScalarSoa,
    kSimdAos,
    kSimdSoa,
};

enum class TrappingRainMode {
    kScalarAos,
    kScalarSoa,
    kSimdAos,
    kSimdSoa,
};

enum class SlidingWindowMode {
    kScalarAos,
    kScalarSoa,
    kSimdAos,
    kSimdSoa,
};

enum class ProductExceptSelfMode {
    kScalarAos,
    kScalarSoa,
    kSimdAos,
    kSimdSoa,
};

enum class TransposeMode {
    kScalarNaive,
    kScalarBlocked,
    kSimdNaive,
    kSimdBlocked,
};

enum class RotatedSearchMode {
    kScalarAos,
    kScalarSoa,
    kSimdAos,
    kSimdSoa,
};

enum class Sort012Mode {
    kScalarAos,
    kScalarSoa,
    kSimdAos,
    kSimdSoa,
};

enum class BfsMode {
    kScalarAdjList,
    kScalarCsr,
    kSimdAdjList,
    kSimdCsr,
};

struct RotatedSearchDataset {
    std::size_t length = 0;
    std::size_t query_count = 0;
    std::size_t logical_size_bytes = 0;
    std::shared_ptr<AlignedBuffer<WideRecord<std::uint32_t>>> aos;
    std::shared_ptr<AlignedBuffer<std::uint32_t>> lane_major_hot;
    std::shared_ptr<AlignedBuffer<std::uint32_t>> interleaved_hot;
    std::shared_ptr<AlignedBuffer<std::uint32_t>> lane_major_queries;
    std::shared_ptr<AlignedBuffer<std::uint32_t>> interleaved_queries;
};

struct Sort012Record {
    std::uint32_t value = 0;
    std::array<std::uint32_t, kColdFieldCount> cold{};
};

static_assert(sizeof(Sort012Record) == 64, "Sort012Record should occupy one cache line");

struct Sort012Dataset {
    std::size_t length = 0;
    std::size_t logical_size_bytes = 0;
    std::shared_ptr<AlignedBuffer<Sort012Record>> input_aos;
    std::shared_ptr<AlignedBuffer<std::uint32_t>> input_soa;
};

struct GraphEdgeRecord {
    std::uint32_t dst = 0;
    std::uint32_t cold0 = 0;
    std::uint32_t cold1 = 0;
    std::uint32_t cold2 = 0;
};

static_assert(sizeof(GraphEdgeRecord) == 16, "GraphEdgeRecord should stay compact");

struct GraphAdjList {
    std::vector<std::vector<GraphEdgeRecord>> edges;
};

struct GraphCsr {
    std::vector<std::uint32_t> offsets;
    std::vector<std::uint32_t> edges;
};

constexpr std::size_t kDsaGraphDegree = 8;

std::uint64_t seeded_value(std::size_t index) {
    constexpr std::uint64_t kMix = 0x9E3779B97F4A7C15ULL;
    return (static_cast<std::uint64_t>(index) + 1ULL) * kMix;
}

float seeded_float(std::size_t index, float scale) {
    const std::uint32_t value = static_cast<std::uint32_t>((seeded_value(index) >> 17) & 0xFFFFULL);
    return static_cast<float>(value % 4096U) / scale;
}

bool cpu_supports_avx2() {
#if defined(_MSC_VER) && defined(_M_X64)
    int info[4] = {};
    __cpuid(info, 1);
    const bool osxsave = (info[2] & (1 << 27)) != 0;
    const bool avx = (info[2] & (1 << 28)) != 0;
    if (!osxsave || !avx) {
        return false;
    }
    const unsigned long long xcr0 = _xgetbv(0);
    if ((xcr0 & 0x6ULL) != 0x6ULL) {
        return false;
    }

    __cpuidex(info, 7, 0);
    return (info[1] & (1 << 5)) != 0;
#elif (defined(__GNUC__) || defined(__clang__)) && defined(__x86_64__)
    return __builtin_cpu_supports("avx2");
#else
    return false;
#endif
}

template <typename T>
T sample_checksum(const T* data, std::size_t elements) {
    if (elements == 0) {
        return T{};
    }
    const std::size_t step = std::max<std::size_t>(1, elements / 8);
    T sum{};
    for (std::size_t index = 0; index < elements; index += step) {
        sum += data[index];
    }
    sum += data[elements - 1];
    return sum;
}

std::uint64_t checksum_from_float_buffer(const float* data, std::size_t elements) {
    if (elements == 0) {
        return 0;
    }
    const std::size_t step = std::max<std::size_t>(1, elements / 8);
    std::uint64_t sum = 0;
    for (std::size_t index = 0; index < elements; index += step) {
        sum += static_cast<std::uint64_t>(std::bit_cast<std::uint32_t>(data[index]));
    }
    sum += static_cast<std::uint64_t>(std::bit_cast<std::uint32_t>(data[elements - 1]));
    return sum;
}

float horizontal_sum_ps(__m256 value) {
    alignas(32) std::array<float, kBatchLanes> lanes{};
    _mm256_store_ps(lanes.data(), value);
    return std::accumulate(lanes.begin(), lanes.end(), 0.0f);
}

std::uint64_t horizontal_sum_u32(__m256i value) {
    alignas(32) std::array<std::uint32_t, kBatchLanes> lanes{};
    _mm256_store_si256(reinterpret_cast<__m256i*>(lanes.data()), value);
    return std::accumulate(lanes.begin(), lanes.end(), 0ULL);
}

__m256i setr_epi32_lanes() {
    return _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
}

__m256i cmp_lt_epi32(__m256i lhs, __m256i rhs) {
    return _mm256_cmpgt_epi32(rhs, lhs);
}

__m256i cmp_le_epi32(__m256i lhs, __m256i rhs) {
    return _mm256_or_si256(_mm256_cmpgt_epi32(rhs, lhs), _mm256_cmpeq_epi32(lhs, rhs));
}

int mask_bits(__m256i mask) {
    return _mm256_movemask_ps(_mm256_castsi256_ps(mask));
}

template <typename T, typename Generator>
SequenceDataset<T> build_sequence_dataset(std::size_t requested_size_bytes,
                                          std::size_t min_length,
                                          Generator&& generator) {
    const std::size_t records = std::max<std::size_t>(
        kBatchLanes * min_length,
        requested_size_bytes / sizeof(WideRecord<T>));
    const std::size_t length = std::max<std::size_t>(min_length, records / kBatchLanes);
    const std::size_t total_values = length * kBatchLanes;

    SequenceDataset<T> dataset;
    dataset.length = length;
    dataset.total_values = total_values;
    dataset.logical_size_bytes = total_values * sizeof(WideRecord<T>);
    dataset.aos = std::make_shared<AlignedBuffer<WideRecord<T>>>(total_values);
    dataset.lane_major_hot = std::make_shared<AlignedBuffer<T>>(total_values);
    dataset.lane_major_cold = std::make_shared<AlignedBuffer<T>>(total_values * kColdFieldCount);
    dataset.interleaved_hot = std::make_shared<AlignedBuffer<T>>(total_values);
    dataset.interleaved_cold = std::make_shared<AlignedBuffer<T>>(total_values * kColdFieldCount);

    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        for (std::size_t index = 0; index < length; ++index) {
            const T value = generator(lane, index);
            const std::size_t lane_major_index = lane * length + index;
            const std::size_t interleaved_index = index * kBatchLanes + lane;
            (*dataset.aos)[lane_major_index].value = value;
            for (std::size_t cold_index = 0; cold_index < kColdFieldCount; ++cold_index) {
                if constexpr (std::is_same_v<T, float>) {
                    (*dataset.aos)[lane_major_index].cold[cold_index] =
                        seeded_float(lane_major_index * kColdFieldCount + cold_index, 251.0f);
                    (*dataset.lane_major_cold)[cold_index * total_values + lane_major_index] =
                        seeded_float(cold_index * total_values + lane_major_index, 211.0f);
                    (*dataset.interleaved_cold)[cold_index * total_values + interleaved_index] =
                        seeded_float(cold_index * total_values + interleaved_index, 199.0f);
                } else {
                    (*dataset.aos)[lane_major_index].cold[cold_index] = static_cast<T>(
                        (seeded_value(lane_major_index * kColdFieldCount + cold_index) & 0xFFULL) + 1ULL);
                    (*dataset.lane_major_cold)[cold_index * total_values + lane_major_index] = static_cast<T>(
                        (seeded_value(cold_index * total_values + lane_major_index) & 0xFFULL) + 1ULL);
                    (*dataset.interleaved_cold)[cold_index * total_values + interleaved_index] = static_cast<T>(
                        (seeded_value(cold_index * total_values + interleaved_index) & 0xFFULL) + 1ULL);
                }
            }
            (*dataset.lane_major_hot)[lane_major_index] = value;
            (*dataset.interleaved_hot)[interleaved_index] = value;
        }
    }

    return dataset;
}

float stock_price_value(std::size_t lane, std::size_t index) {
    const float x = static_cast<float>(index);
    return 64.0f + static_cast<float>(lane) * 2.5f + 0.04f * x +
           9.0f * std::sin(0.031f * x + static_cast<float>(lane) * 0.7f) +
           4.0f * std::cos(0.073f * x + static_cast<float>(lane) * 0.3f);
}

float max_subarray_value(std::size_t lane, std::size_t index) {
    const float x = static_cast<float>(index);
    return 8.0f * std::sin(0.11f * x + static_cast<float>(lane) * 0.5f) +
           4.0f * std::cos(0.049f * x + static_cast<float>(lane) * 0.2f) - 1.5f;
}

float height_value(std::size_t lane, std::size_t index) {
    const float x = static_cast<float>(index);
    return 6.0f + 4.0f * std::sin(0.087f * x + static_cast<float>(lane) * 0.4f) +
           3.0f * std::cos(0.031f * x + static_cast<float>(lane) * 0.8f);
}

float window_value(std::size_t lane, std::size_t index) {
    const float x = static_cast<float>(index);
    return 2.0f + 6.0f * std::sin(0.057f * x + static_cast<float>(lane) * 0.6f) +
           1.5f * std::cos(0.121f * x + static_cast<float>(lane) * 0.2f);
}

std::uint32_t product_value(std::size_t lane, std::size_t index) {
    return static_cast<std::uint32_t>(((lane * 11 + index * 7) % 13) + 2);
}

RotatedSearchDataset build_rotated_search_dataset(std::size_t requested_size_bytes) {
    const std::size_t records = std::max<std::size_t>(
        kBatchLanes * 256,
        requested_size_bytes / sizeof(WideRecord<std::uint32_t>));
    const std::size_t length = std::max<std::size_t>(256, records / kBatchLanes);
    const std::size_t total_values = length * kBatchLanes;
    const std::size_t query_count = std::max<std::size_t>(128, length / 8);

    RotatedSearchDataset dataset;
    dataset.length = length;
    dataset.query_count = query_count;
    dataset.logical_size_bytes = total_values * sizeof(WideRecord<std::uint32_t>);
    dataset.aos = std::make_shared<AlignedBuffer<WideRecord<std::uint32_t>>>(total_values);
    dataset.lane_major_hot = std::make_shared<AlignedBuffer<std::uint32_t>>(total_values);
    dataset.interleaved_hot = std::make_shared<AlignedBuffer<std::uint32_t>>(total_values);
    dataset.lane_major_queries = std::make_shared<AlignedBuffer<std::uint32_t>>(query_count * kBatchLanes);
    dataset.interleaved_queries = std::make_shared<AlignedBuffer<std::uint32_t>>(query_count * kBatchLanes);

    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        const std::uint32_t base = static_cast<std::uint32_t>(lane * 1'000'003U + 17U);
        const std::size_t rotation = (length / 5 + lane * 37) % length;
        for (std::size_t index = 0; index < length; ++index) {
            const std::uint32_t sorted_index = static_cast<std::uint32_t>((index + rotation) % length);
            const std::uint32_t value = base + sorted_index * 3U;
            const std::size_t lane_major_index = lane * length + index;
            const std::size_t interleaved_index = index * kBatchLanes + lane;
            (*dataset.aos)[lane_major_index].value = value;
            for (std::size_t cold = 0; cold < kColdFieldCount; ++cold) {
                (*dataset.aos)[lane_major_index].cold[cold] = static_cast<std::uint32_t>(
                    seeded_value(lane_major_index * kColdFieldCount + cold) & 0xFFFFULL);
            }
            (*dataset.lane_major_hot)[lane_major_index] = value;
            (*dataset.interleaved_hot)[interleaved_index] = value;
        }
        for (std::size_t query = 0; query < query_count; ++query) {
            const bool present = (query & 3U) != 3U;
            const std::uint32_t value =
                present ? base + static_cast<std::uint32_t>(((query * 13U) + lane * 11U) % length) * 3U
                        : base + static_cast<std::uint32_t>(length * 3U + query * 5U + lane);
            (*dataset.lane_major_queries)[lane * query_count + query] = value;
            (*dataset.interleaved_queries)[query * kBatchLanes + lane] = value;
        }
    }

    return dataset;
}

Sort012Dataset build_sort012_dataset(std::size_t requested_size_bytes) {
    const std::size_t length = std::max<std::size_t>(512, requested_size_bytes / sizeof(Sort012Record));
    Sort012Dataset dataset;
    dataset.length = length;
    dataset.logical_size_bytes = length * sizeof(Sort012Record);
    dataset.input_aos = std::make_shared<AlignedBuffer<Sort012Record>>(length);
    dataset.input_soa = std::make_shared<AlignedBuffer<std::uint32_t>>(length);

    for (std::size_t index = 0; index < length; ++index) {
        const std::uint32_t value = static_cast<std::uint32_t>((seeded_value(index) + index * 5ULL) % 3ULL);
        (*dataset.input_aos)[index].value = value;
        for (std::size_t cold = 0; cold < kColdFieldCount; ++cold) {
            (*dataset.input_aos)[index].cold[cold] = static_cast<std::uint32_t>(
                seeded_value(index * kColdFieldCount + cold + 19) & 0xFFFFFFFFULL);
        }
        (*dataset.input_soa)[index] = value;
    }

    return dataset;
}

std::shared_ptr<GraphAdjList> build_dsa_graph_adjlist(std::size_t vertices) {
    auto graph = std::make_shared<GraphAdjList>();
    graph->edges.resize(vertices);
    for (std::size_t vertex = 0; vertex < vertices; ++vertex) {
        auto& out = graph->edges[vertex];
        out.reserve(kDsaGraphDegree);
        for (std::size_t edge = 0; edge < kDsaGraphDegree; ++edge) {
            GraphEdgeRecord record{};
            const std::size_t jump = 1 + edge * 13 + (vertex % 17);
            record.dst = static_cast<std::uint32_t>((vertex + jump) % vertices);
            record.cold0 = static_cast<std::uint32_t>(seeded_value(vertex * kDsaGraphDegree + edge) & 0xFFFFULL);
            record.cold1 = static_cast<std::uint32_t>(seeded_value(vertex * kDsaGraphDegree + edge + 3) & 0xFFFFULL);
            record.cold2 = static_cast<std::uint32_t>(seeded_value(vertex * kDsaGraphDegree + edge + 5) & 0xFFFFULL);
            out.push_back(record);
        }
    }
    return graph;
}

std::shared_ptr<GraphCsr> build_dsa_graph_csr(const GraphAdjList& adjlist) {
    auto graph = std::make_shared<GraphCsr>();
    const std::size_t vertices = adjlist.edges.size();
    graph->offsets.resize(vertices + 1, 0U);
    graph->edges.reserve(vertices * kDsaGraphDegree);
    for (std::size_t vertex = 0; vertex < vertices; ++vertex) {
        graph->offsets[vertex] = static_cast<std::uint32_t>(graph->edges.size());
        for (const GraphEdgeRecord& edge : adjlist.edges[vertex]) {
            graph->edges.push_back(edge.dst);
        }
    }
    graph->offsets[vertices] = static_cast<std::uint32_t>(graph->edges.size());
    return graph;
}

__m256 gather_hot_values(const WideRecord<float>* aos, std::size_t length, std::size_t index) {
    const float* base = reinterpret_cast<const float*>(aos);
    return _mm256_i32gather_ps(
        base,
        _mm256_set_epi32(static_cast<int>(((7 * length + index) * 16)),
                         static_cast<int>(((6 * length + index) * 16)),
                         static_cast<int>(((5 * length + index) * 16)),
                         static_cast<int>(((4 * length + index) * 16)),
                         static_cast<int>(((3 * length + index) * 16)),
                         static_cast<int>(((2 * length + index) * 16)),
                         static_cast<int>(((1 * length + index) * 16)),
                         static_cast<int>(((0 * length + index) * 16))),
        4);
}

__m256i gather_hot_u32_values(const WideRecord<std::uint32_t>* aos, std::size_t length, std::size_t index) {
    const int* base = reinterpret_cast<const int*>(aos);
    return _mm256_i32gather_epi32(
        base,
        _mm256_set_epi32(static_cast<int>(((7 * length + index) * 16)),
                         static_cast<int>(((6 * length + index) * 16)),
                         static_cast<int>(((5 * length + index) * 16)),
                         static_cast<int>(((4 * length + index) * 16)),
                         static_cast<int>(((3 * length + index) * 16)),
                         static_cast<int>(((2 * length + index) * 16)),
                         static_cast<int>(((1 * length + index) * 16)),
                         static_cast<int>(((0 * length + index) * 16))),
        4);
}

__m256i gather_rotated_aos_values(const WideRecord<std::uint32_t>* aos, std::size_t length, __m256i logical_indices) {
    alignas(32) std::array<std::int32_t, kBatchLanes> indices{};
    _mm256_store_si256(reinterpret_cast<__m256i*>(indices.data()), logical_indices);
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        indices[lane] = indices[lane] * 16 + static_cast<std::int32_t>(lane * length * 16);
    }
    return _mm256_i32gather_epi32(
        reinterpret_cast<const int*>(aos),
        _mm256_load_si256(reinterpret_cast<const __m256i*>(indices.data())),
        4);
}

__m256i gather_rotated_soa_values(const std::uint32_t* interleaved_hot, __m256i logical_indices) {
    alignas(32) std::array<std::int32_t, kBatchLanes> indices{};
    _mm256_store_si256(reinterpret_cast<__m256i*>(indices.data()), logical_indices);
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        indices[lane] = indices[lane] * static_cast<std::int32_t>(kBatchLanes) + static_cast<std::int32_t>(lane);
    }
    return _mm256_i32gather_epi32(
        reinterpret_cast<const int*>(interleaved_hot),
        _mm256_load_si256(reinterpret_cast<const __m256i*>(indices.data())),
        4);
}

std::uint64_t sample_sort012_record_values(const Sort012Record* values, std::size_t elements) {
    if (elements == 0) {
        return 0;
    }
    const std::size_t step = std::max<std::size_t>(1, elements / 8);
    std::uint64_t sum = 0;
    for (std::size_t index = 0; index < elements; index += step) {
        sum += values[index].value;
    }
    sum += values[elements - 1].value;
    return sum;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_stock_profit_aos(const WideRecord<float>* aos,
                                                                      std::size_t length) {
    float total = 0.0f;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        float min_price = std::numeric_limits<float>::infinity();
        float best = 0.0f;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = 0; index < length; ++index) {
            const float price = aos[lane * length + index].value;
            min_price = std::min(min_price, price);
            best = std::max(best, price - min_price);
        }
        total += best;
    }
    return total;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_stock_profit_soa(const float* hot, std::size_t length) {
    float total = 0.0f;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        const float* values = hot + lane * length;
        float min_price = std::numeric_limits<float>::infinity();
        float best = 0.0f;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = 0; index < length; ++index) {
            min_price = std::min(min_price, values[index]);
            best = std::max(best, values[index] - min_price);
        }
        total += best;
    }
    return total;
}

CACHEBENCH_NOINLINE float simd_stock_profit_aos(const WideRecord<float>* aos, std::size_t length) {
    __m256 min_values = _mm256_set1_ps(std::numeric_limits<float>::infinity());
    __m256 best_values = _mm256_setzero_ps();
    for (std::size_t index = 0; index < length; ++index) {
        const __m256 prices = gather_hot_values(aos, length, index);
        min_values = _mm256_min_ps(min_values, prices);
        best_values = _mm256_max_ps(best_values, _mm256_sub_ps(prices, min_values));
    }
    return horizontal_sum_ps(best_values);
}

CACHEBENCH_NOINLINE float simd_stock_profit_soa(const float* hot, std::size_t length) {
    __m256 min_values = _mm256_set1_ps(std::numeric_limits<float>::infinity());
    __m256 best_values = _mm256_setzero_ps();
    for (std::size_t index = 0; index < length; ++index) {
        const __m256 prices = _mm256_load_ps(hot + index * kBatchLanes);
        min_values = _mm256_min_ps(min_values, prices);
        best_values = _mm256_max_ps(best_values, _mm256_sub_ps(prices, min_values));
    }
    return horizontal_sum_ps(best_values);
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_max_subarray_aos(const WideRecord<float>* aos,
                                                                       std::size_t length) {
    float total = 0.0f;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        float current = aos[lane * length].value;
        float best = current;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = 1; index < length; ++index) {
            const float value = aos[lane * length + index].value;
            current = std::max(value, current + value);
            best = std::max(best, current);
        }
        total += best;
    }
    return total;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_max_subarray_soa(const float* hot, std::size_t length) {
    float total = 0.0f;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        const float* values = hot + lane * length;
        float current = values[0];
        float best = current;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = 1; index < length; ++index) {
            current = std::max(values[index], current + values[index]);
            best = std::max(best, current);
        }
        total += best;
    }
    return total;
}

CACHEBENCH_NOINLINE float simd_max_subarray_aos(const WideRecord<float>* aos, std::size_t length) {
    __m256 current = gather_hot_values(aos, length, 0);
    __m256 best = current;
    for (std::size_t index = 1; index < length; ++index) {
        const __m256 values = gather_hot_values(aos, length, index);
        current = _mm256_max_ps(values, _mm256_add_ps(current, values));
        best = _mm256_max_ps(best, current);
    }
    return horizontal_sum_ps(best);
}

CACHEBENCH_NOINLINE float simd_max_subarray_soa(const float* hot, std::size_t length) {
    __m256 current = _mm256_load_ps(hot);
    __m256 best = current;
    for (std::size_t index = 1; index < length; ++index) {
        const __m256 values = _mm256_load_ps(hot + index * kBatchLanes);
        current = _mm256_max_ps(values, _mm256_add_ps(current, values));
        best = _mm256_max_ps(best, current);
    }
    return horizontal_sum_ps(best);
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_trapping_rain_aos(const WideRecord<float>* aos,
                                                                        std::size_t length) {
    std::vector<float> left(length);
    float total = 0.0f;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        const WideRecord<float>* values = aos + lane * length;
        float left_max = values[0].value;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = 0; index < length; ++index) {
            left_max = std::max(left_max, values[index].value);
            left[index] = left_max;
        }
        float right_max = values[length - 1].value;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = length; index-- > 0;) {
            right_max = std::max(right_max, values[index].value);
            total += std::max(0.0f, std::min(left[index], right_max) - values[index].value);
        }
    }
    return total;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_trapping_rain_soa(const float* hot, std::size_t length) {
    std::vector<float> left(length);
    float total = 0.0f;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        const float* values = hot + lane * length;
        float left_max = values[0];
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = 0; index < length; ++index) {
            left_max = std::max(left_max, values[index]);
            left[index] = left_max;
        }
        float right_max = values[length - 1];
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = length; index-- > 0;) {
            right_max = std::max(right_max, values[index]);
            total += std::max(0.0f, std::min(left[index], right_max) - values[index]);
        }
    }
    return total;
}

CACHEBENCH_NOINLINE float simd_trapping_rain_aos(const WideRecord<float>* aos,
                                                 std::size_t length,
                                                 float* left_temp) {
    __m256 left_max = _mm256_set1_ps(std::numeric_limits<float>::lowest());
    for (std::size_t index = 0; index < length; ++index) {
        left_max = _mm256_max_ps(left_max, gather_hot_values(aos, length, index));
        _mm256_store_ps(left_temp + index * kBatchLanes, left_max);
    }

    __m256 right_max = _mm256_set1_ps(std::numeric_limits<float>::lowest());
    __m256 total = _mm256_setzero_ps();
    for (std::size_t index = length; index-- > 0;) {
        const __m256 values = gather_hot_values(aos, length, index);
        right_max = _mm256_max_ps(right_max, values);
        const __m256 left_values = _mm256_load_ps(left_temp + index * kBatchLanes);
        const __m256 water = _mm256_sub_ps(_mm256_min_ps(left_values, right_max), values);
        total = _mm256_add_ps(total, _mm256_max_ps(water, _mm256_setzero_ps()));
    }
    return horizontal_sum_ps(total);
}

CACHEBENCH_NOINLINE float simd_trapping_rain_soa(const float* hot,
                                                 std::size_t length,
                                                 float* left_temp) {
    __m256 left_max = _mm256_set1_ps(std::numeric_limits<float>::lowest());
    for (std::size_t index = 0; index < length; ++index) {
        const __m256 values = _mm256_load_ps(hot + index * kBatchLanes);
        left_max = _mm256_max_ps(left_max, values);
        _mm256_store_ps(left_temp + index * kBatchLanes, left_max);
    }

    __m256 right_max = _mm256_set1_ps(std::numeric_limits<float>::lowest());
    __m256 total = _mm256_setzero_ps();
    for (std::size_t index = length; index-- > 0;) {
        const __m256 values = _mm256_load_ps(hot + index * kBatchLanes);
        right_max = _mm256_max_ps(right_max, values);
        const __m256 left_values = _mm256_load_ps(left_temp + index * kBatchLanes);
        const __m256 water = _mm256_sub_ps(_mm256_min_ps(left_values, right_max), values);
        total = _mm256_add_ps(total, _mm256_max_ps(water, _mm256_setzero_ps()));
    }
    return horizontal_sum_ps(total);
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_sliding_max_aos(const WideRecord<float>* aos,
                                                                      std::size_t length) {
    float total = 0.0f;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        const WideRecord<float>* values = aos + lane * length;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t start = 0; start + kWindowWidth <= length; ++start) {
            float window_max = values[start].value;
            CACHEBENCH_NO_VECTOR_LOOP
            for (std::size_t offset = 1; offset < kWindowWidth; ++offset) {
                window_max = std::max(window_max, values[start + offset].value);
            }
            total += window_max;
        }
    }
    return total;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_sliding_max_soa(const float* hot, std::size_t length) {
    float total = 0.0f;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        const float* values = hot + lane * length;
        std::deque<std::size_t> deque;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = 0; index < length; ++index) {
            while (!deque.empty() && deque.front() + kWindowWidth <= index) {
                deque.pop_front();
            }
            while (!deque.empty() && values[deque.back()] <= values[index]) {
                deque.pop_back();
            }
            deque.push_back(index);
            if (index + 1 >= kWindowWidth) {
                total += values[deque.front()];
            }
        }
    }
    return total;
}

CACHEBENCH_NOINLINE float simd_sliding_max_aos(const WideRecord<float>* aos, std::size_t length) {
    __m256 total = _mm256_setzero_ps();
    for (std::size_t start = 0; start + kWindowWidth <= length; ++start) {
        __m256 window_max = gather_hot_values(aos, length, start);
        for (std::size_t offset = 1; offset < kWindowWidth; ++offset) {
            window_max = _mm256_max_ps(window_max, gather_hot_values(aos, length, start + offset));
        }
        total = _mm256_add_ps(total, window_max);
    }
    return horizontal_sum_ps(total);
}

CACHEBENCH_NOINLINE float simd_sliding_max_soa(const float* hot,
                                               std::size_t length,
                                               float* left_temp,
                                               float* right_temp) {
    __m256 block_left = _mm256_setzero_ps();
    for (std::size_t index = 0; index < length; ++index) {
        const __m256 values = _mm256_load_ps(hot + index * kBatchLanes);
        if (index % kWindowWidth == 0) {
            block_left = values;
        } else {
            block_left = _mm256_max_ps(block_left, values);
        }
        _mm256_store_ps(left_temp + index * kBatchLanes, block_left);
    }

    __m256 block_right = _mm256_setzero_ps();
    for (std::size_t reverse = length; reverse-- > 0;) {
        const __m256 values = _mm256_load_ps(hot + reverse * kBatchLanes);
        if (reverse == length - 1 || ((reverse + 1) % kWindowWidth == 0)) {
            block_right = values;
        } else {
            block_right = _mm256_max_ps(block_right, values);
        }
        _mm256_store_ps(right_temp + reverse * kBatchLanes, block_right);
    }

    __m256 total = _mm256_setzero_ps();
    for (std::size_t start = 0; start + kWindowWidth <= length; ++start) {
        const __m256 left_value = _mm256_load_ps(left_temp + (start + kWindowWidth - 1) * kBatchLanes);
        const __m256 right_value = _mm256_load_ps(right_temp + start * kBatchLanes);
        total = _mm256_add_ps(total, _mm256_max_ps(left_value, right_value));
    }
    return horizontal_sum_ps(total);
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scalar_product_except_self_aos(
    const WideRecord<std::uint32_t>* aos,
    std::size_t length) {
    std::vector<std::uint32_t> work(length);
    std::uint64_t total = 0;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        const WideRecord<std::uint32_t>* values = aos + lane * length;
        std::uint32_t prefix = 1U;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = 0; index < length; ++index) {
            work[index] = prefix;
            prefix *= values[index].value;
        }
        std::uint32_t suffix = 1U;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = length; index-- > 0;) {
            work[index] *= suffix;
            suffix *= values[index].value;
        }
        total += sample_checksum(work.data(), length);
    }
    return total;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scalar_product_except_self_soa(const std::uint32_t* hot,
                                                                                      std::size_t length) {
    std::vector<std::uint32_t> work(length);
    std::uint64_t total = 0;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        const std::uint32_t* values = hot + lane * length;
        std::uint32_t prefix = 1U;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = 0; index < length; ++index) {
            work[index] = prefix;
            prefix *= values[index];
        }
        std::uint32_t suffix = 1U;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = length; index-- > 0;) {
            work[index] *= suffix;
            suffix *= values[index];
        }
        total += sample_checksum(work.data(), length);
    }
    return total;
}

CACHEBENCH_NOINLINE std::uint64_t simd_product_except_self_aos(const WideRecord<std::uint32_t>* aos,
                                                               std::size_t length,
                                                               std::uint32_t* temp) {
    __m256i prefix = _mm256_set1_epi32(1);
    for (std::size_t index = 0; index < length; ++index) {
        _mm256_store_si256(reinterpret_cast<__m256i*>(temp + index * kBatchLanes), prefix);
        prefix = _mm256_mullo_epi32(prefix, gather_hot_u32_values(aos, length, index));
    }

    __m256i suffix = _mm256_set1_epi32(1);
    for (std::size_t index = length; index-- > 0;) {
        const __m256i prefix_values = _mm256_load_si256(reinterpret_cast<const __m256i*>(temp + index * kBatchLanes));
        const __m256i output = _mm256_mullo_epi32(prefix_values, suffix);
        _mm256_store_si256(reinterpret_cast<__m256i*>(temp + index * kBatchLanes), output);
        suffix = _mm256_mullo_epi32(suffix, gather_hot_u32_values(aos, length, index));
    }
    return sample_checksum(temp, length * kBatchLanes);
}

CACHEBENCH_NOINLINE std::uint64_t simd_product_except_self_soa(const std::uint32_t* hot,
                                                               std::size_t length,
                                                               std::uint32_t* temp) {
    __m256i prefix = _mm256_set1_epi32(1);
    for (std::size_t index = 0; index < length; ++index) {
        _mm256_store_si256(reinterpret_cast<__m256i*>(temp + index * kBatchLanes), prefix);
        prefix = _mm256_mullo_epi32(prefix, _mm256_load_si256(reinterpret_cast<const __m256i*>(hot + index * kBatchLanes)));
    }

    __m256i suffix = _mm256_set1_epi32(1);
    for (std::size_t index = length; index-- > 0;) {
        const __m256i prefix_values = _mm256_load_si256(reinterpret_cast<const __m256i*>(temp + index * kBatchLanes));
        const __m256i output = _mm256_mullo_epi32(prefix_values, suffix);
        _mm256_store_si256(reinterpret_cast<__m256i*>(temp + index * kBatchLanes), output);
        suffix = _mm256_mullo_epi32(suffix, _mm256_load_si256(reinterpret_cast<const __m256i*>(hot + index * kBatchLanes)));
    }
    return sample_checksum(temp, length * kBatchLanes);
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void transpose_scalar_naive(const float* src,
                                                                    float* dst,
                                                                    std::size_t dim) {
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t row = 0; row < dim; ++row) {
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t col = 0; col < dim; ++col) {
            dst[col * dim + row] = src[row * dim + col];
        }
    }
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void transpose_scalar_blocked(const float* src,
                                                                      float* dst,
                                                                      std::size_t dim) {
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t row_block = 0; row_block < dim; row_block += kTransposeBlock) {
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t col_block = 0; col_block < dim; col_block += kTransposeBlock) {
            const std::size_t row_limit = std::min(dim, row_block + kTransposeBlock);
            const std::size_t col_limit = std::min(dim, col_block + kTransposeBlock);
            CACHEBENCH_NO_VECTOR_LOOP
            for (std::size_t row = row_block; row < row_limit; ++row) {
                CACHEBENCH_NO_VECTOR_LOOP
                for (std::size_t col = col_block; col < col_limit; ++col) {
                    dst[col * dim + row] = src[row * dim + col];
                }
            }
        }
    }
}

void transpose4x4_simd(const float* src, std::size_t src_stride, float* dst, std::size_t dst_stride) {
    __m128 row0 = _mm_loadu_ps(src + 0 * src_stride);
    __m128 row1 = _mm_loadu_ps(src + 1 * src_stride);
    __m128 row2 = _mm_loadu_ps(src + 2 * src_stride);
    __m128 row3 = _mm_loadu_ps(src + 3 * src_stride);
    _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
    _mm_storeu_ps(dst + 0 * dst_stride, row0);
    _mm_storeu_ps(dst + 1 * dst_stride, row1);
    _mm_storeu_ps(dst + 2 * dst_stride, row2);
    _mm_storeu_ps(dst + 3 * dst_stride, row3);
}

CACHEBENCH_NOINLINE void transpose_simd_naive(const float* src, float* dst, std::size_t dim) {
    const std::size_t simd_dim = dim - (dim % 4);
    for (std::size_t row = 0; row < simd_dim; row += 4) {
        for (std::size_t col = 0; col < simd_dim; col += 4) {
            transpose4x4_simd(src + row * dim + col, dim, dst + col * dim + row, dim);
        }
        for (std::size_t col = simd_dim; col < dim; ++col) {
            for (std::size_t lane = 0; lane < 4 && row + lane < dim; ++lane) {
                dst[col * dim + (row + lane)] = src[(row + lane) * dim + col];
            }
        }
    }
    for (std::size_t row = simd_dim; row < dim; ++row) {
        for (std::size_t col = 0; col < dim; ++col) {
            dst[col * dim + row] = src[row * dim + col];
        }
    }
}

CACHEBENCH_NOINLINE void transpose_simd_blocked(const float* src, float* dst, std::size_t dim) {
    const std::size_t simd_dim = dim - (dim % 4);
    for (std::size_t row_block = 0; row_block < dim; row_block += kTransposeBlock) {
        for (std::size_t col_block = 0; col_block < dim; col_block += kTransposeBlock) {
            const std::size_t row_limit = std::min(dim, row_block + kTransposeBlock);
            const std::size_t col_limit = std::min(dim, col_block + kTransposeBlock);
            std::size_t row = row_block;
            for (; row + 4 <= row_limit && row < simd_dim; row += 4) {
                std::size_t col = col_block;
                for (; col + 4 <= col_limit && col < simd_dim; col += 4) {
                    transpose4x4_simd(src + row * dim + col, dim, dst + col * dim + row, dim);
                }
                for (; col < col_limit; ++col) {
                    for (std::size_t lane = 0; lane < 4; ++lane) {
                        dst[col * dim + (row + lane)] = src[(row + lane) * dim + col];
                    }
                }
            }
            for (; row < row_limit; ++row) {
                for (std::size_t col = col_block; col < col_limit; ++col) {
                    dst[col * dim + row] = src[row * dim + col];
                }
            }
        }
    }
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR int scalar_rotated_search_aos_value(
    const WideRecord<std::uint32_t>* values,
    std::size_t length,
    std::uint32_t target) {
    int left = 0;
    int right = static_cast<int>(length) - 1;
    while (left <= right) {
        const int mid = left + (right - left) / 2;
        const std::uint32_t left_value = values[left].value;
        const std::uint32_t mid_value = values[mid].value;
        const std::uint32_t right_value = values[right].value;
        if (mid_value == target) {
            return mid;
        }
        if (left_value <= mid_value) {
            if (left_value <= target && target < mid_value) {
                right = mid - 1;
            } else {
                left = mid + 1;
            }
        } else {
            if (mid_value < target && target <= right_value) {
                left = mid + 1;
            } else {
                right = mid - 1;
            }
        }
    }
    return -1;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR int scalar_rotated_search_soa_value(
    const std::uint32_t* values,
    std::size_t length,
    std::uint32_t target) {
    int left = 0;
    int right = static_cast<int>(length) - 1;
    while (left <= right) {
        const int mid = left + (right - left) / 2;
        const std::uint32_t left_value = values[left];
        const std::uint32_t mid_value = values[mid];
        const std::uint32_t right_value = values[right];
        if (mid_value == target) {
            return mid;
        }
        if (left_value <= mid_value) {
            if (left_value <= target && target < mid_value) {
                right = mid - 1;
            } else {
                left = mid + 1;
            }
        } else {
            if (mid_value < target && target <= right_value) {
                left = mid + 1;
            } else {
                right = mid - 1;
            }
        }
    }
    return -1;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scalar_rotated_search_aos(
    const WideRecord<std::uint32_t>* aos,
    const std::uint32_t* queries,
    std::size_t length,
    std::size_t query_count) {
    std::uint64_t sum = 0;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        const WideRecord<std::uint32_t>* values = aos + lane * length;
        const std::uint32_t* lane_queries = queries + lane * query_count;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t query = 0; query < query_count; ++query) {
            sum += static_cast<std::uint32_t>(scalar_rotated_search_aos_value(values, length, lane_queries[query]) + 1);
        }
    }
    return sum;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scalar_rotated_search_soa(
    const std::uint32_t* hot,
    const std::uint32_t* queries,
    std::size_t length,
    std::size_t query_count) {
    std::uint64_t sum = 0;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        const std::uint32_t* values = hot + lane * length;
        const std::uint32_t* lane_queries = queries + lane * query_count;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t query = 0; query < query_count; ++query) {
            sum += static_cast<std::uint32_t>(scalar_rotated_search_soa_value(values, length, lane_queries[query]) + 1);
        }
    }
    return sum;
}

template <typename GatherFn>
std::uint64_t simd_rotated_search_common(const std::uint32_t* interleaved_queries,
                                         std::size_t length,
                                         std::size_t query_count,
                                         GatherFn&& gather_values) {
    const __m256i zero = _mm256_setzero_si256();
    const __m256i one = _mm256_set1_epi32(1);
    const __m256i all_bits = _mm256_set1_epi32(-1);
    const __m256i max_index = _mm256_set1_epi32(static_cast<int>(length - 1));
    const std::size_t max_iterations = std::bit_width(length) + 2;
    std::uint64_t sum = 0;
    alignas(32) std::array<std::int32_t, kBatchLanes> result_lanes{};

    for (std::size_t query = 0; query < query_count; ++query) {
        const __m256i target =
            _mm256_load_si256(reinterpret_cast<const __m256i*>(interleaved_queries + query * kBatchLanes));
        __m256i left = zero;
        __m256i right = max_index;
        __m256i result = _mm256_set1_epi32(-1);
        __m256i active = all_bits;

        for (std::size_t iteration = 0; iteration < max_iterations && mask_bits(active) != 0; ++iteration) {
            const __m256i safe_left = _mm256_blendv_epi8(zero, left, active);
            const __m256i safe_right = _mm256_blendv_epi8(zero, right, active);
            const __m256i mid = _mm256_srli_epi32(_mm256_add_epi32(safe_left, safe_right), 1);
            const __m256i mid_values = gather_values(mid);
            const __m256i equal_mask = _mm256_and_si256(active, _mm256_cmpeq_epi32(mid_values, target));
            result = _mm256_blendv_epi8(result, mid, equal_mask);
            active = _mm256_andnot_si256(equal_mask, active);
            if (mask_bits(active) == 0) {
                break;
            }

            const __m256i left_values = gather_values(safe_left);
            const __m256i right_values = gather_values(safe_right);
            const __m256i left_sorted = cmp_le_epi32(left_values, mid_values);
            const __m256i target_in_left =
                _mm256_and_si256(cmp_le_epi32(left_values, target), cmp_lt_epi32(target, mid_values));
            const __m256i target_in_right =
                _mm256_and_si256(cmp_lt_epi32(mid_values, target), cmp_le_epi32(target, right_values));
            const __m256i go_right = _mm256_and_si256(
                active,
                _mm256_or_si256(
                    _mm256_and_si256(left_sorted, _mm256_andnot_si256(target_in_left, all_bits)),
                    _mm256_andnot_si256(left_sorted, target_in_right)));
            const __m256i go_left = _mm256_and_si256(active, _mm256_andnot_si256(go_right, all_bits));

            left = _mm256_blendv_epi8(left, _mm256_add_epi32(mid, one), go_right);
            right = _mm256_blendv_epi8(right, _mm256_sub_epi32(mid, one), go_left);
            active = _mm256_and_si256(active, cmp_le_epi32(left, right));
        }

        _mm256_store_si256(reinterpret_cast<__m256i*>(result_lanes.data()), result);
        for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
            sum += static_cast<std::uint32_t>(result_lanes[lane] + 1);
        }
    }

    return sum;
}

CACHEBENCH_NOINLINE std::uint64_t simd_rotated_search_aos(const WideRecord<std::uint32_t>* aos,
                                                          const std::uint32_t* interleaved_queries,
                                                          std::size_t length,
                                                          std::size_t query_count) {
    return simd_rotated_search_common(
        interleaved_queries,
        length,
        query_count,
        [aos, length](__m256i indices) { return gather_rotated_aos_values(aos, length, indices); });
}

CACHEBENCH_NOINLINE std::uint64_t simd_rotated_search_soa(const std::uint32_t* interleaved_hot,
                                                          const std::uint32_t* interleaved_queries,
                                                          std::size_t length,
                                                          std::size_t query_count) {
    return simd_rotated_search_common(
        interleaved_queries,
        length,
        query_count,
        [interleaved_hot](__m256i indices) { return gather_rotated_soa_values(interleaved_hot, indices); });
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void sort012_scalar_aos(Sort012Record* values, std::size_t length) {
    std::size_t low = 0;
    std::size_t mid = 0;
    std::size_t high = length - 1;
    while (mid <= high) {
        if (values[mid].value == 0U) {
            std::swap(values[low++], values[mid++]);
        } else if (values[mid].value == 1U) {
            ++mid;
        } else {
            std::swap(values[mid], values[high--]);
        }
    }
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t sort012_scalar_soa(const std::uint32_t* input,
                                                                         std::uint32_t* output,
                                                                         std::size_t length) {
    std::array<std::size_t, 3> counts{};
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t index = 0; index < length; ++index) {
        ++counts[input[index]];
    }
    std::fill_n(output, counts[0], 0U);
    std::fill_n(output + counts[0], counts[1], 1U);
    std::fill_n(output + counts[0] + counts[1], counts[2], 2U);
    return sample_checksum(output, length);
}

CACHEBENCH_NOINLINE std::uint64_t sort012_simd_aos(const Sort012Record* input,
                                                   std::uint32_t* output,
                                                   std::size_t length) {
    constexpr std::int32_t kStrideWords = static_cast<std::int32_t>(sizeof(Sort012Record) / sizeof(std::uint32_t));
    const int* base = reinterpret_cast<const int*>(input);
    const __m256i zero = _mm256_setzero_si256();
    const __m256i one = _mm256_set1_epi32(1);
    const __m256i two = _mm256_set1_epi32(2);
    std::array<std::size_t, 3> counts{};
    std::size_t index = 0;
    for (; index + kBatchLanes <= length; index += kBatchLanes) {
        const __m256i gather_indices = _mm256_setr_epi32(static_cast<int>((index + 0) * kStrideWords),
                                                         static_cast<int>((index + 1) * kStrideWords),
                                                         static_cast<int>((index + 2) * kStrideWords),
                                                         static_cast<int>((index + 3) * kStrideWords),
                                                         static_cast<int>((index + 4) * kStrideWords),
                                                         static_cast<int>((index + 5) * kStrideWords),
                                                         static_cast<int>((index + 6) * kStrideWords),
                                                         static_cast<int>((index + 7) * kStrideWords));
        const __m256i values = _mm256_i32gather_epi32(base, gather_indices, sizeof(std::uint32_t));
        counts[0] += static_cast<std::size_t>(std::popcount(static_cast<unsigned>(mask_bits(_mm256_cmpeq_epi32(values, zero)))));
        counts[1] += static_cast<std::size_t>(std::popcount(static_cast<unsigned>(mask_bits(_mm256_cmpeq_epi32(values, one)))));
        counts[2] += static_cast<std::size_t>(std::popcount(static_cast<unsigned>(mask_bits(_mm256_cmpeq_epi32(values, two)))));
    }
    for (; index < length; ++index) {
        ++counts[input[index].value];
    }
    std::fill_n(output, counts[0], 0U);
    std::fill_n(output + counts[0], counts[1], 1U);
    std::fill_n(output + counts[0] + counts[1], counts[2], 2U);
    return sample_checksum(output, length);
}

CACHEBENCH_NOINLINE std::uint64_t sort012_simd_soa(const std::uint32_t* input,
                                                   std::uint32_t* output,
                                                   std::size_t length) {
    const __m256i zero = _mm256_setzero_si256();
    const __m256i one = _mm256_set1_epi32(1);
    const __m256i two = _mm256_set1_epi32(2);
    std::array<std::size_t, 3> counts{};
    std::size_t index = 0;
    for (; index + kBatchLanes <= length; index += kBatchLanes) {
        const __m256i values = _mm256_load_si256(reinterpret_cast<const __m256i*>(input + index));
        counts[0] += static_cast<std::size_t>(std::popcount(static_cast<unsigned>(mask_bits(_mm256_cmpeq_epi32(values, zero)))));
        counts[1] += static_cast<std::size_t>(std::popcount(static_cast<unsigned>(mask_bits(_mm256_cmpeq_epi32(values, one)))));
        counts[2] += static_cast<std::size_t>(std::popcount(static_cast<unsigned>(mask_bits(_mm256_cmpeq_epi32(values, two)))));
    }
    for (; index < length; ++index) {
        ++counts[input[index]];
    }
    std::fill_n(output, counts[0], 0U);
    std::fill_n(output + counts[0], counts[1], 1U);
    std::fill_n(output + counts[0] + counts[1], counts[2], 2U);
    return sample_checksum(output, length);
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t bfs_scalar_adjlist(const GraphAdjList& graph,
                                                                         std::size_t passes) {
    const std::size_t vertices = graph.edges.size();
    std::vector<std::uint32_t> queue(vertices);
    std::vector<int> visited(vertices, 0);
    std::uint64_t total = 0;
    for (std::size_t pass = 0; pass < passes; ++pass) {
        std::fill(visited.begin(), visited.end(), 0);
        const std::uint32_t source = static_cast<std::uint32_t>((pass * 17) % vertices);
        std::size_t head = 0;
        std::size_t tail = 0;
        visited[source] = 1;
        queue[tail++] = source;
        while (head < tail) {
            const std::uint32_t node = queue[head++];
            total += node;
            for (const GraphEdgeRecord& edge : graph.edges[node]) {
                if (visited[edge.dst] == 0) {
                    visited[edge.dst] = 1;
                    queue[tail++] = edge.dst;
                }
            }
        }
        total += tail;
    }
    return total;
}

CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t bfs_scalar_csr(const GraphCsr& graph,
                                                                     std::size_t passes) {
    const std::size_t vertices = graph.offsets.size() - 1;
    std::vector<std::uint32_t> queue(vertices);
    std::vector<int> visited(vertices, 0);
    std::uint64_t total = 0;
    for (std::size_t pass = 0; pass < passes; ++pass) {
        std::fill(visited.begin(), visited.end(), 0);
        const std::uint32_t source = static_cast<std::uint32_t>((pass * 17) % vertices);
        std::size_t head = 0;
        std::size_t tail = 0;
        visited[source] = 1;
        queue[tail++] = source;
        while (head < tail) {
            const std::uint32_t node = queue[head++];
            total += node;
            for (std::uint32_t edge = graph.offsets[node]; edge < graph.offsets[node + 1]; ++edge) {
                const std::uint32_t next = graph.edges[edge];
                if (visited[next] == 0) {
                    visited[next] = 1;
                    queue[tail++] = next;
                }
            }
        }
        total += tail;
    }
    return total;
}

CACHEBENCH_NOINLINE std::uint64_t bfs_simd_adjlist(const GraphAdjList& graph, std::size_t passes) {
    const std::size_t vertices = graph.edges.size();
    std::vector<std::uint32_t> queue(vertices);
    std::vector<int> visited(vertices, 0);
    std::uint64_t total = 0;
    alignas(32) std::array<std::uint32_t, kBatchLanes> dst_lanes{};
    constexpr std::int32_t kStrideWords = static_cast<std::int32_t>(sizeof(GraphEdgeRecord) / sizeof(std::uint32_t));
    const __m256i zero = _mm256_setzero_si256();

    for (std::size_t pass = 0; pass < passes; ++pass) {
        std::fill(visited.begin(), visited.end(), 0);
        const std::uint32_t source = static_cast<std::uint32_t>((pass * 17) % vertices);
        std::size_t head = 0;
        std::size_t tail = 0;
        visited[source] = 1;
        queue[tail++] = source;
        while (head < tail) {
            const std::uint32_t node = queue[head++];
            total += node;
            const auto& edges = graph.edges[node];
            const int* base = reinterpret_cast<const int*>(edges.data());
            std::size_t edge = 0;
            for (; edge + kBatchLanes <= edges.size(); edge += kBatchLanes) {
                const __m256i edge_indices = _mm256_setr_epi32(static_cast<int>((edge + 0) * kStrideWords),
                                                               static_cast<int>((edge + 1) * kStrideWords),
                                                               static_cast<int>((edge + 2) * kStrideWords),
                                                               static_cast<int>((edge + 3) * kStrideWords),
                                                               static_cast<int>((edge + 4) * kStrideWords),
                                                               static_cast<int>((edge + 5) * kStrideWords),
                                                               static_cast<int>((edge + 6) * kStrideWords),
                                                               static_cast<int>((edge + 7) * kStrideWords));
                const __m256i next_nodes =
                    _mm256_i32gather_epi32(base, edge_indices, sizeof(std::uint32_t));
                const __m256i seen =
                    _mm256_i32gather_epi32(visited.data(), next_nodes, static_cast<int>(sizeof(int)));
                _mm256_store_si256(reinterpret_cast<__m256i*>(dst_lanes.data()), next_nodes);
                const int unseen_mask = mask_bits(_mm256_cmpeq_epi32(seen, zero));
                for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
                    if (((unseen_mask >> lane) & 1) != 0 && visited[dst_lanes[lane]] == 0) {
                        visited[dst_lanes[lane]] = 1;
                        queue[tail++] = dst_lanes[lane];
                    }
                }
            }
            for (; edge < edges.size(); ++edge) {
                const std::uint32_t next = edges[edge].dst;
                if (visited[next] == 0) {
                    visited[next] = 1;
                    queue[tail++] = next;
                }
            }
        }
        total += tail;
    }
    return total;
}

CACHEBENCH_NOINLINE std::uint64_t bfs_simd_csr(const GraphCsr& graph, std::size_t passes) {
    const std::size_t vertices = graph.offsets.size() - 1;
    std::vector<std::uint32_t> queue(vertices);
    std::vector<int> visited(vertices, 0);
    std::uint64_t total = 0;
    alignas(32) std::array<std::uint32_t, kBatchLanes> dst_lanes{};
    const __m256i zero = _mm256_setzero_si256();

    for (std::size_t pass = 0; pass < passes; ++pass) {
        std::fill(visited.begin(), visited.end(), 0);
        const std::uint32_t source = static_cast<std::uint32_t>((pass * 17) % vertices);
        std::size_t head = 0;
        std::size_t tail = 0;
        visited[source] = 1;
        queue[tail++] = source;
        while (head < tail) {
            const std::uint32_t node = queue[head++];
            total += node;
            std::uint32_t edge = graph.offsets[node];
            const std::uint32_t edge_limit = graph.offsets[node + 1];
            for (; edge + kBatchLanes <= edge_limit; edge += static_cast<std::uint32_t>(kBatchLanes)) {
                const __m256i next_nodes =
                    _mm256_loadu_si256(reinterpret_cast<const __m256i*>(graph.edges.data() + edge));
                const __m256i seen =
                    _mm256_i32gather_epi32(visited.data(), next_nodes, static_cast<int>(sizeof(int)));
                _mm256_store_si256(reinterpret_cast<__m256i*>(dst_lanes.data()), next_nodes);
                const int unseen_mask = mask_bits(_mm256_cmpeq_epi32(seen, zero));
                for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
                    if (((unseen_mask >> lane) & 1) != 0 && visited[dst_lanes[lane]] == 0) {
                        visited[dst_lanes[lane]] = 1;
                        queue[tail++] = dst_lanes[lane];
                    }
                }
            }
            for (; edge < edge_limit; ++edge) {
                const std::uint32_t next = graph.edges[edge];
                if (visited[next] == 0) {
                    visited[next] = 1;
                    queue[tail++] = next;
                }
            }
        }
        total += tail;
    }
    return total;
}

BenchmarkInstance make_rotated_search_benchmark(std::size_t requested_size_bytes, RotatedSearchMode mode) {
    auto dataset = std::make_shared<RotatedSearchDataset>(build_rotated_search_dataset(requested_size_bytes));

    BenchmarkInstance instance;
    instance.actual_size_bytes = dataset->logical_size_bytes;
    instance.elements = dataset->query_count * kBatchLanes;
    instance.threads_used = 1;
    instance.bytes_per_pass = dataset->logical_size_bytes + dataset->query_count * kBatchLanes * sizeof(std::uint32_t);
    instance.target_bytes_per_trial = kSequenceTargetBytes;
    instance.run = [dataset, mode](std::size_t passes) {
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            switch (mode) {
                case RotatedSearchMode::kScalarAos:
                    sum += scalar_rotated_search_aos(
                        dataset->aos->data(), dataset->lane_major_queries->data(), dataset->length, dataset->query_count);
                    break;
                case RotatedSearchMode::kScalarSoa:
                    sum += scalar_rotated_search_soa(dataset->lane_major_hot->data(),
                                                     dataset->lane_major_queries->data(),
                                                     dataset->length,
                                                     dataset->query_count);
                    break;
                case RotatedSearchMode::kSimdAos:
                    sum += simd_rotated_search_aos(
                        dataset->aos->data(), dataset->interleaved_queries->data(), dataset->length, dataset->query_count);
                    break;
                case RotatedSearchMode::kSimdSoa:
                    sum += simd_rotated_search_soa(dataset->interleaved_hot->data(),
                                                   dataset->interleaved_queries->data(),
                                                   dataset->length,
                                                   dataset->query_count);
                    break;
            }
        }
        return sum;
    };
    return instance;
}

BenchmarkInstance make_sort012_benchmark(std::size_t requested_size_bytes, Sort012Mode mode) {
    auto dataset = std::make_shared<Sort012Dataset>(build_sort012_dataset(requested_size_bytes));
    auto work_aos = std::make_shared<AlignedBuffer<Sort012Record>>(dataset->length);
    auto output = std::make_shared<AlignedBuffer<std::uint32_t>>(dataset->length);

    BenchmarkInstance instance;
    instance.actual_size_bytes = dataset->logical_size_bytes;
    instance.elements = dataset->length;
    instance.threads_used = 1;
    instance.bytes_per_pass = dataset->logical_size_bytes * 2ULL;
    instance.target_bytes_per_trial = kSequenceTargetBytes;
    instance.run = [dataset, work_aos, output, mode](std::size_t passes) {
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            switch (mode) {
                case Sort012Mode::kScalarAos:
                    std::memcpy(work_aos->data(), dataset->input_aos->data(), dataset->length * sizeof(Sort012Record));
                    sort012_scalar_aos(work_aos->data(), dataset->length);
                    sum ^= sample_sort012_record_values(work_aos->data(), dataset->length);
                    break;
                case Sort012Mode::kScalarSoa:
                    sum ^= sort012_scalar_soa(dataset->input_soa->data(), output->data(), dataset->length);
                    break;
                case Sort012Mode::kSimdAos:
                    sum ^= sort012_simd_aos(dataset->input_aos->data(), output->data(), dataset->length);
                    break;
                case Sort012Mode::kSimdSoa:
                    sum ^= sort012_simd_soa(dataset->input_soa->data(), output->data(), dataset->length);
                    break;
            }
        }
        return sum;
    };
    return instance;
}

BenchmarkInstance make_bfs_benchmark(std::size_t requested_size_bytes, BfsMode mode) {
    const std::size_t vertices = std::max<std::size_t>(
        512,
        std::min<std::size_t>(requested_size_bytes / (kDsaGraphDegree * sizeof(GraphEdgeRecord)), 1ULL << 17));
    auto adjlist = build_dsa_graph_adjlist(vertices);
    auto csr = build_dsa_graph_csr(*adjlist);
    const std::size_t logical_size_bytes =
        adjlist->edges.size() * kDsaGraphDegree * sizeof(GraphEdgeRecord) + csr->offsets.size() * sizeof(std::uint32_t);

    BenchmarkInstance instance;
    instance.actual_size_bytes = logical_size_bytes;
    instance.elements = vertices;
    instance.threads_used = 1;
    instance.bytes_per_pass = logical_size_bytes;
    instance.target_bytes_per_trial = kSequenceTargetBytes / 2ULL;
    instance.run = [adjlist, csr, mode](std::size_t passes) {
        switch (mode) {
            case BfsMode::kScalarAdjList:
                return bfs_scalar_adjlist(*adjlist, passes);
            case BfsMode::kScalarCsr:
                return bfs_scalar_csr(*csr, passes);
            case BfsMode::kSimdAdjList:
                return bfs_simd_adjlist(*adjlist, passes);
            case BfsMode::kSimdCsr:
                return bfs_simd_csr(*csr, passes);
        }
        return std::uint64_t{0};
    };
    return instance;
}

BenchmarkInstance make_stock_profit_benchmark(std::size_t requested_size_bytes, StockMode mode) {
    auto dataset = std::make_shared<SequenceDataset<float>>(
        build_sequence_dataset<float>(requested_size_bytes, 256, stock_price_value));

    BenchmarkInstance instance;
    instance.actual_size_bytes = dataset->logical_size_bytes;
    instance.elements = dataset->total_values;
    instance.threads_used = 1;
    instance.bytes_per_pass = dataset->logical_size_bytes;
    instance.target_bytes_per_trial = kSequenceTargetBytes;
    instance.run = [dataset, mode](std::size_t passes) {
        float sum = 0.0f;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            switch (mode) {
                case StockMode::kScalarAos:
                    sum += scalar_stock_profit_aos(dataset->aos->data(), dataset->length);
                    break;
                case StockMode::kScalarSoa:
                    sum += scalar_stock_profit_soa(dataset->lane_major_hot->data(), dataset->length);
                    break;
                case StockMode::kSimdAos:
                    sum += simd_stock_profit_aos(dataset->aos->data(), dataset->length);
                    break;
                case StockMode::kSimdSoa:
                    sum += simd_stock_profit_soa(dataset->interleaved_hot->data(), dataset->length);
                    break;
            }
        }
        return static_cast<std::uint64_t>(std::bit_cast<std::uint32_t>(sum));
    };
    return instance;
}

BenchmarkInstance make_max_subarray_benchmark(std::size_t requested_size_bytes, MaxSubarrayMode mode) {
    auto dataset = std::make_shared<SequenceDataset<float>>(
        build_sequence_dataset<float>(requested_size_bytes, 256, max_subarray_value));

    BenchmarkInstance instance;
    instance.actual_size_bytes = dataset->logical_size_bytes;
    instance.elements = dataset->total_values;
    instance.threads_used = 1;
    instance.bytes_per_pass = dataset->logical_size_bytes;
    instance.target_bytes_per_trial = kSequenceTargetBytes;
    instance.run = [dataset, mode](std::size_t passes) {
        float sum = 0.0f;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            switch (mode) {
                case MaxSubarrayMode::kScalarAos:
                    sum += scalar_max_subarray_aos(dataset->aos->data(), dataset->length);
                    break;
                case MaxSubarrayMode::kScalarSoa:
                    sum += scalar_max_subarray_soa(dataset->lane_major_hot->data(), dataset->length);
                    break;
                case MaxSubarrayMode::kSimdAos:
                    sum += simd_max_subarray_aos(dataset->aos->data(), dataset->length);
                    break;
                case MaxSubarrayMode::kSimdSoa:
                    sum += simd_max_subarray_soa(dataset->interleaved_hot->data(), dataset->length);
                    break;
            }
        }
        return static_cast<std::uint64_t>(std::bit_cast<std::uint32_t>(sum));
    };
    return instance;
}

BenchmarkInstance make_trapping_rain_benchmark(std::size_t requested_size_bytes, TrappingRainMode mode) {
    auto dataset = std::make_shared<SequenceDataset<float>>(
        build_sequence_dataset<float>(requested_size_bytes, 256, height_value));
    auto left_temp = std::make_shared<AlignedBuffer<float>>(dataset->total_values);

    BenchmarkInstance instance;
    instance.actual_size_bytes = dataset->logical_size_bytes;
    instance.elements = dataset->total_values;
    instance.threads_used = 1;
    instance.bytes_per_pass = dataset->logical_size_bytes * 2ULL;
    instance.target_bytes_per_trial = kSequenceTargetBytes;
    instance.run = [dataset, left_temp, mode](std::size_t passes) {
        float sum = 0.0f;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            switch (mode) {
                case TrappingRainMode::kScalarAos:
                    sum += scalar_trapping_rain_aos(dataset->aos->data(), dataset->length);
                    break;
                case TrappingRainMode::kScalarSoa:
                    sum += scalar_trapping_rain_soa(dataset->lane_major_hot->data(), dataset->length);
                    break;
                case TrappingRainMode::kSimdAos:
                    sum += simd_trapping_rain_aos(dataset->aos->data(), dataset->length, left_temp->data());
                    break;
                case TrappingRainMode::kSimdSoa:
                    sum += simd_trapping_rain_soa(dataset->interleaved_hot->data(), dataset->length, left_temp->data());
                    break;
            }
        }
        return static_cast<std::uint64_t>(std::bit_cast<std::uint32_t>(sum));
    };
    return instance;
}

BenchmarkInstance make_sliding_window_benchmark(std::size_t requested_size_bytes, SlidingWindowMode mode) {
    auto dataset = std::make_shared<SequenceDataset<float>>(
        build_sequence_dataset<float>(requested_size_bytes, 256, window_value));
    auto left_temp = std::make_shared<AlignedBuffer<float>>(dataset->total_values);
    auto right_temp = std::make_shared<AlignedBuffer<float>>(dataset->total_values);

    BenchmarkInstance instance;
    instance.actual_size_bytes = dataset->logical_size_bytes;
    instance.elements = dataset->total_values;
    instance.threads_used = 1;
    instance.bytes_per_pass = dataset->logical_size_bytes * 2ULL;
    instance.target_bytes_per_trial = kSequenceTargetBytes;
    instance.run = [dataset, left_temp, right_temp, mode](std::size_t passes) {
        float sum = 0.0f;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            switch (mode) {
                case SlidingWindowMode::kScalarAos:
                    sum += scalar_sliding_max_aos(dataset->aos->data(), dataset->length);
                    break;
                case SlidingWindowMode::kScalarSoa:
                    sum += scalar_sliding_max_soa(dataset->lane_major_hot->data(), dataset->length);
                    break;
                case SlidingWindowMode::kSimdAos:
                    sum += simd_sliding_max_aos(dataset->aos->data(), dataset->length);
                    break;
                case SlidingWindowMode::kSimdSoa:
                    sum += simd_sliding_max_soa(
                        dataset->interleaved_hot->data(), dataset->length, left_temp->data(), right_temp->data());
                    break;
            }
        }
        return static_cast<std::uint64_t>(std::bit_cast<std::uint32_t>(sum));
    };
    return instance;
}

BenchmarkInstance make_product_except_self_benchmark(std::size_t requested_size_bytes, ProductExceptSelfMode mode) {
    auto dataset = std::make_shared<SequenceDataset<std::uint32_t>>(
        build_sequence_dataset<std::uint32_t>(requested_size_bytes, 256, product_value));
    auto temp = std::make_shared<AlignedBuffer<std::uint32_t>>(dataset->total_values);

    BenchmarkInstance instance;
    instance.actual_size_bytes = dataset->logical_size_bytes;
    instance.elements = dataset->total_values;
    instance.threads_used = 1;
    instance.bytes_per_pass = dataset->logical_size_bytes * 2ULL;
    instance.target_bytes_per_trial = kSequenceTargetBytes;
    instance.run = [dataset, temp, mode](std::size_t passes) {
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            switch (mode) {
                case ProductExceptSelfMode::kScalarAos:
                    sum ^= scalar_product_except_self_aos(dataset->aos->data(), dataset->length);
                    break;
                case ProductExceptSelfMode::kScalarSoa:
                    sum ^= scalar_product_except_self_soa(dataset->lane_major_hot->data(), dataset->length);
                    break;
                case ProductExceptSelfMode::kSimdAos:
                    sum ^= simd_product_except_self_aos(dataset->aos->data(), dataset->length, temp->data());
                    break;
                case ProductExceptSelfMode::kSimdSoa:
                    sum ^= simd_product_except_self_soa(dataset->interleaved_hot->data(), dataset->length, temp->data());
                    break;
            }
        }
        return sum;
    };
    return instance;
}

BenchmarkInstance make_transpose_benchmark(std::size_t requested_size_bytes, TransposeMode mode) {
    const std::size_t elements = std::max<std::size_t>(64, requested_size_bytes / (sizeof(float) * 2ULL));
    const std::size_t dim = std::max<std::size_t>(64, static_cast<std::size_t>(std::sqrt(static_cast<double>(elements))) & ~3ULL);
    const std::size_t matrix_elements = dim * dim;
    auto src = std::make_shared<AlignedBuffer<float>>(matrix_elements);
    auto dst = std::make_shared<AlignedBuffer<float>>(matrix_elements);

    for (std::size_t index = 0; index < matrix_elements; ++index) {
        (*src)[index] = seeded_float(index, 31.0f);
        (*dst)[index] = 0.0f;
    }

    BenchmarkInstance instance;
    instance.actual_size_bytes = matrix_elements * sizeof(float) * 2ULL;
    instance.elements = matrix_elements;
    instance.threads_used = 1;
    instance.bytes_per_pass = instance.actual_size_bytes;
    instance.target_bytes_per_trial = kTransposeTargetBytes;
    instance.run = [src, dst, dim, matrix_elements, mode](std::size_t passes) {
        std::uint64_t sum = 0;
        for (std::size_t pass = 0; pass < passes; ++pass) {
            switch (mode) {
                case TransposeMode::kScalarNaive:
                    transpose_scalar_naive(src->data(), dst->data(), dim);
                    break;
                case TransposeMode::kScalarBlocked:
                    transpose_scalar_blocked(src->data(), dst->data(), dim);
                    break;
                case TransposeMode::kSimdNaive:
                    transpose_simd_naive(src->data(), dst->data(), dim);
                    break;
                case TransposeMode::kSimdBlocked:
                    transpose_simd_blocked(src->data(), dst->data(), dim);
                    break;
            }
            sum ^= checksum_from_float_buffer(dst->data(), matrix_elements);
        }
        return sum;
    };
    return instance;
}

}  // namespace

void append_dsa_benchmarks(std::vector<BenchmarkSpec>& specs) {
    if (!cpu_supports_avx2()) {
        return;
    }

    specs.push_back({"dsa_scalar_stock_profit_aos",
                     "Best time to buy/sell stock on wide AoS price records.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_stock_profit_benchmark(size_bytes, StockMode::kScalarAos);
                     }});
    specs.push_back({"dsa_scalar_stock_profit_soa",
                     "Best time to buy/sell stock on cache-aware SoA price buffers.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_stock_profit_benchmark(size_bytes, StockMode::kScalarSoa);
                     }});
    specs.push_back({"dsa_simd_stock_profit_aos",
                     "Best time to buy/sell stock with SIMD over AoS price records.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_stock_profit_benchmark(size_bytes, StockMode::kSimdAos);
                     }});
    specs.push_back({"dsa_simd_stock_profit_soa",
                     "Best time to buy/sell stock with cache-aware SIMD layout.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_stock_profit_benchmark(size_bytes, StockMode::kSimdSoa);
                     }});

    specs.push_back({"dsa_scalar_max_subarray_aos",
                     "Kadane maximum subarray on wide AoS records.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_max_subarray_benchmark(size_bytes, MaxSubarrayMode::kScalarAos);
                     }});
    specs.push_back({"dsa_scalar_max_subarray_soa",
                     "Kadane maximum subarray on cache-aware SoA values.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_max_subarray_benchmark(size_bytes, MaxSubarrayMode::kScalarSoa);
                     }});
    specs.push_back({"dsa_simd_max_subarray_aos",
                     "Kadane maximum subarray with SIMD gathers over AoS values.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_max_subarray_benchmark(size_bytes, MaxSubarrayMode::kSimdAos);
                     }});
    specs.push_back({"dsa_simd_max_subarray_soa",
                     "Kadane maximum subarray with SIMD on interleaved SoA values.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_max_subarray_benchmark(size_bytes, MaxSubarrayMode::kSimdSoa);
                     }});

    specs.push_back({"dsa_scalar_trapping_rain_aos",
                     "Trapping rain water on wide AoS height records.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_trapping_rain_benchmark(size_bytes, TrappingRainMode::kScalarAos);
                     }});
    specs.push_back({"dsa_scalar_trapping_rain_soa",
                     "Trapping rain water on cache-aware SoA heights.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_trapping_rain_benchmark(size_bytes, TrappingRainMode::kScalarSoa);
                     }});
    specs.push_back({"dsa_simd_trapping_rain_aos",
                     "Trapping rain water with SIMD gathers over AoS heights.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_trapping_rain_benchmark(size_bytes, TrappingRainMode::kSimdAos);
                     }});
    specs.push_back({"dsa_simd_trapping_rain_soa",
                     "Trapping rain water with cache-aware SIMD layout.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_trapping_rain_benchmark(size_bytes, TrappingRainMode::kSimdSoa);
                     }});

    specs.push_back({"dsa_scalar_sliding_window_max_aos",
                     "Naive K-window maximum on wide AoS records.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_sliding_window_benchmark(size_bytes, SlidingWindowMode::kScalarAos);
                     }});
    specs.push_back({"dsa_scalar_sliding_window_max_soa",
                     "Deque-based K-window maximum on cache-aware SoA values.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_sliding_window_benchmark(size_bytes, SlidingWindowMode::kScalarSoa);
                     }});
    specs.push_back({"dsa_simd_sliding_window_max_aos",
                     "Naive K-window maximum with SIMD over AoS values.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_sliding_window_benchmark(size_bytes, SlidingWindowMode::kSimdAos);
                     }});
    specs.push_back({"dsa_simd_sliding_window_max_soa",
                     "Block-based K-window maximum with cache-aware SIMD layout.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_sliding_window_benchmark(size_bytes, SlidingWindowMode::kSimdSoa);
                     }});

    specs.push_back({"dsa_scalar_product_except_self_aos",
                     "Product-except-self on wide AoS integer records.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_product_except_self_benchmark(size_bytes, ProductExceptSelfMode::kScalarAos);
                     }});
    specs.push_back({"dsa_scalar_product_except_self_soa",
                     "Product-except-self on cache-aware SoA integer buffers.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_product_except_self_benchmark(size_bytes, ProductExceptSelfMode::kScalarSoa);
                     }});
    specs.push_back({"dsa_simd_product_except_self_aos",
                     "Product-except-self with SIMD gathers over AoS integer records.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_product_except_self_benchmark(size_bytes, ProductExceptSelfMode::kSimdAos);
                     }});
    specs.push_back({"dsa_simd_product_except_self_soa",
                     "Product-except-self with cache-aware SIMD layout.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_product_except_self_benchmark(size_bytes, ProductExceptSelfMode::kSimdSoa);
                     }});

    specs.push_back({"dsa_scalar_rotated_search_aos",
                     "Rotated sorted-array lookup on wide AoS records.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_rotated_search_benchmark(size_bytes, RotatedSearchMode::kScalarAos);
                     }});
    specs.push_back({"dsa_scalar_rotated_search_soa",
                     "Rotated sorted-array lookup on cache-aware SoA values.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_rotated_search_benchmark(size_bytes, RotatedSearchMode::kScalarSoa);
                     }});
    specs.push_back({"dsa_simd_rotated_search_aos",
                     "Rotated sorted-array lookup with SIMD gathers over AoS records.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_rotated_search_benchmark(size_bytes, RotatedSearchMode::kSimdAos);
                     }});
    specs.push_back({"dsa_simd_rotated_search_soa",
                     "Rotated sorted-array lookup with cache-aware SIMD layout.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_rotated_search_benchmark(size_bytes, RotatedSearchMode::kSimdSoa);
                     }});

    specs.push_back({"dsa_scalar_sort012_aos",
                     "Dutch-flag sort for 0/1/2 values stored in wide AoS records.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_sort012_benchmark(size_bytes, Sort012Mode::kScalarAos);
                     }});
    specs.push_back({"dsa_scalar_sort012_soa",
                     "Counting sort for 0/1/2 values stored contiguously.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_sort012_benchmark(size_bytes, Sort012Mode::kScalarSoa);
                     }});
    specs.push_back({"dsa_simd_sort012_aos",
                     "SIMD counting sort for 0/1/2 values gathered from AoS records.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_sort012_benchmark(size_bytes, Sort012Mode::kSimdAos);
                     }});
    specs.push_back({"dsa_simd_sort012_soa",
                     "SIMD counting sort for contiguous 0/1/2 values.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_sort012_benchmark(size_bytes, Sort012Mode::kSimdSoa);
                     }});

    specs.push_back({"dsa_scalar_bfs_adjlist",
                     "Breadth-first search over adjacency-list edge records.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_bfs_benchmark(size_bytes, BfsMode::kScalarAdjList);
                     }});
    specs.push_back({"dsa_scalar_bfs_csr",
                     "Breadth-first search over a cache-aware CSR layout.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_bfs_benchmark(size_bytes, BfsMode::kScalarCsr);
                     }});
    specs.push_back({"dsa_simd_bfs_adjlist",
                     "Breadth-first search with SIMD-assisted adjacency-list neighbor tests.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_bfs_benchmark(size_bytes, BfsMode::kSimdAdjList);
                     }});
    specs.push_back({"dsa_simd_bfs_csr",
                     "Breadth-first search with SIMD-assisted CSR neighbor tests.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_bfs_benchmark(size_bytes, BfsMode::kSimdCsr);
                     }});

    specs.push_back({"dsa_scalar_transpose_matrix",
                     "Naive transpose of a row-major matrix.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_transpose_benchmark(size_bytes, TransposeMode::kScalarNaive);
                     }});
    specs.push_back({"dsa_scalar_blocked_transpose_matrix",
                     "Blocked transpose of a row-major matrix.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_transpose_benchmark(size_bytes, TransposeMode::kScalarBlocked);
                     }});
    specs.push_back({"dsa_simd_transpose_matrix",
                     "SIMD tiled transpose of a row-major matrix.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_transpose_benchmark(size_bytes, TransposeMode::kSimdNaive);
                     }});
    specs.push_back({"dsa_simd_blocked_transpose_matrix",
                     "Blocked SIMD transpose of a row-major matrix.",
                     [](std::size_t size_bytes, std::size_t) {
                         return make_transpose_benchmark(size_bytes, TransposeMode::kSimdBlocked);
                     }});
}

}  // namespace cachebench
