# DSA Full Code Appendix

This appendix contains the measured core functions. The benchmark harness also includes dataset generation, warmup, checksums, CSV output, and CLI plumbing.

## Shared DSA support types

```cpp
template <typename T>
struct WideRecord {
    T value{};
    std::array<T, kColdFieldCount> cold{};
};

static_assert(sizeof(WideRecord<float>) == 64, "Wide float record should occupy one cache line");
static_assert(sizeof(WideRecord<std::uint32_t>) == 64, "Wide uint32 record should occupy one cache line");

template <typename T>
struct SequenceDataset {
    std::size_t length = 0;
    std::size_t total_values = 0;
    std::size_t logical_size_bytes = 0;
    std::shared_ptr<AlignedBuffer<WideRecord<T>>> aos;
    std::shared_ptr<AlignedBuffer<T>> lane_major_hot;
    std::shared_ptr<AlignedBuffer<T>> lane_major_cold;
    std::shared_ptr<AlignedBuffer<T>> interleaved_hot;
    std::shared_ptr<AlignedBuffer<T>> interleaved_cold;
};

enum class StockMode {
    kScalarAos,
    kScalarSoa,
    kSimdAos,
    kSimdSoa,
};

enum class MaxSubarrayMode {
    kScalarAos,
    kScalarSoa,
    kSimdAos,
    kSimdSoa,
};

enum class TrappingRainMode {
    kScalarAos,
    kScalarSoa,
    kSimdAos,
    kSimdSoa,
};

enum class SlidingWindowMode {
    kScalarAos,
    kScalarSoa,
    kSimdAos,
    kSimdSoa,
};

enum class ProductExceptSelfMode {
    kScalarAos,
    kScalarSoa,
    kSimdAos,
    kSimdSoa,
};

enum class TransposeMode {
    kScalarNaive,
    kScalarBlocked,
    kSimdNaive,
    kSimdBlocked,
};

enum class RotatedSearchMode {
    kScalarAos,
    kScalarSoa,
    kSimdAos,
    kSimdSoa,
};

enum class Sort012Mode {
    kScalarAos,
    kScalarSoa,
    kSimdAos,
    kSimdSoa,
};

enum class BfsMode {
    kScalarAdjList,
    kScalarCsr,
    kSimdAdjList,
    kSimdCsr,
};

struct RotatedSearchDataset {
    std::size_t length = 0;
    std::size_t query_count = 0;
    std::size_t logical_size_bytes = 0;
    std::shared_ptr<AlignedBuffer<WideRecord<std::uint32_t>>> aos;
    std::shared_ptr<AlignedBuffer<std::uint32_t>> lane_major_hot;
    std::shared_ptr<AlignedBuffer<std::uint32_t>> interleaved_hot;
    std::shared_ptr<AlignedBuffer<std::uint32_t>> lane_major_queries;
    std::shared_ptr<AlignedBuffer<std::uint32_t>> interleaved_queries;
};

struct Sort012Record {
    std::uint32_t value = 0;
    std::array<std::uint32_t, kColdFieldCount> cold{};
};

static_assert(sizeof(Sort012Record) == 64, "Sort012Record should occupy one cache line");

struct Sort012Dataset {
    std::size_t length = 0;
    std::size_t logical_size_bytes = 0;
    std::shared_ptr<AlignedBuffer<Sort012Record>> input_aos;
    std::shared_ptr<AlignedBuffer<std::uint32_t>> input_soa;
};

struct GraphEdgeRecord {
    std::uint32_t dst = 0;
    std::uint32_t cold0 = 0;
    std::uint32_t cold1 = 0;
    std::uint32_t cold2 = 0;
};

static_assert(sizeof(GraphEdgeRecord) == 16, "GraphEdgeRecord should stay compact");

struct GraphAdjList {
    std::vector<std::vector<GraphEdgeRecord>> edges;
};

struct GraphCsr {
    std::vector<std::uint32_t> offsets;
    std::vector<std::uint32_t> edges;
};
```

## Stock profit

```cpp
CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_stock_profit_aos(const WideRecord<float>* aos,
                                                                      std::size_t length) {
    float total = 0.0f;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        float min_price = std::numeric_limits<float>::infinity();
        float best = 0.0f;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = 0; index < length; ++index) {
            const float price = aos[lane * length + index].value;
            min_price = std::min(min_price, price);
            best = std::max(best, price - min_price);
        }
        total += best;
    }
    return total;
}



CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_stock_profit_soa(const float* hot, std::size_t length) {
    float total = 0.0f;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        const float* values = hot + lane * length;
        float min_price = std::numeric_limits<float>::infinity();
        float best = 0.0f;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = 0; index < length; ++index) {
            min_price = std::min(min_price, values[index]);
            best = std::max(best, values[index] - min_price);
        }
        total += best;
    }
    return total;
}



CACHEBENCH_NOINLINE float simd_stock_profit_aos(const WideRecord<float>* aos, std::size_t length) {
    __m256 min_values = _mm256_set1_ps(std::numeric_limits<float>::infinity());
    __m256 best_values = _mm256_setzero_ps();
    for (std::size_t index = 0; index < length; ++index) {
        const __m256 prices = gather_hot_values(aos, length, index);
        min_values = _mm256_min_ps(min_values, prices);
        best_values = _mm256_max_ps(best_values, _mm256_sub_ps(prices, min_values));
    }
    return horizontal_sum_ps(best_values);
}



CACHEBENCH_NOINLINE float simd_stock_profit_soa(const float* hot, std::size_t length) {
    __m256 min_values = _mm256_set1_ps(std::numeric_limits<float>::infinity());
    __m256 best_values = _mm256_setzero_ps();
    for (std::size_t index = 0; index < length; ++index) {
        const __m256 prices = _mm256_load_ps(hot + index * kBatchLanes);
        min_values = _mm256_min_ps(min_values, prices);
        best_values = _mm256_max_ps(best_values, _mm256_sub_ps(prices, min_values));
    }
    return horizontal_sum_ps(best_values);
}
```

## Maximum subarray

```cpp
CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_max_subarray_aos(const WideRecord<float>* aos,
                                                                       std::size_t length) {
    float total = 0.0f;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        float current = aos[lane * length].value;
        float best = current;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = 1; index < length; ++index) {
            const float value = aos[lane * length + index].value;
            current = std::max(value, current + value);
            best = std::max(best, current);
        }
        total += best;
    }
    return total;
}



CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_max_subarray_soa(const float* hot, std::size_t length) {
    float total = 0.0f;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        const float* values = hot + lane * length;
        float current = values[0];
        float best = current;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = 1; index < length; ++index) {
            current = std::max(values[index], current + values[index]);
            best = std::max(best, current);
        }
        total += best;
    }
    return total;
}



CACHEBENCH_NOINLINE float simd_max_subarray_aos(const WideRecord<float>* aos, std::size_t length) {
    __m256 current = gather_hot_values(aos, length, 0);
    __m256 best = current;
    for (std::size_t index = 1; index < length; ++index) {
        const __m256 values = gather_hot_values(aos, length, index);
        current = _mm256_max_ps(values, _mm256_add_ps(current, values));
        best = _mm256_max_ps(best, current);
    }
    return horizontal_sum_ps(best);
}



CACHEBENCH_NOINLINE float simd_max_subarray_soa(const float* hot, std::size_t length) {
    __m256 current = _mm256_load_ps(hot);
    __m256 best = current;
    for (std::size_t index = 1; index < length; ++index) {
        const __m256 values = _mm256_load_ps(hot + index * kBatchLanes);
        current = _mm256_max_ps(values, _mm256_add_ps(current, values));
        best = _mm256_max_ps(best, current);
    }
    return horizontal_sum_ps(best);
}
```

## Trapping rain water

```cpp
CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_trapping_rain_aos(const WideRecord<float>* aos,
                                                                        std::size_t length) {
    std::vector<float> left(length);
    float total = 0.0f;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        const WideRecord<float>* values = aos + lane * length;
        float left_max = values[0].value;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = 0; index < length; ++index) {
            left_max = std::max(left_max, values[index].value);
            left[index] = left_max;
        }
        float right_max = values[length - 1].value;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = length; index-- > 0;) {
            right_max = std::max(right_max, values[index].value);
            total += std::max(0.0f, std::min(left[index], right_max) - values[index].value);
        }
    }
    return total;
}



CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_trapping_rain_soa(const float* hot, std::size_t length) {
    std::vector<float> left(length);
    float total = 0.0f;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        const float* values = hot + lane * length;
        float left_max = values[0];
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = 0; index < length; ++index) {
            left_max = std::max(left_max, values[index]);
            left[index] = left_max;
        }
        float right_max = values[length - 1];
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = length; index-- > 0;) {
            right_max = std::max(right_max, values[index]);
            total += std::max(0.0f, std::min(left[index], right_max) - values[index]);
        }
    }
    return total;
}



CACHEBENCH_NOINLINE float simd_trapping_rain_aos(const WideRecord<float>* aos,
                                                 std::size_t length,
                                                 float* left_temp) {
    __m256 left_max = _mm256_set1_ps(std::numeric_limits<float>::lowest());
    for (std::size_t index = 0; index < length; ++index) {
        left_max = _mm256_max_ps(left_max, gather_hot_values(aos, length, index));
        _mm256_store_ps(left_temp + index * kBatchLanes, left_max);
    }

    __m256 right_max = _mm256_set1_ps(std::numeric_limits<float>::lowest());
    __m256 total = _mm256_setzero_ps();
    for (std::size_t index = length; index-- > 0;) {
        const __m256 values = gather_hot_values(aos, length, index);
        right_max = _mm256_max_ps(right_max, values);
        const __m256 left_values = _mm256_load_ps(left_temp + index * kBatchLanes);
        const __m256 water = _mm256_sub_ps(_mm256_min_ps(left_values, right_max), values);
        total = _mm256_add_ps(total, _mm256_max_ps(water, _mm256_setzero_ps()));
    }
    return horizontal_sum_ps(total);
}



CACHEBENCH_NOINLINE float simd_trapping_rain_soa(const float* hot,
                                                 std::size_t length,
                                                 float* left_temp) {
    __m256 left_max = _mm256_set1_ps(std::numeric_limits<float>::lowest());
    for (std::size_t index = 0; index < length; ++index) {
        const __m256 values = _mm256_load_ps(hot + index * kBatchLanes);
        left_max = _mm256_max_ps(left_max, values);
        _mm256_store_ps(left_temp + index * kBatchLanes, left_max);
    }

    __m256 right_max = _mm256_set1_ps(std::numeric_limits<float>::lowest());
    __m256 total = _mm256_setzero_ps();
    for (std::size_t index = length; index-- > 0;) {
        const __m256 values = _mm256_load_ps(hot + index * kBatchLanes);
        right_max = _mm256_max_ps(right_max, values);
        const __m256 left_values = _mm256_load_ps(left_temp + index * kBatchLanes);
        const __m256 water = _mm256_sub_ps(_mm256_min_ps(left_values, right_max), values);
        total = _mm256_add_ps(total, _mm256_max_ps(water, _mm256_setzero_ps()));
    }
    return horizontal_sum_ps(total);
}
```

## Sliding-window maximum

```cpp
CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_sliding_max_aos(const WideRecord<float>* aos,
                                                                      std::size_t length) {
    float total = 0.0f;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        const WideRecord<float>* values = aos + lane * length;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t start = 0; start + kWindowWidth <= length; ++start) {
            float window_max = values[start].value;
            CACHEBENCH_NO_VECTOR_LOOP
            for (std::size_t offset = 1; offset < kWindowWidth; ++offset) {
                window_max = std::max(window_max, values[start + offset].value);
            }
            total += window_max;
        }
    }
    return total;
}



CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_sliding_max_soa(const float* hot, std::size_t length) {
    float total = 0.0f;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        const float* values = hot + lane * length;
        std::deque<std::size_t> deque;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = 0; index < length; ++index) {
            while (!deque.empty() && deque.front() + kWindowWidth <= index) {
                deque.pop_front();
            }
            while (!deque.empty() && values[deque.back()] <= values[index]) {
                deque.pop_back();
            }
            deque.push_back(index);
            if (index + 1 >= kWindowWidth) {
                total += values[deque.front()];
            }
        }
    }
    return total;
}



CACHEBENCH_NOINLINE float simd_sliding_max_aos(const WideRecord<float>* aos, std::size_t length) {
    __m256 total = _mm256_setzero_ps();
    for (std::size_t start = 0; start + kWindowWidth <= length; ++start) {
        __m256 window_max = gather_hot_values(aos, length, start);
        for (std::size_t offset = 1; offset < kWindowWidth; ++offset) {
            window_max = _mm256_max_ps(window_max, gather_hot_values(aos, length, start + offset));
        }
        total = _mm256_add_ps(total, window_max);
    }
    return horizontal_sum_ps(total);
}



CACHEBENCH_NOINLINE float simd_sliding_max_soa(const float* hot,
                                               std::size_t length,
                                               float* left_temp,
                                               float* right_temp) {
    __m256 block_left = _mm256_setzero_ps();
    for (std::size_t index = 0; index < length; ++index) {
        const __m256 values = _mm256_load_ps(hot + index * kBatchLanes);
        if (index % kWindowWidth == 0) {
            block_left = values;
        } else {
            block_left = _mm256_max_ps(block_left, values);
        }
        _mm256_store_ps(left_temp + index * kBatchLanes, block_left);
    }

    __m256 block_right = _mm256_setzero_ps();
    for (std::size_t reverse = length; reverse-- > 0;) {
        const __m256 values = _mm256_load_ps(hot + reverse * kBatchLanes);
        if (reverse == length - 1 || ((reverse + 1) % kWindowWidth == 0)) {
            block_right = values;
        } else {
            block_right = _mm256_max_ps(block_right, values);
        }
        _mm256_store_ps(right_temp + reverse * kBatchLanes, block_right);
    }

    __m256 total = _mm256_setzero_ps();
    for (std::size_t start = 0; start + kWindowWidth <= length; ++start) {
        const __m256 left_value = _mm256_load_ps(left_temp + (start + kWindowWidth - 1) * kBatchLanes);
        const __m256 right_value = _mm256_load_ps(right_temp + start * kBatchLanes);
        total = _mm256_add_ps(total, _mm256_max_ps(left_value, right_value));
    }
    return horizontal_sum_ps(total);
}
```

## Product except self

```cpp
CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scalar_product_except_self_aos(
    const WideRecord<std::uint32_t>* aos,
    std::size_t length) {
    std::vector<std::uint32_t> work(length);
    std::uint64_t total = 0;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        const WideRecord<std::uint32_t>* values = aos + lane * length;
        std::uint32_t prefix = 1U;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = 0; index < length; ++index) {
            work[index] = prefix;
            prefix *= values[index].value;
        }
        std::uint32_t suffix = 1U;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = length; index-- > 0;) {
            work[index] *= suffix;
            suffix *= values[index].value;
        }
        total += sample_checksum(work.data(), length);
    }
    return total;
}



CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scalar_product_except_self_soa(const std::uint32_t* hot,
                                                                                      std::size_t length) {
    std::vector<std::uint32_t> work(length);
    std::uint64_t total = 0;
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
        const std::uint32_t* values = hot + lane * length;
        std::uint32_t prefix = 1U;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = 0; index < length; ++index) {
            work[index] = prefix;
            prefix *= values[index];
        }
        std::uint32_t suffix = 1U;
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t index = length; index-- > 0;) {
            work[index] *= suffix;
            suffix *= values[index];
        }
        total += sample_checksum(work.data(), length);
    }
    return total;
}



CACHEBENCH_NOINLINE std::uint64_t simd_product_except_self_aos(const WideRecord<std::uint32_t>* aos,
                                                               std::size_t length,
                                                               std::uint32_t* temp) {
    __m256i prefix = _mm256_set1_epi32(1);
    for (std::size_t index = 0; index < length; ++index) {
        _mm256_store_si256(reinterpret_cast<__m256i*>(temp + index * kBatchLanes), prefix);
        prefix = _mm256_mullo_epi32(prefix, gather_hot_u32_values(aos, length, index));
    }

    __m256i suffix = _mm256_set1_epi32(1);
    for (std::size_t index = length; index-- > 0;) {
        const __m256i prefix_values = _mm256_load_si256(reinterpret_cast<const __m256i*>(temp + index * kBatchLanes));
        const __m256i output = _mm256_mullo_epi32(prefix_values, suffix);
        _mm256_store_si256(reinterpret_cast<__m256i*>(temp + index * kBatchLanes), output);
        suffix = _mm256_mullo_epi32(suffix, gather_hot_u32_values(aos, length, index));
    }
    return sample_checksum(temp, length * kBatchLanes);
}



CACHEBENCH_NOINLINE std::uint64_t simd_product_except_self_soa(const std::uint32_t* hot,
                                                               std::size_t length,
                                                               std::uint32_t* temp) {
    __m256i prefix = _mm256_set1_epi32(1);
    for (std::size_t index = 0; index < length; ++index) {
        _mm256_store_si256(reinterpret_cast<__m256i*>(temp + index * kBatchLanes), prefix);
        prefix = _mm256_mullo_epi32(prefix, _mm256_load_si256(reinterpret_cast<const __m256i*>(hot + index * kBatchLanes)));
    }

    __m256i suffix = _mm256_set1_epi32(1);
    for (std::size_t index = length; index-- > 0;) {
        const __m256i prefix_values = _mm256_load_si256(reinterpret_cast<const __m256i*>(temp + index * kBatchLanes));
        const __m256i output = _mm256_mullo_epi32(prefix_values, suffix);
        _mm256_store_si256(reinterpret_cast<__m256i*>(temp + index * kBatchLanes), output);
        suffix = _mm256_mullo_epi32(suffix, _mm256_load_si256(reinterpret_cast<const __m256i*>(hot + index * kBatchLanes)));
    }
    return sample_checksum(temp, length * kBatchLanes);
}
```

## Rotated search

```cpp
CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR int scalar_rotated_search_aos_value(
    const WideRecord<std::uint32_t>* values,
    std::size_t length,
    std::uint32_t target) {
    int left = 0;
    int right = static_cast<int>(length) - 1;
    while (left <= right) {
        const int mid = left + (right - left) / 2;
        const std::uint32_t left_value = values[left].value;
        const std::uint32_t mid_value = values[mid].value;
        const std::uint32_t right_value = values[right].value;
        if (mid_value == target) {
            return mid;
        }
        if (left_value <= mid_value) {
            if (left_value <= target && target < mid_value) {
                right = mid - 1;
            } else {
                left = mid + 1;
            }
        } else {
            if (mid_value < target && target <= right_value) {
                left = mid + 1;
            } else {
                right = mid - 1;
            }
        }
    }
    return -1;
}



CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR int scalar_rotated_search_soa_value(
    const std::uint32_t* values,
    std::size_t length,
    std::uint32_t target) {
    int left = 0;
    int right = static_cast<int>(length) - 1;
    while (left <= right) {
        const int mid = left + (right - left) / 2;
        const std::uint32_t left_value = values[left];
        const std::uint32_t mid_value = values[mid];
        const std::uint32_t right_value = values[right];
        if (mid_value == target) {
            return mid;
        }
        if (left_value <= mid_value) {
            if (left_value <= target && target < mid_value) {
                right = mid - 1;
            } else {
                left = mid + 1;
            }
        } else {
            if (mid_value < target && target <= right_value) {
                left = mid + 1;
            } else {
                right = mid - 1;
            }
        }
    }
    return -1;
}



CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR int scalar_rotated_search_aos_value(
    const WideRecord<std::uint32_t>* values,
    std::size_t length,
    std::uint32_t target) {
    int left = 0;
    int right = static_cast<int>(length) - 1;
    while (left <= right) {
        const int mid = left + (right - left) / 2;
        const std::uint32_t left_value = values[left].value;
        const std::uint32_t mid_value = values[mid].value;
        const std::uint32_t right_value = values[right].value;
        if (mid_value == target) {
            return mid;
        }
        if (left_value <= mid_value) {
            if (left_value <= target && target < mid_value) {
                right = mid - 1;
            } else {
                left = mid + 1;
            }
        } else {
            if (mid_value < target && target <= right_value) {
                left = mid + 1;
            } else {
                right = mid - 1;
            }
        }
    }
    return -1;
}



CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR int scalar_rotated_search_soa_value(
    const std::uint32_t* values,
    std::size_t length,
    std::uint32_t target) {
    int left = 0;
    int right = static_cast<int>(length) - 1;
    while (left <= right) {
        const int mid = left + (right - left) / 2;
        const std::uint32_t left_value = values[left];
        const std::uint32_t mid_value = values[mid];
        const std::uint32_t right_value = values[right];
        if (mid_value == target) {
            return mid;
        }
        if (left_value <= mid_value) {
            if (left_value <= target && target < mid_value) {
                right = mid - 1;
            } else {
                left = mid + 1;
            }
        } else {
            if (mid_value < target && target <= right_value) {
                left = mid + 1;
            } else {
                right = mid - 1;
            }
        }
    }
    return -1;
}



CACHEBENCH_NOINLINE std::uint64_t simd_rotated_search_aos(const WideRecord<std::uint32_t>* aos,
                                                          const std::uint32_t* interleaved_queries,
                                                          std::size_t length,
                                                          std::size_t query_count) {
    return simd_rotated_search_common(
        interleaved_queries,
        length,
        query_count,
        [aos, length](__m256i indices) { return gather_rotated_aos_values(aos, length, indices); });
}



CACHEBENCH_NOINLINE std::uint64_t simd_rotated_search_soa(const std::uint32_t* interleaved_hot,
                                                          const std::uint32_t* interleaved_queries,
                                                          std::size_t length,
                                                          std::size_t query_count) {
    return simd_rotated_search_common(
        interleaved_queries,
        length,
        query_count,
        [interleaved_hot](__m256i indices) { return gather_rotated_soa_values(interleaved_hot, indices); });
}
```

## Sort 0/1/2

```cpp
CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void sort012_scalar_aos(Sort012Record* values, std::size_t length) {
    std::size_t low = 0;
    std::size_t mid = 0;
    std::size_t high = length - 1;
    while (mid <= high) {
        if (values[mid].value == 0U) {
            std::swap(values[low++], values[mid++]);
        } else if (values[mid].value == 1U) {
            ++mid;
        } else {
            std::swap(values[mid], values[high--]);
        }
    }
}



CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t sort012_scalar_soa(const std::uint32_t* input,
                                                                         std::uint32_t* output,
                                                                         std::size_t length) {
    std::array<std::size_t, 3> counts{};
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t index = 0; index < length; ++index) {
        ++counts[input[index]];
    }
    std::fill_n(output, counts[0], 0U);
    std::fill_n(output + counts[0], counts[1], 1U);
    std::fill_n(output + counts[0] + counts[1], counts[2], 2U);
    return sample_checksum(output, length);
}



CACHEBENCH_NOINLINE std::uint64_t sort012_simd_aos(const Sort012Record* input,
                                                   std::uint32_t* output,
                                                   std::size_t length) {
    constexpr std::int32_t kStrideWords = static_cast<std::int32_t>(sizeof(Sort012Record) / sizeof(std::uint32_t));
    const int* base = reinterpret_cast<const int*>(input);
    const __m256i zero = _mm256_setzero_si256();
    const __m256i one = _mm256_set1_epi32(1);
    const __m256i two = _mm256_set1_epi32(2);
    std::array<std::size_t, 3> counts{};
    std::size_t index = 0;
    for (; index + kBatchLanes <= length; index += kBatchLanes) {
        const __m256i gather_indices = _mm256_setr_epi32(static_cast<int>((index + 0) * kStrideWords),
                                                         static_cast<int>((index + 1) * kStrideWords),
                                                         static_cast<int>((index + 2) * kStrideWords),
                                                         static_cast<int>((index + 3) * kStrideWords),
                                                         static_cast<int>((index + 4) * kStrideWords),
                                                         static_cast<int>((index + 5) * kStrideWords),
                                                         static_cast<int>((index + 6) * kStrideWords),
                                                         static_cast<int>((index + 7) * kStrideWords));
        const __m256i values = _mm256_i32gather_epi32(base, gather_indices, sizeof(std::uint32_t));
        counts[0] += static_cast<std::size_t>(std::popcount(static_cast<unsigned>(mask_bits(_mm256_cmpeq_epi32(values, zero)))));
        counts[1] += static_cast<std::size_t>(std::popcount(static_cast<unsigned>(mask_bits(_mm256_cmpeq_epi32(values, one)))));
        counts[2] += static_cast<std::size_t>(std::popcount(static_cast<unsigned>(mask_bits(_mm256_cmpeq_epi32(values, two)))));
    }
    for (; index < length; ++index) {
        ++counts[input[index].value];
    }
    std::fill_n(output, counts[0], 0U);
    std::fill_n(output + counts[0], counts[1], 1U);
    std::fill_n(output + counts[0] + counts[1], counts[2], 2U);
    return sample_checksum(output, length);
}



CACHEBENCH_NOINLINE std::uint64_t sort012_simd_soa(const std::uint32_t* input,
                                                   std::uint32_t* output,
                                                   std::size_t length) {
    const __m256i zero = _mm256_setzero_si256();
    const __m256i one = _mm256_set1_epi32(1);
    const __m256i two = _mm256_set1_epi32(2);
    std::array<std::size_t, 3> counts{};
    std::size_t index = 0;
    for (; index + kBatchLanes <= length; index += kBatchLanes) {
        const __m256i values = _mm256_load_si256(reinterpret_cast<const __m256i*>(input + index));
        counts[0] += static_cast<std::size_t>(std::popcount(static_cast<unsigned>(mask_bits(_mm256_cmpeq_epi32(values, zero)))));
        counts[1] += static_cast<std::size_t>(std::popcount(static_cast<unsigned>(mask_bits(_mm256_cmpeq_epi32(values, one)))));
        counts[2] += static_cast<std::size_t>(std::popcount(static_cast<unsigned>(mask_bits(_mm256_cmpeq_epi32(values, two)))));
    }
    for (; index < length; ++index) {
        ++counts[input[index]];
    }
    std::fill_n(output, counts[0], 0U);
    std::fill_n(output + counts[0], counts[1], 1U);
    std::fill_n(output + counts[0] + counts[1], counts[2], 2U);
    return sample_checksum(output, length);
}
```

## BFS

```cpp
CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t bfs_scalar_adjlist(const GraphAdjList& graph,
                                                                         std::size_t passes) {
    const std::size_t vertices = graph.edges.size();
    std::vector<std::uint32_t> queue(vertices);
    std::vector<int> visited(vertices, 0);
    std::uint64_t total = 0;
    for (std::size_t pass = 0; pass < passes; ++pass) {
        std::fill(visited.begin(), visited.end(), 0);
        const std::uint32_t source = static_cast<std::uint32_t>((pass * 17) % vertices);
        std::size_t head = 0;
        std::size_t tail = 0;
        visited[source] = 1;
        queue[tail++] = source;
        while (head < tail) {
            const std::uint32_t node = queue[head++];
            total += node;
            for (const GraphEdgeRecord& edge : graph.edges[node]) {
                if (visited[edge.dst] == 0) {
                    visited[edge.dst] = 1;
                    queue[tail++] = edge.dst;
                }
            }
        }
        total += tail;
    }
    return total;
}



CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t bfs_scalar_csr(const GraphCsr& graph,
                                                                     std::size_t passes) {
    const std::size_t vertices = graph.offsets.size() - 1;
    std::vector<std::uint32_t> queue(vertices);
    std::vector<int> visited(vertices, 0);
    std::uint64_t total = 0;
    for (std::size_t pass = 0; pass < passes; ++pass) {
        std::fill(visited.begin(), visited.end(), 0);
        const std::uint32_t source = static_cast<std::uint32_t>((pass * 17) % vertices);
        std::size_t head = 0;
        std::size_t tail = 0;
        visited[source] = 1;
        queue[tail++] = source;
        while (head < tail) {
            const std::uint32_t node = queue[head++];
            total += node;
            for (std::uint32_t edge = graph.offsets[node]; edge < graph.offsets[node + 1]; ++edge) {
                const std::uint32_t next = graph.edges[edge];
                if (visited[next] == 0) {
                    visited[next] = 1;
                    queue[tail++] = next;
                }
            }
        }
        total += tail;
    }
    return total;
}



CACHEBENCH_NOINLINE std::uint64_t bfs_simd_adjlist(const GraphAdjList& graph, std::size_t passes) {
    const std::size_t vertices = graph.edges.size();
    std::vector<std::uint32_t> queue(vertices);
    std::vector<int> visited(vertices, 0);
    std::uint64_t total = 0;
    alignas(32) std::array<std::uint32_t, kBatchLanes> dst_lanes{};
    constexpr std::int32_t kStrideWords = static_cast<std::int32_t>(sizeof(GraphEdgeRecord) / sizeof(std::uint32_t));
    const __m256i zero = _mm256_setzero_si256();

    for (std::size_t pass = 0; pass < passes; ++pass) {
        std::fill(visited.begin(), visited.end(), 0);
        const std::uint32_t source = static_cast<std::uint32_t>((pass * 17) % vertices);
        std::size_t head = 0;
        std::size_t tail = 0;
        visited[source] = 1;
        queue[tail++] = source;
        while (head < tail) {
            const std::uint32_t node = queue[head++];
            total += node;
            const auto& edges = graph.edges[node];
            const int* base = reinterpret_cast<const int*>(edges.data());
            std::size_t edge = 0;
            for (; edge + kBatchLanes <= edges.size(); edge += kBatchLanes) {
                const __m256i edge_indices = _mm256_setr_epi32(static_cast<int>((edge + 0) * kStrideWords),
                                                               static_cast<int>((edge + 1) * kStrideWords),
                                                               static_cast<int>((edge + 2) * kStrideWords),
                                                               static_cast<int>((edge + 3) * kStrideWords),
                                                               static_cast<int>((edge + 4) * kStrideWords),
                                                               static_cast<int>((edge + 5) * kStrideWords),
                                                               static_cast<int>((edge + 6) * kStrideWords),
                                                               static_cast<int>((edge + 7) * kStrideWords));
                const __m256i next_nodes =
                    _mm256_i32gather_epi32(base, edge_indices, sizeof(std::uint32_t));
                const __m256i seen =
                    _mm256_i32gather_epi32(visited.data(), next_nodes, static_cast<int>(sizeof(int)));
                _mm256_store_si256(reinterpret_cast<__m256i*>(dst_lanes.data()), next_nodes);
                const int unseen_mask = mask_bits(_mm256_cmpeq_epi32(seen, zero));
                for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
                    if (((unseen_mask >> lane) & 1) != 0 && visited[dst_lanes[lane]] == 0) {
                        visited[dst_lanes[lane]] = 1;
                        queue[tail++] = dst_lanes[lane];
                    }
                }
            }
            for (; edge < edges.size(); ++edge) {
                const std::uint32_t next = edges[edge].dst;
                if (visited[next] == 0) {
                    visited[next] = 1;
                    queue[tail++] = next;
                }
            }
        }
        total += tail;
    }
    return total;
}



CACHEBENCH_NOINLINE std::uint64_t bfs_simd_csr(const GraphCsr& graph, std::size_t passes) {
    const std::size_t vertices = graph.offsets.size() - 1;
    std::vector<std::uint32_t> queue(vertices);
    std::vector<int> visited(vertices, 0);
    std::uint64_t total = 0;
    alignas(32) std::array<std::uint32_t, kBatchLanes> dst_lanes{};
    const __m256i zero = _mm256_setzero_si256();

    for (std::size_t pass = 0; pass < passes; ++pass) {
        std::fill(visited.begin(), visited.end(), 0);
        const std::uint32_t source = static_cast<std::uint32_t>((pass * 17) % vertices);
        std::size_t head = 0;
        std::size_t tail = 0;
        visited[source] = 1;
        queue[tail++] = source;
        while (head < tail) {
            const std::uint32_t node = queue[head++];
            total += node;
            std::uint32_t edge = graph.offsets[node];
            const std::uint32_t edge_limit = graph.offsets[node + 1];
            for (; edge + kBatchLanes <= edge_limit; edge += static_cast<std::uint32_t>(kBatchLanes)) {
                const __m256i next_nodes =
                    _mm256_loadu_si256(reinterpret_cast<const __m256i*>(graph.edges.data() + edge));
                const __m256i seen =
                    _mm256_i32gather_epi32(visited.data(), next_nodes, static_cast<int>(sizeof(int)));
                _mm256_store_si256(reinterpret_cast<__m256i*>(dst_lanes.data()), next_nodes);
                const int unseen_mask = mask_bits(_mm256_cmpeq_epi32(seen, zero));
                for (std::size_t lane = 0; lane < kBatchLanes; ++lane) {
                    if (((unseen_mask >> lane) & 1) != 0 && visited[dst_lanes[lane]] == 0) {
                        visited[dst_lanes[lane]] = 1;
                        queue[tail++] = dst_lanes[lane];
                    }
                }
            }
            for (; edge < edge_limit; ++edge) {
                const std::uint32_t next = graph.edges[edge];
                if (visited[next] == 0) {
                    visited[next] = 1;
                    queue[tail++] = next;
                }
            }
        }
        total += tail;
    }
    return total;
}
```

## Transpose

```cpp
CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void transpose_scalar_naive(const float* src,
                                                                    float* dst,
                                                                    std::size_t dim) {
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t row = 0; row < dim; ++row) {
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t col = 0; col < dim; ++col) {
            dst[col * dim + row] = src[row * dim + col];
        }
    }
}



CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void transpose_scalar_blocked(const float* src,
                                                                      float* dst,
                                                                      std::size_t dim) {
    CACHEBENCH_NO_VECTOR_LOOP
    for (std::size_t row_block = 0; row_block < dim; row_block += kTransposeBlock) {
        CACHEBENCH_NO_VECTOR_LOOP
        for (std::size_t col_block = 0; col_block < dim; col_block += kTransposeBlock) {
            const std::size_t row_limit = std::min(dim, row_block + kTransposeBlock);
            const std::size_t col_limit = std::min(dim, col_block + kTransposeBlock);
            CACHEBENCH_NO_VECTOR_LOOP
            for (std::size_t row = row_block; row < row_limit; ++row) {
                CACHEBENCH_NO_VECTOR_LOOP
                for (std::size_t col = col_block; col < col_limit; ++col) {
                    dst[col * dim + row] = src[row * dim + col];
                }
            }
        }
    }
}



CACHEBENCH_NOINLINE void transpose_simd_naive(const float* src, float* dst, std::size_t dim) {
    const std::size_t simd_dim = dim - (dim % 4);
    for (std::size_t row = 0; row < simd_dim; row += 4) {
        for (std::size_t col = 0; col < simd_dim; col += 4) {
            transpose4x4_simd(src + row * dim + col, dim, dst + col * dim + row, dim);
        }
        for (std::size_t col = simd_dim; col < dim; ++col) {
            for (std::size_t lane = 0; lane < 4 && row + lane < dim; ++lane) {
                dst[col * dim + (row + lane)] = src[(row + lane) * dim + col];
            }
        }
    }
    for (std::size_t row = simd_dim; row < dim; ++row) {
        for (std::size_t col = 0; col < dim; ++col) {
            dst[col * dim + row] = src[row * dim + col];
        }
    }
}



CACHEBENCH_NOINLINE void transpose_simd_blocked(const float* src, float* dst, std::size_t dim) {
    const std::size_t simd_dim = dim - (dim % 4);
    for (std::size_t row_block = 0; row_block < dim; row_block += kTransposeBlock) {
        for (std::size_t col_block = 0; col_block < dim; col_block += kTransposeBlock) {
            const std::size_t row_limit = std::min(dim, row_block + kTransposeBlock);
            const std::size_t col_limit = std::min(dim, col_block + kTransposeBlock);
            std::size_t row = row_block;
            for (; row + 4 <= row_limit && row < simd_dim; row += 4) {
                std::size_t col = col_block;
                for (; col + 4 <= col_limit && col < simd_dim; col += 4) {
                    transpose4x4_simd(src + row * dim + col, dim, dst + col * dim + row, dim);
                }
                for (; col < col_limit; ++col) {
                    for (std::size_t lane = 0; lane < 4; ++lane) {
                        dst[col * dim + (row + lane)] = src[(row + lane) * dim + col];
                    }
                }
            }
            for (; row < row_limit; ++row) {
                for (std::size_t col = col_block; col < col_limit; ++col) {
                    dst[col * dim + row] = src[row * dim + col];
                }
            }
        }
    }
}
```

## Matrix multiply

```cpp
CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void scalar_matmul_naive(const float* a,
                                                                 const float* b,
                                                                 float* c,
                                                                 std::size_t n) {
    std::fill_n(c, n * n, 0.0f);
    for (std::size_t i = 0; i < n; ++i) {
        for (std::size_t j = 0; j < n; ++j) {
            float sum = 0.0f;
            CACHEBENCH_NO_VECTOR_LOOP
            for (std::size_t k = 0; k < n; ++k) {
                sum += a[i * n + k] * b[k * n + j];
            }
            c[i * n + j] = sum;
        }
    }
}



void transpose_square_matrix(const float* src, float* dst, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i) {
        for (std::size_t j = 0; j < n; ++j) {
            dst[j * n + i] = src[i * n + j];
        }
    }
}



CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void scalar_matmul_transposed(const float* a,
                                                                      const float* bt,
                                                                      float* c,
                                                                      std::size_t n) {
    std::fill_n(c, n * n, 0.0f);
    for (std::size_t i = 0; i < n; ++i) {
        const float* row_a = a + i * n;
        for (std::size_t j = 0; j < n; ++j) {
            const float* row_bt = bt + j * n;
            float sum = 0.0f;
            CACHEBENCH_NO_VECTOR_LOOP
            for (std::size_t k = 0; k < n; ++k) {
                sum += row_a[k] * row_bt[k];
            }
            c[i * n + j] = sum;
        }
    }
}



CACHEBENCH_NOINLINE void avx2_matmul_gather(const float* a,
                                            const float* b,
                                            float* c,
                                            std::size_t n) {
    std::fill_n(c, n * n, 0.0f);
    for (std::size_t i = 0; i < n; ++i) {
        const float* row_a = a + i * n;
        for (std::size_t j = 0; j < n; ++j) {
            __m256 acc = _mm256_setzero_ps();
            std::size_t k = 0;
            for (; k + kAvxWidthFloats <= n; k += kAvxWidthFloats) {
                const __m256 avec = _mm256_load_ps(row_a + k);
                const __m256i bindex = _mm256_setr_epi32(static_cast<int>((k + 0) * n + j),
                                                         static_cast<int>((k + 1) * n + j),
                                                         static_cast<int>((k + 2) * n + j),
                                                         static_cast<int>((k + 3) * n + j),
                                                         static_cast<int>((k + 4) * n + j),
                                                         static_cast<int>((k + 5) * n + j),
                                                         static_cast<int>((k + 6) * n + j),
                                                         static_cast<int>((k + 7) * n + j));
                const __m256 bcol = _mm256_i32gather_ps(b, bindex, sizeof(float));
                acc = _mm256_add_ps(acc, _mm256_mul_ps(avec, bcol));
            }

            float sum = horizontal_sum_ps(acc);
            for (; k < n; ++k) {
                sum += row_a[k] * b[k * n + j];
            }
            c[i * n + j] = sum;
        }
    }
}



CACHEBENCH_NOINLINE void avx2_matmul_blocked(const float* a,
                                             const float* b,
                                             float* c,
                                             std::size_t n) {
    constexpr std::size_t kBlockI = 32;
    constexpr std::size_t kBlockJ = 64;
    constexpr std::size_t kBlockK = 64;

    std::fill_n(c, n * n, 0.0f);
    for (std::size_t ii = 0; ii < n; ii += kBlockI) {
        const std::size_t i_end = std::min(ii + kBlockI, n);
        for (std::size_t kk = 0; kk < n; kk += kBlockK) {
            const std::size_t k_end = std::min(kk + kBlockK, n);
            for (std::size_t jj = 0; jj < n; jj += kBlockJ) {
                const std::size_t j_end = std::min(jj + kBlockJ, n);
                for (std::size_t i = ii; i < i_end; ++i) {
                    for (std::size_t k = kk; k < k_end; ++k) {
                        const __m256 aval = _mm256_set1_ps(a[i * n + k]);
                        std::size_t j = jj;
                        for (; j + kAvxWidthFloats <= j_end; j += kAvxWidthFloats) {
                            const __m256 cvec = _mm256_loadu_ps(c + i * n + j);
                            const __m256 bvec = _mm256_loadu_ps(b + k * n + j);
                            const __m256 updated =
                                _mm256_add_ps(cvec, _mm256_mul_ps(aval, bvec));
                            _mm256_storeu_ps(c + i * n + j, updated);
                        }
                        for (; j < j_end; ++j) {
                            c[i * n + j] += a[i * n + k] * b[k * n + j];
                        }
                    }
                }
            }
        }
    }
}
```
