#include "benchmark.hpp" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _MSC_VER #include #endif namespace cachebench { namespace { constexpr std::uint64_t kSequenceTargetBytes = 64ULL * 1024ULL * 1024ULL; constexpr std::uint64_t kTransposeTargetBytes = 32ULL * 1024ULL * 1024ULL; constexpr std::size_t kBatchLanes = 8; constexpr std::size_t kColdFieldCount = 15; constexpr std::size_t kAlignmentBytes = 64; constexpr std::size_t kWindowWidth = 32; constexpr std::size_t kTransposeBlock = 32; #if defined(_MSC_VER) #define CACHEBENCH_NOINLINE __declspec(noinline) #define CACHEBENCH_NO_VECTOR_LOOP __pragma(loop(no_vector)) #define CACHEBENCH_NOVECTOR #elif defined(__clang__) #define CACHEBENCH_NOINLINE __attribute__((noinline)) #define CACHEBENCH_NO_VECTOR_LOOP _Pragma("clang loop vectorize(disable)") #define CACHEBENCH_NOVECTOR #elif defined(__GNUC__) #define CACHEBENCH_NOINLINE __attribute__((noinline)) #define CACHEBENCH_NO_VECTOR_LOOP #define CACHEBENCH_NOVECTOR __attribute__((optimize("no-tree-vectorize"))) #else #define CACHEBENCH_NOINLINE #define CACHEBENCH_NO_VECTOR_LOOP #define CACHEBENCH_NOVECTOR #endif template class AlignedBuffer { public: explicit AlignedBuffer(std::size_t size, std::size_t alignment = kAlignmentBytes) : size_(size), data_(static_cast(_mm_malloc(sizeof(T) * size, alignment))) { if (data_ == nullptr) { throw std::bad_alloc(); } } ~AlignedBuffer() { _mm_free(data_); } AlignedBuffer(const AlignedBuffer&) = delete; AlignedBuffer& operator=(const AlignedBuffer&) = delete; T* data() { return data_; } const T* data() const { return data_; } T& operator[](std::size_t index) { return data_[index]; } const T& operator[](std::size_t index) const { return data_[index]; } std::size_t size() const { return size_; } private: std::size_t size_ = 0; T* data_ = nullptr; }; template struct WideRecord { T value{}; std::array cold{}; }; static_assert(sizeof(WideRecord) == 64, "Wide float record should occupy one cache line"); static_assert(sizeof(WideRecord) == 64, "Wide uint32 record should occupy one cache line"); template struct SequenceDataset { std::size_t length = 0; std::size_t total_values = 0; std::size_t logical_size_bytes = 0; std::shared_ptr>> aos; std::shared_ptr> lane_major_hot; std::shared_ptr> lane_major_cold; std::shared_ptr> interleaved_hot; std::shared_ptr> interleaved_cold; }; enum class StockMode { kScalarAos, kScalarSoa, kSimdAos, kSimdSoa, }; enum class MaxSubarrayMode { kScalarAos, kScalarSoa, kSimdAos, kSimdSoa, }; enum class TrappingRainMode { kScalarAos, kScalarSoa, kSimdAos, kSimdSoa, }; enum class SlidingWindowMode { kScalarAos, kScalarSoa, kSimdAos, kSimdSoa, }; enum class ProductExceptSelfMode { kScalarAos, kScalarSoa, kSimdAos, kSimdSoa, }; enum class TransposeMode { kScalarNaive, kScalarBlocked, kSimdNaive, kSimdBlocked, }; enum class RotatedSearchMode { kScalarAos, kScalarSoa, kSimdAos, kSimdSoa, }; enum class Sort012Mode { kScalarAos, kScalarSoa, kSimdAos, kSimdSoa, }; enum class BfsMode { kScalarAdjList, kScalarCsr, kSimdAdjList, kSimdCsr, }; struct RotatedSearchDataset { std::size_t length = 0; std::size_t query_count = 0; std::size_t logical_size_bytes = 0; std::shared_ptr>> aos; std::shared_ptr> lane_major_hot; std::shared_ptr> interleaved_hot; std::shared_ptr> lane_major_queries; std::shared_ptr> interleaved_queries; }; struct Sort012Record { std::uint32_t value = 0; std::array cold{}; }; static_assert(sizeof(Sort012Record) == 64, "Sort012Record should occupy one cache line"); struct Sort012Dataset { std::size_t length = 0; std::size_t logical_size_bytes = 0; std::shared_ptr> input_aos; std::shared_ptr> input_soa; }; struct GraphEdgeRecord { std::uint32_t dst = 0; std::uint32_t cold0 = 0; std::uint32_t cold1 = 0; std::uint32_t cold2 = 0; }; static_assert(sizeof(GraphEdgeRecord) == 16, "GraphEdgeRecord should stay compact"); struct GraphAdjList { std::vector> edges; }; struct GraphCsr { std::vector offsets; std::vector edges; }; constexpr std::size_t kDsaGraphDegree = 8; std::uint64_t seeded_value(std::size_t index) { constexpr std::uint64_t kMix = 0x9E3779B97F4A7C15ULL; return (static_cast(index) + 1ULL) * kMix; } float seeded_float(std::size_t index, float scale) { const std::uint32_t value = static_cast((seeded_value(index) >> 17) & 0xFFFFULL); return static_cast(value % 4096U) / scale; } bool cpu_supports_avx2() { #if defined(_MSC_VER) && defined(_M_X64) int info[4] = {}; __cpuid(info, 1); const bool osxsave = (info[2] & (1 << 27)) != 0; const bool avx = (info[2] & (1 << 28)) != 0; if (!osxsave || !avx) { return false; } const unsigned long long xcr0 = _xgetbv(0); if ((xcr0 & 0x6ULL) != 0x6ULL) { return false; } __cpuidex(info, 7, 0); return (info[1] & (1 << 5)) != 0; #elif (defined(__GNUC__) || defined(__clang__)) && defined(__x86_64__) return __builtin_cpu_supports("avx2"); #else return false; #endif } template T sample_checksum(const T* data, std::size_t elements) { if (elements == 0) { return T{}; } const std::size_t step = std::max(1, elements / 8); T sum{}; for (std::size_t index = 0; index < elements; index += step) { sum += data[index]; } sum += data[elements - 1]; return sum; } std::uint64_t checksum_from_float_buffer(const float* data, std::size_t elements) { if (elements == 0) { return 0; } const std::size_t step = std::max(1, elements / 8); std::uint64_t sum = 0; for (std::size_t index = 0; index < elements; index += step) { sum += static_cast(std::bit_cast(data[index])); } sum += static_cast(std::bit_cast(data[elements - 1])); return sum; } float horizontal_sum_ps(__m256 value) { alignas(32) std::array lanes{}; _mm256_store_ps(lanes.data(), value); return std::accumulate(lanes.begin(), lanes.end(), 0.0f); } std::uint64_t horizontal_sum_u32(__m256i value) { alignas(32) std::array lanes{}; _mm256_store_si256(reinterpret_cast<__m256i*>(lanes.data()), value); return std::accumulate(lanes.begin(), lanes.end(), 0ULL); } __m256i setr_epi32_lanes() { return _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); } __m256i cmp_lt_epi32(__m256i lhs, __m256i rhs) { return _mm256_cmpgt_epi32(rhs, lhs); } __m256i cmp_le_epi32(__m256i lhs, __m256i rhs) { return _mm256_or_si256(_mm256_cmpgt_epi32(rhs, lhs), _mm256_cmpeq_epi32(lhs, rhs)); } int mask_bits(__m256i mask) { return _mm256_movemask_ps(_mm256_castsi256_ps(mask)); } template SequenceDataset build_sequence_dataset(std::size_t requested_size_bytes, std::size_t min_length, Generator&& generator) { const std::size_t records = std::max( kBatchLanes * min_length, requested_size_bytes / sizeof(WideRecord)); const std::size_t length = std::max(min_length, records / kBatchLanes); const std::size_t total_values = length * kBatchLanes; SequenceDataset dataset; dataset.length = length; dataset.total_values = total_values; dataset.logical_size_bytes = total_values * sizeof(WideRecord); dataset.aos = std::make_shared>>(total_values); dataset.lane_major_hot = std::make_shared>(total_values); dataset.lane_major_cold = std::make_shared>(total_values * kColdFieldCount); dataset.interleaved_hot = std::make_shared>(total_values); dataset.interleaved_cold = std::make_shared>(total_values * kColdFieldCount); for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { for (std::size_t index = 0; index < length; ++index) { const T value = generator(lane, index); const std::size_t lane_major_index = lane * length + index; const std::size_t interleaved_index = index * kBatchLanes + lane; (*dataset.aos)[lane_major_index].value = value; for (std::size_t cold_index = 0; cold_index < kColdFieldCount; ++cold_index) { if constexpr (std::is_same_v) { (*dataset.aos)[lane_major_index].cold[cold_index] = seeded_float(lane_major_index * kColdFieldCount + cold_index, 251.0f); (*dataset.lane_major_cold)[cold_index * total_values + lane_major_index] = seeded_float(cold_index * total_values + lane_major_index, 211.0f); (*dataset.interleaved_cold)[cold_index * total_values + interleaved_index] = seeded_float(cold_index * total_values + interleaved_index, 199.0f); } else { (*dataset.aos)[lane_major_index].cold[cold_index] = static_cast( (seeded_value(lane_major_index * kColdFieldCount + cold_index) & 0xFFULL) + 1ULL); (*dataset.lane_major_cold)[cold_index * total_values + lane_major_index] = static_cast( (seeded_value(cold_index * total_values + lane_major_index) & 0xFFULL) + 1ULL); (*dataset.interleaved_cold)[cold_index * total_values + interleaved_index] = static_cast( (seeded_value(cold_index * total_values + interleaved_index) & 0xFFULL) + 1ULL); } } (*dataset.lane_major_hot)[lane_major_index] = value; (*dataset.interleaved_hot)[interleaved_index] = value; } } return dataset; } float stock_price_value(std::size_t lane, std::size_t index) { const float x = static_cast(index); return 64.0f + static_cast(lane) * 2.5f + 0.04f * x + 9.0f * std::sin(0.031f * x + static_cast(lane) * 0.7f) + 4.0f * std::cos(0.073f * x + static_cast(lane) * 0.3f); } float max_subarray_value(std::size_t lane, std::size_t index) { const float x = static_cast(index); return 8.0f * std::sin(0.11f * x + static_cast(lane) * 0.5f) + 4.0f * std::cos(0.049f * x + static_cast(lane) * 0.2f) - 1.5f; } float height_value(std::size_t lane, std::size_t index) { const float x = static_cast(index); return 6.0f + 4.0f * std::sin(0.087f * x + static_cast(lane) * 0.4f) + 3.0f * std::cos(0.031f * x + static_cast(lane) * 0.8f); } float window_value(std::size_t lane, std::size_t index) { const float x = static_cast(index); return 2.0f + 6.0f * std::sin(0.057f * x + static_cast(lane) * 0.6f) + 1.5f * std::cos(0.121f * x + static_cast(lane) * 0.2f); } std::uint32_t product_value(std::size_t lane, std::size_t index) { return static_cast(((lane * 11 + index * 7) % 13) + 2); } RotatedSearchDataset build_rotated_search_dataset(std::size_t requested_size_bytes) { const std::size_t records = std::max( kBatchLanes * 256, requested_size_bytes / sizeof(WideRecord)); const std::size_t length = std::max(256, records / kBatchLanes); const std::size_t total_values = length * kBatchLanes; const std::size_t query_count = std::max(128, length / 8); RotatedSearchDataset dataset; dataset.length = length; dataset.query_count = query_count; dataset.logical_size_bytes = total_values * sizeof(WideRecord); dataset.aos = std::make_shared>>(total_values); dataset.lane_major_hot = std::make_shared>(total_values); dataset.interleaved_hot = std::make_shared>(total_values); dataset.lane_major_queries = std::make_shared>(query_count * kBatchLanes); dataset.interleaved_queries = std::make_shared>(query_count * kBatchLanes); for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { const std::uint32_t base = static_cast(lane * 1'000'003U + 17U); const std::size_t rotation = (length / 5 + lane * 37) % length; for (std::size_t index = 0; index < length; ++index) { const std::uint32_t sorted_index = static_cast((index + rotation) % length); const std::uint32_t value = base + sorted_index * 3U; const std::size_t lane_major_index = lane * length + index; const std::size_t interleaved_index = index * kBatchLanes + lane; (*dataset.aos)[lane_major_index].value = value; for (std::size_t cold = 0; cold < kColdFieldCount; ++cold) { (*dataset.aos)[lane_major_index].cold[cold] = static_cast( seeded_value(lane_major_index * kColdFieldCount + cold) & 0xFFFFULL); } (*dataset.lane_major_hot)[lane_major_index] = value; (*dataset.interleaved_hot)[interleaved_index] = value; } for (std::size_t query = 0; query < query_count; ++query) { const bool present = (query & 3U) != 3U; const std::uint32_t value = present ? base + static_cast(((query * 13U) + lane * 11U) % length) * 3U : base + static_cast(length * 3U + query * 5U + lane); (*dataset.lane_major_queries)[lane * query_count + query] = value; (*dataset.interleaved_queries)[query * kBatchLanes + lane] = value; } } return dataset; } Sort012Dataset build_sort012_dataset(std::size_t requested_size_bytes) { const std::size_t length = std::max(512, requested_size_bytes / sizeof(Sort012Record)); Sort012Dataset dataset; dataset.length = length; dataset.logical_size_bytes = length * sizeof(Sort012Record); dataset.input_aos = std::make_shared>(length); dataset.input_soa = std::make_shared>(length); for (std::size_t index = 0; index < length; ++index) { const std::uint32_t value = static_cast((seeded_value(index) + index * 5ULL) % 3ULL); (*dataset.input_aos)[index].value = value; for (std::size_t cold = 0; cold < kColdFieldCount; ++cold) { (*dataset.input_aos)[index].cold[cold] = static_cast( seeded_value(index * kColdFieldCount + cold + 19) & 0xFFFFFFFFULL); } (*dataset.input_soa)[index] = value; } return dataset; } std::shared_ptr build_dsa_graph_adjlist(std::size_t vertices) { auto graph = std::make_shared(); graph->edges.resize(vertices); for (std::size_t vertex = 0; vertex < vertices; ++vertex) { auto& out = graph->edges[vertex]; out.reserve(kDsaGraphDegree); for (std::size_t edge = 0; edge < kDsaGraphDegree; ++edge) { GraphEdgeRecord record{}; const std::size_t jump = 1 + edge * 13 + (vertex % 17); record.dst = static_cast((vertex + jump) % vertices); record.cold0 = static_cast(seeded_value(vertex * kDsaGraphDegree + edge) & 0xFFFFULL); record.cold1 = static_cast(seeded_value(vertex * kDsaGraphDegree + edge + 3) & 0xFFFFULL); record.cold2 = static_cast(seeded_value(vertex * kDsaGraphDegree + edge + 5) & 0xFFFFULL); out.push_back(record); } } return graph; } std::shared_ptr build_dsa_graph_csr(const GraphAdjList& adjlist) { auto graph = std::make_shared(); const std::size_t vertices = adjlist.edges.size(); graph->offsets.resize(vertices + 1, 0U); graph->edges.reserve(vertices * kDsaGraphDegree); for (std::size_t vertex = 0; vertex < vertices; ++vertex) { graph->offsets[vertex] = static_cast(graph->edges.size()); for (const GraphEdgeRecord& edge : adjlist.edges[vertex]) { graph->edges.push_back(edge.dst); } } graph->offsets[vertices] = static_cast(graph->edges.size()); return graph; } __m256 gather_hot_values(const WideRecord* aos, std::size_t length, std::size_t index) { const float* base = reinterpret_cast(aos); return _mm256_i32gather_ps( base, _mm256_set_epi32(static_cast(((7 * length + index) * 16)), static_cast(((6 * length + index) * 16)), static_cast(((5 * length + index) * 16)), static_cast(((4 * length + index) * 16)), static_cast(((3 * length + index) * 16)), static_cast(((2 * length + index) * 16)), static_cast(((1 * length + index) * 16)), static_cast(((0 * length + index) * 16))), 4); } __m256i gather_hot_u32_values(const WideRecord* aos, std::size_t length, std::size_t index) { const int* base = reinterpret_cast(aos); return _mm256_i32gather_epi32( base, _mm256_set_epi32(static_cast(((7 * length + index) * 16)), static_cast(((6 * length + index) * 16)), static_cast(((5 * length + index) * 16)), static_cast(((4 * length + index) * 16)), static_cast(((3 * length + index) * 16)), static_cast(((2 * length + index) * 16)), static_cast(((1 * length + index) * 16)), static_cast(((0 * length + index) * 16))), 4); } __m256i gather_rotated_aos_values(const WideRecord* aos, std::size_t length, __m256i logical_indices) { alignas(32) std::array indices{}; _mm256_store_si256(reinterpret_cast<__m256i*>(indices.data()), logical_indices); for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { indices[lane] = indices[lane] * 16 + static_cast(lane * length * 16); } return _mm256_i32gather_epi32( reinterpret_cast(aos), _mm256_load_si256(reinterpret_cast(indices.data())), 4); } __m256i gather_rotated_soa_values(const std::uint32_t* interleaved_hot, __m256i logical_indices) { alignas(32) std::array indices{}; _mm256_store_si256(reinterpret_cast<__m256i*>(indices.data()), logical_indices); for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { indices[lane] = indices[lane] * static_cast(kBatchLanes) + static_cast(lane); } return _mm256_i32gather_epi32( reinterpret_cast(interleaved_hot), _mm256_load_si256(reinterpret_cast(indices.data())), 4); } std::uint64_t sample_sort012_record_values(const Sort012Record* values, std::size_t elements) { if (elements == 0) { return 0; } const std::size_t step = std::max(1, elements / 8); std::uint64_t sum = 0; for (std::size_t index = 0; index < elements; index += step) { sum += values[index].value; } sum += values[elements - 1].value; return sum; } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_stock_profit_aos(const WideRecord* aos, std::size_t length) { float total = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { float min_price = std::numeric_limits::infinity(); float best = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 0; index < length; ++index) { const float price = aos[lane * length + index].value; min_price = std::min(min_price, price); best = std::max(best, price - min_price); } total += best; } return total; } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_stock_profit_soa(const float* hot, std::size_t length) { float total = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { const float* values = hot + lane * length; float min_price = std::numeric_limits::infinity(); float best = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 0; index < length; ++index) { min_price = std::min(min_price, values[index]); best = std::max(best, values[index] - min_price); } total += best; } return total; } CACHEBENCH_NOINLINE float simd_stock_profit_aos(const WideRecord* aos, std::size_t length) { __m256 min_values = _mm256_set1_ps(std::numeric_limits::infinity()); __m256 best_values = _mm256_setzero_ps(); for (std::size_t index = 0; index < length; ++index) { const __m256 prices = gather_hot_values(aos, length, index); min_values = _mm256_min_ps(min_values, prices); best_values = _mm256_max_ps(best_values, _mm256_sub_ps(prices, min_values)); } return horizontal_sum_ps(best_values); } CACHEBENCH_NOINLINE float simd_stock_profit_soa(const float* hot, std::size_t length) { __m256 min_values = _mm256_set1_ps(std::numeric_limits::infinity()); __m256 best_values = _mm256_setzero_ps(); for (std::size_t index = 0; index < length; ++index) { const __m256 prices = _mm256_load_ps(hot + index * kBatchLanes); min_values = _mm256_min_ps(min_values, prices); best_values = _mm256_max_ps(best_values, _mm256_sub_ps(prices, min_values)); } return horizontal_sum_ps(best_values); } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_max_subarray_aos(const WideRecord* aos, std::size_t length) { float total = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { float current = aos[lane * length].value; float best = current; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 1; index < length; ++index) { const float value = aos[lane * length + index].value; current = std::max(value, current + value); best = std::max(best, current); } total += best; } return total; } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_max_subarray_soa(const float* hot, std::size_t length) { float total = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { const float* values = hot + lane * length; float current = values[0]; float best = current; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 1; index < length; ++index) { current = std::max(values[index], current + values[index]); best = std::max(best, current); } total += best; } return total; } CACHEBENCH_NOINLINE float simd_max_subarray_aos(const WideRecord* aos, std::size_t length) { __m256 current = gather_hot_values(aos, length, 0); __m256 best = current; for (std::size_t index = 1; index < length; ++index) { const __m256 values = gather_hot_values(aos, length, index); current = _mm256_max_ps(values, _mm256_add_ps(current, values)); best = _mm256_max_ps(best, current); } return horizontal_sum_ps(best); } CACHEBENCH_NOINLINE float simd_max_subarray_soa(const float* hot, std::size_t length) { __m256 current = _mm256_load_ps(hot); __m256 best = current; for (std::size_t index = 1; index < length; ++index) { const __m256 values = _mm256_load_ps(hot + index * kBatchLanes); current = _mm256_max_ps(values, _mm256_add_ps(current, values)); best = _mm256_max_ps(best, current); } return horizontal_sum_ps(best); } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_trapping_rain_aos(const WideRecord* aos, std::size_t length) { std::vector left(length); float total = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { const WideRecord* values = aos + lane * length; float left_max = values[0].value; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 0; index < length; ++index) { left_max = std::max(left_max, values[index].value); left[index] = left_max; } float right_max = values[length - 1].value; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = length; index-- > 0;) { right_max = std::max(right_max, values[index].value); total += std::max(0.0f, std::min(left[index], right_max) - values[index].value); } } return total; } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_trapping_rain_soa(const float* hot, std::size_t length) { std::vector left(length); float total = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { const float* values = hot + lane * length; float left_max = values[0]; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 0; index < length; ++index) { left_max = std::max(left_max, values[index]); left[index] = left_max; } float right_max = values[length - 1]; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = length; index-- > 0;) { right_max = std::max(right_max, values[index]); total += std::max(0.0f, std::min(left[index], right_max) - values[index]); } } return total; } CACHEBENCH_NOINLINE float simd_trapping_rain_aos(const WideRecord* aos, std::size_t length, float* left_temp) { __m256 left_max = _mm256_set1_ps(std::numeric_limits::lowest()); for (std::size_t index = 0; index < length; ++index) { left_max = _mm256_max_ps(left_max, gather_hot_values(aos, length, index)); _mm256_store_ps(left_temp + index * kBatchLanes, left_max); } __m256 right_max = _mm256_set1_ps(std::numeric_limits::lowest()); __m256 total = _mm256_setzero_ps(); for (std::size_t index = length; index-- > 0;) { const __m256 values = gather_hot_values(aos, length, index); right_max = _mm256_max_ps(right_max, values); const __m256 left_values = _mm256_load_ps(left_temp + index * kBatchLanes); const __m256 water = _mm256_sub_ps(_mm256_min_ps(left_values, right_max), values); total = _mm256_add_ps(total, _mm256_max_ps(water, _mm256_setzero_ps())); } return horizontal_sum_ps(total); } CACHEBENCH_NOINLINE float simd_trapping_rain_soa(const float* hot, std::size_t length, float* left_temp) { __m256 left_max = _mm256_set1_ps(std::numeric_limits::lowest()); for (std::size_t index = 0; index < length; ++index) { const __m256 values = _mm256_load_ps(hot + index * kBatchLanes); left_max = _mm256_max_ps(left_max, values); _mm256_store_ps(left_temp + index * kBatchLanes, left_max); } __m256 right_max = _mm256_set1_ps(std::numeric_limits::lowest()); __m256 total = _mm256_setzero_ps(); for (std::size_t index = length; index-- > 0;) { const __m256 values = _mm256_load_ps(hot + index * kBatchLanes); right_max = _mm256_max_ps(right_max, values); const __m256 left_values = _mm256_load_ps(left_temp + index * kBatchLanes); const __m256 water = _mm256_sub_ps(_mm256_min_ps(left_values, right_max), values); total = _mm256_add_ps(total, _mm256_max_ps(water, _mm256_setzero_ps())); } return horizontal_sum_ps(total); } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_sliding_max_aos(const WideRecord* aos, std::size_t length) { float total = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { const WideRecord* values = aos + lane * length; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t start = 0; start + kWindowWidth <= length; ++start) { float window_max = values[start].value; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t offset = 1; offset < kWindowWidth; ++offset) { window_max = std::max(window_max, values[start + offset].value); } total += window_max; } } return total; } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_sliding_max_soa(const float* hot, std::size_t length) { float total = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { const float* values = hot + lane * length; std::deque deque; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 0; index < length; ++index) { while (!deque.empty() && deque.front() + kWindowWidth <= index) { deque.pop_front(); } while (!deque.empty() && values[deque.back()] <= values[index]) { deque.pop_back(); } deque.push_back(index); if (index + 1 >= kWindowWidth) { total += values[deque.front()]; } } } return total; } CACHEBENCH_NOINLINE float simd_sliding_max_aos(const WideRecord* aos, std::size_t length) { __m256 total = _mm256_setzero_ps(); for (std::size_t start = 0; start + kWindowWidth <= length; ++start) { __m256 window_max = gather_hot_values(aos, length, start); for (std::size_t offset = 1; offset < kWindowWidth; ++offset) { window_max = _mm256_max_ps(window_max, gather_hot_values(aos, length, start + offset)); } total = _mm256_add_ps(total, window_max); } return horizontal_sum_ps(total); } CACHEBENCH_NOINLINE float simd_sliding_max_soa(const float* hot, std::size_t length, float* left_temp, float* right_temp) { __m256 block_left = _mm256_setzero_ps(); for (std::size_t index = 0; index < length; ++index) { const __m256 values = _mm256_load_ps(hot + index * kBatchLanes); if (index % kWindowWidth == 0) { block_left = values; } else { block_left = _mm256_max_ps(block_left, values); } _mm256_store_ps(left_temp + index * kBatchLanes, block_left); } __m256 block_right = _mm256_setzero_ps(); for (std::size_t reverse = length; reverse-- > 0;) { const __m256 values = _mm256_load_ps(hot + reverse * kBatchLanes); if (reverse == length - 1 || ((reverse + 1) % kWindowWidth == 0)) { block_right = values; } else { block_right = _mm256_max_ps(block_right, values); } _mm256_store_ps(right_temp + reverse * kBatchLanes, block_right); } __m256 total = _mm256_setzero_ps(); for (std::size_t start = 0; start + kWindowWidth <= length; ++start) { const __m256 left_value = _mm256_load_ps(left_temp + (start + kWindowWidth - 1) * kBatchLanes); const __m256 right_value = _mm256_load_ps(right_temp + start * kBatchLanes); total = _mm256_add_ps(total, _mm256_max_ps(left_value, right_value)); } return horizontal_sum_ps(total); } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scalar_product_except_self_aos( const WideRecord* aos, std::size_t length) { std::vector work(length); std::uint64_t total = 0; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { const WideRecord* values = aos + lane * length; std::uint32_t prefix = 1U; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 0; index < length; ++index) { work[index] = prefix; prefix *= values[index].value; } std::uint32_t suffix = 1U; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = length; index-- > 0;) { work[index] *= suffix; suffix *= values[index].value; } total += sample_checksum(work.data(), length); } return total; } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scalar_product_except_self_soa(const std::uint32_t* hot, std::size_t length) { std::vector work(length); std::uint64_t total = 0; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { const std::uint32_t* values = hot + lane * length; std::uint32_t prefix = 1U; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 0; index < length; ++index) { work[index] = prefix; prefix *= values[index]; } std::uint32_t suffix = 1U; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = length; index-- > 0;) { work[index] *= suffix; suffix *= values[index]; } total += sample_checksum(work.data(), length); } return total; } CACHEBENCH_NOINLINE std::uint64_t simd_product_except_self_aos(const WideRecord* aos, std::size_t length, std::uint32_t* temp) { __m256i prefix = _mm256_set1_epi32(1); for (std::size_t index = 0; index < length; ++index) { _mm256_store_si256(reinterpret_cast<__m256i*>(temp + index * kBatchLanes), prefix); prefix = _mm256_mullo_epi32(prefix, gather_hot_u32_values(aos, length, index)); } __m256i suffix = _mm256_set1_epi32(1); for (std::size_t index = length; index-- > 0;) { const __m256i prefix_values = _mm256_load_si256(reinterpret_cast(temp + index * kBatchLanes)); const __m256i output = _mm256_mullo_epi32(prefix_values, suffix); _mm256_store_si256(reinterpret_cast<__m256i*>(temp + index * kBatchLanes), output); suffix = _mm256_mullo_epi32(suffix, gather_hot_u32_values(aos, length, index)); } return sample_checksum(temp, length * kBatchLanes); } CACHEBENCH_NOINLINE std::uint64_t simd_product_except_self_soa(const std::uint32_t* hot, std::size_t length, std::uint32_t* temp) { __m256i prefix = _mm256_set1_epi32(1); for (std::size_t index = 0; index < length; ++index) { _mm256_store_si256(reinterpret_cast<__m256i*>(temp + index * kBatchLanes), prefix); prefix = _mm256_mullo_epi32(prefix, _mm256_load_si256(reinterpret_cast(hot + index * kBatchLanes))); } __m256i suffix = _mm256_set1_epi32(1); for (std::size_t index = length; index-- > 0;) { const __m256i prefix_values = _mm256_load_si256(reinterpret_cast(temp + index * kBatchLanes)); const __m256i output = _mm256_mullo_epi32(prefix_values, suffix); _mm256_store_si256(reinterpret_cast<__m256i*>(temp + index * kBatchLanes), output); suffix = _mm256_mullo_epi32(suffix, _mm256_load_si256(reinterpret_cast(hot + index * kBatchLanes))); } return sample_checksum(temp, length * kBatchLanes); } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void transpose_scalar_naive(const float* src, float* dst, std::size_t dim) { CACHEBENCH_NO_VECTOR_LOOP for (std::size_t row = 0; row < dim; ++row) { CACHEBENCH_NO_VECTOR_LOOP for (std::size_t col = 0; col < dim; ++col) { dst[col * dim + row] = src[row * dim + col]; } } } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void transpose_scalar_blocked(const float* src, float* dst, std::size_t dim) { CACHEBENCH_NO_VECTOR_LOOP for (std::size_t row_block = 0; row_block < dim; row_block += kTransposeBlock) { CACHEBENCH_NO_VECTOR_LOOP for (std::size_t col_block = 0; col_block < dim; col_block += kTransposeBlock) { const std::size_t row_limit = std::min(dim, row_block + kTransposeBlock); const std::size_t col_limit = std::min(dim, col_block + kTransposeBlock); CACHEBENCH_NO_VECTOR_LOOP for (std::size_t row = row_block; row < row_limit; ++row) { CACHEBENCH_NO_VECTOR_LOOP for (std::size_t col = col_block; col < col_limit; ++col) { dst[col * dim + row] = src[row * dim + col]; } } } } } void transpose4x4_simd(const float* src, std::size_t src_stride, float* dst, std::size_t dst_stride) { __m128 row0 = _mm_loadu_ps(src + 0 * src_stride); __m128 row1 = _mm_loadu_ps(src + 1 * src_stride); __m128 row2 = _mm_loadu_ps(src + 2 * src_stride); __m128 row3 = _mm_loadu_ps(src + 3 * src_stride); _MM_TRANSPOSE4_PS(row0, row1, row2, row3); _mm_storeu_ps(dst + 0 * dst_stride, row0); _mm_storeu_ps(dst + 1 * dst_stride, row1); _mm_storeu_ps(dst + 2 * dst_stride, row2); _mm_storeu_ps(dst + 3 * dst_stride, row3); } CACHEBENCH_NOINLINE void transpose_simd_naive(const float* src, float* dst, std::size_t dim) { const std::size_t simd_dim = dim - (dim % 4); for (std::size_t row = 0; row < simd_dim; row += 4) { for (std::size_t col = 0; col < simd_dim; col += 4) { transpose4x4_simd(src + row * dim + col, dim, dst + col * dim + row, dim); } for (std::size_t col = simd_dim; col < dim; ++col) { for (std::size_t lane = 0; lane < 4 && row + lane < dim; ++lane) { dst[col * dim + (row + lane)] = src[(row + lane) * dim + col]; } } } for (std::size_t row = simd_dim; row < dim; ++row) { for (std::size_t col = 0; col < dim; ++col) { dst[col * dim + row] = src[row * dim + col]; } } } CACHEBENCH_NOINLINE void transpose_simd_blocked(const float* src, float* dst, std::size_t dim) { const std::size_t simd_dim = dim - (dim % 4); for (std::size_t row_block = 0; row_block < dim; row_block += kTransposeBlock) { for (std::size_t col_block = 0; col_block < dim; col_block += kTransposeBlock) { const std::size_t row_limit = std::min(dim, row_block + kTransposeBlock); const std::size_t col_limit = std::min(dim, col_block + kTransposeBlock); std::size_t row = row_block; for (; row + 4 <= row_limit && row < simd_dim; row += 4) { std::size_t col = col_block; for (; col + 4 <= col_limit && col < simd_dim; col += 4) { transpose4x4_simd(src + row * dim + col, dim, dst + col * dim + row, dim); } for (; col < col_limit; ++col) { for (std::size_t lane = 0; lane < 4; ++lane) { dst[col * dim + (row + lane)] = src[(row + lane) * dim + col]; } } } for (; row < row_limit; ++row) { for (std::size_t col = col_block; col < col_limit; ++col) { dst[col * dim + row] = src[row * dim + col]; } } } } } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR int scalar_rotated_search_aos_value( const WideRecord* values, std::size_t length, std::uint32_t target) { int left = 0; int right = static_cast(length) - 1; while (left <= right) { const int mid = left + (right - left) / 2; const std::uint32_t left_value = values[left].value; const std::uint32_t mid_value = values[mid].value; const std::uint32_t right_value = values[right].value; if (mid_value == target) { return mid; } if (left_value <= mid_value) { if (left_value <= target && target < mid_value) { right = mid - 1; } else { left = mid + 1; } } else { if (mid_value < target && target <= right_value) { left = mid + 1; } else { right = mid - 1; } } } return -1; } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR int scalar_rotated_search_soa_value( const std::uint32_t* values, std::size_t length, std::uint32_t target) { int left = 0; int right = static_cast(length) - 1; while (left <= right) { const int mid = left + (right - left) / 2; const std::uint32_t left_value = values[left]; const std::uint32_t mid_value = values[mid]; const std::uint32_t right_value = values[right]; if (mid_value == target) { return mid; } if (left_value <= mid_value) { if (left_value <= target && target < mid_value) { right = mid - 1; } else { left = mid + 1; } } else { if (mid_value < target && target <= right_value) { left = mid + 1; } else { right = mid - 1; } } } return -1; } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scalar_rotated_search_aos( const WideRecord* aos, const std::uint32_t* queries, std::size_t length, std::size_t query_count) { std::uint64_t sum = 0; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { const WideRecord* values = aos + lane * length; const std::uint32_t* lane_queries = queries + lane * query_count; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t query = 0; query < query_count; ++query) { sum += static_cast(scalar_rotated_search_aos_value(values, length, lane_queries[query]) + 1); } } return sum; } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scalar_rotated_search_soa( const std::uint32_t* hot, const std::uint32_t* queries, std::size_t length, std::size_t query_count) { std::uint64_t sum = 0; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { const std::uint32_t* values = hot + lane * length; const std::uint32_t* lane_queries = queries + lane * query_count; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t query = 0; query < query_count; ++query) { sum += static_cast(scalar_rotated_search_soa_value(values, length, lane_queries[query]) + 1); } } return sum; } template std::uint64_t simd_rotated_search_common(const std::uint32_t* interleaved_queries, std::size_t length, std::size_t query_count, GatherFn&& gather_values) { const __m256i zero = _mm256_setzero_si256(); const __m256i one = _mm256_set1_epi32(1); const __m256i all_bits = _mm256_set1_epi32(-1); const __m256i max_index = _mm256_set1_epi32(static_cast(length - 1)); const std::size_t max_iterations = std::bit_width(length) + 2; std::uint64_t sum = 0; alignas(32) std::array result_lanes{}; for (std::size_t query = 0; query < query_count; ++query) { const __m256i target = _mm256_load_si256(reinterpret_cast(interleaved_queries + query * kBatchLanes)); __m256i left = zero; __m256i right = max_index; __m256i result = _mm256_set1_epi32(-1); __m256i active = all_bits; for (std::size_t iteration = 0; iteration < max_iterations && mask_bits(active) != 0; ++iteration) { const __m256i safe_left = _mm256_blendv_epi8(zero, left, active); const __m256i safe_right = _mm256_blendv_epi8(zero, right, active); const __m256i mid = _mm256_srli_epi32(_mm256_add_epi32(safe_left, safe_right), 1); const __m256i mid_values = gather_values(mid); const __m256i equal_mask = _mm256_and_si256(active, _mm256_cmpeq_epi32(mid_values, target)); result = _mm256_blendv_epi8(result, mid, equal_mask); active = _mm256_andnot_si256(equal_mask, active); if (mask_bits(active) == 0) { break; } const __m256i left_values = gather_values(safe_left); const __m256i right_values = gather_values(safe_right); const __m256i left_sorted = cmp_le_epi32(left_values, mid_values); const __m256i target_in_left = _mm256_and_si256(cmp_le_epi32(left_values, target), cmp_lt_epi32(target, mid_values)); const __m256i target_in_right = _mm256_and_si256(cmp_lt_epi32(mid_values, target), cmp_le_epi32(target, right_values)); const __m256i go_right = _mm256_and_si256( active, _mm256_or_si256( _mm256_and_si256(left_sorted, _mm256_andnot_si256(target_in_left, all_bits)), _mm256_andnot_si256(left_sorted, target_in_right))); const __m256i go_left = _mm256_and_si256(active, _mm256_andnot_si256(go_right, all_bits)); left = _mm256_blendv_epi8(left, _mm256_add_epi32(mid, one), go_right); right = _mm256_blendv_epi8(right, _mm256_sub_epi32(mid, one), go_left); active = _mm256_and_si256(active, cmp_le_epi32(left, right)); } _mm256_store_si256(reinterpret_cast<__m256i*>(result_lanes.data()), result); for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { sum += static_cast(result_lanes[lane] + 1); } } return sum; } CACHEBENCH_NOINLINE std::uint64_t simd_rotated_search_aos(const WideRecord* aos, const std::uint32_t* interleaved_queries, std::size_t length, std::size_t query_count) { return simd_rotated_search_common( interleaved_queries, length, query_count, [aos, length](__m256i indices) { return gather_rotated_aos_values(aos, length, indices); }); } CACHEBENCH_NOINLINE std::uint64_t simd_rotated_search_soa(const std::uint32_t* interleaved_hot, const std::uint32_t* interleaved_queries, std::size_t length, std::size_t query_count) { return simd_rotated_search_common( interleaved_queries, length, query_count, [interleaved_hot](__m256i indices) { return gather_rotated_soa_values(interleaved_hot, indices); }); } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void sort012_scalar_aos(Sort012Record* values, std::size_t length) { std::size_t low = 0; std::size_t mid = 0; std::size_t high = length - 1; while (mid <= high) { if (values[mid].value == 0U) { std::swap(values[low++], values[mid++]); } else if (values[mid].value == 1U) { ++mid; } else { std::swap(values[mid], values[high--]); } } } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t sort012_scalar_soa(const std::uint32_t* input, std::uint32_t* output, std::size_t length) { std::array counts{}; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 0; index < length; ++index) { ++counts[input[index]]; } std::fill_n(output, counts[0], 0U); std::fill_n(output + counts[0], counts[1], 1U); std::fill_n(output + counts[0] + counts[1], counts[2], 2U); return sample_checksum(output, length); } CACHEBENCH_NOINLINE std::uint64_t sort012_simd_aos(const Sort012Record* input, std::uint32_t* output, std::size_t length) { constexpr std::int32_t kStrideWords = static_cast(sizeof(Sort012Record) / sizeof(std::uint32_t)); const int* base = reinterpret_cast(input); const __m256i zero = _mm256_setzero_si256(); const __m256i one = _mm256_set1_epi32(1); const __m256i two = _mm256_set1_epi32(2); std::array counts{}; std::size_t index = 0; for (; index + kBatchLanes <= length; index += kBatchLanes) { const __m256i gather_indices = _mm256_setr_epi32(static_cast((index + 0) * kStrideWords), static_cast((index + 1) * kStrideWords), static_cast((index + 2) * kStrideWords), static_cast((index + 3) * kStrideWords), static_cast((index + 4) * kStrideWords), static_cast((index + 5) * kStrideWords), static_cast((index + 6) * kStrideWords), static_cast((index + 7) * kStrideWords)); const __m256i values = _mm256_i32gather_epi32(base, gather_indices, sizeof(std::uint32_t)); counts[0] += static_cast(std::popcount(static_cast(mask_bits(_mm256_cmpeq_epi32(values, zero))))); counts[1] += static_cast(std::popcount(static_cast(mask_bits(_mm256_cmpeq_epi32(values, one))))); counts[2] += static_cast(std::popcount(static_cast(mask_bits(_mm256_cmpeq_epi32(values, two))))); } for (; index < length; ++index) { ++counts[input[index].value]; } std::fill_n(output, counts[0], 0U); std::fill_n(output + counts[0], counts[1], 1U); std::fill_n(output + counts[0] + counts[1], counts[2], 2U); return sample_checksum(output, length); } CACHEBENCH_NOINLINE std::uint64_t sort012_simd_soa(const std::uint32_t* input, std::uint32_t* output, std::size_t length) { const __m256i zero = _mm256_setzero_si256(); const __m256i one = _mm256_set1_epi32(1); const __m256i two = _mm256_set1_epi32(2); std::array counts{}; std::size_t index = 0; for (; index + kBatchLanes <= length; index += kBatchLanes) { const __m256i values = _mm256_load_si256(reinterpret_cast(input + index)); counts[0] += static_cast(std::popcount(static_cast(mask_bits(_mm256_cmpeq_epi32(values, zero))))); counts[1] += static_cast(std::popcount(static_cast(mask_bits(_mm256_cmpeq_epi32(values, one))))); counts[2] += static_cast(std::popcount(static_cast(mask_bits(_mm256_cmpeq_epi32(values, two))))); } for (; index < length; ++index) { ++counts[input[index]]; } std::fill_n(output, counts[0], 0U); std::fill_n(output + counts[0], counts[1], 1U); std::fill_n(output + counts[0] + counts[1], counts[2], 2U); return sample_checksum(output, length); } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t bfs_scalar_adjlist(const GraphAdjList& graph, std::size_t passes) { const std::size_t vertices = graph.edges.size(); std::vector queue(vertices); std::vector visited(vertices, 0); std::uint64_t total = 0; for (std::size_t pass = 0; pass < passes; ++pass) { std::fill(visited.begin(), visited.end(), 0); const std::uint32_t source = static_cast((pass * 17) % vertices); std::size_t head = 0; std::size_t tail = 0; visited[source] = 1; queue[tail++] = source; while (head < tail) { const std::uint32_t node = queue[head++]; total += node; for (const GraphEdgeRecord& edge : graph.edges[node]) { if (visited[edge.dst] == 0) { visited[edge.dst] = 1; queue[tail++] = edge.dst; } } } total += tail; } return total; } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t bfs_scalar_csr(const GraphCsr& graph, std::size_t passes) { const std::size_t vertices = graph.offsets.size() - 1; std::vector queue(vertices); std::vector visited(vertices, 0); std::uint64_t total = 0; for (std::size_t pass = 0; pass < passes; ++pass) { std::fill(visited.begin(), visited.end(), 0); const std::uint32_t source = static_cast((pass * 17) % vertices); std::size_t head = 0; std::size_t tail = 0; visited[source] = 1; queue[tail++] = source; while (head < tail) { const std::uint32_t node = queue[head++]; total += node; for (std::uint32_t edge = graph.offsets[node]; edge < graph.offsets[node + 1]; ++edge) { const std::uint32_t next = graph.edges[edge]; if (visited[next] == 0) { visited[next] = 1; queue[tail++] = next; } } } total += tail; } return total; } CACHEBENCH_NOINLINE std::uint64_t bfs_simd_adjlist(const GraphAdjList& graph, std::size_t passes) { const std::size_t vertices = graph.edges.size(); std::vector queue(vertices); std::vector visited(vertices, 0); std::uint64_t total = 0; alignas(32) std::array dst_lanes{}; constexpr std::int32_t kStrideWords = static_cast(sizeof(GraphEdgeRecord) / sizeof(std::uint32_t)); const __m256i zero = _mm256_setzero_si256(); for (std::size_t pass = 0; pass < passes; ++pass) { std::fill(visited.begin(), visited.end(), 0); const std::uint32_t source = static_cast((pass * 17) % vertices); std::size_t head = 0; std::size_t tail = 0; visited[source] = 1; queue[tail++] = source; while (head < tail) { const std::uint32_t node = queue[head++]; total += node; const auto& edges = graph.edges[node]; const int* base = reinterpret_cast(edges.data()); std::size_t edge = 0; for (; edge + kBatchLanes <= edges.size(); edge += kBatchLanes) { const __m256i edge_indices = _mm256_setr_epi32(static_cast((edge + 0) * kStrideWords), static_cast((edge + 1) * kStrideWords), static_cast((edge + 2) * kStrideWords), static_cast((edge + 3) * kStrideWords), static_cast((edge + 4) * kStrideWords), static_cast((edge + 5) * kStrideWords), static_cast((edge + 6) * kStrideWords), static_cast((edge + 7) * kStrideWords)); const __m256i next_nodes = _mm256_i32gather_epi32(base, edge_indices, sizeof(std::uint32_t)); const __m256i seen = _mm256_i32gather_epi32(visited.data(), next_nodes, static_cast(sizeof(int))); _mm256_store_si256(reinterpret_cast<__m256i*>(dst_lanes.data()), next_nodes); const int unseen_mask = mask_bits(_mm256_cmpeq_epi32(seen, zero)); for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { if (((unseen_mask >> lane) & 1) != 0 && visited[dst_lanes[lane]] == 0) { visited[dst_lanes[lane]] = 1; queue[tail++] = dst_lanes[lane]; } } } for (; edge < edges.size(); ++edge) { const std::uint32_t next = edges[edge].dst; if (visited[next] == 0) { visited[next] = 1; queue[tail++] = next; } } } total += tail; } return total; } CACHEBENCH_NOINLINE std::uint64_t bfs_simd_csr(const GraphCsr& graph, std::size_t passes) { const std::size_t vertices = graph.offsets.size() - 1; std::vector queue(vertices); std::vector visited(vertices, 0); std::uint64_t total = 0; alignas(32) std::array dst_lanes{}; const __m256i zero = _mm256_setzero_si256(); for (std::size_t pass = 0; pass < passes; ++pass) { std::fill(visited.begin(), visited.end(), 0); const std::uint32_t source = static_cast((pass * 17) % vertices); std::size_t head = 0; std::size_t tail = 0; visited[source] = 1; queue[tail++] = source; while (head < tail) { const std::uint32_t node = queue[head++]; total += node; std::uint32_t edge = graph.offsets[node]; const std::uint32_t edge_limit = graph.offsets[node + 1]; for (; edge + kBatchLanes <= edge_limit; edge += static_cast(kBatchLanes)) { const __m256i next_nodes = _mm256_loadu_si256(reinterpret_cast(graph.edges.data() + edge)); const __m256i seen = _mm256_i32gather_epi32(visited.data(), next_nodes, static_cast(sizeof(int))); _mm256_store_si256(reinterpret_cast<__m256i*>(dst_lanes.data()), next_nodes); const int unseen_mask = mask_bits(_mm256_cmpeq_epi32(seen, zero)); for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { if (((unseen_mask >> lane) & 1) != 0 && visited[dst_lanes[lane]] == 0) { visited[dst_lanes[lane]] = 1; queue[tail++] = dst_lanes[lane]; } } } for (; edge < edge_limit; ++edge) { const std::uint32_t next = graph.edges[edge]; if (visited[next] == 0) { visited[next] = 1; queue[tail++] = next; } } } total += tail; } return total; } BenchmarkInstance make_rotated_search_benchmark(std::size_t requested_size_bytes, RotatedSearchMode mode) { auto dataset = std::make_shared(build_rotated_search_dataset(requested_size_bytes)); BenchmarkInstance instance; instance.actual_size_bytes = dataset->logical_size_bytes; instance.elements = dataset->query_count * kBatchLanes; instance.threads_used = 1; instance.bytes_per_pass = dataset->logical_size_bytes + dataset->query_count * kBatchLanes * sizeof(std::uint32_t); instance.target_bytes_per_trial = kSequenceTargetBytes; instance.run = [dataset, mode](std::size_t passes) { std::uint64_t sum = 0; for (std::size_t pass = 0; pass < passes; ++pass) { switch (mode) { case RotatedSearchMode::kScalarAos: sum += scalar_rotated_search_aos( dataset->aos->data(), dataset->lane_major_queries->data(), dataset->length, dataset->query_count); break; case RotatedSearchMode::kScalarSoa: sum += scalar_rotated_search_soa(dataset->lane_major_hot->data(), dataset->lane_major_queries->data(), dataset->length, dataset->query_count); break; case RotatedSearchMode::kSimdAos: sum += simd_rotated_search_aos( dataset->aos->data(), dataset->interleaved_queries->data(), dataset->length, dataset->query_count); break; case RotatedSearchMode::kSimdSoa: sum += simd_rotated_search_soa(dataset->interleaved_hot->data(), dataset->interleaved_queries->data(), dataset->length, dataset->query_count); break; } } return sum; }; return instance; } BenchmarkInstance make_sort012_benchmark(std::size_t requested_size_bytes, Sort012Mode mode) { auto dataset = std::make_shared(build_sort012_dataset(requested_size_bytes)); auto work_aos = std::make_shared>(dataset->length); auto output = std::make_shared>(dataset->length); BenchmarkInstance instance; instance.actual_size_bytes = dataset->logical_size_bytes; instance.elements = dataset->length; instance.threads_used = 1; instance.bytes_per_pass = dataset->logical_size_bytes * 2ULL; instance.target_bytes_per_trial = kSequenceTargetBytes; instance.run = [dataset, work_aos, output, mode](std::size_t passes) { std::uint64_t sum = 0; for (std::size_t pass = 0; pass < passes; ++pass) { switch (mode) { case Sort012Mode::kScalarAos: std::memcpy(work_aos->data(), dataset->input_aos->data(), dataset->length * sizeof(Sort012Record)); sort012_scalar_aos(work_aos->data(), dataset->length); sum ^= sample_sort012_record_values(work_aos->data(), dataset->length); break; case Sort012Mode::kScalarSoa: sum ^= sort012_scalar_soa(dataset->input_soa->data(), output->data(), dataset->length); break; case Sort012Mode::kSimdAos: sum ^= sort012_simd_aos(dataset->input_aos->data(), output->data(), dataset->length); break; case Sort012Mode::kSimdSoa: sum ^= sort012_simd_soa(dataset->input_soa->data(), output->data(), dataset->length); break; } } return sum; }; return instance; } BenchmarkInstance make_bfs_benchmark(std::size_t requested_size_bytes, BfsMode mode) { const std::size_t vertices = std::max( 512, std::min(requested_size_bytes / (kDsaGraphDegree * sizeof(GraphEdgeRecord)), 1ULL << 17)); auto adjlist = build_dsa_graph_adjlist(vertices); auto csr = build_dsa_graph_csr(*adjlist); const std::size_t logical_size_bytes = adjlist->edges.size() * kDsaGraphDegree * sizeof(GraphEdgeRecord) + csr->offsets.size() * sizeof(std::uint32_t); BenchmarkInstance instance; instance.actual_size_bytes = logical_size_bytes; instance.elements = vertices; instance.threads_used = 1; instance.bytes_per_pass = logical_size_bytes; instance.target_bytes_per_trial = kSequenceTargetBytes / 2ULL; instance.run = [adjlist, csr, mode](std::size_t passes) { switch (mode) { case BfsMode::kScalarAdjList: return bfs_scalar_adjlist(*adjlist, passes); case BfsMode::kScalarCsr: return bfs_scalar_csr(*csr, passes); case BfsMode::kSimdAdjList: return bfs_simd_adjlist(*adjlist, passes); case BfsMode::kSimdCsr: return bfs_simd_csr(*csr, passes); } return std::uint64_t{0}; }; return instance; } BenchmarkInstance make_stock_profit_benchmark(std::size_t requested_size_bytes, StockMode mode) { auto dataset = std::make_shared>( build_sequence_dataset(requested_size_bytes, 256, stock_price_value)); BenchmarkInstance instance; instance.actual_size_bytes = dataset->logical_size_bytes; instance.elements = dataset->total_values; instance.threads_used = 1; instance.bytes_per_pass = dataset->logical_size_bytes; instance.target_bytes_per_trial = kSequenceTargetBytes; instance.run = [dataset, mode](std::size_t passes) { float sum = 0.0f; for (std::size_t pass = 0; pass < passes; ++pass) { switch (mode) { case StockMode::kScalarAos: sum += scalar_stock_profit_aos(dataset->aos->data(), dataset->length); break; case StockMode::kScalarSoa: sum += scalar_stock_profit_soa(dataset->lane_major_hot->data(), dataset->length); break; case StockMode::kSimdAos: sum += simd_stock_profit_aos(dataset->aos->data(), dataset->length); break; case StockMode::kSimdSoa: sum += simd_stock_profit_soa(dataset->interleaved_hot->data(), dataset->length); break; } } return static_cast(std::bit_cast(sum)); }; return instance; } BenchmarkInstance make_max_subarray_benchmark(std::size_t requested_size_bytes, MaxSubarrayMode mode) { auto dataset = std::make_shared>( build_sequence_dataset(requested_size_bytes, 256, max_subarray_value)); BenchmarkInstance instance; instance.actual_size_bytes = dataset->logical_size_bytes; instance.elements = dataset->total_values; instance.threads_used = 1; instance.bytes_per_pass = dataset->logical_size_bytes; instance.target_bytes_per_trial = kSequenceTargetBytes; instance.run = [dataset, mode](std::size_t passes) { float sum = 0.0f; for (std::size_t pass = 0; pass < passes; ++pass) { switch (mode) { case MaxSubarrayMode::kScalarAos: sum += scalar_max_subarray_aos(dataset->aos->data(), dataset->length); break; case MaxSubarrayMode::kScalarSoa: sum += scalar_max_subarray_soa(dataset->lane_major_hot->data(), dataset->length); break; case MaxSubarrayMode::kSimdAos: sum += simd_max_subarray_aos(dataset->aos->data(), dataset->length); break; case MaxSubarrayMode::kSimdSoa: sum += simd_max_subarray_soa(dataset->interleaved_hot->data(), dataset->length); break; } } return static_cast(std::bit_cast(sum)); }; return instance; } BenchmarkInstance make_trapping_rain_benchmark(std::size_t requested_size_bytes, TrappingRainMode mode) { auto dataset = std::make_shared>( build_sequence_dataset(requested_size_bytes, 256, height_value)); auto left_temp = std::make_shared>(dataset->total_values); BenchmarkInstance instance; instance.actual_size_bytes = dataset->logical_size_bytes; instance.elements = dataset->total_values; instance.threads_used = 1; instance.bytes_per_pass = dataset->logical_size_bytes * 2ULL; instance.target_bytes_per_trial = kSequenceTargetBytes; instance.run = [dataset, left_temp, mode](std::size_t passes) { float sum = 0.0f; for (std::size_t pass = 0; pass < passes; ++pass) { switch (mode) { case TrappingRainMode::kScalarAos: sum += scalar_trapping_rain_aos(dataset->aos->data(), dataset->length); break; case TrappingRainMode::kScalarSoa: sum += scalar_trapping_rain_soa(dataset->lane_major_hot->data(), dataset->length); break; case TrappingRainMode::kSimdAos: sum += simd_trapping_rain_aos(dataset->aos->data(), dataset->length, left_temp->data()); break; case TrappingRainMode::kSimdSoa: sum += simd_trapping_rain_soa(dataset->interleaved_hot->data(), dataset->length, left_temp->data()); break; } } return static_cast(std::bit_cast(sum)); }; return instance; } BenchmarkInstance make_sliding_window_benchmark(std::size_t requested_size_bytes, SlidingWindowMode mode) { auto dataset = std::make_shared>( build_sequence_dataset(requested_size_bytes, 256, window_value)); auto left_temp = std::make_shared>(dataset->total_values); auto right_temp = std::make_shared>(dataset->total_values); BenchmarkInstance instance; instance.actual_size_bytes = dataset->logical_size_bytes; instance.elements = dataset->total_values; instance.threads_used = 1; instance.bytes_per_pass = dataset->logical_size_bytes * 2ULL; instance.target_bytes_per_trial = kSequenceTargetBytes; instance.run = [dataset, left_temp, right_temp, mode](std::size_t passes) { float sum = 0.0f; for (std::size_t pass = 0; pass < passes; ++pass) { switch (mode) { case SlidingWindowMode::kScalarAos: sum += scalar_sliding_max_aos(dataset->aos->data(), dataset->length); break; case SlidingWindowMode::kScalarSoa: sum += scalar_sliding_max_soa(dataset->lane_major_hot->data(), dataset->length); break; case SlidingWindowMode::kSimdAos: sum += simd_sliding_max_aos(dataset->aos->data(), dataset->length); break; case SlidingWindowMode::kSimdSoa: sum += simd_sliding_max_soa( dataset->interleaved_hot->data(), dataset->length, left_temp->data(), right_temp->data()); break; } } return static_cast(std::bit_cast(sum)); }; return instance; } BenchmarkInstance make_product_except_self_benchmark(std::size_t requested_size_bytes, ProductExceptSelfMode mode) { auto dataset = std::make_shared>( build_sequence_dataset(requested_size_bytes, 256, product_value)); auto temp = std::make_shared>(dataset->total_values); BenchmarkInstance instance; instance.actual_size_bytes = dataset->logical_size_bytes; instance.elements = dataset->total_values; instance.threads_used = 1; instance.bytes_per_pass = dataset->logical_size_bytes * 2ULL; instance.target_bytes_per_trial = kSequenceTargetBytes; instance.run = [dataset, temp, mode](std::size_t passes) { std::uint64_t sum = 0; for (std::size_t pass = 0; pass < passes; ++pass) { switch (mode) { case ProductExceptSelfMode::kScalarAos: sum ^= scalar_product_except_self_aos(dataset->aos->data(), dataset->length); break; case ProductExceptSelfMode::kScalarSoa: sum ^= scalar_product_except_self_soa(dataset->lane_major_hot->data(), dataset->length); break; case ProductExceptSelfMode::kSimdAos: sum ^= simd_product_except_self_aos(dataset->aos->data(), dataset->length, temp->data()); break; case ProductExceptSelfMode::kSimdSoa: sum ^= simd_product_except_self_soa(dataset->interleaved_hot->data(), dataset->length, temp->data()); break; } } return sum; }; return instance; } BenchmarkInstance make_transpose_benchmark(std::size_t requested_size_bytes, TransposeMode mode) { const std::size_t elements = std::max(64, requested_size_bytes / (sizeof(float) * 2ULL)); const std::size_t dim = std::max(64, static_cast(std::sqrt(static_cast(elements))) & ~3ULL); const std::size_t matrix_elements = dim * dim; auto src = std::make_shared>(matrix_elements); auto dst = std::make_shared>(matrix_elements); for (std::size_t index = 0; index < matrix_elements; ++index) { (*src)[index] = seeded_float(index, 31.0f); (*dst)[index] = 0.0f; } BenchmarkInstance instance; instance.actual_size_bytes = matrix_elements * sizeof(float) * 2ULL; instance.elements = matrix_elements; instance.threads_used = 1; instance.bytes_per_pass = instance.actual_size_bytes; instance.target_bytes_per_trial = kTransposeTargetBytes; instance.run = [src, dst, dim, matrix_elements, mode](std::size_t passes) { std::uint64_t sum = 0; for (std::size_t pass = 0; pass < passes; ++pass) { switch (mode) { case TransposeMode::kScalarNaive: transpose_scalar_naive(src->data(), dst->data(), dim); break; case TransposeMode::kScalarBlocked: transpose_scalar_blocked(src->data(), dst->data(), dim); break; case TransposeMode::kSimdNaive: transpose_simd_naive(src->data(), dst->data(), dim); break; case TransposeMode::kSimdBlocked: transpose_simd_blocked(src->data(), dst->data(), dim); break; } sum ^= checksum_from_float_buffer(dst->data(), matrix_elements); } return sum; }; return instance; } } // namespace void append_dsa_benchmarks(std::vector& specs) { if (!cpu_supports_avx2()) { return; } specs.push_back({"dsa_scalar_stock_profit_aos", "Best time to buy/sell stock on wide AoS price records.", [](std::size_t size_bytes, std::size_t) { return make_stock_profit_benchmark(size_bytes, StockMode::kScalarAos); }}); specs.push_back({"dsa_scalar_stock_profit_soa", "Best time to buy/sell stock on cache-aware SoA price buffers.", [](std::size_t size_bytes, std::size_t) { return make_stock_profit_benchmark(size_bytes, StockMode::kScalarSoa); }}); specs.push_back({"dsa_simd_stock_profit_aos", "Best time to buy/sell stock with SIMD over AoS price records.", [](std::size_t size_bytes, std::size_t) { return make_stock_profit_benchmark(size_bytes, StockMode::kSimdAos); }}); specs.push_back({"dsa_simd_stock_profit_soa", "Best time to buy/sell stock with cache-aware SIMD layout.", [](std::size_t size_bytes, std::size_t) { return make_stock_profit_benchmark(size_bytes, StockMode::kSimdSoa); }}); specs.push_back({"dsa_scalar_max_subarray_aos", "Kadane maximum subarray on wide AoS records.", [](std::size_t size_bytes, std::size_t) { return make_max_subarray_benchmark(size_bytes, MaxSubarrayMode::kScalarAos); }}); specs.push_back({"dsa_scalar_max_subarray_soa", "Kadane maximum subarray on cache-aware SoA values.", [](std::size_t size_bytes, std::size_t) { return make_max_subarray_benchmark(size_bytes, MaxSubarrayMode::kScalarSoa); }}); specs.push_back({"dsa_simd_max_subarray_aos", "Kadane maximum subarray with SIMD gathers over AoS values.", [](std::size_t size_bytes, std::size_t) { return make_max_subarray_benchmark(size_bytes, MaxSubarrayMode::kSimdAos); }}); specs.push_back({"dsa_simd_max_subarray_soa", "Kadane maximum subarray with SIMD on interleaved SoA values.", [](std::size_t size_bytes, std::size_t) { return make_max_subarray_benchmark(size_bytes, MaxSubarrayMode::kSimdSoa); }}); specs.push_back({"dsa_scalar_trapping_rain_aos", "Trapping rain water on wide AoS height records.", [](std::size_t size_bytes, std::size_t) { return make_trapping_rain_benchmark(size_bytes, TrappingRainMode::kScalarAos); }}); specs.push_back({"dsa_scalar_trapping_rain_soa", "Trapping rain water on cache-aware SoA heights.", [](std::size_t size_bytes, std::size_t) { return make_trapping_rain_benchmark(size_bytes, TrappingRainMode::kScalarSoa); }}); specs.push_back({"dsa_simd_trapping_rain_aos", "Trapping rain water with SIMD gathers over AoS heights.", [](std::size_t size_bytes, std::size_t) { return make_trapping_rain_benchmark(size_bytes, TrappingRainMode::kSimdAos); }}); specs.push_back({"dsa_simd_trapping_rain_soa", "Trapping rain water with cache-aware SIMD layout.", [](std::size_t size_bytes, std::size_t) { return make_trapping_rain_benchmark(size_bytes, TrappingRainMode::kSimdSoa); }}); specs.push_back({"dsa_scalar_sliding_window_max_aos", "Naive K-window maximum on wide AoS records.", [](std::size_t size_bytes, std::size_t) { return make_sliding_window_benchmark(size_bytes, SlidingWindowMode::kScalarAos); }}); specs.push_back({"dsa_scalar_sliding_window_max_soa", "Deque-based K-window maximum on cache-aware SoA values.", [](std::size_t size_bytes, std::size_t) { return make_sliding_window_benchmark(size_bytes, SlidingWindowMode::kScalarSoa); }}); specs.push_back({"dsa_simd_sliding_window_max_aos", "Naive K-window maximum with SIMD over AoS values.", [](std::size_t size_bytes, std::size_t) { return make_sliding_window_benchmark(size_bytes, SlidingWindowMode::kSimdAos); }}); specs.push_back({"dsa_simd_sliding_window_max_soa", "Block-based K-window maximum with cache-aware SIMD layout.", [](std::size_t size_bytes, std::size_t) { return make_sliding_window_benchmark(size_bytes, SlidingWindowMode::kSimdSoa); }}); specs.push_back({"dsa_scalar_product_except_self_aos", "Product-except-self on wide AoS integer records.", [](std::size_t size_bytes, std::size_t) { return make_product_except_self_benchmark(size_bytes, ProductExceptSelfMode::kScalarAos); }}); specs.push_back({"dsa_scalar_product_except_self_soa", "Product-except-self on cache-aware SoA integer buffers.", [](std::size_t size_bytes, std::size_t) { return make_product_except_self_benchmark(size_bytes, ProductExceptSelfMode::kScalarSoa); }}); specs.push_back({"dsa_simd_product_except_self_aos", "Product-except-self with SIMD gathers over AoS integer records.", [](std::size_t size_bytes, std::size_t) { return make_product_except_self_benchmark(size_bytes, ProductExceptSelfMode::kSimdAos); }}); specs.push_back({"dsa_simd_product_except_self_soa", "Product-except-self with cache-aware SIMD layout.", [](std::size_t size_bytes, std::size_t) { return make_product_except_self_benchmark(size_bytes, ProductExceptSelfMode::kSimdSoa); }}); specs.push_back({"dsa_scalar_rotated_search_aos", "Rotated sorted-array lookup on wide AoS records.", [](std::size_t size_bytes, std::size_t) { return make_rotated_search_benchmark(size_bytes, RotatedSearchMode::kScalarAos); }}); specs.push_back({"dsa_scalar_rotated_search_soa", "Rotated sorted-array lookup on cache-aware SoA values.", [](std::size_t size_bytes, std::size_t) { return make_rotated_search_benchmark(size_bytes, RotatedSearchMode::kScalarSoa); }}); specs.push_back({"dsa_simd_rotated_search_aos", "Rotated sorted-array lookup with SIMD gathers over AoS records.", [](std::size_t size_bytes, std::size_t) { return make_rotated_search_benchmark(size_bytes, RotatedSearchMode::kSimdAos); }}); specs.push_back({"dsa_simd_rotated_search_soa", "Rotated sorted-array lookup with cache-aware SIMD layout.", [](std::size_t size_bytes, std::size_t) { return make_rotated_search_benchmark(size_bytes, RotatedSearchMode::kSimdSoa); }}); specs.push_back({"dsa_scalar_sort012_aos", "Dutch-flag sort for 0/1/2 values stored in wide AoS records.", [](std::size_t size_bytes, std::size_t) { return make_sort012_benchmark(size_bytes, Sort012Mode::kScalarAos); }}); specs.push_back({"dsa_scalar_sort012_soa", "Counting sort for 0/1/2 values stored contiguously.", [](std::size_t size_bytes, std::size_t) { return make_sort012_benchmark(size_bytes, Sort012Mode::kScalarSoa); }}); specs.push_back({"dsa_simd_sort012_aos", "SIMD counting sort for 0/1/2 values gathered from AoS records.", [](std::size_t size_bytes, std::size_t) { return make_sort012_benchmark(size_bytes, Sort012Mode::kSimdAos); }}); specs.push_back({"dsa_simd_sort012_soa", "SIMD counting sort for contiguous 0/1/2 values.", [](std::size_t size_bytes, std::size_t) { return make_sort012_benchmark(size_bytes, Sort012Mode::kSimdSoa); }}); specs.push_back({"dsa_scalar_bfs_adjlist", "Breadth-first search over adjacency-list edge records.", [](std::size_t size_bytes, std::size_t) { return make_bfs_benchmark(size_bytes, BfsMode::kScalarAdjList); }}); specs.push_back({"dsa_scalar_bfs_csr", "Breadth-first search over a cache-aware CSR layout.", [](std::size_t size_bytes, std::size_t) { return make_bfs_benchmark(size_bytes, BfsMode::kScalarCsr); }}); specs.push_back({"dsa_simd_bfs_adjlist", "Breadth-first search with SIMD-assisted adjacency-list neighbor tests.", [](std::size_t size_bytes, std::size_t) { return make_bfs_benchmark(size_bytes, BfsMode::kSimdAdjList); }}); specs.push_back({"dsa_simd_bfs_csr", "Breadth-first search with SIMD-assisted CSR neighbor tests.", [](std::size_t size_bytes, std::size_t) { return make_bfs_benchmark(size_bytes, BfsMode::kSimdCsr); }}); specs.push_back({"dsa_scalar_transpose_matrix", "Naive transpose of a row-major matrix.", [](std::size_t size_bytes, std::size_t) { return make_transpose_benchmark(size_bytes, TransposeMode::kScalarNaive); }}); specs.push_back({"dsa_scalar_blocked_transpose_matrix", "Blocked transpose of a row-major matrix.", [](std::size_t size_bytes, std::size_t) { return make_transpose_benchmark(size_bytes, TransposeMode::kScalarBlocked); }}); specs.push_back({"dsa_simd_transpose_matrix", "SIMD tiled transpose of a row-major matrix.", [](std::size_t size_bytes, std::size_t) { return make_transpose_benchmark(size_bytes, TransposeMode::kSimdNaive); }}); specs.push_back({"dsa_simd_blocked_transpose_matrix", "Blocked SIMD transpose of a row-major matrix.", [](std::size_t size_bytes, std::size_t) { return make_transpose_benchmark(size_bytes, TransposeMode::kSimdBlocked); }}); } } // namespace cachebench