# DSA Full Code Appendix This appendix contains the measured core functions. The benchmark harness also includes dataset generation, warmup, checksums, CSV output, and CLI plumbing. ## Shared DSA support types ```cpp template struct WideRecord { T value{}; std::array cold{}; }; static_assert(sizeof(WideRecord) == 64, "Wide float record should occupy one cache line"); static_assert(sizeof(WideRecord) == 64, "Wide uint32 record should occupy one cache line"); template struct SequenceDataset { std::size_t length = 0; std::size_t total_values = 0; std::size_t logical_size_bytes = 0; std::shared_ptr>> aos; std::shared_ptr> lane_major_hot; std::shared_ptr> lane_major_cold; std::shared_ptr> interleaved_hot; std::shared_ptr> interleaved_cold; }; enum class StockMode { kScalarAos, kScalarSoa, kSimdAos, kSimdSoa, }; enum class MaxSubarrayMode { kScalarAos, kScalarSoa, kSimdAos, kSimdSoa, }; enum class TrappingRainMode { kScalarAos, kScalarSoa, kSimdAos, kSimdSoa, }; enum class SlidingWindowMode { kScalarAos, kScalarSoa, kSimdAos, kSimdSoa, }; enum class ProductExceptSelfMode { kScalarAos, kScalarSoa, kSimdAos, kSimdSoa, }; enum class TransposeMode { kScalarNaive, kScalarBlocked, kSimdNaive, kSimdBlocked, }; enum class RotatedSearchMode { kScalarAos, kScalarSoa, kSimdAos, kSimdSoa, }; enum class Sort012Mode { kScalarAos, kScalarSoa, kSimdAos, kSimdSoa, }; enum class BfsMode { kScalarAdjList, kScalarCsr, kSimdAdjList, kSimdCsr, }; struct RotatedSearchDataset { std::size_t length = 0; std::size_t query_count = 0; std::size_t logical_size_bytes = 0; std::shared_ptr>> aos; std::shared_ptr> lane_major_hot; std::shared_ptr> interleaved_hot; std::shared_ptr> lane_major_queries; std::shared_ptr> interleaved_queries; }; struct Sort012Record { std::uint32_t value = 0; std::array cold{}; }; static_assert(sizeof(Sort012Record) == 64, "Sort012Record should occupy one cache line"); struct Sort012Dataset { std::size_t length = 0; std::size_t logical_size_bytes = 0; std::shared_ptr> input_aos; std::shared_ptr> input_soa; }; struct GraphEdgeRecord { std::uint32_t dst = 0; std::uint32_t cold0 = 0; std::uint32_t cold1 = 0; std::uint32_t cold2 = 0; }; static_assert(sizeof(GraphEdgeRecord) == 16, "GraphEdgeRecord should stay compact"); struct GraphAdjList { std::vector> edges; }; struct GraphCsr { std::vector offsets; std::vector edges; }; ``` ## Stock profit ```cpp CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_stock_profit_aos(const WideRecord* aos, std::size_t length) { float total = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { float min_price = std::numeric_limits::infinity(); float best = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 0; index < length; ++index) { const float price = aos[lane * length + index].value; min_price = std::min(min_price, price); best = std::max(best, price - min_price); } total += best; } return total; } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_stock_profit_soa(const float* hot, std::size_t length) { float total = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { const float* values = hot + lane * length; float min_price = std::numeric_limits::infinity(); float best = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 0; index < length; ++index) { min_price = std::min(min_price, values[index]); best = std::max(best, values[index] - min_price); } total += best; } return total; } CACHEBENCH_NOINLINE float simd_stock_profit_aos(const WideRecord* aos, std::size_t length) { __m256 min_values = _mm256_set1_ps(std::numeric_limits::infinity()); __m256 best_values = _mm256_setzero_ps(); for (std::size_t index = 0; index < length; ++index) { const __m256 prices = gather_hot_values(aos, length, index); min_values = _mm256_min_ps(min_values, prices); best_values = _mm256_max_ps(best_values, _mm256_sub_ps(prices, min_values)); } return horizontal_sum_ps(best_values); } CACHEBENCH_NOINLINE float simd_stock_profit_soa(const float* hot, std::size_t length) { __m256 min_values = _mm256_set1_ps(std::numeric_limits::infinity()); __m256 best_values = _mm256_setzero_ps(); for (std::size_t index = 0; index < length; ++index) { const __m256 prices = _mm256_load_ps(hot + index * kBatchLanes); min_values = _mm256_min_ps(min_values, prices); best_values = _mm256_max_ps(best_values, _mm256_sub_ps(prices, min_values)); } return horizontal_sum_ps(best_values); } ``` ## Maximum subarray ```cpp CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_max_subarray_aos(const WideRecord* aos, std::size_t length) { float total = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { float current = aos[lane * length].value; float best = current; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 1; index < length; ++index) { const float value = aos[lane * length + index].value; current = std::max(value, current + value); best = std::max(best, current); } total += best; } return total; } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_max_subarray_soa(const float* hot, std::size_t length) { float total = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { const float* values = hot + lane * length; float current = values[0]; float best = current; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 1; index < length; ++index) { current = std::max(values[index], current + values[index]); best = std::max(best, current); } total += best; } return total; } CACHEBENCH_NOINLINE float simd_max_subarray_aos(const WideRecord* aos, std::size_t length) { __m256 current = gather_hot_values(aos, length, 0); __m256 best = current; for (std::size_t index = 1; index < length; ++index) { const __m256 values = gather_hot_values(aos, length, index); current = _mm256_max_ps(values, _mm256_add_ps(current, values)); best = _mm256_max_ps(best, current); } return horizontal_sum_ps(best); } CACHEBENCH_NOINLINE float simd_max_subarray_soa(const float* hot, std::size_t length) { __m256 current = _mm256_load_ps(hot); __m256 best = current; for (std::size_t index = 1; index < length; ++index) { const __m256 values = _mm256_load_ps(hot + index * kBatchLanes); current = _mm256_max_ps(values, _mm256_add_ps(current, values)); best = _mm256_max_ps(best, current); } return horizontal_sum_ps(best); } ``` ## Trapping rain water ```cpp CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_trapping_rain_aos(const WideRecord* aos, std::size_t length) { std::vector left(length); float total = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { const WideRecord* values = aos + lane * length; float left_max = values[0].value; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 0; index < length; ++index) { left_max = std::max(left_max, values[index].value); left[index] = left_max; } float right_max = values[length - 1].value; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = length; index-- > 0;) { right_max = std::max(right_max, values[index].value); total += std::max(0.0f, std::min(left[index], right_max) - values[index].value); } } return total; } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_trapping_rain_soa(const float* hot, std::size_t length) { std::vector left(length); float total = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { const float* values = hot + lane * length; float left_max = values[0]; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 0; index < length; ++index) { left_max = std::max(left_max, values[index]); left[index] = left_max; } float right_max = values[length - 1]; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = length; index-- > 0;) { right_max = std::max(right_max, values[index]); total += std::max(0.0f, std::min(left[index], right_max) - values[index]); } } return total; } CACHEBENCH_NOINLINE float simd_trapping_rain_aos(const WideRecord* aos, std::size_t length, float* left_temp) { __m256 left_max = _mm256_set1_ps(std::numeric_limits::lowest()); for (std::size_t index = 0; index < length; ++index) { left_max = _mm256_max_ps(left_max, gather_hot_values(aos, length, index)); _mm256_store_ps(left_temp + index * kBatchLanes, left_max); } __m256 right_max = _mm256_set1_ps(std::numeric_limits::lowest()); __m256 total = _mm256_setzero_ps(); for (std::size_t index = length; index-- > 0;) { const __m256 values = gather_hot_values(aos, length, index); right_max = _mm256_max_ps(right_max, values); const __m256 left_values = _mm256_load_ps(left_temp + index * kBatchLanes); const __m256 water = _mm256_sub_ps(_mm256_min_ps(left_values, right_max), values); total = _mm256_add_ps(total, _mm256_max_ps(water, _mm256_setzero_ps())); } return horizontal_sum_ps(total); } CACHEBENCH_NOINLINE float simd_trapping_rain_soa(const float* hot, std::size_t length, float* left_temp) { __m256 left_max = _mm256_set1_ps(std::numeric_limits::lowest()); for (std::size_t index = 0; index < length; ++index) { const __m256 values = _mm256_load_ps(hot + index * kBatchLanes); left_max = _mm256_max_ps(left_max, values); _mm256_store_ps(left_temp + index * kBatchLanes, left_max); } __m256 right_max = _mm256_set1_ps(std::numeric_limits::lowest()); __m256 total = _mm256_setzero_ps(); for (std::size_t index = length; index-- > 0;) { const __m256 values = _mm256_load_ps(hot + index * kBatchLanes); right_max = _mm256_max_ps(right_max, values); const __m256 left_values = _mm256_load_ps(left_temp + index * kBatchLanes); const __m256 water = _mm256_sub_ps(_mm256_min_ps(left_values, right_max), values); total = _mm256_add_ps(total, _mm256_max_ps(water, _mm256_setzero_ps())); } return horizontal_sum_ps(total); } ``` ## Sliding-window maximum ```cpp CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_sliding_max_aos(const WideRecord* aos, std::size_t length) { float total = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { const WideRecord* values = aos + lane * length; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t start = 0; start + kWindowWidth <= length; ++start) { float window_max = values[start].value; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t offset = 1; offset < kWindowWidth; ++offset) { window_max = std::max(window_max, values[start + offset].value); } total += window_max; } } return total; } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR float scalar_sliding_max_soa(const float* hot, std::size_t length) { float total = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { const float* values = hot + lane * length; std::deque deque; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 0; index < length; ++index) { while (!deque.empty() && deque.front() + kWindowWidth <= index) { deque.pop_front(); } while (!deque.empty() && values[deque.back()] <= values[index]) { deque.pop_back(); } deque.push_back(index); if (index + 1 >= kWindowWidth) { total += values[deque.front()]; } } } return total; } CACHEBENCH_NOINLINE float simd_sliding_max_aos(const WideRecord* aos, std::size_t length) { __m256 total = _mm256_setzero_ps(); for (std::size_t start = 0; start + kWindowWidth <= length; ++start) { __m256 window_max = gather_hot_values(aos, length, start); for (std::size_t offset = 1; offset < kWindowWidth; ++offset) { window_max = _mm256_max_ps(window_max, gather_hot_values(aos, length, start + offset)); } total = _mm256_add_ps(total, window_max); } return horizontal_sum_ps(total); } CACHEBENCH_NOINLINE float simd_sliding_max_soa(const float* hot, std::size_t length, float* left_temp, float* right_temp) { __m256 block_left = _mm256_setzero_ps(); for (std::size_t index = 0; index < length; ++index) { const __m256 values = _mm256_load_ps(hot + index * kBatchLanes); if (index % kWindowWidth == 0) { block_left = values; } else { block_left = _mm256_max_ps(block_left, values); } _mm256_store_ps(left_temp + index * kBatchLanes, block_left); } __m256 block_right = _mm256_setzero_ps(); for (std::size_t reverse = length; reverse-- > 0;) { const __m256 values = _mm256_load_ps(hot + reverse * kBatchLanes); if (reverse == length - 1 || ((reverse + 1) % kWindowWidth == 0)) { block_right = values; } else { block_right = _mm256_max_ps(block_right, values); } _mm256_store_ps(right_temp + reverse * kBatchLanes, block_right); } __m256 total = _mm256_setzero_ps(); for (std::size_t start = 0; start + kWindowWidth <= length; ++start) { const __m256 left_value = _mm256_load_ps(left_temp + (start + kWindowWidth - 1) * kBatchLanes); const __m256 right_value = _mm256_load_ps(right_temp + start * kBatchLanes); total = _mm256_add_ps(total, _mm256_max_ps(left_value, right_value)); } return horizontal_sum_ps(total); } ``` ## Product except self ```cpp CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scalar_product_except_self_aos( const WideRecord* aos, std::size_t length) { std::vector work(length); std::uint64_t total = 0; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { const WideRecord* values = aos + lane * length; std::uint32_t prefix = 1U; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 0; index < length; ++index) { work[index] = prefix; prefix *= values[index].value; } std::uint32_t suffix = 1U; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = length; index-- > 0;) { work[index] *= suffix; suffix *= values[index].value; } total += sample_checksum(work.data(), length); } return total; } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t scalar_product_except_self_soa(const std::uint32_t* hot, std::size_t length) { std::vector work(length); std::uint64_t total = 0; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { const std::uint32_t* values = hot + lane * length; std::uint32_t prefix = 1U; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 0; index < length; ++index) { work[index] = prefix; prefix *= values[index]; } std::uint32_t suffix = 1U; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = length; index-- > 0;) { work[index] *= suffix; suffix *= values[index]; } total += sample_checksum(work.data(), length); } return total; } CACHEBENCH_NOINLINE std::uint64_t simd_product_except_self_aos(const WideRecord* aos, std::size_t length, std::uint32_t* temp) { __m256i prefix = _mm256_set1_epi32(1); for (std::size_t index = 0; index < length; ++index) { _mm256_store_si256(reinterpret_cast<__m256i*>(temp + index * kBatchLanes), prefix); prefix = _mm256_mullo_epi32(prefix, gather_hot_u32_values(aos, length, index)); } __m256i suffix = _mm256_set1_epi32(1); for (std::size_t index = length; index-- > 0;) { const __m256i prefix_values = _mm256_load_si256(reinterpret_cast(temp + index * kBatchLanes)); const __m256i output = _mm256_mullo_epi32(prefix_values, suffix); _mm256_store_si256(reinterpret_cast<__m256i*>(temp + index * kBatchLanes), output); suffix = _mm256_mullo_epi32(suffix, gather_hot_u32_values(aos, length, index)); } return sample_checksum(temp, length * kBatchLanes); } CACHEBENCH_NOINLINE std::uint64_t simd_product_except_self_soa(const std::uint32_t* hot, std::size_t length, std::uint32_t* temp) { __m256i prefix = _mm256_set1_epi32(1); for (std::size_t index = 0; index < length; ++index) { _mm256_store_si256(reinterpret_cast<__m256i*>(temp + index * kBatchLanes), prefix); prefix = _mm256_mullo_epi32(prefix, _mm256_load_si256(reinterpret_cast(hot + index * kBatchLanes))); } __m256i suffix = _mm256_set1_epi32(1); for (std::size_t index = length; index-- > 0;) { const __m256i prefix_values = _mm256_load_si256(reinterpret_cast(temp + index * kBatchLanes)); const __m256i output = _mm256_mullo_epi32(prefix_values, suffix); _mm256_store_si256(reinterpret_cast<__m256i*>(temp + index * kBatchLanes), output); suffix = _mm256_mullo_epi32(suffix, _mm256_load_si256(reinterpret_cast(hot + index * kBatchLanes))); } return sample_checksum(temp, length * kBatchLanes); } ``` ## Rotated search ```cpp CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR int scalar_rotated_search_aos_value( const WideRecord* values, std::size_t length, std::uint32_t target) { int left = 0; int right = static_cast(length) - 1; while (left <= right) { const int mid = left + (right - left) / 2; const std::uint32_t left_value = values[left].value; const std::uint32_t mid_value = values[mid].value; const std::uint32_t right_value = values[right].value; if (mid_value == target) { return mid; } if (left_value <= mid_value) { if (left_value <= target && target < mid_value) { right = mid - 1; } else { left = mid + 1; } } else { if (mid_value < target && target <= right_value) { left = mid + 1; } else { right = mid - 1; } } } return -1; } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR int scalar_rotated_search_soa_value( const std::uint32_t* values, std::size_t length, std::uint32_t target) { int left = 0; int right = static_cast(length) - 1; while (left <= right) { const int mid = left + (right - left) / 2; const std::uint32_t left_value = values[left]; const std::uint32_t mid_value = values[mid]; const std::uint32_t right_value = values[right]; if (mid_value == target) { return mid; } if (left_value <= mid_value) { if (left_value <= target && target < mid_value) { right = mid - 1; } else { left = mid + 1; } } else { if (mid_value < target && target <= right_value) { left = mid + 1; } else { right = mid - 1; } } } return -1; } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR int scalar_rotated_search_aos_value( const WideRecord* values, std::size_t length, std::uint32_t target) { int left = 0; int right = static_cast(length) - 1; while (left <= right) { const int mid = left + (right - left) / 2; const std::uint32_t left_value = values[left].value; const std::uint32_t mid_value = values[mid].value; const std::uint32_t right_value = values[right].value; if (mid_value == target) { return mid; } if (left_value <= mid_value) { if (left_value <= target && target < mid_value) { right = mid - 1; } else { left = mid + 1; } } else { if (mid_value < target && target <= right_value) { left = mid + 1; } else { right = mid - 1; } } } return -1; } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR int scalar_rotated_search_soa_value( const std::uint32_t* values, std::size_t length, std::uint32_t target) { int left = 0; int right = static_cast(length) - 1; while (left <= right) { const int mid = left + (right - left) / 2; const std::uint32_t left_value = values[left]; const std::uint32_t mid_value = values[mid]; const std::uint32_t right_value = values[right]; if (mid_value == target) { return mid; } if (left_value <= mid_value) { if (left_value <= target && target < mid_value) { right = mid - 1; } else { left = mid + 1; } } else { if (mid_value < target && target <= right_value) { left = mid + 1; } else { right = mid - 1; } } } return -1; } CACHEBENCH_NOINLINE std::uint64_t simd_rotated_search_aos(const WideRecord* aos, const std::uint32_t* interleaved_queries, std::size_t length, std::size_t query_count) { return simd_rotated_search_common( interleaved_queries, length, query_count, [aos, length](__m256i indices) { return gather_rotated_aos_values(aos, length, indices); }); } CACHEBENCH_NOINLINE std::uint64_t simd_rotated_search_soa(const std::uint32_t* interleaved_hot, const std::uint32_t* interleaved_queries, std::size_t length, std::size_t query_count) { return simd_rotated_search_common( interleaved_queries, length, query_count, [interleaved_hot](__m256i indices) { return gather_rotated_soa_values(interleaved_hot, indices); }); } ``` ## Sort 0/1/2 ```cpp CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void sort012_scalar_aos(Sort012Record* values, std::size_t length) { std::size_t low = 0; std::size_t mid = 0; std::size_t high = length - 1; while (mid <= high) { if (values[mid].value == 0U) { std::swap(values[low++], values[mid++]); } else if (values[mid].value == 1U) { ++mid; } else { std::swap(values[mid], values[high--]); } } } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t sort012_scalar_soa(const std::uint32_t* input, std::uint32_t* output, std::size_t length) { std::array counts{}; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t index = 0; index < length; ++index) { ++counts[input[index]]; } std::fill_n(output, counts[0], 0U); std::fill_n(output + counts[0], counts[1], 1U); std::fill_n(output + counts[0] + counts[1], counts[2], 2U); return sample_checksum(output, length); } CACHEBENCH_NOINLINE std::uint64_t sort012_simd_aos(const Sort012Record* input, std::uint32_t* output, std::size_t length) { constexpr std::int32_t kStrideWords = static_cast(sizeof(Sort012Record) / sizeof(std::uint32_t)); const int* base = reinterpret_cast(input); const __m256i zero = _mm256_setzero_si256(); const __m256i one = _mm256_set1_epi32(1); const __m256i two = _mm256_set1_epi32(2); std::array counts{}; std::size_t index = 0; for (; index + kBatchLanes <= length; index += kBatchLanes) { const __m256i gather_indices = _mm256_setr_epi32(static_cast((index + 0) * kStrideWords), static_cast((index + 1) * kStrideWords), static_cast((index + 2) * kStrideWords), static_cast((index + 3) * kStrideWords), static_cast((index + 4) * kStrideWords), static_cast((index + 5) * kStrideWords), static_cast((index + 6) * kStrideWords), static_cast((index + 7) * kStrideWords)); const __m256i values = _mm256_i32gather_epi32(base, gather_indices, sizeof(std::uint32_t)); counts[0] += static_cast(std::popcount(static_cast(mask_bits(_mm256_cmpeq_epi32(values, zero))))); counts[1] += static_cast(std::popcount(static_cast(mask_bits(_mm256_cmpeq_epi32(values, one))))); counts[2] += static_cast(std::popcount(static_cast(mask_bits(_mm256_cmpeq_epi32(values, two))))); } for (; index < length; ++index) { ++counts[input[index].value]; } std::fill_n(output, counts[0], 0U); std::fill_n(output + counts[0], counts[1], 1U); std::fill_n(output + counts[0] + counts[1], counts[2], 2U); return sample_checksum(output, length); } CACHEBENCH_NOINLINE std::uint64_t sort012_simd_soa(const std::uint32_t* input, std::uint32_t* output, std::size_t length) { const __m256i zero = _mm256_setzero_si256(); const __m256i one = _mm256_set1_epi32(1); const __m256i two = _mm256_set1_epi32(2); std::array counts{}; std::size_t index = 0; for (; index + kBatchLanes <= length; index += kBatchLanes) { const __m256i values = _mm256_load_si256(reinterpret_cast(input + index)); counts[0] += static_cast(std::popcount(static_cast(mask_bits(_mm256_cmpeq_epi32(values, zero))))); counts[1] += static_cast(std::popcount(static_cast(mask_bits(_mm256_cmpeq_epi32(values, one))))); counts[2] += static_cast(std::popcount(static_cast(mask_bits(_mm256_cmpeq_epi32(values, two))))); } for (; index < length; ++index) { ++counts[input[index]]; } std::fill_n(output, counts[0], 0U); std::fill_n(output + counts[0], counts[1], 1U); std::fill_n(output + counts[0] + counts[1], counts[2], 2U); return sample_checksum(output, length); } ``` ## BFS ```cpp CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t bfs_scalar_adjlist(const GraphAdjList& graph, std::size_t passes) { const std::size_t vertices = graph.edges.size(); std::vector queue(vertices); std::vector visited(vertices, 0); std::uint64_t total = 0; for (std::size_t pass = 0; pass < passes; ++pass) { std::fill(visited.begin(), visited.end(), 0); const std::uint32_t source = static_cast((pass * 17) % vertices); std::size_t head = 0; std::size_t tail = 0; visited[source] = 1; queue[tail++] = source; while (head < tail) { const std::uint32_t node = queue[head++]; total += node; for (const GraphEdgeRecord& edge : graph.edges[node]) { if (visited[edge.dst] == 0) { visited[edge.dst] = 1; queue[tail++] = edge.dst; } } } total += tail; } return total; } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR std::uint64_t bfs_scalar_csr(const GraphCsr& graph, std::size_t passes) { const std::size_t vertices = graph.offsets.size() - 1; std::vector queue(vertices); std::vector visited(vertices, 0); std::uint64_t total = 0; for (std::size_t pass = 0; pass < passes; ++pass) { std::fill(visited.begin(), visited.end(), 0); const std::uint32_t source = static_cast((pass * 17) % vertices); std::size_t head = 0; std::size_t tail = 0; visited[source] = 1; queue[tail++] = source; while (head < tail) { const std::uint32_t node = queue[head++]; total += node; for (std::uint32_t edge = graph.offsets[node]; edge < graph.offsets[node + 1]; ++edge) { const std::uint32_t next = graph.edges[edge]; if (visited[next] == 0) { visited[next] = 1; queue[tail++] = next; } } } total += tail; } return total; } CACHEBENCH_NOINLINE std::uint64_t bfs_simd_adjlist(const GraphAdjList& graph, std::size_t passes) { const std::size_t vertices = graph.edges.size(); std::vector queue(vertices); std::vector visited(vertices, 0); std::uint64_t total = 0; alignas(32) std::array dst_lanes{}; constexpr std::int32_t kStrideWords = static_cast(sizeof(GraphEdgeRecord) / sizeof(std::uint32_t)); const __m256i zero = _mm256_setzero_si256(); for (std::size_t pass = 0; pass < passes; ++pass) { std::fill(visited.begin(), visited.end(), 0); const std::uint32_t source = static_cast((pass * 17) % vertices); std::size_t head = 0; std::size_t tail = 0; visited[source] = 1; queue[tail++] = source; while (head < tail) { const std::uint32_t node = queue[head++]; total += node; const auto& edges = graph.edges[node]; const int* base = reinterpret_cast(edges.data()); std::size_t edge = 0; for (; edge + kBatchLanes <= edges.size(); edge += kBatchLanes) { const __m256i edge_indices = _mm256_setr_epi32(static_cast((edge + 0) * kStrideWords), static_cast((edge + 1) * kStrideWords), static_cast((edge + 2) * kStrideWords), static_cast((edge + 3) * kStrideWords), static_cast((edge + 4) * kStrideWords), static_cast((edge + 5) * kStrideWords), static_cast((edge + 6) * kStrideWords), static_cast((edge + 7) * kStrideWords)); const __m256i next_nodes = _mm256_i32gather_epi32(base, edge_indices, sizeof(std::uint32_t)); const __m256i seen = _mm256_i32gather_epi32(visited.data(), next_nodes, static_cast(sizeof(int))); _mm256_store_si256(reinterpret_cast<__m256i*>(dst_lanes.data()), next_nodes); const int unseen_mask = mask_bits(_mm256_cmpeq_epi32(seen, zero)); for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { if (((unseen_mask >> lane) & 1) != 0 && visited[dst_lanes[lane]] == 0) { visited[dst_lanes[lane]] = 1; queue[tail++] = dst_lanes[lane]; } } } for (; edge < edges.size(); ++edge) { const std::uint32_t next = edges[edge].dst; if (visited[next] == 0) { visited[next] = 1; queue[tail++] = next; } } } total += tail; } return total; } CACHEBENCH_NOINLINE std::uint64_t bfs_simd_csr(const GraphCsr& graph, std::size_t passes) { const std::size_t vertices = graph.offsets.size() - 1; std::vector queue(vertices); std::vector visited(vertices, 0); std::uint64_t total = 0; alignas(32) std::array dst_lanes{}; const __m256i zero = _mm256_setzero_si256(); for (std::size_t pass = 0; pass < passes; ++pass) { std::fill(visited.begin(), visited.end(), 0); const std::uint32_t source = static_cast((pass * 17) % vertices); std::size_t head = 0; std::size_t tail = 0; visited[source] = 1; queue[tail++] = source; while (head < tail) { const std::uint32_t node = queue[head++]; total += node; std::uint32_t edge = graph.offsets[node]; const std::uint32_t edge_limit = graph.offsets[node + 1]; for (; edge + kBatchLanes <= edge_limit; edge += static_cast(kBatchLanes)) { const __m256i next_nodes = _mm256_loadu_si256(reinterpret_cast(graph.edges.data() + edge)); const __m256i seen = _mm256_i32gather_epi32(visited.data(), next_nodes, static_cast(sizeof(int))); _mm256_store_si256(reinterpret_cast<__m256i*>(dst_lanes.data()), next_nodes); const int unseen_mask = mask_bits(_mm256_cmpeq_epi32(seen, zero)); for (std::size_t lane = 0; lane < kBatchLanes; ++lane) { if (((unseen_mask >> lane) & 1) != 0 && visited[dst_lanes[lane]] == 0) { visited[dst_lanes[lane]] = 1; queue[tail++] = dst_lanes[lane]; } } } for (; edge < edge_limit; ++edge) { const std::uint32_t next = graph.edges[edge]; if (visited[next] == 0) { visited[next] = 1; queue[tail++] = next; } } } total += tail; } return total; } ``` ## Transpose ```cpp CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void transpose_scalar_naive(const float* src, float* dst, std::size_t dim) { CACHEBENCH_NO_VECTOR_LOOP for (std::size_t row = 0; row < dim; ++row) { CACHEBENCH_NO_VECTOR_LOOP for (std::size_t col = 0; col < dim; ++col) { dst[col * dim + row] = src[row * dim + col]; } } } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void transpose_scalar_blocked(const float* src, float* dst, std::size_t dim) { CACHEBENCH_NO_VECTOR_LOOP for (std::size_t row_block = 0; row_block < dim; row_block += kTransposeBlock) { CACHEBENCH_NO_VECTOR_LOOP for (std::size_t col_block = 0; col_block < dim; col_block += kTransposeBlock) { const std::size_t row_limit = std::min(dim, row_block + kTransposeBlock); const std::size_t col_limit = std::min(dim, col_block + kTransposeBlock); CACHEBENCH_NO_VECTOR_LOOP for (std::size_t row = row_block; row < row_limit; ++row) { CACHEBENCH_NO_VECTOR_LOOP for (std::size_t col = col_block; col < col_limit; ++col) { dst[col * dim + row] = src[row * dim + col]; } } } } } CACHEBENCH_NOINLINE void transpose_simd_naive(const float* src, float* dst, std::size_t dim) { const std::size_t simd_dim = dim - (dim % 4); for (std::size_t row = 0; row < simd_dim; row += 4) { for (std::size_t col = 0; col < simd_dim; col += 4) { transpose4x4_simd(src + row * dim + col, dim, dst + col * dim + row, dim); } for (std::size_t col = simd_dim; col < dim; ++col) { for (std::size_t lane = 0; lane < 4 && row + lane < dim; ++lane) { dst[col * dim + (row + lane)] = src[(row + lane) * dim + col]; } } } for (std::size_t row = simd_dim; row < dim; ++row) { for (std::size_t col = 0; col < dim; ++col) { dst[col * dim + row] = src[row * dim + col]; } } } CACHEBENCH_NOINLINE void transpose_simd_blocked(const float* src, float* dst, std::size_t dim) { const std::size_t simd_dim = dim - (dim % 4); for (std::size_t row_block = 0; row_block < dim; row_block += kTransposeBlock) { for (std::size_t col_block = 0; col_block < dim; col_block += kTransposeBlock) { const std::size_t row_limit = std::min(dim, row_block + kTransposeBlock); const std::size_t col_limit = std::min(dim, col_block + kTransposeBlock); std::size_t row = row_block; for (; row + 4 <= row_limit && row < simd_dim; row += 4) { std::size_t col = col_block; for (; col + 4 <= col_limit && col < simd_dim; col += 4) { transpose4x4_simd(src + row * dim + col, dim, dst + col * dim + row, dim); } for (; col < col_limit; ++col) { for (std::size_t lane = 0; lane < 4; ++lane) { dst[col * dim + (row + lane)] = src[(row + lane) * dim + col]; } } } for (; row < row_limit; ++row) { for (std::size_t col = col_block; col < col_limit; ++col) { dst[col * dim + row] = src[row * dim + col]; } } } } } ``` ## Matrix multiply ```cpp CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void scalar_matmul_naive(const float* a, const float* b, float* c, std::size_t n) { std::fill_n(c, n * n, 0.0f); for (std::size_t i = 0; i < n; ++i) { for (std::size_t j = 0; j < n; ++j) { float sum = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t k = 0; k < n; ++k) { sum += a[i * n + k] * b[k * n + j]; } c[i * n + j] = sum; } } } void transpose_square_matrix(const float* src, float* dst, std::size_t n) { for (std::size_t i = 0; i < n; ++i) { for (std::size_t j = 0; j < n; ++j) { dst[j * n + i] = src[i * n + j]; } } } CACHEBENCH_NOINLINE CACHEBENCH_NOVECTOR void scalar_matmul_transposed(const float* a, const float* bt, float* c, std::size_t n) { std::fill_n(c, n * n, 0.0f); for (std::size_t i = 0; i < n; ++i) { const float* row_a = a + i * n; for (std::size_t j = 0; j < n; ++j) { const float* row_bt = bt + j * n; float sum = 0.0f; CACHEBENCH_NO_VECTOR_LOOP for (std::size_t k = 0; k < n; ++k) { sum += row_a[k] * row_bt[k]; } c[i * n + j] = sum; } } } CACHEBENCH_NOINLINE void avx2_matmul_gather(const float* a, const float* b, float* c, std::size_t n) { std::fill_n(c, n * n, 0.0f); for (std::size_t i = 0; i < n; ++i) { const float* row_a = a + i * n; for (std::size_t j = 0; j < n; ++j) { __m256 acc = _mm256_setzero_ps(); std::size_t k = 0; for (; k + kAvxWidthFloats <= n; k += kAvxWidthFloats) { const __m256 avec = _mm256_load_ps(row_a + k); const __m256i bindex = _mm256_setr_epi32(static_cast((k + 0) * n + j), static_cast((k + 1) * n + j), static_cast((k + 2) * n + j), static_cast((k + 3) * n + j), static_cast((k + 4) * n + j), static_cast((k + 5) * n + j), static_cast((k + 6) * n + j), static_cast((k + 7) * n + j)); const __m256 bcol = _mm256_i32gather_ps(b, bindex, sizeof(float)); acc = _mm256_add_ps(acc, _mm256_mul_ps(avec, bcol)); } float sum = horizontal_sum_ps(acc); for (; k < n; ++k) { sum += row_a[k] * b[k * n + j]; } c[i * n + j] = sum; } } } CACHEBENCH_NOINLINE void avx2_matmul_blocked(const float* a, const float* b, float* c, std::size_t n) { constexpr std::size_t kBlockI = 32; constexpr std::size_t kBlockJ = 64; constexpr std::size_t kBlockK = 64; std::fill_n(c, n * n, 0.0f); for (std::size_t ii = 0; ii < n; ii += kBlockI) { const std::size_t i_end = std::min(ii + kBlockI, n); for (std::size_t kk = 0; kk < n; kk += kBlockK) { const std::size_t k_end = std::min(kk + kBlockK, n); for (std::size_t jj = 0; jj < n; jj += kBlockJ) { const std::size_t j_end = std::min(jj + kBlockJ, n); for (std::size_t i = ii; i < i_end; ++i) { for (std::size_t k = kk; k < k_end; ++k) { const __m256 aval = _mm256_set1_ps(a[i * n + k]); std::size_t j = jj; for (; j + kAvxWidthFloats <= j_end; j += kAvxWidthFloats) { const __m256 cvec = _mm256_loadu_ps(c + i * n + j); const __m256 bvec = _mm256_loadu_ps(b + k * n + j); const __m256 updated = _mm256_add_ps(cvec, _mm256_mul_ps(aval, bvec)); _mm256_storeu_ps(c + i * n + j, updated); } for (; j < j_end; ++j) { c[i * n + j] += a[i * n + k] * b[k * n + j]; } } } } } } } ```