// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include #include #include #include "benchmark/benchmark.h" #include "arrow/memory_pool.h" #include "arrow/type_fwd.h" #include "arrow/util/cpu_info.h" #include "arrow/util/logging.h" // IWYU pragma: keep namespace arrow { // Benchmark changed its parameter type between releases from // int to int64_t. As it doesn't have version macros, we need // to apply C++ template magic. template struct BenchmarkArgsType; // Pattern matching that extracts the vector element type of Benchmark::Args() template struct BenchmarkArgsType&)> { using type = Values; }; using ArgsType = typename BenchmarkArgsType::type; using internal::CpuInfo; static const CpuInfo* cpu_info = CpuInfo::GetInstance(); static const int64_t kL1Size = cpu_info->CacheSize(CpuInfo::CacheLevel::L1); static const int64_t kL2Size = cpu_info->CacheSize(CpuInfo::CacheLevel::L2); static const int64_t kL3Size = cpu_info->CacheSize(CpuInfo::CacheLevel::L3); static const int64_t kCantFitInL3Size = kL3Size * 4; static const std::vector kMemorySizes = {kL1Size, kL2Size, kL3Size, kCantFitInL3Size}; // 0 is treated as "no nulls" static const std::vector kInverseNullProportions = {10000, 100, 10, 2, 1, 0}; struct GenericItemsArgs { // number of items processed per iteration const int64_t size; // proportion of nulls in generated arrays double null_proportion; explicit GenericItemsArgs(benchmark::State& state) : size(state.range(0)), state_(state) { if (state.range(1) == 0) { this->null_proportion = 0.0; } else { this->null_proportion = std::min(1., 1. / static_cast(state.range(1))); } } ~GenericItemsArgs() { state_.counters["size"] = static_cast(size); state_.counters["null_percent"] = null_proportion * 100; state_.SetItemsProcessed(state_.iterations() * size); } private: benchmark::State& state_; }; void BenchmarkSetArgsWithSizes(benchmark::internal::Benchmark* bench, const std::vector& sizes = kMemorySizes) { bench->Unit(benchmark::kMicrosecond); for (const auto size : sizes) { for (const auto inverse_null_proportion : kInverseNullProportions) { bench->Args({static_cast(size), inverse_null_proportion}); } } } void BenchmarkSetArgs(benchmark::internal::Benchmark* bench) { BenchmarkSetArgsWithSizes(bench, kMemorySizes); } void RegressionSetArgs(benchmark::internal::Benchmark* bench) { // Regression do not need to account for cache hierarchy, thus optimize for // the best case. BenchmarkSetArgsWithSizes(bench, {kL1Size}); } // RAII struct to handle some of the boilerplate in regression benchmarks struct RegressionArgs { // size of memory tested (per iteration) in bytes int64_t size; // proportion of nulls in generated arrays double null_proportion; // If size_is_bytes is true, then it's a number of bytes, otherwise it's the // number of items processed (for reporting) explicit RegressionArgs(benchmark::State& state, bool size_is_bytes = true) : size(state.range(0)), state_(state), size_is_bytes_(size_is_bytes) { if (state.range(1) == 0) { this->null_proportion = 0.0; } else { this->null_proportion = std::min(1., 1. / static_cast(state.range(1))); } } ~RegressionArgs() { state_.counters["size"] = static_cast(size); state_.counters["null_percent"] = null_proportion * 100; if (size_is_bytes_) { state_.SetBytesProcessed(state_.iterations() * size); } else { state_.SetItemsProcessed(state_.iterations() * size); } } private: benchmark::State& state_; bool size_is_bytes_; }; class MemoryPoolMemoryManager : public benchmark::MemoryManager { void Start() override { memory_pool = std::make_shared(default_memory_pool()); MemoryPool* default_pool = default_memory_pool(); global_allocations_start = default_pool->num_allocations(); } // BENCHMARK_DONT_OPTIMIZE is used here to detect Google Benchmark // 1.8.0. We can remove this Stop(Result*) when we require Google // Benchmark 1.8.0 or later. #ifndef BENCHMARK_DONT_OPTIMIZE void Stop(Result* result) override { Stop(*result); } #endif void Stop(benchmark::MemoryManager::Result& result) override { // If num_allocations is still zero, we assume that the memory pool wasn't passed down // so we should record them. MemoryPool* default_pool = default_memory_pool(); int64_t new_default_allocations = default_pool->num_allocations() - global_allocations_start; // Only record metrics if (1) there were allocations and (2) we // recorded at least one. if (new_default_allocations > 0 && memory_pool->num_allocations() > 0) { if (new_default_allocations > memory_pool->num_allocations()) { // If we missed some, let's report that. int64_t missed_allocations = new_default_allocations - memory_pool->num_allocations(); ARROW_LOG(WARNING) << "BenchmarkMemoryTracker recorded some allocations " << "for a benchmark, but missed " << missed_allocations << " allocations.\n"; } result.max_bytes_used = memory_pool->max_memory(); result.total_allocated_bytes = memory_pool->total_bytes_allocated(); result.num_allocs = memory_pool->num_allocations(); } } public: std::shared_ptr<::arrow::ProxyMemoryPool> memory_pool; protected: int64_t global_allocations_start; }; /// \brief Track memory pool allocations in benchmarks. /// /// Instantiate as a global variable to register the hooks into Google Benchmark /// to collect memory metrics. Before each benchmark, a new ProxyMemoryPool is /// created. It can then be accessed with memory_pool(). Once the benchmark is /// complete, the hook will record the maximum memory used, the total bytes /// allocated, and the total number of allocations. If no allocations were seen, /// (for example, if you forgot to pass down the memory pool), then these metrics /// will not be saved. /// /// Since this is used as one global variable, this will not work if multiple /// benchmarks are run concurrently or for multi-threaded benchmarks (ones /// that use `->ThreadRange(...)`). class BenchmarkMemoryTracker { public: BenchmarkMemoryTracker() : manager_() { ::benchmark::RegisterMemoryManager(&manager_); } ::arrow::MemoryPool* memory_pool() const { return manager_.memory_pool.get(); } protected: ::arrow::MemoryPoolMemoryManager manager_; }; } // namespace arrow