diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index b4e03954..cfa48b82 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,9 +1,27 @@ +include(FetchContent) + +FetchContent_Declare( + counters + GIT_REPOSITORY https://github.com/lemire/counters.git + GIT_TAG v2.0.0 +) + +FetchContent_MakeAvailable(counters) + add_executable(realbenchmark benchmark.cpp) +target_link_libraries(realbenchmark PRIVATE counters::counters) +add_executable(bench_ip bench_ip.cpp) +target_link_libraries(bench_ip PRIVATE counters::counters) + set_property( TARGET realbenchmark PROPERTY CXX_STANDARD 17) - +set_property( + TARGET bench_ip + PROPERTY CXX_STANDARD 17) target_link_libraries(realbenchmark PUBLIC fast_float) +target_link_libraries(bench_ip PUBLIC fast_float) + include(ExternalProject) # Define the external project diff --git a/benchmarks/apple_arm_events.h b/benchmarks/apple_arm_events.h deleted file mode 100644 index f127d14d..00000000 --- a/benchmarks/apple_arm_events.h +++ /dev/null @@ -1,1117 +0,0 @@ -// Original design from: -// ============================================================================= -// XNU kperf/kpc -// Available for 64-bit Intel/Apple Silicon, macOS/iOS, with root privileges -// -// References: -// -// XNU source (since xnu 2422.1.72): -// https://github.com/apple/darwin-xnu/blob/main/osfmk/kern/kpc.h -// https://github.com/apple/darwin-xnu/blob/main/bsd/kern/kern_kpc.c -// -// Lightweight PET (Profile Every Thread, since xnu 3789.1.32): -// https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/pet.c -// https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/kperf_kpc.c -// -// System Private frameworks (since macOS 10.11, iOS 8.0): -// /System/Library/PrivateFrameworks/kperf.framework -// /System/Library/PrivateFrameworks/kperfdata.framework -// -// Xcode framework (since Xcode 7.0): -// /Applications/Xcode.app/Contents/SharedFrameworks/DVTInstrumentsFoundation.framework -// -// CPU database (plist files) -// macOS (since macOS 10.11): -// /usr/share/kpep/.plist -// iOS (copied from Xcode, since iOS 10.0, Xcode 8.0): -// /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform -// /DeviceSupport//DeveloperDiskImage.dmg/usr/share/kpep/.plist -// -// -// Created by YaoYuan on 2021. -// Released into the public domain (unlicense.org). -// ============================================================================= - -#ifndef M1CYCLES_H -#define M1CYCLES_H - -#include -#include -#include -#include -#include - -#include // for dlopen() and dlsym() -#include // for mach_absolute_time() -#include // for kdebug trace decode -#include // for sysctl() -#include // for usleep() - -struct performance_counters { - double cycles; - double branches; - double missed_branches; - double instructions; - - performance_counters(uint64_t c, uint64_t b, uint64_t m, uint64_t i) - : cycles(c), branches(b), missed_branches(m), instructions(i) {} - - performance_counters(double c, double b, double m, double i) - : cycles(c), branches(b), missed_branches(m), instructions(i) {} - - performance_counters(double init) - : cycles(init), branches(init), missed_branches(init), - instructions(init) {} - - inline performance_counters &operator-=(const performance_counters &other) { - cycles -= other.cycles; - branches -= other.branches; - missed_branches -= other.missed_branches; - instructions -= other.instructions; - return *this; - } - - inline performance_counters &min(const performance_counters &other) { - cycles = other.cycles < cycles ? other.cycles : cycles; - branches = other.branches < branches ? other.branches : branches; - missed_branches = other.missed_branches < missed_branches - ? other.missed_branches - : missed_branches; - instructions = - other.instructions < instructions ? other.instructions : instructions; - return *this; - } - - inline performance_counters &operator+=(const performance_counters &other) { - cycles += other.cycles; - branches += other.branches; - missed_branches += other.missed_branches; - instructions += other.instructions; - return *this; - } - - inline performance_counters &operator/=(double numerator) { - cycles /= numerator; - branches /= numerator; - missed_branches /= numerator; - instructions /= numerator; - return *this; - } -}; - -inline performance_counters operator-(const performance_counters &a, - const performance_counters &b) { - return performance_counters(a.cycles - b.cycles, a.branches - b.branches, - a.missed_branches - b.missed_branches, - a.instructions - b.instructions); -} - -typedef float f32; -typedef double f64; -typedef int8_t i8; -typedef uint8_t u8; -typedef int16_t i16; -typedef uint16_t u16; -typedef int32_t i32; -typedef uint32_t u32; -typedef int64_t i64; -typedef uint64_t u64; -typedef size_t usize; - -// ----------------------------------------------------------------------------- -// header (reverse engineered) -// This framework wraps some sysctl calls to communicate with the kpc in kernel. -// Most functions requires root privileges, or process is "blessed". -// ----------------------------------------------------------------------------- - -// Cross-platform class constants. -#define KPC_CLASS_FIXED (0) -#define KPC_CLASS_CONFIGURABLE (1) -#define KPC_CLASS_POWER (2) -#define KPC_CLASS_RAWPMU (3) - -// Cross-platform class mask constants. -#define KPC_CLASS_FIXED_MASK (1u << KPC_CLASS_FIXED) // 1 -#define KPC_CLASS_CONFIGURABLE_MASK (1u << KPC_CLASS_CONFIGURABLE) // 2 -#define KPC_CLASS_POWER_MASK (1u << KPC_CLASS_POWER) // 4 -#define KPC_CLASS_RAWPMU_MASK (1u << KPC_CLASS_RAWPMU) // 8 - -// PMU version constants. -#define KPC_PMU_ERROR (0) // Error -#define KPC_PMU_INTEL_V3 (1) // Intel -#define KPC_PMU_ARM_APPLE (2) // ARM64 -#define KPC_PMU_INTEL_V2 (3) // Old Intel -#define KPC_PMU_ARM_V2 (4) // Old ARM - -// The maximum number of counters we could read from every class in one go. -// ARMV7: FIXED: 1, CONFIGURABLE: 4 -// ARM32: FIXED: 2, CONFIGURABLE: 6 -// ARM64: FIXED: 2, CONFIGURABLE: CORE_NCTRS - FIXED (6 or 8) -// x86: 32 -#define KPC_MAX_COUNTERS 32 - -// Bits for defining what to do on an action. -// Defined in https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/action.h -#define KPERF_SAMPLER_TH_INFO (1U << 0) -#define KPERF_SAMPLER_TH_SNAPSHOT (1U << 1) -#define KPERF_SAMPLER_KSTACK (1U << 2) -#define KPERF_SAMPLER_USTACK (1U << 3) -#define KPERF_SAMPLER_PMC_THREAD (1U << 4) -#define KPERF_SAMPLER_PMC_CPU (1U << 5) -#define KPERF_SAMPLER_PMC_CONFIG (1U << 6) -#define KPERF_SAMPLER_MEMINFO (1U << 7) -#define KPERF_SAMPLER_TH_SCHEDULING (1U << 8) -#define KPERF_SAMPLER_TH_DISPATCH (1U << 9) -#define KPERF_SAMPLER_TK_SNAPSHOT (1U << 10) -#define KPERF_SAMPLER_SYS_MEM (1U << 11) -#define KPERF_SAMPLER_TH_INSCYC (1U << 12) -#define KPERF_SAMPLER_TK_INFO (1U << 13) - -// Maximum number of kperf action ids. -#define KPERF_ACTION_MAX (32) - -// Maximum number of kperf timer ids. -#define KPERF_TIMER_MAX (8) - -// x86/arm config registers are 64-bit -typedef u64 kpc_config_t; - -/// Print current CPU identification string to the buffer (same as snprintf), -/// such as "cpu_7_8_10b282dc_46". This string can be used to locate the PMC -/// database in /usr/share/kpep. -/// @return string's length, or negative value if error occurs. -/// @note This method does not requires root privileges. -/// @details sysctl get(hw.cputype), get(hw.cpusubtype), -/// get(hw.cpufamily), get(machdep.cpu.model) -static int (*kpc_cpu_string)(char *buf, usize buf_size); - -/// Get the version of KPC that's being run. -/// @return See `PMU version constants` above. -/// @details sysctl get(kpc.pmu_version) -static u32 (*kpc_pmu_version)(void); - -/// Get running PMC classes. -/// @return See `class mask constants` above, -/// 0 if error occurs or no class is set. -/// @details sysctl get(kpc.counting) -static u32 (*kpc_get_counting)(void); - -/// Set PMC classes to enable counting. -/// @param classes See `class mask constants` above, set 0 to shutdown counting. -/// @return 0 for success. -/// @details sysctl set(kpc.counting) -static int (*kpc_set_counting)(u32 classes); - -/// Get running PMC classes for current thread. -/// @return See `class mask constants` above, -/// 0 if error occurs or no class is set. -/// @details sysctl get(kpc.thread_counting) -static u32 (*kpc_get_thread_counting)(void); - -/// Set PMC classes to enable counting for current thread. -/// @param classes See `class mask constants` above, set 0 to shutdown counting. -/// @return 0 for success. -/// @details sysctl set(kpc.thread_counting) -static int (*kpc_set_thread_counting)(u32 classes); - -/// Get how many config registers there are for a given mask. -/// For example: Intel may returns 1 for `KPC_CLASS_FIXED_MASK`, -/// returns 4 for `KPC_CLASS_CONFIGURABLE_MASK`. -/// @param classes See `class mask constants` above. -/// @return 0 if error occurs or no class is set. -/// @note This method does not requires root privileges. -/// @details sysctl get(kpc.config_count) -static u32 (*kpc_get_config_count)(u32 classes); - -/// Get config registers. -/// @param classes see `class mask constants` above. -/// @param config Config buffer to receive values, should not smaller than -/// kpc_get_config_count(classes) * sizeof(kpc_config_t). -/// @return 0 for success. -/// @details sysctl get(kpc.config_count), get(kpc.config) -static int (*kpc_get_config)(u32 classes, kpc_config_t *config); - -/// Set config registers. -/// @param classes see `class mask constants` above. -/// @param config Config buffer, should not smaller than -/// kpc_get_config_count(classes) * sizeof(kpc_config_t). -/// @return 0 for success. -/// @details sysctl get(kpc.config_count), set(kpc.config) -static int (*kpc_set_config)(u32 classes, kpc_config_t *config); - -/// Get how many counters there are for a given mask. -/// For example: Intel may returns 3 for `KPC_CLASS_FIXED_MASK`, -/// returns 4 for `KPC_CLASS_CONFIGURABLE_MASK`. -/// @param classes See `class mask constants` above. -/// @note This method does not requires root privileges. -/// @details sysctl get(kpc.counter_count) -static u32 (*kpc_get_counter_count)(u32 classes); - -/// Get counter accumulations. -/// If `all_cpus` is true, the buffer count should not smaller than -/// (cpu_count * counter_count). Otherwize, the buffer count should not smaller -/// than (counter_count). -/// @see kpc_get_counter_count(), kpc_cpu_count(). -/// @param all_cpus true for all CPUs, false for current cpu. -/// @param classes See `class mask constants` above. -/// @param curcpu A pointer to receive current cpu id, can be NULL. -/// @param buf Buffer to receive counter's value. -/// @return 0 for success. -/// @details sysctl get(hw.ncpu), get(kpc.counter_count), get(kpc.counters) -static int (*kpc_get_cpu_counters)(bool all_cpus, u32 classes, int *curcpu, - u64 *buf); - -/// Get counter accumulations for current thread. -/// @param tid Thread id, should be 0. -/// @param buf_count The number of buf's elements (not bytes), -/// should not smaller than kpc_get_counter_count(). -/// @param buf Buffer to receive counter's value. -/// @return 0 for success. -/// @details sysctl get(kpc.thread_counters) -static int (*kpc_get_thread_counters)(u32 tid, u32 buf_count, u64 *buf); - -/// Acquire/release the counters used by the Power Manager. -/// @param val 1:acquire, 0:release -/// @return 0 for success. -/// @details sysctl set(kpc.force_all_ctrs) -static int (*kpc_force_all_ctrs_set)(int val); - -/// Get the state of all_ctrs. -/// @return 0 for success. -/// @details sysctl get(kpc.force_all_ctrs) -static int (*kpc_force_all_ctrs_get)(int *val_out); - -/// Set number of actions, should be `KPERF_ACTION_MAX`. -/// @details sysctl set(kperf.action.count) -static int (*kperf_action_count_set)(u32 count); - -/// Get number of actions. -/// @details sysctl get(kperf.action.count) -static int (*kperf_action_count_get)(u32 *count); - -/// Set what to sample when a trigger fires an action, e.g. -/// `KPERF_SAMPLER_PMC_CPU`. -/// @details sysctl set(kperf.action.samplers) -static int (*kperf_action_samplers_set)(u32 actionid, u32 sample); - -/// Get what to sample when a trigger fires an action. -/// @details sysctl get(kperf.action.samplers) -static int (*kperf_action_samplers_get)(u32 actionid, u32 *sample); - -/// Apply a task filter to the action, -1 to disable filter. -/// @details sysctl set(kperf.action.filter_by_task) -static int (*kperf_action_filter_set_by_task)(u32 actionid, i32 port); - -/// Apply a pid filter to the action, -1 to disable filter. -/// @details sysctl set(kperf.action.filter_by_pid) -static int (*kperf_action_filter_set_by_pid)(u32 actionid, i32 pid); - -/// Set number of time triggers, should be `KPERF_TIMER_MAX`. -/// @details sysctl set(kperf.timer.count) -static int (*kperf_timer_count_set)(u32 count); - -/// Get number of time triggers. -/// @details sysctl get(kperf.timer.count) -static int (*kperf_timer_count_get)(u32 *count); - -/// Set timer number and period. -/// @details sysctl set(kperf.timer.period) -static int (*kperf_timer_period_set)(u32 actionid, u64 tick); - -/// Get timer number and period. -/// @details sysctl get(kperf.timer.period) -static int (*kperf_timer_period_get)(u32 actionid, u64 *tick); - -/// Set timer number and actionid. -/// @details sysctl set(kperf.timer.action) -static int (*kperf_timer_action_set)(u32 actionid, u32 timerid); - -/// Get timer number and actionid. -/// @details sysctl get(kperf.timer.action) -static int (*kperf_timer_action_get)(u32 actionid, u32 *timerid); - -/// Set which timer ID does PET (Profile Every Thread). -/// @details sysctl set(kperf.timer.pet_timer) -static int (*kperf_timer_pet_set)(u32 timerid); - -/// Get which timer ID does PET (Profile Every Thread). -/// @details sysctl get(kperf.timer.pet_timer) -static int (*kperf_timer_pet_get)(u32 *timerid); - -/// Enable or disable sampling. -/// @details sysctl set(kperf.sampling) -static int (*kperf_sample_set)(u32 enabled); - -/// Get is currently sampling. -/// @details sysctl get(kperf.sampling) -static int (*kperf_sample_get)(u32 *enabled); - -/// Reset kperf: stop sampling, kdebug, timers and actions. -/// @return 0 for success. -static int (*kperf_reset)(void); - -/// Nanoseconds to CPU ticks. -static u64 (*kperf_ns_to_ticks)(u64 ns); - -/// CPU ticks to nanoseconds. -static u64 (*kperf_ticks_to_ns)(u64 ticks); - -/// CPU ticks frequency (mach_absolute_time). -static u64 (*kperf_tick_frequency)(void); - -/// Get lightweight PET mode (not in kperf.framework). -static int kperf_lightweight_pet_get(u32 *enabled) { - if (!enabled) - return -1; - usize size = 4; - return sysctlbyname("kperf.lightweight_pet", enabled, &size, NULL, 0); -} - -/// Set lightweight PET mode (not in kperf.framework). -static int kperf_lightweight_pet_set(u32 enabled) { - return sysctlbyname("kperf.lightweight_pet", NULL, NULL, &enabled, 4); -} - -// ----------------------------------------------------------------------------- -// header (reverse engineered) -// This framework provides some functions to access the local CPU database. -// These functions do not require root privileges. -// ----------------------------------------------------------------------------- - -// KPEP CPU archtecture constants. -#define KPEP_ARCH_I386 0 -#define KPEP_ARCH_X86_64 1 -#define KPEP_ARCH_ARM 2 -#define KPEP_ARCH_ARM64 3 - -/// KPEP event (size: 48/28 bytes on 64/32 bit OS) -typedef struct kpep_event { - const char *name; ///< Unique name of a event, such as "INST_RETIRED.ANY". - const char *description; ///< Description for this event. - const char *errata; ///< Errata, currently NULL. - const char *alias; ///< Alias name, such as "Instructions", "Cycles". - const char *fallback; ///< Fallback event name for fixed counter. - u32 mask; - u8 number; - u8 umask; - u8 reserved; - u8 is_fixed; -} kpep_event; - -/// KPEP database (size: 144/80 bytes on 64/32 bit OS) -typedef struct kpep_db { - const char *name; ///< Database name, such as "haswell". - const char *cpu_id; ///< Plist name, such as "cpu_7_8_10b282dc". - const char *marketing_name; ///< Marketing name, such as "Intel Haswell". - void *plist_data; ///< Plist data (CFDataRef), currently NULL. - void *event_map; ///< All events (CFDict). - kpep_event - *event_arr; ///< Event struct buffer (sizeof(kpep_event) * events_count). - kpep_event **fixed_event_arr; ///< Fixed counter events (sizeof(kpep_event *) - ///< * fixed_counter_count) - void *alias_map; ///< All aliases (CFDict). - usize reserved_1; - usize reserved_2; - usize reserved_3; - usize event_count; ///< All events count. - usize alias_count; - usize fixed_counter_count; - usize config_counter_count; - usize power_counter_count; - u32 archtecture; ///< see `KPEP CPU archtecture constants` above. - u32 fixed_counter_bits; - u32 config_counter_bits; - u32 power_counter_bits; -} kpep_db; - -/// KPEP config (size: 80/44 bytes on 64/32 bit OS) -typedef struct kpep_config { - kpep_db *db; - kpep_event **ev_arr; ///< (sizeof(kpep_event *) * counter_count), init NULL - usize *ev_map; ///< (sizeof(usize *) * counter_count), init 0 - usize *ev_idx; ///< (sizeof(usize *) * counter_count), init -1 - u32 *flags; ///< (sizeof(u32 *) * counter_count), init 0 - u64 *kpc_periods; ///< (sizeof(u64 *) * counter_count), init 0 - usize event_count; /// kpep_config_events_count() - usize counter_count; - u32 classes; ///< See `class mask constants` above. - u32 config_counter; - u32 power_counter; - u32 reserved; -} kpep_config; - -/// Error code for kpep_config_xxx() and kpep_db_xxx() functions. -typedef enum { - KPEP_CONFIG_ERROR_NONE = 0, - KPEP_CONFIG_ERROR_INVALID_ARGUMENT = 1, - KPEP_CONFIG_ERROR_OUT_OF_MEMORY = 2, - KPEP_CONFIG_ERROR_IO = 3, - KPEP_CONFIG_ERROR_BUFFER_TOO_SMALL = 4, - KPEP_CONFIG_ERROR_CUR_SYSTEM_UNKNOWN = 5, - KPEP_CONFIG_ERROR_DB_PATH_INVALID = 6, - KPEP_CONFIG_ERROR_DB_NOT_FOUND = 7, - KPEP_CONFIG_ERROR_DB_ARCH_UNSUPPORTED = 8, - KPEP_CONFIG_ERROR_DB_VERSION_UNSUPPORTED = 9, - KPEP_CONFIG_ERROR_DB_CORRUPT = 10, - KPEP_CONFIG_ERROR_EVENT_NOT_FOUND = 11, - KPEP_CONFIG_ERROR_CONFLICTING_EVENTS = 12, - KPEP_CONFIG_ERROR_COUNTERS_NOT_FORCED = 13, - KPEP_CONFIG_ERROR_EVENT_UNAVAILABLE = 14, - KPEP_CONFIG_ERROR_ERRNO = 15, - KPEP_CONFIG_ERROR_MAX -} kpep_config_error_code; - -/// Error description for kpep_config_error_code. -static const char *kpep_config_error_names[KPEP_CONFIG_ERROR_MAX] = { - "none", - "invalid argument", - "out of memory", - "I/O", - "buffer too small", - "current system unknown", - "database path invalid", - "database not found", - "database architecture unsupported", - "database version unsupported", - "database corrupt", - "event not found", - "conflicting events", - "all counters must be forced", - "event unavailable", - "check errno"}; - -/// Error description. -static const char *kpep_config_error_desc(int code) { - if (0 <= code && code < KPEP_CONFIG_ERROR_MAX) { - return kpep_config_error_names[code]; - } - return "unknown error"; -} - -/// Create a config. -/// @param db A kpep db, see kpep_db_create() -/// @param cfg_ptr A pointer to receive the new config. -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_config_create)(kpep_db *db, kpep_config **cfg_ptr); - -/// Free the config. -static void (*kpep_config_free)(kpep_config *cfg); - -/// Add an event to config. -/// @param cfg The config. -/// @param ev_ptr A event pointer. -/// @param flag 0: all, 1: user space only -/// @param err Error bitmap pointer, can be NULL. -/// If return value is `CONFLICTING_EVENTS`, this bitmap contains -/// the conflicted event indices, e.g. "1 << 2" means index 2. -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_config_add_event)(kpep_config *cfg, kpep_event **ev_ptr, - u32 flag, u32 *err); - -/// Remove event at index. -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_config_remove_event)(kpep_config *cfg, usize idx); - -/// Force all counters. -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_config_force_counters)(kpep_config *cfg); - -/// Get events count. -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_config_events_count)(kpep_config *cfg, usize *count_ptr); - -/// Get all event pointers. -/// @param buf A buffer to receive event pointers. -/// @param buf_size The buffer's size in bytes, should not smaller than -/// kpep_config_events_count() * sizeof(void *). -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_config_events)(kpep_config *cfg, kpep_event **buf, - usize buf_size); - -/// Get kpc register configs. -/// @param buf A buffer to receive kpc register configs. -/// @param buf_size The buffer's size in bytes, should not smaller than -/// kpep_config_kpc_count() * sizeof(kpc_config_t). -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_config_kpc)(kpep_config *cfg, kpc_config_t *buf, - usize buf_size); - -/// Get kpc register config count. -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_config_kpc_count)(kpep_config *cfg, usize *count_ptr); - -/// Get kpc classes. -/// @param classes See `class mask constants` above. -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_config_kpc_classes)(kpep_config *cfg, u32 *classes_ptr); - -/// Get the index mapping from event to counter. -/// @param buf A buffer to receive indexes. -/// @param buf_size The buffer's size in bytes, should not smaller than -/// kpep_config_events_count() * sizeof(kpc_config_t). -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_config_kpc_map)(kpep_config *cfg, usize *buf, usize buf_size); - -/// Open a kpep database file in "/usr/share/kpep/" or "/usr/local/share/kpep/". -/// @param name File name, for example "haswell", "cpu_100000c_1_92fb37c8". -/// Pass NULL for current CPU. -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_db_create)(const char *name, kpep_db **db_ptr); - -/// Free the kpep database. -static void (*kpep_db_free)(kpep_db *db); - -/// Get the database's name. -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_db_name)(kpep_db *db, const char **name); - -/// Get the event alias count. -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_db_aliases_count)(kpep_db *db, usize *count); - -/// Get all alias. -/// @param buf A buffer to receive all alias strings. -/// @param buf_size The buffer's size in bytes, -/// should not smaller than kpep_db_aliases_count() * sizeof(void *). -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_db_aliases)(kpep_db *db, const char **buf, usize buf_size); - -/// Get counters count for given classes. -/// @param classes 1: Fixed, 2: Configurable. -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_db_counters_count)(kpep_db *db, u8 classes, usize *count); - -/// Get all event count. -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_db_events_count)(kpep_db *db, usize *count); - -/// Get all events. -/// @param buf A buffer to receive all event pointers. -/// @param buf_size The buffer's size in bytes, -/// should not smaller than kpep_db_events_count() * sizeof(void *). -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_db_events)(kpep_db *db, kpep_event **buf, usize buf_size); - -/// Get one event by name. -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_db_event)(kpep_db *db, const char *name, kpep_event **ev_ptr); - -/// Get event's name. -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_event_name)(kpep_event *ev, const char **name_ptr); - -/// Get event's alias. -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_event_alias)(kpep_event *ev, const char **alias_ptr); - -/// Get event's description. -/// @return kpep_config_error_code, 0 for success. -static int (*kpep_event_description)(kpep_event *ev, const char **str_ptr); - -// ----------------------------------------------------------------------------- -// load kperf/kperfdata dynamic library -// ----------------------------------------------------------------------------- - -typedef struct { - const char *name; - void **impl; -} lib_symbol; - -#define lib_nelems(x) (sizeof(x) / sizeof((x)[0])) -#define lib_symbol_def(name) \ - { #name, (void **)&name } - -static const lib_symbol lib_symbols_kperf[] = { - lib_symbol_def(kpc_pmu_version), - lib_symbol_def(kpc_cpu_string), - lib_symbol_def(kpc_set_counting), - lib_symbol_def(kpc_get_counting), - lib_symbol_def(kpc_set_thread_counting), - lib_symbol_def(kpc_get_thread_counting), - lib_symbol_def(kpc_get_config_count), - lib_symbol_def(kpc_get_counter_count), - lib_symbol_def(kpc_set_config), - lib_symbol_def(kpc_get_config), - lib_symbol_def(kpc_get_cpu_counters), - lib_symbol_def(kpc_get_thread_counters), - lib_symbol_def(kpc_force_all_ctrs_set), - lib_symbol_def(kpc_force_all_ctrs_get), - lib_symbol_def(kperf_action_count_set), - lib_symbol_def(kperf_action_count_get), - lib_symbol_def(kperf_action_samplers_set), - lib_symbol_def(kperf_action_samplers_get), - lib_symbol_def(kperf_action_filter_set_by_task), - lib_symbol_def(kperf_action_filter_set_by_pid), - lib_symbol_def(kperf_timer_count_set), - lib_symbol_def(kperf_timer_count_get), - lib_symbol_def(kperf_timer_period_set), - lib_symbol_def(kperf_timer_period_get), - lib_symbol_def(kperf_timer_action_set), - lib_symbol_def(kperf_timer_action_get), - lib_symbol_def(kperf_sample_set), - lib_symbol_def(kperf_sample_get), - lib_symbol_def(kperf_reset), - lib_symbol_def(kperf_timer_pet_set), - lib_symbol_def(kperf_timer_pet_get), - lib_symbol_def(kperf_ns_to_ticks), - lib_symbol_def(kperf_ticks_to_ns), - lib_symbol_def(kperf_tick_frequency), -}; - -static const lib_symbol lib_symbols_kperfdata[] = { - lib_symbol_def(kpep_config_create), - lib_symbol_def(kpep_config_free), - lib_symbol_def(kpep_config_add_event), - lib_symbol_def(kpep_config_remove_event), - lib_symbol_def(kpep_config_force_counters), - lib_symbol_def(kpep_config_events_count), - lib_symbol_def(kpep_config_events), - lib_symbol_def(kpep_config_kpc), - lib_symbol_def(kpep_config_kpc_count), - lib_symbol_def(kpep_config_kpc_classes), - lib_symbol_def(kpep_config_kpc_map), - lib_symbol_def(kpep_db_create), - lib_symbol_def(kpep_db_free), - lib_symbol_def(kpep_db_name), - lib_symbol_def(kpep_db_aliases_count), - lib_symbol_def(kpep_db_aliases), - lib_symbol_def(kpep_db_counters_count), - lib_symbol_def(kpep_db_events_count), - lib_symbol_def(kpep_db_events), - lib_symbol_def(kpep_db_event), - lib_symbol_def(kpep_event_name), - lib_symbol_def(kpep_event_alias), - lib_symbol_def(kpep_event_description), -}; - -#define lib_path_kperf "/System/Library/PrivateFrameworks/kperf.framework/kperf" -#define lib_path_kperfdata \ - "/System/Library/PrivateFrameworks/kperfdata.framework/kperfdata" - -static bool lib_inited = false; -static bool lib_has_err = false; -static char lib_err_msg[256]; - -static void *lib_handle_kperf = NULL; -static void *lib_handle_kperfdata = NULL; - -static void lib_deinit(void) { - lib_inited = false; - lib_has_err = false; - if (lib_handle_kperf) - dlclose(lib_handle_kperf); - if (lib_handle_kperfdata) - dlclose(lib_handle_kperfdata); - lib_handle_kperf = NULL; - lib_handle_kperfdata = NULL; - for (usize i = 0; i < lib_nelems(lib_symbols_kperf); i++) { - const lib_symbol *symbol = &lib_symbols_kperf[i]; - *symbol->impl = NULL; - } - for (usize i = 0; i < lib_nelems(lib_symbols_kperfdata); i++) { - const lib_symbol *symbol = &lib_symbols_kperfdata[i]; - *symbol->impl = NULL; - } -} - -static bool lib_init(void) { -#define return_err() \ - do { \ - lib_deinit(); \ - lib_inited = true; \ - lib_has_err = true; \ - return false; \ - } while (false) - - if (lib_inited) - return !lib_has_err; - - // load dynamic library - lib_handle_kperf = dlopen(lib_path_kperf, RTLD_LAZY); - if (!lib_handle_kperf) { - snprintf(lib_err_msg, sizeof(lib_err_msg), - "Failed to load kperf.framework, message: %s.", dlerror()); - return_err(); - } - lib_handle_kperfdata = dlopen(lib_path_kperfdata, RTLD_LAZY); - if (!lib_handle_kperfdata) { - snprintf(lib_err_msg, sizeof(lib_err_msg), - "Failed to load kperfdata.framework, message: %s.", dlerror()); - return_err(); - } - - // load symbol address from dynamic library - for (usize i = 0; i < lib_nelems(lib_symbols_kperf); i++) { - const lib_symbol *symbol = &lib_symbols_kperf[i]; - *symbol->impl = dlsym(lib_handle_kperf, symbol->name); - if (!*symbol->impl) { - snprintf(lib_err_msg, sizeof(lib_err_msg), - "Failed to load kperf function: %s.", symbol->name); - return_err(); - } - } - for (usize i = 0; i < lib_nelems(lib_symbols_kperfdata); i++) { - const lib_symbol *symbol = &lib_symbols_kperfdata[i]; - *symbol->impl = dlsym(lib_handle_kperfdata, symbol->name); - if (!*symbol->impl) { - snprintf(lib_err_msg, sizeof(lib_err_msg), - "Failed to load kperfdata function: %s.", symbol->name); - return_err(); - } - } - - lib_inited = true; - lib_has_err = false; - return true; - -#undef return_err -} - -// ----------------------------------------------------------------------------- -// kdebug private structs -// https://github.com/apple/darwin-xnu/blob/main/bsd/sys_private/kdebug_private.h -// ----------------------------------------------------------------------------- - -/* - * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf - * structure. - */ -#if defined(__arm64__) -typedef uint64_t kd_buf_argtype; -#else -typedef uintptr_t kd_buf_argtype; -#endif - -typedef struct { - uint64_t timestamp; - kd_buf_argtype arg1; - kd_buf_argtype arg2; - kd_buf_argtype arg3; - kd_buf_argtype arg4; - kd_buf_argtype arg5; /* the thread ID */ - uint32_t debugid; /* see */ - -/* - * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf - * structure. - */ -#if defined(__LP64__) || defined(__arm64__) - uint32_t cpuid; /* cpu index, from 0 */ - kd_buf_argtype unused; -#endif -} kd_buf; - -/* bits for the type field of kd_regtype */ -#define KDBG_CLASSTYPE 0x10000 -#define KDBG_SUBCLSTYPE 0x20000 -#define KDBG_RANGETYPE 0x40000 -#define KDBG_TYPENONE 0x80000 -#define KDBG_CKTYPES 0xF0000 - -/* only trace at most 4 types of events, at the code granularity */ -#define KDBG_VALCHECK 0x00200000U - -typedef struct { - unsigned int type; - unsigned int value1; - unsigned int value2; - unsigned int value3; - unsigned int value4; -} kd_regtype; - -typedef struct { - /* number of events that can fit in the buffers */ - int nkdbufs; - /* set if trace is disabled */ - int nolog; - /* kd_ctrl_page.flags */ - unsigned int flags; - /* number of threads in thread map */ - int nkdthreads; - /* the owning pid */ - int bufid; -} kbufinfo_t; - -// ----------------------------------------------------------------------------- -// kdebug utils -// ----------------------------------------------------------------------------- - -/// Clean up trace buffers and reset ktrace/kdebug/kperf. -/// @return 0 on success. -static int kdebug_reset(void) { - int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDREMOVE}; - return sysctl(mib, 3, NULL, NULL, NULL, 0); -} - -/// Disable and reinitialize the trace buffers. -/// @return 0 on success. -static int kdebug_reinit(void) { - int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETUP}; - return sysctl(mib, 3, NULL, NULL, NULL, 0); -} - -/// Set debug filter. -static int kdebug_setreg(kd_regtype *kdr) { - int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETREG}; - usize size = sizeof(kd_regtype); - return sysctl(mib, 3, kdr, &size, NULL, 0); -} - -/// Set maximum number of trace entries (kd_buf). -/// Only allow allocation up to half the available memory (sane_size). -/// @return 0 on success. -static int kdebug_trace_setbuf(int nbufs) { - int mib[4] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETBUF, nbufs}; - return sysctl(mib, 4, NULL, NULL, NULL, 0); -} - -/// Enable or disable kdebug trace. -/// Trace buffer must already be initialized. -/// @return 0 on success. -static int kdebug_trace_enable(bool enable) { - int mib[4] = {CTL_KERN, KERN_KDEBUG, KERN_KDENABLE, enable}; - return sysctl(mib, 4, NULL, 0, NULL, 0); -} - -/// Retrieve trace buffer information from kernel. -/// @return 0 on success. -static int kdebug_get_bufinfo(kbufinfo_t *info) { - if (!info) - return -1; - int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDGETBUF}; - size_t needed = sizeof(kbufinfo_t); - return sysctl(mib, 3, info, &needed, NULL, 0); -} - -/// Retrieve trace buffers from kernel. -/// @param buf Memory to receive buffer data, array of `kd_buf`. -/// @param len Length of `buf` in bytes. -/// @param count Number of trace entries (kd_buf) obtained. -/// @return 0 on success. -static int kdebug_trace_read(void *buf, usize len, usize *count) { - if (count) - *count = 0; - if (!buf || !len) - return -1; - - // Note: the input and output units are not the same. - // input: bytes - // output: number of kd_buf - int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDREADTR}; - int ret = sysctl(mib, 3, buf, &len, NULL, 0); - if (ret != 0) - return ret; - *count = len; - return 0; -} - -/// Block until there are new buffers filled or `timeout_ms` have passed. -/// @param timeout_ms timeout milliseconds, 0 means wait forever. -/// @param suc set true if new buffers filled. -/// @return 0 on success. -static int kdebug_wait(usize timeout_ms, bool *suc) { - if (timeout_ms == 0) - return -1; - int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDBUFWAIT}; - usize val = timeout_ms; - int ret = sysctl(mib, 3, NULL, &val, NULL, 0); - if (suc) - *suc = !!val; - return ret; -} - -// ----------------------------------------------------------------------------- -// Demo -// ----------------------------------------------------------------------------- - -#define EVENT_NAME_MAX 8 - -typedef struct { - const char *alias; /// name for print - const char *names[EVENT_NAME_MAX]; /// name from pmc db -} event_alias; - -/// Event names from /usr/share/kpep/.plist -static const event_alias profile_events[] = { - {"cycles", - { - "FIXED_CYCLES", // Apple A7-A15//CORE_ACTIVE_CYCLE - "CPU_CLK_UNHALTED.THREAD", // Intel Core 1th-10th - "CPU_CLK_UNHALTED.CORE", // Intel Yonah, Merom - }}, - {"instructions", - { - "FIXED_INSTRUCTIONS", // Apple A7-A15 - "INST_RETIRED.ANY" // Intel Yonah, Merom, Core 1th-10th - }}, - {"branches", - { - "INST_BRANCH", // Apple A7-A15 - "BR_INST_RETIRED.ALL_BRANCHES", // Intel Core 1th-10th - "INST_RETIRED.ANY", // Intel Yonah, Merom - }}, - {"branch-misses", - { - "BRANCH_MISPRED_NONSPEC", // Apple A7-A15, since iOS 15, macOS 12 - "BRANCH_MISPREDICT", // Apple A7-A14 - "BR_MISP_RETIRED.ALL_BRANCHES", // Intel Core 2th-10th - "BR_INST_RETIRED.MISPRED", // Intel Yonah, Merom - }}, -}; - -static kpep_event *get_event(kpep_db *db, const event_alias *alias) { - for (usize j = 0; j < EVENT_NAME_MAX; j++) { - const char *name = alias->names[j]; - if (!name) - break; - kpep_event *ev = NULL; - if (kpep_db_event(db, name, &ev) == 0) { - return ev; - } - } - return NULL; -} - -kpc_config_t regs[KPC_MAX_COUNTERS] = {0}; -usize counter_map[KPC_MAX_COUNTERS] = {0}; -u64 counters_0[KPC_MAX_COUNTERS] = {0}; -u64 counters_1[KPC_MAX_COUNTERS] = {0}; -const usize ev_count = sizeof(profile_events) / sizeof(profile_events[0]); - -bool setup_performance_counters() { - static bool init = false; - static bool worked = false; - - if (init) { - return worked; - } - init = true; - - // load dylib - if (!lib_init()) { - printf("Error: %s\n", lib_err_msg); - return (worked = false); - } - - // check permission - int force_ctrs = 0; - if (kpc_force_all_ctrs_get(&force_ctrs)) { - // printf("Permission denied, xnu/kpc requires root privileges.\n"); - return (worked = false); - } - int ret; - // load pmc db - kpep_db *db = NULL; - if ((ret = kpep_db_create(NULL, &db))) { - printf("Error: cannot load pmc database: %d.\n", ret); - return (worked = false); - } - printf("loaded db: %s (%s)\n", db->name, db->marketing_name); - - // create a config - kpep_config *cfg = NULL; - if ((ret = kpep_config_create(db, &cfg))) { - printf("Failed to create kpep config: %d (%s).\n", ret, - kpep_config_error_desc(ret)); - return (worked = false); - } - if ((ret = kpep_config_force_counters(cfg))) { - printf("Failed to force counters: %d (%s).\n", ret, - kpep_config_error_desc(ret)); - return (worked = false); - } - - // get events - kpep_event *ev_arr[ev_count] = {0}; - for (usize i = 0; i < ev_count; i++) { - const event_alias *alias = profile_events + i; - ev_arr[i] = get_event(db, alias); - if (!ev_arr[i]) { - printf("Cannot find event: %s.\n", alias->alias); - return (worked = false); - } - } - - // add event to config - for (usize i = 0; i < ev_count; i++) { - kpep_event *ev = ev_arr[i]; - if ((ret = kpep_config_add_event(cfg, &ev, 0, NULL))) { - printf("Failed to add event: %d (%s).\n", ret, - kpep_config_error_desc(ret)); - return (worked = false); - } - } - - // prepare buffer and config - u32 classes = 0; - usize reg_count = 0; - if ((ret = kpep_config_kpc_classes(cfg, &classes))) { - printf("Failed get kpc classes: %d (%s).\n", ret, - kpep_config_error_desc(ret)); - return (worked = false); - } - if ((ret = kpep_config_kpc_count(cfg, ®_count))) { - printf("Failed get kpc count: %d (%s).\n", ret, - kpep_config_error_desc(ret)); - return (worked = false); - } - if ((ret = kpep_config_kpc_map(cfg, counter_map, sizeof(counter_map)))) { - printf("Failed get kpc map: %d (%s).\n", ret, kpep_config_error_desc(ret)); - return (worked = false); - } - if ((ret = kpep_config_kpc(cfg, regs, sizeof(regs)))) { - printf("Failed get kpc registers: %d (%s).\n", ret, - kpep_config_error_desc(ret)); - return (worked = false); - } - - // set config to kernel - if ((ret = kpc_force_all_ctrs_set(1))) { - printf("Failed force all ctrs: %d.\n", ret); - return (worked = false); - } - if ((classes & KPC_CLASS_CONFIGURABLE_MASK) && reg_count) { - if ((ret = kpc_set_config(classes, regs))) { - printf("Failed set kpc config: %d.\n", ret); - return (worked = false); - } - } - - // start counting - if ((ret = kpc_set_counting(classes))) { - printf("Failed set counting: %d.\n", ret); - return (worked = false); - } - if ((ret = kpc_set_thread_counting(classes))) { - printf("Failed set thread counting: %d.\n", ret); - return (worked = false); - } - - return (worked = true); -} - -inline performance_counters get_counters() { - static bool warned = false; - int ret; - // get counters before - if ((ret = kpc_get_thread_counters(0, KPC_MAX_COUNTERS, counters_0))) { - if (!warned) { - - printf("Failed get thread counters before: %d.\n", ret); - warned = true; - } - return 1; - } - /*printf("counters value:\n"); - for (usize i = 0; i < ev_count; i++) { - const event_alias *alias = profile_events + i; - usize idx = counter_map[i]; - u64 val = counters_1[idx] - counters_0[idx]; - printf("%14s: %llu\n", alias->alias, val); - }*/ - return performance_counters{ - counters_0[counter_map[0]], counters_0[counter_map[2]], - counters_0[counter_map[3]], counters_0[counter_map[1]]}; -} - -#endif diff --git a/benchmarks/bench_ip.cpp b/benchmarks/bench_ip.cpp new file mode 100644 index 00000000..782358e2 --- /dev/null +++ b/benchmarks/bench_ip.cpp @@ -0,0 +1,164 @@ +#include "counters/bench.h" +#include "fast_float/fast_float.h" +#include +#include +#include +#include +#include +#include +#include +#include + +void pretty_print(size_t volume, size_t bytes, std::string name, + counters::event_aggregate agg) { + printf("%-40s : ", name.c_str()); + printf(" %5.2f GB/s ", bytes / agg.fastest_elapsed_ns()); + printf(" %5.1f Ma/s ", volume * 1000.0 / agg.fastest_elapsed_ns()); + printf(" %5.2f ns/d ", agg.fastest_elapsed_ns() / volume); + if (counters::event_collector().has_events()) { + printf(" %5.2f GHz ", agg.fastest_cycles() / agg.fastest_elapsed_ns()); + printf(" %5.2f c/d ", agg.fastest_cycles() / volume); + printf(" %5.2f i/d ", agg.fastest_instructions() / volume); + printf(" %5.2f c/b ", agg.fastest_cycles() / bytes); + printf(" %5.2f i/b ", agg.fastest_instructions() / bytes); + printf(" %5.2f i/c ", agg.fastest_instructions() / agg.fastest_cycles()); + } + printf("\n"); +} + +int parse_u8_fastfloat(const char *&p, const char *pend, uint8_t *out) { + if (p == pend) + return 0; + auto r = fast_float::from_chars(p, pend, *out); + if (r.ec == std::errc()) { + p = r.ptr; + return 1; + } + return 0; +} + +static inline int parse_u8_fromchars(const char *&p, const char *pend, + uint8_t *out) { + if (p == pend) + return 0; + auto r = std::from_chars(p, pend, *out); + if (r.ec == std::errc()) { + p = r.ptr; + return 1; + } + return 0; +} + +template +static inline int parse_ip_line(const char *&p, const char *pend, uint32_t &sum, + Parser parse_uint8) { + uint8_t o = 0; + for (int i = 0; i < 4; ++i) { + if (!parse_uint8(p, pend, &o)) + return 0; + sum += o; + if (i != 3) { + if (p == pend || *p != '.') + return 0; + ++p; + } + } + // consume optional '\r' + if (p != pend && *p == '\r') + ++p; + // expect '\n' or end + if (p != pend && *p == '\n') + ++p; + return 1; +} + +static std::string make_ip_line(uint8_t a, uint8_t b, uint8_t c, uint8_t d) { + std::string s; + s.reserve(16); + s += std::to_string(a); + s += '.'; + s += std::to_string(b); + s += '.'; + s += std::to_string(c); + s += '.'; + s += std::to_string(d); + s += '\n'; + return s; +} + +int main() { + constexpr size_t N = 500000; + std::mt19937 rng(1234); + std::uniform_int_distribution dist(0, 255); + + std::string buf; + buf.reserve(N * 16); + + for (size_t i = 0; i < N; ++i) { + uint8_t a = (uint8_t)dist(rng); + uint8_t b = (uint8_t)dist(rng); + uint8_t c = (uint8_t)dist(rng); + uint8_t d = (uint8_t)dist(rng); + buf += make_ip_line(a, b, c, d); + } + + // sentinel to allow 4-byte loads at end + buf.append(4, '\0'); + + const size_t bytes = buf.size() - 4; // exclude sentinel from throughput + const size_t volume = N; + + // validate correctness + { + const char *start = buf.data(); + const char *end = buf.data() + bytes; + const char *p = start; + const char *pend = end; + uint32_t sum = 0; + for (size_t i = 0; i < N; ++i) { + int ok = parse_ip_line(p, pend, sum, parse_u8_fromchars); + if (!ok) { + std::fprintf(stderr, "fromchars parse failed at line %zu\n", i); + std::abort(); + } + p = start; + pend = end; + ok = parse_ip_line(p, pend, sum, parse_u8_fastfloat); + if (!ok) { + std::fprintf(stderr, "fastswar parse failed at line %zu\n", i); + std::abort(); + } + } + } + + uint32_t sink = 0; + + pretty_print(volume, bytes, "parse_ip_std_fromchars", counters::bench([&]() { + const char *p = buf.data(); + const char *pend = buf.data() + bytes; + uint32_t sum = 0; + int ok = 0; + for (size_t i = 0; i < N; ++i) { + ok = parse_ip_line(p, pend, sum, parse_u8_fromchars); + if (!ok) + std::abort(); + } + sink += sum; + })); + + pretty_print(volume, bytes, "parse_ip_fastfloat", counters::bench([&]() { + const char *p = buf.data(); + const char *pend = buf.data() + bytes; + uint32_t sum = 0; + int ok = 0; + for (size_t i = 0; i < N; ++i) { + ok = parse_ip_line(p, pend, sum, parse_u8_fastfloat); + if (!ok) + std::abort(); + } + sink += sum; + })); + + std::printf("sink=%u\n", sink); + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/benchmarks/benchmark.cpp b/benchmarks/benchmark.cpp index 05f12330..d90038ed 100644 --- a/benchmarks/benchmark.cpp +++ b/benchmarks/benchmark.cpp @@ -1,7 +1,7 @@ #if defined(__linux__) || (__APPLE__ && __aarch64__) #define USING_COUNTERS #endif -#include "event_counter.h" +#include "counters/event_counter.h" #include #include "fast_float/fast_float.h" #include @@ -50,14 +50,14 @@ double findmax_fastfloat32(std::vector> &s) { return answer; } -event_collector collector{}; +counters::event_collector collector{}; #ifdef USING_COUNTERS template -std::vector +std::vector time_it_ns(std::vector> &lines, T const &function, size_t repeat) { - std::vector aggregate; + std::vector aggregate; bool printed_bug = false; for (size_t i = 0; i < repeat; i++) { collector.start(); @@ -72,7 +72,7 @@ time_it_ns(std::vector> &lines, T const &function, } void pretty_print(double volume, size_t number_of_floats, std::string name, - std::vector events) { + std::vector events) { double volumeMB = volume / (1024. * 1024.); double average_ns{0}; double min_ns{DBL_MAX}; @@ -84,7 +84,7 @@ void pretty_print(double volume, size_t number_of_floats, std::string name, double branches_avg{0}; double branch_misses_min{0}; double branch_misses_avg{0}; - for (event_count e : events) { + for (counters::event_count e : events) { double ns = e.elapsed_ns(); average_ns += ns; min_ns = min_ns < ns ? min_ns : ns; @@ -102,7 +102,7 @@ void pretty_print(double volume, size_t number_of_floats, std::string name, branches_avg += branches; branches_min = branches_min < branches ? branches_min : branches; - double branch_misses = e.missed_branches(); + double branch_misses = e.branch_misses(); branch_misses_avg += branch_misses; branch_misses_min = branch_misses_min < branch_misses ? branch_misses_min : branch_misses; diff --git a/benchmarks/event_counter.h b/benchmarks/event_counter.h deleted file mode 100644 index cd594787..00000000 --- a/benchmarks/event_counter.h +++ /dev/null @@ -1,181 +0,0 @@ -#ifndef __EVENT_COUNTER_H -#define __EVENT_COUNTER_H - -#include -#ifndef _MSC_VER -#include -#endif -#include - -#include - -#include -#include - -#include "linux-perf-events.h" -#ifdef __linux__ -#include -#endif - -#if (defined(__APPLE__) && __APPLE__) && (defined(__aarch64__) && __aarch64__) -#include "apple_arm_events.h" -#endif - -struct event_count { - std::chrono::duration elapsed; - std::vector event_counts; - - event_count() : elapsed(0), event_counts{0, 0, 0, 0, 0} {} - - event_count(const std::chrono::duration _elapsed, - const std::vector _event_counts) - : elapsed(_elapsed), event_counts(_event_counts) {} - - event_count(const event_count &other) - : elapsed(other.elapsed), event_counts(other.event_counts) {} - - // The types of counters (so we can read the getter more easily) - enum event_counter_types { - CPU_CYCLES = 0, - INSTRUCTIONS = 1, - BRANCHES = 2, - MISSED_BRANCHES = 3 - }; - - double elapsed_sec() const { - return std::chrono::duration(elapsed).count(); - } - - double elapsed_ns() const { - return std::chrono::duration(elapsed).count(); - } - - double cycles() const { - return static_cast(event_counts[CPU_CYCLES]); - } - - double instructions() const { - return static_cast(event_counts[INSTRUCTIONS]); - } - - double branches() const { - return static_cast(event_counts[BRANCHES]); - } - - double missed_branches() const { - return static_cast(event_counts[MISSED_BRANCHES]); - } - - event_count &operator=(const event_count &other) { - this->elapsed = other.elapsed; - this->event_counts = other.event_counts; - return *this; - } - - event_count operator+(const event_count &other) const { - return event_count(elapsed + other.elapsed, - { - event_counts[0] + other.event_counts[0], - event_counts[1] + other.event_counts[1], - event_counts[2] + other.event_counts[2], - event_counts[3] + other.event_counts[3], - event_counts[4] + other.event_counts[4], - }); - } - - void operator+=(const event_count &other) { *this = *this + other; } -}; - -struct event_aggregate { - bool has_events = false; - int iterations = 0; - event_count total{}; - event_count best{}; - event_count worst{}; - - event_aggregate() = default; - - void operator<<(const event_count &other) { - if (iterations == 0 || other.elapsed < best.elapsed) { - best = other; - } - if (iterations == 0 || other.elapsed > worst.elapsed) { - worst = other; - } - iterations++; - total += other; - } - - double elapsed_sec() const { return total.elapsed_sec() / iterations; } - - double elapsed_ns() const { return total.elapsed_ns() / iterations; } - - double cycles() const { return total.cycles() / iterations; } - - double instructions() const { return total.instructions() / iterations; } - - double branches() const { return total.branches() / iterations; } - - double missed_branches() const { - return total.missed_branches() / iterations; - } -}; - -struct event_collector { - event_count count{}; - std::chrono::time_point start_clock{}; - -#if defined(__linux__) - LinuxEvents linux_events; - - event_collector() - : linux_events(std::vector{ - PERF_COUNT_HW_CPU_CYCLES, PERF_COUNT_HW_INSTRUCTIONS, - PERF_COUNT_HW_BRANCH_INSTRUCTIONS, // Retired branch instructions - PERF_COUNT_HW_BRANCH_MISSES}) {} - - bool has_events() { return linux_events.is_working(); } -#elif __APPLE__ && __aarch64__ - performance_counters diff; - - event_collector() : diff(0) { setup_performance_counters(); } - - bool has_events() { return setup_performance_counters(); } -#else - event_collector() {} - - bool has_events() { return false; } -#endif - - inline void start() { -#if defined(__linux) - linux_events.start(); -#elif __APPLE__ && __aarch64__ - if (has_events()) { - diff = get_counters(); - } -#endif - start_clock = std::chrono::steady_clock::now(); - } - - inline event_count &end() { - const auto end_clock = std::chrono::steady_clock::now(); -#if defined(__linux) - linux_events.end(count.event_counts); -#elif __APPLE__ && __aarch64__ - if (has_events()) { - performance_counters end = get_counters(); - diff = end - diff; - } - count.event_counts[0] = diff.cycles; - count.event_counts[1] = diff.instructions; - count.event_counts[2] = diff.branches; - count.event_counts[3] = diff.missed_branches; - count.event_counts[4] = 0; -#endif - count.elapsed = end_clock - start_clock; - return count; - } -}; - -#endif diff --git a/benchmarks/linux-perf-events.h b/benchmarks/linux-perf-events.h deleted file mode 100644 index 0a9e5538..00000000 --- a/benchmarks/linux-perf-events.h +++ /dev/null @@ -1,104 +0,0 @@ -#pragma once -#ifdef __linux__ - -#include // for __NR_perf_event_open -#include // for perf event constants -#include // for ioctl -#include // for syscall - -#include // for errno -#include // for memset -#include - -#include -#include - -template class LinuxEvents { - int fd; - bool working; - perf_event_attr attribs{}; - size_t num_events{}; - std::vector temp_result_vec{}; - std::vector ids{}; - -public: - explicit LinuxEvents(std::vector config_vec) : fd(0), working(true) { - memset(&attribs, 0, sizeof(attribs)); - attribs.type = TYPE; - attribs.size = sizeof(attribs); - attribs.disabled = 1; - attribs.exclude_kernel = 1; - attribs.exclude_hv = 1; - - attribs.sample_period = 0; - attribs.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; - const int pid = 0; // the current process - const int cpu = -1; // all CPUs - const unsigned long flags = 0; - - int group = -1; // no group - num_events = config_vec.size(); - ids.resize(config_vec.size()); - uint32_t i = 0; - for (auto config : config_vec) { - attribs.config = config; - int _fd = static_cast( - syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags)); - if (_fd == -1) { - report_error("perf_event_open"); - } - ioctl(_fd, PERF_EVENT_IOC_ID, &ids[i++]); - if (group == -1) { - group = _fd; - fd = _fd; - } - } - - temp_result_vec.resize(num_events * 2 + 1); - } - - ~LinuxEvents() { - if (fd != -1) { - close(fd); - } - } - - inline void start() { - if (fd != -1) { - if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) { - report_error("ioctl(PERF_EVENT_IOC_RESET)"); - } - - if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) { - report_error("ioctl(PERF_EVENT_IOC_ENABLE)"); - } - } - } - - inline void end(std::vector &results) { - if (fd != -1) { - if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) { - report_error("ioctl(PERF_EVENT_IOC_DISABLE)"); - } - - if (read(fd, temp_result_vec.data(), temp_result_vec.size() * 8) == -1) { - report_error("read"); - } - } - // our actual results are in slots 1,3,5, ... of this structure - for (uint32_t i = 1; i < temp_result_vec.size(); i += 2) { - results[i / 2] = temp_result_vec[i]; - } - for (uint32_t i = 2; i < temp_result_vec.size(); i += 2) { - if (ids[i / 2 - 1] != temp_result_vec[i]) { - report_error("event mismatch"); - } - } - } - - bool is_working() { return working; } - -private: - void report_error(const std::string &) { working = false; } -}; -#endif \ No newline at end of file