diff --git a/CMakeLists.txt b/CMakeLists.txt index e571adf..35616c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,12 +15,15 @@ FetchContent_Declare( FetchContent_MakeAvailable(spdlog) # Please modify the below option. -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCOMPILE_ZEN_4") +# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCOMPILE_ZEN_4") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCOMPILE_INTEL_SPR") # Supported flag: # -DCOMPILE_ALDER_LAKE_DDR4 # -DCOMPILE_ALDER_LAKE_DDR5 # -DCOMPILE_RAPTOR_LAKE # -DCOMPILE_ZEN_4 +# -DCOMPILE_INTEL_SKYLAKE +# -DCOMPILE_INTEL_SPR add_compile_options(-Wall -Wextra -O0) diff --git a/README.md b/README.md index acb8c94..a846a00 100644 --- a/README.md +++ b/README.md @@ -107,6 +107,8 @@ sudo numactl -C {core} -m {memory} ./validate_mapping ### Decomposing DRAM address mapping into component functions ```bash +sudo numactl -N {node} ./bandwidth_test + sudo numactl -C {core} -m {memory} ./decompose_functions -o {fname_prefix} -p {num_pages} -t {ddr_type} -n {num_dimms} \ -s {dimm_size} -r {num_ranks} -w {dq_width} \ diff --git a/sudoku/internal/CMakeLists.txt b/sudoku/internal/CMakeLists.txt index 3febbc3..6e005f4 100644 --- a/sudoku/internal/CMakeLists.txt +++ b/sudoku/internal/CMakeLists.txt @@ -20,6 +20,8 @@ set(SUDOKU_INTERNAL_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/intel_core_12th_ddr5.h ${CMAKE_CURRENT_SOURCE_DIR}/intel_core_14th_ddr5.h ${CMAKE_CURRENT_SOURCE_DIR}/amd_ryzen_zen4_ddr5.h + ${CMAKE_CURRENT_SOURCE_DIR}/intel_skylake.h + ${CMAKE_CURRENT_SOURCE_DIR}/intel_spr.h ) add_library(sudoku_internal_lib STATIC @@ -32,3 +34,24 @@ target_include_directories(sudoku_internal_lib PUBLIC ) target_link_libraries(sudoku_internal_lib PUBLIC spdlog::spdlog) + +# -------------------------------------------------- +# Bandwidth microbenchmark executable +# -------------------------------------------------- + +find_package(OpenMP REQUIRED) + +add_executable(bandwidth_test + ${CMAKE_CURRENT_SOURCE_DIR}/bandwidth.cc +) + +target_compile_options(bandwidth_test PRIVATE -Wall -Wextra -O3) + +target_link_libraries(bandwidth_test PRIVATE + OpenMP::OpenMP_CXX + Threads::Threads +) + +set_target_properties(bandwidth_test PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" +) diff --git a/sudoku/internal/bandwidth.cc b/sudoku/internal/bandwidth.cc new file mode 100644 index 0000000..9a33c1a --- /dev/null +++ b/sudoku/internal/bandwidth.cc @@ -0,0 +1,118 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +uintptr_t get_physical_address_once(uintptr_t virtual_addr) { + int fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) return 0; + size_t pagesize = getpagesize(); + off_t offset = (virtual_addr / pagesize) * sizeof(uint64_t); + uint64_t entry; + if (pread(fd, &entry, sizeof(entry), offset) != sizeof(entry)) { + close(fd); + return 0; + } + close(fd); + if (!(entry & (1ULL << 63))) return 0; + uintptr_t pfn = entry & ((1ULL << 55) - 1); + return (pfn * pagesize) | (virtual_addr % pagesize); +} + +int main(int argc, char* argv[]) { + if (argc < 3) { + std::cerr << "Usage: " << argv[0] + << " " + << std::endl; + return 1; + } + + int num_pages = std::stoi(argv[1]); + std::string mask_str = argv[2]; + std::vector masks; + std::stringstream ss(mask_str); + std::string item; + while (std::getline(ss, item, ',')) { + masks.push_back(std::stoull(item, nullptr, 16)); + } + + const size_t HUGE_PAGE_SIZE = 1024ULL * 1024ULL * 1024ULL; // 1GB + size_t total_bytes = (size_t)num_pages * HUGE_PAGE_SIZE; + + void* ptr = mmap(NULL, total_bytes, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | (30 << 26), + -1, 0); + if (ptr == MAP_FAILED) { + perror("mmap"); + return 1; + } + + // [Important] Initialization identical to the original version + // (touch only the first byte of each page) + char* base_ptr = static_cast(ptr); + for (size_t p = 0; p < (size_t)num_pages; ++p) { + base_ptr[p * HUGE_PAGE_SIZE] = 0; + } + + for (uintptr_t mask : masks) { + std::vector valid_ptrs; + + // Filtering is performed internally without printing per-mask messages + for (size_t p = 0; p < (size_t)num_pages; ++p) { + uintptr_t page_vaddr = + reinterpret_cast(base_ptr + (p * HUGE_PAGE_SIZE)); + uintptr_t page_paddr = get_physical_address_once(page_vaddr); + if (page_paddr == 0) continue; + + #pragma omp parallel + { + std::vector local_ptrs; + #pragma omp for nowait + for (size_t offset = 0; offset < HUGE_PAGE_SIZE; offset += 64) { + uintptr_t current_paddr = page_paddr + offset; + if ((current_paddr & mask) == 0) { + local_ptrs.push_back( + reinterpret_cast(page_vaddr + offset)); + } + } + #pragma omp critical + valid_ptrs.insert(valid_ptrs.end(), + local_ptrs.begin(), + local_ptrs.end()); + } + } + + if (valid_ptrs.empty()) continue; + + // Measurement section (identical to the original) + double global_sum = 0.0; + size_t n = valid_ptrs.size(); + auto start = std::chrono::high_resolution_clock::now(); + + #pragma omp parallel for reduction(+:global_sum) + for (size_t i = 0; i < n; ++i) { + global_sum += *(valid_ptrs[i]); + } + + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff = end - start; + + double read_bytes = n * 64.0; + double bandwidth = + (read_bytes / (1024.0 * 1024.0 * 1024.0)) / diff.count(); + + // Preserve output format + std::cout << "Mask: 0x" << std::hex << mask << std::dec + << " | Read bandwidth: " << bandwidth << " GB/s" + << " (Lines: " << n << ")" << std::endl; + } + + munmap(ptr, total_bytes); + return 0; +} + diff --git a/sudoku/internal/config.h b/sudoku/internal/config.h index 1dce113..547dbf5 100644 --- a/sudoku/internal/config.h +++ b/sudoku/internal/config.h @@ -44,9 +44,11 @@ struct AddressTableEntry { static constexpr AddressTableEntry DRAMAddressTable[] = { // {type, chip_size, dq, bg, ba, row, column, page_size, burst_length} - {DDRType::DDR4, 8ULL * GB, 8, 2, 2, 16, 10, 8 * 1024, 8}, - {DDRType::DDR4, 16ULL * GB, 8, 2, 2, 17, 10, 8 * 1024, 8}, - {DDRType::DDR5, 16ULL * GB, 8, 3, 2, 16, 10, 8 * 1024, 16}, + {DDRType::DDR4, 8ULL * GB, 8, 2, 2, 16, 10, 8 * 1024, 8}, // 16GB DDR4 2Rx8 + {DDRType::DDR4, 16ULL * GB, 8, 2, 2, 17, 10, 8 * 1024, 8}, // 32GB DDR4 2Rx8 + {DDRType::DDR4, 8ULL * GB, 4, 2, 2, 17, 10, 4 * 1024, 8}, // 32GB DDR4 1Rx4 + {DDRType::DDR5, 16ULL * GB, 8, 3, 2, 16, 10, 8 * 1024, 16}, // 32GB DDR5 2Rx8 + {DDRType::DDR5, 16ULL * GB, 4, 3, 2, 16, 11, 8 * 1024, 16}, // 32GB DDR5 1Rx4 }; static constexpr uint32_t kNumChipEntries = diff --git a/sudoku/internal/constants.h b/sudoku/internal/constants.h index 3767d51..cca2954 100644 --- a/sudoku/internal/constants.h +++ b/sudoku/internal/constants.h @@ -9,6 +9,10 @@ #include "intel_core_14th_ddr5.h" #elif defined(COMPILE_ZEN_4) #include "amd_ryzen_zen4_ddr5.h" +#elif defined(COMPILE_INTEL_SKYLAKE) +#include "intel_skylake.h" +#elif defined(COMPILE_INTEL_SPR) +#include "intel_spr.h" #else #error "Please add the appropriate header files in compile options." #endif diff --git a/sudoku/internal/intel_skylake.h b/sudoku/internal/intel_skylake.h new file mode 100644 index 0000000..e86bbe7 --- /dev/null +++ b/sudoku/internal/intel_skylake.h @@ -0,0 +1,14 @@ +#ifndef SUDOKU_INTERNAL_INTEL_SKYLAKE_H +#define SUDOKU_INTERNAL_INTEL_SKYLAKE_H + +#define SBDR_LOWER_BOUND_ 320 +#define SBDR_UPPER_BOUND_ 400 +#define REFRESH_CYCLE_LOWER_BOUND_ 500 +#define REFRESH_CYCLE_UPPER_BOUND_ 1000 +#define BANK_GROUP_THRESHOLD_ 400 // for NUM_READ_STREAM (8) depends on offsets +#define REGULAR_REFRESH_INTERVAL_THRESHOLD_ 20000 + +#define PCI_OFFSET_UPPER_BOUND_ 0x0 +#define PCI_OFFSET_LOWER_BOUND_ 0x0 + +#endif // SUDOKU_INTERNAL_INTEL_SKYLAKE_H diff --git a/sudoku/internal/intel_spr.h b/sudoku/internal/intel_spr.h new file mode 100644 index 0000000..0dfd01e --- /dev/null +++ b/sudoku/internal/intel_spr.h @@ -0,0 +1,14 @@ +#ifndef SUDOKU_INTERNAL_INTEL_SPR_H +#define SUDOKU_INTERNAL_INTEL_SPR_H + +#define SBDR_LOWER_BOUND_ 390 // 400 +#define SBDR_UPPER_BOUND_ 500 // 800 +#define REFRESH_CYCLE_LOWER_BOUND_ 500 +#define REFRESH_CYCLE_UPPER_BOUND_ 1000 +#define BANK_GROUP_THRESHOLD_ 400 // for NUM_READ_STREAM (8) depends on offsets +#define REGULAR_REFRESH_INTERVAL_THRESHOLD_ 20000 + +#define PCI_OFFSET_UPPER_BOUND_ 0x0 +#define PCI_OFFSET_LOWER_BOUND_ 0x0 + +#endif // SUDOKU_INTERNAL_INTEL_SPR_H diff --git a/sudoku/internal/refreshes.cc b/sudoku/internal/refreshes.cc index 95c8c2f..12fca0e 100644 --- a/sudoku/internal/refreshes.cc +++ b/sudoku/internal/refreshes.cc @@ -184,7 +184,7 @@ uint64_t MedianFineRefreshIntervalPairedMemoryAccess(uint64_t faddr, uint64_t AverageRefreshIntervalPairedAccessFine(uint64_t faddr, uint64_t saddr, uint64_t threshold) { uint64_t** histogram = AllocateHistogram(SUDOKU_REFRESH_NUM_ITERATION, 2); - MeasureRefreshPairedAccessCoarse(faddr, saddr, histogram); + MeasureRefreshPairedAccessFine(faddr, saddr, histogram); // Filter refresh timings and compute intervals std::vector refreshes; FilterRefreshTiming(histogram, 3, threshold, refreshes);