Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,15 @@ FetchContent_Declare(
FetchContent_MakeAvailable(spdlog)

# Please modify the below option.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCOMPILE_ZEN_4")
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCOMPILE_ZEN_4")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCOMPILE_INTEL_SPR")
# Supported flag:
# -DCOMPILE_ALDER_LAKE_DDR4
# -DCOMPILE_ALDER_LAKE_DDR5
# -DCOMPILE_RAPTOR_LAKE
# -DCOMPILE_ZEN_4
# -DCOMPILE_INTEL_SKYLAKE
# -DCOMPILE_INTEL_SPR

add_compile_options(-Wall -Wextra -O0)

Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ sudo numactl -C {core} -m {memory} ./validate_mapping
### Decomposing DRAM address mapping into component functions

```bash
sudo numactl -N {node} ./bandwidth_test <num_pages> <functions_seperated_by_commas>

sudo numactl -C {core} -m {memory} ./decompose_functions
-o {fname_prefix} -p {num_pages} -t {ddr_type} -n {num_dimms} \
-s {dimm_size} -r {num_ranks} -w {dq_width} \
Expand Down
23 changes: 23 additions & 0 deletions sudoku/internal/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ set(SUDOKU_INTERNAL_HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/intel_core_12th_ddr5.h
${CMAKE_CURRENT_SOURCE_DIR}/intel_core_14th_ddr5.h
${CMAKE_CURRENT_SOURCE_DIR}/amd_ryzen_zen4_ddr5.h
${CMAKE_CURRENT_SOURCE_DIR}/intel_skylake.h
${CMAKE_CURRENT_SOURCE_DIR}/intel_spr.h
)

add_library(sudoku_internal_lib STATIC
Expand All @@ -32,3 +34,24 @@ target_include_directories(sudoku_internal_lib PUBLIC
)

target_link_libraries(sudoku_internal_lib PUBLIC spdlog::spdlog)

# --------------------------------------------------
# Bandwidth microbenchmark executable
# --------------------------------------------------

find_package(OpenMP REQUIRED)

add_executable(bandwidth_test
${CMAKE_CURRENT_SOURCE_DIR}/bandwidth.cc
)

target_compile_options(bandwidth_test PRIVATE -Wall -Wextra -O3)

target_link_libraries(bandwidth_test PRIVATE
OpenMP::OpenMP_CXX
Threads::Threads
)

set_target_properties(bandwidth_test PROPERTIES
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
)
118 changes: 118 additions & 0 deletions sudoku/internal/bandwidth.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#include <iostream>
#include <vector>
#include <chrono>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <omp.h>
#include <sstream>

uintptr_t get_physical_address_once(uintptr_t virtual_addr) {
int fd = open("/proc/self/pagemap", O_RDONLY);
if (fd < 0) return 0;
size_t pagesize = getpagesize();
off_t offset = (virtual_addr / pagesize) * sizeof(uint64_t);
uint64_t entry;
if (pread(fd, &entry, sizeof(entry), offset) != sizeof(entry)) {
close(fd);
return 0;
}
close(fd);
if (!(entry & (1ULL << 63))) return 0;
uintptr_t pfn = entry & ((1ULL << 55) - 1);
return (pfn * pagesize) | (virtual_addr % pagesize);
}

int main(int argc, char* argv[]) {
if (argc < 3) {
std::cerr << "Usage: " << argv[0]
<< " <Number of HugePages> <Mask1,Mask2,...>"
<< std::endl;
return 1;
}

int num_pages = std::stoi(argv[1]);
std::string mask_str = argv[2];
std::vector<uintptr_t> masks;
std::stringstream ss(mask_str);
std::string item;
while (std::getline(ss, item, ',')) {
masks.push_back(std::stoull(item, nullptr, 16));
}

const size_t HUGE_PAGE_SIZE = 1024ULL * 1024ULL * 1024ULL; // 1GB
size_t total_bytes = (size_t)num_pages * HUGE_PAGE_SIZE;

void* ptr = mmap(NULL, total_bytes,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | (30 << 26),
-1, 0);
if (ptr == MAP_FAILED) {
perror("mmap");
return 1;
}

// [Important] Initialization identical to the original version
// (touch only the first byte of each page)
char* base_ptr = static_cast<char*>(ptr);
for (size_t p = 0; p < (size_t)num_pages; ++p) {
base_ptr[p * HUGE_PAGE_SIZE] = 0;
}

for (uintptr_t mask : masks) {
std::vector<double*> valid_ptrs;

// Filtering is performed internally without printing per-mask messages
for (size_t p = 0; p < (size_t)num_pages; ++p) {
uintptr_t page_vaddr =
reinterpret_cast<uintptr_t>(base_ptr + (p * HUGE_PAGE_SIZE));
uintptr_t page_paddr = get_physical_address_once(page_vaddr);
if (page_paddr == 0) continue;

#pragma omp parallel
{
std::vector<double*> local_ptrs;
#pragma omp for nowait
for (size_t offset = 0; offset < HUGE_PAGE_SIZE; offset += 64) {
uintptr_t current_paddr = page_paddr + offset;
if ((current_paddr & mask) == 0) {
local_ptrs.push_back(
reinterpret_cast<double*>(page_vaddr + offset));
}
}
#pragma omp critical
valid_ptrs.insert(valid_ptrs.end(),
local_ptrs.begin(),
local_ptrs.end());
}
}

if (valid_ptrs.empty()) continue;

// Measurement section (identical to the original)
double global_sum = 0.0;
size_t n = valid_ptrs.size();
auto start = std::chrono::high_resolution_clock::now();

#pragma omp parallel for reduction(+:global_sum)
for (size_t i = 0; i < n; ++i) {
global_sum += *(valid_ptrs[i]);
}

auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;

double read_bytes = n * 64.0;
double bandwidth =
(read_bytes / (1024.0 * 1024.0 * 1024.0)) / diff.count();

// Preserve output format
std::cout << "Mask: 0x" << std::hex << mask << std::dec
<< " | Read bandwidth: " << bandwidth << " GB/s"
<< " (Lines: " << n << ")" << std::endl;
}

munmap(ptr, total_bytes);
return 0;
}

8 changes: 5 additions & 3 deletions sudoku/internal/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,11 @@ struct AddressTableEntry {

static constexpr AddressTableEntry DRAMAddressTable[] = {
// {type, chip_size, dq, bg, ba, row, column, page_size, burst_length}
{DDRType::DDR4, 8ULL * GB, 8, 2, 2, 16, 10, 8 * 1024, 8},
{DDRType::DDR4, 16ULL * GB, 8, 2, 2, 17, 10, 8 * 1024, 8},
{DDRType::DDR5, 16ULL * GB, 8, 3, 2, 16, 10, 8 * 1024, 16},
{DDRType::DDR4, 8ULL * GB, 8, 2, 2, 16, 10, 8 * 1024, 8}, // 16GB DDR4 2Rx8
{DDRType::DDR4, 16ULL * GB, 8, 2, 2, 17, 10, 8 * 1024, 8}, // 32GB DDR4 2Rx8
{DDRType::DDR4, 8ULL * GB, 4, 2, 2, 17, 10, 4 * 1024, 8}, // 32GB DDR4 1Rx4
{DDRType::DDR5, 16ULL * GB, 8, 3, 2, 16, 10, 8 * 1024, 16}, // 32GB DDR5 2Rx8
{DDRType::DDR5, 16ULL * GB, 4, 3, 2, 16, 11, 8 * 1024, 16}, // 32GB DDR5 1Rx4
};

static constexpr uint32_t kNumChipEntries =
Expand Down
4 changes: 4 additions & 0 deletions sudoku/internal/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
#include "intel_core_14th_ddr5.h"
#elif defined(COMPILE_ZEN_4)
#include "amd_ryzen_zen4_ddr5.h"
#elif defined(COMPILE_INTEL_SKYLAKE)
#include "intel_skylake.h"
#elif defined(COMPILE_INTEL_SPR)
#include "intel_spr.h"
#else
#error "Please add the appropriate header files in compile options."
#endif
Expand Down
14 changes: 14 additions & 0 deletions sudoku/internal/intel_skylake.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#ifndef SUDOKU_INTERNAL_INTEL_SKYLAKE_H
#define SUDOKU_INTERNAL_INTEL_SKYLAKE_H

#define SBDR_LOWER_BOUND_ 320
#define SBDR_UPPER_BOUND_ 400
#define REFRESH_CYCLE_LOWER_BOUND_ 500
#define REFRESH_CYCLE_UPPER_BOUND_ 1000
#define BANK_GROUP_THRESHOLD_ 400 // for NUM_READ_STREAM (8) depends on offsets
#define REGULAR_REFRESH_INTERVAL_THRESHOLD_ 20000

#define PCI_OFFSET_UPPER_BOUND_ 0x0
#define PCI_OFFSET_LOWER_BOUND_ 0x0

#endif // SUDOKU_INTERNAL_INTEL_SKYLAKE_H
14 changes: 14 additions & 0 deletions sudoku/internal/intel_spr.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#ifndef SUDOKU_INTERNAL_INTEL_SPR_H
#define SUDOKU_INTERNAL_INTEL_SPR_H

#define SBDR_LOWER_BOUND_ 390 // 400
#define SBDR_UPPER_BOUND_ 500 // 800
#define REFRESH_CYCLE_LOWER_BOUND_ 500
#define REFRESH_CYCLE_UPPER_BOUND_ 1000
#define BANK_GROUP_THRESHOLD_ 400 // for NUM_READ_STREAM (8) depends on offsets
#define REGULAR_REFRESH_INTERVAL_THRESHOLD_ 20000

#define PCI_OFFSET_UPPER_BOUND_ 0x0
#define PCI_OFFSET_LOWER_BOUND_ 0x0

#endif // SUDOKU_INTERNAL_INTEL_SPR_H
2 changes: 1 addition & 1 deletion sudoku/internal/refreshes.cc
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ uint64_t MedianFineRefreshIntervalPairedMemoryAccess(uint64_t faddr,
uint64_t AverageRefreshIntervalPairedAccessFine(uint64_t faddr, uint64_t saddr,
uint64_t threshold) {
uint64_t** histogram = AllocateHistogram(SUDOKU_REFRESH_NUM_ITERATION, 2);
MeasureRefreshPairedAccessCoarse(faddr, saddr, histogram);
MeasureRefreshPairedAccessFine(faddr, saddr, histogram);
// Filter refresh timings and compute intervals
std::vector<uint64_t> refreshes;
FilterRefreshTiming(histogram, 3, threshold, refreshes);
Expand Down