Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 21 additions & 15 deletions scripts/performance_test/perftest_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# Prometheus metrics exporter.
metrics:
enabled: true
# HTTP port for /metrics endpoint (0 = auto-assign free port)
port: 0

controller:
Expand All @@ -20,37 +21,39 @@ backend:
# SimpleStorage, Yuanrong, MooncakeStore, ...
storage_backend: SimpleStorage

# For SimpleStorage:
# SimpleStorage: ZMQ-based in-memory storage for out-of-the-box usage
SimpleStorage:
# Total number of samples
# Maximum number of experience samples to hold across all storage units
total_storage_size: 100000
# Number of distributed storage units for SimpleStorage backend
# Number of distributed storage units.
# Recommended: >= 2 x number of nodes for load balancing.
num_data_storage_units: 16
# ZMQ Server IP & Ports (automatically generated during init)
zmq_info: null

# For MooncakeStore:
# MooncakeStore: high-performance KV-based hierarchical storage
# that supports RDMA transport between GPU and DRAM.
MooncakeStore:
# Whether to let TQ automatically init metadata_server.
# Whether TransferQueue should automatically start the Mooncake metadata server.
# WARNING: When set to `true`, TQ will attempt to terminate any existing mooncake_master process.
auto_init: true
# Address of the HTTP metadata server
# Address of the metadata coordination server.
metadata_server: localhost:50050
# Address of master server
# Address of the Mooncake master server.
master_server_address: localhost:50051
# Address of local host. Set to "" to use Ray IP as local host address
# Local host address visible to the Mooncake cluster.
# Set to "" to auto-detect using Ray's node IP.
local_hostname: ""
# Protocol for transmission. Choose from: tcp, rdma. (default: rdma)
# Transport protocol. Choose from: tcp, rdma.
protocol: rdma
# Memory segment size in bytes for mounting
# Global memory segment size in bytes **per client** for mounting (default: 4GB; this performance-test config sets ~80 GiB)
global_segment_size: 86294967296
# Local buffer size in bytes
# Local buffer size in bytes **per client** (default: 1GB; this performance-test config sets ~80 GiB)
local_buffer_size: 86294967296
# Network device name. Set to "" to let Mooncake to auto-picks devices
# Network device name.
# Set to "" to let Mooncake auto-select available devices.
device_name: ""

# For RayStore:
RayStore:

# For Yuanrong:
Yuanrong:
# Whether to let TQ automatically init yuanrong
Expand All @@ -67,3 +70,6 @@ backend:
# --enable_huge_tlb Enable huge page memory to improve performance. Required for >21GB shared memory on 910B.
# Example: "--shared_memory_size_mb 16384 --remote_h2d_device_ids 0,1,2,3 --enable_huge_tlb true"
worker_args: "--shared_memory_size_mb 65536 --remote_h2d_device_ids 0 --enable_huge_tlb true"

# For RayStore:
RayStore:
42 changes: 24 additions & 18 deletions transfer_queue/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
# Prometheus metrics exporter.
metrics:
enabled: false
port: 0 # HTTP port for /metrics endpoint (0 = auto-assign free port)
# HTTP port for /metrics endpoint (0 = auto-assign free port)
port: 0

controller:
# User-defined sampler. Users can pass a sampler instance to override this string config.
Expand All @@ -20,37 +21,39 @@ backend:
# SimpleStorage, Yuanrong, MooncakeStore, ...
storage_backend: SimpleStorage

# For SimpleStorage:
# SimpleStorage: ZMQ-based in-memory storage for out-of-the-box usage
SimpleStorage:
# Total number of samples
# Maximum number of experience samples to hold across all storage units
total_storage_size: 100000
# Number of distributed storage units for SimpleStorage backend
# Number of distributed storage units.
# Recommended: >= 2 x number of nodes for load balancing.
num_data_storage_units: 2
# ZMQ Server IP & Ports (automatically generated during init)
zmq_info: null

# For MooncakeStore:
# MooncakeStore: high-performance KV-based hierarchical storage
# that supports RDMA transport between GPU and DRAM.
MooncakeStore:
# Whether to let TQ automatically init metadata_server.
# Whether TransferQueue should automatically start the Mooncake metadata server.
# WARNING: When set to `true`, TQ will attempt to terminate any existing mooncake_master process.
auto_init: true
# Address of the HTTP metadata server
# Address of the metadata coordination server.
metadata_server: localhost:50050
# Address of master server
# Address of the Mooncake master server.
master_server_address: localhost:50051
# Address of local host. Set to "" to use Ray IP as local host address
# Local host address visible to the Mooncake cluster.
# Set to "" to auto-detect using Ray's node IP.
local_hostname: ""
# Protocol for transmission. Choose from: tcp, rdma. (default: tcp)
# Transport protocol. Choose from: tcp, rdma.
protocol: tcp
# Memory segment size in bytes for mounting (default: 4GB)
# Global memory segment size in bytes **per client** for mounting (default: 4GB)
global_segment_size: 4294967296
# Local buffer size in bytes (default: 1GB)
# Local buffer size in bytes **per client** (default: 1GB)
local_buffer_size: 1073741824
# Network device name. Set to "" to let Mooncake to auto-picks devices
# Network device name.
# Set to "" to let Mooncake auto-select available devices.
device_name: ""

# For RayStore:
RayStore:

# For Yuanrong:
Yuanrong:
# Whether to let TQ automatically init yuanrong
Expand All @@ -59,9 +62,9 @@ backend:
worker_port: 31501
# Metastore service port
metastore_port: 2379
# If enable npu transport
# Whether to enable NPU transport
enable_yr_npu_transport: false
# If enable host RDMA (H2H) transport via UCX. Requires RDMA NIC hardware and rdma-core driver.
# Whether to enable host RDMA (H2H) transport via UCX. Requires RDMA NIC hardware and rdma-core driver.
# See https://pages.openeuler.openatom.cn/openyuanrong-datasystem/docs/zh-cn/latest/best_practices/best_practices_for_rdma.html
enable_rdma: false
# UCX env vars passed to dscli subprocess. Precedence: ucx_env_vars > parent env > TQ default (UCX_TLS=rc_x when enable_rdma=true).
Expand All @@ -81,3 +84,6 @@ backend:
# ulimit -l unlimited (allow pinning enough memory for RDMA/Ascend)
# Example: "--shared_memory_size_mb 16384 --remote_h2d_device_ids 0,1,2,3 --enable_huge_tlb true"
worker_args: "--shared_memory_size_mb 8192"

# For RayStore:
RayStore:
Loading