diff --git a/.appveyor.yml b/.appveyor.yml deleted file mode 100644 index 42b79958d4..0000000000 --- a/.appveyor.yml +++ /dev/null @@ -1,53 +0,0 @@ -clone_depth: 1 # NB: this stops FIO-VERSION-GEN making tag based versions - -image: - - Visual Studio 2019 - -environment: - CYG_MIRROR: http://cygwin.mirror.constant.com - matrix: - - ARCHITECTURE: x64 - CC: clang - CONFIGURE_OPTIONS: --enable-pdb - DISTRO: msys2 -# Skip 32 bit clang build -# - ARCHITECTURE: x86 -# CC: clang -# CONFIGURE_OPTIONS: --enable-pdb -# DISTRO: msys2 - - ARCHITECTURE: x64 - CONFIGURE_OPTIONS: - DISTRO: cygwin - - ARCHITECTURE: x86 - CONFIGURE_OPTIONS: --build-32bit-win - DISTRO: cygwin - -install: - - if %DISTRO%==cygwin ( - SET "PATH=C:\cygwin64\bin;C:\cygwin64;%PATH%" - ) - - if %DISTRO%==msys2 if %ARCHITECTURE%==x86 ( - SET "PATH=C:\msys64\mingw32\bin;C:\msys64\usr\bin;%PATH%" - ) - - if %DISTRO%==msys2 if %ARCHITECTURE%==x64 ( - SET "PATH=C:\msys64\mingw64\bin;C:\msys64\usr\bin;%PATH%" - ) - - SET PATH=C:\Python38-x64;%PATH% # NB: Changed env variables persist to later sections - - SET PYTHONUNBUFFERED=TRUE - - bash.exe ci\appveyor-install.sh - -build_script: - - bash.exe configure --extra-cflags=-Werror --disable-native %CONFIGURE_OPTIONS% - - make.exe -j2 - -after_build: - - file.exe fio.exe - - make.exe test - - 'cd os\windows && dobuild.cmd %ARCHITECTURE% && cd ..' 
- - ps: Get-ChildItem .\os\windows\*.msi | % { Push-AppveyorArtifact $_.FullName -FileName $_.Name -DeploymentName fio.msi } - -test_script: - - python.exe t/run-fio-tests.py --artifact-root test-artifacts --debug - -on_finish: - - 'bash.exe -lc "cd \"${APPVEYOR_BUILD_FOLDER}\" && [ -d test-artifacts ] && 7z a -t7z test-artifacts.7z test-artifacts -xr!foo.0.0 -xr!latency.?.0 -xr!fio_jsonplus_clat2csv.test && appveyor PushArtifact test-artifacts.7z' diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000000..6cead5b3fa --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,15 @@ +Please confirm that your commit message(s) follow these guidelines: + +1. First line is a commit title, a descriptive one-liner for the change +2. Empty second line +3. Commit message body that explains why the change is useful. Break lines that + aren't something like a URL at 72-74 chars. +4. Empty line +5. Signed-off-by: Real Name + +Reminders: + +1. If you modify struct thread_options, also make corresponding changes in + cconv.c and bump FIO_SERVER_VER in server.h +2. If you change the ioengine interface (hooks, flags, etc), remember to bump + FIO_IOOPS_VERSION in ioengines.h. 
diff --git a/.github/actions/build-qemu/action.yml b/.github/actions/build-qemu/action.yml new file mode 100644 index 0000000000..279a8ed71e --- /dev/null +++ b/.github/actions/build-qemu/action.yml @@ -0,0 +1,31 @@ +name: 'Build and Install QEMU on Ubuntu' +description: 'Build QEMU and Install on Ubuntu' + +inputs: + version: # What QEMU version to build/install + description: 'QEMU version to build and install' + required: false + default: '9.1.0' + + +runs: + using: "composite" + steps: + - name: Install QEMU build dependencies + run: sudo apt update && sudo apt-get -qq install libglib2.0-dev libfdt-dev libpixman-1-dev ninja-build flex bison libsdl2-dev libaio-dev python3-tomli libslirp-dev + shell: bash + + - name: Build and install QEMU + run: | + wget -nv https://download.qemu.org/qemu-$INPUT_VER.tar.xz + tar xJf qemu-$INPUT_VER.tar.xz + rm qemu-$INPUT_VER.tar.xz + cd qemu-$INPUT_VER + ./configure --enable-kvm --target-list=x86_64-softmmu + make -j $(nproc) + sudo make install + cd .. 
+ rm -rf qemu-$INPUT_VER + shell: bash + env: + INPUT_VER: ${{ inputs.version }} diff --git a/.github/actions/create-guest-image/action.yml b/.github/actions/create-guest-image/action.yml new file mode 100644 index 0000000000..77be3a3cab --- /dev/null +++ b/.github/actions/create-guest-image/action.yml @@ -0,0 +1,45 @@ +name: 'Create guest image' +description: 'Create VM guest image on Ubuntu runner' + +inputs: + distro: + description: 'Linux distribution to use for guest image' + required: false + default: 'debian-12' + extra_pkgs: + description: 'Extra packages to install for guest image' + required: false + default: + +runs: + using: "composite" + steps: + - name: Install libguestfs + run: sudo apt update && sudo apt-get -qq install libguestfs-tools + shell: bash + - name: Setup steps for virt-builder + run: | + sudo chmod a+r /boot/vmlinuz* + sudo chmod 0666 /dev/kvm + ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa + shell: bash + - name: Create Debian image + run: | + virt-builder ${{ inputs.distro }} \ + --quiet \ + --hostname fio-tester \ + --ssh-inject root \ + --run-command "ssh-keygen -A" \ + --run-command "sed -i 's/ens2/enp0s2/g' /etc/network/interfaces" \ + --append-line '/etc/environment:PYTHONUNBUFFERED=1' \ + --append-line '/etc/environment:GITHUB_SERVER_URL=${{ github.server_url }}' \ + --append-line '/etc/environment:GITHUB_REPOSITORY=${{ github.repository }}' \ + --append-line '/etc/environment:GITHUB_REF=${{ github.ref }}' \ + --append-line '/etc/environment:GITHUB_SHA=${{ github.sha }}' \ + --append-line '/etc/environment:GITHUB_JOB=${{ github.job }}' \ + --append-line '/etc/environment:EXTRA_PKGS=${{ inputs.extra_pkgs }}' \ + --append-line '/etc/environment:CI_TARGET_BUILD=${{ env.CI_TARGET_BUILD }}' \ + --append-line '/etc/environment:CI_TARGET_OS=${{ env.CI_TARGET_OS }}' + + shell: bash + diff --git a/.github/actions/start-vm/action.yml b/.github/actions/start-vm/action.yml new file mode 100644 index 0000000000..ab13aa091a --- /dev/null +++ 
b/.github/actions/start-vm/action.yml @@ -0,0 +1,60 @@ +name: 'Start QEMU VM' +description: 'Start QEMU virtual machine' + +inputs: + qemu: # QEMU binary to use + required: false + default: "qemu-system-x86_64" + image: # VM image file + required: true + ssh_fwd_port: # forward this host port to the guest's SSH port + required: false + default: 2022 + options: # Custom QEMU invocation options no \n at the end! + required: false + ram: # how much RAM to allocate to VM + required: false + default: "12G" + host_key: # If true add guest host key to known_hosts + required: false + default: "false" + +runs: + using: "composite" + steps: + - name: install wait-for-it + shell: bash + run: sudo apt update && sudo apt-get -qq install wait-for-it + - name: Enable KVM group perms + shell: bash + run: | + echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' | sudo tee /etc/udev/rules.d/99-kvm4all.rules + sudo udevadm control --reload-rules + sudo udevadm trigger --name-match=kvm + - name: Start VM in background + shell: bash + run: | + ${{ inputs.qemu }} \ + -cpu host \ + -drive file=${{ inputs.image }},format=raw,if=virtio \ + -enable-kvm \ + -smp $(nproc) \ + -nographic \ + -m ${{ inputs.ram }} \ + -display none \ + -machine q35,accel=kvm \ + -nic user,model=virtio-net-pci,hostfwd=tcp::${{ inputs.ssh_fwd_port }}-:22 \ + ${{ inputs.options }} \ + & + - name: Wait for VM to boot + shell: bash + run: | + wait-for-it localhost:${{ inputs.ssh_fwd_port }} -t 15 + sleep 3 + - name: Add guest host key to known_hosts + shell: bash + run: | + if echo ${{ inputs.host_key }} | grep -c "true" + then + ssh root@localhost -p ${{ inputs.ssh_fwd_port }} -o StrictHostKeyChecking=no echo + fi diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000000..a162ad18be --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,170 @@ +name: CI + +on: + push: + pull_request: + workflow_dispatch: + schedule: + - cron: "35 5 * * *" # 5:35 UTC 
which is 0:35 ET + +jobs: + build-containers: + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + container: + - {os: 'debian', dh: 'debian', ver: 'bookworm', target_arch: 'x86_64'} + - {os: 'fedora', dh: 'fedora', ver: '40', target_arch: 'x86_64'} + - {os: 'alma', dh: 'almalinux', ver: '9', target_arch: 'x86_64'} + - {os: 'oracle', dh: 'oraclelinux', ver: '9', target_arch: 'x86_64'} + - {os: 'rocky', dh: 'rockylinux', ver: '9', target_arch: 'x86_64'} + - {os: 'ubuntu', dh: 'ubuntu', ver: 'noble', target_arch: 'i686'} + - {os: 'ubuntu', dh: 'ubuntu', ver: 'noble', target_arch: 'x86_64'} + + container: + image: ${{ matrix.container.dh }}:${{ matrix.container.ver }} + env: + CI_TARGET_BUILD: Linux + CI_TARGET_ARCH: ${{ matrix.container.target_arch }} + CI_TARGET_OS: ${{ matrix.container.os }} + CI_TARGET_OS_VER: ${{ matrix.container.ver }} + + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Install dependencies + run: ./ci/actions-install.sh + - name: Build + run: ./ci/actions-build.sh + - name: Smoke test + run: ./ci/actions-smoke-test.sh + - name: Full test + run: ./ci/actions-full-test.sh + + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + build: + - linux-gcc + - linux-clang + - macos + - linux-i686-gcc + - android + - windows-cygwin-64 + - windows-cygwin-32 + - windows-msys2-64 + include: + - build: linux-gcc + os: ubuntu-22.04 + cc: gcc + - build: linux-clang + os: ubuntu-22.04 + cc: clang + - build: macos + os: macos-15 + - build: linux-i686-gcc + os: ubuntu-22.04 + arch: i686 + - build: android + os: ubuntu-22.04 + arch: aarch64-linux-android32 + - build: android-recovery + os: ubuntu-22.04 + arch: aarch64-linux-android32 + - build: windows-cygwin-64 + os: windows-latest + arch: x86_64 + installer_arch: x64 + shell: bash + - build: windows-cygwin-32 + os: windows-latest + arch: i686 + installer_arch: x86 + shell: bash + - build: windows-msys2-64 + os: windows-latest + cc: clang + arch: 
x86_64 + installer_arch: x64 + shell: msys2 + + env: + CI_TARGET_BUILD: ${{ matrix.build }} + CI_TARGET_ARCH: ${{ matrix.arch }} + CC: ${{ matrix.cc }} + + steps: + - name: git config line endings (Windows) + if: ${{ contains( matrix.build, 'windows' ) }} + run: git config --global core.autocrlf input + - name: Checkout repo + uses: actions/checkout@v4 + - name: Install Cygwin toolchain (Windows) + if: ${{ startsWith(matrix.build, 'windows-cygwin') }} + uses: cygwin/cygwin-install-action@master + with: + packages: > + mingw64-${{matrix.arch}}-binutils + mingw64-${{matrix.arch}}-CUnit + mingw64-${{matrix.arch}}-curl + mingw64-${{matrix.arch}}-dlfcn + mingw64-${{matrix.arch}}-gcc-core + mingw64-${{matrix.arch}}-headers + mingw64-${{matrix.arch}}-runtime + mingw64-${{matrix.arch}}-zlib + + - name: Install msys2 toolchain (Windows) + if: ${{ startsWith(matrix.build, 'windows-msys2') }} + uses: msys2/setup-msys2@v2 + with: + install: > + git + base-devel + mingw-w64-${{matrix.arch}}-clang + mingw-w64-${{matrix.arch}}-cunit + mingw-w64-${{matrix.arch}}-toolchain + mingw-w64-${{matrix.arch}}-lld + mingw-w64-${{matrix.arch}}-python-scipy + mingw-w64-${{matrix.arch}}-python-six + mingw-w64-${{matrix.arch}}-python-statsmodels + mingw-w64-${{matrix.arch}}-python-sphinx + + - name: install bash 4 (macOS) + if: ${{ contains( matrix.build, 'macOS' ) }} + run: HOMEBREW_NO_AUTO_UPDATE=1 brew install bash + - name: Install dependencies + run: ${{matrix.shell}} ./ci/actions-install.sh + if: ${{ !contains( matrix.build, 'msys2' ) }} + - name: Build + run: ${{matrix.shell}} ./ci/actions-build.sh + - name: Build installer (Windows) + if: ${{ contains( matrix.build, 'windows' ) }} + shell: cmd + run: | + cd os\windows + dobuild.cmd ${{ matrix.installer_arch }} + cd ..\.. 
+ + - name: Upload installer as artifact (Windows) + if: ${{ contains( matrix.build, 'windows' ) }} + uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.build }}-installer + path: os\windows\*.msi + - name: Upload installer as release for tagged builds (Windows) + uses: softprops/action-gh-release@v1 + if: ${{ startsWith(github.ref, 'refs/tags/') && startsWith(matrix.build, 'windows-cygwin') }} + with: + files: os/windows/*.msi + - name: Remove dependency files to resolve Makefile Cygwin sed issue (Windows) + if: ${{ startsWith(matrix.build, 'windows-cygwin') }} + run: rm *.d */*.d */*/*.d + shell: bash + - name: Smoke test + run: ${{matrix.shell}} ./ci/actions-smoke-test.sh + - name: Full test + run: ${{matrix.shell}} ./ci/actions-full-test.sh diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml new file mode 100644 index 0000000000..d12388f1af --- /dev/null +++ b/.github/workflows/cifuzz.yml @@ -0,0 +1,24 @@ +name: CIFuzz +on: [pull_request, workflow_dispatch] +jobs: + Fuzzing: + runs-on: ubuntu-latest + steps: + - name: Build Fuzzers + id: build + uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master + with: + oss-fuzz-project-name: 'fio' + dry-run: false + - name: Run Fuzzers + uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master + with: + oss-fuzz-project-name: 'fio' + fuzz-seconds: 600 + dry-run: false + - name: Upload Crash + uses: actions/upload-artifact@v4 + if: failure() && steps.build.outcome == 'success' + with: + name: artifacts + path: ./out/artifacts diff --git a/.github/workflows/qemu.yml b/.github/workflows/qemu.yml new file mode 100644 index 0000000000..16787018d0 --- /dev/null +++ b/.github/workflows/qemu.yml @@ -0,0 +1,114 @@ +name: QEMU + +on: + workflow_dispatch: + schedule: + - cron: "50 3 * * *" # daily at 3:50 UTC (22:50 EST) + +jobs: + qemu-guest: + runs-on: ubuntu-22.04 + + strategy: + fail-fast: false + matrix: + include: + - config: basic io_uring_cmd tests + device: >- + -device 
nvme,id=nvme0,serial=deadbeef + -drive id=nvm-0,file=nvme0.img,format=raw,if=none,discard=unmap,media=disk + -device nvme-ns,id=nvm-0,drive=nvm-0,bus=nvme0,nsid=1 + test_cmd: "python3 t/run-fio-tests.py --nvmecdev /dev/ng0n1 --run-only 1014 1015" + extra_pkgs: "nvme-cli" + - config: 16-bit Guard PI tests (long) + device: >- + -device nvme,id=nvme0,serial=deadbeef + -drive id=nvm-0,file=nvme0.img,format=raw,if=none,discard=unmap,media=disk + -device nvme-ns,id=nvm-0,drive=nvm-0,bus=nvme0,nsid=1 + test_cmd: "python3 t/nvmept_pi.py --fio ./fio --dut /dev/ng0n1" + extra_pkgs: "nvme-cli" + - config: 4K+16 w/64-bit Guard PI + device: >- # 4K+16 w/64-bit Guard PI + -device nvme,id=nvme1,serial=deadbeee + -drive id=nvm-1,file=nvme0.img,format=raw,if=none,discard=unmap,media=disk + -device nvme-ns,id=nvm-1,drive=nvm-1,bus=nvme1,nsid=1,pif=2,ms=16,mset=1,pi=1,pil=0,logical_block_size=4096,physical_block_size=4096 + test_cmd: "python3 t/nvmept_pi.py --fio ./fio --dut /dev/ng0n1 --lbaf 6" + extra_pkgs: "nvme-cli" + - config: 4K+64 w/64-bit Guard PI + device: >- + -device nvme,id=nvme2,serial=deadeeef + -drive id=nvm-2,file=nvme0.img,format=raw,if=none,discard=unmap,media=disk + -device nvme-ns,id=nvm-2,drive=nvm-2,bus=nvme2,nsid=1,pif=2,ms=64,mset=1,pi=1,pil=0,logical_block_size=4096,physical_block_size=4096 + test_cmd: "python3 t/nvmept_pi.py --fio ./fio --dut /dev/ng0n1 --lbaf 7" + extra_pkgs: "nvme-cli" + - config: FDP + device: >- + -device nvme-subsys,id=nvme-subsys0,fdp=on,fdp.runs=128K,fdp.nrg=8,fdp.nruh=64 + -device nvme,id=nvme0,serial=deadbeef,subsys=nvme-subsys0 + -drive id=nvm-1,file=nvme0.img,format=raw,if=none,discard=unmap,media=disk + -device nvme-ns,id=nvm-1,drive=nvm-1,bus=nvme0,nsid=1,logical_block_size=4096,physical_block_size=4096,fdp.ruhs=0-63 + test_cmd: "nvme fdp status /dev/ng0n1 && python3 t/nvmept_fdp.py --fio ./fio --dut /dev/ng0n1" + extra_pkgs: "nvme-cli" + - config: verify-trim + device: + test_cmd: "python3 t/verify-trim.py" + extra_pkgs: 
sg3-utils + - config: ZBD + device: + test_cmd: "./t/zbd/run-tests-against-nullb" + extra_pkgs: sg3-utils + + env: + DISTRO: debian-12 + SSHCMD: ssh root@localhost -p 2022 + SCPCMD: scp -P 2022 + CI_TARGET_BUILD: linux + CI_TARGET_OS: debian + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Create tarball containing repository + run: | + git archive --format=tar.gz -o fio-src.tar.gz --prefix=fio/ HEAD + git log -1 + + - name: Create guest VM image + uses: ./.github/actions/create-guest-image + with: + distro: ${{ env.DISTRO }} + extra_pkgs: ${{ matrix.extra_pkgs }} + + - name: Build and install QEMU + uses: ./.github/actions/build-qemu + + - name: Create backing file for NVMe device + run: truncate -s 1G nvme0.img + + - name: Start VM + uses: ./.github/actions/start-vm + with: + image: ${{ env.DISTRO }}.img + host_key: true + options: ${{ matrix.device }} + + - name: Transfer fio source to guest + run: | + $SCPCMD fio-src.tar.gz root@localhost:/root/ + $SSHCMD "tar xzf fio-src.tar.gz" + + - name: Install dependencies on guest + run: $SSHCMD "cd fio && ./ci/actions-install.sh" + + - name: Build fio on guest + run: $SSHCMD "cd fio && ./ci/actions-build.sh" + + - name: Show nvme device configuration + if: ${{ contains( matrix.extra_pkgs, 'nvme-cli' ) }} + run: | + $SSHCMD "nvme id-ns /dev/ng0n1 -H" + $SSHCMD "nvme nvm-id-ns /dev/ng0n1 -v" + + - name: Run test + run: $SSHCMD "cd fio && ${{ matrix.test_cmd }}" diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000000..e731de6d9a --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,20 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: doc/conf.py + +# Optionally build your docs in additional formats such as PDF 
+formats: + - epub + - pdf diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index e35aff394b..0000000000 --- a/.travis.yml +++ /dev/null @@ -1,37 +0,0 @@ -language: c -dist: bionic -os: - - linux -compiler: - - clang - - gcc -arch: - - amd64 - - arm64 -env: - global: - - MAKEFLAGS="-j 2" -matrix: - include: - - os: linux - compiler: gcc - arch: amd64 - env: BUILD_ARCH="x86" # Only do the gcc x86 build to reduce clutter - # Default xcode image - - os: osx - compiler: clang # Workaround travis setting CC=["clang", "gcc"] - arch: amd64 - # Latest xcode image (needs periodic updating) - - os: osx - compiler: clang - osx_image: xcode11.2 - arch: amd64 - exclude: - - os: osx - compiler: gcc - -install: - - ci/travis-install.sh - -script: - - ci/travis-build.sh diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000000..3df315e5bc --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,11 @@ +cff-version: 1.2.0 +preferred-citation: + type: software + authors: + - family-names: "Axboe" + given-names: "Jens" + email: axboe@kernel.dk + title: "Flexible I/O Tester" + year: 2022 + url: "https://github.com/axboe/fio" +license: GPL-2.0-only diff --git a/FIO-VERSION-GEN b/FIO-VERSION-GEN index e9d563c124..f92f5a46d4 100755 --- a/FIO-VERSION-GEN +++ b/FIO-VERSION-GEN @@ -1,7 +1,7 @@ #!/bin/sh GVF=FIO-VERSION-FILE -DEF_VER=fio-3.28 +DEF_VER=fio-3.41 LF=' ' diff --git a/HOWTO b/HOWTO.rst similarity index 75% rename from HOWTO rename to HOWTO.rst index 8c9e41356b..d31851e93a 100644 --- a/HOWTO +++ b/HOWTO.rst @@ -167,9 +167,9 @@ Command line options defined by `ioengine`. If no `ioengine` is given, list all available ioengines. -.. option:: --showcmd=jobfile +.. option:: --showcmd - Convert `jobfile` to a set of command-line options. + Convert given job files to a set of command-line options. .. option:: --readonly @@ -686,10 +686,12 @@ Time related parameters .. option:: runtime=time - Tell fio to terminate processing after the specified period of time. 
It - can be quite hard to determine for how long a specified job will run, so - this parameter is handy to cap the total runtime to a given time. When - the unit is omitted, the value is interpreted in seconds. + Limit runtime. The test will run until it completes the configured I/O + workload or until it has run for this specified amount of time, whichever + occurs first. It can be quite hard to determine for how long a specified + job will run, so this parameter is handy to cap the total runtime to a + given time. When the unit is omitted, the value is interpreted in + seconds. .. option:: time_based @@ -713,6 +715,16 @@ Time related parameters :option:`runtime` is specified. When the unit is omitted, the value is given in seconds. +.. option:: ramp_size=size + + If set, fio will wait until the job does a given amount of IO before + logging any performance numbers. When ``group_reporting`` is enabled, + the logging starts when all jobs in the group together perform the given + amount of IO. Similarly to ``ramp_time`` this is useful for letting + performance settle before logging results and will increase the total + runtime if a special timeout or :option:`runtime` is specified. When + the unit is omitted, the value is given in bytes. + .. option:: clocksource=str Use the given clocksource as the base of timing. The supported options are: @@ -753,6 +765,11 @@ Time related parameters calls will be excluded from other uses. Fio will manually clear it from the CPU mask of other jobs. +.. option:: job_start_clock_id=int + + The clock_id passed to the call to `clock_gettime` used to record + job_start in the `json` output format. Default is 0, or CLOCK_REALTIME. + Target file/device ~~~~~~~~~~~~~~~~~~ @@ -795,9 +812,17 @@ Target file/device On Windows, disk devices are accessed as :file:`\\\\.\\PhysicalDrive0` for the first device, :file:`\\\\.\\PhysicalDrive1` for the second etc. 
- Note: Windows and FreeBSD prevent write access to areas + Note: Windows and FreeBSD (refer to geom(4)) prevent write access to areas of the disk containing in-use data (e.g. filesystems). + For HTTP and S3 access, specify a valid URL path or S3 key, respectively. + A filename for path-style S3 includes a bucket name (:file:`/bucket/k/e.y`) + while a virtual-hosted-style S3 filename :file:`/k/e.y` does not because + its bucket name is specified in :option:`http_host`. In both cases, the + filename should begin with a ``/``. The HTTP engine does not automatically + add a leading ``/`` when constructing URLs from :option:`http_host` and + :option:`filename`. + The filename "`-`" is a reserved name, meaning *stdin* or *stdout*. Which of the two depends on the read/write direction set. @@ -839,9 +864,27 @@ Target file/device generated filenames (with a directory specified) with the source of the client connecting. To disable this behavior, set this option to 0. +.. option:: filetype=str + + Assume that all files defined in a job are of this type. By default fio + will do :manpage:`stat(2)` for each file to know its file type. For huge + filesets it might be a bottleneck, so the option can be used to skip the + huge number of syscalls. The file types are: + + **none** + Unset. The default. + **file** + Regular file. + **block** + Block device file. + **char** + Char device file. + .. option:: opendir=str - Recursively open any files below directory `str`. + Recursively open any files below directory `str`. This accepts only a + single directory and unlike related options, colons appearing in the + path must not be escaped. .. option:: lockfile=str @@ -962,13 +1005,13 @@ Target file/device .. option:: unlink=bool - Unlink the job files when done. Not the default, as repeated runs of that + Unlink (delete) the job files when done. Not the default, as repeated runs of that job would then waste time recreating the file set again and again. Default: false. .. 
option:: unlink_each_loop=bool - Unlink job files after each iteration or loop. Default: false. + Unlink (delete) job files after each iteration or loop. Default: false. .. option:: zonemode=str @@ -976,14 +1019,14 @@ Target file/device **none** The :option:`zonerange`, :option:`zonesize`, - :option `zonecapacity` and option:`zoneskip` + :option:`zonecapacity` and :option:`zoneskip` parameters are ignored. **strided** I/O happens in a single zone until :option:`zonesize` bytes have been transferred. After that number of bytes has been transferred processing of the next zone - starts. :option `zonecapacity` is ignored. + starts. :option:`zonecapacity` is ignored. **zbd** Zoned block device mode. I/O happens sequentially in each zone, even if random I/O @@ -1052,22 +1095,57 @@ Target file/device .. option:: max_open_zones=int - When running a random write test across an entire drive many more - zones will be open than in a typical application workload. Hence this - command line option that allows to limit the number of open zones. The - number of open zones is defined as the number of zones to which write - commands are issued. + When a zone of a zoned block device is partially written (i.e. not all + sectors of the zone have been written), the zone is in one of three + conditions: 'implicit open', 'explicit open' or 'closed'. Zoned block + devices may have a limit called 'max_open_zones' (same name as the + parameter) on the total number of zones that can simultaneously be in + the 'implicit open' or 'explicit open' conditions. Zoned block devices + may have another limit called 'max_active_zones', on the total number of + zones that can simultaneously be in the three conditions. The + :option:`max_open_zones` parameter limits the number of zones to which + write commands are issued by all fio jobs, that is, limits the number of + zones that will be in the conditions. 
When the device has the + max_open_zones limit and does not have the max_active_zones limit, the + :option:`max_open_zones` parameter limits the number of zones in the two + open conditions up to the limit. In this case, fio includes zones in the + two open conditions to the write target zones at fio start. When the + device has both the max_open_zones and the max_active_zones limits, the + :option:`max_open_zones` parameter limits the number of zones in the + three conditions up to the limit. In this case, fio includes zones in + the three conditions to the write target zones at fio start. + + This parameter is relevant only if the :option:`zonemode` =zbd is used. + The default value is always equal to the max_open_zones limit of the + target zoned block device and a value higher than this limit cannot be + specified by users unless the option :option:`ignore_zone_limits` is + specified. When :option:`ignore_zone_limits` is specified or the target + device does not have the max_open_zones limit, :option:`max_open_zones` + can specify 0 to disable any limit on the number of zones that can be + simultaneously written to by all jobs. .. option:: job_max_open_zones=int - Limit on the number of simultaneously opened zones per single - thread/process. + In the same manner as :option:`max_open_zones`, limit the number of open + zones per fio job, that is, the number of zones that a single job can + simultaneously write to. A value of zero indicates no limit. + Default: zero. + +.. option:: ignore_zone_limits=bool + + If this option is used, fio will ignore the maximum number of open + zones limit of the zoned block device in use, thus allowing the + option :option:`max_open_zones` value to be larger than the device + reported limit. Default: false. .. option:: zone_reset_threshold=float - A number between zero and one that indicates the ratio of logical - blocks with data to the total number of logical blocks in the test - above which zones should be reset periodically. 
+ A number between zero and one that indicates the ratio of written bytes + in the zones with write pointers in the IO range to the size of the IO + range. When current ratio is above this ratio, zones are reset + periodically as :option:`zone_reset_frequency` specifies. If there are + multiple jobs when using this option, the IO range for all write jobs + has to be the same. .. option:: zone_reset_frequency=float @@ -1077,6 +1155,17 @@ Target file/device requests. This and the previous parameter can be used to simulate garbage collection activity. +.. option:: recover_zbd_write_error=bool + + If this option is specified together with the option + :option:`continue_on_error`, check the write pointer positions after the + failed writes to sequential write required zones. Then move the write + pointers so that the next writes do not fail due to partial writes and + unexpected write pointer positions. If :option:`continue_on_error` is + not specified, errors out. When the writes are asynchronous, the write + pointer move fills blocks with zero then breaks verify data. If an + asynchronous IO engine and :option:`verify` workload are specified, + errors out. Default: false. I/O type ~~~~~~~~ @@ -1087,12 +1176,6 @@ I/O type OpenBSD and ZFS on Solaris don't support direct I/O. On Windows the synchronous ioengines don't support direct I/O. Default: false. -.. option:: atomic=bool - - If value is true, attempt to use atomic direct I/O. Atomic writes are - guaranteed to be stable once acknowledged by the operating system. Only - Linux supports O_ATOMIC right now. - .. option:: buffered=bool If value is true, use buffered I/O. This is the opposite of the @@ -1122,7 +1205,14 @@ I/O type Random mixed reads and writes. **trimwrite** Sequential trim+write sequences. Blocks will be trimmed first, - then the same blocks will be written to. + then the same blocks will be written to. 
So if ``io_size=64K`` + is specified, Fio will trim a total of 64K bytes and also + write 64K bytes on the same trimmed blocks. This behaviour + will be consistent with ``number_ios`` or other Fio options + limiting the total bytes or number of I/O's. + **randtrimwrite** + Like trimwrite, but uses random offsets rather + than sequential writes. Fio defaults to read if the option is not specified. For the mixed I/O types, the default is to split them 50/50. For certain types of I/O the @@ -1135,7 +1225,9 @@ I/O type pattern, then the ** value specified will be **added** to the generated offset for each I/O turning sequential I/O into sequential I/O with holes. For instance, using ``rw=write:4k`` will skip 4k for every write. Also see - the :option:`rw_sequencer` option. + the :option:`rw_sequencer` option. If this is used with :option:`verify` + then :option:`verify_header_seed` will be disabled, unless it's explicitly + enabled. .. option:: rw_sequencer=str @@ -1149,13 +1241,34 @@ I/O type Generate the same offset. ``sequential`` is only useful for random I/O, where fio would normally - generate a new random offset for every I/O. If you append e.g. 8 to randread, - you would get a new random offset for every 8 I/Os. The result would be a - seek for only every 8 I/Os, instead of for every I/O. Use ``rw=randread:8`` - to specify that. As sequential I/O is already sequential, setting - ``sequential`` for that would not result in any differences. ``identical`` - behaves in a similar fashion, except it sends the same offset 8 number of - times before generating a new offset. + generate a new random offset for every I/O. If you append e.g. 8 to + randread, i.e. ``rw=randread:8`` you would get a new random offset for + every 8 I/Os. The result would be a sequence of 8 sequential offsets + with a random starting point. However this behavior may change if a + sequential I/O reaches the end of the file. 
As sequential I/O is already + sequential, setting ``sequential`` for that would not result in any + difference. ``identical`` behaves in a similar fashion, except it sends + the same offset 8 number of times before generating a new offset. + + Example #1:: + + rw=randread:8 + rw_sequencer=sequential + bs=4k + + The generated sequence of offsets will look like this: + 4k, 8k, 12k, 16k, 20k, 24k, 28k, 32k, 92k, 96k, 100k, 104k, 108k, + 112k, 116k, 120k, 48k, 52k ... + + Example #2:: + + rw=randread:8 + rw_sequencer=identical + bs=4k + + The generated sequence of offsets will look like this: + 4k, 4k, 4k, 4k, 4k, 4k, 4k, 4k, 92k, 92k, 92k, 92k, 92k, 92k, 92k, 92k, + 48k, 48k, 48k ... .. option:: unified_rw_reporting=str @@ -1185,13 +1298,12 @@ I/O type .. option:: randrepeat=bool - Seed the random number generator used for random I/O patterns in a - predictable way so the pattern is repeatable across runs. Default: true. + Seed all random number generators in a predictable way so the pattern + is repeatable across runs. Default: true. .. option:: allrandrepeat=bool - Seed all random number generators in a predictable way so results are - repeatable across runs. Default: false. + Alias for :option:`randrepeat`. Default: true. .. option:: randseed=int @@ -1261,6 +1373,11 @@ I/O type **random** Advise using **FADV_RANDOM**. + **noreuse** + Advise using **FADV_NOREUSE**. This may be a no-op on older Linux + kernels. Since Linux 6.3, it provides a hint to the LRU algorithm. + See the :manpage:`posix_fadvise(2)` man page. + .. option:: write_hint=str Use :manpage:`fcntl(2)` to advise the kernel what life time to expect @@ -1294,7 +1411,7 @@ I/O type effectively caps the file size at `real_size - offset`. Can be combined with :option:`size` to constrain the start and end range of the I/O workload. A percentage can be specified by a number between 1 and 100 followed by '%', - for example, ``offset=20%`` to specify 20%. 
In ZBD mode, value can be set as + for example, ``offset=20%`` to specify 20%. In ZBD mode, value can be set as number of zones using 'z'. .. option:: offset_align=int @@ -1338,7 +1455,7 @@ I/O type .. option:: fdatasync=int Like :option:`fsync` but uses :manpage:`fdatasync(2)` to only sync data and - not metadata blocks. In Windows, FreeBSD, DragonFlyBSD or OSX there is no + not metadata blocks. In Windows, DragonFlyBSD or OSX there is no :manpage:`fdatasync(2)` so this falls back to using :manpage:`fsync(2)`. Defaults to 0, which means fio does not periodically issue and wait for a data-only sync to complete. @@ -1432,12 +1549,12 @@ I/O type supplied as a value between 0 and 100. The second, optional float is allowed for **pareto**, **zipf** and **normal** distributions. - It allows to set base of distribution in non-default place, giving more control + It allows one to set base of distribution in non-default place, giving more control over most probable outcome. This value is in range [0-1] which maps linearly to range of possible random values. Defaults are: random for **pareto** and **zipf**, and 0.5 for **normal**. If you wanted to use **zipf** with a `theta` of 1.2 centered on 1/4 of allowed value range, - you would use ``random_distibution=zipf:1.2:0.25``. + you would use ``random_distribution=zipf:1.2:0.25``. For a **zoned** distribution, fio supports specifying percentages of I/O access that should fall within what range of the file or device. For @@ -1489,11 +1606,12 @@ I/O type this option is given, fio will just get a new random offset without looking at past I/O history. This means that some blocks may not be read or written, and that some blocks may be read/written more than once. If this option is - used with :option:`verify` and multiple blocksizes (via :option:`bsrange`), - only intact blocks are verified, i.e., partially-overwritten blocks are - ignored. 
With an async I/O engine and an I/O depth > 1, it is possible for - the same block to be overwritten, which can cause verification errors. Either - do not use norandommap in this case, or also use the lfsr random generator. + used with :option:`verify` then :option:`verify_header_seed` will be + disabled. If this option is used with :option:`verify` and multiple blocksizes + (via :option:`bsrange`), only intact blocks are verified, i.e., + partially-overwritten blocks are ignored. With an async I/O engine and an I/O + depth > 1, header write sequence number verification will be disabled. See + :option:`verify_write_sequence`. .. option:: softrandommap=bool @@ -1524,6 +1642,55 @@ I/O type space exceeds 2^32 blocks. If it does, then **tausworthe64** is selected automatically. +.. option:: sprandom=bool + + + SPRandom is a method designed to rapidly precondition SSDs for + steady-state random write workloads. It divides the device into + equally sized regions and writes the device's entire physical capacity + once, selecting offsets so that the regions have a distribution of + invalid blocks matching the distribution that occurs at steady state. + + Default: false. + + It uses **random_generator=lfsr**, which fio will set by default. + Selecting any other random generator will result in an error. + + +.. option:: spr_num_regions=int + + See :option:`sprandom`. Specifies the number of regions used for SPRandom. + For large devices it is better to use more regions, to increase precision + and reduce memory allocation. The allocation is proportional to the region size. + + Default=100 + + +.. option:: spr_op=float + + See :option:`sprandom`. Over-provisioning ratio in the range (0, 1), + as specified by the SSD manufacturer. + + Default=0.15 + + +.. option:: spr_cs=int + + See :option:`sprandom`. Define a cache size in bytes, as specified + by the SSD manufacturer. 
When this is non-zero, delay invalidating + writes by one region in order to make sure that all original + writes from a region are flushed from cache before the later + invalidating writes are sent to the device. This deferral + prevents the original write and the later invalidating write + from being present in the device's cache at the same time which + would allow the device to ignore the original write and prevent + sprandom from achieving its target validity fractions. The + actual cache size is used to ensure that the number of regions + is not set so large that the size of a region is smaller than + the device cache. + + Default=0 + Block size ~~~~~~~~~~ @@ -1561,7 +1728,7 @@ Block size Comma-separated ranges may be specified for reads, writes, and trims as described in :option:`blocksize`. - Example: ``bsrange=1k-4k,2k-8k``. + Example: ``bsrange=1k-4k,2k-8k`` also the ':' delimiter ``bsrange=1k:4k,2k:8k``. .. option:: bssplit=str[,str][,str] @@ -1743,6 +1910,12 @@ Buffers and memory Note that size needs to be explicitly provided and only 1 file per job is supported +.. option:: dedupe_global=bool + + This controls whether the deduplication buffers will be shared amongst + all jobs that have this option set. The buffers are spread evenly between + participating jobs. + .. option:: invalidate=bool Invalidate the buffer/page cache parts of the files to be used prior to @@ -1810,13 +1983,14 @@ Buffers and memory **mmaphuge** to work, the system must have free huge pages allocated. This can normally be checked and set by reading/writing :file:`/proc/sys/vm/nr_hugepages` on a Linux system. Fio assumes a huge page - is 4MiB in size. So to calculate the number of huge pages you need for a - given job file, add up the I/O depth of all jobs (normally one unless - :option:`iodepth` is used) and multiply by the maximum bs set. Then divide - that number by the huge page size. You can see the size of the huge pages in - :file:`/proc/meminfo`. 
If no huge pages are allocated by having a non-zero - number in `nr_hugepages`, using **mmaphuge** or **shmhuge** will fail. Also - see :option:`hugepage-size`. + is 2 or 4MiB in size depending on the platform. So to calculate the + number of huge pages you need for a given job file, add up the I/O + depth of all jobs (normally one unless :option:`iodepth` is used) and + multiply by the maximum bs set. Then divide that number by the huge + page size. You can see the size of the huge pages in + :file:`/proc/meminfo`. If no huge pages are allocated by having a + non-zero number in `nr_hugepages`, using **mmaphuge** or **shmhuge** + will fail. Also see :option:`hugepage-size`. **mmaphuge** also needs to have hugetlbfs mounted and the file location should point there. So if it's mounted in :file:`/huge`, you would use @@ -1835,10 +2009,12 @@ Buffers and memory .. option:: hugepage-size=int - Defines the size of a huge page. Must at least be equal to the system - setting, see :file:`/proc/meminfo`. Defaults to 4MiB. Should probably - always be a multiple of megabytes, so using ``hugepage-size=Xm`` is the - preferred way to set this to avoid setting a non-pow-2 bad value. + Defines the size of a huge page. Must at least be equal to the system + setting, see :file:`/proc/meminfo` and + :file:`/sys/kernel/mm/hugepages/`. Defaults to 2 or 4MiB depending on + the platform. Should probably always be a multiple of megabytes, so + using ``hugepage-size=Xm`` is the preferred way to set this to avoid + setting a non-pow-2 bad value. .. option:: lockmem=int @@ -1852,8 +2028,11 @@ I/O size .. option:: size=int The total size of file I/O for each thread of this job. Fio will run until - this many bytes has been transferred, unless runtime is limited by other options - (such as :option:`runtime`, for instance, or increased/decreased by :option:`io_size`). 
+ this many bytes has been transferred, unless runtime is altered by other means + such as (1) :option:`runtime`, (2) :option:`io_size` (3) :option:`number_ios`, + (4) gaps/holes while doing I/O's such as ``rw=read:16K``, or (5) sequential + I/O reaching end of the file which is possible when :option:`percentage_random` + is less than 100. Fio will divide this size between the available files determined by options such as :option:`nrfiles`, :option:`filename`, unless :option:`filesize` is specified by the job. If the result of division happens to be 0, the size is @@ -1861,7 +2040,7 @@ I/O size If this option is not specified, fio will use the full size of the given files or devices. If the files do not exist, size must be given. It is also possible to give size as a percentage between 1 and 100. If ``size=20%`` is - given, fio will use 20% of the full size of the given files or devices. + given, fio will use 20% of the full size of the given files or devices. In ZBD mode, value can also be set as number of zones using 'z'. Can be combined with :option:`offset` to constrain the start and end range that I/O will be done within. @@ -1880,11 +2059,12 @@ I/O size .. option:: filesize=irange(int) - Individual file sizes. May be a range, in which case fio will select sizes - for files at random within the given range and limited to :option:`size` in - total (if that is given). If not given, each created file is the same size. - This option overrides :option:`size` in terms of file size, which means - this value is used as a fixed size or possible range of each file. + Individual file sizes. May be a range, in which case fio will select sizes for + files at random within the given range. If not given, each created file is the + same size. This option overrides :option:`size` in terms of file size, i.e. 
if + :option:`filesize` is specified then :option:`size` becomes merely the default + for :option:`io_size` and has no effect at all if :option:`io_size` is set + explicitly. .. option:: file_append=bool @@ -1909,7 +2089,9 @@ I/O engine .. option:: ioengine=str - Defines how the job issues I/O to the file. The following types are defined: + fio supports 2 kinds of performance measurement: I/O and file/directory operation. + + I/O engines define how the job issues I/O to the file. The following types are defined: **sync** Basic :manpage:`read(2)` or :manpage:`write(2)` @@ -1935,6 +2117,10 @@ I/O engine for both direct and buffered IO. This engine defines engine specific options. + **io_uring_cmd** + Fast Linux native asynchronous I/O for pass through commands. + This engine defines engine specific options. + **libaio** Linux native asynchronous I/O. Note that Linux may only support queued behavior with non-buffered I/O (set ``direct=1`` or @@ -2079,11 +2265,6 @@ I/O engine before overwriting. The `trimwrite` mode works well for this constraint. - **pmemblk** - Read and write using filesystem DAX to a file on a filesystem - mounted with DAX on a persistent memory device through the PMDK - libpmemblk library. - **dev-dax** Read and write using device DAX to a persistent memory device (e.g., /dev/dax0.0) through the PMDK libpmem library. @@ -2095,21 +2276,6 @@ I/O engine absolute or relative. See :file:`engines/skeleton_external.c` for details of writing an external I/O engine. - **filecreate** - Simply create the files and do no I/O to them. You still need to - set `filesize` so that all the accounting still occurs, but no - actual I/O will be done other than creating the file. - - **filestat** - Simply do stat() and do no I/O to the file. You need to set 'filesize' - and 'nrfiles', so that files will be created. - This engine is to measure file lookup and meta data access. - - **filedelete** - Simply delete the files by unlink() and do no I/O to them. 
You need to set 'filesize' - and 'nrfiles', so that the files will be created. - This engine is to measure file delete. - **libpmem** Read and write using mmap I/O to a file on a filesystem mounted with DAX on a persistent memory device through the PMDK @@ -2130,8 +2296,10 @@ I/O engine Asynchronous read and write using DDN's Infinite Memory Engine (IME). This engine will try to stack as much IOs as possible by creating requests for IME. FIO will then decide when to commit these requests. + **libiscsi** Read and write iscsi lun with libiscsi. + **nbd** Read and write a Network Block Device (NBD). @@ -2142,6 +2310,7 @@ I/O engine unless :option:`verify` is set or :option:`cuda_io` is `posix`. :option:`iomem` must not be `cudamalloc`. This ioengine defines engine specific options. + **dfs** I/O engine supporting asynchronous read and write operations to the DAOS File System (DFS) via libdfs. @@ -2155,6 +2324,71 @@ I/O engine **exec** Execute 3rd party tools. Could be used to perform monitoring during jobs runtime. + **xnvme** + I/O engine using the xNVMe C API, for NVMe devices. The xnvme engine provides + flexibility to access GNU/Linux Kernel NVMe driver via libaio, IOCTLs, io_uring, + the SPDK NVMe driver, or your own custom NVMe driver. The xnvme engine includes + engine specific options. (See https://xnvme.io). + + **libblkio** + Use the libblkio library + (https://gitlab.com/libblkio/libblkio). The specific + *driver* to use must be set using + :option:`libblkio_driver`. If + :option:`mem`/:option:`iomem` is not specified, memory + allocation is delegated to libblkio (and so is + guaranteed to work with the selected *driver*). One + libblkio instance is used per process, so all jobs + setting option :option:`thread` will share a single + instance (with one queue per thread) and must specify + compatible options. Note that some drivers don't allow + several instances to access the same device or file + simultaneously, but allow it for threads. 
+ + File/directory operation engines define how the job operates on files or directories. The + following types are defined: + + **filecreate** + Simply create the files and do no I/O to them. You still need to + set `filesize` so that all the accounting still occurs, but no + actual I/O will be done other than creating the file. + Example job file: filecreate-ioengine.fio. + + **filestat** + Simply do stat() and do no I/O to the file. You need to set 'filesize' + and 'nrfiles', so that files will be created. + This engine is to measure file lookup and metadata access. + Example job file: filestat-ioengine.fio. + + **filedelete** + Simply delete the files by unlink() and do no I/O to them. You need to set 'filesize' + and 'nrfiles', so that the files will be created. + This engine is to measure file delete. + Example job file: filedelete-ioengine.fio. + + **dircreate** + Simply create the directories and do no I/O to them. You still need to + set `filesize` so that all the accounting still occurs, but no + actual I/O will be done other than creating the directories. + Example job file: dircreate-ioengine.fio. + + **dirstat** + Simply do stat() and do no I/O to the directories. You need to set 'filesize' + and 'nrfiles', so that directories will be created. + This engine is to measure directory lookup and metadata access. + Example job file: dirstat-ioengine.fio. + + **dirdelete** + Simply delete the directories by rmdir() and do no I/O to them. You need to set 'filesize' + and 'nrfiles', so that the directories will be created. + This engine is to measure directory delete. + Example job file: dirdelete-ioengine.fio. + + For file and directory operation engines, there is no I/O throughput, so the + statistics in the report have different meanings. The meaningful output values are 'iops' and 'clat'. + 'bw' is meaningless. Refer to the section "Interpreting the output" for more details. 
+ + I/O engine specific parameters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2168,8 +2402,8 @@ with the caveat that when used on the command line, they must come after the Set the percentage of I/O that will be issued with the highest priority. Default: 0. A single value applies to reads and writes. Comma-separated values may be specified for reads and writes. For this option to be - effective, NCQ priority must be supported and enabled, and `direct=1' - option must be used. fio must also be run as the root user. Unlike + effective, NCQ priority must be supported and enabled, and the :option:`direct` + option must be set. fio must also be run as the root user. Unlike slat/clat/lat stats, which can be tracked and reported independently, per priority stats only track and report a single type of latency. By default, completion latency (clat) will be reported, if :option:`lat_percentiles` is @@ -2186,6 +2420,16 @@ with the caveat that when used on the command line, they must come after the reads and writes. See :manpage:`ionice(1)`. See also the :option:`prioclass` option. +.. option:: cmdprio_hint=int[,int] : [io_uring] [libaio] + + Set the I/O priority hint to use for I/Os that must be issued with + a priority when :option:`cmdprio_percentage` or + :option:`cmdprio_bssplit` is set. If not specified when + :option:`cmdprio_percentage` or :option:`cmdprio_bssplit` is set, + this defaults to 0 (no hint). A single value applies to reads and + writes. Comma-separated values may be specified for reads and writes. + See also the :option:`priohint` option. + .. option:: cmdprio=int[,int] : [io_uring] [libaio] Set the I/O priority value to use for I/Os that must be issued with @@ -2200,52 +2444,122 @@ with the caveat that when used on the command line, they must come after the meaning of priority may differ. See also the :option:`prio` option. .. 
option:: cmdprio_bssplit=str[,str] : [io_uring] [libaio] + To get a finer control over I/O priority, this option allows specifying the percentage of IOs that must have a priority set depending on the block size of the IO. This option is useful only when used together with the :option:`bssplit` option, that is, multiple different block sizes are used for reads and writes. - The format for this option is the same as the format of the - :option:`bssplit` option, with the exception that values for - trim IOs are ignored. This option is mutually exclusive with the - :option:`cmdprio_percentage` option. -.. option:: fixedbufs : [io_uring] + The first accepted format for this option is the same as the format of + the :option:`bssplit` option: + + cmdprio_bssplit=blocksize/percentage:blocksize/percentage + + In this case, each entry will use the priority class, priority level + and priority hint defined by the options :option:`cmdprio_class`, + :option:`cmdprio` and :option:`cmdprio_hint` respectively. + + The second accepted format for this option is: + + cmdprio_bssplit=blocksize/percentage/class/level:blocksize/percentage/class/level + + In this case, the priority class and priority level are defined inside + each entry. In comparison with the first accepted format, the second + accepted format does not restrict all entries to have the same priority + class and priority level. + + The third accepted format for this option is: + + cmdprio_bssplit=blocksize/percentage/class/level/hint:... - If fio is asked to do direct IO, then Linux will map pages for each - IO call, and release them when IO is done. If this option is set, the - pages are pre-mapped before IO is started. This eliminates the need to - map and release for each IO. This is more efficient, and reduces the - IO latency as well. + This is an extension of the second accepted format that allows one to + also specify a priority hint. -.. 
option:: hipri : [io_uring] + For all formats, only the read and write data directions are supported, + values for trim IOs are ignored. This option is mutually exclusive with + the :option:`cmdprio_percentage` option. - If this option is set, fio will attempt to use polled IO completions. - Normal IO completions generate interrupts to signal the completion of - IO, polled completions do not. Hence they are require active reaping - by the application. The benefits are more efficient IO for high IOPS - scenarios, and lower latencies for low queue depth IO. +.. option:: fixedbufs : [io_uring] [io_uring_cmd] -.. option:: registerfiles : [io_uring] + If fio is asked to do direct IO, then Linux will map pages for each + IO call, and release them when IO is done. If this option is set, the + pages are pre-mapped before IO is started. This eliminates the need to + map and release for each IO. This is more efficient, and reduces the + IO latency as well. + +.. option:: nonvectored=int : [io_uring] [io_uring_cmd] + + With this option, fio will use non-vectored read/write commands, where + address must contain the address directly. Default is -1. + +.. option:: force_async=int : [io_uring] [io_uring_cmd] + + Normal operation for io_uring is to try and issue an sqe as + non-blocking first, and if that fails, execute it in an async manner. + With this option set to N, then every N request fio will ask sqe to + be issued in an async manner. Default is 0. + +.. option:: registerfiles : [io_uring] [io_uring_cmd] With this option, fio registers the set of files being used with the kernel. This avoids the overhead of managing file counts in the kernel, making the submission and completion part more lightweight. Required for the below :option:`sqthread_poll` option. -.. option:: sqthread_poll : [io_uring] +.. option:: sqthread_poll : [io_uring] [io_uring_cmd] [xnvme] Normally fio will submit IO by issuing a system call to notify the kernel of available items in the SQ ring. 
If this option is set, the act of submitting IO will be done by a polling thread in the kernel. This frees up cycles for fio, at the cost of using more CPU in the - system. + system. As submission is just the time it takes to fill in the sqe + entries and any syscall required to wake up the idle kernel thread, + fio will not report submission latencies. -.. option:: sqthread_poll_cpu : [io_uring] +.. option:: sqthread_poll_cpu=int : [io_uring] [io_uring_cmd] When :option:`sqthread_poll` is set, this option provides a way to define which CPU should be used for the polling thread. +.. option:: cmd_type=str : [io_uring_cmd] + + Specifies the type of uring passthrough command to be used. Supported + value is nvme. Default is nvme. + +.. option:: hipri + + [io_uring] [io_uring_cmd] [xnvme] + + If this option is set, fio will attempt to use polled IO completions. + Normal IO completions generate interrupts to signal the completion of + IO, polled completions do not. Hence they require active reaping + by the application. The benefits are more efficient IO for high IOPS + scenarios, and lower latencies for low queue depth IO. + + [libblkio] + + Use poll queues. This is incompatible with + :option:`libblkio_wait_mode=eventfd <libblkio_wait_mode>` and + :option:`libblkio_force_enable_completion_eventfd`. + + [pvsync2] + + Set RWF_HIPRI on I/O, indicating to the kernel that it's of higher priority + than normal. + + [sg] + + If this option is set, fio will attempt to use polled IO completions. + This will have a similar effect as (io_uring)hipri. Only SCSI READ and + WRITE commands will have the SGV4_FLAG_HIPRI set (not UNMAP (trim) nor + VERIFY). Older versions of the Linux sg driver that do not support + hipri will simply ignore this flag and do normal IO. The Linux SCSI + Low Level Driver (LLD) that "owns" the device also needs to support + hipri (also known as iopoll and mq_poll). The MegaRAID driver is an + example of a SCSI LLD. 
Default: clear (0) which does normal + (interrupt-based) IO. + .. option:: userspace_reap : [libaio] Normally, with the libaio engine in use, fio will use the @@ -2254,17 +2568,12 @@ with the caveat that when used on the command line, they must come after the reap events. The reaping mode is only enabled when polling for a minimum of 0 events (e.g. when :option:`iodepth_batch_complete` `=0`). -.. option:: hipri : [pvsync2] - - Set RWF_HIPRI on I/O, indicating to the kernel that it's of higher priority - than normal. - .. option:: hipri_percentage : [pvsync2] When hipri is set this determines the probability of a pvsync2 I/O being high priority. The default is 100%. -.. option:: nowait : [pvsync2] [libaio] [io_uring] +.. option:: nowait=bool : [pvsync2] [libaio] [io_uring] [io_uring_cmd] By default if a request cannot be executed immediately (e.g. resource starvation, waiting on locks) it is queued and the initiating process will be blocked until @@ -2284,6 +2593,165 @@ with the caveat that when used on the command line, they must come after the For direct I/O, requests will only succeed if cache invalidation isn't required, file blocks are fully allocated and the disk request could be issued immediately. +.. option:: uncached=int : [pvsync2] [io_uring] + + This option will perform buffered IO without retaining data in the + page cache after the operation completes. + + Reads work like a normal buffered read but pages are evicted immediately + after data is copied to userspace. Writes work like buffered writes but + a writeback is initiated before the syscall returns. Pages are evicted + once the writeback completes. + + This option sets the RWF_UNCACHED flag (supported from the 6.14 Linux kernel) on + a per-IO basis. + +.. 
option:: atomic=bool : [pvsync2] [libaio] [io_uring] + + This option means that writes are issued with torn-write protection, meaning + that for a power fail or kernel crash, all or none of the data from the write + will be stored, but never a mix of old and new data. Torn-write protection is + also known as atomic writes. + + This option sets the RWF_ATOMIC flag (supported from the 6.11 Linux kernel) on + a per-IO basis. + + Writes with RWF_ATOMIC set will be rejected by the kernel when the file does + not support torn-write protection. To learn a file's torn-write limits, issue + statx with STATX_WRITE_ATOMIC. + +.. option:: libaio_vectored=bool : [libaio] + + Submit vectored read and write requests. + +.. option:: fdp=bool : [io_uring_cmd] [xnvme] + + Enable Flexible Data Placement mode for write commands. + +.. option:: dataplacement=str : [io_uring_cmd] [xnvme] + + Specifies the data placement directive type to use for write commands. + The following types are supported: + + **none** + Do not use a data placement directive. This is the + default. + + **fdp** + Use Flexible Data Placement directives for write + commands. This is equivalent to specifying + :option:`fdp` =1. + + **streams** + Use Streams directives for write commands. + +.. option:: plid_select=str, fdp_pli_select=str : [io_uring_cmd] [xnvme] + + Defines how fio decides which placement ID to use next. The following + types are defined: + + **random** + Choose a placement ID at random (uniform). + + **roundrobin** + Round robin over available placement IDs. This is the + default. + + **scheme** + Choose a placement ID (index) based on the scheme file defined by + the option :option:`dp_scheme`. + + The available placement ID (indices) are defined by the option :option:`fdp_pli` + or :option:`plids` except for the case of **scheme**. + +.. 
option:: plids=str, fdp_pli=str : [io_uring_cmd] [xnvme] + + Select which Placement ID Indices (FDP) or Placement IDs (streams) this + job is allowed to use for writes. This option accepts a comma-separated + list of values or ranges (e.g., 1,2-4,5,6-8). + + For FDP by default, the job will cycle through all available Placement + IDs, so use this option to be selective. The values specified here are + array indices for the list of placement IDs returned by the nvme-cli + command ``nvme fdp status``. If you want fio to use FDP placement + identifiers only at indices 0, 2 and 5, set ``plids=0,2,5``. + + For streams this should be a list of Stream IDs. + +.. option:: dp_scheme=str : [io_uring_cmd] [xnvme] + + Defines which placement ID (index) is selected based on the offset (LBA) range. + The file should contain one or more scheme entries in the following format: + + 0, 10737418240, 0 + 10737418240, 21474836480, 1 + 21474836480, 32212254720, 2 + ... + + Each line, a scheme entry, contains start offset, end offset, and placement ID + (index) separated by comma(,). If the write offset is within the range of a certain + scheme entry(start offset ≤ offset < end offset), the corresponding placement ID + (index) will be selected. If the write offset belongs to multiple scheme entries, + the first matched scheme entry will be applied. If the offset is not within any range + of scheme entry, dspec field will be set to 0, default RUH. (Caution: In case of + multiple devices in a job, all devices of the job will be affected by the scheme. If + this option is specified, the option :option:`plids` or :option:`fdp_pli` will be + ignored.) + +.. option:: md_per_io_size=int : [io_uring_cmd] [xnvme] + + Size in bytes for separate metadata buffer per IO. For io_uring_cmd + these buffers are allocated using malloc regardless of what is set for + :option:`iomem`. Default: 0. + +.. 
option:: pi_act=int : [io_uring_cmd] [xnvme] + + Action to take when nvme namespace is formatted with protection + information. If this is set to 1 and namespace is formatted with + metadata size equal to protection information size, fio won't use + separate metadata buffer or extended logical block. If this is set to + 1 and namespace is formatted with metadata size greater than protection + information size, fio will not generate or verify the protection + information portion of metadata for write or read case respectively. + If this is set to 0, fio generates protection information for + write case and verifies for read case. Default: 1. + + For 16 bit CRC generation fio will use isa-l if available otherwise + it will use the default slower generator. + (see: https://github.com/intel/isa-l) + +.. option:: pi_chk=str[,str][,str] : [io_uring_cmd] [xnvme] + + Controls the protection information check. This can take one or more + of these values. Default: none. + + **GUARD** + Enables protection information checking of guard field. + **REFTAG** + Enables protection information checking of logical block + reference tag field. + **APPTAG** + Enables protection information checking of application tag field. + +.. option:: apptag=int : [io_uring_cmd] [xnvme] + + Specifies logical block application tag value, if namespace is + formatted to use end to end protection information. Default: 0x1234. + +.. option:: apptag_mask=int : [io_uring_cmd] [xnvme] + + Specifies logical block application tag mask value, if namespace is + formatted to use end to end protection information. Default: 0xffff. + +.. option:: num_range=int : [io_uring_cmd] + + For trim command this will be the number of ranges to trim per I/O + request. The number of logical blocks per range is determined by the + :option:`bs` option which should be a multiple of logical block size. + This cannot be used with read or write. 
Note that setting this + option > 1, :option:`log_offset` will not be able to log all the + offsets. Default: 1. + .. option:: cpuload=int : [cpuio] Attempt to use the specified percentage of CPU cycles. This is a mandatory @@ -2293,6 +2761,16 @@ with the caveat that when used on the command line, they must come after the Split the load into cycles of the given time. In microseconds. +.. option:: cpumode=str : [cpuio] + + Specify how to stress the CPU. It can take these two values: + + **noop** + This is the default where the CPU executes noop instructions. + **qsort** + Replace the default noop instructions loop with a qsort algorithm to + consume more energy. + .. option:: exit_on_io_done=bool : [cpuio] Detect when I/O threads are done, then exit. @@ -2314,7 +2792,7 @@ with the caveat that when used on the command line, they must come after the this will be the starting port number since fio will use a range of ports. - [rdma], [librpma_*] + [rdma] The port to use for RDMA-CM communication. This should be the same value on the client and the server side. @@ -2325,20 +2803,6 @@ with the caveat that when used on the command line, they must come after the is a TCP listener or UDP reader, the hostname is not used and must be omitted unless it is a valid UDP multicast address. -.. option:: serverip=str : [librpma_*] - - The IP address to be used for RDMA-CM based I/O. - -.. option:: direct_write_to_pmem=bool : [librpma_*] - - Set to 1 only when Direct Write to PMem from the remote host is possible. - Otherwise, set to 0. - -.. option:: busy_wait_polling=bool : [librpma_*_server] - - Set to 0 to wait for completion instead of busy-wait polling completion. - Default: 1. - .. option:: interface=str : [netsplice] [net] The IP address of the network interface used to send or receive UDP @@ -2366,10 +2830,13 @@ with the caveat that when used on the command line, they must come after the User datagram protocol V6. **unix** UNIX domain socket. + **vsock** + VSOCK protocol. 
- When the protocol is TCP or UDP, the port must also be given, as well as the - hostname if the job is a TCP listener or UDP reader. For unix sockets, the + When the protocol is TCP, UDP or VSOCK, the port must also be given, as well as the + hostname if the job is a TCP or VSOCK listener or UDP reader. For unix sockets, the normal :option:`filename` option should be used and the port is invalid. + When the protocol is VSOCK, the :option:`hostname` is the CID of the remote VM. .. option:: listen : [netsplice] [net] @@ -2419,10 +2886,6 @@ with the caveat that when used on the command line, they must come after the Specifies the name of the RBD. -.. option:: pool=str : [rbd,rados] - - Specifies the name of the Ceph pool containing RBD or RADOS data. - .. option:: clientname=str : [rbd,rados] Specifies the username (without the 'client.' prefix) used to access the @@ -2430,6 +2893,11 @@ with the caveat that when used on the command line, they must come after the the full *type.id* string. If no type. prefix is given, fio will add 'client.' by default. +.. option:: conf=str : [rados] + + Specifies the configuration path of ceph cluster, so conf file does not + have to be /etc/ceph/ceph.conf. + .. option:: busy_poll=bool : [rbd,rados] Poll store instead of waiting for completion. Usually this provides better @@ -2441,6 +2909,52 @@ with the caveat that when used on the command line, they must come after the Touching all objects affects ceph caches and likely impacts test results. Enabled by default. +.. option:: rbd_encryption_format=str : [rbd] + + Specifies the encryption format of the RBD image. Supported values are + ``luks1`` and ``luks2``. If set, :option:`rbd_encryption_passphrase` + must also be specified. Note that the image must have been + previously formatted using :command:`rbd encryption format `; + the fio rbd engine will only attempt to load the encryption + context, not format the image. The RBD encryption feature is + disabled by default. 
Support for this feature requires librbd + version 16.2 (Ceph Pacific) or later. + +.. option:: rbd_encryption_passphrase=str : [rbd] + + The passphrase used to unlock the encrypted RBD image. Required if + :option:`rbd_encryption_format` is set. + +.. option:: pool=str : + + [rbd,rados] + + Specifies the name of the Ceph pool containing RBD or RADOS data. + + [dfs] + + Specify the label or UUID of the DAOS pool to connect to. + +.. option:: cont=str : [dfs] + + Specify the label or UUID of the DAOS container to open. + +.. option:: chunk_size=int + + [dfs] + + Specify a different chunk size (in bytes) for the dfs file. + Use DAOS container's chunk size by default. + + [libhdfs] + + The size of the chunk to use for each file. + +.. option:: object_class=str : [dfs] + + Specify a different object class for the dfs file. + Use DAOS container's object class by default. + .. option:: skip_bad=bool : [mtd] Skip operations against known bad blocks. @@ -2449,10 +2963,6 @@ with the caveat that when used on the command line, they must come after the libhdfs will create chunk in this HDFS directory. -.. option:: chunk_size : [libhdfs] - - The size of the chunk to use for each file. - .. option:: verb=str : [rdma] The RDMA verb to use on this side of the RDMA ioengine connection. Valid @@ -2474,27 +2984,56 @@ with the caveat that when used on the command line, they must come after the Specify stat system call type to measure lookup/getattr performance. Default is **stat** for :manpage:`stat(2)`. -.. option:: readfua=bool : [sg] +.. option:: readfua=bool : [sg] [io_uring_cmd] With readfua option set to 1, read operations include the force unit access (fua) flag. Default is 0. -.. option:: writefua=bool : [sg] +.. option:: writefua=bool : [sg] [io_uring_cmd] With writefua option set to 1, write operations include the force unit access (fua) flag. Default is 0. +.. option:: write_mode=str : [io_uring_cmd] + + Specifies the type of write operation. Defaults to 'write'. 
+ + **write** + Use Write commands for write operations + + **uncor** + Use Write Uncorrectable commands for write operations + + **zeroes** + Use Write Zeroes commands for write operations + + **verify** + Use Verify commands for write operations + +.. option:: verify_mode=str : [io_uring_cmd] + + Specifies the type of command to be used in the verification phase. Defaults to 'read'. + + **read** + Use Read commands for data verification + **compare** + Use Compare commands for data verification. This option is only valid with + specific pattern(s), which means it *must* be given with `verify=pattern` and + `verify_pattern=`. + .. option:: sg_write_mode=str : [sg] - Specify the type of write commands to issue. This option can take three values: + Specify the type of write commands to issue. This option can take ten values: **write** This is the default where write opcodes are issued as usual. - **verify** + **write_and_verify** Issue WRITE AND VERIFY commands. The BYTCHK bit is set to 0. This directs the device to carry out a medium verification with no data comparison. The writefua option is ignored with this selection. - **same** + **verify** + This option is deprecated. Use write_and_verify instead. + **write_same** Issue WRITE SAME commands. This transfers a single block to the device and writes this same block of data to a contiguous sequence of LBAs beginning at the specified offset. fio's block size parameter specifies @@ -2505,23 +3044,46 @@ with the caveat that when used on the command line, they must come after the for each command but only the first 512 bytes will be used and transferred to the device. The writefua option is ignored with this selection. - -.. option:: hipri : [sg] - - If this option is set, fio will attempt to use polled IO completions. - This will have a similar effect as (io_uring)hipri. Only SCSI READ and - WRITE commands will have the SGV4_FLAG_HIPRI set (not UNMAP (trim) nor - VERIFY). 
Older versions of the Linux sg driver that do not support - hipri will simply ignore this flag and do normal IO. The Linux SCSI - Low Level Driver (LLD) that "owns" the device also needs to support - hipri (also known as iopoll and mq_poll). The MegaRAID driver is an - example of a SCSI LLD. Default: clear (0) which does normal - (interrupted based) IO. + **same** + This option is deprecated. Use write_same instead. + **write_same_ndob** + Issue WRITE SAME(16) commands as above but with the No Data Output + Buffer (NDOB) bit set. No data will be transferred to the device with + this bit set. Data written will be a pre-determined pattern such as + all zeroes. + **write_stream** + Issue WRITE STREAM(16) commands. Use the **stream_id** option to specify + the stream identifier. + **verify_bytchk_00** + Issue VERIFY commands with BYTCHK set to 00. This directs the + device to carry out a medium verification with no data comparison. + **verify_bytchk_01** + Issue VERIFY commands with BYTCHK set to 01. This directs the device to + compare the data on the device with the data transferred to the device. + **verify_bytchk_11** + Issue VERIFY commands with BYTCHK set to 11. This transfers a + single block to the device and compares the contents of this block with the + data on the device beginning at the specified offset. fio's block size + parameter specifies the total amount of data compared with this command. + However, only one block (sector) worth of data is transferred to the device. + This is similar to the WRITE SAME command except that data is compared instead + of written. + +.. option:: stream_id=int : [sg] + + Set the stream identifier for WRITE STREAM commands. If this is set to 0 (which is not + a valid stream identifier) fio will open a stream and then close it when done. Default + is 0. .. option:: http_host=str : [http] - Hostname to connect to. For S3, this could be the bucket hostname. - Default is **localhost** + Hostname to connect to. 
HTTP port 80 is used automatically when the value of + the https parameter is *off*, and HTTPS port 443 if it is *on*. A + virtual-hosted-style S3 hostname starts with a bucket name, while a + path-style S3 hostname does not. See + https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html for + detailed examples. + Default is **localhost** (path-style S3 hostname) .. option:: http_user=str : [http] @@ -2555,6 +3117,24 @@ with the caveat that when used on the command line, they must come after the The S3 key/access id. +.. option:: http_s3_security_token=str : [http] + + The S3 security token. + +.. option:: http_s3_sse_customer_key=str : [http] + + The encryption customer key in SSE server side. + +.. option:: http_s3_sse_customer_algorithm=str : [http] + + The encryption customer algorithm in SSE server side. + Default is **AES256** + +.. option:: http_s3_storage_class=str : [http] + + Which storage class to access. User-customizable settings. + Default is **STANDARD** + .. option:: http_swift_auth_token=str : [http] The Swift auth token. See the example configuration file on how @@ -2566,6 +3146,24 @@ with the caveat that when used on the command line, they must come after the turns on verbose logging from libcurl, 2 additionally enables HTTP IO tracing. Default is **0** +.. option:: http_object_mode=str : [http] + + How to structure objects for HTTP IO: *block* or *range*. + Default is **block**. + + In *block* mode, one object is created for every block. The HTTP engine + treats :option:`blocksize` as the size of the object to read or write, + and appends the block start/end offsets to the :option:`filename` to + create the target object path. Reads and writes operate on whole + objects at a time. + + In *range* mode, one object is created for every file. The object path + is the filename directly for both read and write I/O. 
For read + requests, the :option:`blocksize` and :option:`offset` will be used to + set the "Range" header to issue partial reads of the + object. For write requests, blocksize is used to set the size of the + object, the same as in *block* mode. + .. option:: uri=str : [nbd] Specify the NBD URI of the server to test. The string @@ -2597,24 +3195,6 @@ with the caveat that when used on the command line, they must come after the GPU to RAM before a write and copied from RAM to GPU after a read. :option:`verify` does not affect use of cudaMemcpy. -.. option:: pool=str : [dfs] - - Specify the label or UUID of the DAOS pool to connect to. - -.. option:: cont=str : [dfs] - - Specify the label or UUID of the DAOS container to open. - -.. option:: chunk_size=int : [dfs] - - Specificy a different chunk size (in bytes) for the dfs file. - Use DAOS container's chunk size by default. - -.. option:: object_class=str : [dfs] - - Specificy a different object class for the dfs file. - Use DAOS container's object class by default. - .. option:: nfs_url=str : [nfs] URL in libnfs format, eg nfs:///path[?arg=val[&arg=val]*] @@ -2642,6 +3222,169 @@ with the caveat that when used on the command line, they must come after the If set, stdout and stderr streams are redirected to files named from the job name. Default is true. +.. option:: xnvme_async=str : [xnvme] + + Select the xnvme async command interface. This can take these values. + + **emu** + This is the default and is used to emulate asynchronous I/O by using a + single thread to create a queue pair on top of a synchronous + I/O interface using the NVMe driver IOCTL. + **thrpool** + Emulate an asynchronous I/O interface with a pool of userspace + threads on top of a synchronous I/O interface using the NVMe + driver IOCTL. By default four threads are used. + **io_uring** + Linux native asynchronous I/O interface which supports both + direct and buffered I/O.
+ **io_uring_cmd** + Fast Linux native asynchronous I/O interface for NVMe pass + through commands. This only works with NVMe character device + (/dev/ngXnY). + **libaio** + Use Linux aio for Asynchronous I/O. + **posix** + Use the posix asynchronous I/O interface to perform one or + more I/O operations asynchronously. + **vfio** + Use the user-space VFIO-based backend, implemented using + libvfn instead of SPDK. + **nil** + Do not transfer any data; just pretend to. This is mainly used + for introspective performance evaluation. + +.. option:: xnvme_sync=str : [xnvme] + + Select the xnvme synchronous command interface. This can take these values. + + **nvme** + This is the default and uses Linux NVMe Driver ioctl() for + synchronous I/O. + **psync** + This supports regular as well as vectored pread() and pwrite() + commands. + **block** + This is the same as psync except that it also supports zone + management commands using Linux block layer IOCTLs. + +.. option:: xnvme_admin=str : [xnvme] + + Select the xnvme admin command interface. This can take these values. + + **nvme** + This is the default and uses Linux NVMe Driver ioctl() for admin + commands. + **block** + Use Linux Block Layer ioctl() and sysfs for admin commands. + +.. option:: xnvme_dev_nsid=int : [xnvme] + + xnvme namespace identifier for userspace NVMe driver, SPDK or vfio. + +.. option:: xnvme_dev_subnqn=str : [xnvme] + + Sets the subsystem NQN for fabrics. This is for xNVMe to utilize a + fabrics target with multiple systems. + +.. option:: xnvme_mem=str : [xnvme] + + Select the xnvme memory backend. This can take these values. + + **posix** + This is the default posix memory backend for Linux NVMe driver. + **hugepage** + Use hugepages, instead of existing posix memory backend. The + memory backend uses hugetlbfs. This requires users to allocate + hugepages, mount hugetlbfs and set an environment variable for + XNVME_HUGETLB_PATH. + **spdk** + Uses SPDK's memory allocator.
+ **vfio** + Uses libvfn's memory allocator. This also specifies the use + of libvfn backend instead of SPDK. + +.. option:: xnvme_iovec=int : [xnvme] + + If this option is set, xnvme will use vectored read/write commands. + +.. option:: libblkio_driver=str : [libblkio] + + The libblkio *driver* to use. Different drivers access devices through + different underlying interfaces. Available drivers depend on the + libblkio version in use and are listed at + https://libblkio.gitlab.io/libblkio/blkio.html#drivers + +.. option:: libblkio_path=str : [libblkio] + + Sets the value of the driver-specific "path" property before connecting + the libblkio instance, which identifies the target device or file on + which to perform I/O. Its exact semantics are driver-dependent and not + all drivers may support it; see + https://libblkio.gitlab.io/libblkio/blkio.html#drivers + +.. option:: libblkio_pre_connect_props=str : [libblkio] + + A colon-separated list of additional libblkio properties to be set after + creating but before connecting the libblkio instance. Each property must + have the format ``<name>=<value>``. Colons can be escaped as ``\:``. + These are set after the engine sets any other properties, so those can + be overridden. Available properties depend on the libblkio version in use + and are listed at + https://libblkio.gitlab.io/libblkio/blkio.html#properties + +.. option:: libblkio_num_entries=int : [libblkio] + + Sets the value of the driver-specific "num-entries" property before + starting the libblkio instance. Its exact semantics are driver-dependent + and not all drivers may support it; see + https://libblkio.gitlab.io/libblkio/blkio.html#drivers + +.. option:: libblkio_queue_size=int : [libblkio] + + Sets the value of the driver-specific "queue-size" property before + starting the libblkio instance. Its exact semantics are driver-dependent + and not all drivers may support it; see + https://libblkio.gitlab.io/libblkio/blkio.html#drivers + +..
option:: libblkio_pre_start_props=str : [libblkio] + + A colon-separated list of additional libblkio properties to be set after + connecting but before starting the libblkio instance. Each property must + have the format ``<name>=<value>``. Colons can be escaped as ``\:``. + These are set after the engine sets any other properties, so those can + be overridden. Available properties depend on the libblkio version in use + and are listed at + https://libblkio.gitlab.io/libblkio/blkio.html#properties + +.. option:: libblkio_vectored : [libblkio] + + Submit vectored read and write requests. + +.. option:: libblkio_write_zeroes_on_trim : [libblkio] + + Submit trims as "write zeroes" requests instead of discard requests. + +.. option:: libblkio_wait_mode=str : [libblkio] + + How to wait for completions: + + **block** (default) + Use a blocking call to ``blkioq_do_io()``. + **eventfd** + Use a blocking call to ``read()`` on the completion eventfd. + **loop** + Use a busy loop with a non-blocking call to ``blkioq_do_io()``. + +.. option:: libblkio_force_enable_completion_eventfd : [libblkio] + + Enable the queue's completion eventfd even when unused. This may impact + performance. The default is to enable it only if + :option:`libblkio_wait_mode=eventfd <libblkio_wait_mode>`. + +.. option:: no_completion_thread : [windowsaio] + + Avoid using a separate thread for completion polling. + I/O depth ~~~~~~~~~ @@ -2743,6 +3486,14 @@ I/O depth I/O rate ~~~~~~~~ +.. option:: thinkcycles=int + + Stall the job for the specified number of cycles after an I/O has completed before + issuing the next. May be used to simulate processing being done by an application. + This is not taken into account for the time to be waited on for :option:`thinktime`. + Might not have any effect on some platforms; this can be checked by trying a + sufficiently high thinkcycles setting. + ..
option:: thinktime=time Stall the job for the specified period of time after an I/O has completed before issuing the @@ -2834,6 +3585,11 @@ I/O rate fio will ignore the thinktime and continue doing IO at the specified rate, instead of entering a catch-up mode after thinktime is done. +.. option:: rate_cycle=int + + Average bandwidth for :option:`rate_min` and :option:`rate_iops_min` + over this number of milliseconds. Defaults to 1000. + I/O latency ~~~~~~~~~~~ @@ -2872,11 +3628,6 @@ I/O latency microseconds. Comma-separated values may be specified for reads, writes, and trims as described in :option:`blocksize`. -.. option:: rate_cycle=int - - Average bandwidth for :option:`rate` and :option:`rate_min` over this number - of milliseconds. Defaults to 1000. - I/O replay ~~~~~~~~~~ @@ -2885,7 +3636,8 @@ I/O replay Write the issued I/O patterns to the specified file. See :option:`read_iolog`. Specify a separate file for each job, otherwise the - iologs will be interspersed and the file may be corrupt. + iologs will be interspersed and the file may be corrupt. This file will + be opened in append mode. .. option:: read_iolog=str @@ -3010,6 +3762,11 @@ Threads, processes and job synchronization (meaning no forward references). Second, if a job is being referenced as a waitee, it must have a unique name (no duplicate waitees). +.. option:: comm=str + + Set the job process comm to the specified string. See man :manpage:`prctl(2)`. + Note: This option is currently supported only on Linux. + .. option:: nice=int Run the job with the given nice value. See man :manpage:`nice(2)`. @@ -3033,6 +3790,18 @@ Threads, processes and job synchronization priority setting, see I/O engine specific :option:`cmdprio_percentage` and :option:`cmdprio_class` options. +.. option:: priohint=int + + Set the I/O priority hint. This is only applicable to platforms that + support I/O priority classes and to devices with features controlled + through priority hints, e.g.
block devices supporting command duration + limits, or CDL. CDL is a way to indicate the desired maximum latency + of I/Os so that the device can optimize its internal command scheduling + according to the latency limits indicated by the user. + + For per-I/O priority hint setting, see the I/O engine specific + :option:`cmdprio_hint` option. + .. option:: cpus_allowed=str Controls the same options as :option:`cpumask`, but accepts a textual @@ -3128,13 +3897,13 @@ Threads, processes and job synchronization .. option:: flow=int - Weight in token-based flow control. If this value is used, then there is a - 'flow counter' which is used to regulate the proportion of activity between - two or more jobs. Fio attempts to keep this flow counter near zero. The - ``flow`` parameter stands for how much should be added or subtracted to the - flow counter on each iteration of the main I/O loop. That is, if one job has - ``flow=8`` and another job has ``flow=-1``, then there will be a roughly 1:8 - ratio in how much one runs vs the other. + Weight in token-based flow control. If this value is used, then fio + regulates the activity between two or more jobs sharing the same + flow_id. Fio attempts to keep each job's activity proportional to other + jobs' activities in the same flow_id group, with respect to requested + weight per job. That is, if one job has ``flow=3``, another job has + ``flow=2`` and another has ``flow=1``, then there will be a roughly 3:2:1 + ratio in how much one runs vs the others. .. option:: flow_sleep=int @@ -3155,13 +3924,13 @@ Threads, processes and job synchronization make fio terminate all jobs in the same group, as soon as one job of that group finishes. -.. option:: exit_what +.. option:: exit_what=str By default, fio will continue running all other jobs when one job finishes. - Sometimes this is not the desired action. Setting ``exit_all`` will + Sometimes this is not the desired action.
Setting ``exitall`` will instead make fio terminate all jobs in the same group. The option - ``exit_what`` allows to control which jobs get terminated when ``exitall`` is - enabled. The default is ``group`` and does not change the behaviour of + ``exit_what`` allows one to control which jobs get terminated when ``exitall`` + is enabled. The default is ``group`` and does not change the behaviour of ``exitall``. The setting ``all`` terminates all jobs. The setting ``stonewall`` terminates all currently running jobs across all groups and continues execution with the next stonewalled group. @@ -3197,7 +3966,10 @@ Verification invocation of this workload. This option allows one to check data multiple times at a later date without overwriting it. This option makes sense only for workloads that write data, and does not support workloads with the - :option:`time_based` option set. + :option:`time_based` option set. :option:`verify_write_sequence` and + :option:`verify_header_seed` will be disabled in this mode, unless they are + explicitly enabled. The writes reported in the output when this option is + specified are phantom writes, since no writes are actually issued. .. option:: do_verify=bool @@ -3210,8 +3982,9 @@ Verification of the job. Each verification method also implies verification of special header, which is written to the beginning of each block. This header also includes meta information, like offset of the block, block number, timestamp - when block was written, etc. :option:`verify` can be combined with - :option:`verify_pattern` option. The allowed values are: + when block was written, initial seed value used to generate the buffer + contents etc. :option:`verify` can be combined with :option:`verify_pattern` + option. 
The allowed values are: **md5** Use an md5 sum of the data area and store it in the header of @@ -3280,25 +4053,37 @@ Verification basic information and checksumming, but if this option is set, only the specific pattern set with :option:`verify_pattern` is verified. + **pattern_hdr** + Verify a pattern in conjunction with a header. + **null** Only pretend to verify. Useful for testing internals with :option:`ioengine`\=null, not for much else. This option can be used for repeated burn-in tests of a system to make sure - that the written data is also correctly read back. If the data direction - given is a read or random read, fio will assume that it should verify a - previously written file. If the data direction includes any form of write, - the verify will be of the newly written data. + that the written data is also correctly read back. + + If the data direction given is a read or random read, fio will assume that + it should verify a previously written file. In this scenario fio will not + verify the block number written in the header. The header seed won't be + verified unless it is explicitly requested by setting + :option:`verify_header_seed`. Note that in this scenario the header seed check + will only work if the read invocation exactly matches the original write + invocation. + + If the data direction includes any form of write, the verify will be of the + newly written data. To avoid false verification errors, do not use the norandommap option when verifying data with async I/O engines and I/O depths > 1. Or use the norandommap and the lfsr random generator together to avoid writing to the - same offset with muliple outstanding I/Os. + same offset with multiple outstanding I/Os. .. option:: verify_offset=int Swap the verification header with data somewhere else in the block before - writing. It is swapped back before verifying. + writing. It is swapped back before verifying. This should be within the + range of :option:`verify_interval`. ..
option:: verify_interval=int @@ -3324,6 +4109,14 @@ Verification verify_pattern=0xff%o"abcd"-12 +.. option:: verify_pattern_interval=int + + Recreate an instance of the :option:`verify_pattern` every + :option:`verify_pattern_interval` bytes. This is only useful when + :option:`verify_pattern` contains the %o format specifier and can be + used to speed up the process of writing each block on a device with its + offset. Default: 0 (disabled). + .. option:: verify_fatal=bool Normally fio will keep checking the entire contents before quitting on a @@ -3391,6 +4184,33 @@ Verification verification pass, according to the settings in the job file used. Default false. +.. option:: experimental_verify=bool + + Enable experimental verification. Standard verify records I/O metadata + for later use during the verification phase. Experimental verify + instead resets the file after the write phase and then replays I/Os for + the verification phase. + +.. option:: verify_write_sequence=bool + + Verify the header write sequence number. In a scenario with multiple jobs, + verification of the write sequence number may fail. Disabling this option + will mean that write sequence number checking is skipped. Doing that can be + useful for testing atomic writes, as it means that checksum verification can + still be attempted. When :option:`atomic` is enabled, checksum + verification is expected to succeed (while write sequence checking can still + fail). + Defaults to true. + +.. option:: verify_header_seed=bool + + Verify the header seed value which was used to generate the buffer contents. + In certain scenarios with read / verify only workloads, when + :option:`norandommap` is enabled, with offset modifiers + (refer :option:`readwrite` and :option:`rw_sequencer`) etc., verification of + header seed may fail. Disabling this option will mean that header seed + checking is skipped. Defaults to true. + .. option:: trim_percentage=int Number of verify blocks to discard/trim.
@@ -3407,10 +4227,6 @@ Verification Trim this number of I/O blocks. -.. option:: experimental_verify=bool - - Enable experimental verification. - Steady state ~~~~~~~~~~~~ @@ -3450,12 +4266,23 @@ Steady state Collect bandwidth data and calculate the least squares regression slope. Stop the job if the slope falls below the specified limit. + **lat** + Collect completion latency data and calculate the maximum mean + deviation. Stop the job if the deviation falls below the specified + limit. + + **lat_slope** + Collect completion latency data and calculate the least squares + regression slope. Stop the job if the slope falls below the + specified limit. + .. option:: steadystate_duration=time, ss_dur=time - A rolling window of this duration will be used to judge whether steady state - has been reached. Data will be collected once per second. The default is 0 - which disables steady state detection. When the unit is omitted, the - value is interpreted in seconds. + A rolling window of this duration will be used to judge whether steady + state has been reached. Data will be collected every + :option:`ss_interval`. The default is 0 which disables steady state + detection. When the unit is omitted, the value is interpreted in + seconds. .. option:: steadystate_ramp_time=time, ss_ramp=time @@ -3463,15 +4290,25 @@ Steady state collection for checking the steady state job termination criterion. The default is 0. When the unit is omitted, the value is interpreted in seconds. +.. option:: steadystate_check_interval=time, ss_interval=time + + The values during the rolling window will be collected with a period of + this value. If :option:`ss_interval` is 30s and :option:`ss_dur` is + 300s, 10 measurements will be taken. Default is 1s but that might not + converge, especially for slower devices, so set this accordingly. When + the unit is omitted, the value is interpreted in seconds. + Measurements and reporting ~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
option:: per_job_logs=bool - If set, this generates bw/clat/iops log with per file private filenames. If - not set, jobs with identical names will share the log filename. Default: - true. + If set to true, fio generates bw/clat/iops logs with per job unique + filenames. If set to false, jobs with identical names will share a log + filename. Note that when this option is set to false log files will be + opened in append mode and if log files already exist the previous + contents will not be overwritten. Default: true. .. option:: group_reporting @@ -3483,6 +4320,18 @@ Measurements and reporting same reporting group, unless if separated by a :option:`stonewall`, or by using :option:`new_group`. + NOTE: When :option:`group_reporting` is used along with `json` output, + there are certain per-job properties which can be different between jobs + but do not have a natural group-level equivalent. Examples include + `kb_base`, `unit_base`, `sig_figs`, `thread_number`, `pid`, and + `job_start`. For these properties, the values for the first job are + recorded for the group. + + Also, options like :option:`percentile_list` and + :option:`unified_rw_reporting` should be consistent among the jobs in a + reporting group. Having options like these vary across the jobs in a + reporting group is an unsupported configuration. + .. option:: new_group Start a new reporting group. See: :option:`group_reporting`. If not given, @@ -3555,12 +4404,15 @@ Measurements and reporting .. option:: log_avg_msec=int - By default, fio will log an entry in the iops, latency, or bw log for every - I/O that completes. When writing to the disk log, that can quickly grow to a - very large size. Setting this option makes fio average the each log entry - over the specified period of time, reducing the resolution of the log. See - :option:`log_max_value` as well. Defaults to 0, logging all entries. - Also see `Log File Formats`_. 
+ By default, fio will log an entry in the iops, latency, or bw log for + every I/O that completes. When writing to the disk log, that can + quickly grow to a very large size. Setting this option directs fio to + instead record an average over the specified duration for each log + entry, reducing the resolution of the log. When the job completes, fio + will flush any accumulated latency log data, so the final log interval + may not match the value specified by this option and there may even be + duplicate timestamps. See :option:`log_window_value` as well. Defaults + to 0, logging entries for each I/O. Also see `Log File Formats`_. .. option:: log_hist_msec=int @@ -3580,11 +4432,28 @@ Measurements and reporting histogram logs contain 1216 latency bins. See :option:`write_hist_log` and `Log File Formats`_. -.. option:: log_max_value=bool +.. option:: log_window_value=str, log_max_value=str + + If :option:`log_avg_msec` is set, fio by default logs the average over that + window. This option determines whether fio logs the average, the maximum, or + both values over the window. This only affects the latency logging, + as both average and maximum values for iops or bw log will be the same. + Accepted values are: + + **avg** + Log average value over the window. The default. + + **max** + Log maximum value in the window. - If :option:`log_avg_msec` is set, fio logs the average over that window. If - you instead want to log the maximum value, set this option to 1. Defaults to - 0, meaning that averaged values are logged. + **both** + Log both average and maximum value over the window. + + **0** + Backward-compatible alias for **avg**. + + **1** + Backward-compatible alias for **max**. .. option:: log_offset=bool @@ -3592,6 +4461,21 @@ Measurements and reporting entry as well as the other data values. Defaults to 0 meaning that offsets are not present in logs. Also see `Log File Formats`_. +..
option:: log_prio=bool + + If this is set, the *Command priority* field in `Log File Formats`_ + shows the priority value and the IO priority class of the command. + Otherwise, the field shows if the command has the highest RT + priority class or not. Also see `Log File Formats`_. + +.. option:: log_issue_time=bool + + If this is set, the iolog options will include the command issue time + for the I/O entry as well as the other data values. Defaults to 0 + meaning that command issue times are not present in logs. Also see + `Log File Formats`_. This option shall be set together with + :option:`write_lat_log` and :option:`log_offset`. + .. option:: log_compression=int If this is set, fio will compress the I/O logs as it goes, to keep the @@ -3620,10 +4504,21 @@ Measurements and reporting .. option:: log_unix_epoch=bool - If set, fio will log Unix timestamps to the log files produced by enabling - write_type_log for each log type, instead of the default zero-based + Backwards compatible alias for log_alternate_epoch. + +.. option:: log_alternate_epoch=bool + + If set, fio will log timestamps based on the epoch used by the clock specified + in the log_alternate_epoch_clock_id option, to the log files produced by + enabling write_type_log for each log type, instead of the default zero-based timestamps. +.. option:: log_alternate_epoch_clock_id=int + + Specifies the clock_id to be used by clock_gettime to obtain the alternate + epoch if log_alternate_epoch is true. Otherwise has no effect. Default + value is 0, or CLOCK_REALTIME. + .. option:: block_error_percentiles=bool If set, record errors in trim block-sized units from writes and trims and @@ -3720,6 +4615,13 @@ Error handling appended, the total error count and the first error. The error field given in the stats is the first error that was hit during the run. 
+ Note: a write error from the device may go unnoticed by fio when using + buffered IO, as the write() (or similar) system call merely dirties the + kernel pages, unless :option:`sync` or :option:`direct` is used. Device IO + errors occur when the dirty data is actually written out to disk. If fully + sync writes aren't desirable, :option:`fsync` or :option:`fdatasync` can be + used as well. This is specific to writes, as reads are always synchronous. + The allowed values are: **none** @@ -3988,35 +4890,83 @@ writes in the example above). In the order listed, they denote: **slat** Submission latency (**min** being the minimum, **max** being the maximum, **avg** being the average, **stdev** being the standard - deviation). This is the time it took to submit the I/O. For - sync I/O this row is not displayed as the slat is really the - completion latency (since queue/complete is one operation there). - This value can be in nanoseconds, microseconds or milliseconds --- - fio will choose the most appropriate base and print that (in the - example above nanoseconds was the best scale). Note: in :option:`--minimal` mode - latencies are always expressed in microseconds. + deviation). This is the time from when fio initialized the I/O + to submission. For synchronous ioengines this includes the time + up until just before the ioengine's queue function is called. + For asynchronous ioengines this includes the time up through the + completion of the ioengine's queue function (and commit function + if it is defined). For sync I/O this row is not displayed as the + slat is negligible. This value can be in nanoseconds, + microseconds or milliseconds --- fio will choose the most + appropriate base and print that (in the example above + nanoseconds was the best scale). Note: in :option:`--minimal` + mode latencies are always expressed in microseconds. **clat** Completion latency. Same names as slat, this denotes the time from - submission to completion of the I/O pieces. 
For sync I/O, clat will - usually be equal (or very close) to 0, as the time from submit to - complete is basically just CPU time (I/O has already been done, see slat - explanation). + submission to completion of the I/O pieces. For sync I/O, this + represents the time from when the I/O was submitted to the + operating system to when it was completed. For asynchronous + ioengines this is the time from when the ioengine's queue (and + commit if available) functions were completed to when the I/O's + completion was reaped by fio. + + For file and directory operation engines, **clat** denotes the time + to complete one file or directory operation. + + **filecreate engine**:the time cost to create a new file + + **filestat engine**: the time cost to look up an existing file + + **filedelete engine**:the time cost to delete a file + + **dircreate engine**: the time cost to create a new directory + + **dirstat engine**: the time cost to look up an existing directory + + **dirdelete engine**: the time cost to delete a directory **lat** Total latency. Same names as slat and clat, this denotes the time from when fio created the I/O unit to completion of the I/O operation. + It is the sum of submission and completion latency. **bw** - Bandwidth statistics based on samples. Same names as the xlat stats, - but also includes the number of samples taken (**samples**) and an - approximate percentage of total aggregate bandwidth this thread - received in its group (**per**). This last value is only really - useful if the threads in this group are on the same disk, since they - are then competing for disk access. + Bandwidth statistics based on measurements from discrete + intervals. Fio continuously monitors bytes transferred and I/O + operations completed. By default fio calculates bandwidth in + each half-second interval (see :option:`bwavgtime`) and reports + descriptive statistics for the measurements here. 
Same names as + the xlat stats, but also includes the number of samples taken + (**samples**) and an approximate percentage of total aggregate + bandwidth this thread received in its group (**per**). This + last value is only really useful if the threads in this group + are on the same disk, since they are then competing for disk + access. + + For file and directory operation engines, **bw** is meaningless. **iops** - IOPS statistics based on samples. Same names as bw. + IOPS statistics based on measurements from discrete intervals. + For details see the description for bw above. See + :option:`iopsavgtime` to control the duration of the intervals. + Same values reported here as for bw except for percentage. + + For file and directory operation engines, **iops** is the most + fundamental performance metric. + It indicates how many files or directories can be operated on per second. + + **filecreate engine**: number of files that can be created per second + + **filestat engine**: number of files that can be looked up per second + + **filedelete engine**: number of files that can be deleted per second + + **dircreate engine**: number of directories that can be created per second + + **dirstat engine**: number of directories that can be looked up per second + + **dirdelete engine**: number of directories that can be deleted per second **lat (nsec/usec/msec)** The distribution of I/O completion latencies. This is the time from when @@ -4090,13 +5040,15 @@ For each data direction it prints: And finally, the disk statistics are printed. This is Linux specific. They will look like this:: Disk stats (read/write): - sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00% + sda: ios=16398/16511, sectors=32321/65472, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00% Each value is printed for both reads and writes, with reads first. The numbers denote: **ios** Number of I/Os performed by all groups.
+**sectors** + Amount of data transferred in units of 512 bytes for all groups. **merge** Number of merges performed by the I/O scheduler. **ticks** @@ -4273,7 +5225,7 @@ Trace file format v2 ~~~~~~~~~~~~~~~~~~~~ The second version of the trace file format was added in fio version 1.17. It -allows to access more then one file per trace and has a bigger set of possible +allows one to access more than one file per trace and has a bigger set of possible file actions. The first line of the trace file has to be:: @@ -4308,7 +5260,9 @@ given in bytes. The `action` can be one of these: **wait** Wait for `offset` microseconds. Everything below 100 is discarded. - The time is relative to the previous `wait` statement. + The time is relative to the previous `wait` statement. Note that + action `wait` is not allowed as of version 3, as the same behavior + can be achieved using timestamps. **read** Read `length` bytes beginning from `offset`. **write** @@ -4321,6 +5275,31 @@ given in bytes. The `action` can be one of these: Trim the given file from the given `offset` for `length` bytes. +Trace file format v3 +~~~~~~~~~~~~~~~~~~~~ + +The third version of the trace file format was added in fio version 3.31. It +forces each action to have a timestamp associated with it. + +The first line of the trace file has to be:: + + fio version 3 iolog + +Following this can be lines in two different formats, which are described below. + +The file management format:: + + timestamp filename action + +The file I/O action format:: + + timestamp filename action offset length + +The `timestamp` is relative to the beginning of the run (ie starts at 0). The +`filename`, `action`, `offset` and `length` are identical to version 2, except +that version 3 does not allow the `wait` action. + + I/O Replay - Merging Traces --------------------------- @@ -4462,7 +5441,7 @@ Fio supports a variety of log file formats, for logging latencies, bandwidth, and IOPS. 
The logs share a common format, which looks like this: *time* (`msec`), *value*, *data direction*, *block size* (`bytes`), - *offset* (`bytes`), *command priority* + *offset* (`bytes`), *command priority*, *issue time* (`nsec`) *Time* for the log entry is always in milliseconds. The *value* logged depends on the type of log, it will be one of the following: @@ -4487,15 +5466,36 @@ The entry's *block size* is always in bytes. The *offset* is the position in byt from the start of the file for that particular I/O. The logging of the offset can be toggled with :option:`log_offset`. -*Command priority* is 0 for normal priority and 1 for high priority. This is controlled -by the ioengine specific :option:`cmdprio_percentage`. +If :option:`log_prio` is not set, the entry's *Command priority* is 1 for an IO +executed with the highest RT priority class (:option:`prioclass` =1 or +:option:`cmdprio_class` =1) and 0 otherwise. This is controlled by the +:option:`prioclass` option and the ioengine specific +:option:`cmdprio_percentage` and :option:`cmdprio_class` options. If +:option:`log_prio` is set, the entry's *Command priority* is the priority set +for the IO, as a 16-bit hexadecimal number with the lowest 13 bits indicating +the priority value (:option:`prio` and :option:`cmdprio` options) and the +highest 3 bits indicating the IO priority class (:option:`prioclass` and +:option:`cmdprio_class` options). + +The entry's *issue time* is the command issue time in nanoseconds. The logging +of the issue time can be toggled with :option:`log_issue_time`. This field has +valid values in completion latency log file (clat), or submit latency log file +(slat). The field has value 0 in other log files. Fio defaults to logging every individual I/O but when windowed logging is set -through :option:`log_avg_msec`, either the average (by default) or the maximum -(:option:`log_max_value` is set) *value* seen over the specified period of time -is recorded.
Each *data direction* seen within the window period will aggregate -its values in a separate row. Further, when using windowed logging the *block -size* and *offset* entries will always contain 0. +through :option:`log_avg_msec`, either the average (by default), the maximum +(:option:`log_window_value` is set to max) *value* seen over the specified period +of time, or both the average *value* and maximum *value1* (:option:`log_window_value` +is set to both) is recorded. The log file format when both the values are reported +takes this form: + + *time* (`msec`), *value*, *value1*, *data direction*, *block size* (`bytes`), + *offset* (`bytes`), *command priority*, *issue time* (`nsec`) + + +Each *data direction* seen within the window period will aggregate its values in a +separate row. Further, when using windowed logging the *block size*, *offset* +and *issue time* entries will always contain 0. Client/Server @@ -4549,6 +5549,9 @@ is the connect string, and `remote-args` and `job file(s)` are sent to the server. The `server` string follows the same format as it does on the server side, to allow IP/hostname/socket and port strings. +Note that all job options must be defined in job files when running fio as a +client. Any job options specified in `remote-args` will be ignored. + Fio can connect to multiple servers this way:: fio --client= --client= @@ -4586,5 +5589,14 @@ containing two hostnames ``h1`` and ``h2`` with IP addresses 192.168.10.120 and /mnt/nfs/fio/192.168.10.120.fileio.tmp /mnt/nfs/fio/192.168.10.121.fileio.tmp +This behavior can be disabled by the :option:`unique_filename` option. + Terse output in client/server mode will differ slightly from what is produced when fio is run in stand-alone mode. See the terse output section for details. + +Also, if one fio invocation runs workloads on multiple servers, fio will +provide at the end an aggregate summary report for all workloads. 
This +aggregate summary report assumes that options affecting reporting like +:option:`unified_rw_reporting` and :option:`percentile_list` are identical +across all the jobs summarized. Having different values for these options is an +unsupported configuration. diff --git a/Makefile b/Makefile index 5d17bcab90..0337e8feb8 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ include config-host.mak endif DEBUGFLAGS = -DFIO_INC_DEBUG -CPPFLAGS= -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DFIO_INTERNAL $(DEBUGFLAGS) +CPPFLAGS+= -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DFIO_INTERNAL $(DEBUGFLAGS) OPTFLAGS= -g -ffast-math FIO_CFLAGS= -std=gnu99 -Wwrite-strings -Wall -Wdeclaration-after-statement $(OPTFLAGS) $(EXTFLAGS) $(BUILD_CFLAGS) -I. -I$(SRCDIR) LIBS += -lm $(EXTLIBS) @@ -28,7 +28,7 @@ PROGS = fio SCRIPTS = $(addprefix $(SRCDIR)/,tools/fio_generate_plots tools/plot/fio2gnuplot tools/genfio tools/fiologparser.py tools/hist/fiologparser_hist.py tools/hist/fio-histo-log-pctiles.py tools/fio_jsonplus_clat2csv) ifndef CONFIG_FIO_NO_OPT - FIO_CFLAGS += -O3 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 + FIO_CFLAGS += -O3 endif ifdef CONFIG_BUILD_NATIVE FIO_CFLAGS += -march=native @@ -53,16 +53,17 @@ SOURCE := $(sort $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \ $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/lib/*.c))) \ gettime.c ioengines.c init.c stat.c log.c time.c filesetup.c \ eta.c verify.c memory.c io_u.c parse.c fio_sem.c rwlock.c \ - pshared.c options.c \ + pshared.c options.c fio_shared_sem.c \ smalloc.c filehash.c profile.c debug.c engines/cpu.c \ engines/mmap.c engines/sync.c engines/null.c engines/net.c \ - engines/ftruncate.c engines/filecreate.c engines/filestat.c engines/filedelete.c \ + engines/ftruncate.c engines/fileoperations.c \ engines/exec.c \ server.c client.c iolog.c backend.c libfio.c flow.c cconv.c \ gettime-thread.c helpers.c json.c idletime.c td_error.c \ profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \ workqueue.c 
rate-submit.c optgroup.c helper_thread.c \ - steadystate.c zone-dist.c zbd.c dedupe.c + steadystate.c zone-dist.c zbd.c dedupe.c dataplacement.c \ + sprandom.c ifdef CONFIG_LIBHDFS HDFSFLAGS= -I $(JAVA_HOME)/include -I $(JAVA_HOME)/include/linux -I $(FIO_LIBHDFS_INCLUDE) @@ -99,6 +100,7 @@ endif ifdef CONFIG_LIBAIO libaio_SRCS = engines/libaio.c cmdprio_SRCS = engines/cmdprio.c + LIBS += -laio libaio_LIBS = -laio ENGINES += libaio endif @@ -107,21 +109,6 @@ ifdef CONFIG_RDMA rdma_LIBS = -libverbs -lrdmacm ENGINES += rdma endif -ifdef CONFIG_LIBRPMA_APM - librpma_apm_SRCS = engines/librpma_apm.c - librpma_fio_SRCS = engines/librpma_fio.c - librpma_apm_LIBS = -lrpma -lpmem - ENGINES += librpma_apm -endif -ifdef CONFIG_LIBRPMA_GPSPM - librpma_gpspm_SRCS = engines/librpma_gpspm.c engines/librpma_gpspm_flush.pb-c.c - librpma_fio_SRCS = engines/librpma_fio.c - librpma_gpspm_LIBS = -lrpma -lpmem -lprotobuf-c - ENGINES += librpma_gpspm -endif -ifdef librpma_fio_SRCS - SOURCE += $(librpma_fio_SRCS) -endif ifdef CONFIG_POSIXAIO SOURCE += engines/posixaio.c endif @@ -199,11 +186,6 @@ ifdef CONFIG_MTD SOURCE += oslib/libmtd.c SOURCE += oslib/libmtd_legacy.c endif -ifdef CONFIG_PMEMBLK - pmemblk_SRCS = engines/pmemblk.c - pmemblk_LIBS = -lpmemblk - ENGINES += pmemblk -endif ifdef CONFIG_LINUX_DEVDAX dev-dax_SRCS = engines/dev-dax.c dev-dax_LIBS = -lpmem @@ -222,10 +204,21 @@ ifdef CONFIG_LIBZBC libzbc_LIBS = -lzbc ENGINES += libzbc endif - +ifdef CONFIG_LIBXNVME + xnvme_SRCS = engines/xnvme.c + xnvme_LIBS = $(LIBXNVME_LIBS) + xnvme_CFLAGS = $(LIBXNVME_CFLAGS) + ENGINES += xnvme +endif +ifdef CONFIG_LIBBLKIO + libblkio_SRCS = engines/libblkio.c + libblkio_LIBS = $(LIBBLKIO_LIBS) + libblkio_CFLAGS = $(LIBBLKIO_CFLAGS) + ENGINES += libblkio +endif ifeq ($(CONFIG_TARGET_OS), Linux) SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c engines/sg.c \ - oslib/linux-dev-lookup.c engines/io_uring.c + oslib/linux-dev-lookup.c engines/io_uring.c engines/nvme.c cmdprio_SRCS = 
engines/cmdprio.c ifdef CONFIG_HAS_BLKZONED SOURCE += oslib/linux-blkzoned.c @@ -235,7 +228,8 @@ endif endif ifeq ($(CONFIG_TARGET_OS), Android) SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c profiles/tiobench.c \ - oslib/linux-dev-lookup.c engines/io_uring.c + oslib/linux-dev-lookup.c engines/io_uring.c engines/nvme.c \ + engines/sg.c cmdprio_SRCS = engines/cmdprio.c ifdef CONFIG_HAS_BLKZONED SOURCE += oslib/linux-blkzoned.c @@ -276,12 +270,13 @@ ifeq ($(CONFIG_TARGET_OS), HP-UX) endif ifeq ($(CONFIG_TARGET_OS), Darwin) LIBS += -lpthread -ldl + SOURCE += os/mac/posix.c endif ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS))) SOURCE += os/windows/cpu-affinity.c os/windows/posix.c os/windows/dlls.c WINDOWS_OBJS = os/windows/cpu-affinity.o os/windows/posix.o os/windows/dlls.o lib/hweight.o LIBS += -lpthread -lpsapi -lws2_32 -lssp - FIO_CFLAGS += -DPSAPI_VERSION=1 -Ios/windows/posix/include -Wno-format + FIO_CFLAGS += -DPSAPI_VERSION=1 -I$(SRCDIR)/os/windows/posix/include -Wno-format endif ifdef cmdprio_SRCS @@ -294,7 +289,7 @@ define engine_template = $(1)_OBJS := $$($(1)_SRCS:.c=.o) $$($(1)_OBJS): CFLAGS := -fPIC $$($(1)_CFLAGS) $(CFLAGS) engines/fio-$(1).so: $$($(1)_OBJS) - $$(QUIET_LINK)$(CC) -shared -rdynamic -fPIC -Wl,-soname,fio-$(1).so.1 -o $$@ $$< $$($(1)_LIBS) + $$(QUIET_LINK)$(CC) $(LDFLAGS) -shared -rdynamic -fPIC -Wl,-soname,fio-$(1).so.1 -o $$@ $$< $$($(1)_LIBS) ENGS_OBJS += engines/fio-$(1).so endef else # !CONFIG_DYNAMIC_ENGINES @@ -325,7 +320,7 @@ FIO_OBJS += lex.yy.o y.tab.o GFIO_OBJS += lex.yy.o y.tab.o endif --include $(OBJS:.o=.d) +-include $(OBJS:.o=.d) $(T_OBJS:.o=.d) $(UT_OBJS:.o=.d) T_SMALLOC_OBJS = t/stest.o T_SMALLOC_OBJS += gettime.o fio_sem.o pshared.o smalloc.o t/log.o t/debug.o \ @@ -372,7 +367,7 @@ T_DEDUPE_PROGS = t/fio-dedupe T_VS_OBJS = t/verify-state.o t/log.o crc/crc32c.o crc/crc32c-intel.o crc/crc32c-arm64.o t/debug.o T_VS_PROGS = t/fio-verify-state -T_PIPE_ASYNC_OBJS = t/read-to-pipe-async.o +T_PIPE_ASYNC_OBJS = 
t/read-to-pipe-async.o t/log.o T_PIPE_ASYNC_PROGS = t/read-to-pipe-async T_IOU_RING_OBJS = t/io_uring.o lib/rand.o lib/pattern.o lib/strntol.o @@ -384,14 +379,16 @@ T_MEMLOCK_PROGS = t/memlock T_TT_OBJS = t/time-test.o T_TT_PROGS = t/time-test +ifneq (,$(findstring -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION,$(CFLAGS))) T_FUZZ_OBJS = t/fuzz/fuzz_parseini.o T_FUZZ_OBJS += $(OBJS) ifdef CONFIG_ARITHMETIC T_FUZZ_OBJS += lex.yy.o y.tab.o endif +# For proper fio code teardown CFLAGS needs to include -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION # in case there is no fuzz driver defined by environment variable LIB_FUZZING_ENGINE, use a simple one # For instance, with compiler clang, address sanitizer and libFuzzer as a fuzzing engine, you should define -# export CFLAGS="-fsanitize=address,fuzzer-no-link" +# export CFLAGS="-fsanitize=address,fuzzer-no-link -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION" # export LIB_FUZZING_ENGINE="-fsanitize=address" # export CC=clang # before running configure && make @@ -400,6 +397,10 @@ ifndef LIB_FUZZING_ENGINE T_FUZZ_OBJS += t/fuzz/onefile.o endif T_FUZZ_PROGS = t/fuzz/fuzz_parseini +else # CFLAGS includes -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +T_FUZZ_OBJS = +T_FUZZ_PROGS = +endif T_OBJS = $(T_SMALLOC_OBJS) T_OBJS += $(T_IEEE_OBJS) @@ -429,7 +430,9 @@ T_TEST_PROGS += $(T_AXMAP_PROGS) T_TEST_PROGS += $(T_LFSR_TEST_PROGS) T_TEST_PROGS += $(T_GEN_RAND_PROGS) T_PROGS += $(T_BTRACE_FIO_PROGS) +ifdef CONFIG_ZLIB T_PROGS += $(T_DEDUPE_PROGS) +endif T_PROGS += $(T_VS_PROGS) T_TEST_PROGS += $(T_MEMLOCK_PROGS) ifdef CONFIG_PREAD @@ -447,6 +450,7 @@ UT_OBJS = unittests/unittest.o UT_OBJS += unittests/lib/memalign.o UT_OBJS += unittests/lib/num2str.o UT_OBJS += unittests/lib/strntol.o +UT_OBJS += unittests/lib/pcbuf.o UT_OBJS += unittests/oslib/strlcat.o UT_OBJS += unittests/oslib/strndup.o UT_OBJS += unittests/oslib/strcasestr.o @@ -483,14 +487,8 @@ endif prefix = $(INSTALL_PREFIX) bindir = $(prefix)/bin libdir = $(prefix)/lib/fio - -ifeq 
($(CONFIG_TARGET_OS), Darwin) -mandir = /usr/share/man -sharedir = /usr/share/fio -else -mandir = $(prefix)/man +mandir = $(prefix)/share/man sharedir = $(prefix)/share/fio -endif all: $(PROGS) $(T_TEST_PROGS) $(UT_PROGS) $(SCRIPTS) $(ENGS_OBJS) FORCE @@ -521,11 +519,19 @@ else $(QUIET_LEX)$(LEX) $< endif +ifneq (,$(findstring -Wimplicit-fallthrough,$(CFLAGS))) +LEX_YY_CFLAGS := -Wno-implicit-fallthrough +endif + +ifdef CONFIG_HAVE_NO_STRINGOP +YTAB_YY_CFLAGS := -Wno-stringop-truncation +endif + lex.yy.o: lex.yy.c y.tab.h - $(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $< + $(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) $(LEX_YY_CFLAGS) -c $< y.tab.o: y.tab.c y.tab.h - $(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $< + $(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) $(YTAB_YY_CFLAGS) -c $< y.tab.c: exp/expression-parser.y $(QUIET_YACC)$(YACC) -o $@ -l -d -b y $< @@ -617,8 +623,10 @@ t/fio-btrace2fio: $(T_BTRACE_FIO_OBJS) $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_BTRACE_FIO_OBJS) $(LIBS) endif +ifdef CONFIG_ZLIB t/fio-dedupe: $(T_DEDUPE_OBJS) $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_DEDUPE_OBJS) $(LIBS) +endif t/fio-verify-state: $(T_VS_OBJS) $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_VS_OBJS) $(LIBS) @@ -635,6 +643,7 @@ clean: FORCE @rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(UT_OBJS) $(PROGS) $(T_PROGS) $(T_TEST_PROGS) core.* core gfio unittests/unittest FIO-VERSION-FILE *.[do] lib/*.d oslib/*.[do] crc/*.d engines/*.[do] engines/*.so profiles/*.[do] t/*.[do] t/*/*.[do] unittests/*.[do] unittests/*/*.[do] config-host.mak config-host.h y.tab.[ch] lex.yy.c exp/*.[do] lexer.h @rm -f t/fio-btrace2fio t/io_uring t/read-to-pipe-async @rm -rf doc/output + @$(MAKE) -C mock-tests clean distclean: clean FORCE @rm -f cscope.out fio.pdf fio_generate_plots.pdf fio2gnuplot.pdf fiologparser_hist.pdf @@ -654,6 +663,10 @@ doc: tools/plot/fio2gnuplot.1 test: fio ./fio --minimal --thread --exitall_on_error --runtime=1s --name=nulltest --ioengine=null --rw=randrw --iodepth=2 
--norandommap --random_generator=tausworthe64 --size=16T --name=verifyfstest --filename=fiotestfile.tmp --unlink=1 --rw=write --verify=crc32c --verify_state_save=0 --size=16K + +mock-tests: + $(MAKE) -C mock-tests test + fulltest: sudo modprobe null_blk && \ if [ ! -e /usr/include/libzbc/zbc.h ]; then \ diff --git a/README b/README.rst similarity index 84% rename from README rename to README.rst index d566fae3de..06fa971ea4 100644 --- a/README +++ b/README.rst @@ -27,31 +27,20 @@ Source Fio resides in a git repo, the canonical place is: - git://git.kernel.dk/fio.git - -When inside a corporate firewall, git:// URL sometimes does not work. -If git:// does not work, use the http protocol instead: - - http://git.kernel.dk/fio.git + https://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio Snapshots are frequently generated and :file:`fio-git-*.tar.gz` include the git meta data as well. Other tarballs are archives of official fio releases. Snapshots can download from: - http://brick.kernel.dk/snaps/ + https://brick.kernel.dk/snaps/ There are also two official mirrors. Both of these are automatically synced with the main repository, when changes are pushed. If the main repo is down for some reason, either one of these is safe to use as a backup: - git://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git - https://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git -or - - git://github.com/axboe/fio.git - https://github.com/axboe/fio.git @@ -64,13 +53,12 @@ see REPORTING-BUGS. An automated mail detailing recent commits is automatically sent to the list at most daily. The list address is fio@vger.kernel.org, subscribe by sending an -email to majordomo@vger.kernel.org with - - subscribe fio +email to fio+subscribe@vger.kernel.org or visit +https://subspace.kernel.org/vger.kernel.org.html. -in the body of the email. 
Archives can be found here: +Archives can be found here: - http://www.spinics.net/lists/fio/ + https://www.spinics.net/lists/fio/ or here: @@ -92,22 +80,33 @@ benchmark/test tools out there weren't flexible enough to do what he wanted. Jens Axboe 20060905 +Maintainers +----------- + +Fio is maintained by Jens Axboe - however, for reporting bugs please use +the fio reflector or the GitHub page rather than email any of them +directly. By using the public resources, others will be able to learn from +the responses too. Chances are also good that other members will be able to +help with your inquiry as well. + + Binary packages --------------- Debian: Starting with Debian "Squeeze", fio packages are part of the official - Debian repository. http://packages.debian.org/search?keywords=fio . + Debian repository. https://packages.debian.org/search?keywords=fio . Ubuntu: Starting with Ubuntu 10.04 LTS (aka "Lucid Lynx"), fio packages are part of the Ubuntu "universe" repository. - http://packages.ubuntu.com/search?keywords=fio . + https://packages.ubuntu.com/search?keywords=fio . Red Hat, Fedora, CentOS & Co: Starting with Fedora 9/Extra Packages for Enterprise Linux 4, fio packages are part of the Fedora/EPEL repositories. - https://apps.fedoraproject.org/packages/fio . + https://packages.fedoraproject.org/pkgs/fio/ . Mandriva: Mandriva has integrated fio into their package repository, so installing @@ -123,10 +122,12 @@ Solaris: ``pkgutil -i fio``. Windows: - Rebecca Cran has fio packages for Windows at - https://bsdio.com/fio/ . The latest builds for Windows can also - be grabbed from https://ci.appveyor.com/project/axboe/fio by clicking - the latest x86 or x64 build, then selecting the ARTIFACTS tab. + Beginning with fio 3.31 Windows installers for tagged releases are + available on GitHub at https://github.com/axboe/fio/releases. 
The + latest installers for Windows can also be obtained as GitHub Actions + artifacts by selecting a build from + https://github.com/axboe/fio/actions. These require logging in to a + GitHub account. BSDs: Packages for BSDs may be available from their binary package repositories. @@ -148,7 +149,7 @@ GNU make isn't the default, type ``gmake`` instead of ``make``. Configure will print the enabled options. Note that on Linux based platforms, the libaio development packages must be installed to use the libaio -engine. Depending on distro, it is usually called libaio-devel or libaio-dev. +engine. Depending on the distro, it is usually called libaio-devel or libaio-dev. For gfio, gtk 2.18 (or newer), associated glib threads, and cairo are required to be installed. gfio isn't built automatically and can be enabled with a @@ -164,11 +165,15 @@ Configure will attempt to determine the target platform automatically. It's possible to build fio for ESX as well, use the ``--esx`` switch to configure. +The HTTP engine is enabled depending on if the curl and openssl shared libraries +are detected on the system. For Ubuntu, these packages are libcurl4-openssl-dev +and libssl-dev. + Windows ~~~~~~~ -The minimum versions of Windows for building/runing fio are Windows 7/Windows +The minimum versions of Windows for building/running fio are Windows 7/Windows Server 2008 R2. On Windows, Cygwin (https://www.cygwin.com/) is required in order to build fio. To create an MSI installer package install WiX from https://wixtoolset.org and run :file:`dobuild.cmd` from the :file:`os/windows` @@ -176,7 +181,7 @@ directory. How to compile fio on 64-bit Windows: - 1. Install Cygwin (http://www.cygwin.com/). Install **make** and all + 1. Install Cygwin (https://www.cygwin.com/). Install **make** and all packages starting with **mingw64-x86_64**. Ensure **mingw64-x86_64-zlib** are installed if you wish to enable fio's log compression functionality. 
@@ -205,8 +210,8 @@ browser to :file:`./doc/output/html/index.html`. To build manual page run ``make -C doc man`` and then ``man doc/output/man/fio.1``. To see what other output formats are supported run ``make -C doc help``. -.. _reStructuredText: http://www.sphinx-doc.org/rest.html -.. _Sphinx: http://www.sphinx-doc.org +.. _reStructuredText: https://www.sphinx-doc.org/rest.html +.. _Sphinx: https://www.sphinx-doc.org Platforms @@ -222,7 +227,7 @@ implemented, I'd be happy to take patches for that. An example of that is disk utility statistics and (I think) huge page support, support for that does exist in FreeBSD/Solaris. -Fio uses pthread mutexes for signalling and locking and some platforms do not +Fio uses pthread mutexes for signaling and locking and some platforms do not support process shared pthread mutexes. As a result, on such platforms only threads are supported. This could be fixed with sysv ipc locking or other locking alternatives. diff --git a/REPORTING-BUGS b/REPORTING-BUGS index c0204d7e09..17c4fcaae7 100644 --- a/REPORTING-BUGS +++ b/REPORTING-BUGS @@ -1,6 +1,13 @@ Reporting a bug --------------- +If you're contemplating reporting issues with deliberately constructed job +files that cause the parser to crash, then only do so if you also intend +to fix the issue. Legitimate job files that cause parsing issues should of +course be reported, however please don't waste our time with maliciously +constructed job files that cause double frees, fio crashes, or other effects +of that nature. 
+ ...via the mailing list ======================= diff --git a/STEADYSTATE-TODO b/STEADYSTATE-TODO index e4b146e93c..2848eb5416 100644 --- a/STEADYSTATE-TODO +++ b/STEADYSTATE-TODO @@ -1,6 +1,14 @@ Known issues/TODO (for steady-state) -- Allow user to specify the frequency of measurements +- Replace the test script with a better one + - Add test cases for the new check_interval option + - Parse debug=steadystate output to check calculations + +- Instead of calculating `intervals` every time, calculate it once and stash it + somewhere + +- Add the time unit to the ss_dur and check_interval variable names to reduce + possible confusion - Better documentation for output diff --git a/arch/arch-aarch64.h b/arch/arch-aarch64.h index 2a86cc5ab4..919e579676 100644 --- a/arch/arch-aarch64.h +++ b/arch/arch-aarch64.h @@ -27,4 +27,101 @@ static inline int arch_ffz(unsigned long bitmask) #define ARCH_HAVE_FFZ +#define isb() asm volatile("isb" : : : "memory") + +static inline unsigned long long get_cpu_clock(void) +{ + unsigned long val; + + isb(); + asm volatile("mrs %0, cntvct_el0" : "=r" (val)); + return val; +} +#define ARCH_HAVE_CPU_CLOCK + +#define ARCH_HAVE_INIT +extern bool tsc_reliable; +static inline int arch_init(char *envp[]) +{ + tsc_reliable = true; + return 0; +} + +#define __do_syscallN(...) 
({ \ + __asm__ volatile ( \ + "svc 0" \ + : "=r"(x0) \ + : __VA_ARGS__ \ + : "memory", "cc"); \ + (long) x0; \ +}) + +#define __do_syscall0(__n) ({ \ + register long x8 __asm__("x8") = __n; \ + register long x0 __asm__("x0"); \ + \ + __do_syscallN("r" (x8)); \ +}) + +#define __do_syscall1(__n, __a) ({ \ + register long x8 __asm__("x8") = __n; \ + register __typeof__(__a) x0 __asm__("x0") = __a; \ + \ + __do_syscallN("r" (x8), "0" (x0)); \ +}) + +#define __do_syscall2(__n, __a, __b) ({ \ + register long x8 __asm__("x8") = __n; \ + register __typeof__(__a) x0 __asm__("x0") = __a; \ + register __typeof__(__b) x1 __asm__("x1") = __b; \ + \ + __do_syscallN("r" (x8), "0" (x0), "r" (x1)); \ +}) + +#define __do_syscall3(__n, __a, __b, __c) ({ \ + register long x8 __asm__("x8") = __n; \ + register __typeof__(__a) x0 __asm__("x0") = __a; \ + register __typeof__(__b) x1 __asm__("x1") = __b; \ + register __typeof__(__c) x2 __asm__("x2") = __c; \ + \ + __do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2)); \ +}) + +#define __do_syscall4(__n, __a, __b, __c, __d) ({ \ + register long x8 __asm__("x8") = __n; \ + register __typeof__(__a) x0 __asm__("x0") = __a; \ + register __typeof__(__b) x1 __asm__("x1") = __b; \ + register __typeof__(__c) x2 __asm__("x2") = __c; \ + register __typeof__(__d) x3 __asm__("x3") = __d; \ + \ + __do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2), "r" (x3));\ +}) + +#define __do_syscall5(__n, __a, __b, __c, __d, __e) ({ \ + register long x8 __asm__("x8") = __n; \ + register __typeof__(__a) x0 __asm__("x0") = __a; \ + register __typeof__(__b) x1 __asm__("x1") = __b; \ + register __typeof__(__c) x2 __asm__("x2") = __c; \ + register __typeof__(__d) x3 __asm__("x3") = __d; \ + register __typeof__(__e) x4 __asm__("x4") = __e; \ + \ + __do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2), "r" (x3), \ + "r"(x4)); \ +}) + +#define __do_syscall6(__n, __a, __b, __c, __d, __e, __f) ({ \ + register long x8 __asm__("x8") = __n; \ + register __typeof__(__a) x0 
__asm__("x0") = __a; \ + register __typeof__(__b) x1 __asm__("x1") = __b; \ + register __typeof__(__c) x2 __asm__("x2") = __c; \ + register __typeof__(__d) x3 __asm__("x3") = __d; \ + register __typeof__(__e) x4 __asm__("x4") = __e; \ + register __typeof__(__f) x5 __asm__("x5") = __f; \ + \ + __do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2), "r" (x3), \ + "r" (x4), "r"(x5)); \ +}) + +#define FIO_ARCH_HAS_SYSCALL + #endif diff --git a/arch/arch-loongarch64.h b/arch/arch-loongarch64.h new file mode 100644 index 0000000000..43ea83b436 --- /dev/null +++ b/arch/arch-loongarch64.h @@ -0,0 +1,10 @@ +#ifndef ARCH_LOONGARCH64_H +#define ARCH_LOONGARCH64_H + +#define FIO_ARCH (arch_loongarch64) + +#define read_barrier() __asm__ __volatile__("dbar 0": : :"memory") +#define write_barrier() __asm__ __volatile__("dbar 0": : :"memory") +#define nop __asm__ __volatile__("dbar 0": : :"memory") + +#endif diff --git a/arch/arch-mips.h b/arch/arch-mips.h index 6f157fbb19..8a0e9370ae 100644 --- a/arch/arch-mips.h +++ b/arch/arch-mips.h @@ -3,6 +3,10 @@ #define FIO_ARCH (arch_mips) +#ifndef __SANE_USERSPACE_TYPES__ +#define __SANE_USERSPACE_TYPES__ +#endif + #define read_barrier() __asm__ __volatile__("": : :"memory") #define write_barrier() __asm__ __volatile__("": : :"memory") #define nop __asm__ __volatile__("": : :"memory") diff --git a/arch/arch-ppc.h b/arch/arch-ppc.h index 804d596aec..7f3503b6cd 100644 --- a/arch/arch-ppc.h +++ b/arch/arch-ppc.h @@ -8,6 +8,10 @@ #define FIO_ARCH (arch_ppc) +#ifndef __SANE_USERSPACE_TYPES__ +#define __SANE_USERSPACE_TYPES__ +#endif + #define nop do { } while (0) #ifdef __powerpc64__ diff --git a/arch/arch-riscv64.h b/arch/arch-riscv64.h new file mode 100644 index 0000000000..8ac33fa31c --- /dev/null +++ b/arch/arch-riscv64.h @@ -0,0 +1,118 @@ +#ifndef ARCH_RISCV64_H +#define ARCH_RISCV64_H + +#include +#include +#include +#include + +#define FIO_ARCH (arch_riscv64) + +#define nop __asm__ __volatile__ ("nop") +#define read_barrier() __asm__ 
__volatile__("fence r, r": : :"memory") +#define write_barrier() __asm__ __volatile__("fence w, w": : :"memory") + +static inline unsigned long long get_cpu_clock(void) +{ + unsigned long val; + + asm volatile("rdtime %0" : "=r"(val)); + return val; +} +#define ARCH_HAVE_CPU_CLOCK + +#define ARCH_HAVE_INIT +extern bool tsc_reliable; +static inline int arch_init(char *envp[]) +{ + tsc_reliable = true; + return 0; +} + +#define __do_syscallM(...) ({ \ + __asm__ volatile ( \ + "ecall" \ + : "=r"(a0) \ + : __VA_ARGS__ \ + : "memory", "a1"); \ + (long) a0; \ +}) + +#define __do_syscallN(...) ({ \ + __asm__ volatile ( \ + "ecall" \ + : "=r"(a0) \ + : __VA_ARGS__ \ + : "memory"); \ + (long) a0; \ +}) + +#define __do_syscall0(__n) ({ \ + register long a7 __asm__("a7") = __n; \ + register long a0 __asm__("a0"); \ + \ + __do_syscallM("r" (a7)); \ +}) + +#define __do_syscall1(__n, __a) ({ \ + register long a7 __asm__("a7") = __n; \ + register __typeof__(__a) a0 __asm__("a0") = __a; \ + \ + __do_syscallM("r" (a7), "0" (a0)); \ +}) + +#define __do_syscall2(__n, __a, __b) ({ \ + register long a7 __asm__("a7") = __n; \ + register __typeof__(__a) a0 __asm__("a0") = __a; \ + register __typeof__(__b) a1 __asm__("a1") = __b; \ + \ + __do_syscallN("r" (a7), "0" (a0), "r" (a1)); \ +}) + +#define __do_syscall3(__n, __a, __b, __c) ({ \ + register long a7 __asm__("a7") = __n; \ + register __typeof__(__a) a0 __asm__("a0") = __a; \ + register __typeof__(__b) a1 __asm__("a1") = __b; \ + register __typeof__(__c) a2 __asm__("a2") = __c; \ + \ + __do_syscallN("r" (a7), "0" (a0), "r" (a1), "r" (a2)); \ +}) + +#define __do_syscall4(__n, __a, __b, __c, __d) ({ \ + register long a7 __asm__("a7") = __n; \ + register __typeof__(__a) a0 __asm__("a0") = __a; \ + register __typeof__(__b) a1 __asm__("a1") = __b; \ + register __typeof__(__c) a2 __asm__("a2") = __c; \ + register __typeof__(__d) a3 __asm__("a3") = __d; \ + \ + __do_syscallN("r" (a7), "0" (a0), "r" (a1), "r" (a2), "r" (a3));\ +}) + +#define 
__do_syscall5(__n, __a, __b, __c, __d, __e) ({ \ + register long a7 __asm__("a7") = __n; \ + register __typeof__(__a) a0 __asm__("a0") = __a; \ + register __typeof__(__b) a1 __asm__("a1") = __b; \ + register __typeof__(__c) a2 __asm__("a2") = __c; \ + register __typeof__(__d) a3 __asm__("a3") = __d; \ + register __typeof__(__e) a4 __asm__("a4") = __e; \ + \ + __do_syscallN("r" (a7), "0" (a0), "r" (a1), "r" (a2), "r" (a3), \ + "r"(a4)); \ +}) + +#define __do_syscall6(__n, __a, __b, __c, __d, __e, __f) ({ \ + register long a7 __asm__("a7") = __n; \ + register __typeof__(__a) a0 __asm__("a0") = __a; \ + register __typeof__(__b) a1 __asm__("a1") = __b; \ + register __typeof__(__c) a2 __asm__("a2") = __c; \ + register __typeof__(__d) a3 __asm__("a3") = __d; \ + register __typeof__(__e) a4 __asm__("a4") = __e; \ + register __typeof__(__f) a5 __asm__("a5") = __f; \ + \ + __do_syscallN("r" (a7), "0" (a0), "r" (a1), "r" (a2), "r" (a3), \ + "r" (a4), "r"(a5)); \ +}) + +#define FIO_ARCH_HAS_SYSCALL + +#endif diff --git a/arch/arch-x86-common.h b/arch/arch-x86-common.h index f32835cce3..ac26126ae8 100644 --- a/arch/arch-x86-common.h +++ b/arch/arch-x86-common.h @@ -68,7 +68,8 @@ static inline void arch_init(char *envp[]) (unsigned int *) &str[4]); str[12] = '\0'; - if (!strcmp(str, "GenuineIntel")) + if (!strcmp(str, "GenuineIntel") || !strcmp(str, " Shanghai ") || + !strcmp(str, "CentaurHauls")) arch_init_intel(); else if (!strcmp(str, "AuthenticAMD") || !strcmp(str, "HygonGenuine")) arch_init_amd(); diff --git a/arch/arch-x86_64.h b/arch/arch-x86_64.h index 25850f90e7..b402dc6df3 100644 --- a/arch/arch-x86_64.h +++ b/arch/arch-x86_64.h @@ -26,6 +26,11 @@ static inline unsigned long arch_ffz(unsigned long bitmask) return bitmask; } +static inline void tsc_barrier(void) +{ + __asm__ __volatile__("mfence":::"memory"); +} + static inline unsigned long long get_cpu_clock(void) { unsigned int lo, hi; @@ -68,4 +73,117 @@ static inline int arch_rand_seed(unsigned long *seed) return 
0; } +#define __do_syscall0(NUM) ({ \ + intptr_t rax; \ + \ + __asm__ volatile( \ + "syscall" \ + : "=a"(rax) /* %rax */ \ + : "a"(NUM) /* %rax */ \ + : "rcx", "r11", "memory" \ + ); \ + rax; \ +}) + +#define __do_syscall1(NUM, ARG1) ({ \ + intptr_t rax; \ + \ + __asm__ volatile( \ + "syscall" \ + : "=a"(rax) /* %rax */ \ + : "a"((NUM)), /* %rax */ \ + "D"((ARG1)) /* %rdi */ \ + : "rcx", "r11", "memory" \ + ); \ + rax; \ +}) + +#define __do_syscall2(NUM, ARG1, ARG2) ({ \ + intptr_t rax; \ + \ + __asm__ volatile( \ + "syscall" \ + : "=a"(rax) /* %rax */ \ + : "a"((NUM)), /* %rax */ \ + "D"((ARG1)), /* %rdi */ \ + "S"((ARG2)) /* %rsi */ \ + : "rcx", "r11", "memory" \ + ); \ + rax; \ +}) + +#define __do_syscall3(NUM, ARG1, ARG2, ARG3) ({ \ + intptr_t rax; \ + \ + __asm__ volatile( \ + "syscall" \ + : "=a"(rax) /* %rax */ \ + : "a"((NUM)), /* %rax */ \ + "D"((ARG1)), /* %rdi */ \ + "S"((ARG2)), /* %rsi */ \ + "d"((ARG3)) /* %rdx */ \ + : "rcx", "r11", "memory" \ + ); \ + rax; \ +}) + +#define __do_syscall4(NUM, ARG1, ARG2, ARG3, ARG4) ({ \ + intptr_t rax; \ + register __typeof__(ARG4) __r10 __asm__("r10") = (ARG4); \ + \ + __asm__ volatile( \ + "syscall" \ + : "=a"(rax) /* %rax */ \ + : "a"((NUM)), /* %rax */ \ + "D"((ARG1)), /* %rdi */ \ + "S"((ARG2)), /* %rsi */ \ + "d"((ARG3)), /* %rdx */ \ + "r"(__r10) /* %r10 */ \ + : "rcx", "r11", "memory" \ + ); \ + rax; \ +}) + +#define __do_syscall5(NUM, ARG1, ARG2, ARG3, ARG4, ARG5) ({ \ + intptr_t rax; \ + register __typeof__(ARG4) __r10 __asm__("r10") = (ARG4); \ + register __typeof__(ARG5) __r8 __asm__("r8") = (ARG5); \ + \ + __asm__ volatile( \ + "syscall" \ + : "=a"(rax) /* %rax */ \ + : "a"((NUM)), /* %rax */ \ + "D"((ARG1)), /* %rdi */ \ + "S"((ARG2)), /* %rsi */ \ + "d"((ARG3)), /* %rdx */ \ + "r"(__r10), /* %r10 */ \ + "r"(__r8) /* %r8 */ \ + : "rcx", "r11", "memory" \ + ); \ + rax; \ +}) + +#define __do_syscall6(NUM, ARG1, ARG2, ARG3, ARG4, ARG5, ARG6) ({ \ + intptr_t rax; \ + register __typeof__(ARG4) __r10 
__asm__("r10") = (ARG4); \ + register __typeof__(ARG5) __r8 __asm__("r8") = (ARG5); \ + register __typeof__(ARG6) __r9 __asm__("r9") = (ARG6); \ + \ + __asm__ volatile( \ + "syscall" \ + : "=a"(rax) /* %rax */ \ + : "a"((NUM)), /* %rax */ \ + "D"((ARG1)), /* %rdi */ \ + "S"((ARG2)), /* %rsi */ \ + "d"((ARG3)), /* %rdx */ \ + "r"(__r10), /* %r10 */ \ + "r"(__r8), /* %r8 */ \ + "r"(__r9) /* %r9 */ \ + : "rcx", "r11", "memory" \ + ); \ + rax; \ +}) + +#define FIO_ARCH_HAS_SYSCALL + #endif diff --git a/arch/arch.h b/arch/arch.h index fca003beab..437736f842 100644 --- a/arch/arch.h +++ b/arch/arch.h @@ -23,6 +23,8 @@ enum { arch_hppa, arch_mips, arch_aarch64, + arch_loongarch64, + arch_riscv64, arch_generic, @@ -51,6 +53,8 @@ extern unsigned long arch_flags; #define atomic_load_acquire(p) \ std::atomic_load_explicit(p, \ std::memory_order_acquire) +#define atomic_store_relaxed(p, v) \ + std::atomic_store_explicit((p), (v), std::memory_order_relaxed) #define atomic_store_release(p, v) \ std::atomic_store_explicit(p, (v), \ std::memory_order_release) @@ -65,6 +69,9 @@ extern unsigned long arch_flags; #define atomic_load_acquire(p) \ atomic_load_explicit((_Atomic typeof(*(p)) *)(p), \ memory_order_acquire) +#define atomic_store_relaxed(p, v) \ + atomic_store_explicit((_Atomic typeof(*(p)) *)(p), (v), \ + memory_order_relaxed) #define atomic_store_release(p, v) \ atomic_store_explicit((_Atomic typeof(*(p)) *)(p), (v), \ memory_order_release) @@ -97,11 +104,22 @@ extern unsigned long arch_flags; #include "arch-hppa.h" #elif defined(__aarch64__) #include "arch-aarch64.h" +#elif defined(__loongarch64) +#include "arch-loongarch64.h" +#elif defined(__riscv) && __riscv_xlen == 64 +#include "arch-riscv64.h" #else #warning "Unknown architecture, attempting to use generic model." 
#include "arch-generic.h" #endif +#if !defined(__x86_64__) && defined(CONFIG_SYNC_SYNC) +static inline void tsc_barrier(void) +{ + __sync_synchronize(); +} +#endif + #include "../lib/ffz.h" /* IWYU pragma: end_exports */ diff --git a/backend.c b/backend.c index c167f90862..568f306c24 100644 --- a/backend.c +++ b/backend.c @@ -31,6 +31,11 @@ #include #include +#ifdef CONFIG_LINUX +#include +#include +#endif + #include "fio.h" #include "smalloc.h" #include "verify.h" @@ -49,6 +54,7 @@ #include "helper_thread.h" #include "pshared.h" #include "zone-dist.h" +#include "fio_time.h" static struct fio_sem *startup_sem; static struct flist_head *cgroup_list; @@ -90,6 +96,22 @@ static void sig_int(int sig) } } +#ifdef WIN32 +static void sig_break(int sig) +{ + sig_int(sig); + + /** + * Windows terminates all job processes on SIGBREAK after the handler + * returns, so give them time to wrap-up and give stats + */ + for_each_td(td) { + while (td->runstate < TD_EXITED) + sleep(1); + } end_for_each(); +} +#endif + void sig_show_status(int sig) { show_running_run_stats(); @@ -112,7 +134,7 @@ static void set_sig_handlers(void) /* Windows uses SIGBREAK as a quit signal from other applications */ #ifdef WIN32 memset(&act, 0, sizeof(act)); - act.sa_handler = sig_int; + act.sa_handler = sig_break; act.sa_flags = SA_RESTART; sigaction(SIGBREAK, &act, NULL); #endif @@ -136,13 +158,10 @@ static void set_sig_handlers(void) static bool __check_min_rate(struct thread_data *td, struct timespec *now, enum fio_ddir ddir) { - unsigned long long bytes = 0; - unsigned long iops = 0; - unsigned long spent; - unsigned long long rate; - unsigned long long ratemin = 0; - unsigned int rate_iops = 0; - unsigned int rate_iops_min = 0; + unsigned long long current_rate_check_bytes = td->this_io_bytes[ddir]; + unsigned long current_rate_check_blocks = td->this_io_blocks[ddir]; + unsigned long long option_rate_bytes_min = td->o.ratemin[ddir]; + unsigned int option_rate_iops_min = td->o.rate_iops_min[ddir]; 
assert(ddir_rw(ddir)); @@ -155,68 +174,44 @@ static bool __check_min_rate(struct thread_data *td, struct timespec *now, if (mtime_since(&td->start, now) < 2000) return false; - iops += td->this_io_blocks[ddir]; - bytes += td->this_io_bytes[ddir]; - ratemin += td->o.ratemin[ddir]; - rate_iops += td->o.rate_iops[ddir]; - rate_iops_min += td->o.rate_iops_min[ddir]; - /* - * if rate blocks is set, sample is running + * if last_rate_check_blocks or last_rate_check_bytes is set, + * we can compute a rate per ratecycle */ - if (td->rate_bytes[ddir] || td->rate_blocks[ddir]) { - spent = mtime_since(&td->lastrate[ddir], now); - if (spent < td->o.ratecycle) + if (td->last_rate_check_bytes[ddir] || td->last_rate_check_blocks[ddir]) { + unsigned long spent = mtime_since(&td->last_rate_check_time[ddir], now); + if (spent < td->o.ratecycle || spent==0) return false; - if (td->o.rate[ddir] || td->o.ratemin[ddir]) { + if (td->o.ratemin[ddir]) { /* * check bandwidth specified rate */ - if (bytes < td->rate_bytes[ddir]) { - log_err("%s: rate_min=%lluB/s not met, only transferred %lluB\n", - td->o.name, ratemin, bytes); + unsigned long long current_rate_bytes = + ((current_rate_check_bytes - td->last_rate_check_bytes[ddir]) * 1000) / spent; + if (current_rate_bytes < option_rate_bytes_min) { + log_err("%s: rate_min=%lluB/s not met, got %lluB/s\n", + td->o.name, option_rate_bytes_min, current_rate_bytes); return true; - } else { - if (spent) - rate = ((bytes - td->rate_bytes[ddir]) * 1000) / spent; - else - rate = 0; - - if (rate < ratemin || - bytes < td->rate_bytes[ddir]) { - log_err("%s: rate_min=%lluB/s not met, got %lluB/s\n", - td->o.name, ratemin, rate); - return true; - } } } else { /* * checks iops specified rate */ - if (iops < rate_iops) { - log_err("%s: rate_iops_min=%u not met, only performed %lu IOs\n", - td->o.name, rate_iops, iops); + unsigned long long current_rate_iops = + ((current_rate_check_blocks - td->last_rate_check_blocks[ddir]) * 1000) / spent; + + if 
(current_rate_iops < option_rate_iops_min) { + log_err("%s: rate_iops_min=%u not met, got %llu IOPS\n", + td->o.name, option_rate_iops_min, current_rate_iops); return true; - } else { - if (spent) - rate = ((iops - td->rate_blocks[ddir]) * 1000) / spent; - else - rate = 0; - - if (rate < rate_iops_min || - iops < td->rate_blocks[ddir]) { - log_err("%s: rate_iops_min=%u not met, got %llu IOPS\n", - td->o.name, rate_iops_min, rate); - return true; - } } } } - td->rate_bytes[ddir] = bytes; - td->rate_blocks[ddir] = iops; - memcpy(&td->lastrate[ddir], now, sizeof(*now)); + td->last_rate_check_bytes[ddir] = current_rate_check_bytes; + td->last_rate_check_blocks[ddir] = current_rate_check_blocks; + memcpy(&td->last_rate_check_time[ddir], now, sizeof(*now)); return false; } @@ -232,39 +227,6 @@ static bool check_min_rate(struct thread_data *td, struct timespec *now) return ret; } -/* - * When job exits, we can cancel the in-flight IO if we are using async - * io. Attempt to do so. - */ -static void cleanup_pending_aio(struct thread_data *td) -{ - int r; - - /* - * get immediately available events, if any - */ - r = io_u_queued_complete(td, 0); - - /* - * now cancel remaining active events - */ - if (td->io_ops->cancel) { - struct io_u *io_u; - int i; - - io_u_qiter(&td->io_u_all, io_u, i) { - if (io_u->flags & IO_U_F_FLIGHT) { - r = td->io_ops->cancel(td, io_u); - if (!r) - put_io_u(td, io_u); - } - } - } - - if (td->cur_depth) - r = io_u_queued_complete(td, td->cur_depth); -} - /* * Helper to handle the final sync of a file. Works just like the normal * io path, just does everything sync. 
@@ -341,7 +303,7 @@ static inline void update_ts_cache(struct thread_data *td) static inline bool runtime_exceeded(struct thread_data *td, struct timespec *t) { - if (in_ramp_time(td)) + if (in_ramp_period(td)) return false; if (!td->o.timeout) return false; @@ -459,6 +421,7 @@ int io_queue_event(struct thread_data *td, struct io_u *io_u, int *ret, case FIO_Q_COMPLETED: if (io_u->error) { *ret = -io_u->error; + invalidate_inflight(td, io_u); clear_io_u(td, io_u); } else if (io_u->resid) { long long bytes = io_u->xfer_buflen - io_u->resid; @@ -477,7 +440,8 @@ int io_queue_event(struct thread_data *td, struct io_u *io_u, int *ret, if (!from_verify) unlog_io_piece(td, io_u); td_verror(td, EIO, "full resid"); - put_io_u(td, io_u); + invalidate_inflight(td, io_u); + clear_io_u(td, io_u); break; } @@ -622,8 +586,8 @@ static void do_verify(struct thread_data *td, uint64_t verify_bytes) { struct fio_file *f; struct io_u *io_u; - int ret, min_events; unsigned int i; + int ret; dprint(FD_VERIFY, "starting loop\n"); @@ -645,15 +609,6 @@ static void do_verify(struct thread_data *td, uint64_t verify_bytes) if (td->error) return; - /* - * verify_state needs to be reset before verification - * proceeds so that expected random seeds match actual - * random seeds in headers. The main loop will reset - * all random number generators if randrepeat is set. 
- */ - if (!td->o.rand_repeatable) - td_fill_verify_state_seed(td); - td_set_runstate(td, TD_VERIFYING); io_u = NULL; @@ -690,7 +645,7 @@ static void do_verify(struct thread_data *td, uint64_t verify_bytes) break; } } else { - if (ddir_rw_sum(td->bytes_done) + td->o.rw_min_bs > verify_bytes) + if (td->bytes_verified + td->o.rw_min_bs > verify_bytes) break; while ((io_u = get_io_u(td)) != NULL) { @@ -716,10 +671,20 @@ static void do_verify(struct thread_data *td, uint64_t verify_bytes) } else if (io_u->ddir == DDIR_TRIM) { io_u->ddir = DDIR_READ; io_u_set(td, io_u, IO_U_F_TRIMMED); + if (td_io_prep(td, io_u)) { + put_io_u(td, io_u); + continue; + } break; } else if (io_u->ddir == DDIR_WRITE) { io_u->ddir = DDIR_READ; + io_u->numberio = td->verify_read_issues; + td->verify_read_issues++; populate_verify_io_u(td, io_u); + if (td_io_prep(td, io_u)) { + put_io_u(td, io_u); + continue; + } break; } else { put_io_u(td, io_u); @@ -731,7 +696,7 @@ static void do_verify(struct thread_data *td, uint64_t verify_bytes) break; } - if (verify_state_should_stop(td, io_u)) { + if (verify_state_should_stop(td, io_u->numberio)) { put_io_u(td, io_u); break; } @@ -766,13 +731,8 @@ static void do_verify(struct thread_data *td, uint64_t verify_bytes) check_update_rusage(td); - if (!td->error) { - min_events = td->cur_depth; - - if (min_events) - ret = io_u_queued_complete(td, min_events); - } else - cleanup_pending_aio(td); + if (td->cur_depth) + ret = io_u_queued_complete(td, td->cur_depth); td_set_runstate(td, TD_RUNNING); @@ -872,6 +832,7 @@ static void handle_thinktime(struct thread_data *td, enum fio_ddir ddir, struct timespec *time) { unsigned long long b; + unsigned long long runtime_left; uint64_t total; int left; struct timespec now; @@ -880,7 +841,7 @@ static void handle_thinktime(struct thread_data *td, enum fio_ddir ddir, if (td->o.thinktime_iotime) { fio_gettime(&now, NULL); if (utime_since(&td->last_thinktime, &now) - >= td->o.thinktime_iotime + td->o.thinktime) { + >= 
td->o.thinktime_iotime) { stall = true; } else if (!fio_option_is_set(&td->o, thinktime_blocks)) { /* @@ -903,11 +864,33 @@ static void handle_thinktime(struct thread_data *td, enum fio_ddir ddir, io_u_quiesce(td); + left = td->o.thinktime_spin; + if (td->o.timeout) { + runtime_left = td->o.timeout - utime_since_now(&td->epoch); + if (runtime_left < (unsigned long long)left) + left = runtime_left; + } + total = 0; - if (td->o.thinktime_spin) - total = usec_spin(td->o.thinktime_spin); + if (left) + total = usec_spin(left); + + /* + * usec_spin() might run for slightly longer than intended in a VM + * where the vCPU could get descheduled or the hypervisor could steal + * CPU time. Ensure "left" doesn't become negative. + */ + if (total < td->o.thinktime) + left = td->o.thinktime - total; + else + left = 0; + + if (td->o.timeout) { + runtime_left = td->o.timeout - utime_since_now(&td->epoch); + if (runtime_left < (unsigned long long)left) + left = runtime_left; + } - left = td->o.thinktime - total; if (left) total += usec_sleep(td, left); @@ -936,8 +919,106 @@ static void handle_thinktime(struct thread_data *td, enum fio_ddir ddir, fio_gettime(time, NULL); td->last_thinktime_blocks = b; - if (td->o.thinktime_iotime) + if (td->o.thinktime_iotime) { + fio_gettime(&now, NULL); td->last_thinktime = now; + } +} + +/* + * Add numberio from io_u to the inflight log. + */ +void log_inflight(struct thread_data *td, struct io_u *io_u) +{ + int idx, i; + + if (!td->inflight_numberio || io_u->ddir != DDIR_WRITE) + return; + + if (io_u->inflight_idx != -1) { + log_err("inflight_idx already set: inflight_idx=%d\n", + io_u->inflight_idx); + abort(); + } + + if (td->inflight_issued != io_u->numberio) { + log_err("inflight_issued does not match: numberio=%"PRIu64", inflight_issued=%"PRIu64"\n", + io_u->numberio, td->inflight_issued); + abort(); + } + + /* Walk the inflight list until we find a free slot. 
*/ + idx = td->next_inflight_numberio_idx; + for (i = 0; i < td->o.iodepth; i++) { + if (td->inflight_numberio[idx] == INVALID_NUMBERIO) { + /* + * The order here is important - we must "protect" this write in the + * inflight list before making it visible in inflight_issued. + */ + atomic_store_release(&td->inflight_numberio[idx], io_u->numberio); + td->next_inflight_numberio_idx = (idx + 1) % td->o.iodepth; + io_u->inflight_idx = idx; + + atomic_store_release(&td->inflight_issued, io_u->numberio + 1); + dprint(FD_VERIFY, "log_inflight: numberio=%"PRIu64", inflight_idx=%d\n", + io_u->numberio, idx); + return; + } + idx = (idx + 1) % td->o.iodepth; + } + + log_err("failed to allocate inflight slot: next_inflight_numberio_idx=%u\n", + td->next_inflight_numberio_idx); + abort(); +} + +/* + * Invalidate inflight log entry. + */ +void invalidate_inflight(struct thread_data *td, struct io_u *io_u) +{ + if (!td->inflight_numberio || + io_u->ddir != DDIR_WRITE || + io_u->inflight_idx == -1) { + return; + } + + dprint(FD_VERIFY, "invalidate_inflight: numberio=%"PRIu64", inflight_idx=%d\n", + io_u->numberio, io_u->inflight_idx); + + if (td->inflight_numberio[io_u->inflight_idx] == INVALID_NUMBERIO) { + log_err("inflight entry already invalid: numberio=%"PRIu64", inflight_idx=%d\n", + io_u->numberio, io_u->inflight_idx); + abort(); + } else if (td->inflight_numberio[io_u->inflight_idx] != io_u->numberio) { + log_err("inflight entry numberio does not match: expected numberio=%"PRIu64", observed numberio=%"PRIu64", inflight_idx=%d\n", + io_u->numberio, td->inflight_numberio[io_u->inflight_idx], io_u->inflight_idx); + abort(); + } + + atomic_store_release(&td->inflight_numberio[io_u->inflight_idx], INVALID_NUMBERIO); + io_u->inflight_idx = -1; +} + +/* + * Clear inflight log. 
+ */ +void clear_inflight(struct thread_data *td) +{ + int i; + + if (!td->inflight_numberio) + return; + + for (i = 0; i < td->o.iodepth; i++) + td->inflight_numberio[i] = INVALID_NUMBERIO; + + td->next_inflight_numberio_idx = 0; + /* + * Experimental verify can increment io_issues for writes, so catch + * inflight_issued up in between loops. + */ + td->inflight_issued = td->io_issues[DDIR_WRITE]; } /* @@ -955,7 +1036,7 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done) for (i = 0; i < DDIR_RWDIR_CNT; i++) bytes_done[i] = td->bytes_done[i]; - if (in_ramp_time(td)) + if (in_ramp_period(td)) td_set_runstate(td, TD_RAMP); else td_set_runstate(td, TD_RUNNING); @@ -969,6 +1050,14 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done) */ if (td_write(td) && td_random(td) && td->o.norandommap) total_bytes = max(total_bytes, (uint64_t) td->o.io_size); + + /* + * Don't break too early if io_size > size. The exception is when + * verify is enabled. + */ + if (td_rw(td) && !td_random(td) && td->o.verify == VERIFY_NONE) + total_bytes = max(total_bytes, (uint64_t)td->o.io_size); + /* * If verify_backlog is enabled, we'll run the verify in this * handler as well. 
For that case, we may need up to twice the @@ -979,9 +1068,11 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done) total_bytes += td->o.size; /* In trimwrite mode, each byte is trimmed and then written, so - * allow total_bytes to be twice as big */ - if (td_trimwrite(td)) + * allow total_bytes or number of ios to be twice as big */ + if (td_trimwrite(td)) { total_bytes += td->total_io_size; + td->o.number_ios *= 2; + } while ((td->o.read_iolog_file && !flist_empty(&td->io_log_list)) || (!flist_empty(&td->trim_list)) || !io_issue_bytes_exceeded(td) || @@ -1036,8 +1127,14 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done) break; } - if (io_u->ddir == DDIR_WRITE && td->flags & TD_F_DO_VERIFY) - populate_verify_io_u(td, io_u); + if (io_u->ddir == DDIR_WRITE && td->flags & TD_F_DO_VERIFY) { + if (!(io_u->flags & IO_U_F_PATTERN_DONE)) { + io_u_set(td, io_u, IO_U_F_PATTERN_DONE); + io_u->numberio = td->io_issues[io_u->ddir]; + populate_verify_io_u(td, io_u); + log_inflight(td, io_u); + } + } ddir = io_u->ddir; @@ -1049,7 +1146,18 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done) if (td->o.verify != VERIFY_NONE && io_u->ddir == DDIR_READ && ((io_u->flags & IO_U_F_VER_LIST) || !td_rw(td))) { - if (verify_state_should_stop(td, io_u)) { + /* + * For read only workloads generate the seed. This way + * we can still verify header seed at any later + * invocation. 
+ */ + if (!td_write(td) && !td->o.verify_pattern_bytes) { + io_u->rand_seed = __rand(&td->verify_state); + if (sizeof(int) != sizeof(long *)) + io_u->rand_seed *= __rand(&td->verify_state); + } + + if (verify_state_should_stop(td, td->io_issues[io_u->ddir])) { put_io_u(td, io_u); break; } @@ -1059,7 +1167,7 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done) else io_u->end_io = verify_io_u; td_set_runstate(td, TD_VERIFYING); - } else if (in_ramp_time(td)) + } else if (in_ramp_period(td)) td_set_runstate(td, TD_RAMP); else td_set_runstate(td, TD_RUNNING); @@ -1091,13 +1199,15 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done) td->rate_io_issue_bytes[__ddir] += blen; } - if (should_check_rate(td)) + if (ddir_rw(__ddir) && should_check_rate(td)) { td->rate_next_io_time[__ddir] = usec_for_io(td, __ddir); + fio_gettime(&comp_time, NULL); + } } else { ret = io_u_submit(td, io_u); - if (should_check_rate(td)) + if (ddir_rw(ddir) && should_check_rate(td)) td->rate_next_io_time[ddir] = usec_for_io(td, ddir); if (io_queue_event(td, io_u, &ret, ddir, &bytes_issued, 0, &comp_time)) @@ -1117,6 +1227,9 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done) if (ret < 0) break; + if (ddir_rw(ddir) && td->o.thinkcycles) + cycles_spin(td->o.thinkcycles); + if (ddir_rw(ddir) && td->o.thinktime) handle_thinktime(td, ddir, &comp_time); @@ -1124,7 +1237,7 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done) !td_ioengine_flagged(td, FIO_NOIO)) continue; - if (!in_ramp_time(td) && should_check_rate(td)) { + if (!in_ramp_period(td) && should_check_rate(td)) { if (check_min_rate(td, &comp_time)) { if (exitall_on_terminate || td->o.exitall_error) fio_terminate_threads(td->groupid, td->o.exit_what); @@ -1132,7 +1245,7 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done) break; } } - if (!in_ramp_time(td) && td->o.latency_target) + if (!in_ramp_period(td) && td->o.latency_target) lat_target_check(td); } @@ -1172,8 +1285,11 
@@ static void do_io(struct thread_data *td, uint64_t *bytes_done) f->file_name); } } - } else - cleanup_pending_aio(td); + } else { + if (td->o.io_submit_mode == IO_MODE_OFFLOAD) + workqueue_flush(&td->io_wq); + ret = io_u_queued_complete(td, td->cur_depth); + } /* * stop job if we failed doing any IO @@ -1185,39 +1301,29 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done) bytes_done[i] = td->bytes_done[i] - bytes_done[i]; } -static void free_file_completion_logging(struct thread_data *td) +static int init_inflight_logging(struct thread_data *td) { - struct fio_file *f; - unsigned int i; - - for_each_file(td, f, i) { - if (!f->last_write_comp) - break; - sfree(f->last_write_comp); - } -} - -static int init_file_completion_logging(struct thread_data *td, - unsigned int depth) -{ - struct fio_file *f; unsigned int i; if (td->o.verify == VERIFY_NONE || !td->o.verify_state_save) return 0; - for_each_file(td, f, i) { - f->last_write_comp = scalloc(depth, sizeof(uint64_t)); - if (!f->last_write_comp) - goto cleanup; + td->inflight_numberio = scalloc(td->o.iodepth, sizeof(uint64_t)); + if (!td->inflight_numberio) { + log_err("fio: failed to alloc inflight write data\n"); + return 1; } + for (i = 0; i < td->o.iodepth; i++) + td->inflight_numberio[i] = INVALID_NUMBERIO; + return 0; +} -cleanup: - free_file_completion_logging(td); - log_err("fio: failed to alloc write comp data\n"); - return 1; +static void free_inflight_logging(struct thread_data *td) +{ + if (td->inflight_numberio) + sfree(td->inflight_numberio); } static void cleanup_io_u(struct thread_data *td) @@ -1232,22 +1338,25 @@ static void cleanup_io_u(struct thread_data *td) fio_memfree(io_u, sizeof(*io_u), td_offload_overlap(td)); } + while ((io_u = io_u_rpop(&td->io_u_requeues)) != NULL) { + put_io_u(td, io_u); + } + free_io_mem(td); io_u_rexit(&td->io_u_requeues); io_u_qexit(&td->io_u_freelist, false); io_u_qexit(&td->io_u_all, td_offload_overlap(td)); - free_file_completion_logging(td); + 
free_inflight_logging(td); } static int init_io_u(struct thread_data *td) { struct io_u *io_u; - int cl_align, i, max_units; + int cl_align, i; int err; - max_units = td->o.iodepth; err = 0; err += !io_u_rinit(&td->io_u_requeues, td->o.iodepth); @@ -1261,7 +1370,7 @@ static int init_io_u(struct thread_data *td) cl_align = os_cache_line_size(); - for (i = 0; i < max_units; i++) { + for (i = 0; i < td->o.iodepth; i++) { void *ptr; if (td->terminate) @@ -1278,6 +1387,7 @@ static int init_io_u(struct thread_data *td) INIT_FLIST_HEAD(&io_u->verify_list); dprint(FD_MEM, "io_u alloc %p, index %u\n", io_u, i); + io_u->inflight_idx = -1; io_u->index = i; io_u->flags = IO_U_F_FREE; io_u_qpush(&td->io_u_freelist, io_u); @@ -1298,9 +1408,10 @@ static int init_io_u(struct thread_data *td) } } - init_io_u_buffers(td); + if (init_io_u_buffers(td)) + return 1; - if (init_file_completion_logging(td, max_units)) + if (init_inflight_logging(td)) return 1; return 0; @@ -1309,7 +1420,7 @@ static int init_io_u(struct thread_data *td) int init_io_u_buffers(struct thread_data *td) { struct io_u *io_u; - unsigned long long max_bs, min_write; + unsigned long long max_bs, min_write, trim_bs = 0; int i, max_units; int data_xfer = 1; char *p; @@ -1320,7 +1431,18 @@ int init_io_u_buffers(struct thread_data *td) td->orig_buffer_size = (unsigned long long) max_bs * (unsigned long long) max_units; - if (td_ioengine_flagged(td, FIO_NOIO) || !(td_read(td) || td_write(td))) + if (td_trim(td) && td->o.num_range > 1) { + trim_bs = td->o.num_range * sizeof(struct trim_range); + td->orig_buffer_size = trim_bs + * (unsigned long long) max_units; + } + + /* + * For reads, writes, and multi-range trim operations we need a + * data buffer + */ + if (td_ioengine_flagged(td, FIO_NOIO) || + !(td_read(td) || td_write(td) || (td_trim(td) && td->o.num_range > 1))) data_xfer = 0; /* @@ -1329,7 +1451,7 @@ int init_io_u_buffers(struct thread_data *td) * overflow later. 
this adjustment may be too much if we get * lucky and the allocator gives us an aligned address. */ - if (td->o.odirect || td->o.mem_align || td->o.oatomic || + if (td->o.odirect || td->o.mem_align || td_ioengine_flagged(td, FIO_RAWIO)) td->orig_buffer_size += page_mask + td->o.mem_align; @@ -1348,7 +1470,7 @@ int init_io_u_buffers(struct thread_data *td) if (data_xfer && allocate_io_mem(td)) return 1; - if (td->o.odirect || td->o.mem_align || td->o.oatomic || + if (td->o.odirect || td->o.mem_align || td_ioengine_flagged(td, FIO_RAWIO)) p = PTR_ALIGN(td->orig_buffer, page_mask) + td->o.mem_align; else @@ -1372,7 +1494,10 @@ int init_io_u_buffers(struct thread_data *td) fill_verify_pattern(td, io_u->buf, max_bs, io_u, 0, 0); } } - p += max_bs; + if (td_trim(td) && td->o.num_range > 1) + p += trim_bs; + else + p += max_bs; } return 0; @@ -1583,8 +1708,11 @@ static uint64_t do_dry_run(struct thread_data *td) io_u_set(td, io_u, IO_U_F_FLIGHT); io_u->error = 0; io_u->resid = 0; - if (ddir_rw(acct_ddir(io_u))) + if (ddir_rw(acct_ddir(io_u))) { + io_u->numberio = td->io_issues[acct_ddir(io_u)]; td->io_issues[acct_ddir(io_u)]++; + } + if (ddir_rw(io_u->ddir)) { io_u_mark_depth(td, 1); td->ts.total_io_u[io_u->ddir]++; @@ -1622,7 +1750,7 @@ static void *thread_main(void *data) uint64_t bytes_done[DDIR_RWDIR_CNT]; int deadlock_loop_cnt; bool clear_state; - int res, ret; + int ret; sk_out_assign(sk_out); free(fd); @@ -1635,6 +1763,10 @@ static void *thread_main(void *data) fio_local_clock_init(); +#ifdef CONFIG_LINUX + prctl(PR_SET_NAME, o->comm); +#endif + dprint(FD_PROCESS, "jobs pid=%d started\n", (int) td->pid); if (is_backend) @@ -1777,28 +1909,35 @@ static void *thread_main(void *data) if (!init_iolog(td)) goto err; + /* ioprio_set() has to be done before td_io_init() */ + if (fio_option_is_set(o, ioprio) || + fio_option_is_set(o, ioprio_class) || + fio_option_is_set(o, ioprio_hint)) { + ret = ioprio_set(IOPRIO_WHO_PROCESS, 0, o->ioprio_class, + o->ioprio, 
o->ioprio_hint); + if (ret == -1) { + td_verror(td, errno, "ioprio_set"); + goto err; + } + td->ioprio = ioprio_value(o->ioprio_class, o->ioprio, + o->ioprio_hint); + td->ts.ioprio = td->ioprio; + } + if (td_io_init(td)) goto err; - if (init_io_u(td)) - goto err; + if (td_ioengine_flagged(td, FIO_SYNCIO) && td->o.iodepth > 1 && td->o.io_submit_mode != IO_MODE_OFFLOAD) { + log_info("note: both iodepth >= 1 and synchronous I/O engine " + "are selected, queue depth will be capped at 1\n"); + } - if (td->io_ops->post_init && td->io_ops->post_init(td)) + if (init_io_u(td)) goto err; if (o->verify_async && verify_async_init(td)) goto err; - if (fio_option_is_set(o, ioprio) || - fio_option_is_set(o, ioprio_class)) { - ret = ioprio_set(IOPRIO_WHO_PROCESS, 0, o->ioprio_class, o->ioprio); - if (ret == -1) { - td_verror(td, errno, "ioprio_set"); - goto err; - } - td->ioprio = ioprio_value(o->ioprio_class, o->ioprio); - } - if (o->cgroup && cgroup_setup(td, cgroup_list, &cgroup_mnt)) goto err; @@ -1814,6 +1953,9 @@ static void *thread_main(void *data) if (!o->create_serialize && setup_files(td)) goto err; + if (td->io_ops->post_init && td->io_ops->post_init(td)) + goto err; + if (!init_random_map(td)) goto err; @@ -1828,7 +1970,7 @@ static void *thread_main(void *data) if (rate_submit_init(td, sk_out)) goto err; - set_epoch_time(td, o->log_unix_epoch); + set_epoch_time(td, o->log_alternate_epoch_clock_id, o->job_start_clock_id); fio_getrusage(&td->ru_start); memcpy(&td->bw_sample_time, &td->epoch, sizeof(td->epoch)); memcpy(&td->iops_sample_time, &td->epoch, sizeof(td->epoch)); @@ -1838,11 +1980,11 @@ static void *thread_main(void *data) if (o->ratemin[DDIR_READ] || o->ratemin[DDIR_WRITE] || o->ratemin[DDIR_TRIM]) { - memcpy(&td->lastrate[DDIR_READ], &td->bw_sample_time, + memcpy(&td->last_rate_check_time[DDIR_READ], &td->bw_sample_time, sizeof(td->bw_sample_time)); - memcpy(&td->lastrate[DDIR_WRITE], &td->bw_sample_time, + memcpy(&td->last_rate_check_time[DDIR_WRITE], 
&td->bw_sample_time, sizeof(td->bw_sample_time)); - memcpy(&td->lastrate[DDIR_TRIM], &td->bw_sample_time, + memcpy(&td->last_rate_check_time[DDIR_TRIM], &td->bw_sample_time, sizeof(td->bw_sample_time)); } @@ -1867,8 +2009,12 @@ static void *thread_main(void *data) if (td->o.verify_only && td_write(td)) verify_bytes = do_dry_run(td); else { + if (!td->o.rand_repeatable) + /* save verify rand state to replay hdr seeds later at verify */ + frand_copy(&td->verify_state_last_do_io, &td->verify_state); do_io(td, bytes_done); - + if (!td->o.rand_repeatable) + frand_copy(&td->verify_state, &td->verify_state_last_do_io); if (!ddir_rw_sum(bytes_done)) { fio_mark_td_terminate(td); verify_bytes = 0; @@ -1908,11 +2054,13 @@ static void *thread_main(void *data) } } while (1); - if (td_read(td) && td->io_bytes[DDIR_READ]) + if (td->io_bytes[DDIR_READ] && (td_read(td) || + ((td->flags & TD_F_VER_BACKLOG) && td_write(td)))) update_runtime(td, elapsed_us, DDIR_READ); if (td_write(td) && td->io_bytes[DDIR_WRITE]) update_runtime(td, elapsed_us, DDIR_WRITE); - if (td_trim(td) && td->io_bytes[DDIR_TRIM]) + if (td->io_bytes[DDIR_TRIM] && (td_trim(td) || + ((td->flags & TD_F_TRIM_BACKLOG) && td_write(td)))) update_runtime(td, elapsed_us, DDIR_TRIM); fio_gettime(&td->start, NULL); fio_sem_up(stat_sem); @@ -1951,13 +2099,23 @@ static void *thread_main(void *data) * another thread is checking its io_u's for overlap */ if (td_offload_overlap(td)) { - int res = pthread_mutex_lock(&overlap_check); - assert(res == 0); + int res; + + res = pthread_mutex_lock(&overlap_check); + if (res) { + td->error = errno; + goto err; + } } td_set_runstate(td, TD_FINISHING); if (td_offload_overlap(td)) { + int res; + res = pthread_mutex_unlock(&overlap_check); - assert(res == 0); + if (res) { + td->error = errno; + goto err; + } } update_rusage_stat(td); @@ -2030,18 +2188,17 @@ static void *thread_main(void *data) static void reap_threads(unsigned int *nr_running, uint64_t *t_rate, uint64_t *m_rate) { - struct 
thread_data *td; unsigned int cputhreads, realthreads, pending; - int i, status, ret; + int ret; /* * reap exited threads (TD_EXITED -> TD_REAPED) */ realthreads = pending = cputhreads = 0; - for_each_td(td, i) { - int flags = 0; + for_each_td(td) { + int flags = 0, status; - if (!strcmp(td->o.ioengine, "cpuio")) + if (!strcmp(td->o.ioengine, "cpuio")) cputhreads++; else realthreads++; @@ -2131,7 +2288,7 @@ static void reap_threads(unsigned int *nr_running, uint64_t *t_rate, done_secs += mtime_since_now(&td->epoch) / 1000; profile_td_exit(td); flow_exit_job(td); - } + } end_for_each(); if (*nr_running == cputhreads && !pending && realthreads) fio_terminate_threads(TERMINATE_ALL, TERMINATE_ALL); @@ -2258,13 +2415,11 @@ static bool waitee_running(struct thread_data *me) { const char *waitee = me->o.wait_for; const char *self = me->o.name; - struct thread_data *td; - int i; if (!waitee) return false; - for_each_td(td, i) { + for_each_td(td) { if (!strcmp(td->o.name, self) || strcmp(td->o.name, waitee)) continue; @@ -2274,7 +2429,7 @@ static bool waitee_running(struct thread_data *me) runstate_to_name(td->runstate)); return true; } - } + } end_for_each(); dprint(FD_PROCESS, "%s: %s completed, can run\n", self, waitee); return false; @@ -2298,14 +2453,14 @@ static void run_threads(struct sk_out *sk_out) set_sig_handlers(); nr_thread = nr_process = 0; - for_each_td(td, i) { + for_each_td(td) { if (check_mount_writes(td)) return; if (td->o.use_thread) nr_thread++; else nr_process++; - } + } end_for_each(); if (output_format & FIO_OUTPUT_NORMAL) { struct buf_output out; @@ -2331,7 +2486,7 @@ static void run_threads(struct sk_out *sk_out) nr_started = 0; m_rate = t_rate = 0; - for_each_td(td, i) { + for_each_td(td) { print_status_init(td->thread_number - 1); if (!td->o.create_serialize) @@ -2367,7 +2522,10 @@ static void run_threads(struct sk_out *sk_out) td_io_close_file(td, f); } } - } + } end_for_each(); + + /* make sure child processes have empty stream buffers before 
fork */ + log_info_flush(); /* start idle threads before io threads start to run */ fio_idle_prof_start(); @@ -2383,7 +2541,7 @@ static void run_threads(struct sk_out *sk_out) /* * create threads (TD_NOT_CREATED -> TD_CREATED) */ - for_each_td(td, i) { + for_each_td(td) { if (td->runstate != TD_NOT_CREATED) continue; @@ -2453,14 +2611,20 @@ static void run_threads(struct sk_out *sk_out) } else { pid_t pid; dprint(FD_PROCESS, "will fork\n"); + read_barrier(); pid = fork(); if (!pid) { int ret; ret = (int)(uintptr_t)thread_main(fd); + /* _exit() does not flush buffers, so + * do it ourselves */ + log_info_flush(); _exit(ret); - } else if (i == fio_debug_jobno) + } else if (__td_index == fio_debug_jobno) *fio_debug_jobp = pid; + free(fd); + fd = NULL; } dprint(FD_MUTEX, "wait on startup_sem\n"); if (fio_sem_down_timeout(startup_sem, 10000)) { @@ -2472,7 +2636,7 @@ static void run_threads(struct sk_out *sk_out) break; } dprint(FD_MUTEX, "done waiting on startup_sem\n"); - } + } end_for_each(); /* * Wait for the started threads to transition to @@ -2517,11 +2681,11 @@ static void run_threads(struct sk_out *sk_out) /* * start created threads (TD_INITIALIZED -> TD_RUNNING). 
*/ - for_each_td(td, i) { + for_each_td(td) { if (td->runstate != TD_INITIALIZED) continue; - if (in_ramp_time(td)) + if (in_ramp_period(td)) td_set_runstate(td, TD_RAMP); else td_set_runstate(td, TD_RUNNING); @@ -2531,7 +2695,7 @@ static void run_threads(struct sk_out *sk_out) t_rate += ddir_rw_sum(td->o.rate); todo--; fio_sem_up(td->sem); - } + } end_for_each(); reap_threads(&nr_running, &t_rate, &m_rate); @@ -2557,9 +2721,7 @@ static void free_disk_util(void) int fio_backend(struct sk_out *sk_out) { - struct thread_data *td; int i; - if (exec_profile) { if (load_profile(exec_profile)) return 1; @@ -2579,6 +2741,11 @@ int fio_backend(struct sk_out *sk_out) setup_log(&agg_io_log[DDIR_TRIM], &p, "agg-trim_bw.log"); } + if (init_global_dedupe_working_set_seeds()) { + log_err("fio: failed to initialize global dedupe working set\n"); + return 1; + } + startup_sem = fio_sem_init(FIO_SEM_LOCKED); if (!sk_out) is_local_backend = true; @@ -2610,7 +2777,10 @@ int fio_backend(struct sk_out *sk_out) } } - for_each_td(td, i) { + for_each_td(td) { + struct thread_stat *ts = &td->ts; + + free_clat_prio_stats(ts); steadystate_free(td); fio_options_free(td); fio_dump_options_free(td); @@ -2620,7 +2790,7 @@ int fio_backend(struct sk_out *sk_out) } fio_sem_remove(td->sem); td->sem = NULL; - } + } end_for_each(); free_disk_util(); if (cgroup_list) { diff --git a/blktrace.c b/blktrace.c index 64a610a959..ef9ce6bffd 100644 --- a/blktrace.c +++ b/blktrace.c @@ -4,71 +4,35 @@ #include #include #include +#include +#include #include "flist.h" #include "fio.h" +#include "iolog.h" #include "blktrace.h" #include "blktrace_api.h" #include "oslib/linux-dev-lookup.h" -#define TRACE_FIFO_SIZE 8192 - -/* - * fifo refill frontend, to avoid reading data in trace sized bites - */ -static int refill_fifo(struct thread_data *td, struct fifo *fifo, int fd) -{ - char buf[TRACE_FIFO_SIZE]; - unsigned int total; - int ret; - - total = sizeof(buf); - if (total > fifo_room(fifo)) - total = fifo_room(fifo); 
- - ret = read(fd, buf, total); - if (ret < 0) { - int read_err = errno; - - assert(read_err > 0); - td_verror(td, read_err, "read blktrace file"); - return -read_err; - } - - if (ret > 0) - ret = fifo_put(fifo, buf, ret); - - dprint(FD_BLKTRACE, "refill: filled %d bytes\n", ret); - return ret; -} - -/* - * Retrieve 'len' bytes from the fifo, refilling if necessary. - */ -static int trace_fifo_get(struct thread_data *td, struct fifo *fifo, int fd, - void *buf, unsigned int len) -{ - if (fifo_len(fifo) < len) { - int ret = refill_fifo(td, fifo, fd); - - if (ret < 0) - return ret; - } - - return fifo_get(fifo, buf, len); -} +struct file_cache { + unsigned int maj; + unsigned int min; + unsigned int fileno; +}; /* * Just discard the pdu by seeking past it. */ -static int discard_pdu(struct thread_data *td, struct fifo *fifo, int fd, - struct blk_io_trace *t) +static int discard_pdu(FILE* f, struct blk_io_trace *t) { if (t->pdu_len == 0) return 0; dprint(FD_BLKTRACE, "discard pdu len %u\n", t->pdu_len); - return trace_fifo_get(td, fifo, fd, NULL, t->pdu_len); + if (fseek(f, t->pdu_len, SEEK_CUR) < 0) + return -errno; + + return t->pdu_len; } /* @@ -130,28 +94,28 @@ static void trace_add_open_close_event(struct thread_data *td, int fileno, enum flist_add_tail(&ipo->list, &td->io_log_list); } -static int trace_add_file(struct thread_data *td, __u32 device) +static int trace_add_file(struct thread_data *td, __u32 device, + struct file_cache *cache) { - static unsigned int last_maj, last_min, last_fileno; unsigned int maj = FMAJOR(device); unsigned int min = FMINOR(device); struct fio_file *f; char dev[256]; unsigned int i; - if (last_maj == maj && last_min == min) - return last_fileno; + if (cache->maj == maj && cache->min == min) + return cache->fileno; - last_maj = maj; - last_min = min; + cache->maj = maj; + cache->min = min; /* * check for this file in our list */ for_each_file(td, f, i) if (f->major == maj && f->minor == min) { - last_fileno = f->fileno; - return 
last_fileno; + cache->fileno = f->fileno; + return cache->fileno; } strcpy(dev, "/dev"); @@ -171,10 +135,10 @@ static int trace_add_file(struct thread_data *td, __u32 device) td->files[fileno]->major = maj; td->files[fileno]->minor = min; trace_add_open_close_event(td, fileno, FIO_LOG_OPEN_FILE); - last_fileno = fileno; + cache->fileno = fileno; } - return last_fileno; + return cache->fileno; } static void t_bytes_align(struct thread_options *o, struct blk_io_trace *t) @@ -215,7 +179,7 @@ static void store_ipo(struct thread_data *td, unsigned long long offset, queue_io_piece(td, ipo); } -static void handle_trace_notify(struct blk_io_trace *t) +static bool handle_trace_notify(struct blk_io_trace *t) { switch (t->action) { case BLK_TN_PROCESS: @@ -232,22 +196,24 @@ static void handle_trace_notify(struct blk_io_trace *t) dprint(FD_BLKTRACE, "unknown trace act %x\n", t->action); break; } + return false; } -static void handle_trace_discard(struct thread_data *td, +static bool handle_trace_discard(struct thread_data *td, struct blk_io_trace *t, unsigned long long ttime, - unsigned long *ios, unsigned int *bs) + unsigned long *ios, unsigned long long *bs, + struct file_cache *cache) { struct io_piece *ipo; int fileno; if (td->o.replay_skip & (1u << DDIR_TRIM)) - return; + return false; ipo = calloc(1, sizeof(*ipo)); init_ipo(ipo); - fileno = trace_add_file(td, t->device); + fileno = trace_add_file(td, t->device, cache); ios[DDIR_TRIM]++; if (t->bytes > bs[DDIR_TRIM]) @@ -270,6 +236,7 @@ static void handle_trace_discard(struct thread_data *td, ipo->offset, ipo->len, ipo->delay); queue_io_piece(td, ipo); + return true; } static void dump_trace(struct blk_io_trace *t) @@ -277,29 +244,29 @@ static void dump_trace(struct blk_io_trace *t) log_err("blktrace: ignoring zero byte trace: action=%x\n", t->action); } -static void handle_trace_fs(struct thread_data *td, struct blk_io_trace *t, +static bool handle_trace_fs(struct thread_data *td, struct blk_io_trace *t, unsigned long 
long ttime, unsigned long *ios, - unsigned int *bs) + unsigned long long *bs, struct file_cache *cache) { int rw; int fileno; - fileno = trace_add_file(td, t->device); + fileno = trace_add_file(td, t->device, cache); rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0; if (rw) { if (td->o.replay_skip & (1u << DDIR_WRITE)) - return; + return false; } else { if (td->o.replay_skip & (1u << DDIR_READ)) - return; + return false; } if (!t->bytes) { if (!fio_did_warn(FIO_WARN_BTRACE_ZERO)) dump_trace(t); - return; + return false; } if (t->bytes > bs[rw]) @@ -308,20 +275,22 @@ static void handle_trace_fs(struct thread_data *td, struct blk_io_trace *t, ios[rw]++; td->o.size += t->bytes; store_ipo(td, t->sector, t->bytes, rw, ttime, fileno); + return true; } -static void handle_trace_flush(struct thread_data *td, struct blk_io_trace *t, - unsigned long long ttime, unsigned long *ios) +static bool handle_trace_flush(struct thread_data *td, struct blk_io_trace *t, + unsigned long long ttime, unsigned long *ios, + struct file_cache *cache) { struct io_piece *ipo; int fileno; if (td->o.replay_skip & (1u << DDIR_SYNC)) - return; + return false; ipo = calloc(1, sizeof(*ipo)); init_ipo(ipo); - fileno = trace_add_file(td, t->device); + fileno = trace_add_file(td, t->device, cache); ipo->delay = ttime / 1000; ipo->ddir = DDIR_SYNC; @@ -329,48 +298,43 @@ static void handle_trace_flush(struct thread_data *td, struct blk_io_trace *t, ios[DDIR_SYNC]++; dprint(FD_BLKTRACE, "store flush delay=%lu\n", ipo->delay); + + if (!(td->flags & TD_F_SYNCS)) + td->flags |= TD_F_SYNCS; + queue_io_piece(td, ipo); + return true; } /* * We only care for queue traces, most of the others are side effects * due to internal workings of the block layer. 
*/ -static void handle_trace(struct thread_data *td, struct blk_io_trace *t, - unsigned long *ios, unsigned int *bs) +static bool queue_trace(struct thread_data *td, struct blk_io_trace *t, + unsigned long *ios, unsigned long long *bs, + struct file_cache *cache) { - static unsigned long long last_ttime; + unsigned long long *last_ttime = &td->io_log_last_ttime; unsigned long long delay = 0; if ((t->action & 0xffff) != __BLK_TA_QUEUE) - return; + return false; if (!(t->action & BLK_TC_ACT(BLK_TC_NOTIFY))) { - if (!last_ttime || td->o.no_stall) - delay = 0; - else if (td->o.replay_time_scale == 100) - delay = t->time - last_ttime; - else { - double tmp = t->time - last_ttime; - double scale; - - scale = (double) 100.0 / (double) td->o.replay_time_scale; - tmp *= scale; - delay = tmp; - } - last_ttime = t->time; + delay = delay_since_ttime(td, t->time); + *last_ttime = t->time; } t_bytes_align(&td->o, t); if (t->action & BLK_TC_ACT(BLK_TC_NOTIFY)) - handle_trace_notify(t); + return handle_trace_notify(t); else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD)) - handle_trace_discard(td, t, delay, ios, bs); + return handle_trace_discard(td, t, delay, ios, bs, cache); else if (t->action & BLK_TC_ACT(BLK_TC_FLUSH)) - handle_trace_flush(td, t, delay, ios); + return handle_trace_flush(td, t, delay, ios, cache); else - handle_trace_fs(td, t, delay, ios, bs); + return handle_trace_fs(td, t, delay, ios, bs, cache); } static void byteswap_trace(struct blk_io_trace *t) @@ -438,43 +402,82 @@ static void depth_end(struct blk_io_trace *t, int *this_depth, int *depth) * Load a blktrace file by reading all the blk_io_trace entries, and storing * them as io_pieces like the fio text version would do. 
*/ -bool load_blktrace(struct thread_data *td, const char *filename, int need_swap) +bool init_blktrace_read(struct thread_data *td, const char *filename, int need_swap) +{ + int old_state; + + td->io_log_rfile = fopen(filename, "rb"); + if (!td->io_log_rfile) { + td_verror(td, errno, "open blktrace file"); + goto err; + } + td->io_log_blktrace_swap = need_swap; + td->io_log_last_ttime = 0; + td->o.size = 0; + + free_release_files(td); + + old_state = td_bump_runstate(td, TD_SETTING_UP); + + if (!read_blktrace(td)) { + goto err; + } + + td_restore_runstate(td, old_state); + + if (!td->files_index) { + log_err("fio: did not find replay device(s)\n"); + return false; + } + + return true; + +err: + if (td->io_log_rfile) { + fclose(td->io_log_rfile); + td->io_log_rfile = NULL; + } + return false; +} + +bool read_blktrace(struct thread_data* td) { struct blk_io_trace t; + struct file_cache cache = { + .maj = ~0U, + .min = ~0U, + }; unsigned long ios[DDIR_RWDIR_SYNC_CNT] = { }; - unsigned int rw_bs[DDIR_RWDIR_CNT] = { }; + unsigned long long rw_bs[DDIR_RWDIR_CNT] = { }; unsigned long skipped_writes; - struct fifo *fifo; - int fd, i, old_state, max_depth; - struct fio_file *f; + FILE *f = td->io_log_rfile; + int i, max_depth; + struct fio_file *fiof; int this_depth[DDIR_RWDIR_CNT] = { }; int depth[DDIR_RWDIR_CNT] = { }; + int64_t items_to_fetch = 0; - fd = open(filename, O_RDONLY); - if (fd < 0) { - td_verror(td, errno, "open blktrace file"); - return false; + if (td->o.read_iolog_chunked) { + items_to_fetch = iolog_items_to_fetch(td); + if (!items_to_fetch) + return true; } - fifo = fifo_alloc(TRACE_FIFO_SIZE); - - old_state = td_bump_runstate(td, TD_SETTING_UP); - - td->o.size = 0; skipped_writes = 0; do { - int ret = trace_fifo_get(td, fifo, fd, &t, sizeof(t)); + int ret = fread(&t, 1, sizeof(t), f); - if (ret < 0) + if (ferror(f)) { + td_verror(td, errno, "read blktrace file"); goto err; - else if (!ret) + } else if (feof(f)) { break; - else if (ret < (int) sizeof(t)) 
{ - log_err("fio: short fifo get\n"); + } else if (ret < (int) sizeof(t)) { + log_err("fio: iolog short read\n"); break; } - if (need_swap) + if (td->io_log_blktrace_swap) byteswap_trace(&t); if ((t.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) { @@ -487,13 +490,10 @@ bool load_blktrace(struct thread_data *td, const char *filename, int need_swap) t.magic & 0xff); goto err; } - ret = discard_pdu(td, fifo, fd, &t); + ret = discard_pdu(f, &t); if (ret < 0) { td_verror(td, -ret, "blktrace lseek"); goto err; - } else if (t.pdu_len != ret) { - log_err("fio: discarded %d of %d\n", ret, t.pdu_len); - goto err; } if ((t.action & BLK_TC_ACT(BLK_TC_NOTIFY)) == 0) { if ((t.action & 0xffff) == __BLK_TA_QUEUE) @@ -510,22 +510,54 @@ bool load_blktrace(struct thread_data *td, const char *filename, int need_swap) } } - handle_trace(td, &t, ios, rw_bs); - } while (1); + if (!queue_trace(td, &t, ios, rw_bs, &cache)) + continue; - for_each_file(td, f, i) - trace_add_open_close_event(td, f->fileno, FIO_LOG_CLOSE_FILE); + if (td->o.read_iolog_chunked) { + td->io_log_current++; + items_to_fetch--; + if (items_to_fetch == 0) + break; + } + } while (1); - fifo_free(fifo); - close(fd); + if (td->o.read_iolog_chunked) { + td->io_log_highmark = td->io_log_current; + td->io_log_checkmark = (td->io_log_highmark + 1) / 2; + fio_gettime(&td->io_log_highmark_time, NULL); + } - td_restore_runstate(td, old_state); + if (skipped_writes) + log_err("fio: %s skips replay of %lu writes due to read-only\n", + td->o.name, skipped_writes); - if (!td->files_index) { - log_err("fio: did not find replay device(s)\n"); - return false; + if (td->o.read_iolog_chunked) { + if (td->io_log_current == 0) { + return false; + } + td->o.td_ddir = TD_DDIR_RW; + if ((rw_bs[DDIR_READ] > td->o.max_bs[DDIR_READ] || + rw_bs[DDIR_WRITE] > td->o.max_bs[DDIR_WRITE] || + rw_bs[DDIR_TRIM] > td->o.max_bs[DDIR_TRIM]) && + td->orig_buffer) + { + td->o.max_bs[DDIR_READ] = max(td->o.max_bs[DDIR_READ], rw_bs[DDIR_READ]); + 
td->o.max_bs[DDIR_WRITE] = max(td->o.max_bs[DDIR_WRITE], rw_bs[DDIR_WRITE]); + td->o.max_bs[DDIR_TRIM] = max(td->o.max_bs[DDIR_TRIM], rw_bs[DDIR_TRIM]); + io_u_quiesce(td); + free_io_mem(td); + if (init_io_u_buffers(td)) + return false; + } + return true; } + for_each_file(td, fiof, i) + trace_add_open_close_event(td, fiof->fileno, FIO_LOG_CLOSE_FILE); + + fclose(td->io_log_rfile); + td->io_log_rfile = NULL; + /* * For stacked devices, we don't always get a COMPLETE event so * the depth grows to insane values. Limit it to something sane(r). @@ -539,10 +571,6 @@ bool load_blktrace(struct thread_data *td, const char *filename, int need_swap) max_depth = max(depth[i], max_depth); } - if (skipped_writes) - log_err("fio: %s skips replay of %lu writes due to read-only\n", - td->o.name, skipped_writes); - if (!ios[DDIR_READ] && !ios[DDIR_WRITE] && !ios[DDIR_TRIM] && !ios[DDIR_SYNC]) { log_err("fio: found no ios in blktrace data\n"); @@ -563,14 +591,6 @@ bool load_blktrace(struct thread_data *td, const char *filename, int need_swap) td->o.max_bs[DDIR_TRIM] = rw_bs[DDIR_TRIM]; } - /* - * We need to do direct/raw ios to the device, to avoid getting - * read-ahead in our way. But only do so if the minimum block size - * is a multiple of 4k, otherwise we don't know if it's safe to do so. 
- */ - if (!fio_option_is_set(&td->o, odirect) && !(td_min_bs(td) & 4095)) - td->o.odirect = 1; - /* * If depth wasn't manually set, use probed depth */ @@ -579,8 +599,7 @@ bool load_blktrace(struct thread_data *td, const char *filename, int need_swap) return true; err: - close(fd); - fifo_free(fifo); + fclose(f); return false; } @@ -625,15 +644,14 @@ static void merge_finish_file(struct blktrace_cursor *bcs, int i, int *nr_logs) { bcs[i].iter++; if (bcs[i].iter < bcs[i].nr_iter) { - lseek(bcs[i].fd, 0, SEEK_SET); + fseek(bcs[i].f, 0, SEEK_SET); return; } *nr_logs -= 1; /* close file */ - fifo_free(bcs[i].fifo); - close(bcs[i].fd); + fclose(bcs[i].f); /* keep active files contiguous */ memmove(&bcs[i], &bcs[*nr_logs], sizeof(bcs[i])); @@ -646,15 +664,16 @@ static int read_trace(struct thread_data *td, struct blktrace_cursor *bc) read_skip: /* read an io trace */ - ret = trace_fifo_get(td, bc->fifo, bc->fd, t, sizeof(*t)); - if (ret < 0) { + ret = fread(&t, 1, sizeof(t), bc->f); + if (ferror(bc->f)) { + td_verror(td, errno, "read blktrace file"); return ret; - } else if (!ret) { + } else if (feof(bc->f)) { if (!bc->length) bc->length = bc->t.time; return ret; } else if (ret < (int) sizeof(*t)) { - log_err("fio: short fifo get\n"); + log_err("fio: iolog short read\n"); return -1; } @@ -664,14 +683,10 @@ static int read_trace(struct thread_data *td, struct blktrace_cursor *bc) /* skip over actions that fio does not care about */ if ((t->action & 0xffff) != __BLK_TA_QUEUE || t_get_ddir(t) == DDIR_INVAL) { - ret = discard_pdu(td, bc->fifo, bc->fd, t); + ret = discard_pdu(bc->f, t); if (ret < 0) { td_verror(td, -ret, "blktrace lseek"); return ret; - } else if (t->pdu_len != ret) { - log_err("fio: discarded %d of %d\n", ret, - t->pdu_len); - return -1; } goto read_skip; } @@ -729,14 +744,13 @@ int merge_blktrace_iologs(struct thread_data *td) str = ptr = strdup(td->o.read_iolog_file); nr_logs = 0; for (i = 0; (name = get_next_str(&ptr)) != NULL; i++) { - bcs[i].fd = 
open(name, O_RDONLY); - if (bcs[i].fd < 0) { + bcs[i].f = fopen(name, "rb"); + if (!bcs[i].f) { log_err("fio: could not open file: %s\n", name); - ret = bcs[i].fd; + ret = -errno; free(str); goto err_file; } - bcs[i].fifo = fifo_alloc(TRACE_FIFO_SIZE); nr_logs++; if (!is_blktrace(name, &bcs[i].swap)) { @@ -761,14 +775,10 @@ int merge_blktrace_iologs(struct thread_data *td) i = find_earliest_io(bcs, nr_logs); bc = &bcs[i]; /* skip over the pdu */ - ret = discard_pdu(td, bc->fifo, bc->fd, &bc->t); + ret = discard_pdu(bc->f, &bc->t); if (ret < 0) { td_verror(td, -ret, "blktrace lseek"); goto err_file; - } else if (bc->t.pdu_len != ret) { - log_err("fio: discarded %d of %d\n", ret, - bc->t.pdu_len); - goto err_file; } ret = write_trace(merge_fp, &bc->t); @@ -786,8 +796,7 @@ int merge_blktrace_iologs(struct thread_data *td) err_file: /* cleanup */ for (i = 0; i < nr_logs; i++) { - fifo_free(bcs[i].fifo); - close(bcs[i].fd); + fclose(bcs[i].f); } err_merge_buf: free(merge_buf); diff --git a/blktrace.h b/blktrace.h index a0e82faa05..c53b717ba4 100644 --- a/blktrace.h +++ b/blktrace.h @@ -10,7 +10,7 @@ struct blktrace_cursor { struct fifo *fifo; // fifo queue for reading - int fd; // blktrace file + FILE *f; // blktrace file __u64 length; // length of trace struct blk_io_trace t; // current io trace int swap; // bitwise reverse required @@ -20,7 +20,9 @@ struct blktrace_cursor { }; bool is_blktrace(const char *, int *); -bool load_blktrace(struct thread_data *, const char *, int); +bool init_blktrace_read(struct thread_data *, const char *, int); +bool read_blktrace(struct thread_data* td); + int merge_blktrace_iologs(struct thread_data *td); #else @@ -30,12 +32,18 @@ static inline bool is_blktrace(const char *fname, int *need_swap) return false; } -static inline bool load_blktrace(struct thread_data *td, const char *fname, +static inline bool init_blktrace_read(struct thread_data *td, const char *fname, int need_swap) { return false; } +static inline bool 
read_blktrace(struct thread_data* td) +{ + return false; +} + + static inline int merge_blktrace_iologs(struct thread_data *td) { return false; diff --git a/cairo_text_helpers.c b/cairo_text_helpers.c index 19fb8e03c1..5bdd60219f 100644 --- a/cairo_text_helpers.c +++ b/cairo_text_helpers.c @@ -1,3 +1,5 @@ +#include "cairo_text_helpers.h" + #include #include #include diff --git a/cairo_text_helpers.h b/cairo_text_helpers.h index 014001ad2f..d0f52d51ff 100644 --- a/cairo_text_helpers.h +++ b/cairo_text_helpers.h @@ -1,6 +1,8 @@ #ifndef CAIRO_TEXT_HELPERS_H #define CAIRO_TEXT_HELPERS_H +#include + void draw_centered_text(cairo_t *cr, const char *font, double x, double y, double fontsize, const char *text); diff --git a/cconv.c b/cconv.c index 4f8d27eb2d..9f82c724f4 100644 --- a/cconv.c +++ b/cconv.c @@ -34,6 +34,7 @@ static void free_thread_options_to_cpu(struct thread_options *o) free(o->opendir); free(o->ioengine); free(o->mmapfile); + free(o->comm); free(o->read_iolog_file); free(o->write_iolog_file); free(o->merge_blktrace_file); @@ -48,14 +49,24 @@ static void free_thread_options_to_cpu(struct thread_options *o) free(o->profile); free(o->cgroup); + free(o->verify_pattern); + free(o->buffer_pattern); + for (i = 0; i < DDIR_RWDIR_CNT; i++) { free(o->bssplit[i]); free(o->zone_split[i]); } } -void convert_thread_options_to_cpu(struct thread_options *o, - struct thread_options_pack *top) +size_t thread_options_pack_size(struct thread_options *o) +{ + return sizeof(struct thread_options_pack) + o->verify_pattern_bytes + + o->buffer_pattern_bytes; +} + +int convert_thread_options_to_cpu(struct thread_options *o, + struct thread_options_pack *top, + size_t top_sz) { int i, j; @@ -71,6 +82,7 @@ void convert_thread_options_to_cpu(struct thread_options *o, string_to_cpu(&o->opendir, top->opendir); string_to_cpu(&o->ioengine, top->ioengine); string_to_cpu(&o->mmapfile, top->mmapfile); + string_to_cpu(&o->comm, top->comm); string_to_cpu(&o->read_iolog_file, 
top->read_iolog_file); string_to_cpu(&o->write_iolog_file, top->write_iolog_file); string_to_cpu(&o->merge_blktrace_file, top->merge_blktrace_file); @@ -84,6 +96,7 @@ void convert_thread_options_to_cpu(struct thread_options *o, string_to_cpu(&o->ioscheduler, top->ioscheduler); string_to_cpu(&o->profile, top->profile); string_to_cpu(&o->cgroup, top->cgroup); + string_to_cpu(&o->dp_scheme_file, top->dp_scheme_file); o->allow_create = le32_to_cpu(top->allow_create); o->allow_mounted_write = le32_to_cpu(top->allow_mounted_write); @@ -101,6 +114,7 @@ void convert_thread_options_to_cpu(struct thread_options *o, o->serialize_overlap = le32_to_cpu(top->serialize_overlap); o->size = le64_to_cpu(top->size); o->io_size = le64_to_cpu(top->io_size); + o->num_range = le32_to_cpu(top->num_range); o->size_percent = le32_to_cpu(top->size_percent); o->io_size_percent = le32_to_cpu(top->io_size_percent); o->fill_device = le32_to_cpu(top->fill_device); @@ -160,6 +174,7 @@ void convert_thread_options_to_cpu(struct thread_options *o, o->create_fsync = le32_to_cpu(top->create_fsync); o->create_on_open = le32_to_cpu(top->create_on_open); o->create_only = le32_to_cpu(top->create_only); + o->filetype = le32_to_cpu(top->filetype); o->end_fsync = le32_to_cpu(top->end_fsync); o->pre_read = le32_to_cpu(top->pre_read); o->sync_io = le32_to_cpu(top->sync_io); @@ -170,11 +185,25 @@ void convert_thread_options_to_cpu(struct thread_options *o, o->verify_state = le32_to_cpu(top->verify_state); o->verify_interval = le32_to_cpu(top->verify_interval); o->verify_offset = le32_to_cpu(top->verify_offset); - - memcpy(o->verify_pattern, top->verify_pattern, MAX_PATTERN_SIZE); - memcpy(o->buffer_pattern, top->buffer_pattern, MAX_PATTERN_SIZE); + o->verify_write_sequence = le32_to_cpu(top->verify_write_sequence); + o->verify_header_seed = le32_to_cpu(top->verify_header_seed); o->verify_pattern_bytes = le32_to_cpu(top->verify_pattern_bytes); + o->verify_pattern_interval = 
le32_to_cpu(top->verify_pattern_interval); + o->buffer_pattern_bytes = le32_to_cpu(top->buffer_pattern_bytes); + if (o->verify_pattern_bytes >= MAX_PATTERN_SIZE || + o->buffer_pattern_bytes >= MAX_PATTERN_SIZE || + thread_options_pack_size(o) > top_sz) + return -EINVAL; + + o->verify_pattern = realloc(o->verify_pattern, + o->verify_pattern_bytes); + o->buffer_pattern = realloc(o->buffer_pattern, + o->buffer_pattern_bytes); + memcpy(o->verify_pattern, top->patterns, o->verify_pattern_bytes); + memcpy(o->buffer_pattern, &top->patterns[o->verify_pattern_bytes], + o->buffer_pattern_bytes); + o->verify_fatal = le32_to_cpu(top->verify_fatal); o->verify_dump = le32_to_cpu(top->verify_dump); o->verify_async = le32_to_cpu(top->verify_async); @@ -185,7 +214,6 @@ void convert_thread_options_to_cpu(struct thread_options *o, o->do_disk_util = le32_to_cpu(top->do_disk_util); o->override_sync = le32_to_cpu(top->override_sync); o->rand_repeatable = le32_to_cpu(top->rand_repeatable); - o->allrand_repeatable = le32_to_cpu(top->allrand_repeatable); o->rand_seed = le64_to_cpu(top->rand_seed); o->log_entries = le32_to_cpu(top->log_entries); o->log_avg_msec = le32_to_cpu(top->log_avg_msec); @@ -194,11 +222,18 @@ void convert_thread_options_to_cpu(struct thread_options *o, o->log_max = le32_to_cpu(top->log_max); o->log_offset = le32_to_cpu(top->log_offset); o->log_prio = le32_to_cpu(top->log_prio); + o->log_issue_time = le32_to_cpu(top->log_issue_time); o->log_gz = le32_to_cpu(top->log_gz); o->log_gz_store = le32_to_cpu(top->log_gz_store); - o->log_unix_epoch = le32_to_cpu(top->log_unix_epoch); + o->log_alternate_epoch = le32_to_cpu(top->log_alternate_epoch); + o->log_alternate_epoch_clock_id = le32_to_cpu(top->log_alternate_epoch_clock_id); + o->job_start_clock_id = le32_to_cpu(top->job_start_clock_id); o->norandommap = le32_to_cpu(top->norandommap); o->softrandommap = le32_to_cpu(top->softrandommap); + o->sprandom = le32_to_cpu(top->sprandom); + o->spr_num_regions = 
le32_to_cpu(top->spr_num_regions); + o->spr_over_provisioning.u.f = fio_uint64_to_double(le64_to_cpu(top->spr_over_provisioning.u.i)); + o->spr_cache_size = le64_to_cpu(top->spr_cache_size); o->bs_unaligned = le32_to_cpu(top->bs_unaligned); o->fsync_on_close = le32_to_cpu(top->fsync_on_close); o->bs_is_seq_rand = le32_to_cpu(top->bs_is_seq_rand); @@ -211,6 +246,7 @@ void convert_thread_options_to_cpu(struct thread_options *o, o->random_generator = le32_to_cpu(top->random_generator); o->hugepage_size = le32_to_cpu(top->hugepage_size); o->rw_min_bs = le64_to_cpu(top->rw_min_bs); + o->thinkcycles = le32_to_cpu(top->thinkcycles); o->thinktime = le32_to_cpu(top->thinktime); o->thinktime_spin = le32_to_cpu(top->thinktime_spin); o->thinktime_blocks = le32_to_cpu(top->thinktime_blocks); @@ -225,10 +261,12 @@ void convert_thread_options_to_cpu(struct thread_options *o, o->start_delay_high = le64_to_cpu(top->start_delay_high); o->timeout = le64_to_cpu(top->timeout); o->ramp_time = le64_to_cpu(top->ramp_time); + o->ramp_size = le64_to_cpu(top->ramp_size); o->ss_dur = le64_to_cpu(top->ss_dur); o->ss_ramp_time = le64_to_cpu(top->ss_ramp_time); o->ss_state = le32_to_cpu(top->ss_state); o->ss_limit.u.f = fio_uint64_to_double(le64_to_cpu(top->ss_limit.u.i)); + o->ss_check_interval = le64_to_cpu(top->ss_check_interval); o->zone_range = le64_to_cpu(top->zone_range); o->zone_size = le64_to_cpu(top->zone_size); o->zone_capacity = le64_to_cpu(top->zone_capacity); @@ -236,6 +274,7 @@ void convert_thread_options_to_cpu(struct thread_options *o, o->zone_mode = le32_to_cpu(top->zone_mode); o->max_open_zones = __le32_to_cpu(top->max_open_zones); o->ignore_zone_limits = le32_to_cpu(top->ignore_zone_limits); + o->recover_zbd_write_error = le32_to_cpu(top->recover_zbd_write_error); o->lockmem = le64_to_cpu(top->lockmem); o->offset_increment_percent = le32_to_cpu(top->offset_increment_percent); o->offset_increment = le64_to_cpu(top->offset_increment); @@ -258,6 +297,7 @@ void 
convert_thread_options_to_cpu(struct thread_options *o, o->nice = le32_to_cpu(top->nice); o->ioprio = le32_to_cpu(top->ioprio); o->ioprio_class = le32_to_cpu(top->ioprio_class); + o->ioprio_hint = le32_to_cpu(top->ioprio_hint); o->file_service_type = le32_to_cpu(top->file_service_type); o->group_reporting = le32_to_cpu(top->group_reporting); o->stats = le32_to_cpu(top->stats); @@ -266,7 +306,6 @@ void convert_thread_options_to_cpu(struct thread_options *o, o->zero_buffers = le32_to_cpu(top->zero_buffers); o->refill_buffers = le32_to_cpu(top->refill_buffers); o->scramble_buffers = le32_to_cpu(top->scramble_buffers); - o->buffer_pattern_bytes = le32_to_cpu(top->buffer_pattern_bytes); o->time_based = le32_to_cpu(top->time_based); o->disable_lat = le32_to_cpu(top->disable_lat); o->disable_clat = le32_to_cpu(top->disable_clat); @@ -303,6 +342,7 @@ void convert_thread_options_to_cpu(struct thread_options *o, o->dedupe_percentage = le32_to_cpu(top->dedupe_percentage); o->dedupe_mode = le32_to_cpu(top->dedupe_mode); o->dedupe_working_set_percentage = le32_to_cpu(top->dedupe_working_set_percentage); + o->dedupe_global = le32_to_cpu(top->dedupe_global); o->block_error_hist = le32_to_cpu(top->block_error_hist); o->replay_align = le32_to_cpu(top->replay_align); o->replay_scale = le32_to_cpu(top->replay_scale); @@ -326,11 +366,20 @@ void convert_thread_options_to_cpu(struct thread_options *o, for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) o->merge_blktrace_iters[i].u.f = fio_uint64_to_double(le64_to_cpu(top->merge_blktrace_iters[i].u.i)); + + o->fdp = le32_to_cpu(top->fdp); + o->dp_type = le32_to_cpu(top->dp_type); + o->dp_id_select = le32_to_cpu(top->dp_id_select); + o->dp_nr_ids = le32_to_cpu(top->dp_nr_ids); + for (i = 0; i < o->dp_nr_ids; i++) + o->dp_ids[i] = le16_to_cpu(top->dp_ids[i]); #if 0 uint8_t cpumask[FIO_TOP_STR_MAX]; uint8_t verify_cpumask[FIO_TOP_STR_MAX]; uint8_t log_gz_cpumask[FIO_TOP_STR_MAX]; #endif + + return 0; } void convert_thread_options_to_net(struct 
thread_options_pack *top, @@ -350,6 +399,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top, string_to_net(top->opendir, o->opendir); string_to_net(top->ioengine, o->ioengine); string_to_net(top->mmapfile, o->mmapfile); + string_to_net(top->comm, o->comm); string_to_net(top->read_iolog_file, o->read_iolog_file); string_to_net(top->write_iolog_file, o->write_iolog_file); string_to_net(top->merge_blktrace_file, o->merge_blktrace_file); @@ -363,6 +413,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top, string_to_net(top->ioscheduler, o->ioscheduler); string_to_net(top->profile, o->profile); string_to_net(top->cgroup, o->cgroup); + string_to_net(top->dp_scheme_file, o->dp_scheme_file); top->allow_create = cpu_to_le32(o->allow_create); top->allow_mounted_write = cpu_to_le32(o->allow_mounted_write); @@ -394,6 +445,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top, top->create_fsync = cpu_to_le32(o->create_fsync); top->create_on_open = cpu_to_le32(o->create_on_open); top->create_only = cpu_to_le32(o->create_only); + top->filetype = cpu_to_le32(o->filetype); top->end_fsync = cpu_to_le32(o->end_fsync); top->pre_read = cpu_to_le32(o->pre_read); top->sync_io = cpu_to_le32(o->sync_io); @@ -404,7 +456,10 @@ void convert_thread_options_to_net(struct thread_options_pack *top, top->verify_state = cpu_to_le32(o->verify_state); top->verify_interval = cpu_to_le32(o->verify_interval); top->verify_offset = cpu_to_le32(o->verify_offset); + top->verify_write_sequence = cpu_to_le32(o->verify_write_sequence); + top->verify_header_seed = cpu_to_le32(o->verify_header_seed); top->verify_pattern_bytes = cpu_to_le32(o->verify_pattern_bytes); + top->verify_pattern_interval = cpu_to_le32(o->verify_pattern_interval); top->verify_fatal = cpu_to_le32(o->verify_fatal); top->verify_dump = cpu_to_le32(o->verify_dump); top->verify_async = cpu_to_le32(o->verify_async); @@ -415,18 +470,24 @@ void convert_thread_options_to_net(struct 
thread_options_pack *top, top->do_disk_util = cpu_to_le32(o->do_disk_util); top->override_sync = cpu_to_le32(o->override_sync); top->rand_repeatable = cpu_to_le32(o->rand_repeatable); - top->allrand_repeatable = cpu_to_le32(o->allrand_repeatable); top->rand_seed = __cpu_to_le64(o->rand_seed); top->log_entries = cpu_to_le32(o->log_entries); top->log_avg_msec = cpu_to_le32(o->log_avg_msec); top->log_max = cpu_to_le32(o->log_max); top->log_offset = cpu_to_le32(o->log_offset); top->log_prio = cpu_to_le32(o->log_prio); + top->log_issue_time = cpu_to_le32(o->log_issue_time); top->log_gz = cpu_to_le32(o->log_gz); top->log_gz_store = cpu_to_le32(o->log_gz_store); - top->log_unix_epoch = cpu_to_le32(o->log_unix_epoch); + top->log_alternate_epoch = cpu_to_le32(o->log_alternate_epoch); + top->log_alternate_epoch_clock_id = cpu_to_le32(o->log_alternate_epoch_clock_id); + top->job_start_clock_id = cpu_to_le32(o->job_start_clock_id); top->norandommap = cpu_to_le32(o->norandommap); top->softrandommap = cpu_to_le32(o->softrandommap); + top->sprandom = cpu_to_le32(o->sprandom); + top->spr_num_regions = cpu_to_le32(o->spr_num_regions); + top->spr_over_provisioning.u.i = __cpu_to_le64(fio_double_to_uint64(o->spr_over_provisioning.u.f)); + top->spr_cache_size = __cpu_to_le64(o->spr_cache_size); top->bs_unaligned = cpu_to_le32(o->bs_unaligned); top->fsync_on_close = cpu_to_le32(o->fsync_on_close); top->bs_is_seq_rand = cpu_to_le32(o->bs_is_seq_rand); @@ -439,6 +500,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top, top->random_generator = cpu_to_le32(o->random_generator); top->hugepage_size = cpu_to_le32(o->hugepage_size); top->rw_min_bs = __cpu_to_le64(o->rw_min_bs); + top->thinkcycles = cpu_to_le32(o->thinkcycles); top->thinktime = cpu_to_le32(o->thinktime); top->thinktime_spin = cpu_to_le32(o->thinktime_spin); top->thinktime_blocks = cpu_to_le32(o->thinktime_blocks); @@ -464,6 +526,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top, 
top->nice = cpu_to_le32(o->nice); top->ioprio = cpu_to_le32(o->ioprio); top->ioprio_class = cpu_to_le32(o->ioprio_class); + top->ioprio_hint = cpu_to_le32(o->ioprio_hint); top->file_service_type = cpu_to_le32(o->file_service_type); top->group_reporting = cpu_to_le32(o->group_reporting); top->stats = cpu_to_le32(o->stats); @@ -509,6 +572,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top, top->dedupe_percentage = cpu_to_le32(o->dedupe_percentage); top->dedupe_mode = cpu_to_le32(o->dedupe_mode); top->dedupe_working_set_percentage = cpu_to_le32(o->dedupe_working_set_percentage); + top->dedupe_global = cpu_to_le32(o->dedupe_global); top->block_error_hist = cpu_to_le32(o->block_error_hist); top->replay_align = cpu_to_le32(o->replay_align); top->replay_scale = cpu_to_le32(o->replay_scale); @@ -566,20 +630,24 @@ void convert_thread_options_to_net(struct thread_options_pack *top, top->max_latency[i] = __cpu_to_le64(o->max_latency[i]); } - memcpy(top->verify_pattern, o->verify_pattern, MAX_PATTERN_SIZE); - memcpy(top->buffer_pattern, o->buffer_pattern, MAX_PATTERN_SIZE); + memcpy(top->patterns, o->verify_pattern, o->verify_pattern_bytes); + memcpy(&top->patterns[o->verify_pattern_bytes], o->buffer_pattern, + o->buffer_pattern_bytes); top->size = __cpu_to_le64(o->size); top->io_size = __cpu_to_le64(o->io_size); + top->num_range = __cpu_to_le32(o->num_range); top->verify_backlog = __cpu_to_le64(o->verify_backlog); top->start_delay = __cpu_to_le64(o->start_delay); top->start_delay_high = __cpu_to_le64(o->start_delay_high); top->timeout = __cpu_to_le64(o->timeout); top->ramp_time = __cpu_to_le64(o->ramp_time); + top->ramp_size = __cpu_to_le64(o->ramp_size); top->ss_dur = __cpu_to_le64(top->ss_dur); top->ss_ramp_time = __cpu_to_le64(top->ss_ramp_time); top->ss_state = cpu_to_le32(top->ss_state); top->ss_limit.u.i = __cpu_to_le64(fio_double_to_uint64(o->ss_limit.u.f)); + top->ss_check_interval = __cpu_to_le64(top->ss_check_interval); top->zone_range = 
__cpu_to_le64(o->zone_range); top->zone_size = __cpu_to_le64(o->zone_size); top->zone_capacity = __cpu_to_le64(o->zone_capacity); @@ -587,6 +655,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top, top->zone_mode = __cpu_to_le32(o->zone_mode); top->max_open_zones = __cpu_to_le32(o->max_open_zones); top->ignore_zone_limits = cpu_to_le32(o->ignore_zone_limits); + top->recover_zbd_write_error = cpu_to_le32(o->recover_zbd_write_error); top->lockmem = __cpu_to_le64(o->lockmem); top->ddir_seq_add = __cpu_to_le64(o->ddir_seq_add); top->file_size_low = __cpu_to_le64(o->file_size_low); @@ -609,12 +678,18 @@ void convert_thread_options_to_net(struct thread_options_pack *top, for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) top->merge_blktrace_iters[i].u.i = __cpu_to_le64(fio_double_to_uint64(o->merge_blktrace_iters[i].u.f)); + + top->fdp = cpu_to_le32(o->fdp); + top->dp_type = cpu_to_le32(o->dp_type); + top->dp_id_select = cpu_to_le32(o->dp_id_select); + top->dp_nr_ids = cpu_to_le32(o->dp_nr_ids); + for (i = 0; i < o->dp_nr_ids; i++) + top->dp_ids[i] = cpu_to_le16(o->dp_ids[i]); #if 0 uint8_t cpumask[FIO_TOP_STR_MAX]; uint8_t verify_cpumask[FIO_TOP_STR_MAX]; uint8_t log_gz_cpumask[FIO_TOP_STR_MAX]; #endif - } /* @@ -624,18 +699,36 @@ void convert_thread_options_to_net(struct thread_options_pack *top, */ int fio_test_cconv(struct thread_options *__o) { - struct thread_options o; - struct thread_options_pack top1, top2; - - memset(&top1, 0, sizeof(top1)); - memset(&top2, 0, sizeof(top2)); - - convert_thread_options_to_net(&top1, __o); - memset(&o, 0, sizeof(o)); - convert_thread_options_to_cpu(&o, &top1); - convert_thread_options_to_net(&top2, &o); - - free_thread_options_to_cpu(&o); - - return memcmp(&top1, &top2, sizeof(top1)); + struct thread_options o1 = *__o, o2; + struct thread_options_pack *top1, *top2; + size_t top_sz; + int ret; + + o1.verify_pattern_bytes = 61; + o1.verify_pattern = malloc(o1.verify_pattern_bytes); + memset(o1.verify_pattern, 'V', 
o1.verify_pattern_bytes); + o1.buffer_pattern_bytes = 15; + o1.buffer_pattern = malloc(o1.buffer_pattern_bytes); + memset(o1.buffer_pattern, 'B', o1.buffer_pattern_bytes); + + top_sz = thread_options_pack_size(&o1); + top1 = calloc(1, top_sz); + top2 = calloc(1, top_sz); + + convert_thread_options_to_net(top1, &o1); + memset(&o2, 0, sizeof(o2)); + ret = convert_thread_options_to_cpu(&o2, top1, top_sz); + if (ret) + goto out; + + convert_thread_options_to_net(top2, &o2); + ret = memcmp(top1, top2, top_sz); + +out: + free_thread_options_to_cpu(&o2); + free(top2); + free(top1); + free(o1.buffer_pattern); + free(o1.verify_pattern); + return ret; } diff --git a/ci/actions-build.sh b/ci/actions-build.sh new file mode 100755 index 0000000000..861ed3a8d5 --- /dev/null +++ b/ci/actions-build.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +# This script expects to be invoked from the base fio directory. +set -eu + +SCRIPT_DIR=$(dirname "$0") +# shellcheck disable=SC1091 +. "${SCRIPT_DIR}/common.sh" + +main() { + local extra_cflags="-Werror" + local configure_flags=() + + set_ci_target_os + case "${CI_TARGET_BUILD}/${CI_TARGET_OS}" in + android*/*) + export UNAME=Android + if [ -z "${CI_TARGET_ARCH}" ]; then + echo "Error: CI_TARGET_ARCH has not been set" + return 1 + fi + NDK=$PWD/android-ndk-r24/toolchains/llvm/prebuilt/linux-x86_64/bin + export PATH="${NDK}:${PATH}" + if [ "${CI_TARGET_BUILD}" = "android" ]; then + export LIBS="-landroid" + fi + CC=${NDK}/${CI_TARGET_ARCH}-clang + if [ ! 
-e "${CC}" ]; then + echo "Error: could not find ${CC}" + return 1 + fi + ;; + */linux | */ubuntu) + case "${CI_TARGET_ARCH}" in + "x86_64") + configure_flags+=( + "--enable-cuda" + ) + ;; + esac + ;;& + */linux | */ubuntu | */debian | */fedora | */alma | */oracle | */rocky) + case "${CI_TARGET_ARCH}" in + "i686") + extra_cflags="${extra_cflags} -m32" + export LDFLAGS="-m32" + ;; + "x86_64") + configure_flags+=( + "--enable-libiscsi" + "--enable-libnbd" + ) + ;; + esac + ;; + */windows) + configure_flags+=("--disable-native") + case "${CI_TARGET_ARCH}" in + "i686") + configure_flags+=("--build-32bit-win") + ;; + "x86_64") + ;; + esac + if [ "${CI_TARGET_BUILD}" = "windows-msys2-64" ]; then + configure_flags+=("--disable-tls") + fi + ;; + esac + configure_flags+=(--extra-cflags="${extra_cflags}") + + ./configure "${configure_flags[@]}" + make -j "$(nproc 2>/dev/null || sysctl -n hw.logicalcpu)" +# macOS does not have nproc, so we have to use sysctl to obtain the number of +# logical CPUs. +} + +main diff --git a/ci/actions-full-test.sh b/ci/actions-full-test.sh new file mode 100755 index 0000000000..5d6d1ca32d --- /dev/null +++ b/ci/actions-full-test.sh @@ -0,0 +1,59 @@ +#!/bin/bash +# This script expects to be invoked from the base fio directory. +set -eu + +main() { + case "${CI_TARGET_BUILD}" in + android*) + return 0;; + esac + + echo "Running long running tests..." + export PYTHONUNBUFFERED="TRUE" + # We can't load modules so skip 1018 which requires null_blk + skip=( + 6 + 1007 + 1008 + 1018 + ) + args=( + --debug + ) + if [ "${GITHUB_JOB}" == "build-containers" ]; then + # io_uring is disabled in containers + # so skip the io_uring test + skip+=( + 18 + ) + # cmd priority does not work in containers + # so skip the related latency test cases + args+=( + -p + "1010:--skip 15 16 17 18 19 20 21 22" + ) + + fi + + # If we are running a nightly test just run the verify tests. 
Skip the + # verify test script with pull requests and pushes because it takes so + # long. When this workflow is run manually everything will be run. + if [ "${GITHUB_EVENT_NAME}" == "schedule" ]; then + args+=( + --run-only + 1017 + -p + "1017:--complete" + ) + elif [ "${GITHUB_EVENT_NAME}" == "pull_request" ] || [ "${GITHUB_EVENT_NAME}" == "push" ]; then + skip+=( + 1017 + ) + fi + + echo python3 t/run-fio-tests.py --skip "${skip[@]}" "${args[@]}" + python3 t/run-fio-tests.py -c --skip "${skip[@]}" "${args[@]}" + make -C doc html +} + +main diff --git a/ci/actions-install.sh b/ci/actions-install.sh new file mode 100755 index 0000000000..6940e5b9ef --- /dev/null +++ b/ci/actions-install.sh @@ -0,0 +1,221 @@ +#!/usr/bin/env bash +# This script expects to be invoked from the base fio directory. +set -eu + +SCRIPT_DIR=$(dirname "$0") +# shellcheck disable=SC1091 +. "${SCRIPT_DIR}/common.sh" + +_sudo() { + if type -P sudo >/dev/null; then + sudo "$@" + else + "$@" + fi +} + +install_ubuntu() { + local pkgs + + cat < /dev/null +# Skip fsync +force-unsafe-io +# Don't install documentation +path-exclude=/usr/share/man/* +path-exclude=/usr/share/locale/*/LC_MESSAGES/*.mo +path-exclude=/usr/share/doc/* +DPKGCFG + + # Packages available on i686 and x86_64 + pkgs=( + libaio-dev + libcunit1-dev + libcurl4-openssl-dev + libfl-dev + libgnutls28-dev + libnuma-dev + libnfs-dev + valgrind + ) + case "${CI_TARGET_ARCH}" in + "i686") + _sudo dpkg --add-architecture i386 + pkgs=("${pkgs[@]/%/:i386}") + pkgs+=( + gcc-multilib + pkg-config:i386 + zlib1g-dev:i386 + libc6:i386 + libgcc-s1:i386 + ) + ;; + "x86_64") + pkgs+=( + libglusterfs-dev + libgoogle-perftools-dev + libisal-dev + libiscsi-dev + libnbd-dev + libpmem-dev + librbd-dev + libtcmalloc-minimal4 + libibverbs-dev + librdmacm-dev + pkg-config + ) + if apt list --installed | grep -c "libunwind-14-dev"; then + echo "Removing libunwind-14-dev because of conflicts with libunwind-dev" + _sudo apt remove -y libunwind-14-dev + fi + 
if [ "${CI_TARGET_OS}" == "linux" ] || [ "${CI_TARGET_OS}" == "ubuntu" ]; then + # Only for Ubuntu + pkgs+=( + nvidia-cuda-dev + ) + fi + ;; + esac + + # Architecture-independent packages and packages for which we don't + # care about the architecture. + pkgs+=( + python3-scipy + python3-sphinx + python3-statsmodels + sudo + ${EXTRA_PKGS:-} + ) + if [ "${GITHUB_JOB}" == "build-containers" ] || [ "${GITHUB_JOB}" == "qemu-guest" ]; then + pkgs+=( + bison + build-essential + flex + procps + zlib1g-dev + ) + fi + + echo "Updating APT..." + _sudo apt-get -qq update + echo "Installing packages... ${pkgs[@]}" + _sudo apt-get install -o APT::Immediate-Configure=false --no-install-recommends -qq -y "${pkgs[@]}" +} + +# Fedora and related distributions +install_fedora() { + pkgs=( + bison-devel + git + flex-devel + gnutls-devel + gperftools + isa-l-devel + kernel-devel + libaio-devel + libibverbs-devel + libiscsi-devel + libnbd-devel + libnfs-devel + libpmem-devel + libpmem2-devel + librbd-devel + numactl-devel + protobuf-c-devel + python3-scipy + python3-sphinx + sudo + valgrind-devel + ${EXTRA_PKGS:-} + ) + + case "${CI_TARGET_OS}" in + "fedora") + pkgs+=( + cunit-devel + libgfapi-devel + python3-statsmodels + ) + ;; + "rocky" | "alma" | "oracle") + pkgs+=( + CUnit-devel + python-pip + ) + ;;& + "rocky" | "alma") + pkgs+=( + glusterfs-api-devel + ) + ;; + esac + dnf install -y "${pkgs[@]}" +} + +install_rhel_clone() { + dnf install -y epel-release + install_fedora + + # I could not find a python3-statsmodels package in the repos + pip3 install statsmodels +} + +install_oracle() { + dnf config-manager --set-enabled ol9_codeready_builder + install_rhel_clone +} + +install_alma() { + dnf install -y 'dnf-command(config-manager)' + dnf config-manager --set-enabled crb + dnf install -y almalinux-release-devel + install_rhel_clone +} + +install_rocky() { + dnf install -y 'dnf-command(config-manager)' + dnf config-manager --set-enabled crb + dnf config-manager --set-enabled devel 
+ install_rhel_clone +} + +install_debian() { + install_ubuntu +} + +install_linux() { + install_ubuntu +} + +install_macos() { + # Assumes homebrew and python3 are already installed + #echo "Updating homebrew..." + #brew update >/dev/null 2>&1 + echo "Installing packages..." + HOMEBREW_NO_AUTO_UPDATE=1 brew install cunit libnfs sphinx-doc + pip3 install scipy six statsmodels --user --break-system-packages +} + +install_windows() { + pip3 install scipy six statsmodels sphinx +} + +main() { + case "${CI_TARGET_BUILD}" in + android*) + echo "Installing Android NDK..." + wget --quiet https://dl.google.com/android/repository/android-ndk-r24-linux.zip + unzip -q android-ndk-r24-linux.zip + return 0 + ;; + esac + + set_ci_target_os + + install_function="install_${CI_TARGET_OS}" + ${install_function} + + echo "Python3 path: $(type -p python3 2>&1)" + echo "Python3 version: $(python3 -V 2>&1)" +} + +main diff --git a/ci/actions-smoke-test.sh b/ci/actions-smoke-test.sh new file mode 100755 index 0000000000..494462ac38 --- /dev/null +++ b/ci/actions-smoke-test.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# This script expects to be invoked from the base fio directory. +set -eu + +main() { + case "${CI_TARGET_BUILD}" in + android*) + return 0;; + esac + + echo "Running smoke tests..." + make test +} + +main diff --git a/ci/appveyor-install.sh b/ci/appveyor-install.sh deleted file mode 100755 index 3137f39ebe..0000000000 --- a/ci/appveyor-install.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -# The PATH to appropriate distro commands must already be set before invoking -# this script -# The following environment variables must be set: -# PLATFORM={i686,x64} -# DISTRO={cygwin,msys2} -# The following environment can optionally be set: -# CYG_MIRROR= -set -eu - -case "${ARCHITECTURE}" in - "x64") - PACKAGE_ARCH="x86_64" - ;; - "x86") - PACKAGE_ARCH="i686" - ;; -esac - -echo "Installing packages..." 
-case "${DISTRO}" in - "cygwin") - CYG_MIRROR=${CYG_MIRROR:-"http://cygwin.mirror.constant.com"} - setup-x86_64.exe --quiet-mode --no-shortcuts --only-site \ - --site "${CYG_MIRROR}" --packages \ - "mingw64-${PACKAGE_ARCH}-CUnit,mingw64-${PACKAGE_ARCH}-zlib" - ;; - "msys2") - #pacman --noconfirm -Syuu # MSYS2 core update - #pacman --noconfirm -Syuu # MSYS2 normal update - pacman.exe --noconfirm -S \ - mingw-w64-${PACKAGE_ARCH}-clang \ - mingw-w64-${PACKAGE_ARCH}-cunit \ - mingw-w64-${PACKAGE_ARCH}-toolchain \ - mingw-w64-${PACKAGE_ARCH}-lld - pacman.exe -Q # List installed packages - ;; -esac - -python.exe -m pip install scipy six - -echo "Python3 path: $(type -p python3 2>&1)" -echo "Python3 version: $(python3 -V 2>&1)" diff --git a/ci/common.sh b/ci/common.sh new file mode 100644 index 0000000000..3cf6a4169f --- /dev/null +++ b/ci/common.sh @@ -0,0 +1,34 @@ +# shellcheck shell=bash + +function set_ci_target_os { + # Function that exports CI_TARGET_OS to the current OS if it is not already + # set. 
+ + # Don't override CI_TARGET_OS if already set + CI_TARGET_OS=${CI_TARGET_OS:-} + if [[ -z ${CI_TARGET_OS} ]]; then + # Detect operating system + case "${OSTYPE}" in + linux*) + CI_TARGET_OS="linux" + ;; + darwin*) + CI_TARGET_OS="macos" + ;; + cygwin|msys*) + CI_TARGET_OS="windows" + ;; + bsd*) + CI_TARGET_OS="bsd" + ;; + *) + CI_TARGET_OS="" + esac + fi + + # Don't override CI_TARGET_ARCH if already set + CI_TARGET_ARCH=${CI_TARGET_ARCH:-} + if [[ -z ${CI_TARGET_ARCH} ]]; then + CI_TARGET_ARCH="$(uname -m)" + fi +} diff --git a/ci/travis-build.sh b/ci/travis-build.sh deleted file mode 100755 index 923d882d57..0000000000 --- a/ci/travis-build.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -set -eu - -CI_TARGET_ARCH="${BUILD_ARCH:-$TRAVIS_CPU_ARCH}" -EXTRA_CFLAGS="-Werror" -export PYTHONUNBUFFERED=TRUE -CONFIGURE_FLAGS=() - -case "$TRAVIS_OS_NAME" in - "linux") - CONFIGURE_FLAGS+=(--enable-libiscsi) - case "$CI_TARGET_ARCH" in - "x86") - EXTRA_CFLAGS="${EXTRA_CFLAGS} -m32" - export LDFLAGS="-m32" - ;; - "amd64") - CONFIGURE_FLAGS+=(--enable-cuda) - ;; - esac - ;; -esac -CONFIGURE_FLAGS+=(--extra-cflags="${EXTRA_CFLAGS}") - -./configure "${CONFIGURE_FLAGS[@]}" && - make && - make test && - if [[ "$CI_TARGET_ARCH" == "arm64" ]]; then - sudo python3 t/run-fio-tests.py --skip 6 1007 1008 --debug -p 1010:"--skip 15 16 17 18 19 20" - else - sudo python3 t/run-fio-tests.py --skip 6 1007 1008 --debug - fi diff --git a/ci/travis-install-librpma.sh b/ci/travis-install-librpma.sh deleted file mode 100755 index b127f3f569..0000000000 --- a/ci/travis-install-librpma.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -e - -# 11.02.2021 Merge pull request #866 from ldorau/rpma-mmap-memory-for-rpma_mr_reg-in-rpma_flush_apm_new -LIBRPMA_VERSION=fbac593917e98f3f26abf14f4fad5a832b330f5c -ZIP_FILE=rpma.zip - -WORKDIR=$(pwd) - -# install librpma -wget -O $ZIP_FILE https://github.com/pmem/rpma/archive/${LIBRPMA_VERSION}.zip -unzip $ZIP_FILE -mkdir -p rpma-${LIBRPMA_VERSION}/build -cd 
rpma-${LIBRPMA_VERSION}/build -cmake .. -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=/usr \ - -DBUILD_DOC=OFF \ - -DBUILD_EXAMPLES=OFF \ - -DBUILD_TESTS=OFF -make -j$(nproc) -sudo make -j$(nproc) install -cd $WORKDIR -rm -rf $ZIP_FILE rpma-${LIBRPMA_VERSION} diff --git a/ci/travis-install-pmdk.sh b/ci/travis-install-pmdk.sh deleted file mode 100755 index 803438f8f8..0000000000 --- a/ci/travis-install-pmdk.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -e - -# pmdk v1.9.1 release -PMDK_VERSION=1.9.1 - -WORKDIR=$(pwd) - -# -# The '/bin/sh' shell used by PMDK's 'make install' -# does not know the exact localization of clang -# and fails with: -# /bin/sh: 1: clang: not found -# if CC is not set to the full path of clang. -# -export CC=$(which $CC) - -# Install PMDK libraries, because PMDK's libpmem -# is a dependency of the librpma fio engine. -# Install it from a release package -# with already generated documentation, -# in order to not install 'pandoc'. -wget https://github.com/pmem/pmdk/releases/download/${PMDK_VERSION}/pmdk-${PMDK_VERSION}.tar.gz -tar -xzf pmdk-${PMDK_VERSION}.tar.gz -cd pmdk-${PMDK_VERSION} -make -j$(nproc) NDCTL_ENABLE=n -sudo make -j$(nproc) install prefix=/usr NDCTL_ENABLE=n -cd $WORKDIR -rm -rf pmdk-${PMDK_VERSION} diff --git a/ci/travis-install.sh b/ci/travis-install.sh deleted file mode 100755 index 4c4c04c5d6..0000000000 --- a/ci/travis-install.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash -set -eu - -CI_TARGET_ARCH="${BUILD_ARCH:-$TRAVIS_CPU_ARCH}" -case "$TRAVIS_OS_NAME" in - "linux") - # Architecture-dependent packages. 
- pkgs=( - libaio-dev - libcunit1-dev - libfl-dev - libgoogle-perftools-dev - libibverbs-dev - libiscsi-dev - libnuma-dev - librbd-dev - librdmacm-dev - libz-dev - ) - case "$CI_TARGET_ARCH" in - "x86") - pkgs=("${pkgs[@]/%/:i386}") - pkgs+=( - gcc-multilib - pkg-config:i386 - ) - ;; - "amd64") - pkgs+=(nvidia-cuda-dev) - ;; - esac - if [[ $CI_TARGET_ARCH != "x86" ]]; then - pkgs+=(glusterfs-common) - fi - # Architecture-independent packages and packages for which we don't - # care about the architecture. - pkgs+=( - bison - flex - python3 - python3-scipy - python3-six - ) - sudo apt-get -qq update - sudo apt-get install --no-install-recommends -qq -y "${pkgs[@]}" - # librpma is supported on the amd64 (x86_64) architecture for now - if [[ $CI_TARGET_ARCH == "amd64" ]]; then - # install libprotobuf-c-dev required by librpma_gpspm - sudo apt-get install --no-install-recommends -qq -y libprotobuf-c-dev - # PMDK libraries have to be installed, because - # libpmem is a dependency of the librpma fio engine - ci/travis-install-pmdk.sh - # install librpma from sources from GitHub - ci/travis-install-librpma.sh - fi - ;; - "osx") - brew update >/dev/null 2>&1 - brew install cunit - pip3 install scipy six - ;; -esac - -echo "Python3 path: $(type -p python3 2>&1)" -echo "Python3 version: $(python3 -V 2>&1)" diff --git a/client.c b/client.c index 8b230617f7..374a744ab5 100644 --- a/client.c +++ b/client.c @@ -34,7 +34,7 @@ static void handle_start(struct fio_client *client, struct fio_net_cmd *cmd); static void convert_text(struct fio_net_cmd *cmd); static void client_display_thread_status(struct jobs_eta *je); -struct client_ops fio_client_ops = { +struct client_ops const fio_client_ops = { .text = handle_text, .disk_util = handle_du, .thread_status = handle_ts, @@ -61,7 +61,8 @@ int sum_stat_clients; static int sum_stat_nr; static struct buf_output allclients; static struct json_object *root = NULL; -static struct json_object *job_opt_object = NULL; +static struct 
json_object *global_opt_object = NULL; +static struct json_array *global_opt_array = NULL; static struct json_array *clients_array = NULL; static struct json_array *du_array = NULL; @@ -189,8 +190,13 @@ static void fio_client_json_init(void) json_object_add_value_int(root, "timestamp", time_p); json_object_add_value_string(root, "time", time_buf); - job_opt_object = json_create_object(); - json_object_add_value_object(root, "global options", job_opt_object); + if (nr_clients == 1) { + global_opt_object = json_create_object(); + json_object_add_value_object(root, "global options", global_opt_object); + } else { + global_opt_array = json_create_array(); + json_object_add_value_array(root, "global options", global_opt_array); + } clients_array = json_create_array(); json_object_add_value_array(root, "client_stats", clients_array); du_array = json_create_array(); @@ -215,7 +221,8 @@ static void fio_client_json_fini(void) json_free_object(root); root = NULL; - job_opt_object = NULL; + global_opt_object = NULL; + global_opt_array = NULL; clients_array = NULL; du_array = NULL; } @@ -284,9 +291,10 @@ static int fio_client_dec_jobs_eta(struct client_eta *eta, client_eta_op eta_fn) static void fio_drain_client_text(struct fio_client *client) { do { - struct fio_net_cmd *cmd; + struct fio_net_cmd *cmd = NULL; - cmd = fio_net_recv_cmd(client->fd, false); + if (fio_server_poll_fd(client->fd, POLLIN, 0)) + cmd = fio_net_recv_cmd(client->fd, false); if (!cmd) break; @@ -368,8 +376,7 @@ static struct fio_client *get_new_client(void) { struct fio_client *client; - client = malloc(sizeof(*client)); - memset(client, 0, sizeof(*client)); + client = calloc(1, sizeof(*client)); INIT_FLIST_HEAD(&client->list); INIT_FLIST_HEAD(&client->hash_list); @@ -446,7 +453,7 @@ int fio_client_add_ini_file(void *cookie, const char *ini_file, bool remote) return 0; } -int fio_client_add(struct client_ops *ops, const char *hostname, void **cookie) +int fio_client_add(struct client_ops const *ops, const 
char *hostname, void **cookie) { struct fio_client *existing = *cookie; struct fio_client *client; @@ -792,8 +799,7 @@ static int __fio_client_send_remote_ini(struct fio_client *client, dprint(FD_NET, "send remote ini %s to %s\n", filename, client->hostname); p_size = sizeof(*pdu) + strlen(filename) + 1; - pdu = malloc(p_size); - memset(pdu, 0, p_size); + pdu = calloc(1, p_size); pdu->name_len = strlen(filename); strcpy((char *) pdu->file, filename); pdu->client_type = cpu_to_le16((uint16_t) client->type); @@ -921,13 +927,20 @@ int fio_clients_send_ini(const char *filename) int fio_client_update_options(struct fio_client *client, struct thread_options *o, uint64_t *tag) { - struct cmd_add_job_pdu pdu; + size_t cmd_sz = offsetof(struct cmd_add_job_pdu, top) + + thread_options_pack_size(o); + struct cmd_add_job_pdu *pdu; + int ret; - pdu.thread_number = cpu_to_le32(client->thread_number); - pdu.groupid = cpu_to_le32(client->groupid); - convert_thread_options_to_net(&pdu.top, o); + pdu = malloc(cmd_sz); + pdu->thread_number = cpu_to_le32(client->thread_number); + pdu->groupid = cpu_to_le32(client->groupid); + convert_thread_options_to_net(&pdu->top, o); - return fio_net_send_cmd(client->fd, FIO_NET_CMD_UPDATE_JOB, &pdu, sizeof(pdu), tag, &client->cmd_list); + ret = fio_net_send_cmd(client->fd, FIO_NET_CMD_UPDATE_JOB, pdu, + cmd_sz, tag, &client->cmd_list); + free(pdu); + return ret; } static void convert_io_stat(struct io_stat *dst, struct io_stat *src) @@ -950,9 +963,12 @@ static void convert_ts(struct thread_stat *dst, struct thread_stat *src) dst->error = le32_to_cpu(src->error); dst->thread_number = le32_to_cpu(src->thread_number); dst->groupid = le32_to_cpu(src->groupid); + dst->job_start = le64_to_cpu(src->job_start); dst->pid = le32_to_cpu(src->pid); dst->members = le32_to_cpu(src->members); dst->unified_rw_rep = le32_to_cpu(src->unified_rw_rep); + dst->ioprio = le32_to_cpu(src->ioprio); + dst->disable_prio_stat = le32_to_cpu(src->disable_prio_stat); for (i = 
0; i < DDIR_RWDIR_CNT; i++) { convert_io_stat(&dst->clat_stat[i], &src->clat_stat[i]); @@ -1011,7 +1027,6 @@ static void convert_ts(struct thread_stat *dst, struct thread_stat *src) dst->total_submit = le64_to_cpu(src->total_submit); dst->total_complete = le64_to_cpu(src->total_complete); - dst->nr_zone_resets = le64_to_cpu(src->nr_zone_resets); for (i = 0; i < DDIR_RWDIR_CNT; i++) { dst->io_bytes[i] = le64_to_cpu(src->io_bytes[i]); @@ -1027,6 +1042,9 @@ static void convert_ts(struct thread_stat *dst, struct thread_stat *src) dst->sig_figs = le32_to_cpu(src->sig_figs); + dst->nr_zone_resets = le64_to_cpu(src->nr_zone_resets); + dst->count_zone_resets = le16_to_cpu(src->count_zone_resets); + dst->latency_depth = le32_to_cpu(src->latency_depth); dst->latency_target = le64_to_cpu(src->latency_target); dst->latency_window = le64_to_cpu(src->latency_window); @@ -1035,14 +1053,6 @@ static void convert_ts(struct thread_stat *dst, struct thread_stat *src) dst->nr_block_infos = le64_to_cpu(src->nr_block_infos); for (i = 0; i < dst->nr_block_infos; i++) dst->block_infos[i] = le32_to_cpu(src->block_infos[i]); - for (i = 0; i < DDIR_RWDIR_CNT; i++) { - for (j = 0; j < FIO_IO_U_PLAT_NR; j++) { - dst->io_u_plat_high_prio[i][j] = le64_to_cpu(src->io_u_plat_high_prio[i][j]); - dst->io_u_plat_low_prio[i][j] = le64_to_cpu(src->io_u_plat_low_prio[i][j]); - } - convert_io_stat(&dst->clat_high_prio_stat[i], &src->clat_high_prio_stat[i]); - convert_io_stat(&dst->clat_low_prio_stat[i], &src->clat_low_prio_stat[i]); - } dst->ss_dur = le64_to_cpu(src->ss_dur); dst->ss_state = le32_to_cpu(src->ss_state); @@ -1052,10 +1062,24 @@ static void convert_ts(struct thread_stat *dst, struct thread_stat *src) dst->ss_deviation.u.f = fio_uint64_to_double(le64_to_cpu(src->ss_deviation.u.i)); dst->ss_criterion.u.f = fio_uint64_to_double(le64_to_cpu(src->ss_criterion.u.i)); + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + dst->nr_clat_prio[i] = le32_to_cpu(src->nr_clat_prio[i]); + for (j = 0; j < 
dst->nr_clat_prio[i]; j++) { + for (k = 0; k < FIO_IO_U_PLAT_NR; k++) + dst->clat_prio[i][j].io_u_plat[k] = + le64_to_cpu(src->clat_prio[i][j].io_u_plat[k]); + convert_io_stat(&dst->clat_prio[i][j].clat_stat, + &src->clat_prio[i][j].clat_stat); + dst->clat_prio[i][j].ioprio = + le32_to_cpu(dst->clat_prio[i][j].ioprio); + } + } + if (dst->ss_state & FIO_SS_DATA) { for (i = 0; i < dst->ss_dur; i++ ) { dst->ss_iops_data[i] = le64_to_cpu(src->ss_iops_data[i]); dst->ss_bw_data[i] = le64_to_cpu(src->ss_bw_data[i]); + dst->ss_lat_data[i] = le64_to_cpu(src->ss_lat_data[i]); } } @@ -1102,18 +1126,30 @@ static void handle_ts(struct fio_client *client, struct fio_net_cmd *cmd) opt_list = &client->opt_lists[p->ts.thread_number - 1]; tsobj = show_thread_status(&p->ts, &p->rs, opt_list, &client->buf); - client->did_stat = true; if (tsobj) { json_object_add_client_info(tsobj, client); json_array_add_value_object(clients_array, tsobj); + if (!client->did_stat && client->global_opts) + json_array_add_value_object(global_opt_array, client->global_opts); } + client->did_stat = true; if (sum_stat_clients <= 1) return; - sum_thread_stats(&client_ts, &p->ts, sum_stat_nr == 1); + sum_thread_stats(&client_ts, &p->ts); sum_group_stats(&client_gs, &p->rs); + if (!client_ts.members) { + /* Arbitrarily use the percentile toggles and percentile list + * from the first thread_stat that comes our way */ + client_ts.slat_percentiles = p->ts.slat_percentiles; + client_ts.clat_percentiles = p->ts.clat_percentiles; + client_ts.lat_percentiles = p->ts.lat_percentiles; + + for (int i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) + client_ts.percentile_list[i] = p->ts.percentile_list[i]; + } client_ts.members++; client_ts.thread_number = p->ts.thread_number; client_ts.groupid = p->ts.groupid; @@ -1147,12 +1183,31 @@ static void handle_job_opt(struct fio_client *client, struct fio_net_cmd *cmd) pdu->groupid = le32_to_cpu(pdu->groupid); if (pdu->global) { - if (!job_opt_object) + struct json_object *global_opts; 
+ + if (!global_opt_object && !global_opt_array) return; - json_object_add_value_string(job_opt_object, + /* + * If we have only one server connection, add it to the single + * global option dictionary. When we have connections to + * multiple servers, add the global option to the + * server-specific dictionary. + */ + if (global_opt_object) { + global_opts = global_opt_object; + } else { + if (!client->global_opts) { + client->global_opts = json_create_object(); + json_object_add_client_info(client->global_opts, client); + } + global_opts = client->global_opts; + } + + json_object_add_value_string(global_opts, (const char *)pdu->name, (const char *)pdu->value); + return; } else if (client->opt_lists) { struct flist_head *opt_list = &client->opt_lists[pdu->groupid]; struct print_option *p; @@ -1374,8 +1429,8 @@ static void handle_eta(struct fio_client *client, struct fio_net_cmd *cmd) static void client_flush_hist_samples(FILE *f, int hist_coarseness, void *samples, uint64_t sample_size) { - struct io_sample *s; - int log_offset; + struct io_sample *s, *s_tmp; + bool log_offset, log_issue_time; uint64_t i, j, nr_samples; struct io_u_plat_entry *entry; uint64_t *io_u_plat; @@ -1385,15 +1440,17 @@ static void client_flush_hist_samples(FILE *f, int hist_coarseness, void *sample if (!sample_size) return; - s = __get_sample(samples, 0, 0); + s = __get_sample(samples, 0, 0, 0); log_offset = (s->__ddir & LOG_OFFSET_SAMPLE_BIT) != 0; + log_issue_time = (s->__ddir & LOG_ISSUE_TIME_SAMPLE_BIT) != 0; - nr_samples = sample_size / __log_entry_sz(log_offset); + nr_samples = sample_size / __log_entry_sz(log_offset, log_issue_time); for (i = 0; i < nr_samples; i++) { - s = (struct io_sample *)((char *)__get_sample(samples, log_offset, i) + - i * sizeof(struct io_u_plat_entry)); + s_tmp = __get_sample(samples, log_offset, log_issue_time, i); + s = (struct io_sample *)((char *)s_tmp + + i * sizeof(struct io_u_plat_entry)); entry = s->data.plat_entry; io_u_plat = entry->io_u_plat; @@ 
-1438,10 +1495,13 @@ static int fio_client_handle_iolog(struct fio_client *client, if (store_direct) { ssize_t wrote; size_t sz; - int fd; + int fd, flags; - fd = open((const char *) log_pathname, - O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (pdu->per_job_logs) + flags = O_WRONLY | O_CREAT | O_TRUNC; + else + flags = O_WRONLY | O_CREAT | O_APPEND; + fd = open((const char *) log_pathname, flags, 0644); if (fd < 0) { log_err("fio: open log %s: %s\n", log_pathname, strerror(errno)); @@ -1462,7 +1522,13 @@ static int fio_client_handle_iolog(struct fio_client *client, ret = 0; } else { FILE *f; - f = fopen((const char *) log_pathname, "w"); + const char *mode; + + if (pdu->per_job_logs) + mode = "w"; + else + mode = "a"; + f = fopen((const char *) log_pathname, mode); if (!f) { log_err("fio: fopen log %s : %s\n", log_pathname, strerror(errno)); @@ -1572,6 +1638,7 @@ static struct cmd_iolog_pdu *convert_iolog_gz(struct fio_net_cmd *cmd, uint64_t nr_samples; size_t total; char *p; + size_t log_entry_size; stream.zalloc = Z_NULL; stream.zfree = Z_NULL; @@ -1587,11 +1654,13 @@ static struct cmd_iolog_pdu *convert_iolog_gz(struct fio_net_cmd *cmd, */ nr_samples = le64_to_cpu(pdu->nr_samples); + log_entry_size = __log_entry_sz(le32_to_cpu(pdu->log_offset), + le32_to_cpu(pdu->log_issue_time)); if (pdu->log_type == IO_LOG_TYPE_HIST) - total = nr_samples * (__log_entry_sz(le32_to_cpu(pdu->log_offset)) + - sizeof(struct io_u_plat_entry)); + total = nr_samples * (log_entry_size + + sizeof(struct io_u_plat_entry)); else - total = nr_samples * __log_entry_sz(le32_to_cpu(pdu->log_offset)); + total = nr_samples * log_entry_size; ret = malloc(total + sizeof(*pdu)); ret->nr_samples = nr_samples; @@ -1680,7 +1749,9 @@ static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *cmd, ret->compressed = le32_to_cpu(ret->compressed); ret->log_offset = le32_to_cpu(ret->log_offset); ret->log_prio = le32_to_cpu(ret->log_prio); + ret->log_issue_time = le32_to_cpu(ret->log_issue_time); 
ret->log_hist_coarseness = le32_to_cpu(ret->log_hist_coarseness); + ret->per_job_logs = le32_to_cpu(ret->per_job_logs); if (*store_direct) return ret; @@ -1689,21 +1760,26 @@ static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *cmd, for (i = 0; i < ret->nr_samples; i++) { struct io_sample *s; - s = __get_sample(samples, ret->log_offset, i); + s = __get_sample(samples, ret->log_offset, ret->log_issue_time, i); if (ret->log_type == IO_LOG_TYPE_HIST) s = (struct io_sample *)((char *)s + sizeof(struct io_u_plat_entry) * i); s->time = le64_to_cpu(s->time); - s->data.val = le64_to_cpu(s->data.val); + if (ret->log_type != IO_LOG_TYPE_HIST) { + s->data.val.val0 = le64_to_cpu(s->data.val.val0); + s->data.val.val1 = le64_to_cpu(s->data.val.val1); + } s->__ddir = __le32_to_cpu(s->__ddir); s->bs = le64_to_cpu(s->bs); s->priority = le16_to_cpu(s->priority); - if (ret->log_offset) { - struct io_sample_offset *so = (void *) s; + if (ret->log_offset) + s->aux[IOS_AUX_OFFSET_INDEX] = + le64_to_cpu(s->aux[IOS_AUX_OFFSET_INDEX]); - so->offset = le64_to_cpu(so->offset); - } + if (ret->log_issue_time) + s->aux[IOS_AUX_ISSUE_TIME_INDEX] = + le64_to_cpu(s->aux[IOS_AUX_ISSUE_TIME_INDEX]); if (ret->log_type == IO_LOG_TYPE_HIST) { s->data.plat_entry = (struct io_u_plat_entry *)(((char *)s) + sizeof(*s)); @@ -1758,9 +1834,8 @@ static int fio_send_file(struct fio_client *client, struct cmd_sendfile *pdu, int fio_handle_client(struct fio_client *client) { - struct client_ops *ops = client->ops; + struct client_ops const *ops = client->ops; struct fio_net_cmd *cmd; - int size; dprint(FD_NET, "client: handle %s\n", client->hostname); @@ -1794,14 +1869,29 @@ int fio_handle_client(struct fio_client *client) } case FIO_NET_CMD_TS: { struct cmd_ts_pdu *p = (struct cmd_ts_pdu *) cmd->payload; + uint64_t offset; + int i; + + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + if (le32_to_cpu(p->ts.nr_clat_prio[i])) { + offset = le64_to_cpu(p->ts.clat_prio_offset[i]); + p->ts.clat_prio[i] = + (struct 
clat_prio_stat *)((char *)p + offset); + } + } dprint(FD_NET, "client: ts->ss_state = %u\n", (unsigned int) le32_to_cpu(p->ts.ss_state)); if (le32_to_cpu(p->ts.ss_state) & FIO_SS_DATA) { dprint(FD_NET, "client: received steadystate ring buffers\n"); - size = le64_to_cpu(p->ts.ss_dur); - p->ts.ss_iops_data = (uint64_t *) ((struct cmd_ts_pdu *)cmd->payload + 1); - p->ts.ss_bw_data = p->ts.ss_iops_data + size; + offset = le64_to_cpu(p->ts.ss_iops_data_offset); + p->ts.ss_iops_data = (uint64_t *)((char *)p + offset); + + offset = le64_to_cpu(p->ts.ss_bw_data_offset); + p->ts.ss_bw_data = (uint64_t *)((char *)p + offset); + + offset = le64_to_cpu(p->ts.ss_lat_data_offset); + p->ts.ss_lat_data = (uint64_t *)((char *)p + offset); } convert_ts(&p->ts, &p->ts); @@ -1932,7 +2022,7 @@ int fio_clients_send_trigger(const char *cmd) return 0; } -static void request_client_etas(struct client_ops *ops) +static void request_client_etas(struct client_ops const *ops) { struct fio_client *client; struct flist_head *entry; @@ -2064,7 +2154,7 @@ static int fio_check_clients_timed_out(void) return ret; } -int fio_handle_clients(struct client_ops *ops) +int fio_handle_clients(struct client_ops const *ops) { struct pollfd *pfds; int i, ret = 0, retval = 0; @@ -2152,6 +2242,7 @@ int fio_handle_clients(struct client_ops *ops) fio_client_json_fini(); + free_clat_prio_stats(&client_ts); free(pfds); return retval || error_clients; } diff --git a/client.h b/client.h index 8033325ed0..11fc661a15 100644 --- a/client.h +++ b/client.h @@ -42,6 +42,7 @@ struct fio_client { char *name; struct flist_head *opt_lists; + struct json_object *global_opts; int state; @@ -69,7 +70,7 @@ struct fio_client { uint16_t argc; char **argv; - struct client_ops *ops; + struct client_ops const *ops; void *client_data; struct client_file *files; @@ -84,7 +85,7 @@ typedef void (client_eta_op)(struct jobs_eta *je); typedef void (client_timed_out_op)(struct fio_client *); typedef void (client_jobs_eta_op)(struct fio_client 
*client, struct jobs_eta *je); -extern struct client_ops fio_client_ops; +extern struct client_ops const fio_client_ops; struct client_ops { client_cmd_op *text; @@ -128,8 +129,8 @@ extern int fio_start_client(struct fio_client *); extern int fio_start_all_clients(void); extern int fio_clients_send_ini(const char *); extern int fio_client_send_ini(struct fio_client *, const char *, bool); -extern int fio_handle_clients(struct client_ops *); -extern int fio_client_add(struct client_ops *, const char *, void **); +extern int fio_handle_clients(struct client_ops const*); +extern int fio_client_add(struct client_ops const*, const char *, void **); extern struct fio_client *fio_client_add_explicit(struct client_ops *, const char *, int, int); extern void fio_client_add_cmd_option(void *, const char *); extern int fio_client_add_ini_file(void *, const char *, bool); diff --git a/compiler/compiler.h b/compiler/compiler.h index 44fa87b90c..fefadeaa89 100644 --- a/compiler/compiler.h +++ b/compiler/compiler.h @@ -67,13 +67,14 @@ #endif #ifndef __has_attribute +#define __has_attribute(x) __GCC4_has_attribute_##x #define __GCC4_has_attribute___fallthrough__ 0 #endif #if __has_attribute(__fallthrough__) -#define fallthrough __attribute__((__fallthrough__)) +#define fio_fallthrough __attribute__((__fallthrough__)) #else -#define fallthrough do {} while (0) /* fallthrough */ +#define fio_fallthrough do {} while (0) /* fallthrough */ #endif #endif diff --git a/configure b/configure index 84ccce040e..9927976e70 100755 --- a/configure +++ b/configure @@ -116,6 +116,10 @@ has() { type "$1" >/dev/null 2>&1 } +num() { + echo "$1" | grep -E -q "^[0-9]+$" +} + check_define() { cat > $TMPC < $TMPC <> $config_host_mak echo "#define $1" >> $config_host_h @@ -137,11 +155,11 @@ output_sym() { check_min_lib_version() { _feature=$3 - if "${cross_prefix}"pkg-config --atleast-version="$2" "$1" > /dev/null 2>&1; then + if pkg-config --atleast-version="$2" "$1" > /dev/null 2>&1; then return 0 fi : 
"${_feature:=${1}}" - if "${cross_prefix}"pkg-config --version > /dev/null 2>&1; then + if pkg-config --version > /dev/null 2>&1; then if test "$(eval echo \"\$$_feature\")" = "yes" ; then feature_not_found "$_feature" "$1 >= $2" fi @@ -159,10 +177,10 @@ show_help="no" exit_val=0 gfio_check="no" libhdfs="no" -pmemblk="no" devdax="no" pmem="no" cuda="no" +cuda13="no" libcufile="no" disable_lex="" disable_pmem="no" @@ -170,9 +188,14 @@ disable_native="no" march_set="no" libiscsi="no" libnbd="no" -libnfs="no" +libnfs="" +xnvme="" +isal="" +isal64="" +libblkio="" libzbc="" dfs="" +seed_buckets="" dynamic_engines="no" prefix=/usr/local @@ -192,6 +215,8 @@ for opt do ;; --extra-cflags=*) CFLAGS="$CFLAGS $optarg" ;; + --extra-ldflags=*) LDFLAGS="$LDFLAGS $optarg" + ;; --build-32bit-win) build_32bit_win="yes" ;; --target-win-ver=*) target_win_ver="$optarg" @@ -240,14 +265,30 @@ for opt do ;; --disable-libzbc) libzbc="no" ;; + --disable-xnvme) xnvme="no" + ;; + --disable-isal) isal="no" + ;; + --disable-isal64) isal64="no" + ;; + --disable-libblkio) libblkio="no" + ;; --disable-tcmalloc) disable_tcmalloc="yes" ;; - --disable-nfs) disable_nfs="yes" + --disable-libnfs) libnfs="no" + ;; + --enable-libnfs) libnfs="yes" ;; --dynamic-libengines) dynamic_engines="yes" ;; --disable-dfs) dfs="no" ;; + --enable-asan) asan="yes" + ;; + --seed-buckets=*) seed_buckets="$optarg" + ;; + --disable-tls) tls_check="no" + ;; --help) show_help="yes" ;; @@ -274,10 +315,10 @@ if test "$show_help" = "yes" ; then echo "--disable-rados Disable Rados support even if found" echo "--disable-rbd Disable Rados Block Device even if found" echo "--disable-http Disable HTTP support even if found" - echo "--disable-nfs Disable userspace NFS support even if found" echo "--disable-gfapi Disable gfapi" echo "--enable-libhdfs Enable hdfs support" echo "--enable-libnfs Enable nfs support" + echo "--disable-libnfs Disable nfs support" echo "--disable-lex Disable use of lex/yacc for math" echo "--disable-pmem 
Disable pmem based engines even if found" echo "--enable-lex Enable use of lex/yacc for math" @@ -289,15 +330,22 @@ if test "$show_help" = "yes" ; then echo "--with-ime= Install path for DDN's Infinite Memory Engine" echo "--enable-libiscsi Enable iscsi support" echo "--enable-libnbd Enable libnbd (NBD engine) support" + echo "--disable-xnvme Disable xnvme support even if found" + echo "--disable-isal Disable isal support even if found" + echo "--disable-isal64 Disable isal CRC64 support even if found" + echo "--disable-libblkio Disable libblkio support even if found" echo "--disable-libzbc Disable libzbc even if found" - echo "--disable-tcmalloc Disable tcmalloc support" - echo "--dynamic-libengines Lib-based ioengines as dynamic libraries" - echo "--disable-dfs Disable DAOS File System support even if found" + echo "--disable-tcmalloc Disable tcmalloc support" + echo "--dynamic-libengines Lib-based ioengines as dynamic libraries" + echo "--disable-dfs Disable DAOS File System support even if found" + echo "--enable-asan Enable address sanitizer" + echo "--seed-buckets= Number of seed buckets for the refill-buffer" + echo "--disable-tls Disable __thread local storage" exit $exit_val fi cross_prefix=${cross_prefix-${CROSS_COMPILE}} -# Preferred compiler (can be overriden later after we know the platform): +# Preferred compiler (can be overridden later after we know the platform): # ${CC} (if set) # ${cross_prefix}gcc (if cross-prefix specified) # gcc if available @@ -325,6 +373,8 @@ elif check_define __sun__ ; then CFLAGS="$CFLAGS -D_REENTRANT" elif check_define _WIN32 ; then targetos='CYGWIN' +elif check_define __QNX__ ; then + targetos='QNX' else targetos=`uname -s` fi @@ -419,12 +469,13 @@ CYGWIN*) build_static="yes" rusage_thread="yes" fdatasync="yes" - clock_gettime="yes" # clock_monotonic probe has dependency on this - clock_monotonic="yes" sched_idle="yes" pthread_condattr_setclock="no" pthread_affinity="no" ;; +QNX) + LIBS="-lsocket" + ;; esac # Now we know 
the target platform we can have another guess at the preferred @@ -476,13 +527,23 @@ elif check_define __aarch64__ ; then cpu="aarch64" elif check_define __hppa__ ; then cpu="hppa" +elif check_define __loongarch64 ; then + cpu="loongarch64" +elif check_define __riscv ; then + if check_val __riscv_xlen 32 ; then + cpu="riscv32" + elif check_val __riscv_xlen 64 ; then + cpu="riscv64" + elif check_val __riscv_xlen 128 ; then + cpu="riscv128" + fi else cpu=`uname -m` fi # Normalise host CPU name and set ARCH. case "$cpu" in - ia64|ppc|ppc64|s390|s390x|sparc64) + ia64|ppc|ppc64|s390|s390x|sparc64|loongarch64|riscv64) cpu="$cpu" ;; i386|i486|i586|i686|i86pc|BePC) @@ -645,6 +706,25 @@ if compile_prog "" "-lz" "zlib" ; then fi print_config "zlib" "$zlib" +########################################## +# fcntl(F_FULLFSYNC) support +if test "$fcntl_sync" != "yes" ; then + fcntl_sync="no" +fi +cat > $TMPC << EOF +#include +#include + +int main(int argc, char **argv) +{ + return fcntl(0, F_FULLFSYNC); +} +EOF +if compile_prog "" "" "fcntl(F_FULLFSYNC)" ; then + fcntl_sync="yes" +fi +print_config "fcntl(F_FULLFSYNC)" "$fcntl_sync" + ########################################## # linux-aio probe if test "$libaio" != "yes" ; then @@ -798,7 +878,8 @@ cat > $TMPC < /* pthread_sigmask() */ int main(void) { - return pthread_sigmask(0, NULL, NULL); + sigset_t sigmask; + return pthread_sigmask(0, NULL, &sigmask); } EOF if compile_prog "" "$LIBS" "pthread_sigmask" ; then @@ -929,8 +1010,9 @@ int main(int argc, char **argv) return pd != NULL; } EOF -if test "$disable_rdma" != "yes" && compile_prog "" "-libverbs" "libverbs" ; then +if test "$disable_rdma" != "yes" && compile_prog "" "-libverbs -lnl-3 -lnl-route-3" "libverbs" ; then libverbs="yes" + LIBS="-libverbs -lnl-3 -lnl-route-3 $LIBS" fi print_config "libverbs" "$libverbs" @@ -948,54 +1030,12 @@ int main(int argc, char **argv) return 0; } EOF -if test "$disable_rdma" != "yes" && compile_prog "" "-lrdmacm" "rdma"; then +if test 
"$disable_rdma" != "yes" && compile_prog "" "-lrdmacm -lnl-3 -lnl-route-3" "rdma"; then rdmacm="yes" + LIBS="-libverbs -lnl-3 -lnl-route-3 $LIBS" fi print_config "rdmacm" "$rdmacm" -########################################## -# librpma probe -if test "$librpma" != "yes" ; then - librpma="no" -fi -cat > $TMPC << EOF -#include -#include -int main(int argc, char **argv) -{ - enum rpma_conn_event event = RPMA_CONN_REJECTED; - (void) event; /* unused */ - rpma_log_set_threshold(RPMA_LOG_THRESHOLD, RPMA_LOG_LEVEL_INFO); - return 0; -} -EOF -if test "$disable_rdma" != "yes" && compile_prog "" "-lrpma" "rpma"; then - librpma="yes" -fi -print_config "librpma" "$librpma" - -########################################## -# libprotobuf-c probe -if test "$libprotobuf_c" != "yes" ; then - libprotobuf_c="no" -fi -cat > $TMPC << EOF -#include -#include -#if !defined(PROTOBUF_C_VERSION_NUMBER) -# error PROTOBUF_C_VERSION_NUMBER is not defined! -#endif -int main(int argc, char **argv) -{ - (void)protobuf_c_message_check(NULL); - return 0; -} -EOF -if compile_prog "" "-lprotobuf-c" "protobuf_c"; then - libprotobuf_c="yes" -fi -print_config "libprotobuf_c" "$libprotobuf_c" - ########################################## # asprintf() and vasprintf() probes if test "$have_asprintf" != "yes" ; then @@ -1020,6 +1060,7 @@ if test "$have_vasprintf" != "yes" ; then fi cat > $TMPC << EOF #include +#include int main(int argc, char **argv) { @@ -1103,7 +1144,8 @@ cat > $TMPC << EOF #include int main(int argc, char **argv) { - cpu_set_t mask; + cpu_set_t mask = { }; + return sched_setaffinity(0, sizeof(mask), &mask); } EOF @@ -1114,7 +1156,8 @@ else #include int main(int argc, char **argv) { - cpu_set_t mask; + cpu_set_t mask = { }; + return sched_setaffinity(0, &mask); } EOF @@ -1135,7 +1178,9 @@ cat > $TMPC << EOF #include int main(int argc, char **argv) { - return clock_gettime(0, NULL); + struct timespec ts; + + return clock_gettime(0, &ts); } EOF if compile_prog "" "" "clock_gettime"; then @@ 
-1143,6 +1188,8 @@ if compile_prog "" "" "clock_gettime"; then elif compile_prog "" "-lrt" "clock_gettime"; then clock_gettime="yes" LIBS="-lrt $LIBS" +else + fatal "clock_gettime is not supported" fi print_config "clock_gettime" "$clock_gettime" @@ -1157,7 +1204,9 @@ if test "$clock_gettime" = "yes" ; then #include int main(int argc, char **argv) { - return clock_gettime(CLOCK_MONOTONIC, NULL); + struct timespec ts; + + return clock_gettime(CLOCK_MONOTONIC, &ts); } EOF if compile_prog "" "$LIBS" "clock monotonic"; then @@ -1166,26 +1215,6 @@ EOF fi print_config "CLOCK_MONOTONIC" "$clock_monotonic" -########################################## -# clockid_t probe -if test "$clockid_t" != "yes" ; then - clockid_t="no" -fi -cat > $TMPC << EOF -#include -#include -int main(int argc, char **argv) -{ - volatile clockid_t cid; - memset((void*)&cid, 0, sizeof(cid)); - return 0; -} -EOF -if compile_prog "" "$LIBS" "clockid_t"; then - clockid_t="yes" -fi -print_config "clockid_t" "$clockid_t" - ########################################## # gettimeofday() probe if test "$gettimeofday" != "yes" ; then @@ -1298,6 +1327,23 @@ if compile_prog "" "" "sync_file_range"; then fi print_config "sync_file_range" "$sync_file_range" +########################################## +# ASharedMemory_create() probe +if test "$ASharedMemory_create" != "yes" ; then + ASharedMemory_create="no" +fi +cat > $TMPC << EOF +#include +int main(int argc, char **argv) +{ + return ASharedMemory_create("", 0); +} +EOF +if compile_prog "" "" "ASharedMemory_create"; then + ASharedMemory_create="yes" +fi +print_config "ASharedMemory_create" "$ASharedMemory_create" + ########################################## # ext4 move extent probe if test "$ext4_me" != "yes" ; then @@ -1505,7 +1551,8 @@ print_config "socklen_t" "$socklen_t" if test "$tls_thread" != "yes" ; then tls_thread="no" fi -cat > $TMPC << EOF +if test "$tls_check" != "no"; then + cat > $TMPC << EOF #include static __thread int ret; int main(int argc, char 
**argv) @@ -1516,6 +1563,7 @@ EOF if compile_prog "" "" "__thread"; then tls_thread="yes" fi +fi print_config "__thread" "$tls_thread" ########################################## @@ -1536,14 +1584,14 @@ int main(void) return GTK_CHECK_VERSION(2, 18, 0) ? 0 : 1; /* 0 on success */ } EOF -GTK_CFLAGS=$(${cross_prefix}pkg-config --cflags gtk+-2.0 gthread-2.0) +GTK_CFLAGS=$(pkg-config --cflags gtk+-2.0 gthread-2.0) ORG_LDFLAGS=$LDFLAGS LDFLAGS=$(echo $LDFLAGS | sed s/"-static"//g) if test "$?" != "0" ; then echo "configure: gtk and gthread not found" exit 1 fi -GTK_LIBS=$(${cross_prefix}pkg-config --libs gtk+-2.0 gthread-2.0) +GTK_LIBS=$(pkg-config --libs gtk+-2.0 gthread-2.0) if test "$?" != "0" ; then echo "configure: gtk and gthread not found" exit 1 @@ -1596,7 +1644,8 @@ cat > $TMPC << EOF #include int main(int argc, char **argv) { - struct sched_param p; + struct sched_param p = { }; + return sched_setscheduler(0, SCHED_IDLE, &p); } EOF @@ -1632,6 +1681,25 @@ elif compile_prog "" "-lws2_32" "TCP_NODELAY"; then fi print_config "TCP_NODELAY" "$tcp_nodelay" +########################################## +# Check whether we have vsock +if test "$vsock" != "yes" ; then + vsock="no" +fi +cat > $TMPC << EOF +#include +#include +#include +int main(int argc, char **argv) +{ + return socket(AF_VSOCK, SOCK_STREAM, 0); +} +EOF +if compile_prog "" "" "vsock"; then + vsock="yes" +fi +print_config "vsock" "$vsock" + ########################################## # Check whether we have SO_SNDBUF if test "$window_size" != "yes" ; then @@ -1718,7 +1786,9 @@ cat > $TMPC << EOF #include int main(int argc, char **argv) { - return pwritev(0, NULL, 1, 0) + preadv(0, NULL, 1, 0); + struct iovec iov[1] = { }; + + return pwritev(0, iov, 1, 0) + preadv(0, iov, 1, 0); } EOF if compile_prog "" "" "pwritev"; then @@ -1736,7 +1806,9 @@ cat > $TMPC << EOF #include int main(int argc, char **argv) { - return pwritev2(0, NULL, 1, 0, 0) + preadv2(0, NULL, 1, 0, 0); + struct iovec iov[1] = { }; + + return 
pwritev2(0, iov, 1, 0, 0) + preadv2(0, iov, 1, 0, 0); } EOF if compile_prog "" "" "pwritev2"; then @@ -1762,14 +1834,14 @@ cat > $TMPC << EOF #include int main(int argc, char **argv) { - struct addrinfo hints; - struct in6_addr addr; + struct addrinfo hints = { }; + struct in6_addr addr = in6addr_any; int ret; ret = getaddrinfo(NULL, NULL, &hints, NULL); freeaddrinfo(NULL); - printf("%s\n", gai_strerror(ret)); - addr = in6addr_any; + printf("%s %d\n", gai_strerror(ret), addr.s6_addr[0]); + return 0; } EOF @@ -1939,6 +2011,28 @@ fi print_config "rbd_invalidate_cache" "$rbd_inval" fi +########################################## +# check for rbd_encryption_load() +if test "$rbd_encryption" != "yes" ; then + rbd_encryption="no" +fi +if test "$rbd" = "yes" ; then +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + rbd_image_t image; + + return rbd_encryption_load(image, RBD_ENCRYPTION_FORMAT_LUKS1, 0, 0); +} +EOF +if compile_prog "" "-lrbd -lrados" "rbd_encryption"; then + rbd_encryption="yes" +fi + print_config "rbd_encryption_load" "$rbd_encryption" +fi + ########################################## # Check whether we have setvbuf if test "$setvbuf" != "yes" ; then @@ -2080,7 +2174,7 @@ if test "$libhdfs" = "yes" ; then hdfs_conf_error=1 fi if test "$FIO_LIBHDFS_INCLUDE" = "" ; then - echo "configure: FIO_LIBHDFS_INCLUDE should be defined to libhdfs inlude path" + echo "configure: FIO_LIBHDFS_INCLUDE should be defined to libhdfs include path" hdfs_conf_error=1 fi if test "$FIO_LIBHDFS_LIB" = "" ; then @@ -2130,9 +2224,7 @@ cat > $TMPC << EOF #include int main(int argc, char **argv) { - int rc; - rc = pmem_is_pmem(NULL, 0); - return 0; + return pmem_is_pmem(NULL, 0); } EOF if compile_prog "" "-lpmem" "libpmem"; then @@ -2151,7 +2243,7 @@ if test "$libpmem" = "yes"; then #include int main(int argc, char **argv) { - pmem_memcpy(NULL, NULL, NULL, NULL); + pmem_memcpy(NULL, NULL, 0, 0); return 0; } EOF @@ -2162,26 +2254,24 @@ fi print_config "libpmem1_5" 
"$libpmem1_5" ########################################## -# Check whether we have libpmemblk -# libpmem is a prerequisite -if test "$libpmemblk" != "yes" ; then - libpmemblk="no" +# Check whether we have libpmem2 +if test "$libpmem2" != "yes" ; then + libpmem2="no" fi -if test "$libpmem" = "yes"; then - cat > $TMPC << EOF -#include +cat > $TMPC << EOF +#include int main(int argc, char **argv) { - PMEMblkpool *pbp; - pbp = pmemblk_open("", 0); + struct pmem2_config *cfg; + pmem2_config_new(&cfg); + pmem2_config_delete(&cfg); return 0; } EOF - if compile_prog "" "-lpmemblk" "libpmemblk"; then - libpmemblk="yes" - fi +if compile_prog "" "-lpmem2" "libpmem2"; then + libpmem2="yes" fi -print_config "libpmemblk" "$libpmemblk" +print_config "libpmem2" "$libpmem2" # Choose libpmem-based ioengines if test "$libpmem" = "yes" && test "$disable_pmem" = "no"; then @@ -2189,15 +2279,8 @@ if test "$libpmem" = "yes" && test "$disable_pmem" = "no"; then if test "$libpmem1_5" = "yes"; then pmem="yes" fi - if test "$libpmemblk" = "yes"; then - pmemblk="yes" - fi fi -########################################## -# Report whether pmemblk engine is enabled -print_config "PMDK pmemblk engine" "$pmemblk" - ########################################## # Report whether dev-dax engine is enabled print_config "PMDK dev-dax engine" "$devdax" @@ -2284,15 +2367,31 @@ print_config "DAOS File System (dfs) Engine" "$dfs" ########################################## # Check if we have libnfs (for userspace nfs support). -if test "$disable_nfs" != "yes"; then +if test "$libnfs" != "no" ; then if $(pkg-config libnfs > /dev/null 2>&1); then libnfs="yes" libnfs_cflags=$(pkg-config --cflags libnfs) libnfs_libs=$(pkg-config --libs libnfs) + + # libnfs >= 6.0.0 requires gnutls for TLS support + libnfs_version=$(pkg-config --modversion libnfs 2>/dev/null) + if test -n "$libnfs_version" ; then + libnfs_major=$(echo $libnfs_version | cut -d. 
-f1) + if test "$libnfs_major" -ge 6 ; then + if $(pkg-config gnutls > /dev/null 2>&1); then + libnfs_cflags="$libnfs_cflags $(pkg-config --cflags gnutls)" + libnfs_libs="$libnfs_libs $(pkg-config --libs gnutls)" + else + feature_not_found "gnutls" "gnutls (required for libnfs >= 6.0.0)" + libnfs="no" + fi + fi + fi else if test "$libnfs" = "yes" ; then - echo "libnfs" "Install libnfs" + feature_not_found "libnfs" "libnfs" fi + libnfs="no" fi fi print_config "NFS engine" "$libnfs" @@ -2367,7 +2466,7 @@ int main(int argc, char **argv) FILE *mtab = setmntent(NULL, "r"); struct mntent *mnt = getmntent(mtab); endmntent(mtab); - return 0; + return mnt != NULL; } EOF if compile_prog "" "" "getmntent"; then @@ -2500,7 +2599,7 @@ if compile_prog "" "" "valgrind_dev"; then fi print_config "Valgrind headers" "$valgrind_dev" -if test "$targetos" = "Linux" ; then +if test "$targetos" = "Linux" || test "$targetos" = "Android"; then ########################################## # probe if test "$linux_blkzoned" != "yes" ; then @@ -2548,6 +2647,10 @@ int main(int argc, char **argv) } EOF if test "$libzbc" != "no" ; then + if [ -e /usr/include/libzbc/libzbc ]; then + # SUSE Linux. + CFLAGS="$CFLAGS -I/usr/include/libzbc" + fi if compile_prog "" "-lzbc" "libzbc"; then libzbc="yes" if ! 
check_min_lib_version libzbc 5; then @@ -2562,22 +2665,115 @@ if test "$libzbc" != "no" ; then fi print_config "libzbc engine" "$libzbc" +if test "$targetos" = "Linux" || test "$targetos" = "Android"; then ########################################## -# check march=armv8-a+crc+crypto -if test "$march_armv8_a_crc_crypto" != "yes" ; then - march_armv8_a_crc_crypto="no" +# Check NVME_URING_CMD support +cat > $TMPC << EOF +#include +int main(void) +{ + return sizeof(struct nvme_uring_cmd); +} +EOF +if compile_prog "" "" "nvme uring cmd"; then + output_sym "CONFIG_NVME_URING_CMD" + nvme_uring_cmd="yes" +else + nvme_uring_cmd="no" +fi +print_config "NVMe uring command support" "$nvme_uring_cmd" fi + +########################################## +# Check if we have xnvme +if test "$xnvme" != "no" ; then + if check_min_lib_version xnvme 0.7.4; then + xnvme="yes" + xnvme_cflags=$(pkg-config --cflags xnvme) + xnvme_libs=$(pkg-config --libs xnvme) + else + xnvme="no" + fi +fi +print_config "xnvme engine" "$xnvme" + +if test "$targetos" = "Linux" ; then +########################################## +# Check ISA-L support +cat > $TMPC << EOF +#include +#include +int main(void) +{ + return crc16_t10dif(0, NULL, 4096); +} +EOF +if test "$isal" != "no" ; then + if compile_prog "" "-lisal" "ISAL"; then + isal="yes" + LIBS="-lisal $LIBS" + else + isal="no" + isal64="no" + fi +fi +print_config "isal" "$isal" +fi + +########################################## +# Check ISA-L CRC64 Rocksoft support +cat > $TMPC << EOF +#include +#include +int main(void) +{ + return crc64_rocksoft_refl(0, NULL, 4096); +} +EOF +if test "$isal64" != "no" ; then + if compile_prog "" "-lisal" "ISAL"; then + isal64="yes" + LIBS="-lisal $LIBS" + else + isal64="no" + fi +fi +print_config "isal CRC64" "$isal64" + +########################################## +# Check if we have libblkio +if test "$libblkio" != "no" ; then + if check_min_lib_version blkio 1.0.0; then + libblkio="yes" + libblkio_cflags=$(pkg-config 
--cflags blkio) + libblkio_libs=$(pkg-config --libs blkio) + else + if test "$libblkio" = "yes" ; then + feature_not_found "libblkio" "libblkio-dev or libblkio-devel" + fi + libblkio="no" + fi +fi +print_config "libblkio engine" "$libblkio" + +########################################## +# check march=armv8-a+crc+crypto +march_armv8_a_crc_crypto="no" if test "$cpu" = "arm64" ; then cat > $TMPC < #include #include +#endif int main(void) { /* Can we also do a runtime probe? */ #if __linux__ return getauxval(AT_HWCAP); +#elif defined(__APPLE__) + return 0; #else # error "Don't know how to do runtime probe for ARM CRC32c" #endif @@ -2624,9 +2820,9 @@ int main(int argc, char* argv[]) { return 0; } EOF - if compile_prog "" "-lcuda -lcudart -lcufile" "libcufile"; then + if compile_prog "" "-lcuda -lcudart -lcufile -ldl" "libcufile"; then libcufile="yes" - LIBS="-lcuda -lcudart -lcufile $LIBS" + LIBS="-lcuda -lcudart -lcufile -ldl $LIBS" else if test "$libcufile" = "yes" ; then feature_not_found "libcufile" "" @@ -2636,6 +2832,26 @@ EOF fi print_config "libcufile" "$libcufile" +########################################## +# cuda 13 probe +if test "$cuda" != "no" || test "$libcufile" != "no"; then +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + cuCtxCreate(NULL, NULL, 0, NULL); + return 0; +} +EOF + if compile_prog "" "-lcuda" "cuda13"; then + cuda13="yes" + else + cuda13="no" + fi + print_config "cuda>=13" "$cuda13" +fi + ########################################## # check for cc -march=native build_native="no" @@ -2705,6 +2921,22 @@ if compile_prog "-Wimplicit-fallthrough=2" "" "-Wimplicit-fallthrough=2"; then fi print_config "-Wimplicit-fallthrough=2" "$fallthrough" +########################################## +# check if the compiler has -Wno-stringop-truncation +no_stringop="no" +cat > $TMPC << EOF +#include + +int main(int argc, char **argv) +{ + return printf("%s\n", argv[0]); +} +EOF +if compile_prog "-Wno-stringop-truncation -Werror" ""
"no_stringop"; then + no_stringop="yes" +fi +print_config "-Wno-stringop-truncation" "$no_stringop" + ########################################## # check for MADV_HUGEPAGE support if test "$thp" != "yes" ; then @@ -2843,6 +3075,9 @@ if test "$bigendian" = "yes" ; then else output_sym "CONFIG_LITTLE_ENDIAN" fi +if test "$targetos" = "Linux" ; then + output_sym "CONFIG_LINUX" +fi if test "$zlib" = "yes" ; then output_sym "CONFIG_ZLIB" fi @@ -2897,6 +3132,9 @@ fi if test "$sync_file_range" = "yes" ; then output_sym "CONFIG_SYNC_FILE_RANGE" fi +if test "$ASharedMemory_create" = "yes" ; then + output_sym "CONFIG_ASHAREDMEMORY_CREATE" +fi if test "$sfaa" = "yes" ; then output_sym "CONFIG_SFAA" fi @@ -2909,24 +3147,9 @@ fi if test "$libverbs" = "yes" -a "$rdmacm" = "yes" ; then output_sym "CONFIG_RDMA" fi -# librpma is supported on the 'x86_64' architecture for now -if test "$cpu" = "x86_64" -a "$libverbs" = "yes" -a "$rdmacm" = "yes" \ - -a "$librpma" = "yes" -a "$libpmem" = "yes" ; then - output_sym "CONFIG_LIBRPMA_APM" -fi -if test "$cpu" = "x86_64" -a "$libverbs" = "yes" -a "$rdmacm" = "yes" \ - -a "$librpma" = "yes" -a "$libpmem" = "yes" -a "$libprotobuf_c" = "yes" ; then - output_sym "CONFIG_LIBRPMA_GPSPM" -fi -if test "$clock_gettime" = "yes" ; then - output_sym "CONFIG_CLOCK_GETTIME" -fi if test "$clock_monotonic" = "yes" ; then output_sym "CONFIG_CLOCK_MONOTONIC" fi -if test "$clockid_t" = "yes"; then - output_sym "CONFIG_CLOCKID_T" -fi if test "$gettimeofday" = "yes" ; then output_sym "CONFIG_GETTIMEOFDAY" fi @@ -3005,6 +3228,9 @@ fi if test "$ipv6" = "yes" ; then output_sym "CONFIG_IPV6" fi +if test "$vsock" = "yes"; then + output_sym "CONFIG_VSOCK" +fi if test "$http" = "yes" ; then output_sym "CONFIG_HTTP" fi @@ -3020,6 +3246,9 @@ fi if test "$rbd_inval" = "yes" ; then output_sym "CONFIG_RBD_INVAL" fi +if test "$rbd_encryption" = "yes" ; then + output_sym "CONFIG_RBD_ENCRYPTION" +fi if test "$setvbuf" = "yes" ; then output_sym "CONFIG_SETVBUF" fi @@ -3050,15 
+3279,15 @@ fi if test "$mtd" = "yes" ; then output_sym "CONFIG_MTD" fi -if test "$pmemblk" = "yes" ; then - output_sym "CONFIG_PMEMBLK" -fi if test "$devdax" = "yes" ; then output_sym "CONFIG_LINUX_DEVDAX" fi if test "$pmem" = "yes" ; then output_sym "CONFIG_LIBPMEM" fi +if test "$libpmem2" = "yes" ; then + output_sym "CONFIG_LIBPMEM2_INSTALLED" +fi if test "$libime" = "yes" ; then output_sym "CONFIG_IME" fi @@ -3104,7 +3333,7 @@ if test "$libzbc" = "yes" ; then output_sym "CONFIG_LIBZBC" fi if test "$zlib" = "no" ; then - echo "Consider installing zlib-dev (zlib-devel, some fio features depend on it." + echo "Consider installing zlib1g-dev (zlib-devel) as some fio features depend on it." if test "$build_static" = "yes"; then echo "Note that some distros have separate packages for static libraries." fi @@ -3118,12 +3347,12 @@ fi if test "$libcufile" = "yes" ; then output_sym "CONFIG_LIBCUFILE" fi +if test "$cuda13" = "yes" ; then + output_sym "CONFIG_CUDA13" +fi if test "$dfs" = "yes" ; then output_sym "CONFIG_DFS" fi -if test "$libnfs" = "yes" ; then - output_sym "CONFIG_NFS" -fi if test "$march_set" = "no" && test "$build_native" = "yes" ; then output_sym "CONFIG_BUILD_NATIVE" fi @@ -3148,6 +3377,9 @@ fi if test "$fallthrough" = "yes"; then CFLAGS="$CFLAGS -Wimplicit-fallthrough" fi +if test "$no_stringop" = "yes"; then + output_sym "CONFIG_HAVE_NO_STRINGOP" +fi if test "$thp" = "yes" ; then output_sym "CONFIG_HAVE_THP" fi @@ -3165,17 +3397,38 @@ if test "$libnbd" = "yes" ; then fi if test "$libnfs" = "yes" ; then output_sym "CONFIG_LIBNFS" - echo "CONFIG_LIBNFS=m" >> $config_host_mak echo "LIBNFS_CFLAGS=$libnfs_cflags" >> $config_host_mak echo "LIBNFS_LIBS=$libnfs_libs" >> $config_host_mak fi +if test "$xnvme" = "yes" ; then + output_sym "CONFIG_LIBXNVME" + echo "LIBXNVME_CFLAGS=$xnvme_cflags" >> $config_host_mak + echo "LIBXNVME_LIBS=$xnvme_libs" >> $config_host_mak +fi +if test "$isal" = "yes" ; then + output_sym "CONFIG_LIBISAL" +fi +if test "$isal64" = 
"yes" ; then + output_sym "CONFIG_LIBISAL64" +fi +if test "$libblkio" = "yes" ; then + output_sym "CONFIG_LIBBLKIO" + echo "LIBBLKIO_CFLAGS=$libblkio_cflags" >> $config_host_mak + echo "LIBBLKIO_LIBS=$libblkio_libs" >> $config_host_mak +fi if test "$dynamic_engines" = "yes" ; then output_sym "CONFIG_DYNAMIC_ENGINES" fi if test "$pdb" = yes; then output_sym "CONFIG_PDB" fi - +if test "$fcntl_sync" = "yes" ; then + output_sym "CONFIG_FCNTL_SYNC" +fi +if test "$asan" = "yes"; then + CFLAGS="$CFLAGS -fsanitize=address" + LDFLAGS="$LDFLAGS -fsanitize=address" +fi print_config "Lib-based ioengines dynamic" "$dynamic_engines" cat > $TMPC << EOF int main(int argc, char **argv) @@ -3195,6 +3448,15 @@ if test "$disable_tcmalloc" != "yes"; then fi fi print_config "TCMalloc support" "$tcmalloc" +if ! num "$seed_buckets"; then + seed_buckets=4 +elif test "$seed_buckets" -lt 2; then + seed_buckets=2 +elif test "$seed_buckets" -gt 16; then + seed_buckets=16 +fi +echo "#define CONFIG_SEED_BUCKETS $seed_buckets" >> $config_host_h +print_config "seed_buckets" "$seed_buckets" echo "LIBS+=$LIBS" >> $config_host_mak echo "GFIO_LIBS+=$GFIO_LIBS" >> $config_host_mak diff --git a/crc/crc-t10dif.h b/crc/crc-t10dif.h new file mode 100644 index 0000000000..fde4ccd7e3 --- /dev/null +++ b/crc/crc-t10dif.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __CRC_T10DIF_H +#define __CRC_T10DIF_H + +extern unsigned short fio_crc_t10dif(unsigned short crc, + const unsigned char *buffer, + unsigned int len); + +#endif diff --git a/crc/crc64.c b/crc/crc64.c index bf24a97bf2..aae54f40e1 100644 --- a/crc/crc64.c +++ b/crc/crc64.c @@ -1,4 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * crc64nvme[256] table is from the generator polynomial specified by NVMe + * 64b CRC and is defined as, + * + * x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 + x^47 + + * x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 + x^26 + x^23 + + * x^22 + x^19 + x^16 + x^13 + 
x^12 + x^10 + x^9 + x^6 + x^4 + x^3 + 1 + * + */ + #include "crc64.h" +#include "crc64table.h" /* * poly 0x95AC9329AC4BC9B5ULL and init 0xFFFFFFFFFFFFFFFFULL @@ -102,3 +114,36 @@ unsigned long long fio_crc64(const unsigned char *buffer, unsigned long length) return crc; } +#ifdef CONFIG_LIBISAL64 +#include + +unsigned long long fio_crc64_nvme(unsigned long long crc, const void *p, + unsigned int len) +{ + return crc64_rocksoft_refl(crc, p, len); +} + +#else + +/** + * fio_crc64_nvme - Calculate bitwise NVMe CRC64 + * @crc: seed value for computation. 0 for a new CRC calculation, or the + * previous crc64 value if computing incrementally. + * @p: pointer to buffer over which CRC64 is run + * @len: length of buffer @p + */ +unsigned long long fio_crc64_nvme(unsigned long long crc, const void *p, + unsigned int len) +{ + const unsigned char *_p = p; + unsigned int i; + + crc = ~crc; + + for (i = 0; i < len; i++) + crc = (crc >> 8) ^ crc64nvmetable[(crc & 0xff) ^ *_p++]; + + return ~crc; +} + +#endif diff --git a/crc/crc64.h b/crc/crc64.h index fe9cad3e26..e586edee2d 100644 --- a/crc/crc64.h +++ b/crc/crc64.h @@ -3,4 +3,7 @@ unsigned long long fio_crc64(const unsigned char *, unsigned long); +unsigned long long fio_crc64_nvme(unsigned long long crc, const void *p, + unsigned int len); + #endif diff --git a/crc/crc64table.h b/crc/crc64table.h new file mode 100644 index 0000000000..04224d4fc6 --- /dev/null +++ b/crc/crc64table.h @@ -0,0 +1,130 @@ +static const unsigned long long crc64nvmetable[256] = { + 0x0000000000000000ULL, 0x7f6ef0c830358979ULL, + 0xfedde190606b12f2ULL, 0x81b31158505e9b8bULL, + 0xc962e5739841b68fULL, 0xb60c15bba8743ff6ULL, + 0x37bf04e3f82aa47dULL, 0x48d1f42bc81f2d04ULL, + 0xa61cecb46814fe75ULL, 0xd9721c7c5821770cULL, + 0x58c10d24087fec87ULL, 0x27affdec384a65feULL, + 0x6f7e09c7f05548faULL, 0x1010f90fc060c183ULL, + 0x91a3e857903e5a08ULL, 0xeecd189fa00bd371ULL, + 0x78e0ff3b88be6f81ULL, 0x078e0ff3b88be6f8ULL, + 0x863d1eabe8d57d73ULL, 
0xf953ee63d8e0f40aULL, + 0xb1821a4810ffd90eULL, 0xceecea8020ca5077ULL, + 0x4f5ffbd87094cbfcULL, 0x30310b1040a14285ULL, + 0xdefc138fe0aa91f4ULL, 0xa192e347d09f188dULL, + 0x2021f21f80c18306ULL, 0x5f4f02d7b0f40a7fULL, + 0x179ef6fc78eb277bULL, 0x68f0063448deae02ULL, + 0xe943176c18803589ULL, 0x962de7a428b5bcf0ULL, + 0xf1c1fe77117cdf02ULL, 0x8eaf0ebf2149567bULL, + 0x0f1c1fe77117cdf0ULL, 0x7072ef2f41224489ULL, + 0x38a31b04893d698dULL, 0x47cdebccb908e0f4ULL, + 0xc67efa94e9567b7fULL, 0xb9100a5cd963f206ULL, + 0x57dd12c379682177ULL, 0x28b3e20b495da80eULL, + 0xa900f35319033385ULL, 0xd66e039b2936bafcULL, + 0x9ebff7b0e12997f8ULL, 0xe1d10778d11c1e81ULL, + 0x606216208142850aULL, 0x1f0ce6e8b1770c73ULL, + 0x8921014c99c2b083ULL, 0xf64ff184a9f739faULL, + 0x77fce0dcf9a9a271ULL, 0x08921014c99c2b08ULL, + 0x4043e43f0183060cULL, 0x3f2d14f731b68f75ULL, + 0xbe9e05af61e814feULL, 0xc1f0f56751dd9d87ULL, + 0x2f3dedf8f1d64ef6ULL, 0x50531d30c1e3c78fULL, + 0xd1e00c6891bd5c04ULL, 0xae8efca0a188d57dULL, + 0xe65f088b6997f879ULL, 0x9931f84359a27100ULL, + 0x1882e91b09fcea8bULL, 0x67ec19d339c963f2ULL, + 0xd75adabd7a6e2d6fULL, 0xa8342a754a5ba416ULL, + 0x29873b2d1a053f9dULL, 0x56e9cbe52a30b6e4ULL, + 0x1e383fcee22f9be0ULL, 0x6156cf06d21a1299ULL, + 0xe0e5de5e82448912ULL, 0x9f8b2e96b271006bULL, + 0x71463609127ad31aULL, 0x0e28c6c1224f5a63ULL, + 0x8f9bd7997211c1e8ULL, 0xf0f5275142244891ULL, + 0xb824d37a8a3b6595ULL, 0xc74a23b2ba0eececULL, + 0x46f932eaea507767ULL, 0x3997c222da65fe1eULL, + 0xafba2586f2d042eeULL, 0xd0d4d54ec2e5cb97ULL, + 0x5167c41692bb501cULL, 0x2e0934dea28ed965ULL, + 0x66d8c0f56a91f461ULL, 0x19b6303d5aa47d18ULL, + 0x980521650afae693ULL, 0xe76bd1ad3acf6feaULL, + 0x09a6c9329ac4bc9bULL, 0x76c839faaaf135e2ULL, + 0xf77b28a2faafae69ULL, 0x8815d86aca9a2710ULL, + 0xc0c42c4102850a14ULL, 0xbfaadc8932b0836dULL, + 0x3e19cdd162ee18e6ULL, 0x41773d1952db919fULL, + 0x269b24ca6b12f26dULL, 0x59f5d4025b277b14ULL, + 0xd846c55a0b79e09fULL, 0xa72835923b4c69e6ULL, + 0xeff9c1b9f35344e2ULL, 0x90973171c366cd9bULL, + 
0x1124202993385610ULL, 0x6e4ad0e1a30ddf69ULL, + 0x8087c87e03060c18ULL, 0xffe938b633338561ULL, + 0x7e5a29ee636d1eeaULL, 0x0134d92653589793ULL, + 0x49e52d0d9b47ba97ULL, 0x368bddc5ab7233eeULL, + 0xb738cc9dfb2ca865ULL, 0xc8563c55cb19211cULL, + 0x5e7bdbf1e3ac9decULL, 0x21152b39d3991495ULL, + 0xa0a63a6183c78f1eULL, 0xdfc8caa9b3f20667ULL, + 0x97193e827bed2b63ULL, 0xe877ce4a4bd8a21aULL, + 0x69c4df121b863991ULL, 0x16aa2fda2bb3b0e8ULL, + 0xf86737458bb86399ULL, 0x8709c78dbb8deae0ULL, + 0x06bad6d5ebd3716bULL, 0x79d4261ddbe6f812ULL, + 0x3105d23613f9d516ULL, 0x4e6b22fe23cc5c6fULL, + 0xcfd833a67392c7e4ULL, 0xb0b6c36e43a74e9dULL, + 0x9a6c9329ac4bc9b5ULL, 0xe50263e19c7e40ccULL, + 0x64b172b9cc20db47ULL, 0x1bdf8271fc15523eULL, + 0x530e765a340a7f3aULL, 0x2c608692043ff643ULL, + 0xadd397ca54616dc8ULL, 0xd2bd67026454e4b1ULL, + 0x3c707f9dc45f37c0ULL, 0x431e8f55f46abeb9ULL, + 0xc2ad9e0da4342532ULL, 0xbdc36ec59401ac4bULL, + 0xf5129aee5c1e814fULL, 0x8a7c6a266c2b0836ULL, + 0x0bcf7b7e3c7593bdULL, 0x74a18bb60c401ac4ULL, + 0xe28c6c1224f5a634ULL, 0x9de29cda14c02f4dULL, + 0x1c518d82449eb4c6ULL, 0x633f7d4a74ab3dbfULL, + 0x2bee8961bcb410bbULL, 0x548079a98c8199c2ULL, + 0xd53368f1dcdf0249ULL, 0xaa5d9839ecea8b30ULL, + 0x449080a64ce15841ULL, 0x3bfe706e7cd4d138ULL, + 0xba4d61362c8a4ab3ULL, 0xc52391fe1cbfc3caULL, + 0x8df265d5d4a0eeceULL, 0xf29c951de49567b7ULL, + 0x732f8445b4cbfc3cULL, 0x0c41748d84fe7545ULL, + 0x6bad6d5ebd3716b7ULL, 0x14c39d968d029fceULL, + 0x95708ccedd5c0445ULL, 0xea1e7c06ed698d3cULL, + 0xa2cf882d2576a038ULL, 0xdda178e515432941ULL, + 0x5c1269bd451db2caULL, 0x237c997575283bb3ULL, + 0xcdb181ead523e8c2ULL, 0xb2df7122e51661bbULL, + 0x336c607ab548fa30ULL, 0x4c0290b2857d7349ULL, + 0x04d364994d625e4dULL, 0x7bbd94517d57d734ULL, + 0xfa0e85092d094cbfULL, 0x856075c11d3cc5c6ULL, + 0x134d926535897936ULL, 0x6c2362ad05bcf04fULL, + 0xed9073f555e26bc4ULL, 0x92fe833d65d7e2bdULL, + 0xda2f7716adc8cfb9ULL, 0xa54187de9dfd46c0ULL, + 0x24f29686cda3dd4bULL, 0x5b9c664efd965432ULL, + 0xb5517ed15d9d8743ULL, 
0xca3f8e196da80e3aULL, + 0x4b8c9f413df695b1ULL, 0x34e26f890dc31cc8ULL, + 0x7c339ba2c5dc31ccULL, 0x035d6b6af5e9b8b5ULL, + 0x82ee7a32a5b7233eULL, 0xfd808afa9582aa47ULL, + 0x4d364994d625e4daULL, 0x3258b95ce6106da3ULL, + 0xb3eba804b64ef628ULL, 0xcc8558cc867b7f51ULL, + 0x8454ace74e645255ULL, 0xfb3a5c2f7e51db2cULL, + 0x7a894d772e0f40a7ULL, 0x05e7bdbf1e3ac9deULL, + 0xeb2aa520be311aafULL, 0x944455e88e0493d6ULL, + 0x15f744b0de5a085dULL, 0x6a99b478ee6f8124ULL, + 0x224840532670ac20ULL, 0x5d26b09b16452559ULL, + 0xdc95a1c3461bbed2ULL, 0xa3fb510b762e37abULL, + 0x35d6b6af5e9b8b5bULL, 0x4ab846676eae0222ULL, + 0xcb0b573f3ef099a9ULL, 0xb465a7f70ec510d0ULL, + 0xfcb453dcc6da3dd4ULL, 0x83daa314f6efb4adULL, + 0x0269b24ca6b12f26ULL, 0x7d0742849684a65fULL, + 0x93ca5a1b368f752eULL, 0xeca4aad306bafc57ULL, + 0x6d17bb8b56e467dcULL, 0x12794b4366d1eea5ULL, + 0x5aa8bf68aecec3a1ULL, 0x25c64fa09efb4ad8ULL, + 0xa4755ef8cea5d153ULL, 0xdb1bae30fe90582aULL, + 0xbcf7b7e3c7593bd8ULL, 0xc399472bf76cb2a1ULL, + 0x422a5673a732292aULL, 0x3d44a6bb9707a053ULL, + 0x759552905f188d57ULL, 0x0afba2586f2d042eULL, + 0x8b48b3003f739fa5ULL, 0xf42643c80f4616dcULL, + 0x1aeb5b57af4dc5adULL, 0x6585ab9f9f784cd4ULL, + 0xe436bac7cf26d75fULL, 0x9b584a0fff135e26ULL, + 0xd389be24370c7322ULL, 0xace74eec0739fa5bULL, + 0x2d545fb4576761d0ULL, 0x523aaf7c6752e8a9ULL, + 0xc41748d84fe75459ULL, 0xbb79b8107fd2dd20ULL, + 0x3acaa9482f8c46abULL, 0x45a459801fb9cfd2ULL, + 0x0d75adabd7a6e2d6ULL, 0x721b5d63e7936bafULL, + 0xf3a84c3bb7cdf024ULL, 0x8cc6bcf387f8795dULL, + 0x620ba46c27f3aa2cULL, 0x1d6554a417c62355ULL, + 0x9cd645fc4798b8deULL, 0xe3b8b53477ad31a7ULL, + 0xab69411fbfb21ca3ULL, 0xd407b1d78f8795daULL, + 0x55b4a08fdfd90e51ULL, 0x2ada5047efec8728ULL, +}; diff --git a/crc/crct10dif_common.c b/crc/crct10dif_common.c new file mode 100644 index 0000000000..1763b1c66b --- /dev/null +++ b/crc/crct10dif_common.c @@ -0,0 +1,91 @@ +/* + * Cryptographic API. 
+ * + * T10 Data Integrity Field CRC16 Crypto Transform + * + * Copyright (c) 2007 Oracle Corporation. All rights reserved. + * Written by Martin K. Petersen + * Copyright (C) 2013 Intel Corporation + * Author: Tim Chen + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifdef CONFIG_LIBISAL +#include + +extern unsigned short fio_crc_t10dif(unsigned short crc, + const unsigned char *buffer, + unsigned int len) +{ + return crc16_t10dif(crc, buffer, len); +} + +#else +#include "crc-t10dif.h" + +/* Table generated using the following polynomium: + * x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1 + * gt: 0x8bb7 + */ +static const unsigned short t10_dif_crc_table[256] = { + 0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B, + 0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6, + 0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6, + 0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B, + 0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1, + 0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C, + 0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C, + 0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781, + 0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 
0x7DE8, + 0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255, + 0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925, + 0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698, + 0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472, + 0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF, + 0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF, + 0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02, + 0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA, + 0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067, + 0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17, + 0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA, + 0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640, + 0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD, + 0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D, + 0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30, + 0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759, + 0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4, + 0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394, + 0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29, + 0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3, + 0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E, + 0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E, + 0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3 +}; + +extern unsigned short fio_crc_t10dif(unsigned short crc, + const unsigned char *buffer, + unsigned int len) +{ + unsigned int i; + + for (i = 0 ; i < len ; i++) + crc = (crc << 8) ^ t10_dif_crc_table[((crc >> 8) ^ buffer[i]) & 0xff]; + + return crc; +} + +#endif diff --git a/crc/murmur3.c b/crc/murmur3.c index ba408a9e80..08660bc8cb 100644 --- a/crc/murmur3.c +++ b/crc/murmur3.c @@ -30,10 +30,10 @@ static uint32_t murmur3_tail(const uint8_t *data, const int 
nblocks, switch (len & 3) { case 3: k1 ^= tail[2] << 16; - fallthrough; + fio_fallthrough; case 2: k1 ^= tail[1] << 8; - fallthrough; + fio_fallthrough; case 1: k1 ^= tail[0]; k1 *= c1; diff --git a/crc/sha512.c b/crc/sha512.c index f599cdcc82..78e64ba2ef 100644 --- a/crc/sha512.c +++ b/crc/sha512.c @@ -195,3 +195,60 @@ void fio_sha512_update(struct fio_sha512_ctx *sctx, const uint8_t *data, /* erase our data */ memset(sctx->W, 0, sizeof(sctx->W)); } + +void fio_sha512_final(struct fio_sha512_ctx *sctx) +{ + uint8_t *hash = sctx->buf; + static uint8_t padding[128] = { 0x80, }; + unsigned int index, pad_len; + uint8_t bits[128]; + uint64_t t2; + uint32_t t; + int i, j; + + index = pad_len = t = i = j = 0; + t2 = 0; + + /* Save number of bits */ + t = sctx->count[0]; + bits[15] = t; t>>=8; + bits[14] = t; t>>=8; + bits[13] = t; t>>=8; + bits[12] = t; + t = sctx->count[1]; + bits[11] = t; t>>=8; + bits[10] = t; t>>=8; + bits[9 ] = t; t>>=8; + bits[8 ] = t; + t = sctx->count[2]; + bits[7 ] = t; t>>=8; + bits[6 ] = t; t>>=8; + bits[5 ] = t; t>>=8; + bits[4 ] = t; + t = sctx->count[3]; + bits[3 ] = t; t>>=8; + bits[2 ] = t; t>>=8; + bits[1 ] = t; t>>=8; + bits[0 ] = t; + + /* Pad out to 112 mod 128. */ + index = (sctx->count[0] >> 3) & 0x7f; + pad_len = (index < 112) ? 
(112 - index) : ((128+112) - index); + fio_sha512_update(sctx, padding, pad_len); + + /* Append length (before padding) */ + fio_sha512_update(sctx, bits, 16); + + /* Store state in digest */ + for (i = j = 0; i < 8; i++, j += 8) { + t2 = sctx->state[i]; + hash[j+7] = (char)t2 & 0xff; t2>>=8; + hash[j+6] = (char)t2 & 0xff; t2>>=8; + hash[j+5] = (char)t2 & 0xff; t2>>=8; + hash[j+4] = (char)t2 & 0xff; t2>>=8; + hash[j+3] = (char)t2 & 0xff; t2>>=8; + hash[j+2] = (char)t2 & 0xff; t2>>=8; + hash[j+1] = (char)t2 & 0xff; t2>>=8; + hash[j ] = (char)t2 & 0xff; + } +} diff --git a/crc/sha512.h b/crc/sha512.h index 5adf6271cd..dd26d8aa14 100644 --- a/crc/sha512.h +++ b/crc/sha512.h @@ -12,5 +12,6 @@ struct fio_sha512_ctx { void fio_sha512_init(struct fio_sha512_ctx *); void fio_sha512_update(struct fio_sha512_ctx *, const uint8_t *, unsigned int); +void fio_sha512_final(struct fio_sha512_ctx *sctx); #endif diff --git a/crc/test.c b/crc/test.c index b57f07a4d7..8102297ef6 100644 --- a/crc/test.c +++ b/crc/test.c @@ -3,10 +3,10 @@ #include #include +#include "../os/os.h" #include "../gettime.h" #include "../fio_time.h" #include "../lib/rand.h" -#include "../os/os.h" #include "../crc/md5.h" #include "../crc/crc64.h" diff --git a/crc/xxhash.c b/crc/xxhash.c index 4736c528fc..0119564be3 100644 --- a/crc/xxhash.c +++ b/crc/xxhash.c @@ -50,10 +50,10 @@ You can contact the author at : //#define XXH_ACCEPT_NULL_INPUT_POINTER 1 // XXH_FORCE_NATIVE_FORMAT : -// By default, xxHash library provides endian-independant Hash values, based on little-endian convention. +// By default, xxHash library provides endian-independent Hash values, based on little-endian convention. // Results are therefore identical for little-endian and big-endian CPU. // This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. -// Should endian-independance be of no importance for your application, you may set the #define below to 1. 
+// Should endian-independence be of no importance for your application, you may set the #define below to 1. // It will improve speed for Big-endian CPU. // This option has no impact on Little_Endian CPU. #define XXH_FORCE_NATIVE_FORMAT 0 diff --git a/dataplacement.c b/dataplacement.c new file mode 100644 index 0000000000..fc45cd27d8 --- /dev/null +++ b/dataplacement.c @@ -0,0 +1,262 @@ +/* + * Note: This is similar to a very basic setup + * of ZBD devices + * + * Specify fdp=1 (With char devices /dev/ng0n1) + */ + +#include +#include +#include +#include +#include "fio.h" +#include "file.h" + +#include "pshared.h" +#include "dataplacement.h" + +static int fdp_ruh_info(struct thread_data *td, struct fio_file *f, + struct fio_ruhs_info *ruhs) +{ + int ret = -EINVAL; + + if (!td->io_ops) { + log_err("fio: no ops set in fdp init?!\n"); + return ret; + } + + if (td->io_ops->fdp_fetch_ruhs) { + ret = td->io_ops->fdp_fetch_ruhs(td, f, ruhs); + if (ret < 0) { + td_verror(td, errno, "fdp fetch ruhs failed"); + log_err("%s: fdp fetch ruhs failed (%d)\n", + f->file_name, errno); + } + } else { + log_err("%s: engine (%s) lacks fetch ruhs\n", + f->file_name, td->io_ops->name); + } + + return ret; +} + +static int init_ruh_info(struct thread_data *td, struct fio_file *f) +{ + struct fio_ruhs_info *ruhs, *tmp; + uint32_t nr_ruhs; + int i, ret; + + /* set up the data structure used for FDP to work with the supplied stream IDs */ + if (td->o.dp_type == FIO_DP_STREAMS) { + if (!td->o.dp_nr_ids) { + log_err("fio: stream IDs must be provided for dataplacement=streams\n"); + return -EINVAL; + } + ruhs = scalloc(1, sizeof(*ruhs) + td->o.dp_nr_ids * sizeof(*ruhs->plis)); + if (!ruhs) + return -ENOMEM; + + ruhs->nr_ruhs = td->o.dp_nr_ids; + for (int i = 0; i < ruhs->nr_ruhs; i++) + ruhs->plis[i] = td->o.dp_ids[i]; + + f->ruhs_info = ruhs; + return 0; + } + + /* + * Since we don't know the actual number of ruhs. Only fetch the header. 
+ * We will reallocate this buffer and then fetch all the ruhs again. + */ + ruhs = calloc(1, sizeof(*ruhs)); + ret = fdp_ruh_info(td, f, ruhs); + if (ret) { + log_err("fio: ruh info failed for %s (%d)\n", + f->file_name, -ret); + goto out; + } + + nr_ruhs = ruhs->nr_ruhs; + ruhs = realloc(ruhs, sizeof(*ruhs) + nr_ruhs * sizeof(*ruhs->plis)); + if (!ruhs) { + log_err("fio: ruhs buffer realloc failed for %s\n", + f->file_name); + ret = -ENOMEM; + goto out; + } + + ruhs->nr_ruhs = nr_ruhs; + ret = fdp_ruh_info(td, f, ruhs); + if (ret) { + log_err("fio: ruh info failed for %s (%d)\n", + f->file_name, -ret); + goto out; + } + + if (td->o.dp_nr_ids == 0) { + if (ruhs->nr_ruhs > FIO_MAX_DP_IDS) + ruhs->nr_ruhs = FIO_MAX_DP_IDS; + } else { + for (i = 0; i < td->o.dp_nr_ids; i++) { + if (td->o.dp_ids[i] >= ruhs->nr_ruhs) { + log_err("fio: for %s PID index %d must be smaller than %d\n", + f->file_name, td->o.dp_ids[i], + ruhs->nr_ruhs); + ret = -EINVAL; + goto out; + } + } + ruhs->nr_ruhs = td->o.dp_nr_ids; + } + + tmp = scalloc(1, sizeof(*tmp) + ruhs->nr_ruhs * sizeof(*tmp->plis)); + if (!tmp) { + ret = -ENOMEM; + goto out; + } + + if (td->o.dp_nr_ids == 0) { + for (i = 0; i < ruhs->nr_ruhs; i++) + tmp->plis[i] = ruhs->plis[i]; + + tmp->nr_ruhs = ruhs->nr_ruhs; + f->ruhs_info = tmp; + free(ruhs); + + return 0; + } + + tmp->nr_ruhs = td->o.dp_nr_ids; + for (i = 0; i < td->o.dp_nr_ids; i++) + tmp->plis[i] = ruhs->plis[td->o.dp_ids[i]]; + f->ruhs_info = tmp; +out: + free(ruhs); + return ret; +} + +static int init_ruh_scheme(struct thread_data *td, struct fio_file *f) +{ + struct fio_ruhs_scheme *ruh_scheme; + FILE *scheme_fp; + unsigned long long start, end; + uint16_t pli; + int ret = 0; + + if (td->o.dp_id_select != FIO_DP_SCHEME) + return 0; + + /* Get the scheme from the file */ + scheme_fp = fopen(td->o.dp_scheme_file, "r"); + + if (!scheme_fp) { + log_err("fio: ruh scheme failed to open scheme file %s\n", + td->o.dp_scheme_file); + ret = -errno; + goto out; + } + + 
ruh_scheme = scalloc(1, sizeof(*ruh_scheme)); + if (!ruh_scheme) { + ret = -ENOMEM; + goto out_with_close_fp; + } + + for (int i = 0; + i < DP_MAX_SCHEME_ENTRIES && fscanf(scheme_fp, "%llu,%llu,%hu\n", &start, &end, &pli) == 3; + i++) { + + ruh_scheme->scheme_entries[i].start_offset = start; + ruh_scheme->scheme_entries[i].end_offset = end; + ruh_scheme->scheme_entries[i].pli = pli; + ruh_scheme->nr_schemes++; + } + + if (fscanf(scheme_fp, "%llu,%llu,%hu\n", &start, &end, &pli) == 3) + log_info("fio: too many scheme entries in %s. Only the first %d scheme entries are applied\n", + td->o.dp_scheme_file, + DP_MAX_SCHEME_ENTRIES); + + f->ruhs_scheme = ruh_scheme; + +out_with_close_fp: + fclose(scheme_fp); +out: + return ret; +} + +int dp_init(struct thread_data *td) +{ + struct fio_file *f; + int i, ret = 0; + + for_each_file(td, f, i) { + ret = init_ruh_info(td, f); + if (ret) + break; + + ret = init_ruh_scheme(td, f); + if (ret) + break; + } + return ret; +} + +void fdp_free_ruhs_info(struct fio_file *f) +{ + if (!f->ruhs_info) + return; + sfree(f->ruhs_info); + f->ruhs_info = NULL; + + if (!f->ruhs_scheme) + return; + sfree(f->ruhs_scheme); + f->ruhs_scheme = NULL; +} + +void dp_fill_dspec_data(struct thread_data *td, struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + struct fio_ruhs_info *ruhs = f->ruhs_info; + int dspec; + + if (!ruhs || io_u->ddir != DDIR_WRITE) { + io_u->dtype = 0; + io_u->dspec = 0; + return; + } + + if (td->o.dp_id_select == FIO_DP_RR) { + if (ruhs->pli_loc >= ruhs->nr_ruhs) + ruhs->pli_loc = 0; + + dspec = ruhs->plis[ruhs->pli_loc++]; + } else if (td->o.dp_id_select == FIO_DP_SCHEME) { + struct fio_ruhs_scheme *ruhs_scheme = f->ruhs_scheme; + unsigned long long offset = io_u->offset; + int i; + + for (i = 0; i < ruhs_scheme->nr_schemes; i++) { + if (offset >= ruhs_scheme->scheme_entries[i].start_offset && + offset < ruhs_scheme->scheme_entries[i].end_offset) { + dspec = ruhs_scheme->scheme_entries[i].pli; + break; + } + } + + /* + * 
If the write offset is not affected by any scheme entry, + * 0(default RUH) will be assigned to dspec + */ + if (i == ruhs_scheme->nr_schemes) + dspec = 0; + } else { + ruhs->pli_loc = rand_between(&td->fdp_state, 0, ruhs->nr_ruhs - 1); + dspec = ruhs->plis[ruhs->pli_loc]; + } + + io_u->dtype = td->o.dp_type == FIO_DP_FDP ? FDP_DIR_DTYPE : STREAMS_DIR_DTYPE; + io_u->dspec = dspec; + dprint(FD_IO, "dtype set to 0x%x, dspec set to 0x%x\n", io_u->dtype, io_u->dspec); +} diff --git a/dataplacement.h b/dataplacement.h new file mode 100644 index 0000000000..84b7be5b40 --- /dev/null +++ b/dataplacement.h @@ -0,0 +1,48 @@ +#ifndef FIO_DATAPLACEMENT_H +#define FIO_DATAPLACEMENT_H + +#include "io_u.h" + +#define STREAMS_DIR_DTYPE 1 +#define FDP_DIR_DTYPE 2 +#define FIO_MAX_DP_IDS 128 +#define DP_MAX_SCHEME_ENTRIES 32 + +/* + * How fio chooses what placement identifier to use next. Choice of + * uniformly random, roundrobin, or a file-defined offset scheme. + */ +enum { + FIO_DP_RANDOM = 0x1, + FIO_DP_RR = 0x2, + FIO_DP_SCHEME = 0x3, +}; + +enum { + FIO_DP_NONE = 0x0, + FIO_DP_FDP = 0x1, + FIO_DP_STREAMS = 0x2, +}; + +struct fio_ruhs_info { + uint32_t nr_ruhs; + uint32_t pli_loc; + uint16_t plis[]; +}; + +struct fio_ruhs_scheme_entry { + unsigned long long start_offset; + unsigned long long end_offset; + uint16_t pli; +}; + +struct fio_ruhs_scheme { + uint16_t nr_schemes; + struct fio_ruhs_scheme_entry scheme_entries[DP_MAX_SCHEME_ENTRIES]; +}; + +int dp_init(struct thread_data *td); +void fdp_free_ruhs_info(struct fio_file *f); +void dp_fill_dspec_data(struct thread_data *td, struct io_u *io_u); + +#endif /* FIO_DATAPLACEMENT_H */ diff --git a/debug.h b/debug.h index 51b18de235..49a8791d21 100644 --- a/debug.h +++ b/debug.h @@ -23,6 +23,7 @@ enum { FD_STEADYSTATE, FD_HELPERTHREAD, FD_ZBD, + FD_SPRANDOM, FD_DEBUG_MAX, }; diff --git a/dedupe.c b/dedupe.c index fd116dfba4..6170568918 100644 --- a/dedupe.c +++ b/dedupe.c @@ -1,13 +1,34 @@ #include "fio.h" -int init_dedupe_working_set_seeds(struct thread_data
*td) +/** + * initializes the global dedup workset. + * this needs to be called after all jobs' seeds + * have been initialized + */ +int init_global_dedupe_working_set_seeds(void) { - unsigned long long i, j, num_seed_advancements; + for_each_td(td) { + if (!td->o.dedupe_global) + continue; + + if (init_dedupe_working_set_seeds(td, 1)) + return 1; + } end_for_each(); + + return 0; +} + +int init_dedupe_working_set_seeds(struct thread_data *td, bool global_dedup) +{ + int tindex; + struct thread_data *td_seed; + unsigned long long i, j, num_seed_advancements, pages_per_seed; struct frand_state dedupe_working_set_state = {0}; if (!td->o.dedupe_percentage || !(td->o.dedupe_mode == DEDUPE_MODE_WORKING_SET)) return 0; + tindex = td->thread_number - 1; num_seed_advancements = td->o.min_bs[DDIR_WRITE] / min_not_zero(td->o.min_bs[DDIR_WRITE], (unsigned long long) td->o.compress_chunk); /* @@ -20,9 +41,11 @@ int init_dedupe_working_set_seeds(struct thread_data *td) log_err("fio: could not allocate dedupe working set\n"); return 1; } + frand_copy(&dedupe_working_set_state, &td->buf_state); - for (i = 0; i < td->num_unique_pages; i++) { - frand_copy(&td->dedupe_working_set_states[i], &dedupe_working_set_state); + frand_copy(&td->dedupe_working_set_states[0], &dedupe_working_set_state); + pages_per_seed = max(td->num_unique_pages / thread_number, 1ull); + for (i = 1; i < td->num_unique_pages; i++) { /* * When compression is used the seed is advanced multiple times to * generate the buffer. We want to regenerate the same buffer when @@ -30,6 +53,18 @@ int init_dedupe_working_set_seeds(struct thread_data *td) */ for (j = 0; j < num_seed_advancements; j++) __get_next_seed(&dedupe_working_set_state); + + /* + * When global dedup is used, we rotate the seeds to allow + * generating same buffers across different jobs. 
Deduplication buffers + * are spread evenly across jobs participating in global dedupe + */ + if (global_dedup && i % pages_per_seed == 0) { + td_seed = tnumber_to_td(++tindex % thread_number); + frand_copy(&dedupe_working_set_state, &td_seed->buf_state); + } + + frand_copy(&td->dedupe_working_set_states[i], &dedupe_working_set_state); } return 0; diff --git a/dedupe.h b/dedupe.h index d4c4dc3779..bd1f9c0c0b 100644 --- a/dedupe.h +++ b/dedupe.h @@ -1,6 +1,7 @@ #ifndef DEDUPE_H #define DEDUPE_H -int init_dedupe_working_set_seeds(struct thread_data *td); +int init_dedupe_working_set_seeds(struct thread_data *td, bool global_dedupe); +int init_global_dedupe_working_set_seeds(void); #endif diff --git a/diskutil.c b/diskutil.c index ace7af3d5b..f018015cb7 100644 --- a/diskutil.c +++ b/diskutil.c @@ -1,3 +1,4 @@ +#include #include #include #include @@ -37,15 +38,13 @@ static void disk_util_free(struct disk_util *du) slave->users--; } - fio_sem_remove(du->lock); + fio_shared_sem_remove(du->lock); free(du->sysfs_root); sfree(du); } static int get_io_ticks(struct disk_util *du, struct disk_util_stat *dus) { - unsigned in_flight; - unsigned long long sectors[2]; char line[256]; FILE *f; char *p; @@ -65,23 +64,34 @@ static int get_io_ticks(struct disk_util *du, struct disk_util_stat *dus) dprint(FD_DISKUTIL, "%s: %s", du->path, p); - ret = sscanf(p, "%llu %llu %llu %llu %llu %llu %llu %llu %u %llu %llu\n", - (unsigned long long *) &dus->s.ios[0], - (unsigned long long *) &dus->s.merges[0], - §ors[0], - (unsigned long long *) &dus->s.ticks[0], - (unsigned long long *) &dus->s.ios[1], - (unsigned long long *) &dus->s.merges[1], - §ors[1], - (unsigned long long *) &dus->s.ticks[1], - &in_flight, - (unsigned long long *) &dus->s.io_ticks, - (unsigned long long *) &dus->s.time_in_queue); + ret = sscanf(p, "%"SCNu64" %"SCNu64" %"SCNu64" %"SCNu64" " + "%"SCNu64" %"SCNu64" %"SCNu64" %"SCNu64" " + "%*u %"SCNu64" %"SCNu64"\n", + &dus->s.ios[0], &dus->s.merges[0], &dus->s.sectors[0], + 
&dus->s.ticks[0], + &dus->s.ios[1], &dus->s.merges[1], &dus->s.sectors[1], + &dus->s.ticks[1], + &dus->s.io_ticks, &dus->s.time_in_queue); fclose(f); - dprint(FD_DISKUTIL, "%s: stat read ok? %d\n", du->path, ret == 1); - dus->s.sectors[0] = sectors[0]; - dus->s.sectors[1] = sectors[1]; - return ret != 11; + dprint(FD_DISKUTIL, "%s: stat read ok? %d\n", du->path, ret == 10); + return ret != 10; +} + +static uint64_t safe_32bit_diff(uint64_t nval, uint64_t oval) +{ + /* Linux kernel prints some of the stat fields as 32-bit integers. It is + * possible that the value overflows, but since fio uses unsigned 64-bit + * arithmetic in update_io_tick_disk(), it instead results in a huge + * bogus value being added to the respective accumulating field. Just + * in case Linux starts reporting these metrics as 64-bit values in the + * future, check that overflow actually happens around the 32-bit + * unsigned boundary; assume overflow only happens once between + * successive polls. + */ + if (oval <= nval || oval >= (1ull << 32)) + return nval - oval; + else + return (1ull << 32) + nval - oval; } static void update_io_tick_disk(struct disk_util *du) @@ -103,15 +113,16 @@ static void update_io_tick_disk(struct disk_util *du) dus->s.ios[1] += (__dus.s.ios[1] - ldus->s.ios[1]); dus->s.merges[0] += (__dus.s.merges[0] - ldus->s.merges[0]); dus->s.merges[1] += (__dus.s.merges[1] - ldus->s.merges[1]); - dus->s.ticks[0] += (__dus.s.ticks[0] - ldus->s.ticks[0]); - dus->s.ticks[1] += (__dus.s.ticks[1] - ldus->s.ticks[1]); - dus->s.io_ticks += (__dus.s.io_ticks - ldus->s.io_ticks); - dus->s.time_in_queue += (__dus.s.time_in_queue - ldus->s.time_in_queue); + dus->s.ticks[0] += safe_32bit_diff(__dus.s.ticks[0], ldus->s.ticks[0]); + dus->s.ticks[1] += safe_32bit_diff(__dus.s.ticks[1], ldus->s.ticks[1]); + dus->s.io_ticks += safe_32bit_diff(__dus.s.io_ticks, ldus->s.io_ticks); + dus->s.time_in_queue += + safe_32bit_diff(__dus.s.time_in_queue, ldus->s.time_in_queue); fio_gettime(&t, NULL); 
dus->s.msec += mtime_since(&du->time, &t); - memcpy(&du->time, &t, sizeof(t)); - memcpy(&ldus->s, &__dus.s, sizeof(__dus.s)); + du->time = t; + ldus->s = __dus.s; } int update_io_ticks(void) @@ -316,7 +327,7 @@ static struct disk_util *disk_util_add(struct thread_data *td, int majdev, du->minor = mindev; INIT_FLIST_HEAD(&du->slavelist); INIT_FLIST_HEAD(&du->slaves); - du->lock = fio_sem_init(FIO_SEM_UNLOCKED); + du->lock = fio_shared_sem_init(FIO_SEM_UNLOCKED); du->users = 0; fio_sem_down(disk_util_sem); diff --git a/diskutil.h b/diskutil.h index 83bcbf895e..9b28379983 100644 --- a/diskutil.h +++ b/diskutil.h @@ -2,9 +2,24 @@ #define FIO_DISKUTIL_H #define FIO_DU_NAME_SZ 64 +#include +#include + #include "helper_thread.h" #include "fio_sem.h" - +#include "flist.h" +#include "lib/ieee754.h" + +/** + * @ios: Number of I/O operations that have been completed successfully. + * @merges: Number of I/O operations that have been merged. + * @sectors: I/O size in 512-byte units. + * @ticks: Time spent on I/O in milliseconds. + * @io_ticks: CPU time spent on I/O in milliseconds. + * @time_in_queue: Weighted time spent doing I/O in milliseconds. + * + * For the array members, index 0 refers to reads and index 1 refers to writes. + */ struct disk_util_stats { uint64_t ios[2]; uint64_t merges[2]; @@ -16,7 +31,7 @@ struct disk_util_stats { }; /* - * Disk utils as read in /sys/block//stat + * Disk utilization as read from /sys/block//stat */ struct disk_util_stat { uint8_t name[FIO_DU_NAME_SZ]; diff --git a/doc/Makefile b/doc/Makefile index 3b979f9acb..a444d83a50 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -2,7 +2,7 @@ # # You can set these variables from the command line. 
-SPHINXOPTS = +SPHINXOPTS = -W --keep-going SPHINXBUILD = sphinx-build PAPER = BUILDDIR = output diff --git a/doc/conf.py b/doc/conf.py index 10b72ecb91..18a8dccce3 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -85,13 +85,6 @@ def fio_version(): version, release = fio_version() -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: # @@ -325,7 +318,7 @@ def fio_version(): # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('fio_man', 'fio', 'flexible I/O tester', + ('fio_doc', 'fio', 'flexible I/O tester', [author], 1) ] diff --git a/doc/fio_doc.rst b/doc/fio_doc.rst index b5987b52a8..cad84c7c7e 100644 --- a/doc/fio_doc.rst +++ b/doc/fio_doc.rst @@ -2,50 +2,50 @@ fio - Flexible I/O tester rev. |version| ======================================== -.. include:: ../README +.. include:: ../README.rst -.. include:: ../HOWTO +.. include:: ../HOWTO.rst +.. only:: not man + Examples + ======== -Examples -======== + .. include:: fio_examples.rst -.. include:: fio_examples.rst + TODO + ==== -TODO -==== + GFIO TODO + --------- -GFIO TODO ---------- + .. include:: ../GFIO-TODO -.. include:: ../GFIO-TODO + Server TODO + ----------- -Server TODO ------------ + .. include:: ../SERVER-TODO -.. include:: ../SERVER-TODO + Steady State TODO + ----------------- -Steady State TODO ------------------ + .. include:: ../STEADYSTATE-TODO -.. include:: ../STEADYSTATE-TODO + Moral License + ============= -Moral License -============= + .. include:: ../MORAL-LICENSE -.. include:: ../MORAL-LICENSE + License + ======= -License -======= - -.. literalinclude:: ../COPYING + .. 
literalinclude:: ../COPYING diff --git a/doc/fio_man.rst b/doc/fio_man.rst deleted file mode 100644 index c6a6438ff3..0000000000 --- a/doc/fio_man.rst +++ /dev/null @@ -1,12 +0,0 @@ -:orphan: - -Fio Manpage -=========== - -(rev. |release|) - - -.. include:: ../README - - -.. include:: ../HOWTO diff --git a/engines/cmdprio.c b/engines/cmdprio.c index 92b752aecd..153e36911a 100644 --- a/engines/cmdprio.c +++ b/engines/cmdprio.c @@ -5,45 +5,201 @@ #include "cmdprio.h" -static int fio_cmdprio_bssplit_ddir(struct thread_options *to, void *cb_arg, - enum fio_ddir ddir, char *str, bool data) +/* + * Temporary array used during parsing. Will be freed after the corresponding + * struct bsprio_desc has been generated and saved in cmdprio->bsprio_desc. + */ +struct cmdprio_parse_result { + struct split_prio *entries; + int nr_entries; +}; + +/* + * Temporary array used during init. Will be freed after the corresponding + * struct clat_prio_stat array has been saved in td->ts.clat_prio and the + * matching clat_prio_indexes have been saved in each struct cmdprio_prio. + */ +struct cmdprio_values { + unsigned int *prios; + int nr_prios; +}; + +static int find_clat_prio_index(unsigned int *all_prios, int nr_prios, + int32_t prio) { - struct cmdprio *cmdprio = cb_arg; - struct split split; - unsigned int i; + int i; - if (ddir == DDIR_TRIM) - return 0; + for (i = 0; i < nr_prios; i++) { + if (all_prios[i] == prio) + return i; + } + + return -1; +} - memset(&split, 0, sizeof(split)); +/** + * assign_clat_prio_index - In order to avoid stat.c the need to loop through + * all possible priorities each time add_clat_sample() / add_lat_sample() is + * called, save which index to use in each cmdprio_prio. This will later be + * propagated to the io_u, if the specific io_u was determined to use a cmdprio + * priority value. 
+ */ +static void assign_clat_prio_index(struct cmdprio_prio *prio, + struct cmdprio_values *values) +{ + int clat_prio_index = find_clat_prio_index(values->prios, + values->nr_prios, + prio->prio); + if (clat_prio_index == -1) { + clat_prio_index = values->nr_prios; + values->prios[clat_prio_index] = prio->prio; + values->nr_prios++; + } + prio->clat_prio_index = clat_prio_index; +} - if (split_parse_ddir(to, &split, str, data, BSSPLIT_MAX)) +/** + * init_cmdprio_values - Allocate a temporary array that can hold all unique + * priorities (per ddir), so that we can assign_clat_prio_index() for each + * cmdprio_prio during setup. This temporary array is freed after setup. + */ +static int init_cmdprio_values(struct cmdprio_values *values, + int max_unique_prios, struct thread_stat *ts) +{ + values->prios = calloc(max_unique_prios + 1, + sizeof(*values->prios)); + if (!values->prios) return 1; - if (!split.nr) - return 0; - cmdprio->bssplit_nr[ddir] = split.nr; - cmdprio->bssplit[ddir] = malloc(split.nr * sizeof(struct bssplit)); - if (!cmdprio->bssplit[ddir]) + /* td->ioprio/ts->ioprio is always stored at index 0. */ + values->prios[0] = ts->ioprio; + values->nr_prios++; + + return 0; +} + +/** + * init_ts_clat_prio - Allocates and fills a clat_prio_stat array which holds + * all unique priorities (per ddir). 
+ */ +static int init_ts_clat_prio(struct thread_stat *ts, enum fio_ddir ddir, + struct cmdprio_values *values) +{ + int i; + + if (alloc_clat_prio_stat_ddir(ts, ddir, values->nr_prios)) return 1; - for (i = 0; i < split.nr; i++) { - cmdprio->bssplit[ddir][i].bs = split.val1[i]; - if (split.val2[i] == -1U) { - cmdprio->bssplit[ddir][i].perc = 0; - } else { - if (split.val2[i] > 100) - cmdprio->bssplit[ddir][i].perc = 100; - else - cmdprio->bssplit[ddir][i].perc = split.val2[i]; + for (i = 0; i < values->nr_prios; i++) + ts->clat_prio[ddir][i].ioprio = values->prios[i]; + + return 0; +} + +static int fio_cmdprio_fill_bsprio(struct cmdprio_bsprio *bsprio, + struct split_prio *entries, + struct cmdprio_values *values, + int implicit_cmdprio, int start, int end) +{ + struct cmdprio_prio *prio; + int i = end - start + 1; + + bsprio->prios = calloc(i, sizeof(*bsprio->prios)); + if (!bsprio->prios) + return 1; + + bsprio->bs = entries[start].bs; + bsprio->nr_prios = 0; + for (i = start; i <= end; i++) { + prio = &bsprio->prios[bsprio->nr_prios]; + prio->perc = entries[i].perc; + if (entries[i].prio == -1) + prio->prio = implicit_cmdprio; + else + prio->prio = entries[i].prio; + assign_clat_prio_index(prio, values); + bsprio->tot_perc += entries[i].perc; + if (bsprio->tot_perc > 100) { + log_err("fio: cmdprio_bssplit total percentage " + "for bs: %"PRIu64" exceeds 100\n", + bsprio->bs); + free(bsprio->prios); + return 1; } + bsprio->nr_prios++; } return 0; } -int fio_cmdprio_bssplit_parse(struct thread_data *td, const char *input, - struct cmdprio *cmdprio) +static int +fio_cmdprio_generate_bsprio_desc(struct cmdprio_bsprio_desc *bsprio_desc, + struct cmdprio_parse_result *parse_res, + struct cmdprio_values *values, + int implicit_cmdprio) +{ + struct split_prio *entries = parse_res->entries; + int nr_entries = parse_res->nr_entries; + struct cmdprio_bsprio *bsprio; + int i, start, count = 0; + + /* + * The parsed result is sorted by blocksize, so count only the number + * 
of different blocksizes, to know how many cmdprio_bsprio we need. + */ + for (i = 0; i < nr_entries; i++) { + while (i + 1 < nr_entries && entries[i].bs == entries[i + 1].bs) + i++; + count++; + } + + /* + * This allocation is not freed on error. Instead, the calling function + * is responsible for calling fio_cmdprio_cleanup() on error. + */ + bsprio_desc->bsprios = calloc(count, sizeof(*bsprio_desc->bsprios)); + if (!bsprio_desc->bsprios) + return 1; + + start = 0; + bsprio_desc->nr_bsprios = 0; + for (i = 0; i < nr_entries; i++) { + while (i + 1 < nr_entries && entries[i].bs == entries[i + 1].bs) + i++; + bsprio = &bsprio_desc->bsprios[bsprio_desc->nr_bsprios]; + /* + * All parsed entries with the same blocksize get saved in the + * same cmdprio_bsprio, to expedite the search in the hot path. + */ + if (fio_cmdprio_fill_bsprio(bsprio, entries, values, + implicit_cmdprio, start, i)) + return 1; + + start = i + 1; + bsprio_desc->nr_bsprios++; + } + + return 0; +} + +static int fio_cmdprio_bssplit_ddir(struct thread_options *to, void *cb_arg, + enum fio_ddir ddir, char *str, bool data) +{ + struct cmdprio_parse_result *parse_res_arr = cb_arg; + struct cmdprio_parse_result *parse_res = &parse_res_arr[ddir]; + + if (ddir == DDIR_TRIM) + return 0; + + if (split_parse_prio_ddir(to, &parse_res->entries, + &parse_res->nr_entries, str)) + return 1; + + return 0; +} + +static int fio_cmdprio_bssplit_parse(struct thread_data *td, const char *input, + struct cmdprio_parse_result *parse_res) { char *str, *p; int ret = 0; @@ -53,26 +209,39 @@ int fio_cmdprio_bssplit_parse(struct thread_data *td, const char *input, strip_blank_front(&str); strip_blank_end(str); - ret = str_split_parse(td, str, fio_cmdprio_bssplit_ddir, cmdprio, + ret = str_split_parse(td, str, fio_cmdprio_bssplit_ddir, parse_res, false); free(p); return ret; } -static int fio_cmdprio_percentage(struct cmdprio *cmdprio, struct io_u *io_u) +/** + * fio_cmdprio_percentage - Returns the percentage of I/Os that 
should + * use a cmdprio priority value (rather than the default context priority). + * + * For CMDPRIO_MODE_BSSPLIT, if the percentage is non-zero, we will also + * return the matching bsprio, to avoid the same linear search elsewhere. + * For CMDPRIO_MODE_PERC, we will never return a bsprio. + */ +static int fio_cmdprio_percentage(struct cmdprio *cmdprio, struct io_u *io_u, + struct cmdprio_bsprio **bsprio) { + struct cmdprio_bsprio *bsprio_entry; enum fio_ddir ddir = io_u->ddir; - struct cmdprio_options *options = cmdprio->options; int i; switch (cmdprio->mode) { case CMDPRIO_MODE_PERC: - return options->percentage[ddir]; + *bsprio = NULL; + return cmdprio->perc_entry[ddir].perc; case CMDPRIO_MODE_BSSPLIT: - for (i = 0; i < cmdprio->bssplit_nr[ddir]; i++) { - if (cmdprio->bssplit[ddir][i].bs == io_u->buflen) - return cmdprio->bssplit[ddir][i].perc; + for (i = 0; i < cmdprio->bsprio_desc[ddir].nr_bsprios; i++) { + bsprio_entry = &cmdprio->bsprio_desc[ddir].bsprios[i]; + if (bsprio_entry->bs == io_u->buflen) { + *bsprio = bsprio_entry; + return bsprio_entry->tot_perc; + } } break; default: @@ -83,6 +252,11 @@ static int fio_cmdprio_percentage(struct cmdprio *cmdprio, struct io_u *io_u) assert(0); } + /* + * This is totally fine, the given blocksize simply does not + * have any (non-zero) cmdprio_bssplit entries defined. + */ + *bsprio = NULL; return 0; } @@ -93,59 +267,172 @@ static int fio_cmdprio_percentage(struct cmdprio *cmdprio, struct io_u *io_u) * to be set. If the random percentage value is within the user specified * percentage of I/Os that should use a cmdprio priority value (rather than * the default priority), then this function updates the io_u with an ioprio - * value as defined by the cmdprio/cmdprio_class or cmdprio_bssplit options. + * value as defined by the cmdprio/cmdprio_hint/cmdprio_class or + * cmdprio_bssplit options. * * Return true if the io_u ioprio was changed and false otherwise. 
*/ bool fio_cmdprio_set_ioprio(struct thread_data *td, struct cmdprio *cmdprio, struct io_u *io_u) { - enum fio_ddir ddir = io_u->ddir; - struct cmdprio_options *options = cmdprio->options; - unsigned int p; - unsigned int cmdprio_value = - ioprio_value(options->class[ddir], options->level[ddir]); - - p = fio_cmdprio_percentage(cmdprio, io_u); - if (p && rand_between(&td->prio_state, 0, 99) < p) { - io_u->ioprio = cmdprio_value; - if (!td->ioprio || cmdprio_value < td->ioprio) { - /* - * The async IO priority is higher (has a lower value) - * than the default priority (which is either 0 or the - * value set by "prio" and "prioclass" options). - */ - io_u->flags |= IO_U_F_HIGH_PRIO; - } + struct cmdprio_bsprio *bsprio; + unsigned int p, rand; + uint32_t perc = 0; + int i; + + p = fio_cmdprio_percentage(cmdprio, io_u, &bsprio); + if (!p) + return false; + + rand = rand_between(&td->prio_state, 0, 99); + if (rand >= p) + return false; + + switch (cmdprio->mode) { + case CMDPRIO_MODE_PERC: + io_u->ioprio = cmdprio->perc_entry[io_u->ddir].prio; + io_u->clat_prio_index = + cmdprio->perc_entry[io_u->ddir].clat_prio_index; return true; + case CMDPRIO_MODE_BSSPLIT: + assert(bsprio); + for (i = 0; i < bsprio->nr_prios; i++) { + struct cmdprio_prio *prio = &bsprio->prios[i]; + + perc += prio->perc; + if (rand < perc) { + io_u->ioprio = prio->prio; + io_u->clat_prio_index = prio->clat_prio_index; + return true; + } + } + break; + default: + assert(0); } - if (td->ioprio && td->ioprio < cmdprio_value) { + /* When rand < p (total perc), we should always find a cmdprio_prio. 
*/ + assert(0); + return false; +} + +static int fio_cmdprio_gen_perc(struct thread_data *td, struct cmdprio *cmdprio) +{ + struct cmdprio_options *options = cmdprio->options; + struct cmdprio_prio *prio; + struct cmdprio_values values[CMDPRIO_RWDIR_CNT] = {}; + struct thread_stat *ts = &td->ts; + enum fio_ddir ddir; + int ret; + + for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) { /* - * The IO will be executed with the default priority (which is - * either 0 or the value set by "prio" and "prioclass options), - * and this priority is higher (has a lower value) than the - * async IO priority. + * Do not allocate a clat_prio array nor set the cmdprio struct + * if zero percent of the I/Os (for the ddir) should use a + * cmdprio priority value, or when the ddir is not enabled. */ - io_u->flags |= IO_U_F_HIGH_PRIO; + if (!options->percentage[ddir] || + (ddir == DDIR_READ && !td_read(td)) || + (ddir == DDIR_WRITE && !td_write(td))) + continue; + + ret = init_cmdprio_values(&values[ddir], 1, ts); + if (ret) + goto err; + + prio = &cmdprio->perc_entry[ddir]; + prio->perc = options->percentage[ddir]; + prio->prio = ioprio_value(options->class[ddir], + options->level[ddir], + options->hint[ddir]); + assign_clat_prio_index(prio, &values[ddir]); + + ret = init_ts_clat_prio(ts, ddir, &values[ddir]); + if (ret) + goto err; + + free(values[ddir].prios); + values[ddir].prios = NULL; + values[ddir].nr_prios = 0; } - return false; + return 0; + +err: + for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) + free(values[ddir].prios); + free_clat_prio_stats(ts); + + return ret; } static int fio_cmdprio_parse_and_gen_bssplit(struct thread_data *td, struct cmdprio *cmdprio) { struct cmdprio_options *options = cmdprio->options; - int ret; - - ret = fio_cmdprio_bssplit_parse(td, options->bssplit_str, cmdprio); + struct cmdprio_parse_result parse_res[CMDPRIO_RWDIR_CNT] = {}; + struct cmdprio_values values[CMDPRIO_RWDIR_CNT] = {}; + struct thread_stat *ts = &td->ts; + int ret, 
implicit_cmdprio; + enum fio_ddir ddir; + + ret = fio_cmdprio_bssplit_parse(td, options->bssplit_str, + &parse_res[0]); if (ret) goto err; + for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) { + /* + * Do not allocate a clat_prio array nor set the cmdprio structs + * if there are no non-zero entries (for the ddir), or when the + * ddir is not enabled. + */ + if (!parse_res[ddir].nr_entries || + (ddir == DDIR_READ && !td_read(td)) || + (ddir == DDIR_WRITE && !td_write(td))) { + free(parse_res[ddir].entries); + parse_res[ddir].entries = NULL; + parse_res[ddir].nr_entries = 0; + continue; + } + + ret = init_cmdprio_values(&values[ddir], + parse_res[ddir].nr_entries, ts); + if (ret) + goto err; + + implicit_cmdprio = ioprio_value(options->class[ddir], + options->level[ddir], + options->hint[ddir]); + + ret = fio_cmdprio_generate_bsprio_desc(&cmdprio->bsprio_desc[ddir], + &parse_res[ddir], + &values[ddir], + implicit_cmdprio); + if (ret) + goto err; + + free(parse_res[ddir].entries); + parse_res[ddir].entries = NULL; + parse_res[ddir].nr_entries = 0; + + ret = init_ts_clat_prio(ts, ddir, &values[ddir]); + if (ret) + goto err; + + free(values[ddir].prios); + values[ddir].prios = NULL; + values[ddir].nr_prios = 0; + } + return 0; err: + for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) { + free(parse_res[ddir].entries); + free(values[ddir].prios); + } + free_clat_prio_stats(ts); fio_cmdprio_cleanup(cmdprio); return ret; @@ -157,40 +444,46 @@ static int fio_cmdprio_parse_and_gen(struct thread_data *td, struct cmdprio_options *options = cmdprio->options; int i, ret; + /* + * If cmdprio_percentage/cmdprio_bssplit is set and cmdprio_class + * is not set, default to RT priority class. + */ + for (i = 0; i < CMDPRIO_RWDIR_CNT; i++) { + /* + * A cmdprio value is only used when fio_cmdprio_percentage() + * returns non-zero, so it is safe to set a class even for a + * DDIR that will never use it. 
+ */ + if (!options->class[i]) + options->class[i] = IOPRIO_CLASS_RT; + } + switch (cmdprio->mode) { case CMDPRIO_MODE_BSSPLIT: ret = fio_cmdprio_parse_and_gen_bssplit(td, cmdprio); break; case CMDPRIO_MODE_PERC: - ret = 0; + ret = fio_cmdprio_gen_perc(td, cmdprio); break; default: assert(0); return 1; } - /* - * If cmdprio_percentage/cmdprio_bssplit is set and cmdprio_class - * is not set, default to RT priority class. - */ - for (i = 0; i < CMDPRIO_RWDIR_CNT; i++) { - if (options->percentage[i] || cmdprio->bssplit_nr[i]) { - if (!options->class[i]) - options->class[i] = IOPRIO_CLASS_RT; - } - } - return ret; } void fio_cmdprio_cleanup(struct cmdprio *cmdprio) { - int ddir; + enum fio_ddir ddir; + int i; for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) { - free(cmdprio->bssplit[ddir]); - cmdprio->bssplit[ddir] = NULL; - cmdprio->bssplit_nr[ddir] = 0; + for (i = 0; i < cmdprio->bsprio_desc[ddir].nr_bsprios; i++) + free(cmdprio->bsprio_desc[ddir].bsprios[i].prios); + free(cmdprio->bsprio_desc[ddir].bsprios); + cmdprio->bsprio_desc[ddir].bsprios = NULL; + cmdprio->bsprio_desc[ddir].nr_bsprios = 0; } /* diff --git a/engines/cmdprio.h b/engines/cmdprio.h index 0c7bd6cf4b..81e6c390f0 100644 --- a/engines/cmdprio.h +++ b/engines/cmdprio.h @@ -7,6 +7,7 @@ #define FIO_CMDPRIO_H #include "../fio.h" +#include "../optgroup.h" /* read and writes only, no trim */ #define CMDPRIO_RWDIR_CNT 2 @@ -17,17 +18,140 @@ enum { CMDPRIO_MODE_BSSPLIT, }; +struct cmdprio_prio { + int32_t prio; + uint32_t perc; + uint16_t clat_prio_index; +}; + +struct cmdprio_bsprio { + uint64_t bs; + uint32_t tot_perc; + unsigned int nr_prios; + struct cmdprio_prio *prios; +}; + +struct cmdprio_bsprio_desc { + struct cmdprio_bsprio *bsprios; + unsigned int nr_bsprios; +}; + struct cmdprio_options { unsigned int percentage[CMDPRIO_RWDIR_CNT]; unsigned int class[CMDPRIO_RWDIR_CNT]; unsigned int level[CMDPRIO_RWDIR_CNT]; + unsigned int hint[CMDPRIO_RWDIR_CNT]; char *bssplit_str; }; +#ifdef 
FIO_HAVE_IOPRIO_CLASS +#define CMDPRIO_OPTIONS(opt_struct, opt_group) \ + { \ + .name = "cmdprio_percentage", \ + .lname = "high priority percentage", \ + .type = FIO_OPT_INT, \ + .off1 = offsetof(opt_struct, \ + cmdprio_options.percentage[DDIR_READ]), \ + .off2 = offsetof(opt_struct, \ + cmdprio_options.percentage[DDIR_WRITE]), \ + .minval = 0, \ + .maxval = 100, \ + .help = "Send high priority I/O this percentage of the time", \ + .category = FIO_OPT_C_ENGINE, \ + .group = opt_group, \ + }, \ + { \ + .name = "cmdprio_class", \ + .lname = "Asynchronous I/O priority class", \ + .type = FIO_OPT_INT, \ + .off1 = offsetof(opt_struct, \ + cmdprio_options.class[DDIR_READ]), \ + .off2 = offsetof(opt_struct, \ + cmdprio_options.class[DDIR_WRITE]), \ + .help = "Set asynchronous IO priority class", \ + .minval = IOPRIO_MIN_PRIO_CLASS + 1, \ + .maxval = IOPRIO_MAX_PRIO_CLASS, \ + .interval = 1, \ + .category = FIO_OPT_C_ENGINE, \ + .group = opt_group, \ + }, \ + { \ + .name = "cmdprio_hint", \ + .lname = "Asynchronous I/O priority hint", \ + .type = FIO_OPT_INT, \ + .off1 = offsetof(opt_struct, \ + cmdprio_options.hint[DDIR_READ]), \ + .off2 = offsetof(opt_struct, \ + cmdprio_options.hint[DDIR_WRITE]), \ + .help = "Set asynchronous IO priority hint", \ + .minval = IOPRIO_MIN_PRIO_HINT, \ + .maxval = IOPRIO_MAX_PRIO_HINT, \ + .interval = 1, \ + .category = FIO_OPT_C_ENGINE, \ + .group = opt_group, \ + }, \ + { \ + .name = "cmdprio", \ + .lname = "Asynchronous I/O priority level", \ + .type = FIO_OPT_INT, \ + .off1 = offsetof(opt_struct, \ + cmdprio_options.level[DDIR_READ]), \ + .off2 = offsetof(opt_struct, \ + cmdprio_options.level[DDIR_WRITE]), \ + .help = "Set asynchronous IO priority level", \ + .minval = IOPRIO_MIN_PRIO, \ + .maxval = IOPRIO_MAX_PRIO, \ + .interval = 1, \ + .category = FIO_OPT_C_ENGINE, \ + .group = opt_group, \ + }, \ + { \ + .name = "cmdprio_bssplit", \ + .lname = "Priority percentage block size split", \ + .type = FIO_OPT_STR_STORE, \ + .off1 = 
offsetof(opt_struct, cmdprio_options.bssplit_str), \ + .help = "Set priority percentages for different block sizes", \ + .category = FIO_OPT_C_ENGINE, \ + .group = opt_group, \ + } +#else +#define CMDPRIO_OPTIONS(opt_struct, opt_group) \ + { \ + .name = "cmdprio_percentage", \ + .lname = "high priority percentage", \ + .type = FIO_OPT_UNSUPPORTED, \ + .help = "Platform does not support I/O priority classes", \ + }, \ + { \ + .name = "cmdprio_class", \ + .lname = "Asynchronous I/O priority class", \ + .type = FIO_OPT_UNSUPPORTED, \ + .help = "Platform does not support I/O priority classes", \ + }, \ + { \ + .name = "cmdprio_hint", \ + .lname = "Asynchronous I/O priority hint", \ + .type = FIO_OPT_UNSUPPORTED, \ + .help = "Platform does not support I/O priority classes", \ + }, \ + { \ + .name = "cmdprio", \ + .lname = "Asynchronous I/O priority level", \ + .type = FIO_OPT_UNSUPPORTED, \ + .help = "Platform does not support I/O priority classes", \ + }, \ + { \ + .name = "cmdprio_bssplit", \ + .lname = "Priority percentage block size split", \ + .type = FIO_OPT_UNSUPPORTED, \ + .help = "Platform does not support I/O priority classes", \ + } +#endif + struct cmdprio { struct cmdprio_options *options; - unsigned int bssplit_nr[CMDPRIO_RWDIR_CNT]; - struct bssplit *bssplit[CMDPRIO_RWDIR_CNT]; + struct cmdprio_prio perc_entry[CMDPRIO_RWDIR_CNT]; + struct cmdprio_bsprio_desc bsprio_desc[CMDPRIO_RWDIR_CNT]; unsigned int mode; }; diff --git a/engines/cpu.c b/engines/cpu.c index ce74dbcea8..898fc00e39 100644 --- a/engines/cpu.c +++ b/engines/cpu.c @@ -9,7 +9,7 @@ #include "../optgroup.h" // number of 32 bit integers to sort -size_t qsort_size = (256 * (1ULL << 10)); // 256KB +static size_t qsort_size = (256 * (1ULL << 10)); // 256KB struct mwc { uint32_t w; @@ -93,7 +93,7 @@ static struct fio_option options[] = { * fast pseudo random number generator, see * http://www.cse.yorku.ca/~oz/marsaglia-rng.html */ -uint32_t mwc32(struct mwc *mwc) +static uint32_t mwc32(struct mwc 
*mwc) { mwc->z = 36969 * (mwc->z & 65535) + (mwc->z >> 16); mwc->w = 18000 * (mwc->w & 65535) + (mwc->w >> 16); diff --git a/engines/dfs.c b/engines/dfs.c index 664e8b13c7..e4da85f879 100644 --- a/engines/dfs.c +++ b/engines/dfs.c @@ -16,7 +16,7 @@ static pthread_mutex_t daos_mutex = PTHREAD_MUTEX_INITIALIZER; daos_handle_t poh; /* pool handle */ daos_handle_t coh; /* container handle */ daos_oclass_id_t cid = OC_UNKNOWN; /* object class */ -dfs_t *dfs; /* dfs mount reference */ +dfs_t *daosfs; /* dfs mount reference */ struct daos_iou { struct io_u *io_u; @@ -184,7 +184,7 @@ static int daos_fio_global_init(struct thread_data *td) } /* Mount encapsulated filesystem */ - rc = dfs_mount(poh, coh, O_RDWR, &dfs); + rc = dfs_mount(poh, coh, O_RDWR, &daosfs); if (rc) { log_err("Failed to mount DFS namespace: %d\n", rc); td_verror(td, rc, "dfs_mount"); @@ -205,7 +205,7 @@ static int daos_fio_global_cleanup() int rc; int ret = 0; - rc = dfs_umount(dfs); + rc = dfs_umount(daosfs); if (rc) { log_err("failed to umount dfs: %d\n", rc); ret = rc; @@ -336,7 +336,7 @@ static int daos_fio_get_file_size(struct thread_data *td, struct fio_file *f) if (!daos_initialized) return 0; - rc = dfs_stat(dfs, NULL, file_name, &stbuf); + rc = dfs_stat(daosfs, NULL, file_name, &stbuf); if (rc) { log_err("Failed to stat %s: %d\n", f->file_name, rc); td_verror(td, rc, "dfs_stat"); @@ -387,7 +387,7 @@ static int daos_fio_open(struct thread_data *td, struct fio_file *f) flags |= O_RDONLY; } - rc = dfs_open(dfs, NULL, f->file_name, + rc = dfs_open(daosfs, NULL, f->file_name, S_IFREG | S_IRUSR | S_IWUSR, flags, cid, eo->chsz, NULL, &dd->obj); if (rc) { @@ -405,7 +405,7 @@ static int daos_fio_unlink(struct thread_data *td, struct fio_file *f) dprint(FD_FILE, "dfs remove %s\n", f->file_name); - rc = dfs_remove(dfs, NULL, f->file_name, false, NULL); + rc = dfs_remove(daosfs, NULL, f->file_name, false, NULL); if (rc) { log_err("Failed to remove %s: %d\n", f->file_name, rc); td_verror(td, rc, 
"dfs_remove"); @@ -523,7 +523,7 @@ static enum fio_q_status daos_fio_queue(struct thread_data *td, switch (io_u->ddir) { case DDIR_WRITE: - rc = dfs_write(dfs, dd->obj, &io->sgl, offset, &io->ev); + rc = dfs_write(daosfs, dd->obj, &io->sgl, offset, &io->ev); if (rc) { log_err("dfs_write failed: %d\n", rc); io_u->error = rc; @@ -531,7 +531,7 @@ static enum fio_q_status daos_fio_queue(struct thread_data *td, } break; case DDIR_READ: - rc = dfs_read(dfs, dd->obj, &io->sgl, offset, &io->size, + rc = dfs_read(daosfs, dd->obj, &io->sgl, offset, &io->size, &io->ev); if (rc) { log_err("dfs_read failed: %d\n", rc); diff --git a/engines/e4defrag.c b/engines/e4defrag.c index 0a0004d047..e3a15fac0c 100644 --- a/engines/e4defrag.c +++ b/engines/e4defrag.c @@ -1,5 +1,5 @@ /* - * ioe_e4defrag: ioengine for git://git.kernel.dk/fio.git + * ioe_e4defrag: ioengine for https://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio * * IO engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate * defragment activity @@ -11,6 +11,7 @@ #include #include #include +#include #include "../fio.h" #include "../optgroup.h" @@ -77,12 +78,11 @@ static int fio_e4defrag_init(struct thread_data *td) return 1; } - ed = malloc(sizeof(*ed)); + ed = calloc(1, sizeof(*ed)); if (!ed) { td_verror(td, ENOMEM, "io_queue_init"); return 1; } - memset(ed, 0 ,sizeof(*ed)); if (td->o.directory) len = sprintf(donor_name, "%s/", td->o.directory); diff --git a/engines/exec.c b/engines/exec.c index ab3639c502..af20d5e14c 100644 --- a/engines/exec.c +++ b/engines/exec.c @@ -62,13 +62,13 @@ static struct fio_option options[] = { }, }; -char *str_replace(char *orig, const char *rep, const char *with) +static char *str_replace(char *orig, const char *rep, const char *with) { /* * Replace a substring by another. 
* - * Returns the new string if occurences were found - * Returns orig if no occurence is found + * Returns the new string if occurrences were found + * Returns orig if no occurrence is found */ char *result, *insert, *tmp; int len_rep, len_with, len_front, count; @@ -106,7 +106,7 @@ char *str_replace(char *orig, const char *rep, const char *with) return result; } -char *expand_variables(struct thread_options *o, char *arguments) +static char *expand_variables(const struct thread_options *o, char *arguments) { char str[16]; char *expanded_runtime, *expanded_name; @@ -122,7 +122,7 @@ char *expand_variables(struct thread_options *o, char *arguments) return expanded_name; } -static int exec_background(struct thread_options *o, struct exec_options *eo) +static int exec_background(const struct thread_options *o, struct exec_options *eo) { char *outfilename = NULL, *errfilename = NULL; int outfd = 0, errfd = 0; diff --git a/engines/falloc.c b/engines/falloc.c index 4b05ed68fb..5bd5aa54cd 100644 --- a/engines/falloc.c +++ b/engines/falloc.c @@ -1,5 +1,5 @@ /* - * falloc: ioengine for git://git.kernel.dk/fio.git + * falloc: ioengine for https://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio * * IO engine that does regular fallocate to simulate data transfer * as fio ioengine. 
@@ -76,14 +76,18 @@ static enum fio_q_status fio_fallocate_queue(struct thread_data *td, fio_ro_check(td, io_u); - if (io_u->ddir == DDIR_READ) - flags = FALLOC_FL_KEEP_SIZE; - else if (io_u->ddir == DDIR_WRITE) - flags = 0; - else if (io_u->ddir == DDIR_TRIM) - flags = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; - - ret = fallocate(f->fd, flags, io_u->offset, io_u->xfer_buflen); + if (io_u->ddir != DDIR_SYNC) { + if (io_u->ddir == DDIR_READ) + flags = FALLOC_FL_KEEP_SIZE; + else if (io_u->ddir == DDIR_WRITE) + flags = 0; + else if (io_u->ddir == DDIR_TRIM) + flags = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; + + ret = fallocate(f->fd, flags, io_u->offset, io_u->xfer_buflen); + } else { + ret = do_io_u_sync(td, io_u); + } if (ret) io_u->error = errno; diff --git a/engines/filecreate.c b/engines/filecreate.c deleted file mode 100644 index 4bb13c348c..0000000000 --- a/engines/filecreate.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * filecreate engine - * - * IO engine that doesn't do any IO, just creates files and tracks the latency - * of the file creation. 
- */ -#include -#include -#include - -#include "../fio.h" - -struct fc_data { - enum fio_ddir stat_ddir; -}; - -static int open_file(struct thread_data *td, struct fio_file *f) -{ - struct timespec start; - int do_lat = !td->o.disable_lat; - - dprint(FD_FILE, "fd open %s\n", f->file_name); - - if (f->filetype != FIO_TYPE_FILE) { - log_err("fio: only files are supported\n"); - return 1; - } - if (!strcmp(f->file_name, "-")) { - log_err("fio: can't read/write to stdin/out\n"); - return 1; - } - - if (do_lat) - fio_gettime(&start, NULL); - - f->fd = open(f->file_name, O_CREAT|O_RDWR, 0600); - - if (f->fd == -1) { - char buf[FIO_VERROR_SIZE]; - int e = errno; - - snprintf(buf, sizeof(buf), "open(%s)", f->file_name); - td_verror(td, e, buf); - return 1; - } - - if (do_lat) { - struct fc_data *data = td->io_ops_data; - uint64_t nsec; - - nsec = ntime_since_now(&start); - add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, false); - } - - return 0; -} - -static enum fio_q_status queue_io(struct thread_data *td, - struct io_u fio_unused *io_u) -{ - return FIO_Q_COMPLETED; -} - -/* - * Ensure that we at least have a block size worth of IO to do for each - * file. If the job file has td->o.size < nr_files * block_size, then - * fio won't do anything. 
- */ -static int get_file_size(struct thread_data *td, struct fio_file *f) -{ - f->real_file_size = td_min_bs(td); - return 0; -} - -static int init(struct thread_data *td) -{ - struct fc_data *data; - - data = calloc(1, sizeof(*data)); - - if (td_read(td)) - data->stat_ddir = DDIR_READ; - else if (td_write(td)) - data->stat_ddir = DDIR_WRITE; - - td->io_ops_data = data; - return 0; -} - -static void cleanup(struct thread_data *td) -{ - struct fc_data *data = td->io_ops_data; - - free(data); -} - -static struct ioengine_ops ioengine = { - .name = "filecreate", - .version = FIO_IOOPS_VERSION, - .init = init, - .cleanup = cleanup, - .queue = queue_io, - .get_file_size = get_file_size, - .open_file = open_file, - .close_file = generic_close_file, - .flags = FIO_DISKLESSIO | FIO_SYNCIO | FIO_FAKEIO | - FIO_NOSTATS | FIO_NOFILEHASH, -}; - -static void fio_init fio_filecreate_register(void) -{ - register_ioengine(&ioengine); -} - -static void fio_exit fio_filecreate_unregister(void) -{ - unregister_ioengine(&ioengine); -} diff --git a/engines/filedelete.c b/engines/filedelete.c deleted file mode 100644 index e882ccf017..0000000000 --- a/engines/filedelete.c +++ /dev/null @@ -1,115 +0,0 @@ -/* - * file delete engine - * - * IO engine that doesn't do any IO, just delete files and track the latency - * of the file deletion. 
- */ -#include -#include -#include -#include -#include -#include "../fio.h" - -struct fc_data { - enum fio_ddir stat_ddir; -}; - -static int delete_file(struct thread_data *td, struct fio_file *f) -{ - struct timespec start; - int do_lat = !td->o.disable_lat; - int ret; - - dprint(FD_FILE, "fd delete %s\n", f->file_name); - - if (f->filetype != FIO_TYPE_FILE) { - log_err("fio: only files are supported\n"); - return 1; - } - if (!strcmp(f->file_name, "-")) { - log_err("fio: can't read/write to stdin/out\n"); - return 1; - } - - if (do_lat) - fio_gettime(&start, NULL); - - ret = unlink(f->file_name); - - if (ret == -1) { - char buf[FIO_VERROR_SIZE]; - int e = errno; - - snprintf(buf, sizeof(buf), "delete(%s)", f->file_name); - td_verror(td, e, buf); - return 1; - } - - if (do_lat) { - struct fc_data *data = td->io_ops_data; - uint64_t nsec; - - nsec = ntime_since_now(&start); - add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, false); - } - - return 0; -} - - -static enum fio_q_status queue_io(struct thread_data *td, struct io_u fio_unused *io_u) -{ - return FIO_Q_COMPLETED; -} - -static int init(struct thread_data *td) -{ - struct fc_data *data; - - data = calloc(1, sizeof(*data)); - - if (td_read(td)) - data->stat_ddir = DDIR_READ; - else if (td_write(td)) - data->stat_ddir = DDIR_WRITE; - - td->io_ops_data = data; - return 0; -} - -static int delete_invalidate(struct thread_data *td, struct fio_file *f) -{ - /* do nothing because file not opened */ - return 0; -} - -static void cleanup(struct thread_data *td) -{ - struct fc_data *data = td->io_ops_data; - - free(data); -} - -static struct ioengine_ops ioengine = { - .name = "filedelete", - .version = FIO_IOOPS_VERSION, - .init = init, - .invalidate = delete_invalidate, - .cleanup = cleanup, - .queue = queue_io, - .get_file_size = generic_get_file_size, - .open_file = delete_file, - .flags = FIO_SYNCIO | FIO_FAKEIO | - FIO_NOSTATS | FIO_NOFILEHASH, -}; - -static void fio_init fio_filedelete_register(void) -{ - 
register_ioengine(&ioengine); -} - -static void fio_exit fio_filedelete_unregister(void) -{ - unregister_ioengine(&ioengine); -} diff --git a/engines/fileoperations.c b/engines/fileoperations.c new file mode 100644 index 0000000000..ce3e7c39b1 --- /dev/null +++ b/engines/fileoperations.c @@ -0,0 +1,427 @@ +/* + * file/directory operations engine + * + * IO engine that doesn't do any IO, just operates files/directories + * and tracks the latency of the operation. + */ +#include +#include +#include +#include +#include +#include +#include +#include "../fio.h" +#include "../optgroup.h" +#include "../oslib/statx.h" + +enum fio_engine { + UNKNOWN_OP_ENGINE = 0, + FILE_OP_ENGINE = 1, + DIR_OP_ENGINE = 2, +}; + +struct fc_data { + enum fio_ddir stat_ddir; + enum fio_engine op_engine; +}; + +struct filestat_options { + void *pad; + unsigned int stat_type; +}; + +enum { + FIO_FILESTAT_STAT = 1, + FIO_FILESTAT_LSTAT = 2, + FIO_FILESTAT_STATX = 3, +}; + +static struct fio_option options[] = { + { + .name = "stat_type", + .lname = "stat_type", + .type = FIO_OPT_STR, + .off1 = offsetof(struct filestat_options, stat_type), + .help = "Specify stat system call type to measure lookup/getattr performance", + .def = "stat", + .posval = { + { .ival = "stat", + .oval = FIO_FILESTAT_STAT, + .help = "Use stat(2)", + }, + { .ival = "lstat", + .oval = FIO_FILESTAT_LSTAT, + .help = "Use lstat(2)", + }, + { .ival = "statx", + .oval = FIO_FILESTAT_STATX, + .help = "Use statx(2) if exists", + }, + }, + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_FILESTAT, + }, + { + .name = NULL, + }, +}; + +static int setup_dirs(struct thread_data *td) +{ + int ret = 0; + int i; + struct fio_file *f; + + for_each_file(td, f, i) { + dprint(FD_FILE, "setup directory %s\n", f->file_name); + ret = fio_mkdir(f->file_name, 0700); + if ((ret && errno != EEXIST)) { + log_err("create directory %s failed with %d\n", + f->file_name, errno); + break; + } + ret = 0; + } + return ret; +} + +static int 
open_file(struct thread_data *td, struct fio_file *f) +{ + struct timespec start; + int do_lat = !td->o.disable_lat; + struct fc_data *fcd = td->io_ops_data; + + dprint(FD_FILE, "fd open %s\n", f->file_name); + + if (f->filetype != FIO_TYPE_FILE) { + log_err("fio: only files are supported\n"); + return 1; + } + if (!strcmp(f->file_name, "-")) { + log_err("fio: can't read/write to stdin/out\n"); + return 1; + } + + if (do_lat) + fio_gettime(&start, NULL); + + if (fcd->op_engine == FILE_OP_ENGINE) + f->fd = open(f->file_name, O_CREAT|O_RDWR, 0600); + else if (fcd->op_engine == DIR_OP_ENGINE) + f->fd = fio_mkdir(f->file_name, S_IFDIR); + else { + log_err("fio: unknown file/directory operation engine\n"); + return 1; + } + + if (f->fd == -1) { + char buf[FIO_VERROR_SIZE]; + int e = errno; + + snprintf(buf, sizeof(buf), "open(%s)", f->file_name); + td_verror(td, e, buf); + return 1; + } + + if (do_lat) { + struct fc_data *data = td->io_ops_data; + uint64_t nsec; + + nsec = ntime_since_now(&start); + add_clat_sample(td, data->stat_ddir, nsec, 0, NULL); + } + + return 0; +} + +static int stat_file(struct thread_data *td, struct fio_file *f) +{ + struct filestat_options *o = td->eo; + struct timespec start; + int do_lat = !td->o.disable_lat; + struct stat statbuf; +#ifndef WIN32 + struct statx statxbuf; + char *abspath; +#endif + int ret; + + dprint(FD_FILE, "fd stat %s\n", f->file_name); + + if (f->filetype != FIO_TYPE_FILE) { + log_err("fio: only files are supported\n"); + return 1; + } + if (!strcmp(f->file_name, "-")) { + log_err("fio: can't read/write to stdin/out\n"); + return 1; + } + + if (do_lat) + fio_gettime(&start, NULL); + + switch (o->stat_type) { + case FIO_FILESTAT_STAT: + ret = stat(f->file_name, &statbuf); + break; + case FIO_FILESTAT_LSTAT: + ret = lstat(f->file_name, &statbuf); + break; + case FIO_FILESTAT_STATX: +#ifndef WIN32 + abspath = realpath(f->file_name, NULL); + if (abspath) { + ret = statx(-1, abspath, 0, STATX_ALL, &statxbuf); + 
free(abspath); + } else + ret = -1; +#else + ret = -1; +#endif + break; + default: + ret = -1; + break; + } + + if (ret == -1) { + char buf[FIO_VERROR_SIZE]; + int e = errno; + + snprintf(buf, sizeof(buf), "stat(%s) type=%u", f->file_name, + o->stat_type); + td_verror(td, e, buf); + return 1; + } + + if (do_lat) { + struct fc_data *data = td->io_ops_data; + uint64_t nsec; + + nsec = ntime_since_now(&start); + add_clat_sample(td, data->stat_ddir, nsec, 0, NULL); + } + + return 0; +} + +static int delete_file(struct thread_data *td, struct fio_file *f) +{ + struct timespec start; + int do_lat = !td->o.disable_lat; + struct fc_data *fcd = td->io_ops_data; + int ret; + + dprint(FD_FILE, "fd delete %s\n", f->file_name); + + if (f->filetype != FIO_TYPE_FILE) { + log_err("fio: only files are supported\n"); + return 1; + } + if (!strcmp(f->file_name, "-")) { + log_err("fio: can't read/write to stdin/out\n"); + return 1; + } + + if (do_lat) + fio_gettime(&start, NULL); + + if (fcd->op_engine == FILE_OP_ENGINE) + ret = unlink(f->file_name); + else if (fcd->op_engine == DIR_OP_ENGINE) + ret = rmdir(f->file_name); + else { + log_err("fio: unknown file/directory operation engine\n"); + return 1; + } + + if (ret == -1) { + char buf[FIO_VERROR_SIZE]; + int e = errno; + + snprintf(buf, sizeof(buf), "delete(%s)", f->file_name); + td_verror(td, e, buf); + return 1; + } + + if (do_lat) { + struct fc_data *data = td->io_ops_data; + uint64_t nsec; + + nsec = ntime_since_now(&start); + add_clat_sample(td, data->stat_ddir, nsec, 0, NULL); + } + + return 0; +} + +static int invalidate_do_nothing(struct thread_data *td, struct fio_file *f) +{ + /* do nothing because file not opened */ + return 0; +} + +static enum fio_q_status queue_io(struct thread_data *td, struct io_u *io_u) +{ + if (io_u->ddir == DDIR_SYNC && do_io_u_sync(td, io_u)) + io_u->error = errno; + return FIO_Q_COMPLETED; +} + +/* + * Ensure that we at least have a block size worth of IO to do for each + * file. 
If the job file has td->o.size < nr_files * block_size, then + * fio won't do anything. + */ +static int get_file_size(struct thread_data *td, struct fio_file *f) +{ + f->real_file_size = td_min_bs(td); + return 0; +} + +static int init(struct thread_data *td) +{ + struct fc_data *data; + + data = calloc(1, sizeof(*data)); + + if (td_read(td)) + data->stat_ddir = DDIR_READ; + else if (td_write(td)) + data->stat_ddir = DDIR_WRITE; + + data->op_engine = UNKNOWN_OP_ENGINE; + + if (!strncmp(td->o.ioengine, "file", 4)) { + data->op_engine = FILE_OP_ENGINE; + dprint(FD_FILE, "Operate engine type: file\n"); + } + if (!strncmp(td->o.ioengine, "dir", 3)) { + data->op_engine = DIR_OP_ENGINE; + dprint(FD_FILE, "Operate engine type: directory\n"); + } + + td->io_ops_data = data; + return 0; +} + +static void cleanup(struct thread_data *td) +{ + struct fc_data *data = td->io_ops_data; + + free(data); +} + +static int remove_dir(struct thread_data *td, struct fio_file *f) +{ + dprint(FD_FILE, "remove directory %s\n", f->file_name); + return rmdir(f->file_name); +} + +static struct ioengine_ops ioengine_filecreate = { + .name = "filecreate", + .version = FIO_IOOPS_VERSION, + .init = init, + .cleanup = cleanup, + .queue = queue_io, + .get_file_size = get_file_size, + .open_file = open_file, + .close_file = generic_close_file, + .flags = FIO_DISKLESSIO | FIO_SYNCIO | FIO_FAKEIO | + FIO_NOSTATS | FIO_NOFILEHASH, +}; + +static struct ioengine_ops ioengine_filestat = { + .name = "filestat", + .version = FIO_IOOPS_VERSION, + .init = init, + .cleanup = cleanup, + .queue = queue_io, + .invalidate = invalidate_do_nothing, + .get_file_size = generic_get_file_size, + .open_file = stat_file, + .flags = FIO_SYNCIO | FIO_FAKEIO | + FIO_NOSTATS | FIO_NOFILEHASH, + .options = options, + .option_struct_size = sizeof(struct filestat_options), +}; + +static struct ioengine_ops ioengine_filedelete = { + .name = "filedelete", + .version = FIO_IOOPS_VERSION, + .init = init, + .invalidate = 
invalidate_do_nothing, + .cleanup = cleanup, + .queue = queue_io, + .get_file_size = generic_get_file_size, + .open_file = delete_file, + .flags = FIO_SYNCIO | FIO_FAKEIO | + FIO_NOSTATS | FIO_NOFILEHASH, +}; + +static struct ioengine_ops ioengine_dircreate = { + .name = "dircreate", + .version = FIO_IOOPS_VERSION, + .init = init, + .cleanup = cleanup, + .queue = queue_io, + .get_file_size = get_file_size, + .open_file = open_file, + .close_file = generic_close_file, + .unlink_file = remove_dir, + .flags = FIO_DISKLESSIO | FIO_SYNCIO | FIO_FAKEIO | + FIO_NOSTATS | FIO_NOFILEHASH, +}; + +static struct ioengine_ops ioengine_dirstat = { + .name = "dirstat", + .version = FIO_IOOPS_VERSION, + .setup = setup_dirs, + .init = init, + .cleanup = cleanup, + .queue = queue_io, + .invalidate = invalidate_do_nothing, + .get_file_size = generic_get_file_size, + .open_file = stat_file, + .unlink_file = remove_dir, + .flags = FIO_DISKLESSIO | FIO_SYNCIO | FIO_FAKEIO | + FIO_NOSTATS | FIO_NOFILEHASH, + .options = options, + .option_struct_size = sizeof(struct filestat_options), +}; + +static struct ioengine_ops ioengine_dirdelete = { + .name = "dirdelete", + .version = FIO_IOOPS_VERSION, + .setup = setup_dirs, + .init = init, + .invalidate = invalidate_do_nothing, + .cleanup = cleanup, + .queue = queue_io, + .get_file_size = get_file_size, + .open_file = delete_file, + .unlink_file = remove_dir, + .flags = FIO_DISKLESSIO | FIO_SYNCIO | FIO_FAKEIO | + FIO_NOSTATS | FIO_NOFILEHASH, +}; + +static void fio_init fio_fileoperations_register(void) +{ + register_ioengine(&ioengine_filecreate); + register_ioengine(&ioengine_filestat); + register_ioengine(&ioengine_filedelete); + register_ioengine(&ioengine_dircreate); + register_ioengine(&ioengine_dirstat); + register_ioengine(&ioengine_dirdelete); +} + +static void fio_exit fio_fileoperations_unregister(void) +{ + unregister_ioengine(&ioengine_filecreate); + unregister_ioengine(&ioengine_filestat); + 
unregister_ioengine(&ioengine_filedelete); + unregister_ioengine(&ioengine_dircreate); + unregister_ioengine(&ioengine_dirstat); + unregister_ioengine(&ioengine_dirdelete); +} diff --git a/engines/filestat.c b/engines/filestat.c deleted file mode 100644 index 003112474b..0000000000 --- a/engines/filestat.c +++ /dev/null @@ -1,190 +0,0 @@ -/* - * filestat engine - * - * IO engine that doesn't do any IO, just stat files and tracks the latency - * of the file stat. - */ -#include -#include -#include -#include -#include -#include -#include -#include "../fio.h" -#include "../optgroup.h" -#include "../oslib/statx.h" - -struct fc_data { - enum fio_ddir stat_ddir; -}; - -struct filestat_options { - void *pad; - unsigned int stat_type; -}; - -enum { - FIO_FILESTAT_STAT = 1, - FIO_FILESTAT_LSTAT = 2, - FIO_FILESTAT_STATX = 3, -}; - -static struct fio_option options[] = { - { - .name = "stat_type", - .lname = "stat_type", - .type = FIO_OPT_STR, - .off1 = offsetof(struct filestat_options, stat_type), - .help = "Specify stat system call type to measure lookup/getattr performance", - .def = "stat", - .posval = { - { .ival = "stat", - .oval = FIO_FILESTAT_STAT, - .help = "Use stat(2)", - }, - { .ival = "lstat", - .oval = FIO_FILESTAT_LSTAT, - .help = "Use lstat(2)", - }, - { .ival = "statx", - .oval = FIO_FILESTAT_STATX, - .help = "Use statx(2) if exists", - }, - }, - .category = FIO_OPT_C_ENGINE, - .group = FIO_OPT_G_FILESTAT, - }, - { - .name = NULL, - }, -}; - -static int stat_file(struct thread_data *td, struct fio_file *f) -{ - struct filestat_options *o = td->eo; - struct timespec start; - int do_lat = !td->o.disable_lat; - struct stat statbuf; -#ifndef WIN32 - struct statx statxbuf; - char *abspath; -#endif - int ret; - - dprint(FD_FILE, "fd stat %s\n", f->file_name); - - if (f->filetype != FIO_TYPE_FILE) { - log_err("fio: only files are supported\n"); - return 1; - } - if (!strcmp(f->file_name, "-")) { - log_err("fio: can't read/write to stdin/out\n"); - return 1; - } - - 
if (do_lat) - fio_gettime(&start, NULL); - - switch (o->stat_type){ - case FIO_FILESTAT_STAT: - ret = stat(f->file_name, &statbuf); - break; - case FIO_FILESTAT_LSTAT: - ret = lstat(f->file_name, &statbuf); - break; - case FIO_FILESTAT_STATX: -#ifndef WIN32 - abspath = realpath(f->file_name, NULL); - if (abspath) { - ret = statx(-1, abspath, 0, STATX_ALL, &statxbuf); - free(abspath); - } else - ret = -1; -#else - ret = -1; -#endif - break; - default: - ret = -1; - break; - } - - if (ret == -1) { - char buf[FIO_VERROR_SIZE]; - int e = errno; - - snprintf(buf, sizeof(buf), "stat(%s) type=%u", f->file_name, - o->stat_type); - td_verror(td, e, buf); - return 1; - } - - if (do_lat) { - struct fc_data *data = td->io_ops_data; - uint64_t nsec; - - nsec = ntime_since_now(&start); - add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, false); - } - - return 0; -} - -static enum fio_q_status queue_io(struct thread_data *td, struct io_u fio_unused *io_u) -{ - return FIO_Q_COMPLETED; -} - -static int init(struct thread_data *td) -{ - struct fc_data *data; - - data = calloc(1, sizeof(*data)); - - if (td_read(td)) - data->stat_ddir = DDIR_READ; - else if (td_write(td)) - data->stat_ddir = DDIR_WRITE; - - td->io_ops_data = data; - return 0; -} - -static void cleanup(struct thread_data *td) -{ - struct fc_data *data = td->io_ops_data; - - free(data); -} - -static int stat_invalidate(struct thread_data *td, struct fio_file *f) -{ - /* do nothing because file not opened */ - return 0; -} - -static struct ioengine_ops ioengine = { - .name = "filestat", - .version = FIO_IOOPS_VERSION, - .init = init, - .cleanup = cleanup, - .queue = queue_io, - .invalidate = stat_invalidate, - .get_file_size = generic_get_file_size, - .open_file = stat_file, - .flags = FIO_SYNCIO | FIO_FAKEIO | - FIO_NOSTATS | FIO_NOFILEHASH, - .options = options, - .option_struct_size = sizeof(struct filestat_options), -}; - -static void fio_init fio_filestat_register(void) -{ - register_ioengine(&ioengine); -} - 
-static void fio_exit fio_filestat_unregister(void) -{ - unregister_ioengine(&ioengine); -} diff --git a/engines/ftruncate.c b/engines/ftruncate.c index c7ad038c0a..70211e0705 100644 --- a/engines/ftruncate.c +++ b/engines/ftruncate.c @@ -1,5 +1,5 @@ /* - * ftruncate: ioengine for git://git.kernel.dk/fio.git + * ftruncate: ioengine for https://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio * * IO engine that does regular truncates to simulate data transfer * as fio ioengine. @@ -15,16 +15,17 @@ static enum fio_q_status fio_ftruncate_queue(struct thread_data *td, struct io_u *io_u) { struct fio_file *f = io_u->file; - int ret; + int ret = 0; fio_ro_check(td, io_u); - if (io_u->ddir != DDIR_WRITE) { + if (io_u->ddir == DDIR_WRITE) + ret = ftruncate(f->fd, io_u->offset); + else if (io_u->ddir == DDIR_SYNC) + ret = do_io_u_sync(td, io_u); + else io_u->error = EINVAL; - return FIO_Q_COMPLETED; - } - ret = ftruncate(f->fd, io_u->offset); if (ret) io_u->error = errno; diff --git a/engines/http.c b/engines/http.c index 35c44871da..83891f1fc5 100644 --- a/engines/http.c +++ b/engines/http.c @@ -29,15 +29,22 @@ #include "fio.h" #include "../optgroup.h" +/* + * Silence OpenSSL 3.0 deprecated function warnings + */ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" enum { - FIO_HTTP_WEBDAV = 0, - FIO_HTTP_S3 = 1, - FIO_HTTP_SWIFT = 2, + FIO_HTTP_WEBDAV = 0, + FIO_HTTP_S3 = 1, + FIO_HTTP_SWIFT = 2, + + FIO_HTTPS_OFF = 0, + FIO_HTTPS_ON = 1, + FIO_HTTPS_INSECURE = 2, - FIO_HTTPS_OFF = 0, - FIO_HTTPS_ON = 1, - FIO_HTTPS_INSECURE = 2, + FIO_HTTP_OBJECT_BLOCK = 0, + FIO_HTTP_OBJECT_RANGE = 1, }; struct http_data { @@ -52,10 +59,15 @@ struct http_options { char *pass; char *s3_key; char *s3_keyid; + char *s3_security_token; char *s3_region; + char *s3_sse_customer_key; + char *s3_sse_customer_algorithm; + char *s3_storage_class; char *swift_auth_token; int verbose; unsigned int mode; + unsigned int object_mode; }; struct http_curl_stream { @@ -137,6 +149,16 @@ static 
struct fio_option options[] = { .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_HTTP, }, + { + .name = "http_s3_security_token", + .lname = "S3 security token", + .type = FIO_OPT_STR_STORE, + .help = "S3 security token", + .off1 = offsetof(struct http_options, s3_security_token), + .def = "", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HTTP, + }, { .name = "http_swift_auth_token", .lname = "Swift auth token", @@ -157,6 +179,36 @@ static struct fio_option options[] = { .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_HTTP, }, + { + .name = "http_s3_sse_customer_key", + .lname = "SSE Customer Key", + .type = FIO_OPT_STR_STORE, + .help = "S3 SSE Customer Key", + .off1 = offsetof(struct http_options, s3_sse_customer_key), + .def = "", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HTTP, + }, + { + .name = "http_s3_sse_customer_algorithm", + .lname = "SSE Customer Algorithm", + .type = FIO_OPT_STR_STORE, + .help = "S3 SSE Customer Algorithm", + .off1 = offsetof(struct http_options, s3_sse_customer_algorithm), + .def = "AES256", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HTTP, + }, + { + .name = "http_s3_storage_class", + .lname = "S3 Storage class", + .type = FIO_OPT_STR_STORE, + .help = "S3 Storage Class", + .off1 = offsetof(struct http_options, s3_storage_class), + .def = "STANDARD", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HTTP, + }, { .name = "http_mode", .lname = "Request mode to use", @@ -191,6 +243,26 @@ static struct fio_option options[] = { .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_HTTP, }, + { + .name = "http_object_mode", + .lname = "Object mode to use", + .type = FIO_OPT_STR, + .help = "How to structure objects when issuing HTTP requests", + .off1 = offsetof(struct http_options, object_mode), + .def = "block", + .posval = { + { .ival = "block", + .oval = FIO_HTTP_OBJECT_BLOCK, + .help = "One object per block", + }, + { .ival = "range", + .oval = FIO_HTTP_OBJECT_RANGE, + .help = "One object per file, range reads 
per block", + }, + }, + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_HTTP, + }, { .name = NULL, }, @@ -213,6 +285,7 @@ static char *_aws_uriencode(const char *uri) for (i = 0; (c = uri[i]); i++) { if (n > bufsize-5) { log_err("encoding the URL failed\n"); + free(r); return NULL; } @@ -262,6 +335,54 @@ static char *_gen_hex_md5(const char *p, size_t len) return _conv_hex(hash, MD5_DIGEST_LENGTH); } +static char *_conv_base64_encode(const unsigned char *p, size_t len) +{ + char *r, *ret; + int i; + static const char sEncodingTable[] = { + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', + 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', + 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', + 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', + 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', + 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', + 'w', 'x', 'y', 'z', '0', '1', '2', '3', + '4', '5', '6', '7', '8', '9', '+', '/' + }; + + size_t out_len = 4 * ((len + 2) / 3); + ret = r = malloc(out_len + 1); + + for (i = 0; i < len - 2; i += 3) { + *r++ = sEncodingTable[(p[i] >> 2) & 0x3F]; + *r++ = sEncodingTable[((p[i] & 0x3) << 4) | ((int) (p[i + 1] & 0xF0) >> 4)]; + *r++ = sEncodingTable[((p[i + 1] & 0xF) << 2) | ((int) (p[i + 2] & 0xC0) >> 6)]; + *r++ = sEncodingTable[p[i + 2] & 0x3F]; + } + + if (i < len) { + *r++ = sEncodingTable[(p[i] >> 2) & 0x3F]; + if (i == (len - 1)) { + *r++ = sEncodingTable[((p[i] & 0x3) << 4)]; + *r++ = '='; + } else { + *r++ = sEncodingTable[((p[i] & 0x3) << 4) | ((int) (p[i + 1] & 0xF0) >> 4)]; + *r++ = sEncodingTable[((p[i + 1] & 0xF) << 2)]; + } + *r++ = '='; + } + + ret[out_len]=0; + return ret; +} + +static char *_gen_base64_md5(const unsigned char *p, size_t len) +{ + unsigned char hash[MD5_DIGEST_LENGTH]; + MD5((unsigned char*)p, len, hash); + return _conv_base64_encode(hash, MD5_DIGEST_LENGTH); +} + static void _hmac(unsigned char *md, void *key, int key_len, char *data) { #ifndef CONFIG_HAVE_OPAQUE_HMAC_CTX HMAC_CTX _ctx; @@ -297,7 +418,7 @@ static int _curl_trace(CURL *handle, 
curl_infotype type, switch (type) { case CURLINFO_TEXT: fprintf(stderr, "== Info: %s", data); - fallthrough; + fio_fallthrough; default: case CURLINFO_SSL_DATA_OUT: case CURLINFO_SSL_DATA_IN: @@ -331,9 +452,9 @@ static void _add_aws_auth_header(CURL *curl, struct curl_slist *slist, struct ht char date_iso[32]; char method[8]; char dkey[128]; - char creq[512]; - char sts[256]; - char s[512]; + char creq[4096]; + char sts[512]; + char s[2048]; char *uri_encoded = NULL; char *dsha = NULL; char *csha = NULL; @@ -341,6 +462,11 @@ static void _add_aws_auth_header(CURL *curl, struct curl_slist *slist, struct ht const char *service = "s3"; const char *aws = "aws4_request"; unsigned char md[SHA256_DIGEST_LENGTH]; + unsigned char sse_key[33] = {0}; + char *sse_key_base64 = NULL; + char *sse_key_md5_base64 = NULL; + char security_token_header[2048] = {0}; + char security_token_list_item[24] = {0}; time_t t = time(NULL); struct tm *gtm = gmtime(&t); @@ -349,6 +475,15 @@ static void _add_aws_auth_header(CURL *curl, struct curl_slist *slist, struct ht strftime (date_iso, sizeof(date_iso), "%Y%m%dT%H%M%SZ", gtm); uri_encoded = _aws_uriencode(uri); + if (o->s3_security_token != NULL) { + snprintf(security_token_header, sizeof(security_token_header), + "x-amz-security-token:%s\n", o->s3_security_token); + sprintf(security_token_list_item, "x-amz-security-token;"); + } + + if (o->s3_sse_customer_key != NULL) + strncpy((char*)sse_key, o->s3_sse_customer_key, sizeof(sse_key) - 1); + if (op == DDIR_WRITE) { dsha = _gen_hex_sha256(buf, len); sprintf(method, "PUT"); @@ -362,22 +497,56 @@ static void _add_aws_auth_header(CURL *curl, struct curl_slist *slist, struct ht } /* Create the canonical request first */ - snprintf(creq, sizeof(creq), - "%s\n" - "%s\n" - "\n" - "host:%s\n" - "x-amz-content-sha256:%s\n" - "x-amz-date:%s\n" - "\n" - "host;x-amz-content-sha256;x-amz-date\n" - "%s" - , method - , uri_encoded, o->host, dsha, date_iso, dsha); + if (sse_key[0] != '\0') { + sse_key_base64 = 
_conv_base64_encode(sse_key, sizeof(sse_key) - 1); + sse_key_md5_base64 = _gen_base64_md5(sse_key, sizeof(sse_key) - 1); + snprintf(creq, sizeof(creq), + "%s\n" + "%s\n" + "\n" + "host:%s\n" + "x-amz-content-sha256:%s\n" + "x-amz-date:%s\n" + "x-amz-server-side-encryption-customer-algorithm:%s\n" + "x-amz-server-side-encryption-customer-key:%s\n" + "x-amz-server-side-encryption-customer-key-md5:%s\n" + "%s" /* security token if provided */ + "x-amz-storage-class:%s\n" + "\n" + "host;x-amz-content-sha256;x-amz-date;" + "x-amz-server-side-encryption-customer-algorithm;" + "x-amz-server-side-encryption-customer-key;" + "x-amz-server-side-encryption-customer-key-md5;" + "%s" + "x-amz-storage-class\n" + "%s" + , method + , uri_encoded, o->host, dsha, date_iso + , o->s3_sse_customer_algorithm, sse_key_base64 + , sse_key_md5_base64, security_token_header + , o->s3_storage_class, security_token_list_item, dsha); + } else { + snprintf(creq, sizeof(creq), + "%s\n" + "%s\n" + "\n" + "host:%s\n" + "x-amz-content-sha256:%s\n" + "x-amz-date:%s\n" + "%s" /* security token if provided */ + "x-amz-storage-class:%s\n" + "\n" + "host;x-amz-content-sha256;x-amz-date;%sx-amz-storage-class\n" + "%s" + , method + , uri_encoded, o->host, dsha, date_iso + , security_token_header, o->s3_storage_class + , security_token_list_item, dsha); + } csha = _gen_hex_sha256(creq, strlen(creq)); snprintf(sts, sizeof(sts), "AWS4-HMAC-SHA256\n%s\n%s/%s/%s/%s\n%s", - date_iso, date_short, o->s3_region, service, aws, csha); + date_iso, date_short, o->s3_region, service, aws, csha); snprintf((char *)dkey, sizeof(dkey), "AWS4%s", o->s3_key); _hmac(md, dkey, strlen(dkey), date_short); @@ -388,7 +557,7 @@ static void _add_aws_auth_header(CURL *curl, struct curl_slist *slist, struct ht signature = _conv_hex(md, SHA256_DIGEST_LENGTH); - /* Surpress automatic Accept: header */ + /* Suppress automatic Accept: header */ slist = curl_slist_append(slist, "Accept:"); snprintf(s, sizeof(s), "x-amz-content-sha256: %s", 
dsha); @@ -397,9 +566,38 @@ static void _add_aws_auth_header(CURL *curl, struct curl_slist *slist, struct ht snprintf(s, sizeof(s), "x-amz-date: %s", date_iso); slist = curl_slist_append(slist, s); - snprintf(s, sizeof(s), "Authorization: AWS4-HMAC-SHA256 Credential=%s/%s/%s/s3/aws4_request," - "SignedHeaders=host;x-amz-content-sha256;x-amz-date,Signature=%s", - o->s3_keyid, date_short, o->s3_region, signature); + if (sse_key[0] != '\0') { + snprintf(s, sizeof(s), "x-amz-server-side-encryption-customer-algorithm: %s", o->s3_sse_customer_algorithm); + slist = curl_slist_append(slist, s); + snprintf(s, sizeof(s), "x-amz-server-side-encryption-customer-key: %s", sse_key_base64); + slist = curl_slist_append(slist, s); + snprintf(s, sizeof(s), "x-amz-server-side-encryption-customer-key-md5: %s", sse_key_md5_base64); + slist = curl_slist_append(slist, s); + } + + if (o->s3_security_token != NULL) { + snprintf(s, sizeof(s), "x-amz-security-token: %s", o->s3_security_token); + slist = curl_slist_append(slist, s); + } + + snprintf(s, sizeof(s), "x-amz-storage-class: %s", o->s3_storage_class); + slist = curl_slist_append(slist, s); + + if (sse_key[0] != '\0') { + snprintf(s, sizeof(s), "Authorization: AWS4-HMAC-SHA256 Credential=%s/%s/%s/s3/aws4_request," + "SignedHeaders=host;x-amz-content-sha256;" + "x-amz-date;x-amz-server-side-encryption-customer-algorithm;" + "x-amz-server-side-encryption-customer-key;" + "x-amz-server-side-encryption-customer-key-md5;" + "%s" + "x-amz-storage-class," + "Signature=%s", + o->s3_keyid, date_short, o->s3_region, security_token_list_item, signature); + } else { + snprintf(s, sizeof(s), "Authorization: AWS4-HMAC-SHA256 Credential=%s/%s/%s/s3/aws4_request," + "SignedHeaders=host;x-amz-content-sha256;x-amz-date;%sx-amz-storage-class,Signature=%s", + o->s3_keyid, date_short, o->s3_region, security_token_list_item, signature); + } slist = curl_slist_append(slist, s); curl_easy_setopt(curl, CURLOPT_HTTPHEADER, slist); @@ -408,6 +606,10 @@ static 
void _add_aws_auth_header(CURL *curl, struct curl_slist *slist, struct ht free(csha); free(dsha); free(signature); + if (sse_key_base64 != NULL) { + free(sse_key_base64); + free(sse_key_md5_base64); + } } static void _add_swift_header(CURL *curl, struct curl_slist *slist, struct http_options *o, @@ -419,7 +621,7 @@ static void _add_swift_header(CURL *curl, struct curl_slist *slist, struct http_ if (op == DDIR_WRITE) { dsha = _gen_hex_md5(buf, len); } - /* Surpress automatic Accept: header */ + /* Suppress automatic Accept: header */ slist = curl_slist_append(slist, "Accept:"); snprintf(s, sizeof(s), "etag: %s", dsha); @@ -433,6 +635,26 @@ static void _add_swift_header(CURL *curl, struct curl_slist *slist, struct http_ free(dsha); } +static struct curl_slist* _append_range_header(struct curl_slist *slist, unsigned long long offset, unsigned long long length, unsigned long long file_size) +{ + char s[256]; + unsigned long long end_byte; + + /* Don't request beyond end of file */ + if (offset >= file_size) { + return slist; + } + + /* Calculate end byte, but cap it at file size - 1 because end range is inclusive */ + end_byte = offset + length - 1; + if (end_byte >= file_size) { + end_byte = file_size - 1; + } + + snprintf(s, sizeof(s), "Range: bytes=%llu-%llu", offset, end_byte); + return curl_slist_append(slist, s); +} + static void fio_http_cleanup(struct thread_data *td) { struct http_data *http = td->io_ops_data; @@ -489,31 +711,39 @@ static enum fio_q_status fio_http_queue(struct thread_data *td, struct http_options *o = td->eo; struct http_curl_stream _curl_stream; struct curl_slist *slist = NULL; - char object[512]; + char object_path_buf[512]; + char *object_path; char url[1024]; long status; CURLcode res; - int r = -1; fio_ro_check(td, io_u); memset(&_curl_stream, 0, sizeof(_curl_stream)); - snprintf(object, sizeof(object), "%s_%llu_%llu", td->files[0]->file_name, - io_u->offset, io_u->xfer_buflen); + if (o->object_mode == FIO_HTTP_OBJECT_BLOCK) { + 
snprintf(object_path_buf, sizeof(object_path_buf), "%s_%llu_%llu", io_u->file->file_name, + io_u->offset, io_u->xfer_buflen); + object_path = object_path_buf; + } else + object_path = io_u->file->file_name; if (o->https == FIO_HTTPS_OFF) - snprintf(url, sizeof(url), "http://%s%s", o->host, object); + snprintf(url, sizeof(url), "http://%s%s", o->host, object_path); else - snprintf(url, sizeof(url), "https://%s%s", o->host, object); + snprintf(url, sizeof(url), "https://%s%s", o->host, object_path); + curl_easy_setopt(http->curl, CURLOPT_URL, url); _curl_stream.buf = io_u->xfer_buf; _curl_stream.max = io_u->xfer_buflen; curl_easy_setopt(http->curl, CURLOPT_SEEKDATA, &_curl_stream); curl_easy_setopt(http->curl, CURLOPT_INFILESIZE_LARGE, (curl_off_t)io_u->xfer_buflen); + if (io_u->ddir == DDIR_READ && o->object_mode == FIO_HTTP_OBJECT_RANGE) + slist = _append_range_header(slist, io_u->offset, io_u->xfer_buflen, io_u->file->real_file_size); + if (o->mode == FIO_HTTP_S3) - _add_aws_auth_header(http->curl, slist, o, io_u->ddir, object, + _add_aws_auth_header(http->curl, slist, o, io_u->ddir, object_path, io_u->xfer_buf, io_u->xfer_buflen); else if (o->mode == FIO_HTTP_SWIFT) - _add_swift_header(http->curl, slist, o, io_u->ddir, object, + _add_swift_header(http->curl, slist, o, io_u->ddir, object_path, io_u->xfer_buf, io_u->xfer_buflen); if (io_u->ddir == DDIR_WRITE) { @@ -526,8 +756,8 @@ static enum fio_q_status fio_http_queue(struct thread_data *td, if (status == 100 || (status >= 200 && status <= 204)) goto out; log_err("DDIR_WRITE failed with HTTP status code %ld\n", status); - goto err; } + goto err; } else if (io_u->ddir == DDIR_READ) { curl_easy_setopt(http->curl, CURLOPT_READDATA, NULL); curl_easy_setopt(http->curl, CURLOPT_WRITEDATA, &_curl_stream); @@ -535,7 +765,9 @@ static enum fio_q_status fio_http_queue(struct thread_data *td, res = curl_easy_perform(http->curl); if (res == CURLE_OK) { curl_easy_getinfo(http->curl, CURLINFO_RESPONSE_CODE, &status); - if 
(status == 200) + /* 206 "Partial Content" means success when using the + * Range header */ + if (status == 200 || (o->object_mode == FIO_HTTP_OBJECT_RANGE && status == 206)) goto out; else if (status == 404) { /* Object doesn't exist. Pretend we read @@ -565,7 +797,7 @@ static enum fio_q_status fio_http_queue(struct thread_data *td, log_err("WARNING: Only DDIR_READ/DDIR_WRITE/DDIR_TRIM are supported!\n"); err: - io_u->error = r; + io_u->error = EIO; td_verror(td, io_u->error, "transfer"); out: curl_slist_free_all(slist); @@ -578,7 +810,7 @@ static struct io_u *fio_http_event(struct thread_data *td, int event) return NULL; } -int fio_http_getevents(struct thread_data *td, unsigned int min, +static int fio_http_getevents(struct thread_data *td, unsigned int min, unsigned int max, const struct timespec *t) { /* sync IO engine - never any outstanding events */ diff --git a/engines/ime.c b/engines/ime.c index 440cc29e8e..037b8419e2 100644 --- a/engines/ime.c +++ b/engines/ime.c @@ -83,7 +83,7 @@ struct ime_data { }; struct iovec *iovecs; /* array of queued iovecs */ struct io_u **io_us; /* array of queued io_u pointers */ - struct io_u **event_io_us; /* array of the events retieved afer get_events*/ + struct io_u **event_io_us; /* array of the events retrieved after get_events*/ unsigned int queued; /* iovecs/io_us in the queue */ unsigned int events; /* number of committed iovecs/io_us */ @@ -188,10 +188,6 @@ static int fio_ime_open_file(struct thread_data *td, struct fio_file *f) return 1; } - if (td->o.oatomic) { - td_verror(td, EINVAL, "IME does not support atomic IO"); - return 1; - } if (td->o.odirect) flags |= O_DIRECT; flags |= td->o.sync_io; diff --git a/engines/io_uring.c b/engines/io_uring.c index 00ae34823f..0ea3aba12e 100644 --- a/engines/io_uring.c +++ b/engines/io_uring.c @@ -1,9 +1,7 @@ /* * io_uring engine * - * IO engine using the new native Linux aio io_uring interface. 
See: - * - * http://git.kernel.dk/cgit/linux-block/log/?h=io_uring + * IO engine using the new native Linux aio io_uring interface. * */ #include @@ -18,12 +16,98 @@ #include "../lib/memalign.h" #include "../lib/fls.h" #include "../lib/roundup.h" +#include "../verify.h" #ifdef ARCH_HAVE_IOURING #include "../lib/types.h" #include "../os/linux/io_uring.h" #include "cmdprio.h" +#include "zbd.h" +#include "nvme.h" + +#include + +#ifndef IO_INTEGRITY_CHK_GUARD +/* flags for integrity meta */ +#define IO_INTEGRITY_CHK_GUARD (1U << 0) /* enforce guard check */ +#define IO_INTEGRITY_CHK_REFTAG (1U << 1) /* enforce ref check */ +#define IO_INTEGRITY_CHK_APPTAG (1U << 2) /* enforce app check */ +#endif /* IO_INTEGRITY_CHK_GUARD */ + +#ifndef FS_IOC_GETLBMD_CAP +/* Protection info capability flags */ +#define LBMD_PI_CAP_INTEGRITY (1 << 0) +#define LBMD_PI_CAP_REFTAG (1 << 1) + +/* Checksum types for Protection Information */ +#define LBMD_PI_CSUM_NONE 0 +#define LBMD_PI_CSUM_IP 1 +#define LBMD_PI_CSUM_CRC16_T10DIF 2 +#define LBMD_PI_CSUM_CRC64_NVME 4 + +/* + * Logical block metadata capability descriptor + * If the device does not support metadata, all the fields will be zero. + * Applications must check lbmd_flags to determine whether metadata is + * supported or not. 
+ */ +struct logical_block_metadata_cap { + /* Bitmask of logical block metadata capability flags */ + __u32 lbmd_flags; + /* + * The amount of data described by each unit of logical block + * metadata + */ + __u16 lbmd_interval; + /* + * Size in bytes of the logical block metadata associated with each + * interval + */ + __u8 lbmd_size; + /* + * Size in bytes of the opaque block tag associated with each + * interval + */ + __u8 lbmd_opaque_size; + /* + * Offset in bytes of the opaque block tag within the logical block + * metadata + */ + __u8 lbmd_opaque_offset; + /* Size in bytes of the T10 PI tuple associated with each interval */ + __u8 lbmd_pi_size; + /* Offset in bytes of T10 PI tuple within the logical block metadata */ + __u8 lbmd_pi_offset; + /* T10 PI guard tag type */ + __u8 lbmd_guard_tag_type; + /* Size in bytes of the T10 PI application tag */ + __u8 lbmd_app_tag_size; + /* Size in bytes of the T10 PI reference tag */ + __u8 lbmd_ref_tag_size; + /* Size in bytes of the T10 PI storage tag */ + __u8 lbmd_storage_tag_size; + __u8 pad; +}; + +#define FS_IOC_GETLBMD_CAP _IOWR(0x15, 2, struct logical_block_metadata_cap) +#endif /* FS_IOC_GETLBMD_CAP */ + +enum uring_cmd_type { + FIO_URING_CMD_NVME = 1, +}; + +enum uring_cmd_write_mode { + FIO_URING_CMD_WMODE_WRITE = 1, + FIO_URING_CMD_WMODE_UNCOR, + FIO_URING_CMD_WMODE_ZEROES, + FIO_URING_CMD_WMODE_VERIFY, +}; + +enum uring_cmd_verify_mode { + FIO_URING_CMD_VMODE_READ = 1, + FIO_URING_CMD_VMODE_COMPARE, +}; struct io_sq_ring { unsigned *head; @@ -51,6 +135,8 @@ struct ioring_data { int ring_fd; struct io_u **io_u_index; + char *md_buf; + char *pi_attr; int *fds; @@ -62,6 +148,7 @@ struct ioring_data { struct io_cq_ring cq_ring; unsigned cq_ring_mask; + int async_trim_fail; int queued; int cq_ring_off; unsigned iodepth; @@ -70,11 +157,24 @@ struct ioring_data { struct ioring_mmap mmap[3]; struct cmdprio cmdprio; + + struct nvme_dsm *dsm; + uint32_t cdw12_flags[DDIR_RWDIR_CNT]; + uint8_t write_opcode; + + 
bool is_uring_cmd_eng; + + struct nvme_cmd_ext_io_opts ext_opts; }; struct ioring_options { struct thread_data *td; unsigned int hipri; + unsigned int readfua; + unsigned int writefua; + unsigned int deac; + unsigned int write_mode; + unsigned int verify_mode; struct cmdprio_options cmdprio_options; unsigned int fixedbufs; unsigned int registerfiles; @@ -85,8 +185,17 @@ struct ioring_options { unsigned int uncached; unsigned int nowait; unsigned int force_async; + unsigned int md_per_io_size; + unsigned int pi_act; + unsigned int apptag; + unsigned int apptag_mask; + unsigned int prchk; + char *pi_chk; + enum uring_cmd_type cmd_type; }; +static unsigned int enter_flags = IORING_ENTER_GETEVENTS; + static const int ddir_to_op[2][2] = { { IORING_OP_READV, IORING_OP_READ }, { IORING_OP_WRITEV, IORING_OP_WRITE } @@ -116,87 +225,74 @@ static struct fio_option options[] = { .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_IOURING, }, -#ifdef FIO_HAVE_IOPRIO_CLASS { - .name = "cmdprio_percentage", - .lname = "high priority percentage", - .type = FIO_OPT_INT, - .off1 = offsetof(struct ioring_options, - cmdprio_options.percentage[DDIR_READ]), - .off2 = offsetof(struct ioring_options, - cmdprio_options.percentage[DDIR_WRITE]), - .minval = 0, - .maxval = 100, - .help = "Send high priority I/O this percentage of the time", + .name = "readfua", + .lname = "Read fua flag support", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct ioring_options, readfua), + .help = "Set FUA flag (force unit access) for all Read operations", + .def = "0", .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_IOURING, }, { - .name = "cmdprio_class", - .lname = "Asynchronous I/O priority class", - .type = FIO_OPT_INT, - .off1 = offsetof(struct ioring_options, - cmdprio_options.class[DDIR_READ]), - .off2 = offsetof(struct ioring_options, - cmdprio_options.class[DDIR_WRITE]), - .help = "Set asynchronous IO priority class", - .minval = IOPRIO_MIN_PRIO_CLASS + 1, - .maxval = IOPRIO_MAX_PRIO_CLASS, - 
.interval = 1, + .name = "writefua", + .lname = "Write fua flag support", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct ioring_options, writefua), + .help = "Set FUA flag (force unit access) for all Write operations", + .def = "0", .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_IOURING, }, { - .name = "cmdprio", - .lname = "Asynchronous I/O priority level", - .type = FIO_OPT_INT, - .off1 = offsetof(struct ioring_options, - cmdprio_options.level[DDIR_READ]), - .off2 = offsetof(struct ioring_options, - cmdprio_options.level[DDIR_WRITE]), - .help = "Set asynchronous IO priority level", - .minval = IOPRIO_MIN_PRIO, - .maxval = IOPRIO_MAX_PRIO, - .interval = 1, + .name = "write_mode", + .lname = "Additional Write commands support (Write Uncorrectable, Write Zeores)", + .type = FIO_OPT_STR, + .off1 = offsetof(struct ioring_options, write_mode), + .help = "Issue Write Uncorrectable or Zeroes command instead of Write command", + .def = "write", + .posval = { + { .ival = "write", + .oval = FIO_URING_CMD_WMODE_WRITE, + .help = "Issue Write commands for write operations" + }, + { .ival = "uncor", + .oval = FIO_URING_CMD_WMODE_UNCOR, + .help = "Issue Write Uncorrectable commands for write operations" + }, + { .ival = "zeroes", + .oval = FIO_URING_CMD_WMODE_ZEROES, + .help = "Issue Write Zeroes commands for write operations" + }, + { .ival = "verify", + .oval = FIO_URING_CMD_WMODE_VERIFY, + .help = "Issue Verify commands for write operations" + }, + }, .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_IOURING, }, { - .name = "cmdprio_bssplit", - .lname = "Priority percentage block size split", - .type = FIO_OPT_STR_STORE, - .off1 = offsetof(struct ioring_options, - cmdprio_options.bssplit_str), - .help = "Set priority percentages for different block sizes", + .name = "verify_mode", + .lname = "Do verify based on the configured command (e.g., Read or Compare command)", + .type = FIO_OPT_STR, + .off1 = offsetof(struct ioring_options, verify_mode), + .help = "Issue Read or 
Compare command in the verification phase", + .def = "read", + .posval = { + { .ival = "read", + .oval = FIO_URING_CMD_VMODE_READ, + .help = "Issue Read commands in the verification phase" + }, + { .ival = "compare", + .oval = FIO_URING_CMD_VMODE_COMPARE, + .help = "Issue Compare commands in the verification phase" + }, + }, .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_IOURING, }, -#else - { - .name = "cmdprio_percentage", - .lname = "high priority percentage", - .type = FIO_OPT_UNSUPPORTED, - .help = "Your platform does not support I/O priority classes", - }, - { - .name = "cmdprio_class", - .lname = "Asynchronous I/O priority class", - .type = FIO_OPT_UNSUPPORTED, - .help = "Your platform does not support I/O priority classes", - }, - { - .name = "cmdprio", - .lname = "Asynchronous I/O priority level", - .type = FIO_OPT_UNSUPPORTED, - .help = "Your platform does not support I/O priority classes", - }, - { - .name = "cmdprio_bssplit", - .lname = "Priority percentage block size split", - .type = FIO_OPT_UNSUPPORTED, - .help = "Your platform does not support I/O priority classes", - }, -#endif { .name = "fixedbufs", .lname = "Fixed (pre-mapped) IO buffers", @@ -218,7 +314,7 @@ static struct fio_option options[] = { { .name = "sqthread_poll", .lname = "Kernel SQ thread polling", - .type = FIO_OPT_INT, + .type = FIO_OPT_STR_SET, .off1 = offsetof(struct ioring_options, sqpoll_thread), .help = "Offload submission/completion to kernel thread", .category = FIO_OPT_C_ENGINE, @@ -248,7 +344,7 @@ static struct fio_option options[] = { .lname = "Uncached", .type = FIO_OPT_INT, .off1 = offsetof(struct ioring_options, uncached), - .help = "Use RWF_UNCACHED for buffered read/writes", + .help = "Use RWF_DONTCACHE for buffered read/writes", .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_IOURING, }, @@ -270,6 +366,83 @@ static struct fio_option options[] = { .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_IOURING, }, + { + .name = "cmd_type", + .lname = "Uring cmd type", + 
.type = FIO_OPT_STR, + .off1 = offsetof(struct ioring_options, cmd_type), + .help = "Specify uring-cmd type", + .def = "nvme", + .posval = { + { .ival = "nvme", + .oval = FIO_URING_CMD_NVME, + .help = "Issue nvme-uring-cmd", + }, + }, + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, + CMDPRIO_OPTIONS(struct ioring_options, FIO_OPT_G_IOURING), + { + .name = "md_per_io_size", + .lname = "Separate Metadata Buffer Size per I/O", + .type = FIO_OPT_INT, + .off1 = offsetof(struct ioring_options, md_per_io_size), + .def = "0", + .help = "Size of separate metadata buffer per I/O (Default: 0)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, + { + .name = "pi_act", + .lname = "Protection Information Action", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct ioring_options, pi_act), + .def = "1", + .help = "Protection Information Action bit (pi_act=1 or pi_act=0)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, + { + .name = "pi_chk", + .lname = "Protection Information Check", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct ioring_options, pi_chk), + .def = NULL, + .help = "Control of Protection Information Checking (pi_chk=GUARD,REFTAG,APPTAG)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, + { + .name = "apptag", + .lname = "Application Tag used in Protection Information", + .type = FIO_OPT_INT, + .off1 = offsetof(struct ioring_options, apptag), + .def = "0x1234", + .help = "Application Tag used in Protection Information field (Default: 0x1234)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, + { + .name = "apptag_mask", + .lname = "Application Tag Mask", + .type = FIO_OPT_INT, + .off1 = offsetof(struct ioring_options, apptag_mask), + .def = "0xffff", + .help = "Application Tag Mask used with Application Tag (Default: 0xffff)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, + { + .name = "deac", + .lname = "Deallocate bit for write zeroes command", + 
.type = FIO_OPT_BOOL, + .off1 = offsetof(struct ioring_options, deac), + .help = "Set DEAC (deallocate) flag for write zeroes command", + .def = "0", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_IOURING, + }, { .name = NULL, }, @@ -278,8 +451,36 @@ static struct fio_option options[] = { static int io_uring_enter(struct ioring_data *ld, unsigned int to_submit, unsigned int min_complete, unsigned int flags) { +#ifdef FIO_ARCH_HAS_SYSCALL + return __do_syscall6(__NR_io_uring_enter, ld->ring_fd, to_submit, + min_complete, flags, NULL, 0); +#else return syscall(__NR_io_uring_enter, ld->ring_fd, to_submit, min_complete, flags, NULL, 0); +#endif +} + +#ifndef BLOCK_URING_CMD_DISCARD +#define BLOCK_URING_CMD_DISCARD _IO(0x12, 0) +#endif + +static void fio_ioring_prep_md(struct thread_data *td, struct io_u *io_u) +{ + struct ioring_data *ld = td->io_ops_data; + struct io_uring_attr_pi *pi_attr = io_u->pi_attr; + struct nvme_data *data = FILE_ENG_DATA(io_u->file); + struct io_uring_sqe *sqe; + + sqe = &ld->sqes[io_u->index]; + + sqe->attr_type_mask = IORING_RW_ATTR_FLAG_PI; + sqe->attr_ptr = (__u64)(uintptr_t)pi_attr; + pi_attr->addr = (__u64)(uintptr_t)io_u->mmap_data; + + if (pi_attr->flags & IO_INTEGRITY_CHK_REFTAG) { + __u64 slba = get_slba(data, io_u->offset); + pi_attr->seed = (__u32)slba; + } } static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u) @@ -324,17 +525,21 @@ static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u) sqe->len = 1; } } + if (o->md_per_io_size) + fio_ioring_prep_md(td, io_u); sqe->rw_flags = 0; if (!td->o.odirect && o->uncached) - sqe->rw_flags |= RWF_UNCACHED; + sqe->rw_flags |= RWF_DONTCACHE; if (o->nowait) sqe->rw_flags |= RWF_NOWAIT; + if (td->o.oatomic && io_u->ddir == DDIR_WRITE) + sqe->rw_flags |= RWF_ATOMIC; /* * Since io_uring can have a submission context (sqthread_poll) * that is different from the process context, we cannot rely on - * the IO priority set by ioprio_set() (option prio/prioclass) - 
* to be inherited. + * the IO priority set by ioprio_set() (options prio, prioclass, + * and priohint) to be inherited. * td->ioprio will have the value of the "default prio", so set * this unconditionally. This value might get overridden by * fio_ioring_cmdprio_prep() if the option cmdprio_percentage or @@ -357,6 +562,16 @@ static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u) sqe->fsync_flags |= IORING_FSYNC_DATASYNC; sqe->opcode = IORING_OP_FSYNC; } + } else if (io_u->ddir == DDIR_TRIM) { + sqe->opcode = IORING_OP_URING_CMD; + sqe->addr = io_u->offset; + sqe->addr3 = io_u->xfer_buflen; + sqe->rw_flags = 0; + sqe->len = sqe->off = 0; + sqe->ioprio = 0; + sqe->cmd_op = BLOCK_URING_CMD_DISCARD; + sqe->__pad1 = 0; + sqe->file_index = 0; } if (o->force_async && ++ld->prepped == o->force_async) { @@ -368,9 +583,96 @@ static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u) return 0; } +static int fio_ioring_cmd_prep(struct thread_data *td, struct io_u *io_u) +{ + struct ioring_data *ld = td->io_ops_data; + struct ioring_options *o = td->eo; + struct fio_file *f = io_u->file; + struct nvme_uring_cmd *cmd; + struct io_uring_sqe *sqe; + struct nvme_dsm *dsm; + void *ptr = ld->dsm; + unsigned int dsm_size; + uint8_t read_opcode = nvme_cmd_read; + + /* only supports nvme_uring_cmd */ + if (o->cmd_type != FIO_URING_CMD_NVME) + return -EINVAL; + + if (io_u->ddir == DDIR_TRIM && td->io_ops->flags & FIO_ASYNCIO_SYNC_TRIM) + return 0; + + sqe = &ld->sqes[(io_u->index) << 1]; + + if (o->registerfiles) { + sqe->fd = f->engine_pos; + sqe->flags = IOSQE_FIXED_FILE; + } else { + sqe->fd = f->fd; + } + sqe->rw_flags = 0; + if (!td->o.odirect && o->uncached) + sqe->rw_flags |= RWF_DONTCACHE; + if (o->nowait) + sqe->rw_flags |= RWF_NOWAIT; + + sqe->opcode = IORING_OP_URING_CMD; + sqe->user_data = (unsigned long) io_u; + if (o->nonvectored) + sqe->cmd_op = NVME_URING_CMD_IO; + else + sqe->cmd_op = NVME_URING_CMD_IO_VEC; + if (o->force_async && ++ld->prepped 
== o->force_async) { + ld->prepped = 0; + sqe->flags |= IOSQE_ASYNC; + } + if (o->fixedbufs) { + sqe->uring_cmd_flags = IORING_URING_CMD_FIXED; + sqe->buf_index = io_u->index; + } + + cmd = (struct nvme_uring_cmd *)sqe->cmd; + dsm_size = sizeof(*ld->dsm) + td->o.num_range * sizeof(struct nvme_dsm_range); + ptr += io_u->index * dsm_size; + dsm = (struct nvme_dsm *)ptr; + + /* + * If READ command belongs to the verification phase and the + * verify_mode=compare, convert READ to COMPARE command. + */ + if (io_u->flags & IO_U_F_VER_LIST && io_u->ddir == DDIR_READ && + o->verify_mode == FIO_URING_CMD_VMODE_COMPARE) { + populate_verify_io_u(td, io_u); + read_opcode = nvme_cmd_compare; + io_u_set(td, io_u, IO_U_F_VER_IN_DEV); + } + + return fio_nvme_uring_cmd_prep(cmd, io_u, + o->nonvectored ? NULL : &ld->iovecs[io_u->index], + dsm, read_opcode, ld->write_opcode, + ld->cdw12_flags[io_u->ddir]); +} + +static void fio_ioring_validate_md(struct thread_data *td, struct io_u *io_u) +{ + struct nvme_data *data; + struct ioring_options *o = td->eo; + int ret; + + data = FILE_ENG_DATA(io_u->file); + if (data->pi_type && (io_u->ddir == DDIR_READ) && !o->pi_act) { + ret = fio_nvme_pi_verify(data, io_u); + if (ret) + io_u->error = -ret; + } + + return; +} + static struct io_u *fio_ioring_event(struct thread_data *td, int event) { struct ioring_data *ld = td->io_ops_data; + struct ioring_options *o = td->eo; struct io_uring_cqe *cqe; struct io_u *io_u; unsigned index; @@ -380,36 +682,126 @@ static struct io_u *fio_ioring_event(struct thread_data *td, int event) cqe = &ld->cq_ring.cqes[index]; io_u = (struct io_u *) (uintptr_t) cqe->user_data; - if (cqe->res != io_u->xfer_buflen) { - if (cqe->res > io_u->xfer_buflen) - io_u->error = -cqe->res; - else - io_u->resid = io_u->xfer_buflen - cqe->res; - } else + /* trim returns 0 on success */ + if (cqe->res == io_u->xfer_buflen || + (io_u->ddir == DDIR_TRIM && !cqe->res)) { io_u->error = 0; + if (io_u->ddir == DDIR_READ && 
o->md_per_io_size && !o->pi_act) + fio_ioring_validate_md(td, io_u); + return io_u; + } + + if (io_u->ddir == DDIR_TRIM) { + ld->async_trim_fail = 1; + cqe->res = 0; + } + if (cqe->res > io_u->xfer_buflen) + io_u->error = -cqe->res; + else + io_u->resid = io_u->xfer_buflen - cqe->res; return io_u; } -static int fio_ioring_cqring_reap(struct thread_data *td, unsigned int events, - unsigned int max) +static struct io_u *fio_ioring_cmd_event(struct thread_data *td, int event) { struct ioring_data *ld = td->io_ops_data; - struct io_cq_ring *ring = &ld->cq_ring; - unsigned head, reaped = 0; + struct ioring_options *o = td->eo; + struct io_uring_cqe *cqe; + struct io_u *io_u; + struct nvme_data *data; + unsigned index; + int ret; - head = *ring->head; - do { - if (head == atomic_load_acquire(ring->tail)) - break; - reaped++; - head++; - } while (reaped + events < max); + index = (event + ld->cq_ring_off) & ld->cq_ring_mask; + if (o->cmd_type == FIO_URING_CMD_NVME) + index <<= 1; + + cqe = &ld->cq_ring.cqes[index]; + io_u = (struct io_u *) (uintptr_t) cqe->user_data; + + io_u->error = cqe->res; + if (io_u->error != 0) + goto ret; + + if (o->cmd_type == FIO_URING_CMD_NVME) { + data = FILE_ENG_DATA(io_u->file); + if (data->pi_type && (io_u->ddir == DDIR_READ) && !o->pi_act) { + ret = fio_nvme_pi_verify(data, io_u); + if (ret) + io_u->error = ret; + } + } + +ret: + /* + * If IO_U_F_DEVICE_ERROR is not set, io_u->error will be parsed as an + * errno, otherwise device-specific error value (status value in CQE). 
+ */ + if ((int)io_u->error > 0) + io_u_set(td, io_u, IO_U_F_DEVICE_ERROR); + else + io_u_clear(td, io_u, IO_U_F_DEVICE_ERROR); + io_u->error = abs((int)io_u->error); + return io_u; +} + +static char *fio_ioring_cmd_errdetails(struct thread_data *td, + struct io_u *io_u) +{ + struct ioring_options *o = td->eo; + unsigned int sct = (io_u->error >> 8) & 0x7; + unsigned int sc = io_u->error & 0xff; +#define MAXERRDETAIL 1024 +#define MAXMSGCHUNK 128 + char *msg, msgchunk[MAXMSGCHUNK]; + + if (!(io_u->flags & IO_U_F_DEVICE_ERROR)) + return NULL; + + msg = calloc(1, MAXERRDETAIL); + strcpy(msg, "io_uring_cmd: "); + + snprintf(msgchunk, MAXMSGCHUNK, "%s: ", io_u->file->file_name); + strlcat(msg, msgchunk, MAXERRDETAIL); + + if (o->cmd_type == FIO_URING_CMD_NVME) { + strlcat(msg, "cq entry status (", MAXERRDETAIL); + + snprintf(msgchunk, MAXMSGCHUNK, "sct=0x%02x; ", sct); + strlcat(msg, msgchunk, MAXERRDETAIL); + + snprintf(msgchunk, MAXMSGCHUNK, "sc=0x%02x)", sc); + strlcat(msg, msgchunk, MAXERRDETAIL); + } else { + /* Print status code in generic */ + snprintf(msgchunk, MAXMSGCHUNK, "status=0x%x", io_u->error); + strlcat(msg, msgchunk, MAXERRDETAIL); + } + + return msg; +} + +static unsigned fio_ioring_cqring_reap(struct thread_data *td, unsigned int max) +{ + struct ioring_data *ld = td->io_ops_data; + struct io_cq_ring *ring = &ld->cq_ring; + unsigned head = *ring->head; + unsigned available = atomic_load_acquire(ring->tail) - head; - if (reaped) - atomic_store_release(ring->head, head); + if (!available) + return 0; - return reaped; + available = min(available, max); + /* + * The CQ consumer index is advanced before the CQEs are actually read. + * This is generally unsafe, as it lets the kernel reuse the CQE slots. + * However, the CQ is sized large enough for the maximum iodepth and a + * new SQE won't be submitted until the CQE is processed, so the CQE + * slot won't actually be reused until it has been processed. 
+ */ + atomic_store_relaxed(ring->head, head + available); + return available; } static int fio_ioring_getevents(struct thread_data *td, unsigned int min, @@ -423,28 +815,55 @@ static int fio_ioring_getevents(struct thread_data *td, unsigned int min, int r; ld->cq_ring_off = *ring->head; - do { - r = fio_ioring_cqring_reap(td, events, max); + for (;;) { + r = fio_ioring_cqring_reap(td, max - events); if (r) { events += r; + if (events >= min) + return events; + if (actual_min != 0) actual_min -= r; - continue; } if (!o->sqpoll_thread) { - r = io_uring_enter(ld, 0, actual_min, - IORING_ENTER_GETEVENTS); + r = io_uring_enter(ld, 0, actual_min, enter_flags); if (r < 0) { if (errno == EAGAIN || errno == EINTR) continue; + r = -errno; td_verror(td, errno, "io_uring_enter"); - break; + return r; } } - } while (events < min); + } +} + +static inline void fio_ioring_cmd_nvme_pi(struct thread_data *td, + struct io_u *io_u) +{ + struct ioring_data *ld = td->io_ops_data; + struct nvme_uring_cmd *cmd; + struct io_uring_sqe *sqe; - return r < 0 ? r : events; + if (io_u->ddir == DDIR_TRIM) + return; + + sqe = &ld->sqes[(io_u->index) << 1]; + cmd = (struct nvme_uring_cmd *)sqe->cmd; + + fio_nvme_pi_fill(cmd, io_u, &ld->ext_opts); +} + +static inline void fio_ioring_setup_pi(struct thread_data *td, + struct io_u *io_u) +{ + struct ioring_data *ld = td->io_ops_data; + + if (io_u->ddir == DDIR_TRIM) + return; + + fio_nvme_generate_guard(io_u, &ld->ext_opts); } static inline void fio_ioring_cmdprio_prep(struct thread_data *td, @@ -461,34 +880,39 @@ static enum fio_q_status fio_ioring_queue(struct thread_data *td, struct io_u *io_u) { struct ioring_data *ld = td->io_ops_data; + struct ioring_options *o = td->eo; struct io_sq_ring *ring = &ld->sq_ring; - unsigned tail, next_tail; + unsigned tail; fio_ro_check(td, io_u); - if (ld->queued == ld->iodepth) + /* should not hit... 
*/ + if (ld->queued == td->o.iodepth) return FIO_Q_BUSY; - if (io_u->ddir == DDIR_TRIM) { + /* if async trim has been tried and failed, punt to sync */ + if (io_u->ddir == DDIR_TRIM && ld->async_trim_fail) { if (ld->queued) return FIO_Q_BUSY; do_io_u_trim(td, io_u); + io_u_mark_submit(td, 1); io_u_mark_complete(td, 1); return FIO_Q_COMPLETED; } - tail = *ring->tail; - next_tail = tail + 1; - if (next_tail == atomic_load_acquire(ring->head)) - return FIO_Q_BUSY; - if (ld->cmdprio.mode != CMDPRIO_MODE_NONE) fio_ioring_cmdprio_prep(td, io_u); + if (o->cmd_type == FIO_URING_CMD_NVME && ld->is_uring_cmd_eng) + fio_ioring_cmd_nvme_pi(td, io_u); + else if (o->md_per_io_size) + fio_ioring_setup_pi(td, io_u); + + tail = *ring->tail; ring->array[tail & ld->sq_ring_mask] = io_u->index; - atomic_store_release(ring->tail, next_tail); + atomic_store_release(ring->tail, tail + 1); ld->queued++; return FIO_Q_QUEUED; @@ -514,6 +938,12 @@ static void fio_ioring_queued(struct thread_data *td, int start, int nr) start++; } + + /* + * only used for iolog + */ + if (td->o.read_iolog_file) + memcpy(&td->last_issue, &now, sizeof(now)); } static int fio_ioring_commit(struct thread_data *td) @@ -532,12 +962,16 @@ static int fio_ioring_commit(struct thread_data *td) */ if (o->sqpoll_thread) { struct io_sq_ring *ring = &ld->sq_ring; + unsigned start = *ld->sq_ring.tail - ld->queued; unsigned flags; - flags = atomic_load_acquire(ring->flags); + flags = atomic_load_relaxed(ring->flags); if (flags & IORING_SQ_NEED_WAKEUP) io_uring_enter(ld, ld->queued, 0, IORING_ENTER_SQ_WAKEUP); + fio_ioring_queued(td, start, ld->queued); + io_u_mark_submit(td, ld->queued); + ld->queued = 0; return 0; } @@ -546,7 +980,7 @@ static int fio_ioring_commit(struct thread_data *td) unsigned start = *ld->sq_ring.head; long nr = ld->queued; - ret = io_uring_enter(ld, nr, 0, IORING_ENTER_GETEVENTS); + ret = io_uring_enter(ld, nr, 0, enter_flags); if (ret > 0) { fio_ioring_queued(td, start, ret); io_u_mark_submit(td, 
ret); @@ -558,13 +992,14 @@ static int fio_ioring_commit(struct thread_data *td) continue; } else { if (errno == EAGAIN || errno == EINTR) { - ret = fio_ioring_cqring_reap(td, 0, ld->queued); + ret = fio_ioring_cqring_reap(td, ld->queued); if (ret) continue; /* Shouldn't happen */ usleep(1); continue; } + ret = -errno; td_verror(td, errno, "io_uring_enter submit"); break; } @@ -592,8 +1027,11 @@ static void fio_ioring_cleanup(struct thread_data *td) fio_cmdprio_cleanup(&ld->cmdprio); free(ld->io_u_index); + free(ld->md_buf); + free(ld->pi_attr); free(ld->iovecs); free(ld->fds); + free(ld->dsm); free(ld); } } @@ -617,14 +1055,22 @@ static int fio_ioring_mmap(struct ioring_data *ld, struct io_uring_params *p) sring->array = ptr + p->sq_off.array; ld->sq_ring_mask = *sring->ring_mask; - ld->mmap[1].len = p->sq_entries * sizeof(struct io_uring_sqe); + if (p->flags & IORING_SETUP_SQE128) + ld->mmap[1].len = 2 * p->sq_entries * sizeof(struct io_uring_sqe); + else + ld->mmap[1].len = p->sq_entries * sizeof(struct io_uring_sqe); ld->sqes = mmap(0, ld->mmap[1].len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, ld->ring_fd, IORING_OFF_SQES); ld->mmap[1].ptr = ld->sqes; - ld->mmap[2].len = p->cq_off.cqes + - p->cq_entries * sizeof(struct io_uring_cqe); + if (p->flags & IORING_SETUP_CQE32) { + ld->mmap[2].len = p->cq_off.cqes + + 2 * p->cq_entries * sizeof(struct io_uring_cqe); + } else { + ld->mmap[2].len = p->cq_off.cqes + + p->cq_entries * sizeof(struct io_uring_cqe); + } ptr = mmap(0, ld->mmap[2].len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, ld->ring_fd, IORING_OFF_CQ_RING); @@ -652,11 +1098,10 @@ static void fio_ioring_probe(struct thread_data *td) /* default to off, as that's always safe */ o->nonvectored = 0; - p = malloc(sizeof(*p) + 256 * sizeof(struct io_uring_probe_op)); + p = calloc(1, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op)); if (!p) return; - memset(p, 0, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op)); ret = 
syscall(__NR_io_uring_register, ld->ring_fd, IORING_REGISTER_PROBE, p, 256); if (ret < 0) @@ -676,7 +1121,7 @@ static int fio_ioring_queue_init(struct thread_data *td) { struct ioring_data *ld = td->io_ops_data; struct ioring_options *o = td->eo; - int depth = td->o.iodepth; + int depth = ld->iodepth; struct io_uring_params p; int ret; @@ -690,6 +1135,14 @@ static int fio_ioring_queue_init(struct thread_data *td) p.flags |= IORING_SETUP_SQ_AFF; p.sq_thread_cpu = o->sqpoll_cpu; } + + /* + * Submission latency for sqpoll_thread is just the time it + * takes to fill in the SQ ring entries, and any syscall if + * IORING_SQ_NEED_WAKEUP is set, we don't need to log that time + * separately. + */ + td->o.disable_slat = 1; } /* @@ -699,10 +1152,39 @@ static int fio_ioring_queue_init(struct thread_data *td) p.flags |= IORING_SETUP_CQSIZE; p.cq_entries = depth; + /* + * Setup COOP_TASKRUN as we don't need to get IPI interrupted for + * completing IO operations. + */ + p.flags |= IORING_SETUP_COOP_TASKRUN; + + /* + * io_uring is always a single issuer, and we can defer task_work + * runs until we reap events. 
+ */ + p.flags |= IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN; + +retry: ret = syscall(__NR_io_uring_setup, depth, &p); - if (ret < 0) + if (ret < 0) { + if (errno == EINVAL && p.flags & IORING_SETUP_DEFER_TASKRUN) { + p.flags &= ~IORING_SETUP_DEFER_TASKRUN; + p.flags &= ~IORING_SETUP_SINGLE_ISSUER; + goto retry; + } + if (errno == EINVAL && p.flags & IORING_SETUP_COOP_TASKRUN) { + p.flags &= ~IORING_SETUP_COOP_TASKRUN; + goto retry; + } + if (errno == EINVAL && p.flags & IORING_SETUP_CQSIZE) { + p.flags &= ~IORING_SETUP_CQSIZE; + goto retry; + } return ret; + } + if (p.features & IORING_FEAT_NO_IOWAIT) + enter_flags |= IORING_ENTER_NO_IOWAIT; ld->ring_fd = ret; fio_ioring_probe(td); @@ -717,24 +1199,108 @@ static int fio_ioring_queue_init(struct thread_data *td) return fio_ioring_mmap(ld, &p); } -static int fio_ioring_register_files(struct thread_data *td) +static int fio_ioring_cmd_queue_init(struct thread_data *td) { struct ioring_data *ld = td->io_ops_data; - struct fio_file *f; - unsigned int i; + struct ioring_options *o = td->eo; + int depth = ld->iodepth; + struct io_uring_params p; int ret; - ld->fds = calloc(td->o.nr_files, sizeof(int)); + memset(&p, 0, sizeof(p)); - for_each_file(td, f, i) { - ret = generic_open_file(td, f); - if (ret) - goto err; - ld->fds[i] = f->fd; - f->engine_pos = i; + if (o->hipri) + p.flags |= IORING_SETUP_IOPOLL; + if (o->sqpoll_thread) { + p.flags |= IORING_SETUP_SQPOLL; + if (o->sqpoll_set) { + p.flags |= IORING_SETUP_SQ_AFF; + p.sq_thread_cpu = o->sqpoll_cpu; + } + + /* + * Submission latency for sqpoll_thread is just the time it + * takes to fill in the SQ ring entries, and any syscall if + * IORING_SQ_NEED_WAKEUP is set, we don't need to log that time + * separately. 
+ */ + td->o.disable_slat = 1; + } + if (o->cmd_type == FIO_URING_CMD_NVME) { + p.flags |= IORING_SETUP_SQE128; + p.flags |= IORING_SETUP_CQE32; } - ret = syscall(__NR_io_uring_register, ld->ring_fd, + /* + * Clamp CQ ring size at our SQ ring size, we don't need more entries + * than that. + */ + p.flags |= IORING_SETUP_CQSIZE; + p.cq_entries = depth; + + /* + * Setup COOP_TASKRUN as we don't need to get IPI interrupted for + * completing IO operations. + */ + p.flags |= IORING_SETUP_COOP_TASKRUN; + + /* + * io_uring is always a single issuer, and we can defer task_work + * runs until we reap events. + */ + p.flags |= IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN; + +retry: + ret = syscall(__NR_io_uring_setup, depth, &p); + if (ret < 0) { + if (errno == EINVAL && p.flags & IORING_SETUP_DEFER_TASKRUN) { + p.flags &= ~IORING_SETUP_DEFER_TASKRUN; + p.flags &= ~IORING_SETUP_SINGLE_ISSUER; + goto retry; + } + if (errno == EINVAL && p.flags & IORING_SETUP_COOP_TASKRUN) { + p.flags &= ~IORING_SETUP_COOP_TASKRUN; + goto retry; + } + if (errno == EINVAL && p.flags & IORING_SETUP_CQSIZE) { + p.flags &= ~IORING_SETUP_CQSIZE; + goto retry; + } + return ret; + } + + ld->ring_fd = ret; + + fio_ioring_probe(td); + + if (o->fixedbufs) { + ret = syscall(__NR_io_uring_register, ld->ring_fd, + IORING_REGISTER_BUFFERS, ld->iovecs, depth); + if (ret < 0) + return ret; + } + + return fio_ioring_mmap(ld, &p); +} + +static int fio_ioring_register_files(struct thread_data *td) +{ + struct ioring_data *ld = td->io_ops_data; + struct fio_file *f; + unsigned int i; + int ret; + + ld->fds = calloc(td->o.nr_files, sizeof(int)); + + for_each_file(td, f, i) { + ret = generic_open_file(td, f); + if (ret) + goto err; + ld->fds[i] = f->fd; + f->engine_pos = i; + } + + ret = syscall(__NR_io_uring_register, ld->ring_fd, IORING_REGISTER_FILES, ld->fds, td->o.nr_files); if (ret) { err: @@ -782,7 +1348,7 @@ static int fio_ioring_post_init(struct thread_data *td) return 1; } - for (i = 0; i < 
td->o.iodepth; i++) { + for (i = 0; i < ld->iodepth; i++) { struct io_uring_sqe *sqe; sqe = &ld->sqes[i]; @@ -800,11 +1366,106 @@ static int fio_ioring_post_init(struct thread_data *td) return 0; } +static int fio_ioring_cmd_post_init(struct thread_data *td) +{ + struct ioring_data *ld = td->io_ops_data; + struct ioring_options *o = td->eo; + struct io_u *io_u; + int err, i; + + for (i = 0; i < td->o.iodepth; i++) { + struct iovec *iov = &ld->iovecs[i]; + + io_u = ld->io_u_index[i]; + iov->iov_base = io_u->buf; + iov->iov_len = td_max_bs(td); + } + + err = fio_ioring_cmd_queue_init(td); + if (err) { + int init_err = errno; + + td_verror(td, init_err, "io_queue_init"); + return 1; + } + + for (i = 0; i < ld->iodepth; i++) { + struct io_uring_sqe *sqe; + + if (o->cmd_type == FIO_URING_CMD_NVME) { + sqe = &ld->sqes[i << 1]; + memset(sqe, 0, 2 * sizeof(*sqe)); + } else { + sqe = &ld->sqes[i]; + memset(sqe, 0, sizeof(*sqe)); + } + } + + if (o->registerfiles) { + err = fio_ioring_register_files(td); + if (err) { + td_verror(td, errno, "ioring_register_files"); + return 1; + } + } + + return 0; +} + +static void parse_prchk_flags(struct ioring_options *o) +{ + if (!o->pi_chk) + return; + + if (strstr(o->pi_chk, "GUARD") != NULL) + o->prchk = NVME_IO_PRINFO_PRCHK_GUARD; + if (strstr(o->pi_chk, "REFTAG") != NULL) + o->prchk |= NVME_IO_PRINFO_PRCHK_REF; + if (strstr(o->pi_chk, "APPTAG") != NULL) + o->prchk |= NVME_IO_PRINFO_PRCHK_APP; +} + +static int fio_ioring_cmd_init(struct thread_data *td, struct ioring_data *ld) +{ + struct ioring_options *o = td->eo; + + if (td_write(td)) { + switch (o->write_mode) { + case FIO_URING_CMD_WMODE_UNCOR: + ld->write_opcode = nvme_cmd_write_uncor; + break; + case FIO_URING_CMD_WMODE_ZEROES: + ld->write_opcode = nvme_cmd_write_zeroes; + if (o->deac) + ld->cdw12_flags[DDIR_WRITE] = 1 << 25; + break; + case FIO_URING_CMD_WMODE_VERIFY: + ld->write_opcode = nvme_cmd_verify; + break; + default: + ld->write_opcode = nvme_cmd_write; + break; + } + 
} + + if (o->readfua) + ld->cdw12_flags[DDIR_READ] = 1 << 30; + if (o->writefua) + ld->cdw12_flags[DDIR_WRITE] = 1 << 30; + + return 0; +} + static int fio_ioring_init(struct thread_data *td) { struct ioring_options *o = td->eo; struct ioring_data *ld; - int ret; + struct nvme_dsm *dsm; + void *ptr; + unsigned int dsm_size; + unsigned long long md_size; + int ret, i; + struct nvme_cmd_ext_io_opts *ext_opts; /* sqthread submission requires registered files */ if (o->sqpoll_thread) @@ -818,13 +1479,66 @@ static int fio_ioring_init(struct thread_data *td) ld = calloc(1, sizeof(*ld)); - /* ring depth must be a power-of-2 */ - ld->iodepth = td->o.iodepth; - td->o.iodepth = roundup_pow2(td->o.iodepth); + ld->is_uring_cmd_eng = (td->io_ops->prep == fio_ioring_cmd_prep); + + /* + * The internal io_uring queue depth must be a power-of-2, as that's + * how the ring interface works. So round that up, in case the user + * set iodepth isn't a power-of-2. Leave the fio depth the same, as + * not to be driving too much of an iodepth, if we did round up. + */ + ld->iodepth = roundup_pow2(td->o.iodepth); /* io_u index */ ld->io_u_index = calloc(td->o.iodepth, sizeof(struct io_u *)); - ld->iovecs = calloc(td->o.iodepth, sizeof(struct iovec)); + + if (!ld->is_uring_cmd_eng && o->md_per_io_size) { + if (o->apptag_mask != 0xffff) { + log_err("fio: io_uring with metadata requires an apptag_mask of 0xffff\n"); + free(ld->io_u_index); + free(ld); + return 1; + } + } + + /* + * metadata buffer + * We are only supporting iomem=malloc / mem=malloc as of now. 
+ */ + if (o->md_per_io_size && (!ld->is_uring_cmd_eng || + (ld->is_uring_cmd_eng && o->cmd_type == FIO_URING_CMD_NVME))) { + md_size = (unsigned long long) o->md_per_io_size + * (unsigned long long) td->o.iodepth; + md_size += page_mask + td->o.mem_align; + if (td->o.mem_align && td->o.mem_align > page_size) + md_size += td->o.mem_align - page_size; + ld->md_buf = malloc(md_size); + if (!ld->md_buf) { + free(ld->io_u_index); + free(ld); + return 1; + } + + if (!ld->is_uring_cmd_eng) { + ld->pi_attr = calloc(ld->iodepth, sizeof(struct io_uring_attr_pi)); + if (!ld->pi_attr) { + free(ld->io_u_index); + free(ld->md_buf); + free(ld); + return 1; + } + } + + } + parse_prchk_flags(o); + ext_opts = &ld->ext_opts; + if (o->pi_act) + ext_opts->io_flags |= NVME_IO_PRINFO_PRACT; + ext_opts->io_flags |= o->prchk; + ext_opts->apptag = o->apptag; + ext_opts->apptag_mask = o->apptag_mask; + + ld->iovecs = calloc(ld->iodepth, sizeof(struct iovec)); td->io_ops_data = ld; @@ -834,22 +1548,178 @@ static int fio_ioring_init(struct thread_data *td) return 1; } + /* + * For io_uring_cmd, trims are async operations unless we are operating + * in zbd mode where trim means zone reset. 
+ */ + if (td_trim(td) && td->o.zone_mode == ZONE_MODE_ZBD && + ld->is_uring_cmd_eng) { + td->io_ops->flags |= FIO_ASYNCIO_SYNC_TRIM; + } else { + dsm_size = sizeof(*ld->dsm); + dsm_size += td->o.num_range * sizeof(struct nvme_dsm_range); + ld->dsm = calloc(td->o.iodepth, dsm_size); + ptr = ld->dsm; + for (i = 0; i < td->o.iodepth; i++) { + dsm = (struct nvme_dsm *)ptr; + dsm->nr_ranges = td->o.num_range; + ptr += dsm_size; + } + } + + if (ld->is_uring_cmd_eng) + return fio_ioring_cmd_init(td, ld); return 0; } static int fio_ioring_io_u_init(struct thread_data *td, struct io_u *io_u) { struct ioring_data *ld = td->io_ops_data; + struct ioring_options *o = td->eo; + struct nvme_pi_data *pi_data; + char *p, *q; ld->io_u_index[io_u->index] = io_u; + + p = PTR_ALIGN(ld->md_buf, page_mask) + td->o.mem_align; + p += o->md_per_io_size * io_u->index; + io_u->mmap_data = p; + + if (ld->pi_attr) { + struct io_uring_attr_pi *pi_attr; + + q = ld->pi_attr; + q += (sizeof(struct io_uring_attr_pi) * io_u->index); + io_u->pi_attr = q; + + pi_attr = io_u->pi_attr; + pi_attr->len = o->md_per_io_size; + pi_attr->app_tag = o->apptag; + pi_attr->flags = 0; + if (o->prchk & NVME_IO_PRINFO_PRCHK_GUARD) + pi_attr->flags |= IO_INTEGRITY_CHK_GUARD; + if (o->prchk & NVME_IO_PRINFO_PRCHK_REF) + pi_attr->flags |= IO_INTEGRITY_CHK_REFTAG; + if (o->prchk & NVME_IO_PRINFO_PRCHK_APP) + pi_attr->flags |= IO_INTEGRITY_CHK_APPTAG; + } + + if (!o->pi_act) { + pi_data = calloc(1, sizeof(*pi_data)); + pi_data->io_flags |= o->prchk; + pi_data->apptag_mask = o->apptag_mask; + pi_data->apptag = o->apptag; + io_u->engine_data = pi_data; + } + return 0; } +static void fio_ioring_io_u_free(struct thread_data *td, struct io_u *io_u) +{ + struct nvme_pi *pi = io_u->engine_data; + + free(pi); + io_u->engine_data = NULL; +} + +static int fio_get_pi_info(struct fio_file *f, struct nvme_data *data) +{ + struct logical_block_metadata_cap md_cap; + int ret; + int fd, err = 0; + + fd = open(f->file_name, O_RDONLY); + 
if (fd < 0) + return -errno; + + ret = ioctl(fd, FS_IOC_GETLBMD_CAP, &md_cap); + if (ret < 0) { + err = -errno; + log_err("%s: failed to query protection information capabilities; error %d\n", f->file_name, errno); + goto out; + } + + if (!(md_cap.lbmd_flags & LBMD_PI_CAP_INTEGRITY)) { + log_err("%s: Protection information not supported\n", f->file_name); + err = -ENOTSUP; + goto out; + } + + /* Currently we don't support storage tags */ + if (md_cap.lbmd_storage_tag_size) { + log_err("%s: Storage tag not supported\n", f->file_name); + err = -ENOTSUP; + goto out; + } + + data->lba_size = md_cap.lbmd_interval; + data->lba_shift = ilog2(data->lba_size); + data->ms = md_cap.lbmd_size; + data->pi_size = md_cap.lbmd_pi_size; + data->pi_loc = !(md_cap.lbmd_pi_offset); + + /* Assume Type 1 PI if reference tags supported */ + if (md_cap.lbmd_flags & LBMD_PI_CAP_REFTAG) + data->pi_type = NVME_NS_DPS_PI_TYPE1; + else + data->pi_type = NVME_NS_DPS_PI_TYPE3; + + switch (md_cap.lbmd_guard_tag_type) { + case LBMD_PI_CSUM_CRC16_T10DIF: + data->guard_type = NVME_NVM_NS_16B_GUARD; + break; + case LBMD_PI_CSUM_CRC64_NVME: + data->guard_type = NVME_NVM_NS_64B_GUARD; + break; + default: + log_err("%s: unsupported checksum type %d\n", f->file_name, + md_cap.lbmd_guard_tag_type); + err = -ENOTSUP; + goto out; + } + +out: + close(fd); + return err; +} + +static inline int fio_ioring_open_file_md(struct thread_data *td, struct fio_file *f) +{ + int ret = 0; + struct nvme_data *data = NULL; + + data = FILE_ENG_DATA(f); + if (data == NULL) { + data = calloc(1, sizeof(struct nvme_data)); + ret = fio_get_pi_info(f, data); + if (ret) { + free(data); + return ret; + } + + FILE_SET_ENG_DATA(f, data); + } + + return ret; +} + static int fio_ioring_open_file(struct thread_data *td, struct fio_file *f) { struct ioring_data *ld = td->io_ops_data; struct ioring_options *o = td->eo; + if (o->md_per_io_size) { + /* + * This will be a no-op when called by the io_uring_cmd + * ioengine because engine 
data has already been collected by + * the time this call is made + */ + int ret = fio_ioring_open_file_md(td, f); + if (ret) + return ret; + } + if (!ld || !o->registerfiles) return generic_open_file(td, f); @@ -857,6 +1727,106 @@ static int fio_ioring_open_file(struct thread_data *td, struct fio_file *f) return 0; } +static int verify_params(struct thread_data *td, struct nvme_data *data, + struct fio_file *f, enum fio_ddir ddir) +{ + struct ioring_options *o = td->eo; + unsigned int lba_size; + + lba_size = data->lba_ext ? data->lba_ext : data->lba_size; + if (td->o.min_bs[ddir] % lba_size || td->o.max_bs[ddir] % lba_size) { + if (data->lba_ext) { + log_err("%s: block size must be a multiple of %u " + "(LBA data size + Metadata size)\n", f->file_name, lba_size); + if (td->o.min_bs[ddir] == td->o.max_bs[ddir] && + !(td->o.min_bs[ddir] % data->lba_size)) { + /* fixed block size is actually a multiple of LBA data size */ + unsigned long long suggestion = lba_size * + (td->o.min_bs[ddir] / data->lba_size); + log_err("Did you mean to use a block size of %llu?\n", suggestion); + } + } else { + log_err("%s: block size must be a multiple of LBA data size\n", + f->file_name); + } + td_verror(td, EINVAL, "fio_ioring_cmd_open_file"); + return 1; + } + if (data->ms && !data->lba_ext && ddir != DDIR_TRIM && + (o->md_per_io_size < ((td->o.max_bs[ddir] / data->lba_size) * data->ms))) { + log_err("%s: md_per_io_size should be at least %llu bytes\n", + f->file_name, + ((td->o.max_bs[ddir] / data->lba_size) * data->ms)); + td_verror(td, EINVAL, "fio_ioring_cmd_open_file"); + return 1; + } + + return 0; +} + +static int fio_ioring_open_nvme(struct thread_data *td, struct fio_file *f) +{ + struct ioring_options *o = td->eo; + struct nvme_data *data = NULL; + __u64 nlba = 0; + int ret; + + /* Store the namespace-id and lba size. 
*/ + data = FILE_ENG_DATA(f); + if (data == NULL) { + data = calloc(1, sizeof(struct nvme_data)); + ret = fio_nvme_get_info(f, &nlba, o->pi_act, data); + if (ret) { + free(data); + return ret; + } + + FILE_SET_ENG_DATA(f, data); + } + + for_each_rw_ddir(ddir) { + ret = verify_params(td, data, f, ddir); + if (ret) + return ret; + } + + /* + * For extended logical block sizes we cannot use verify when + * end to end data protection checks are enabled, as the PI + * section of data buffer conflicts with verify. + */ + if (data->ms && data->pi_type && data->lba_ext && + td->o.verify != VERIFY_NONE) { + log_err("%s: for extended LBA, verify cannot be used when E2E " + "data protection is enabled\n", f->file_name); + td_verror(td, EINVAL, "fio_ioring_cmd_open_file"); + return 1; + } + + if (o->write_mode != FIO_URING_CMD_WMODE_WRITE && !td_write(td)) { + log_err("%s: 'readwrite=|rw=' has no write\n", f->file_name); + td_verror(td, EINVAL, "fio_ioring_cmd_open_file"); + return 1; + } + + return 0; +} + +static int fio_ioring_cmd_open_file(struct thread_data *td, struct fio_file *f) +{ + struct ioring_options *o = td->eo; + + if (o->cmd_type == FIO_URING_CMD_NVME) { + int ret; + + ret = fio_ioring_open_nvme(td, f); + if (ret) + return ret; + } + + return fio_ioring_open_file(td, f); +} + static int fio_ioring_close_file(struct thread_data *td, struct fio_file *f) { struct ioring_data *ld = td->io_ops_data; @@ -869,13 +1839,156 @@ static int fio_ioring_close_file(struct thread_data *td, struct fio_file *f) return 0; } -static struct ioengine_ops ioengine = { +static int fio_ioring_cmd_close_file(struct thread_data *td, + struct fio_file *f) +{ + struct ioring_options *o = td->eo; + + if (o->cmd_type == FIO_URING_CMD_NVME) { + struct nvme_data *data = FILE_ENG_DATA(f); + + FILE_SET_ENG_DATA(f, NULL); + free(data); + } + + return fio_ioring_close_file(td, f); +} + +static int fio_ioring_cmd_get_file_size(struct thread_data *td, + struct fio_file *f) +{ + struct ioring_options 
*o = td->eo; + + if (fio_file_size_known(f)) + return 0; + + if (o->cmd_type == FIO_URING_CMD_NVME) { + struct nvme_data *data = NULL; + __u64 nlba = 0; + int ret; + + data = calloc(1, sizeof(struct nvme_data)); + ret = fio_nvme_get_info(f, &nlba, o->pi_act, data); + if (ret) { + free(data); + return ret; + } + + if (data->lba_ext) + f->real_file_size = data->lba_ext * nlba; + else + f->real_file_size = data->lba_size * nlba; + fio_file_set_size_known(f); + + FILE_SET_ENG_DATA(f, data); + return 0; + } + return generic_get_file_size(td, f); +} + +static int fio_ioring_get_zoned_model(struct thread_data *td, + struct fio_file *f, + enum zbd_zoned_model *model) +{ + return blkzoned_get_zoned_model(td, f, model); +} + +static int fio_ioring_report_zones(struct thread_data *td, + struct fio_file *f, uint64_t offset, + struct zbd_zone *zbdz, + unsigned int nr_zones) +{ + return blkzoned_report_zones(td, f, offset, zbdz, nr_zones); +} + +static int fio_ioring_reset_wp(struct thread_data *td, struct fio_file *f, + uint64_t offset, uint64_t length) +{ + return blkzoned_reset_wp(td, f, offset, length); +} + +static int fio_ioring_get_max_open_zones(struct thread_data *td, + struct fio_file *f, + unsigned int *max_open_zones) +{ + return blkzoned_get_max_open_zones(td, f, max_open_zones); +} + +static int fio_ioring_finish_zone(struct thread_data *td, struct fio_file *f, + uint64_t offset, uint64_t length) +{ + return blkzoned_finish_zone(td, f, offset, length); +} + +static int fio_ioring_move_zone_wp(struct thread_data *td, struct fio_file *f, + struct zbd_zone *z, uint64_t length, + const char *buf) +{ + return blkzoned_move_zone_wp(td, f, z, length, buf); +} + +static int fio_ioring_cmd_get_zoned_model(struct thread_data *td, + struct fio_file *f, + enum zbd_zoned_model *model) +{ + return fio_nvme_get_zoned_model(td, f, model); +} + +static int fio_ioring_cmd_report_zones(struct thread_data *td, + struct fio_file *f, uint64_t offset, + struct zbd_zone *zbdz, + unsigned 
int nr_zones) +{ + return fio_nvme_report_zones(td, f, offset, zbdz, nr_zones); +} + +static int fio_ioring_cmd_reset_wp(struct thread_data *td, struct fio_file *f, + uint64_t offset, uint64_t length) +{ + return fio_nvme_reset_wp(td, f, offset, length); +} + +static int fio_ioring_cmd_get_max_open_zones(struct thread_data *td, + struct fio_file *f, + unsigned int *max_open_zones) +{ + return fio_nvme_get_max_open_zones(td, f, max_open_zones); +} + +static int fio_ioring_cmd_fetch_ruhs(struct thread_data *td, struct fio_file *f, + struct fio_ruhs_info *fruhs_info) +{ + struct nvme_fdp_ruh_status *ruhs; + int bytes, nr_ruhs, ret, i; + + nr_ruhs = fruhs_info->nr_ruhs; + bytes = sizeof(*ruhs) + fruhs_info->nr_ruhs * sizeof(struct nvme_fdp_ruh_status_desc); + + ruhs = calloc(1, bytes); + if (!ruhs) + return -ENOMEM; + + ret = fio_nvme_iomgmt_ruhs(td, f, ruhs, bytes); + if (ret) + goto free; + + fruhs_info->nr_ruhs = le16_to_cpu(ruhs->nruhsd); + for (i = 0; i < nr_ruhs; i++) + fruhs_info->plis[i] = le16_to_cpu(ruhs->ruhss[i].pid); +free: + free(ruhs); + return ret; +} + +static struct ioengine_ops ioengine_uring = { .name = "io_uring", .version = FIO_IOOPS_VERSION, - .flags = FIO_ASYNCIO_SYNC_TRIM | FIO_NO_OFFLOAD, + .flags = FIO_NO_OFFLOAD | FIO_ASYNCIO_SETS_ISSUE_TIME | + FIO_ATOMICWRITES, .init = fio_ioring_init, .post_init = fio_ioring_post_init, .io_u_init = fio_ioring_io_u_init, + .io_u_free = fio_ioring_io_u_free, .prep = fio_ioring_prep, .queue = fio_ioring_queue, .commit = fio_ioring_commit, @@ -885,17 +1998,54 @@ static struct ioengine_ops ioengine = { .open_file = fio_ioring_open_file, .close_file = fio_ioring_close_file, .get_file_size = generic_get_file_size, + .get_zoned_model = fio_ioring_get_zoned_model, + .report_zones = fio_ioring_report_zones, + .reset_wp = fio_ioring_reset_wp, + .get_max_open_zones = fio_ioring_get_max_open_zones, + .finish_zone = fio_ioring_finish_zone, + .move_zone_wp = fio_ioring_move_zone_wp, + .options = options, + 
.option_struct_size = sizeof(struct ioring_options), +}; + +static struct ioengine_ops ioengine_uring_cmd = { + .name = "io_uring_cmd", + .version = FIO_IOOPS_VERSION, + .flags = FIO_NO_OFFLOAD | FIO_MEMALIGN | FIO_RAWIO | + FIO_ASYNCIO_SETS_ISSUE_TIME | + FIO_MULTI_RANGE_TRIM, + .init = fio_ioring_init, + .post_init = fio_ioring_cmd_post_init, + .io_u_init = fio_ioring_io_u_init, + .io_u_free = fio_ioring_io_u_free, + .prep = fio_ioring_cmd_prep, + .queue = fio_ioring_queue, + .commit = fio_ioring_commit, + .getevents = fio_ioring_getevents, + .event = fio_ioring_cmd_event, + .errdetails = fio_ioring_cmd_errdetails, + .cleanup = fio_ioring_cleanup, + .open_file = fio_ioring_cmd_open_file, + .close_file = fio_ioring_cmd_close_file, + .get_file_size = fio_ioring_cmd_get_file_size, + .get_zoned_model = fio_ioring_cmd_get_zoned_model, + .report_zones = fio_ioring_cmd_report_zones, + .reset_wp = fio_ioring_cmd_reset_wp, + .get_max_open_zones = fio_ioring_cmd_get_max_open_zones, .options = options, .option_struct_size = sizeof(struct ioring_options), + .fdp_fetch_ruhs = fio_ioring_cmd_fetch_ruhs, }; static void fio_init fio_ioring_register(void) { - register_ioengine(&ioengine); + register_ioengine(&ioengine_uring); + register_ioengine(&ioengine_uring_cmd); } static void fio_exit fio_ioring_unregister(void) { - unregister_ioengine(&ioengine); + unregister_ioengine(&ioengine_uring); + unregister_ioengine(&ioengine_uring_cmd); } #endif diff --git a/engines/libaio.c b/engines/libaio.c index 9c278d060b..0c207d60d9 100644 --- a/engines/libaio.c +++ b/engines/libaio.c @@ -37,6 +37,7 @@ struct libaio_data { struct io_u **io_us; struct io_u **io_u_index; + struct iovec *iovecs; /* for vectored requests */ /* * Basic ring buffer. 
'head' is incremented in _queue(), and @@ -60,6 +61,7 @@ struct libaio_options { unsigned int userspace_reap; struct cmdprio_options cmdprio_options; unsigned int nowait; + unsigned int vectored; }; static struct fio_option options[] = { @@ -72,87 +74,6 @@ static struct fio_option options[] = { .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_LIBAIO, }, -#ifdef FIO_HAVE_IOPRIO_CLASS - { - .name = "cmdprio_percentage", - .lname = "high priority percentage", - .type = FIO_OPT_INT, - .off1 = offsetof(struct libaio_options, - cmdprio_options.percentage[DDIR_READ]), - .off2 = offsetof(struct libaio_options, - cmdprio_options.percentage[DDIR_WRITE]), - .minval = 0, - .maxval = 100, - .help = "Send high priority I/O this percentage of the time", - .category = FIO_OPT_C_ENGINE, - .group = FIO_OPT_G_LIBAIO, - }, - { - .name = "cmdprio_class", - .lname = "Asynchronous I/O priority class", - .type = FIO_OPT_INT, - .off1 = offsetof(struct libaio_options, - cmdprio_options.class[DDIR_READ]), - .off2 = offsetof(struct libaio_options, - cmdprio_options.class[DDIR_WRITE]), - .help = "Set asynchronous IO priority class", - .minval = IOPRIO_MIN_PRIO_CLASS + 1, - .maxval = IOPRIO_MAX_PRIO_CLASS, - .interval = 1, - .category = FIO_OPT_C_ENGINE, - .group = FIO_OPT_G_LIBAIO, - }, - { - .name = "cmdprio", - .lname = "Asynchronous I/O priority level", - .type = FIO_OPT_INT, - .off1 = offsetof(struct libaio_options, - cmdprio_options.level[DDIR_READ]), - .off2 = offsetof(struct libaio_options, - cmdprio_options.level[DDIR_WRITE]), - .help = "Set asynchronous IO priority level", - .minval = IOPRIO_MIN_PRIO, - .maxval = IOPRIO_MAX_PRIO, - .interval = 1, - .category = FIO_OPT_C_ENGINE, - .group = FIO_OPT_G_LIBAIO, - }, - { - .name = "cmdprio_bssplit", - .lname = "Priority percentage block size split", - .type = FIO_OPT_STR_STORE, - .off1 = offsetof(struct libaio_options, - cmdprio_options.bssplit_str), - .help = "Set priority percentages for different block sizes", - .category = 
FIO_OPT_C_ENGINE, - .group = FIO_OPT_G_LIBAIO, - }, -#else - { - .name = "cmdprio_percentage", - .lname = "high priority percentage", - .type = FIO_OPT_UNSUPPORTED, - .help = "Your platform does not support I/O priority classes", - }, - { - .name = "cmdprio_class", - .lname = "Asynchronous I/O priority class", - .type = FIO_OPT_UNSUPPORTED, - .help = "Your platform does not support I/O priority classes", - }, - { - .name = "cmdprio", - .lname = "Asynchronous I/O priority level", - .type = FIO_OPT_UNSUPPORTED, - .help = "Your platform does not support I/O priority classes", - }, - { - .name = "cmdprio_bssplit", - .lname = "Priority percentage block size split", - .type = FIO_OPT_UNSUPPORTED, - .help = "Your platform does not support I/O priority classes", - }, -#endif { .name = "nowait", .lname = "RWF_NOWAIT", @@ -162,6 +83,17 @@ static struct fio_option options[] = { .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_LIBAIO, }, + { + .name = "libaio_vectored", + .lname = "Use libaio preadv,pwritev", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct libaio_options, vectored), + .help = "Use libaio {preadv,pwritev} instead of libaio {pread,pwrite}", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_LIBAIO, + }, + + CMDPRIO_OPTIONS(struct libaio_options, FIO_OPT_G_LIBAIO), { .name = NULL, }, @@ -181,15 +113,38 @@ static int fio_libaio_prep(struct thread_data *td, struct io_u *io_u) struct libaio_options *o = td->eo; struct fio_file *f = io_u->file; struct iocb *iocb = &io_u->iocb; + struct libaio_data *ld = td->io_ops_data; if (io_u->ddir == DDIR_READ) { - io_prep_pread(iocb, f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); + if (o->vectored) { + struct iovec *iov = &ld->iovecs[io_u->index]; + + iov->iov_base = io_u->xfer_buf; + iov->iov_len = (size_t)io_u->xfer_buflen; + io_prep_preadv(iocb, f->fd, iov, 1, io_u->offset); + } else { + io_prep_pread(iocb, f->fd, io_u->xfer_buf, io_u->xfer_buflen, + io_u->offset); + } if (o->nowait) iocb->aio_rw_flags |= 
RWF_NOWAIT; } else if (io_u->ddir == DDIR_WRITE) { - io_prep_pwrite(iocb, f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset); + if (o->vectored) { + struct iovec *iov = &ld->iovecs[io_u->index]; + + iov->iov_base = io_u->xfer_buf; + iov->iov_len = (size_t)io_u->xfer_buflen; + io_prep_pwritev(iocb, f->fd, iov, 1, io_u->offset); + } else { + io_prep_pwrite(iocb, f->fd, io_u->xfer_buf, io_u->xfer_buflen, + io_u->offset); + } if (o->nowait) iocb->aio_rw_flags |= RWF_NOWAIT; +#ifdef FIO_HAVE_RWF_ATOMIC + if (td->o.oatomic) + iocb->aio_rw_flags |= RWF_ATOMIC; +#endif } else if (ddir_sync(io_u->ddir)) io_prep_fsync(iocb, f->fd); @@ -288,14 +243,16 @@ static int fio_libaio_getevents(struct thread_data *td, unsigned int min, && actual_min == 0 && ((struct aio_ring *)(ld->aio_ctx))->magic == AIO_RING_MAGIC) { - r = user_io_getevents(ld->aio_ctx, max, + r = user_io_getevents(ld->aio_ctx, max - events, ld->aio_events + events); } else { r = io_getevents(ld->aio_ctx, actual_min, - max, ld->aio_events + events, lt); + max - events, ld->aio_events + events, lt); } - if (r > 0) + if (r > 0) { events += r; + actual_min -= min((unsigned int)events, actual_min); + } else if ((min && r == 0) || r == -EAGAIN) { fio_libaio_commit(td); if (actual_min) @@ -317,20 +274,6 @@ static enum fio_q_status fio_libaio_queue(struct thread_data *td, if (ld->queued == td->o.iodepth) return FIO_Q_BUSY; - /* - * fsync is tricky, since it can fail and we need to do it - * serialized with other io. the reason is that linux doesn't - * support aio fsync yet. So return busy for the case where we - * have pending io, to let fio complete those first. 
- */ - if (ddir_sync(io_u->ddir)) { - if (ld->queued) - return FIO_Q_BUSY; - - do_io_u_sync(td, io_u); - return FIO_Q_COMPLETED; - } - if (io_u->ddir == DDIR_TRIM) { if (ld->queued) return FIO_Q_BUSY; @@ -368,6 +311,12 @@ static void fio_libaio_queued(struct thread_data *td, struct io_u **io_us, memcpy(&io_u->issue_time, &now, sizeof(now)); io_u_queued(td, io_u); } + + /* + * only used for iolog + */ + if (td->o.read_iolog_file) + memcpy(&td->last_issue, &now, sizeof(now)); } static int fio_libaio_commit(struct thread_data *td) @@ -440,13 +389,6 @@ static int fio_libaio_commit(struct thread_data *td) return ret; } -static int fio_libaio_cancel(struct thread_data *td, struct io_u *io_u) -{ - struct libaio_data *ld = td->io_ops_data; - - return io_cancel(ld->aio_ctx, &io_u->iocb, ld->aio_events); -} - static void fio_libaio_cleanup(struct thread_data *td) { struct libaio_data *ld = td->io_ops_data; @@ -462,6 +404,7 @@ static void fio_libaio_cleanup(struct thread_data *td) io_destroy(ld->aio_ctx); fio_cmdprio_cleanup(&ld->cmdprio); + free(ld->iovecs); free(ld->aio_events); free(ld->iocbs); free(ld->io_us); @@ -496,6 +439,7 @@ static int fio_libaio_init(struct thread_data *td) ld->aio_events = calloc(ld->entries, sizeof(struct io_event)); ld->iocbs = calloc(ld->entries, sizeof(struct iocb *)); ld->io_us = calloc(ld->entries, sizeof(struct io_u *)); + ld->iovecs = calloc(ld->entries, sizeof(ld->iovecs[0])); td->io_ops_data = ld; @@ -511,13 +455,14 @@ static int fio_libaio_init(struct thread_data *td) FIO_STATIC struct ioengine_ops ioengine = { .name = "libaio", .version = FIO_IOOPS_VERSION, - .flags = FIO_ASYNCIO_SYNC_TRIM, + .flags = FIO_ASYNCIO_SYNC_TRIM | + FIO_ASYNCIO_SETS_ISSUE_TIME | + FIO_ATOMICWRITES, .init = fio_libaio_init, .post_init = fio_libaio_post_init, .prep = fio_libaio_prep, .queue = fio_libaio_queue, .commit = fio_libaio_commit, - .cancel = fio_libaio_cancel, .getevents = fio_libaio_getevents, .event = fio_libaio_event, .cleanup = fio_libaio_cleanup, 
diff --git a/engines/libblkio.c b/engines/libblkio.c new file mode 100644 index 0000000000..ee42d11c17 --- /dev/null +++ b/engines/libblkio.c @@ -0,0 +1,912 @@ +/* + * libblkio engine + * + * IO engine using libblkio to access various block I/O interfaces: + * https://gitlab.com/libblkio/libblkio + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../fio.h" +#include "../optgroup.h" +#include "../options.h" +#include "../parse.h" + +/* per-process state */ +static struct { + pthread_mutex_t mutex; + int initted_threads; + int initted_hipri_threads; + struct blkio *b; +} proc_state = { PTHREAD_MUTEX_INITIALIZER, 0, 0, NULL }; + +static void fio_blkio_proc_lock(void) { + int ret; + ret = pthread_mutex_lock(&proc_state.mutex); + assert(ret == 0); +} + +static void fio_blkio_proc_unlock(void) { + int ret; + ret = pthread_mutex_unlock(&proc_state.mutex); + assert(ret == 0); +} + +/* per-thread state */ +struct fio_blkio_data { + struct blkioq *q; + int completion_fd; /* may be -1 if not FIO_BLKIO_WAIT_MODE_EVENTFD */ + + bool has_mem_region; /* whether mem_region is valid */ + struct blkio_mem_region mem_region; /* only if allocated by libblkio */ + + struct iovec *iovecs; /* for vectored requests */ + struct blkio_completion *completions; +}; + +enum fio_blkio_wait_mode { + FIO_BLKIO_WAIT_MODE_BLOCK, + FIO_BLKIO_WAIT_MODE_EVENTFD, + FIO_BLKIO_WAIT_MODE_LOOP, +}; + +struct fio_blkio_options { + void *pad; /* option fields must not have offset 0 */ + + char *driver; + + char *path; + char *pre_connect_props; + + int num_entries; + int queue_size; + char *pre_start_props; + + unsigned int hipri; + unsigned int vectored; + unsigned int write_zeroes_on_trim; + enum fio_blkio_wait_mode wait_mode; + unsigned int force_enable_completion_eventfd; +}; + +static struct fio_option options[] = { + { + .name = "libblkio_driver", + .lname = "libblkio driver name", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct 
fio_blkio_options, driver), + .help = "Name of the driver to be used by libblkio", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_LIBBLKIO, + }, + { + .name = "libblkio_path", + .lname = "libblkio \"path\" property", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct fio_blkio_options, path), + .help = "Value to set the \"path\" property to", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_LIBBLKIO, + }, + { + .name = "libblkio_pre_connect_props", + .lname = "Additional properties to be set before blkio_connect()", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct fio_blkio_options, pre_connect_props), + .help = "", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_LIBBLKIO, + }, + { + .name = "libblkio_num_entries", + .lname = "libblkio \"num-entries\" property", + .type = FIO_OPT_INT, + .off1 = offsetof(struct fio_blkio_options, num_entries), + .help = "Value to set the \"num-entries\" property to", + .minval = 1, + .interval = 1, + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_LIBBLKIO, + }, + { + .name = "libblkio_queue_size", + .lname = "libblkio \"queue-size\" property", + .type = FIO_OPT_INT, + .off1 = offsetof(struct fio_blkio_options, queue_size), + .help = "Value to set the \"queue-size\" property to", + .minval = 1, + .interval = 1, + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_LIBBLKIO, + }, + { + .name = "libblkio_pre_start_props", + .lname = "Additional properties to be set before blkio_start()", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct fio_blkio_options, pre_start_props), + .help = "", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_LIBBLKIO, + }, + { + .name = "hipri", + .lname = "Use poll queues", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct fio_blkio_options, hipri), + .help = "Use poll queues", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_LIBBLKIO, + }, + { + .name = "libblkio_vectored", + .lname = "Use blkioq_{readv,writev}()", + .type = FIO_OPT_STR_SET, + .off1 = 
offsetof(struct fio_blkio_options, vectored), + .help = "Use blkioq_{readv,writev}() instead of blkioq_{read,write}()", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_LIBBLKIO, + }, + { + .name = "libblkio_write_zeroes_on_trim", + .lname = "Use blkioq_write_zeroes() for TRIM", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct fio_blkio_options, + write_zeroes_on_trim), + .help = "Use blkioq_write_zeroes() for TRIM instead of blkioq_discard()", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_LIBBLKIO, + }, + { + .name = "libblkio_wait_mode", + .lname = "How to wait for completions", + .type = FIO_OPT_STR, + .off1 = offsetof(struct fio_blkio_options, wait_mode), + .help = "How to wait for completions", + .def = "block", + .posval = { + { .ival = "block", + .oval = FIO_BLKIO_WAIT_MODE_BLOCK, + .help = "Blocking blkioq_do_io()", + }, + { .ival = "eventfd", + .oval = FIO_BLKIO_WAIT_MODE_EVENTFD, + .help = "Blocking read() on the completion eventfd", + }, + { .ival = "loop", + .oval = FIO_BLKIO_WAIT_MODE_LOOP, + .help = "Busy loop with non-blocking blkioq_do_io()", + }, + }, + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_LIBBLKIO, + }, + { + .name = "libblkio_force_enable_completion_eventfd", + .lname = "Force enable the completion eventfd, even if unused", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct fio_blkio_options, + force_enable_completion_eventfd), + .help = "This can impact performance", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_LIBBLKIO, + }, + { + .name = NULL, + }, +}; + +static int fio_blkio_set_props_from_str(struct blkio *b, const char *opt_name, + const char *str) { + int ret = 0; + char *new_str, *name, *value; + + if (!str) + return 0; + + /* iteration can mutate string, so copy it */ + new_str = strdup(str); + if (!new_str) { + log_err("fio: strdup() failed\n"); + return 1; + } + + /* iterate over property name-value pairs */ + while ((name = get_next_str(&new_str))) { + /* split into property name and value */ + 
value = strchr(name, '='); + if (!value) { + log_err("fio: missing '=' in option %s\n", opt_name); + ret = 1; + break; + } + + *value = '\0'; + ++value; + + /* strip whitespace from property name */ + strip_blank_front(&name); + strip_blank_end(name); + + if (name[0] == '\0') { + log_err("fio: empty property name in option %s\n", + opt_name); + ret = 1; + break; + } + + /* strip whitespace from property value */ + strip_blank_front(&value); + strip_blank_end(value); + + /* set property */ + if (blkio_set_str(b, name, value) != 0) { + log_err("fio: error setting property '%s' to '%s': %s\n", + name, value, blkio_get_error_msg()); + ret = 1; + break; + } + } + + free(new_str); + return ret; +} + +/* + * Log the failure of a libblkio function. + * + * `(void)func` is to ensure `func` exists and prevent typos + */ +#define fio_blkio_log_err(func) \ + ({ \ + (void)func; \ + log_err("fio: %s() failed: %s\n", #func, \ + blkio_get_error_msg()); \ + }) + +static bool possibly_null_strs_equal(const char *a, const char *b) +{ + return (!a && !b) || (a && b && strcmp(a, b) == 0); +} + +/* + * Returns the total number of subjobs using the 'libblkio' ioengine and setting + * the 'thread' option in the entire workload that have the given value for the + * 'hipri' option. 
+ */ +static int total_threaded_subjobs(bool hipri) +{ + int count = 0; + + for_each_td(td) { + const struct fio_blkio_options *options = td->eo; + if (strcmp(td->o.ioengine, "libblkio") == 0 && + td->o.use_thread && (bool)options->hipri == hipri) + ++count; + } end_for_each(); + + return count; +} + +static struct { + bool set_up; + bool direct; + struct fio_blkio_options opts; +} first_threaded_subjob = { 0 }; + +static void fio_blkio_log_opt_compat_err(const char *option_name) +{ + log_err("fio: jobs using engine libblkio and sharing a process must agree on the %s option\n", + option_name); +} + +/* + * If td represents a subjob with option 'thread', check if its options are + * compatible with those of other threaded subjobs that were already set up. + */ +static int fio_blkio_check_opt_compat(struct thread_data *td) +{ + const struct fio_blkio_options *options = td->eo, *prev_options; + + if (!td->o.use_thread) + return 0; /* subjob doesn't use 'thread' */ + + if (!first_threaded_subjob.set_up) { + /* first subjob using 'thread', store options for later */ + first_threaded_subjob.set_up = true; + first_threaded_subjob.direct = td->o.odirect; + first_threaded_subjob.opts = *options; + return 0; + } + + /* not first subjob using 'thread', check option compatibility */ + prev_options = &first_threaded_subjob.opts; + + if (td->o.odirect != first_threaded_subjob.direct) { + fio_blkio_log_opt_compat_err("direct/buffered"); + return 1; + } + + if (strcmp(options->driver, prev_options->driver) != 0) { + fio_blkio_log_opt_compat_err("libblkio_driver"); + return 1; + } + + if (!possibly_null_strs_equal(options->path, prev_options->path)) { + fio_blkio_log_opt_compat_err("libblkio_path"); + return 1; + } + + if (!possibly_null_strs_equal(options->pre_connect_props, + prev_options->pre_connect_props)) { + fio_blkio_log_opt_compat_err("libblkio_pre_connect_props"); + return 1; + } + + if (options->num_entries != prev_options->num_entries) { + 
fio_blkio_log_opt_compat_err("libblkio_num_entries"); + return 1; + } + + if (options->queue_size != prev_options->queue_size) { + fio_blkio_log_opt_compat_err("libblkio_queue_size"); + return 1; + } + + if (!possibly_null_strs_equal(options->pre_start_props, + prev_options->pre_start_props)) { + fio_blkio_log_opt_compat_err("libblkio_pre_start_props"); + return 1; + } + + return 0; +} + +static int fio_blkio_create_and_connect(struct thread_data *td, + struct blkio **out_blkio) +{ + const struct fio_blkio_options *options = td->eo; + struct blkio *b; + int ret; + + if (!options->driver) { + log_err("fio: engine libblkio requires option libblkio_driver to be set\n"); + return 1; + } + + if (blkio_create(options->driver, &b) != 0) { + fio_blkio_log_err(blkio_create); + return 1; + } + + /* don't fail if driver doesn't have a "direct" property */ + ret = blkio_set_bool(b, "direct", td->o.odirect); + if (ret != 0 && ret != -ENOENT) { + fio_blkio_log_err(blkio_set_bool); + goto err_blkio_destroy; + } + + if (blkio_set_bool(b, "read-only", read_only) != 0) { + fio_blkio_log_err(blkio_set_bool); + goto err_blkio_destroy; + } + + if (options->path) { + if (blkio_set_str(b, "path", options->path) != 0) { + fio_blkio_log_err(blkio_set_str); + goto err_blkio_destroy; + } + } + + if (fio_blkio_set_props_from_str(b, "libblkio_pre_connect_props", + options->pre_connect_props) != 0) + goto err_blkio_destroy; + + if (blkio_connect(b) != 0) { + fio_blkio_log_err(blkio_connect); + goto err_blkio_destroy; + } + + if (options->num_entries != 0) { + if (blkio_set_int(b, "num-entries", + options->num_entries) != 0) { + fio_blkio_log_err(blkio_set_int); + goto err_blkio_destroy; + } + } + + if (options->queue_size != 0) { + if (blkio_set_int(b, "queue-size", options->queue_size) != 0) { + fio_blkio_log_err(blkio_set_int); + goto err_blkio_destroy; + } + } + + if (fio_blkio_set_props_from_str(b, "libblkio_pre_start_props", + options->pre_start_props) != 0) + goto err_blkio_destroy; + + 
*out_blkio = b; + return 0; + +err_blkio_destroy: + blkio_destroy(&b); + return 1; +} + +static bool incompatible_threaded_subjob_options = false; + +/* + * This callback determines the device/file size, so it creates and connects a + * blkio instance. But it is invoked from the main thread in the original fio + * process, not from the processes in which jobs will actually run. It thus + * subsequently destroys the blkio, which is recreated in the init() callback. + */ +static int fio_blkio_setup(struct thread_data *td) +{ + const struct fio_blkio_options *options = td->eo; + struct blkio *b; + int ret = 0; + uint64_t capacity; + + assert(td->files_index == 1); + + if (fio_blkio_check_opt_compat(td) != 0) { + incompatible_threaded_subjob_options = true; + return 1; + } + + if (options->hipri && + options->wait_mode == FIO_BLKIO_WAIT_MODE_EVENTFD) { + log_err("fio: option hipri is incompatible with option libblkio_wait_mode=eventfd\n"); + return 1; + } + + if (options->hipri && options->force_enable_completion_eventfd) { + log_err("fio: option hipri is incompatible with option libblkio_force_enable_completion_eventfd\n"); + return 1; + } + + if (fio_blkio_create_and_connect(td, &b) != 0) + return 1; + + if (blkio_get_uint64(b, "capacity", &capacity) != 0) { + fio_blkio_log_err(blkio_get_uint64); + ret = 1; + goto out_blkio_destroy; + } + + td->files[0]->real_file_size = capacity; + fio_file_set_size_known(td->files[0]); + +out_blkio_destroy: + blkio_destroy(&b); + return ret; +} + +static int fio_blkio_init(struct thread_data *td) +{ + const struct fio_blkio_options *options = td->eo; + struct fio_blkio_data *data; + int flags; + + if (td->o.use_thread && incompatible_threaded_subjob_options) { + /* + * Different subjobs using option 'thread' specified + * incompatible options. We don't know which configuration + * should win, so we just fail all such subjobs. 
+ */ + return 1; + } + + /* + * Request enqueueing is fast, and it's not possible to know exactly + * when a request is submitted, so never report submission latencies. + */ + td->o.disable_slat = 1; + + data = calloc(1, sizeof(*data)); + if (!data) { + log_err("fio: calloc() failed\n"); + return 1; + } + + data->iovecs = calloc(td->o.iodepth, sizeof(data->iovecs[0])); + data->completions = calloc(td->o.iodepth, sizeof(data->completions[0])); + if (!data->iovecs || !data->completions) { + log_err("fio: calloc() failed\n"); + goto err_free; + } + + fio_blkio_proc_lock(); + + if (proc_state.initted_threads == 0) { + /* initialize per-process blkio */ + int num_queues, num_poll_queues; + + if (td->o.use_thread) { + num_queues = total_threaded_subjobs(false); + num_poll_queues = total_threaded_subjobs(true); + } else { + num_queues = options->hipri ? 0 : 1; + num_poll_queues = options->hipri ? 1 : 0; + } + + if (fio_blkio_create_and_connect(td, &proc_state.b) != 0) + goto err_unlock; + + if (blkio_set_int(proc_state.b, "num-queues", + num_queues) != 0) { + fio_blkio_log_err(blkio_set_int); + goto err_blkio_destroy; + } + + if (blkio_set_int(proc_state.b, "num-poll-queues", + num_poll_queues) != 0) { + fio_blkio_log_err(blkio_set_int); + goto err_blkio_destroy; + } + + if (blkio_start(proc_state.b) != 0) { + fio_blkio_log_err(blkio_start); + goto err_blkio_destroy; + } + } + + if (options->hipri) { + int i = proc_state.initted_hipri_threads; + data->q = blkio_get_poll_queue(proc_state.b, i); + } else { + int i = proc_state.initted_threads - + proc_state.initted_hipri_threads; + data->q = blkio_get_queue(proc_state.b, i); + } + + if (options->wait_mode == FIO_BLKIO_WAIT_MODE_EVENTFD || + options->force_enable_completion_eventfd) { + /* enable completion fd and make it blocking */ + blkioq_set_completion_fd_enabled(data->q, true); + data->completion_fd = blkioq_get_completion_fd(data->q); + + flags = fcntl(data->completion_fd, F_GETFL); + if (flags < 0) { + log_err("fio: 
fcntl(F_GETFL) failed: %s\n", + strerror(errno)); + goto err_blkio_destroy; + } + + if (fcntl(data->completion_fd, F_SETFL, + flags & ~O_NONBLOCK) != 0) { + log_err("fio: fcntl(F_SETFL) failed: %s\n", + strerror(errno)); + goto err_blkio_destroy; + } + } else { + data->completion_fd = -1; + } + + ++proc_state.initted_threads; + if (options->hipri) + ++proc_state.initted_hipri_threads; + + /* Set data last so cleanup() does nothing if init() fails. */ + td->io_ops_data = data; + + fio_blkio_proc_unlock(); + + return 0; + +err_blkio_destroy: + if (proc_state.initted_threads == 0) + blkio_destroy(&proc_state.b); +err_unlock: + if (proc_state.initted_threads == 0) + proc_state.b = NULL; + fio_blkio_proc_unlock(); +err_free: + free(data->completions); + free(data->iovecs); + free(data); + return 1; +} + +static int fio_blkio_post_init(struct thread_data *td) +{ + struct fio_blkio_data *data = td->io_ops_data; + + if (!data->has_mem_region) { + /* + * Memory was allocated by the fio core and not iomem_alloc(), + * so we need to register it as a memory region here. + * + * `td->orig_buffer_size` is computed like `len` below, but then + * fio can add some padding to it to make sure it is + * sufficiently aligned to the page size and the mem_align + * option. However, this can make it become unaligned to the + * "mem-region-alignment" property in ways that the user can't + * control, so we essentially recompute `td->orig_buffer_size` + * here but without adding that padding. 
+ */ + + unsigned long long max_block_size; + struct blkio_mem_region region; + + max_block_size = max(td->o.max_bs[DDIR_READ], + max(td->o.max_bs[DDIR_WRITE], + td->o.max_bs[DDIR_TRIM])); + + region = (struct blkio_mem_region) { + .addr = td->orig_buffer, + .len = (size_t)max_block_size * + (size_t)td->o.iodepth, + .fd = -1, + }; + + if (blkio_map_mem_region(proc_state.b, &region) != 0) { + fio_blkio_log_err(blkio_map_mem_region); + return 1; + } + } + + return 0; +} + +static void fio_blkio_cleanup(struct thread_data *td) +{ + struct fio_blkio_data *data = td->io_ops_data; + + /* + * Subjobs from different jobs can be terminated at different times, so + * this callback may be invoked for one subjob while another is still + * doing I/O. Those subjobs may share the process, so we must wait until + * the last subjob in the process wants to clean up to actually destroy + * the blkio. + */ + + if (data) { + free(data->completions); + free(data->iovecs); + free(data); + + fio_blkio_proc_lock(); + if (--proc_state.initted_threads == 0) { + blkio_destroy(&proc_state.b); + proc_state.b = NULL; + } + fio_blkio_proc_unlock(); + } +} + +#define align_up(x, y) ((((x) + (y) - 1) / (y)) * (y)) + +static int fio_blkio_iomem_alloc(struct thread_data *td, size_t size) +{ + struct fio_blkio_data *data = td->io_ops_data; + int ret; + uint64_t mem_region_alignment; + + if (blkio_get_uint64(proc_state.b, "mem-region-alignment", + &mem_region_alignment) != 0) { + fio_blkio_log_err(blkio_get_uint64); + return 1; + } + + /* round up size to satisfy mem-region-alignment */ + size = align_up(size, (size_t)mem_region_alignment); + + fio_blkio_proc_lock(); + + if (blkio_alloc_mem_region(proc_state.b, &data->mem_region, + size) != 0) { + fio_blkio_log_err(blkio_alloc_mem_region); + ret = 1; + goto out; + } + + if (blkio_map_mem_region(proc_state.b, &data->mem_region) != 0) { + fio_blkio_log_err(blkio_map_mem_region); + ret = 1; + goto out_free; + } + + td->orig_buffer = data->mem_region.addr; + 
data->has_mem_region = true; + + ret = 0; + goto out; + +out_free: + blkio_free_mem_region(proc_state.b, &data->mem_region); +out: + fio_blkio_proc_unlock(); + return ret; +} + +static void fio_blkio_iomem_free(struct thread_data *td) +{ + struct fio_blkio_data *data = td->io_ops_data; + + if (data && data->has_mem_region) { + fio_blkio_proc_lock(); + blkio_unmap_mem_region(proc_state.b, &data->mem_region); + blkio_free_mem_region(proc_state.b, &data->mem_region); + fio_blkio_proc_unlock(); + + data->has_mem_region = false; + } +} + +static int fio_blkio_open_file(struct thread_data *td, struct fio_file *f) +{ + return 0; +} + +static enum fio_q_status fio_blkio_queue(struct thread_data *td, + struct io_u *io_u) +{ + const struct fio_blkio_options *options = td->eo; + struct fio_blkio_data *data = td->io_ops_data; + + fio_ro_check(td, io_u); + + switch (io_u->ddir) { + case DDIR_READ: + if (options->vectored) { + struct iovec *iov = &data->iovecs[io_u->index]; + iov->iov_base = io_u->xfer_buf; + iov->iov_len = (size_t)io_u->xfer_buflen; + + blkioq_readv(data->q, io_u->offset, iov, 1, + io_u, 0); + } else { + blkioq_read(data->q, io_u->offset, + io_u->xfer_buf, + (size_t)io_u->xfer_buflen, io_u, 0); + } + break; + case DDIR_WRITE: + if (options->vectored) { + struct iovec *iov = &data->iovecs[io_u->index]; + iov->iov_base = io_u->xfer_buf; + iov->iov_len = (size_t)io_u->xfer_buflen; + + blkioq_writev(data->q, io_u->offset, iov, 1, + io_u, 0); + } else { + blkioq_write(data->q, io_u->offset, + io_u->xfer_buf, + (size_t)io_u->xfer_buflen, io_u, + 0); + } + break; + case DDIR_TRIM: + if (options->write_zeroes_on_trim) { + blkioq_write_zeroes(data->q, io_u->offset, + io_u->xfer_buflen, io_u, 0); + } else { + blkioq_discard(data->q, io_u->offset, + io_u->xfer_buflen, io_u, 0); + } + break; + case DDIR_SYNC: + case DDIR_DATASYNC: + blkioq_flush(data->q, io_u, 0); + break; + default: + io_u->error = ENOTSUP; + io_u_log_error(td, io_u); + return FIO_Q_COMPLETED; + } + + 
return FIO_Q_QUEUED; +} + +static int fio_blkio_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + const struct fio_blkio_options *options = td->eo; + struct fio_blkio_data *data = td->io_ops_data; + int ret, n; + uint64_t event; + + switch (options->wait_mode) { + case FIO_BLKIO_WAIT_MODE_BLOCK: + n = blkioq_do_io(data->q, data->completions, (int)min, (int)max, + NULL); + if (n < 0) { + fio_blkio_log_err(blkioq_do_io); + return -1; + } + return n; + case FIO_BLKIO_WAIT_MODE_EVENTFD: + n = blkioq_do_io(data->q, data->completions, 0, (int)max, NULL); + if (n < 0) { + fio_blkio_log_err(blkioq_do_io); + return -1; + } + while (n < (int)min) { + ret = read(data->completion_fd, &event, sizeof(event)); + if (ret != sizeof(event)) { + log_err("fio: read() on the completion fd returned %d\n", + ret); + return -1; + } + + ret = blkioq_do_io(data->q, data->completions + n, 0, + (int)max - n, NULL); + if (ret < 0) { + fio_blkio_log_err(blkioq_do_io); + return -1; + } + + n += ret; + } + return n; + case FIO_BLKIO_WAIT_MODE_LOOP: + for (n = 0; n < (int)min; ) { + ret = blkioq_do_io(data->q, data->completions + n, 0, + (int)max - n, NULL); + if (ret < 0) { + fio_blkio_log_err(blkioq_do_io); + return -1; + } + + n += ret; + } + return n; + default: + return -1; + } +} + +static struct io_u *fio_blkio_event(struct thread_data *td, int event) +{ + struct fio_blkio_data *data = td->io_ops_data; + struct blkio_completion *completion = &data->completions[event]; + struct io_u *io_u = completion->user_data; + + io_u->error = -completion->ret; + + return io_u; +} + +FIO_STATIC struct ioengine_ops ioengine = { + .name = "libblkio", + .version = FIO_IOOPS_VERSION, + .flags = FIO_DISKLESSIO | FIO_NOEXTEND | + FIO_NO_OFFLOAD | FIO_SKIPPABLE_IOMEM_ALLOC, + + .setup = fio_blkio_setup, + .init = fio_blkio_init, + .post_init = fio_blkio_post_init, + .cleanup = fio_blkio_cleanup, + + .iomem_alloc = fio_blkio_iomem_alloc, + .iomem_free = 
fio_blkio_iomem_free, + + .open_file = fio_blkio_open_file, + + .queue = fio_blkio_queue, + .getevents = fio_blkio_getevents, + .event = fio_blkio_event, + + .options = options, + .option_struct_size = sizeof(struct fio_blkio_options), +}; + +static void fio_init fio_blkio_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_blkio_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff --git a/engines/libcufile.c b/engines/libcufile.c index e575b7864d..2bedf26136 100644 --- a/engines/libcufile.c +++ b/engines/libcufile.c @@ -606,6 +606,7 @@ FIO_STATIC struct ioengine_ops ioengine = { .version = FIO_IOOPS_VERSION, .init = fio_libcufile_init, .queue = fio_libcufile_queue, + .get_file_size = generic_get_file_size, .open_file = fio_libcufile_open_file, .close_file = fio_libcufile_close_file, .iomem_alloc = fio_libcufile_iomem_alloc, diff --git a/engines/libhdfs.c b/engines/libhdfs.c index eb55c3c549..d0a2684084 100644 --- a/engines/libhdfs.c +++ b/engines/libhdfs.c @@ -27,7 +27,7 @@ struct hdfsio_data { }; struct hdfsio_options { - void *pad; /* needed because offset can't be 0 for a option defined used offsetof */ + void *pad; /* needed because offset can't be 0 for an option defined used offsetof */ char *host; char *directory; unsigned int port; @@ -315,8 +315,7 @@ static int fio_hdfsio_setup(struct thread_data *td) uint64_t file_size, total_file_size; if (!td->io_ops_data) { - hd = malloc(sizeof(*hd)); - memset(hd, 0, sizeof(*hd)); + hd = calloc(1, sizeof(*hd)); hd->curr_file_id = -1; diff --git a/engines/libiscsi.c b/engines/libiscsi.c index c97b5709ae..37c9b55a91 100644 --- a/engines/libiscsi.c +++ b/engines/libiscsi.c @@ -68,8 +68,7 @@ static int fio_iscsi_setup_lun(struct iscsi_info *iscsi_info, struct scsi_readcapacity16 *rc16 = NULL; int ret = 0; - iscsi_lun = malloc(sizeof(struct iscsi_lun)); - memset(iscsi_lun, 0, sizeof(struct iscsi_lun)); + iscsi_lun = calloc(1, sizeof(struct iscsi_lun)); iscsi_lun->iscsi_info = 
iscsi_info; diff --git a/engines/librpma_apm.c b/engines/librpma_apm.c deleted file mode 100644 index ffa3769d33..0000000000 --- a/engines/librpma_apm.c +++ /dev/null @@ -1,256 +0,0 @@ -/* -* librpma_apm: IO engine that uses PMDK librpma to read and write data, - * based on Appliance Persistency Method - * - * Copyright 2020-2021, Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License, - * version 2 as published by the Free Software Foundation.. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include "librpma_fio.h" - -/* client side implementation */ - -static inline int client_io_flush(struct thread_data *td, - struct io_u *first_io_u, struct io_u *last_io_u, - unsigned long long int len); - -static int client_get_io_u_index(struct rpma_completion *cmpl, - unsigned int *io_u_index); - -static int client_init(struct thread_data *td) -{ - struct librpma_fio_client_data *ccd; - unsigned int sq_size; - uint32_t cq_size; - struct rpma_conn_cfg *cfg = NULL; - struct rpma_peer_cfg *pcfg = NULL; - int ret; - - /* not supported readwrite = trim / randtrim / trimwrite */ - if (td_trim(td)) { - td_verror(td, EINVAL, "Not supported mode."); - return -1; - } - - /* - * Calculate the required queue sizes where: - * - the send queue (SQ) has to be big enough to accommodate - * all io_us (WRITEs) and all flush requests (FLUSHes) - * - the completion queue (CQ) has to be big enough to accommodate all - * success and error completions (cq_size = sq_size) - */ - if (td_random(td) || td_rw(td)) { - /* - * sq_size = max(rand_read_sq_size, rand_write_sq_size) - * where rand_read_sq_size < rand_write_sq_size because read - * does not require flush afterwards - * rand_write_sq_size 
= N * (WRITE + FLUSH) - * - * Note: rw is no different from random write since having - * interleaved reads with writes in extreme forces you to flush - * as often as when the writes are random. - */ - sq_size = 2 * td->o.iodepth; - } else if (td_write(td)) { - /* sequential TD_DDIR_WRITE only */ - if (td->o.sync_io) { - sq_size = 2; /* WRITE + FLUSH */ - } else { - /* - * N * WRITE + B * FLUSH where: - * - B == ceil(iodepth / iodepth_batch) - * which is the number of batches for N writes - */ - sq_size = td->o.iodepth + LIBRPMA_FIO_CEIL(td->o.iodepth, - td->o.iodepth_batch); - } - } else { - /* TD_DDIR_READ only */ - if (td->o.sync_io) { - sq_size = 1; /* READ */ - } else { - sq_size = td->o.iodepth; /* N x READ */ - } - } - cq_size = sq_size; - - /* create a connection configuration object */ - if ((ret = rpma_conn_cfg_new(&cfg))) { - librpma_td_verror(td, ret, "rpma_conn_cfg_new"); - return -1; - } - - /* apply queue sizes */ - if ((ret = rpma_conn_cfg_set_sq_size(cfg, sq_size))) { - librpma_td_verror(td, ret, "rpma_conn_cfg_set_sq_size"); - goto err_cfg_delete; - } - if ((ret = rpma_conn_cfg_set_cq_size(cfg, cq_size))) { - librpma_td_verror(td, ret, "rpma_conn_cfg_set_cq_size"); - goto err_cfg_delete; - } - - if (librpma_fio_client_init(td, cfg)) - goto err_cfg_delete; - - ccd = td->io_ops_data; - - if (ccd->server_mr_flush_type == RPMA_FLUSH_TYPE_PERSISTENT) { - if (!ccd->ws->direct_write_to_pmem) { - if (td->thread_number == 1) - log_err( - "Fio librpma engine will not work until the Direct Write to PMem on the server side is possible (direct_write_to_pmem)\n"); - goto err_cleanup_common; - } - - /* configure peer's direct write to pmem support */ - if ((ret = rpma_peer_cfg_new(&pcfg))) { - librpma_td_verror(td, ret, "rpma_peer_cfg_new"); - goto err_cleanup_common; - } - - if ((ret = rpma_peer_cfg_set_direct_write_to_pmem(pcfg, true))) { - librpma_td_verror(td, ret, - "rpma_peer_cfg_set_direct_write_to_pmem"); - (void) rpma_peer_cfg_delete(&pcfg); - goto 
err_cleanup_common; - } - - if ((ret = rpma_conn_apply_remote_peer_cfg(ccd->conn, pcfg))) { - librpma_td_verror(td, ret, - "rpma_conn_apply_remote_peer_cfg"); - (void) rpma_peer_cfg_delete(&pcfg); - goto err_cleanup_common; - } - - (void) rpma_peer_cfg_delete(&pcfg); - } else if (td->thread_number == 1) { - /* XXX log_info mixes with the JSON output */ - log_err( - "Note: Direct Write to PMem is not supported by default nor required if you use DRAM instead of PMem on the server side (direct_write_to_pmem).\n" - "Remember that flushing to DRAM does not make your data persistent and may be used only for experimental purposes.\n"); - } - - if ((ret = rpma_conn_cfg_delete(&cfg))) { - librpma_td_verror(td, ret, "rpma_conn_cfg_delete"); - /* non fatal error - continue */ - } - - ccd->flush = client_io_flush; - ccd->get_io_u_index = client_get_io_u_index; - - return 0; - -err_cleanup_common: - librpma_fio_client_cleanup(td); - -err_cfg_delete: - (void) rpma_conn_cfg_delete(&cfg); - - return -1; -} - -static void client_cleanup(struct thread_data *td) -{ - struct librpma_fio_client_data *ccd = td->io_ops_data; - - if (ccd == NULL) - return; - - free(ccd->client_data); - - librpma_fio_client_cleanup(td); -} - -static inline int client_io_flush(struct thread_data *td, - struct io_u *first_io_u, struct io_u *last_io_u, - unsigned long long int len) -{ - struct librpma_fio_client_data *ccd = td->io_ops_data; - size_t dst_offset = first_io_u->offset; - int ret; - - if ((ret = rpma_flush(ccd->conn, ccd->server_mr, dst_offset, len, - ccd->server_mr_flush_type, RPMA_F_COMPLETION_ALWAYS, - (void *)(uintptr_t)last_io_u->index))) { - librpma_td_verror(td, ret, "rpma_flush"); - return -1; - } - - return 0; -} - -static int client_get_io_u_index(struct rpma_completion *cmpl, - unsigned int *io_u_index) -{ - memcpy(io_u_index, &cmpl->op_context, sizeof(*io_u_index)); - - return 1; -} - -FIO_STATIC struct ioengine_ops ioengine_client = { - .name = "librpma_apm_client", - .version = 
FIO_IOOPS_VERSION, - .init = client_init, - .post_init = librpma_fio_client_post_init, - .get_file_size = librpma_fio_client_get_file_size, - .open_file = librpma_fio_file_nop, - .queue = librpma_fio_client_queue, - .commit = librpma_fio_client_commit, - .getevents = librpma_fio_client_getevents, - .event = librpma_fio_client_event, - .errdetails = librpma_fio_client_errdetails, - .close_file = librpma_fio_file_nop, - .cleanup = client_cleanup, - .flags = FIO_DISKLESSIO, - .options = librpma_fio_options, - .option_struct_size = sizeof(struct librpma_fio_options_values), -}; - -/* server side implementation */ - -static int server_open_file(struct thread_data *td, struct fio_file *f) -{ - return librpma_fio_server_open_file(td, f, NULL); -} - -static enum fio_q_status server_queue(struct thread_data *td, struct io_u *io_u) -{ - return FIO_Q_COMPLETED; -} - -FIO_STATIC struct ioengine_ops ioengine_server = { - .name = "librpma_apm_server", - .version = FIO_IOOPS_VERSION, - .init = librpma_fio_server_init, - .open_file = server_open_file, - .close_file = librpma_fio_server_close_file, - .queue = server_queue, - .invalidate = librpma_fio_file_nop, - .cleanup = librpma_fio_server_cleanup, - .flags = FIO_SYNCIO, - .options = librpma_fio_options, - .option_struct_size = sizeof(struct librpma_fio_options_values), -}; - -/* register both engines */ - -static void fio_init fio_librpma_apm_register(void) -{ - register_ioengine(&ioengine_client); - register_ioengine(&ioengine_server); -} - -static void fio_exit fio_librpma_apm_unregister(void) -{ - unregister_ioengine(&ioengine_client); - unregister_ioengine(&ioengine_server); -} diff --git a/engines/librpma_fio.c b/engines/librpma_fio.c deleted file mode 100644 index 3d605ed6c3..0000000000 --- a/engines/librpma_fio.c +++ /dev/null @@ -1,1062 +0,0 @@ -/* - * librpma_fio: librpma_apm and librpma_gpspm engines' common part. 
- * - * Copyright 2021, Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License, - * version 2 as published by the Free Software Foundation.. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include "librpma_fio.h" - -#include <libpmem.h> - -struct fio_option librpma_fio_options[] = { - { - .name = "serverip", - .lname = "rpma_server_ip", - .type = FIO_OPT_STR_STORE, - .off1 = offsetof(struct librpma_fio_options_values, server_ip), - .help = "IP address the server is listening on", - .def = "", - .category = FIO_OPT_C_ENGINE, - .group = FIO_OPT_G_LIBRPMA, - }, - { - .name = "port", - .lname = "rpma_server port", - .type = FIO_OPT_STR_STORE, - .off1 = offsetof(struct librpma_fio_options_values, port), - .help = "port the server is listening on", - .def = "7204", - .category = FIO_OPT_C_ENGINE, - .group = FIO_OPT_G_LIBRPMA, - }, - { - .name = "direct_write_to_pmem", - .lname = "Direct Write to PMem (via RDMA) from the remote host is possible", - .type = FIO_OPT_BOOL, - .off1 = offsetof(struct librpma_fio_options_values, - direct_write_to_pmem), - .help = "Set to true ONLY when Direct Write to PMem from the remote host is possible (https://pmem.io/rpma/documentation/basic-direct-write-to-pmem.html)", - .def = "", - .category = FIO_OPT_C_ENGINE, - .group = FIO_OPT_G_LIBRPMA, - }, - { - .name = "busy_wait_polling", - .lname = "Set to 0 to wait for completion instead of busy-wait polling completion.", - .type = FIO_OPT_BOOL, - .off1 = offsetof(struct librpma_fio_options_values, - busy_wait_polling), - .help = "Set to false if you want to reduce CPU usage", - .def = "1", - .category = FIO_OPT_C_ENGINE, - .group = FIO_OPT_G_LIBRPMA, - }, - { - .name = NULL, - }, -}; - -int 
librpma_fio_td_port(const char *port_base_str, struct thread_data *td, - char *port_out) -{ - unsigned long int port_ul = strtoul(port_base_str, NULL, 10); - unsigned int port_new; - - port_out[0] = '\0'; - - if (port_ul == ULONG_MAX) { - td_verror(td, errno, "strtoul"); - return -1; - } - port_ul += td->thread_number - 1; - if (port_ul >= UINT_MAX) { - log_err("[%u] port number (%lu) bigger than UINT_MAX\n", - td->thread_number, port_ul); - return -1; - } - - port_new = port_ul; - snprintf(port_out, LIBRPMA_FIO_PORT_STR_LEN_MAX - 1, "%u", port_new); - - return 0; -} - -char *librpma_fio_allocate_dram(struct thread_data *td, size_t size, - struct librpma_fio_mem *mem) -{ - char *mem_ptr = NULL; - int ret; - - if ((ret = posix_memalign((void **)&mem_ptr, page_size, size))) { - log_err("fio: posix_memalign() failed\n"); - td_verror(td, ret, "posix_memalign"); - return NULL; - } - - mem->mem_ptr = mem_ptr; - mem->size_mmap = 0; - - return mem_ptr; -} - -char *librpma_fio_allocate_pmem(struct thread_data *td, const char *filename, - size_t size, struct librpma_fio_mem *mem) -{ - size_t size_mmap = 0; - char *mem_ptr = NULL; - int is_pmem = 0; - size_t ws_offset; - - if (size % page_size) { - log_err("fio: size (%zu) is not aligned to page size (%zu)\n", - size, page_size); - return NULL; - } - - ws_offset = (td->thread_number - 1) * size; - - if (!filename) { - log_err("fio: filename is not set\n"); - return NULL; - } - - /* map the file */ - mem_ptr = pmem_map_file(filename, 0 /* len */, 0 /* flags */, - 0 /* mode */, &size_mmap, &is_pmem); - if (mem_ptr == NULL) { - log_err("fio: pmem_map_file(%s) failed\n", filename); - /* pmem_map_file() sets errno on failure */ - td_verror(td, errno, "pmem_map_file"); - return NULL; - } - - /* pmem is expected */ - if (!is_pmem) { - log_err("fio: %s is not located in persistent memory\n", - filename); - goto err_unmap; - } - - /* check size of allocated persistent memory */ - if (size_mmap < ws_offset + size) { - log_err( - "fio: 
%s is too small to handle so many threads (%zu < %zu)\n", - filename, size_mmap, ws_offset + size); - goto err_unmap; - } - - log_info("fio: size of memory mapped from the file %s: %zu\n", - filename, size_mmap); - - mem->mem_ptr = mem_ptr; - mem->size_mmap = size_mmap; - - return mem_ptr + ws_offset; - -err_unmap: - (void) pmem_unmap(mem_ptr, size_mmap); - return NULL; -} - -void librpma_fio_free(struct librpma_fio_mem *mem) -{ - if (mem->size_mmap) - (void) pmem_unmap(mem->mem_ptr, mem->size_mmap); - else - free(mem->mem_ptr); -} - -#define LIBRPMA_FIO_RETRY_MAX_NO 10 -#define LIBRPMA_FIO_RETRY_DELAY_S 5 - -int librpma_fio_client_init(struct thread_data *td, - struct rpma_conn_cfg *cfg) -{ - struct librpma_fio_client_data *ccd; - struct librpma_fio_options_values *o = td->eo; - struct ibv_context *dev = NULL; - char port_td[LIBRPMA_FIO_PORT_STR_LEN_MAX]; - struct rpma_conn_req *req = NULL; - enum rpma_conn_event event; - struct rpma_conn_private_data pdata; - enum rpma_log_level log_level_aux = RPMA_LOG_LEVEL_WARNING; - int remote_flush_type; - int retry; - int ret; - - /* --debug=net sets RPMA_LOG_THRESHOLD_AUX to RPMA_LOG_LEVEL_INFO */ -#ifdef FIO_INC_DEBUG - if ((1UL << FD_NET) & fio_debug) - log_level_aux = RPMA_LOG_LEVEL_INFO; -#endif - - /* configure logging thresholds to see more details */ - rpma_log_set_threshold(RPMA_LOG_THRESHOLD, RPMA_LOG_LEVEL_INFO); - rpma_log_set_threshold(RPMA_LOG_THRESHOLD_AUX, log_level_aux); - - /* obtain an IBV context for a remote IP address */ - if ((ret = rpma_utils_get_ibv_context(o->server_ip, - RPMA_UTIL_IBV_CONTEXT_REMOTE, &dev))) { - librpma_td_verror(td, ret, "rpma_utils_get_ibv_context"); - return -1; - } - - /* allocate client's data */ - ccd = calloc(1, sizeof(*ccd)); - if (ccd == NULL) { - td_verror(td, errno, "calloc"); - return -1; - } - - /* allocate all in-memory queues */ - ccd->io_us_queued = calloc(td->o.iodepth, sizeof(*ccd->io_us_queued)); - if (ccd->io_us_queued == NULL) { - td_verror(td, errno, 
"calloc"); - goto err_free_ccd; - } - - ccd->io_us_flight = calloc(td->o.iodepth, sizeof(*ccd->io_us_flight)); - if (ccd->io_us_flight == NULL) { - td_verror(td, errno, "calloc"); - goto err_free_io_u_queues; - } - - ccd->io_us_completed = calloc(td->o.iodepth, - sizeof(*ccd->io_us_completed)); - if (ccd->io_us_completed == NULL) { - td_verror(td, errno, "calloc"); - goto err_free_io_u_queues; - } - - /* create a new peer object */ - if ((ret = rpma_peer_new(dev, &ccd->peer))) { - librpma_td_verror(td, ret, "rpma_peer_new"); - goto err_free_io_u_queues; - } - - /* create a connection request */ - if (librpma_fio_td_port(o->port, td, port_td)) - goto err_peer_delete; - - for (retry = 0; retry < LIBRPMA_FIO_RETRY_MAX_NO; retry++) { - if ((ret = rpma_conn_req_new(ccd->peer, o->server_ip, port_td, - cfg, &req))) { - librpma_td_verror(td, ret, "rpma_conn_req_new"); - goto err_peer_delete; - } - - /* - * Connect the connection request - * and obtain the connection object. - */ - if ((ret = rpma_conn_req_connect(&req, NULL, &ccd->conn))) { - librpma_td_verror(td, ret, "rpma_conn_req_connect"); - goto err_req_delete; - } - - /* wait for the connection to establish */ - if ((ret = rpma_conn_next_event(ccd->conn, &event))) { - librpma_td_verror(td, ret, "rpma_conn_next_event"); - goto err_conn_delete; - } else if (event == RPMA_CONN_ESTABLISHED) { - break; - } else if (event == RPMA_CONN_REJECTED) { - (void) rpma_conn_disconnect(ccd->conn); - (void) rpma_conn_delete(&ccd->conn); - if (retry < LIBRPMA_FIO_RETRY_MAX_NO - 1) { - log_err("Thread [%d]: Retrying (#%i) ...\n", - td->thread_number, retry + 1); - sleep(LIBRPMA_FIO_RETRY_DELAY_S); - } else { - log_err( - "Thread [%d]: The maximum number of retries exceeded. 
Closing.\n", - td->thread_number); - } - } else { - log_err( - "rpma_conn_next_event returned an unexptected event: (%s != RPMA_CONN_ESTABLISHED)\n", - rpma_utils_conn_event_2str(event)); - goto err_conn_delete; - } - } - - if (retry > 0) - log_err("Thread [%d]: Connected after retry #%i\n", - td->thread_number, retry); - - if (ccd->conn == NULL) - goto err_peer_delete; - - /* get the connection's private data sent from the server */ - if ((ret = rpma_conn_get_private_data(ccd->conn, &pdata))) { - librpma_td_verror(td, ret, "rpma_conn_get_private_data"); - goto err_conn_delete; - } - - /* get the server's workspace representation */ - ccd->ws = pdata.ptr; - - /* create the server's memory representation */ - if ((ret = rpma_mr_remote_from_descriptor(&ccd->ws->descriptor[0], - ccd->ws->mr_desc_size, &ccd->server_mr))) { - librpma_td_verror(td, ret, "rpma_mr_remote_from_descriptor"); - goto err_conn_delete; - } - - /* get the total size of the shared server memory */ - if ((ret = rpma_mr_remote_get_size(ccd->server_mr, &ccd->ws_size))) { - librpma_td_verror(td, ret, "rpma_mr_remote_get_size"); - goto err_conn_delete; - } - - /* get flush type of the remote node */ - if ((ret = rpma_mr_remote_get_flush_type(ccd->server_mr, - &remote_flush_type))) { - librpma_td_verror(td, ret, "rpma_mr_remote_get_flush_type"); - goto err_conn_delete; - } - - ccd->server_mr_flush_type = - (remote_flush_type & RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT) ? - RPMA_FLUSH_TYPE_PERSISTENT : RPMA_FLUSH_TYPE_VISIBILITY; - - /* - * Assure an io_us buffer allocation is page-size-aligned which is required - * to register for RDMA. User-provided value is intentionally ignored. 
- */ - td->o.mem_align = page_size; - - td->io_ops_data = ccd; - - return 0; - -err_conn_delete: - (void) rpma_conn_disconnect(ccd->conn); - (void) rpma_conn_delete(&ccd->conn); - -err_req_delete: - (void) rpma_conn_req_delete(&req); - -err_peer_delete: - (void) rpma_peer_delete(&ccd->peer); - -err_free_io_u_queues: - free(ccd->io_us_queued); - free(ccd->io_us_flight); - free(ccd->io_us_completed); - -err_free_ccd: - free(ccd); - - return -1; -} - -void librpma_fio_client_cleanup(struct thread_data *td) -{ - struct librpma_fio_client_data *ccd = td->io_ops_data; - enum rpma_conn_event ev; - int ret; - - if (ccd == NULL) - return; - - /* delete the iou's memory registration */ - if ((ret = rpma_mr_dereg(&ccd->orig_mr))) - librpma_td_verror(td, ret, "rpma_mr_dereg"); - /* delete the iou's memory registration */ - if ((ret = rpma_mr_remote_delete(&ccd->server_mr))) - librpma_td_verror(td, ret, "rpma_mr_remote_delete"); - /* initiate disconnection */ - if ((ret = rpma_conn_disconnect(ccd->conn))) - librpma_td_verror(td, ret, "rpma_conn_disconnect"); - /* wait for disconnection to end up */ - if ((ret = rpma_conn_next_event(ccd->conn, &ev))) { - librpma_td_verror(td, ret, "rpma_conn_next_event"); - } else if (ev != RPMA_CONN_CLOSED) { - log_err( - "client_cleanup received an unexpected event (%s != RPMA_CONN_CLOSED)\n", - rpma_utils_conn_event_2str(ev)); - } - /* delete the connection */ - if ((ret = rpma_conn_delete(&ccd->conn))) - librpma_td_verror(td, ret, "rpma_conn_delete"); - /* delete the peer */ - if ((ret = rpma_peer_delete(&ccd->peer))) - librpma_td_verror(td, ret, "rpma_peer_delete"); - /* free the software queues */ - free(ccd->io_us_queued); - free(ccd->io_us_flight); - free(ccd->io_us_completed); - free(ccd); - td->io_ops_data = NULL; /* zero ccd */ -} - -int librpma_fio_file_nop(struct thread_data *td, struct fio_file *f) -{ - /* NOP */ - return 0; -} - -int librpma_fio_client_post_init(struct thread_data *td) -{ - struct librpma_fio_client_data *ccd = 
td->io_ops_data; - size_t io_us_size; - int ret; - - /* - * td->orig_buffer is not aligned. The engine requires aligned io_us - * so FIO alignes up the address using the formula below. - */ - ccd->orig_buffer_aligned = PTR_ALIGN(td->orig_buffer, page_mask) + - td->o.mem_align; - - /* - * td->orig_buffer_size beside the space really consumed by io_us - * has paddings which can be omitted for the memory registration. - */ - io_us_size = (unsigned long long)td_max_bs(td) * - (unsigned long long)td->o.iodepth; - - if ((ret = rpma_mr_reg(ccd->peer, ccd->orig_buffer_aligned, io_us_size, - RPMA_MR_USAGE_READ_DST | RPMA_MR_USAGE_READ_SRC | - RPMA_MR_USAGE_WRITE_DST | RPMA_MR_USAGE_WRITE_SRC | - RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT, &ccd->orig_mr))) - librpma_td_verror(td, ret, "rpma_mr_reg"); - return ret; -} - -int librpma_fio_client_get_file_size(struct thread_data *td, - struct fio_file *f) -{ - struct librpma_fio_client_data *ccd = td->io_ops_data; - - f->real_file_size = ccd->ws_size; - fio_file_set_size_known(f); - - return 0; -} - -static enum fio_q_status client_queue_sync(struct thread_data *td, - struct io_u *io_u) -{ - struct librpma_fio_client_data *ccd = td->io_ops_data; - struct rpma_completion cmpl; - unsigned io_u_index; - int ret; - - /* execute io_u */ - if (io_u->ddir == DDIR_READ) { - /* post an RDMA read operation */ - if (librpma_fio_client_io_read(td, io_u, - RPMA_F_COMPLETION_ALWAYS)) - goto err; - } else if (io_u->ddir == DDIR_WRITE) { - /* post an RDMA write operation */ - if (librpma_fio_client_io_write(td, io_u)) - goto err; - if (ccd->flush(td, io_u, io_u, io_u->xfer_buflen)) - goto err; - } else { - log_err("unsupported IO mode: %s\n", io_ddir_name(io_u->ddir)); - goto err; - } - - do { - /* get a completion */ - ret = rpma_conn_completion_get(ccd->conn, &cmpl); - if (ret == RPMA_E_NO_COMPLETION) { - /* lack of completion is not an error */ - continue; - } else if (ret != 0) { - /* an error occurred */ - librpma_td_verror(td, ret, 
"rpma_conn_completion_get"); - goto err; - } - - /* if io_us has completed with an error */ - if (cmpl.op_status != IBV_WC_SUCCESS) - goto err; - - if (cmpl.op == RPMA_OP_SEND) - ++ccd->op_send_completed; - else { - if (cmpl.op == RPMA_OP_RECV) - ++ccd->op_recv_completed; - - break; - } - } while (1); - - if (ccd->get_io_u_index(&cmpl, &io_u_index) != 1) - goto err; - - if (io_u->index != io_u_index) { - log_err( - "no matching io_u for received completion found (io_u_index=%u)\n", - io_u_index); - goto err; - } - - /* make sure all SENDs are completed before exit - clean up SQ */ - if (librpma_fio_client_io_complete_all_sends(td)) - goto err; - - return FIO_Q_COMPLETED; - -err: - io_u->error = -1; - return FIO_Q_COMPLETED; -} - -enum fio_q_status librpma_fio_client_queue(struct thread_data *td, - struct io_u *io_u) -{ - struct librpma_fio_client_data *ccd = td->io_ops_data; - - if (ccd->io_u_queued_nr == (int)td->o.iodepth) - return FIO_Q_BUSY; - - if (td->o.sync_io) - return client_queue_sync(td, io_u); - - /* io_u -> queued[] */ - ccd->io_us_queued[ccd->io_u_queued_nr] = io_u; - ccd->io_u_queued_nr++; - - return FIO_Q_QUEUED; -} - -int librpma_fio_client_commit(struct thread_data *td) -{ - struct librpma_fio_client_data *ccd = td->io_ops_data; - int flags = RPMA_F_COMPLETION_ON_ERROR; - struct timespec now; - bool fill_time; - int i; - struct io_u *flush_first_io_u = NULL; - unsigned long long int flush_len = 0; - - if (!ccd->io_us_queued) - return -1; - - /* execute all io_us from queued[] */ - for (i = 0; i < ccd->io_u_queued_nr; i++) { - struct io_u *io_u = ccd->io_us_queued[i]; - - if (io_u->ddir == DDIR_READ) { - if (i + 1 == ccd->io_u_queued_nr || - ccd->io_us_queued[i + 1]->ddir == DDIR_WRITE) - flags = RPMA_F_COMPLETION_ALWAYS; - /* post an RDMA read operation */ - if (librpma_fio_client_io_read(td, io_u, flags)) - return -1; - } else if (io_u->ddir == DDIR_WRITE) { - /* post an RDMA write operation */ - if (librpma_fio_client_io_write(td, io_u)) - 
return -1; - - /* cache the first io_u in the sequence */ - if (flush_first_io_u == NULL) - flush_first_io_u = io_u; - - /* - * the flush length is the sum of all io_u's creating - * the sequence - */ - flush_len += io_u->xfer_buflen; - - /* - * if io_u's are random the rpma_flush is required - * after each one of them - */ - if (!td_random(td)) { - /* - * When the io_u's are sequential and - * the current io_u is not the last one and - * the next one is also a write operation - * the flush can be postponed by one io_u and - * cover all of them which build a continuous - * sequence. - */ - if ((i + 1 < ccd->io_u_queued_nr) && - (ccd->io_us_queued[i + 1]->ddir == DDIR_WRITE)) - continue; - } - - /* flush all writes which build a continuous sequence */ - if (ccd->flush(td, flush_first_io_u, io_u, flush_len)) - return -1; - - /* - * reset the flush parameters in preparation for - * the next one - */ - flush_first_io_u = NULL; - flush_len = 0; - } else { - log_err("unsupported IO mode: %s\n", - io_ddir_name(io_u->ddir)); - return -1; - } - } - - if ((fill_time = fio_fill_issue_time(td))) - fio_gettime(&now, NULL); - - /* move executed io_us from queued[] to flight[] */ - for (i = 0; i < ccd->io_u_queued_nr; i++) { - struct io_u *io_u = ccd->io_us_queued[i]; - - /* FIO does not do this if the engine is asynchronous */ - if (fill_time) - memcpy(&io_u->issue_time, &now, sizeof(now)); - - /* move executed io_us from queued[] to flight[] */ - ccd->io_us_flight[ccd->io_u_flight_nr] = io_u; - ccd->io_u_flight_nr++; - - /* - * FIO says: - * If an engine has the commit hook - * it has to call io_u_queued() itself. - */ - io_u_queued(td, io_u); - } - - /* FIO does not do this if an engine has the commit hook. 
*/ - io_u_mark_submit(td, ccd->io_u_queued_nr); - ccd->io_u_queued_nr = 0; - - return 0; -} - -/* - * RETURN VALUE - * - > 0 - a number of completed io_us - * - 0 - when no complicitions received - * - (-1) - when an error occurred - */ -static int client_getevent_process(struct thread_data *td) -{ - struct librpma_fio_client_data *ccd = td->io_ops_data; - struct rpma_completion cmpl; - /* io_u->index of completed io_u (cmpl.op_context) */ - unsigned int io_u_index; - /* # of completed io_us */ - int cmpl_num = 0; - /* helpers */ - struct io_u *io_u; - int i; - int ret; - - /* get a completion */ - if ((ret = rpma_conn_completion_get(ccd->conn, &cmpl))) { - /* lack of completion is not an error */ - if (ret == RPMA_E_NO_COMPLETION) { - /* lack of completion is not an error */ - return 0; - } - - /* an error occurred */ - librpma_td_verror(td, ret, "rpma_conn_completion_get"); - return -1; - } - - /* if io_us has completed with an error */ - if (cmpl.op_status != IBV_WC_SUCCESS) { - td->error = cmpl.op_status; - return -1; - } - - if (cmpl.op == RPMA_OP_SEND) - ++ccd->op_send_completed; - else if (cmpl.op == RPMA_OP_RECV) - ++ccd->op_recv_completed; - - if ((ret = ccd->get_io_u_index(&cmpl, &io_u_index)) != 1) - return ret; - - /* look for an io_u being completed */ - for (i = 0; i < ccd->io_u_flight_nr; ++i) { - if (ccd->io_us_flight[i]->index == io_u_index) { - cmpl_num = i + 1; - break; - } - } - - /* if no matching io_u has been found */ - if (cmpl_num == 0) { - log_err( - "no matching io_u for received completion found (io_u_index=%u)\n", - io_u_index); - return -1; - } - - /* move completed io_us to the completed in-memory queue */ - for (i = 0; i < cmpl_num; ++i) { - /* get and prepare io_u */ - io_u = ccd->io_us_flight[i]; - - /* append to the queue */ - ccd->io_us_completed[ccd->io_u_completed_nr] = io_u; - ccd->io_u_completed_nr++; - } - - /* remove completed io_us from the flight queue */ - for (i = cmpl_num; i < ccd->io_u_flight_nr; ++i) - 
ccd->io_us_flight[i - cmpl_num] = ccd->io_us_flight[i]; - ccd->io_u_flight_nr -= cmpl_num; - - return cmpl_num; -} - -int librpma_fio_client_getevents(struct thread_data *td, unsigned int min, - unsigned int max, const struct timespec *t) -{ - struct librpma_fio_client_data *ccd = td->io_ops_data; - /* total # of completed io_us */ - int cmpl_num_total = 0; - /* # of completed io_us from a single event */ - int cmpl_num; - - do { - cmpl_num = client_getevent_process(td); - if (cmpl_num > 0) { - /* new completions collected */ - cmpl_num_total += cmpl_num; - } else if (cmpl_num == 0) { - /* - * It is required to make sure that CQEs for SENDs - * will flow at least at the same pace as CQEs for RECVs. - */ - if (cmpl_num_total >= min && - ccd->op_send_completed >= ccd->op_recv_completed) - break; - - /* - * To reduce CPU consumption one can use - * the rpma_conn_completion_wait() function. - * Note this greatly increase the latency - * and make the results less stable. - * The bandwidth stays more or less the same. - */ - } else { - /* an error occurred */ - return -1; - } - - /* - * The expected max can be exceeded if CQEs for RECVs will come up - * faster than CQEs for SENDs. But it is required to make sure CQEs for - * SENDs will flow at least at the same pace as CQEs for RECVs. - */ - } while (cmpl_num_total < max || - ccd->op_send_completed < ccd->op_recv_completed); - - /* - * All posted SENDs are completed and RECVs for them (responses) are - * completed. This is the initial situation so the counters are reset. 
- */ - if (ccd->op_send_posted == ccd->op_send_completed && - ccd->op_send_completed == ccd->op_recv_completed) { - ccd->op_send_posted = 0; - ccd->op_send_completed = 0; - ccd->op_recv_completed = 0; - } - - return cmpl_num_total; -} - -struct io_u *librpma_fio_client_event(struct thread_data *td, int event) -{ - struct librpma_fio_client_data *ccd = td->io_ops_data; - struct io_u *io_u; - int i; - - /* get the first io_u from the queue */ - io_u = ccd->io_us_completed[0]; - - /* remove the first io_u from the queue */ - for (i = 1; i < ccd->io_u_completed_nr; ++i) - ccd->io_us_completed[i - 1] = ccd->io_us_completed[i]; - ccd->io_u_completed_nr--; - - dprint_io_u(io_u, "client_event"); - - return io_u; -} - -char *librpma_fio_client_errdetails(struct io_u *io_u) -{ - /* get the string representation of an error */ - enum ibv_wc_status status = io_u->error; - const char *status_str = ibv_wc_status_str(status); - - char *details = strdup(status_str); - if (details == NULL) { - fprintf(stderr, "Error: %s\n", status_str); - fprintf(stderr, "Fatal error: out of memory. 
Aborting.\n"); - abort(); - } - - /* FIO frees the returned string when it becomes obsolete */ - return details; -} - -int librpma_fio_server_init(struct thread_data *td) -{ - struct librpma_fio_options_values *o = td->eo; - struct librpma_fio_server_data *csd; - struct ibv_context *dev = NULL; - enum rpma_log_level log_level_aux = RPMA_LOG_LEVEL_WARNING; - int ret = -1; - - /* --debug=net sets RPMA_LOG_THRESHOLD_AUX to RPMA_LOG_LEVEL_INFO */ -#ifdef FIO_INC_DEBUG - if ((1UL << FD_NET) & fio_debug) - log_level_aux = RPMA_LOG_LEVEL_INFO; -#endif - - /* configure logging thresholds to see more details */ - rpma_log_set_threshold(RPMA_LOG_THRESHOLD, RPMA_LOG_LEVEL_INFO); - rpma_log_set_threshold(RPMA_LOG_THRESHOLD_AUX, log_level_aux); - - - /* obtain an IBV context for a remote IP address */ - if ((ret = rpma_utils_get_ibv_context(o->server_ip, - RPMA_UTIL_IBV_CONTEXT_LOCAL, &dev))) { - librpma_td_verror(td, ret, "rpma_utils_get_ibv_context"); - return -1; - } - - /* allocate server's data */ - csd = calloc(1, sizeof(*csd)); - if (csd == NULL) { - td_verror(td, errno, "calloc"); - return -1; - } - - /* create a new peer object */ - if ((ret = rpma_peer_new(dev, &csd->peer))) { - librpma_td_verror(td, ret, "rpma_peer_new"); - goto err_free_csd; - } - - td->io_ops_data = csd; - - return 0; - -err_free_csd: - free(csd); - - return -1; -} - -void librpma_fio_server_cleanup(struct thread_data *td) -{ - struct librpma_fio_server_data *csd = td->io_ops_data; - int ret; - - if (csd == NULL) - return; - - /* free the peer */ - if ((ret = rpma_peer_delete(&csd->peer))) - librpma_td_verror(td, ret, "rpma_peer_delete"); - - free(csd); -} - -int librpma_fio_server_open_file(struct thread_data *td, struct fio_file *f, - struct rpma_conn_cfg *cfg) -{ - struct librpma_fio_server_data *csd = td->io_ops_data; - struct librpma_fio_options_values *o = td->eo; - enum rpma_conn_event conn_event = RPMA_CONN_UNDEFINED; - struct librpma_fio_workspace ws = {0}; - struct rpma_conn_private_data 
pdata; - uint32_t max_msg_num; - struct rpma_conn_req *conn_req; - struct rpma_conn *conn; - struct rpma_mr_local *mr; - char port_td[LIBRPMA_FIO_PORT_STR_LEN_MAX]; - struct rpma_ep *ep; - size_t mem_size = td->o.size; - size_t mr_desc_size; - void *ws_ptr; - int usage_mem_type; - int ret; - - if (!f->file_name) { - log_err("fio: filename is not set\n"); - return -1; - } - - /* start a listening endpoint at addr:port */ - if (librpma_fio_td_port(o->port, td, port_td)) - return -1; - - if ((ret = rpma_ep_listen(csd->peer, o->server_ip, port_td, &ep))) { - librpma_td_verror(td, ret, "rpma_ep_listen"); - return -1; - } - - if (strcmp(f->file_name, "malloc") == 0) { - /* allocation from DRAM using posix_memalign() */ - ws_ptr = librpma_fio_allocate_dram(td, mem_size, &csd->mem); - usage_mem_type = RPMA_MR_USAGE_FLUSH_TYPE_VISIBILITY; - } else { - /* allocation from PMEM using pmem_map_file() */ - ws_ptr = librpma_fio_allocate_pmem(td, f->file_name, - mem_size, &csd->mem); - usage_mem_type = RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT; - } - - if (ws_ptr == NULL) - goto err_ep_shutdown; - - f->real_file_size = mem_size; - - if ((ret = rpma_mr_reg(csd->peer, ws_ptr, mem_size, - RPMA_MR_USAGE_READ_DST | RPMA_MR_USAGE_READ_SRC | - RPMA_MR_USAGE_WRITE_DST | RPMA_MR_USAGE_WRITE_SRC | - usage_mem_type, &mr))) { - librpma_td_verror(td, ret, "rpma_mr_reg"); - goto err_free; - } - - /* get size of the memory region's descriptor */ - if ((ret = rpma_mr_get_descriptor_size(mr, &mr_desc_size))) { - librpma_td_verror(td, ret, "rpma_mr_get_descriptor_size"); - goto err_mr_dereg; - } - - /* verify size of the memory region's descriptor */ - if (mr_desc_size > LIBRPMA_FIO_DESCRIPTOR_MAX_SIZE) { - log_err( - "size of the memory region's descriptor is too big (max=%i)\n", - LIBRPMA_FIO_DESCRIPTOR_MAX_SIZE); - goto err_mr_dereg; - } - - /* get the memory region's descriptor */ - if ((ret = rpma_mr_get_descriptor(mr, &ws.descriptor[0]))) { - librpma_td_verror(td, ret, "rpma_mr_get_descriptor"); - 
goto err_mr_dereg; - } - - if (cfg != NULL) { - if ((ret = rpma_conn_cfg_get_rq_size(cfg, &max_msg_num))) { - librpma_td_verror(td, ret, "rpma_conn_cfg_get_rq_size"); - goto err_mr_dereg; - } - - /* verify whether iodepth fits into uint16_t */ - if (max_msg_num > UINT16_MAX) { - log_err("fio: iodepth too big (%u > %u)\n", - max_msg_num, UINT16_MAX); - return -1; - } - - ws.max_msg_num = max_msg_num; - } - - /* prepare a workspace description */ - ws.direct_write_to_pmem = o->direct_write_to_pmem; - ws.mr_desc_size = mr_desc_size; - pdata.ptr = &ws; - pdata.len = sizeof(ws); - - /* receive an incoming connection request */ - if ((ret = rpma_ep_next_conn_req(ep, cfg, &conn_req))) { - librpma_td_verror(td, ret, "rpma_ep_next_conn_req"); - goto err_mr_dereg; - } - - if (csd->prepare_connection && csd->prepare_connection(td, conn_req)) - goto err_req_delete; - - /* accept the connection request and obtain the connection object */ - if ((ret = rpma_conn_req_connect(&conn_req, &pdata, &conn))) { - librpma_td_verror(td, ret, "rpma_conn_req_connect"); - goto err_req_delete; - } - - /* wait for the connection to be established */ - if ((ret = rpma_conn_next_event(conn, &conn_event))) { - librpma_td_verror(td, ret, "rpma_conn_next_event"); - goto err_conn_delete; - } else if (conn_event != RPMA_CONN_ESTABLISHED) { - log_err("rpma_conn_next_event returned an unexptected event\n"); - goto err_conn_delete; - } - - /* end-point is no longer needed */ - (void) rpma_ep_shutdown(&ep); - - csd->ws_mr = mr; - csd->ws_ptr = ws_ptr; - csd->conn = conn; - - return 0; - -err_conn_delete: - (void) rpma_conn_delete(&conn); - -err_req_delete: - (void) rpma_conn_req_delete(&conn_req); - -err_mr_dereg: - (void) rpma_mr_dereg(&mr); - -err_free: - librpma_fio_free(&csd->mem); - -err_ep_shutdown: - (void) rpma_ep_shutdown(&ep); - - return -1; -} - -int librpma_fio_server_close_file(struct thread_data *td, struct fio_file *f) -{ - struct librpma_fio_server_data *csd = td->io_ops_data; - enum 
rpma_conn_event conn_event = RPMA_CONN_UNDEFINED; - int rv = 0; - int ret; - - /* wait for the connection to be closed */ - ret = rpma_conn_next_event(csd->conn, &conn_event); - if (!ret && conn_event != RPMA_CONN_CLOSED) { - log_err("rpma_conn_next_event returned an unexptected event\n"); - rv = -1; - } - - if ((ret = rpma_conn_disconnect(csd->conn))) { - librpma_td_verror(td, ret, "rpma_conn_disconnect"); - rv = -1; - } - - if ((ret = rpma_conn_delete(&csd->conn))) { - librpma_td_verror(td, ret, "rpma_conn_delete"); - rv = -1; - } - - if ((ret = rpma_mr_dereg(&csd->ws_mr))) { - librpma_td_verror(td, ret, "rpma_mr_dereg"); - rv = -1; - } - - librpma_fio_free(&csd->mem); - - return rv; -} diff --git a/engines/librpma_fio.h b/engines/librpma_fio.h deleted file mode 100644 index fb89d99d69..0000000000 --- a/engines/librpma_fio.h +++ /dev/null @@ -1,275 +0,0 @@ -/* - * librpma_fio: librpma_apm and librpma_gpspm engines' common header. - * - * Copyright 2021, Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License, - * version 2 as published by the Free Software Foundation.. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#ifndef LIBRPMA_FIO_H -#define LIBRPMA_FIO_H 1 - -#include "../fio.h" -#include "../optgroup.h" - -#include - -/* servers' and clients' common */ - -#define librpma_td_verror(td, err, func) \ - td_vmsg((td), (err), rpma_err_2str(err), (func)) - -/* ceil(a / b) = (a + b - 1) / b */ -#define LIBRPMA_FIO_CEIL(a, b) (((a) + (b) - 1) / (b)) - -/* common option structure for server and client */ -struct librpma_fio_options_values { - /* - * FIO considers .off1 == 0 absent so the first meaningful field has to - * have padding ahead of it. 
- */ - void *pad; - char *server_ip; - /* base server listening port */ - char *port; - /* Direct Write to PMem is possible */ - unsigned int direct_write_to_pmem; - /* Set to 0 to wait for completion instead of busy-wait polling completion. */ - unsigned int busy_wait_polling; -}; - -extern struct fio_option librpma_fio_options[]; - -/* - * Limited by the maximum length of the private data - * for rdma_connect() in case of RDMA_PS_TCP (28 bytes). - */ -#define LIBRPMA_FIO_DESCRIPTOR_MAX_SIZE 24 - -struct librpma_fio_workspace { - uint16_t max_msg_num; /* # of RQ slots */ - uint8_t direct_write_to_pmem; /* Direct Write to PMem is possible */ - uint8_t mr_desc_size; /* size of mr_desc in descriptor[] */ - /* buffer containing mr_desc */ - char descriptor[LIBRPMA_FIO_DESCRIPTOR_MAX_SIZE]; -}; - -#define LIBRPMA_FIO_PORT_STR_LEN_MAX 12 - -int librpma_fio_td_port(const char *port_base_str, struct thread_data *td, - char *port_out); - -struct librpma_fio_mem { - /* memory buffer */ - char *mem_ptr; - - /* size of the mapped persistent memory */ - size_t size_mmap; -}; - -char *librpma_fio_allocate_dram(struct thread_data *td, size_t size, - struct librpma_fio_mem *mem); - -char *librpma_fio_allocate_pmem(struct thread_data *td, const char *filename, - size_t size, struct librpma_fio_mem *mem); - -void librpma_fio_free(struct librpma_fio_mem *mem); - -/* clients' common */ - -typedef int (*librpma_fio_flush_t)(struct thread_data *td, - struct io_u *first_io_u, struct io_u *last_io_u, - unsigned long long int len); - -/* - * RETURN VALUE - * - ( 1) - on success - * - ( 0) - skip - * - (-1) - on error - */ -typedef int (*librpma_fio_get_io_u_index_t)(struct rpma_completion *cmpl, - unsigned int *io_u_index); - -struct librpma_fio_client_data { - struct rpma_peer *peer; - struct rpma_conn *conn; - - /* aligned td->orig_buffer */ - char *orig_buffer_aligned; - - /* ious's base address memory registration (cd->orig_buffer_aligned) */ - struct rpma_mr_local *orig_mr; - - 
struct librpma_fio_workspace *ws; - - /* a server's memory representation */ - struct rpma_mr_remote *server_mr; - enum rpma_flush_type server_mr_flush_type; - - /* remote workspace description */ - size_t ws_size; - - /* in-memory queues */ - struct io_u **io_us_queued; - int io_u_queued_nr; - struct io_u **io_us_flight; - int io_u_flight_nr; - struct io_u **io_us_completed; - int io_u_completed_nr; - - /* SQ control. Note: all of them have to be kept in sync. */ - uint32_t op_send_posted; - uint32_t op_send_completed; - uint32_t op_recv_completed; - - librpma_fio_flush_t flush; - librpma_fio_get_io_u_index_t get_io_u_index; - - /* engine-specific client data */ - void *client_data; -}; - -int librpma_fio_client_init(struct thread_data *td, - struct rpma_conn_cfg *cfg); -void librpma_fio_client_cleanup(struct thread_data *td); - -int librpma_fio_file_nop(struct thread_data *td, struct fio_file *f); -int librpma_fio_client_get_file_size(struct thread_data *td, - struct fio_file *f); - -int librpma_fio_client_post_init(struct thread_data *td); - -enum fio_q_status librpma_fio_client_queue(struct thread_data *td, - struct io_u *io_u); - -int librpma_fio_client_commit(struct thread_data *td); - -int librpma_fio_client_getevents(struct thread_data *td, unsigned int min, - unsigned int max, const struct timespec *t); - -struct io_u *librpma_fio_client_event(struct thread_data *td, int event); - -char *librpma_fio_client_errdetails(struct io_u *io_u); - -static inline int librpma_fio_client_io_read(struct thread_data *td, - struct io_u *io_u, int flags) -{ - struct librpma_fio_client_data *ccd = td->io_ops_data; - size_t dst_offset = (char *)(io_u->xfer_buf) - ccd->orig_buffer_aligned; - size_t src_offset = io_u->offset; - int ret; - - if ((ret = rpma_read(ccd->conn, ccd->orig_mr, dst_offset, - ccd->server_mr, src_offset, io_u->xfer_buflen, - flags, (void *)(uintptr_t)io_u->index))) { - librpma_td_verror(td, ret, "rpma_read"); - return -1; - } - - return 0; -} - -static 
inline int librpma_fio_client_io_write(struct thread_data *td, - struct io_u *io_u) -{ - struct librpma_fio_client_data *ccd = td->io_ops_data; - size_t src_offset = (char *)(io_u->xfer_buf) - ccd->orig_buffer_aligned; - size_t dst_offset = io_u->offset; - int ret; - - if ((ret = rpma_write(ccd->conn, ccd->server_mr, dst_offset, - ccd->orig_mr, src_offset, io_u->xfer_buflen, - RPMA_F_COMPLETION_ON_ERROR, - (void *)(uintptr_t)io_u->index))) { - librpma_td_verror(td, ret, "rpma_write"); - return -1; - } - - return 0; -} - -static inline int librpma_fio_client_io_complete_all_sends( - struct thread_data *td) -{ - struct librpma_fio_client_data *ccd = td->io_ops_data; - struct rpma_completion cmpl; - int ret; - - while (ccd->op_send_posted != ccd->op_send_completed) { - /* get a completion */ - ret = rpma_conn_completion_get(ccd->conn, &cmpl); - if (ret == RPMA_E_NO_COMPLETION) { - /* lack of completion is not an error */ - continue; - } else if (ret != 0) { - /* an error occurred */ - librpma_td_verror(td, ret, "rpma_conn_completion_get"); - break; - } - - if (cmpl.op_status != IBV_WC_SUCCESS) - return -1; - - if (cmpl.op == RPMA_OP_SEND) - ++ccd->op_send_completed; - else { - log_err( - "A completion other than RPMA_OP_SEND got during cleaning up the CQ from SENDs\n"); - return -1; - } - } - - /* - * All posted SENDs are completed and RECVs for them (responses) are - * completed. This is the initial situation so the counters are reset. 
- */ - if (ccd->op_send_posted == ccd->op_send_completed && - ccd->op_send_completed == ccd->op_recv_completed) { - ccd->op_send_posted = 0; - ccd->op_send_completed = 0; - ccd->op_recv_completed = 0; - } - - return 0; -} - -/* servers' common */ - -typedef int (*librpma_fio_prepare_connection_t)( - struct thread_data *td, - struct rpma_conn_req *conn_req); - -struct librpma_fio_server_data { - struct rpma_peer *peer; - - /* resources of an incoming connection */ - struct rpma_conn *conn; - - char *ws_ptr; - struct rpma_mr_local *ws_mr; - struct librpma_fio_mem mem; - - /* engine-specific server data */ - void *server_data; - - librpma_fio_prepare_connection_t prepare_connection; -}; - -int librpma_fio_server_init(struct thread_data *td); - -void librpma_fio_server_cleanup(struct thread_data *td); - -int librpma_fio_server_open_file(struct thread_data *td, - struct fio_file *f, struct rpma_conn_cfg *cfg); - -int librpma_fio_server_close_file(struct thread_data *td, - struct fio_file *f); - -#endif /* LIBRPMA_FIO_H */ diff --git a/engines/librpma_gpspm.c b/engines/librpma_gpspm.c deleted file mode 100644 index 7414770971..0000000000 --- a/engines/librpma_gpspm.c +++ /dev/null @@ -1,776 +0,0 @@ -/* - * librpma_gpspm: IO engine that uses PMDK librpma to write data, - * based on General Purpose Server Persistency Method - * - * Copyright 2020-2021, Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License, - * version 2 as published by the Free Software Foundation.. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- */ - -#include "librpma_fio.h" - -#include - -/* Generated by the protocol buffer compiler from: librpma_gpspm_flush.proto */ -#include "librpma_gpspm_flush.pb-c.h" - -#define MAX_MSG_SIZE (512) -#define IO_U_BUF_LEN (2 * MAX_MSG_SIZE) -#define SEND_OFFSET (0) -#define RECV_OFFSET (SEND_OFFSET + MAX_MSG_SIZE) - -#define GPSPM_FLUSH_REQUEST__LAST \ - { PROTOBUF_C_MESSAGE_INIT(&gpspm_flush_request__descriptor), 0, 0, 0 } - -/* - * 'Flush_req_last' is the last flush request - * the client has to send to server to indicate - * that the client is done. - */ -static const GPSPMFlushRequest Flush_req_last = GPSPM_FLUSH_REQUEST__LAST; - -#define IS_NOT_THE_LAST_MESSAGE(flush_req) \ - (flush_req->length != Flush_req_last.length || \ - flush_req->offset != Flush_req_last.offset) - -/* client side implementation */ - -/* get next io_u message buffer in the round-robin fashion */ -#define IO_U_NEXT_BUF_OFF_CLIENT(cd) \ - (IO_U_BUF_LEN * ((cd->msg_curr++) % cd->msg_num)) - -struct client_data { - /* memory for sending and receiving buffered */ - char *io_us_msgs; - - /* resources for messaging buffer */ - uint32_t msg_num; - uint32_t msg_curr; - struct rpma_mr_local *msg_mr; -}; - -static inline int client_io_flush(struct thread_data *td, - struct io_u *first_io_u, struct io_u *last_io_u, - unsigned long long int len); - -static int client_get_io_u_index(struct rpma_completion *cmpl, - unsigned int *io_u_index); - -static int client_init(struct thread_data *td) -{ - struct librpma_fio_client_data *ccd; - struct client_data *cd; - uint32_t write_num; - struct rpma_conn_cfg *cfg = NULL; - int ret; - - /* - * not supported: - * - readwrite = read / trim / randread / randtrim / - * / rw / randrw / trimwrite - */ - if (td_read(td) || td_trim(td)) { - td_verror(td, EINVAL, "Not supported mode."); - return -1; - } - - /* allocate client's data */ - cd = calloc(1, sizeof(*cd)); - if (cd == NULL) { - td_verror(td, errno, "calloc"); - return -1; - } - - /* - * Calculate the required 
number of WRITEs and FLUSHes. - * - * Note: Each flush is a request (SEND) and response (RECV) pair. - */ - if (td_random(td)) { - write_num = td->o.iodepth; /* WRITE * N */ - cd->msg_num = td->o.iodepth; /* FLUSH * N */ - } else { - if (td->o.sync_io) { - write_num = 1; /* WRITE */ - cd->msg_num = 1; /* FLUSH */ - } else { - write_num = td->o.iodepth; /* WRITE * N */ - /* - * FLUSH * B where: - * - B == ceil(iodepth / iodepth_batch) - * which is the number of batches for N writes - */ - cd->msg_num = LIBRPMA_FIO_CEIL(td->o.iodepth, - td->o.iodepth_batch); - } - } - - /* create a connection configuration object */ - if ((ret = rpma_conn_cfg_new(&cfg))) { - librpma_td_verror(td, ret, "rpma_conn_cfg_new"); - goto err_free_cd; - } - - /* - * Calculate the required queue sizes where: - * - the send queue (SQ) has to be big enough to accommodate - * all io_us (WRITEs) and all flush requests (SENDs) - * - the receive queue (RQ) has to be big enough to accommodate - * all flush responses (RECVs) - * - the completion queue (CQ) has to be big enough to accommodate all - * success and error completions (sq_size + rq_size) - */ - if ((ret = rpma_conn_cfg_set_sq_size(cfg, write_num + cd->msg_num))) { - librpma_td_verror(td, ret, "rpma_conn_cfg_set_sq_size"); - goto err_cfg_delete; - } - if ((ret = rpma_conn_cfg_set_rq_size(cfg, cd->msg_num))) { - librpma_td_verror(td, ret, "rpma_conn_cfg_set_rq_size"); - goto err_cfg_delete; - } - if ((ret = rpma_conn_cfg_set_cq_size(cfg, write_num + cd->msg_num * 2))) { - librpma_td_verror(td, ret, "rpma_conn_cfg_set_cq_size"); - goto err_cfg_delete; - } - - if (librpma_fio_client_init(td, cfg)) - goto err_cfg_delete; - - ccd = td->io_ops_data; - - if (ccd->ws->direct_write_to_pmem && - ccd->server_mr_flush_type == RPMA_FLUSH_TYPE_PERSISTENT && - td->thread_number == 1) { - /* XXX log_info mixes with the JSON output */ - log_err( - "Note: The server side supports Direct Write to PMem and it is equipped with PMem (direct_write_to_pmem).\n" - 
"You can use librpma_client and librpma_server engines for better performance instead of GPSPM.\n"); - } - - /* validate the server's RQ capacity */ - if (cd->msg_num > ccd->ws->max_msg_num) { - log_err( - "server's RQ size (iodepth) too small to handle the client's workspace requirements (%u < %u)\n", - ccd->ws->max_msg_num, cd->msg_num); - goto err_cleanup_common; - } - - if ((ret = rpma_conn_cfg_delete(&cfg))) { - librpma_td_verror(td, ret, "rpma_conn_cfg_delete"); - /* non fatal error - continue */ - } - - ccd->flush = client_io_flush; - ccd->get_io_u_index = client_get_io_u_index; - ccd->client_data = cd; - - return 0; - -err_cleanup_common: - librpma_fio_client_cleanup(td); - -err_cfg_delete: - (void) rpma_conn_cfg_delete(&cfg); - -err_free_cd: - free(cd); - - return -1; -} - -static int client_post_init(struct thread_data *td) -{ - struct librpma_fio_client_data *ccd = td->io_ops_data; - struct client_data *cd = ccd->client_data; - unsigned int io_us_msgs_size; - int ret; - - /* message buffers initialization and registration */ - io_us_msgs_size = cd->msg_num * IO_U_BUF_LEN; - if ((ret = posix_memalign((void **)&cd->io_us_msgs, page_size, - io_us_msgs_size))) { - td_verror(td, ret, "posix_memalign"); - return ret; - } - if ((ret = rpma_mr_reg(ccd->peer, cd->io_us_msgs, io_us_msgs_size, - RPMA_MR_USAGE_SEND | RPMA_MR_USAGE_RECV, - &cd->msg_mr))) { - librpma_td_verror(td, ret, "rpma_mr_reg"); - return ret; - } - - return librpma_fio_client_post_init(td); -} - -static void client_cleanup(struct thread_data *td) -{ - struct librpma_fio_client_data *ccd = td->io_ops_data; - struct client_data *cd; - size_t flush_req_size; - size_t io_u_buf_off; - size_t send_offset; - void *send_ptr; - int ret; - - if (ccd == NULL) - return; - - cd = ccd->client_data; - if (cd == NULL) { - librpma_fio_client_cleanup(td); - return; - } - - /* - * Make sure all SEND completions are collected ergo there are free - * slots in the SQ for the last SEND message. 
- * - * Note: If any operation will fail we still can send the termination - * notice. - */ - (void) librpma_fio_client_io_complete_all_sends(td); - - /* prepare the last flush message and pack it to the send buffer */ - flush_req_size = gpspm_flush_request__get_packed_size(&Flush_req_last); - if (flush_req_size > MAX_MSG_SIZE) { - log_err( - "Packed flush request size is bigger than available send buffer space (%zu > %d\n", - flush_req_size, MAX_MSG_SIZE); - } else { - io_u_buf_off = IO_U_NEXT_BUF_OFF_CLIENT(cd); - send_offset = io_u_buf_off + SEND_OFFSET; - send_ptr = cd->io_us_msgs + send_offset; - (void) gpspm_flush_request__pack(&Flush_req_last, send_ptr); - - /* send the flush message */ - if ((ret = rpma_send(ccd->conn, cd->msg_mr, send_offset, - flush_req_size, RPMA_F_COMPLETION_ALWAYS, - NULL))) - librpma_td_verror(td, ret, "rpma_send"); - - ++ccd->op_send_posted; - - /* Wait for the SEND to complete */ - (void) librpma_fio_client_io_complete_all_sends(td); - } - - /* deregister the messaging buffer memory */ - if ((ret = rpma_mr_dereg(&cd->msg_mr))) - librpma_td_verror(td, ret, "rpma_mr_dereg"); - - free(ccd->client_data); - - librpma_fio_client_cleanup(td); -} - -static inline int client_io_flush(struct thread_data *td, - struct io_u *first_io_u, struct io_u *last_io_u, - unsigned long long int len) -{ - struct librpma_fio_client_data *ccd = td->io_ops_data; - struct client_data *cd = ccd->client_data; - size_t io_u_buf_off = IO_U_NEXT_BUF_OFF_CLIENT(cd); - size_t send_offset = io_u_buf_off + SEND_OFFSET; - size_t recv_offset = io_u_buf_off + RECV_OFFSET; - void *send_ptr = cd->io_us_msgs + send_offset; - void *recv_ptr = cd->io_us_msgs + recv_offset; - GPSPMFlushRequest flush_req = GPSPM_FLUSH_REQUEST__INIT; - size_t flush_req_size = 0; - int ret; - - /* prepare a response buffer */ - if ((ret = rpma_recv(ccd->conn, cd->msg_mr, recv_offset, MAX_MSG_SIZE, - recv_ptr))) { - librpma_td_verror(td, ret, "rpma_recv"); - return -1; - } - - /* prepare a flush 
message and pack it to a send buffer */ - flush_req.offset = first_io_u->offset; - flush_req.length = len; - flush_req.op_context = last_io_u->index; - flush_req_size = gpspm_flush_request__get_packed_size(&flush_req); - if (flush_req_size > MAX_MSG_SIZE) { - log_err( - "Packed flush request size is bigger than available send buffer space (%" - PRIu64 " > %d\n", flush_req_size, MAX_MSG_SIZE); - return -1; - } - (void) gpspm_flush_request__pack(&flush_req, send_ptr); - - /* send the flush message */ - if ((ret = rpma_send(ccd->conn, cd->msg_mr, send_offset, flush_req_size, - RPMA_F_COMPLETION_ALWAYS, NULL))) { - librpma_td_verror(td, ret, "rpma_send"); - return -1; - } - - ++ccd->op_send_posted; - - return 0; -} - -static int client_get_io_u_index(struct rpma_completion *cmpl, - unsigned int *io_u_index) -{ - GPSPMFlushResponse *flush_resp; - - if (cmpl->op != RPMA_OP_RECV) - return 0; - - /* unpack a response from the received buffer */ - flush_resp = gpspm_flush_response__unpack(NULL, - cmpl->byte_len, cmpl->op_context); - if (flush_resp == NULL) { - log_err("Cannot unpack the flush response buffer\n"); - return -1; - } - - memcpy(io_u_index, &flush_resp->op_context, sizeof(*io_u_index)); - - gpspm_flush_response__free_unpacked(flush_resp, NULL); - - return 1; -} - -FIO_STATIC struct ioengine_ops ioengine_client = { - .name = "librpma_gpspm_client", - .version = FIO_IOOPS_VERSION, - .init = client_init, - .post_init = client_post_init, - .get_file_size = librpma_fio_client_get_file_size, - .open_file = librpma_fio_file_nop, - .queue = librpma_fio_client_queue, - .commit = librpma_fio_client_commit, - .getevents = librpma_fio_client_getevents, - .event = librpma_fio_client_event, - .errdetails = librpma_fio_client_errdetails, - .close_file = librpma_fio_file_nop, - .cleanup = client_cleanup, - .flags = FIO_DISKLESSIO, - .options = librpma_fio_options, - .option_struct_size = sizeof(struct librpma_fio_options_values), -}; - -/* server side implementation */ - 
-#define IO_U_BUFF_OFF_SERVER(i) (i * IO_U_BUF_LEN) - -struct server_data { - /* aligned td->orig_buffer */ - char *orig_buffer_aligned; - - /* resources for messaging buffer from DRAM allocated by fio */ - struct rpma_mr_local *msg_mr; - - uint32_t msg_sqe_available; /* # of free SQ slots */ - - /* in-memory queues */ - struct rpma_completion *msgs_queued; - uint32_t msg_queued_nr; -}; - -static int server_init(struct thread_data *td) -{ - struct librpma_fio_server_data *csd; - struct server_data *sd; - int ret = -1; - - if ((ret = librpma_fio_server_init(td))) - return ret; - - csd = td->io_ops_data; - - /* allocate server's data */ - sd = calloc(1, sizeof(*sd)); - if (sd == NULL) { - td_verror(td, errno, "calloc"); - goto err_server_cleanup; - } - - /* allocate in-memory queue */ - sd->msgs_queued = calloc(td->o.iodepth, sizeof(*sd->msgs_queued)); - if (sd->msgs_queued == NULL) { - td_verror(td, errno, "calloc"); - goto err_free_sd; - } - - /* - * Assure a single io_u buffer can store both SEND and RECV messages and - * an io_us buffer allocation is page-size-aligned which is required - * to register for RDMA. User-provided values are intentionally ignored. - */ - td->o.max_bs[DDIR_READ] = IO_U_BUF_LEN; - td->o.mem_align = page_size; - - csd->server_data = sd; - - return 0; - -err_free_sd: - free(sd); - -err_server_cleanup: - librpma_fio_server_cleanup(td); - - return -1; -} - -static int server_post_init(struct thread_data *td) -{ - struct librpma_fio_server_data *csd = td->io_ops_data; - struct server_data *sd = csd->server_data; - size_t io_us_size; - size_t io_u_buflen; - int ret; - - /* - * td->orig_buffer is not aligned. The engine requires aligned io_us - * so FIO alignes up the address using the formula below. - */ - sd->orig_buffer_aligned = PTR_ALIGN(td->orig_buffer, page_mask) + - td->o.mem_align; - - /* - * XXX - * Each io_u message buffer contains recv and send messages. 
- * Aligning each of those buffers may potentially give - * some performance benefits. - */ - io_u_buflen = td_max_bs(td); - - /* check whether io_u buffer is big enough */ - if (io_u_buflen < IO_U_BUF_LEN) { - log_err( - "blocksize too small to accommodate assumed maximal request/response pair size (%" PRIu64 " < %d)\n", - io_u_buflen, IO_U_BUF_LEN); - return -1; - } - - /* - * td->orig_buffer_size beside the space really consumed by io_us - * has paddings which can be omitted for the memory registration. - */ - io_us_size = (unsigned long long)io_u_buflen * - (unsigned long long)td->o.iodepth; - - if ((ret = rpma_mr_reg(csd->peer, sd->orig_buffer_aligned, io_us_size, - RPMA_MR_USAGE_SEND | RPMA_MR_USAGE_RECV, - &sd->msg_mr))) { - librpma_td_verror(td, ret, "rpma_mr_reg"); - return -1; - } - - return 0; -} - -static void server_cleanup(struct thread_data *td) -{ - struct librpma_fio_server_data *csd = td->io_ops_data; - struct server_data *sd; - int ret; - - if (csd == NULL) - return; - - sd = csd->server_data; - - if (sd != NULL) { - /* rpma_mr_dereg(messaging buffer from DRAM) */ - if ((ret = rpma_mr_dereg(&sd->msg_mr))) - librpma_td_verror(td, ret, "rpma_mr_dereg"); - - free(sd->msgs_queued); - free(sd); - } - - librpma_fio_server_cleanup(td); -} - -static int prepare_connection(struct thread_data *td, - struct rpma_conn_req *conn_req) -{ - struct librpma_fio_server_data *csd = td->io_ops_data; - struct server_data *sd = csd->server_data; - int ret; - int i; - - /* prepare buffers for a flush requests */ - sd->msg_sqe_available = td->o.iodepth; - for (i = 0; i < td->o.iodepth; i++) { - size_t offset_recv_msg = IO_U_BUFF_OFF_SERVER(i) + RECV_OFFSET; - if ((ret = rpma_conn_req_recv(conn_req, sd->msg_mr, - offset_recv_msg, MAX_MSG_SIZE, - (const void *)(uintptr_t)i))) { - librpma_td_verror(td, ret, "rpma_conn_req_recv"); - return ret; - } - } - - return 0; -} - -static int server_open_file(struct thread_data *td, struct fio_file *f) -{ - struct 
librpma_fio_server_data *csd = td->io_ops_data; - struct rpma_conn_cfg *cfg = NULL; - uint16_t max_msg_num = td->o.iodepth; - int ret; - - csd->prepare_connection = prepare_connection; - - /* create a connection configuration object */ - if ((ret = rpma_conn_cfg_new(&cfg))) { - librpma_td_verror(td, ret, "rpma_conn_cfg_new"); - return -1; - } - - /* - * Calculate the required queue sizes where: - * - the send queue (SQ) has to be big enough to accommodate - * all possible flush requests (SENDs) - * - the receive queue (RQ) has to be big enough to accommodate - * all flush responses (RECVs) - * - the completion queue (CQ) has to be big enough to accommodate - * all success and error completions (sq_size + rq_size) - */ - if ((ret = rpma_conn_cfg_set_sq_size(cfg, max_msg_num))) { - librpma_td_verror(td, ret, "rpma_conn_cfg_set_sq_size"); - goto err_cfg_delete; - } - if ((ret = rpma_conn_cfg_set_rq_size(cfg, max_msg_num))) { - librpma_td_verror(td, ret, "rpma_conn_cfg_set_rq_size"); - goto err_cfg_delete; - } - if ((ret = rpma_conn_cfg_set_cq_size(cfg, max_msg_num * 2))) { - librpma_td_verror(td, ret, "rpma_conn_cfg_set_cq_size"); - goto err_cfg_delete; - } - - ret = librpma_fio_server_open_file(td, f, cfg); - -err_cfg_delete: - (void) rpma_conn_cfg_delete(&cfg); - - return ret; -} - -static int server_qe_process(struct thread_data *td, - struct rpma_completion *cmpl) -{ - struct librpma_fio_server_data *csd = td->io_ops_data; - struct server_data *sd = csd->server_data; - GPSPMFlushRequest *flush_req; - GPSPMFlushResponse flush_resp = GPSPM_FLUSH_RESPONSE__INIT; - size_t flush_resp_size = 0; - size_t send_buff_offset; - size_t recv_buff_offset; - size_t io_u_buff_offset; - void *send_buff_ptr; - void *recv_buff_ptr; - void *op_ptr; - int msg_index; - int ret; - - /* calculate SEND/RECV pair parameters */ - msg_index = (int)(uintptr_t)cmpl->op_context; - io_u_buff_offset = IO_U_BUFF_OFF_SERVER(msg_index); - send_buff_offset = io_u_buff_offset + SEND_OFFSET; - 
recv_buff_offset = io_u_buff_offset + RECV_OFFSET; - send_buff_ptr = sd->orig_buffer_aligned + send_buff_offset; - recv_buff_ptr = sd->orig_buffer_aligned + recv_buff_offset; - - /* unpack a flush request from the received buffer */ - flush_req = gpspm_flush_request__unpack(NULL, cmpl->byte_len, - recv_buff_ptr); - if (flush_req == NULL) { - log_err("cannot unpack the flush request buffer\n"); - goto err_terminate; - } - - if (IS_NOT_THE_LAST_MESSAGE(flush_req)) { - op_ptr = csd->ws_ptr + flush_req->offset; - pmem_persist(op_ptr, flush_req->length); - } else { - /* - * This is the last message - the client is done. - */ - gpspm_flush_request__free_unpacked(flush_req, NULL); - td->done = true; - return 0; - } - - /* initiate the next receive operation */ - if ((ret = rpma_recv(csd->conn, sd->msg_mr, recv_buff_offset, - MAX_MSG_SIZE, - (const void *)(uintptr_t)msg_index))) { - librpma_td_verror(td, ret, "rpma_recv"); - goto err_free_unpacked; - } - - /* prepare a flush response and pack it to a send buffer */ - flush_resp.op_context = flush_req->op_context; - flush_resp_size = gpspm_flush_response__get_packed_size(&flush_resp); - if (flush_resp_size > MAX_MSG_SIZE) { - log_err( - "Size of the packed flush response is bigger than the available space of the send buffer (%" - PRIu64 " > %i\n", flush_resp_size, MAX_MSG_SIZE); - goto err_free_unpacked; - } - - (void) gpspm_flush_response__pack(&flush_resp, send_buff_ptr); - - /* send the flush response */ - if ((ret = rpma_send(csd->conn, sd->msg_mr, send_buff_offset, - flush_resp_size, RPMA_F_COMPLETION_ALWAYS, NULL))) { - librpma_td_verror(td, ret, "rpma_send"); - goto err_free_unpacked; - } - --sd->msg_sqe_available; - - gpspm_flush_request__free_unpacked(flush_req, NULL); - - return 0; - -err_free_unpacked: - gpspm_flush_request__free_unpacked(flush_req, NULL); - -err_terminate: - td->terminate = true; - - return -1; -} - -static inline int server_queue_process(struct thread_data *td) -{ - struct 
librpma_fio_server_data *csd = td->io_ops_data; - struct server_data *sd = csd->server_data; - int ret; - int i; - - /* min(# of queue entries, # of SQ entries available) */ - uint32_t qes_to_process = min(sd->msg_queued_nr, sd->msg_sqe_available); - if (qes_to_process == 0) - return 0; - - /* process queued completions */ - for (i = 0; i < qes_to_process; ++i) { - if ((ret = server_qe_process(td, &sd->msgs_queued[i]))) - return ret; - } - - /* progress the queue */ - for (i = 0; i < sd->msg_queued_nr - qes_to_process; ++i) { - memcpy(&sd->msgs_queued[i], - &sd->msgs_queued[qes_to_process + i], - sizeof(sd->msgs_queued[i])); - } - - sd->msg_queued_nr -= qes_to_process; - - return 0; -} - -static int server_cmpl_process(struct thread_data *td) -{ - struct librpma_fio_server_data *csd = td->io_ops_data; - struct server_data *sd = csd->server_data; - struct rpma_completion *cmpl = &sd->msgs_queued[sd->msg_queued_nr]; - struct librpma_fio_options_values *o = td->eo; - int ret; - - ret = rpma_conn_completion_get(csd->conn, cmpl); - if (ret == RPMA_E_NO_COMPLETION) { - if (o->busy_wait_polling == 0) { - ret = rpma_conn_completion_wait(csd->conn); - if (ret == RPMA_E_NO_COMPLETION) { - /* lack of completion is not an error */ - return 0; - } else if (ret != 0) { - librpma_td_verror(td, ret, "rpma_conn_completion_wait"); - goto err_terminate; - } - - ret = rpma_conn_completion_get(csd->conn, cmpl); - if (ret == RPMA_E_NO_COMPLETION) { - /* lack of completion is not an error */ - return 0; - } else if (ret != 0) { - librpma_td_verror(td, ret, "rpma_conn_completion_get"); - goto err_terminate; - } - } else { - /* lack of completion is not an error */ - return 0; - } - } else if (ret != 0) { - librpma_td_verror(td, ret, "rpma_conn_completion_get"); - goto err_terminate; - } - - /* validate the completion */ - if (cmpl->op_status != IBV_WC_SUCCESS) - goto err_terminate; - - if (cmpl->op == RPMA_OP_RECV) - ++sd->msg_queued_nr; - else if (cmpl->op == RPMA_OP_SEND) - 
++sd->msg_sqe_available; - - return 0; - -err_terminate: - td->terminate = true; - - return -1; -} - -static enum fio_q_status server_queue(struct thread_data *td, struct io_u *io_u) -{ - do { - if (server_cmpl_process(td)) - return FIO_Q_BUSY; - - if (server_queue_process(td)) - return FIO_Q_BUSY; - - } while (!td->done); - - return FIO_Q_COMPLETED; -} - -FIO_STATIC struct ioengine_ops ioengine_server = { - .name = "librpma_gpspm_server", - .version = FIO_IOOPS_VERSION, - .init = server_init, - .post_init = server_post_init, - .open_file = server_open_file, - .close_file = librpma_fio_server_close_file, - .queue = server_queue, - .invalidate = librpma_fio_file_nop, - .cleanup = server_cleanup, - .flags = FIO_SYNCIO, - .options = librpma_fio_options, - .option_struct_size = sizeof(struct librpma_fio_options_values), -}; - -/* register both engines */ - -static void fio_init fio_librpma_gpspm_register(void) -{ - register_ioengine(&ioengine_client); - register_ioengine(&ioengine_server); -} - -static void fio_exit fio_librpma_gpspm_unregister(void) -{ - unregister_ioengine(&ioengine_client); - unregister_ioengine(&ioengine_server); -} diff --git a/engines/librpma_gpspm_flush.pb-c.c b/engines/librpma_gpspm_flush.pb-c.c deleted file mode 100644 index 3ff2475612..0000000000 --- a/engines/librpma_gpspm_flush.pb-c.c +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Copyright 2020, Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License, - * version 2 as published by the Free Software Foundation.. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -/* Generated by the protocol buffer compiler. DO NOT EDIT! 
*/ -/* Generated from: librpma_gpspm_flush.proto */ - -/* Do not generate deprecated warnings for self */ -#ifndef PROTOBUF_C__NO_DEPRECATED -#define PROTOBUF_C__NO_DEPRECATED -#endif - -#include "librpma_gpspm_flush.pb-c.h" -void gpspm_flush_request__init - (GPSPMFlushRequest *message) -{ - static const GPSPMFlushRequest init_value = GPSPM_FLUSH_REQUEST__INIT; - *message = init_value; -} -size_t gpspm_flush_request__get_packed_size - (const GPSPMFlushRequest *message) -{ - assert(message->base.descriptor == &gpspm_flush_request__descriptor); - return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message)); -} -size_t gpspm_flush_request__pack - (const GPSPMFlushRequest *message, - uint8_t *out) -{ - assert(message->base.descriptor == &gpspm_flush_request__descriptor); - return protobuf_c_message_pack ((const ProtobufCMessage*)message, out); -} -size_t gpspm_flush_request__pack_to_buffer - (const GPSPMFlushRequest *message, - ProtobufCBuffer *buffer) -{ - assert(message->base.descriptor == &gpspm_flush_request__descriptor); - return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer); -} -GPSPMFlushRequest * - gpspm_flush_request__unpack - (ProtobufCAllocator *allocator, - size_t len, - const uint8_t *data) -{ - return (GPSPMFlushRequest *) - protobuf_c_message_unpack (&gpspm_flush_request__descriptor, - allocator, len, data); -} -void gpspm_flush_request__free_unpacked - (GPSPMFlushRequest *message, - ProtobufCAllocator *allocator) -{ - if(!message) - return; - assert(message->base.descriptor == &gpspm_flush_request__descriptor); - protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator); -} -void gpspm_flush_response__init - (GPSPMFlushResponse *message) -{ - static const GPSPMFlushResponse init_value = GPSPM_FLUSH_RESPONSE__INIT; - *message = init_value; -} -size_t gpspm_flush_response__get_packed_size - (const GPSPMFlushResponse *message) -{ - assert(message->base.descriptor == 
&gpspm_flush_response__descriptor); - return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message)); -} -size_t gpspm_flush_response__pack - (const GPSPMFlushResponse *message, - uint8_t *out) -{ - assert(message->base.descriptor == &gpspm_flush_response__descriptor); - return protobuf_c_message_pack ((const ProtobufCMessage*)message, out); -} -size_t gpspm_flush_response__pack_to_buffer - (const GPSPMFlushResponse *message, - ProtobufCBuffer *buffer) -{ - assert(message->base.descriptor == &gpspm_flush_response__descriptor); - return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer); -} -GPSPMFlushResponse * - gpspm_flush_response__unpack - (ProtobufCAllocator *allocator, - size_t len, - const uint8_t *data) -{ - return (GPSPMFlushResponse *) - protobuf_c_message_unpack (&gpspm_flush_response__descriptor, - allocator, len, data); -} -void gpspm_flush_response__free_unpacked - (GPSPMFlushResponse *message, - ProtobufCAllocator *allocator) -{ - if(!message) - return; - assert(message->base.descriptor == &gpspm_flush_response__descriptor); - protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator); -} -static const ProtobufCFieldDescriptor gpspm_flush_request__field_descriptors[3] = -{ - { - "offset", - 1, - PROTOBUF_C_LABEL_REQUIRED, - PROTOBUF_C_TYPE_FIXED64, - 0, /* quantifier_offset */ - offsetof(GPSPMFlushRequest, offset), - NULL, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "length", - 2, - PROTOBUF_C_LABEL_REQUIRED, - PROTOBUF_C_TYPE_FIXED64, - 0, /* quantifier_offset */ - offsetof(GPSPMFlushRequest, length), - NULL, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, - { - "op_context", - 3, - PROTOBUF_C_LABEL_REQUIRED, - PROTOBUF_C_TYPE_FIXED64, - 0, /* quantifier_offset */ - offsetof(GPSPMFlushRequest, op_context), - NULL, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, -}; -static const unsigned 
gpspm_flush_request__field_indices_by_name[] = { - 1, /* field[1] = length */ - 0, /* field[0] = offset */ - 2, /* field[2] = op_context */ -}; -static const ProtobufCIntRange gpspm_flush_request__number_ranges[1 + 1] = -{ - { 1, 0 }, - { 0, 3 } -}; -const ProtobufCMessageDescriptor gpspm_flush_request__descriptor = -{ - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "GPSPM_flush_request", - "GPSPMFlushRequest", - "GPSPMFlushRequest", - "", - sizeof(GPSPMFlushRequest), - 3, - gpspm_flush_request__field_descriptors, - gpspm_flush_request__field_indices_by_name, - 1, gpspm_flush_request__number_ranges, - (ProtobufCMessageInit) gpspm_flush_request__init, - NULL,NULL,NULL /* reserved[123] */ -}; -static const ProtobufCFieldDescriptor gpspm_flush_response__field_descriptors[1] = -{ - { - "op_context", - 1, - PROTOBUF_C_LABEL_REQUIRED, - PROTOBUF_C_TYPE_FIXED64, - 0, /* quantifier_offset */ - offsetof(GPSPMFlushResponse, op_context), - NULL, - NULL, - 0, /* flags */ - 0,NULL,NULL /* reserved1,reserved2, etc */ - }, -}; -static const unsigned gpspm_flush_response__field_indices_by_name[] = { - 0, /* field[0] = op_context */ -}; -static const ProtobufCIntRange gpspm_flush_response__number_ranges[1 + 1] = -{ - { 1, 0 }, - { 0, 1 } -}; -const ProtobufCMessageDescriptor gpspm_flush_response__descriptor = -{ - PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, - "GPSPM_flush_response", - "GPSPMFlushResponse", - "GPSPMFlushResponse", - "", - sizeof(GPSPMFlushResponse), - 1, - gpspm_flush_response__field_descriptors, - gpspm_flush_response__field_indices_by_name, - 1, gpspm_flush_response__number_ranges, - (ProtobufCMessageInit) gpspm_flush_response__init, - NULL,NULL,NULL /* reserved[123] */ -}; diff --git a/engines/librpma_gpspm_flush.pb-c.h b/engines/librpma_gpspm_flush.pb-c.h deleted file mode 100644 index ad475a955f..0000000000 --- a/engines/librpma_gpspm_flush.pb-c.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright 2020, Intel Corporation - * - * This program is free software; you can 
redistribute it and/or - * modify it under the terms of the GNU General Public License, - * version 2 as published by the Free Software Foundation.. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -/* Generated by the protocol buffer compiler. DO NOT EDIT! */ -/* Generated from: librpma_gpspm_flush.proto */ - -#ifndef PROTOBUF_C_GPSPM_5fflush_2eproto__INCLUDED -#define PROTOBUF_C_GPSPM_5fflush_2eproto__INCLUDED - -#include - -PROTOBUF_C__BEGIN_DECLS - -#if PROTOBUF_C_VERSION_NUMBER < 1000000 -# error This file was generated by a newer version of protoc-c which is incompatible with your libprotobuf-c headers. Please update your headers. -#elif 1003003 < PROTOBUF_C_MIN_COMPILER_VERSION -# error This file was generated by an older version of protoc-c which is incompatible with your libprotobuf-c headers. Please regenerate this file with a newer version of protoc-c. 
-#endif - - -typedef struct _GPSPMFlushRequest GPSPMFlushRequest; -typedef struct _GPSPMFlushResponse GPSPMFlushResponse; - - -/* --- enums --- */ - - -/* --- messages --- */ - -struct _GPSPMFlushRequest -{ - ProtobufCMessage base; - uint64_t offset; - uint64_t length; - uint64_t op_context; -}; -#define GPSPM_FLUSH_REQUEST__INIT \ - { PROTOBUF_C_MESSAGE_INIT (&gpspm_flush_request__descriptor) \ - , 0, 0, 0 } - - -struct _GPSPMFlushResponse -{ - ProtobufCMessage base; - uint64_t op_context; -}; -#define GPSPM_FLUSH_RESPONSE__INIT \ - { PROTOBUF_C_MESSAGE_INIT (&gpspm_flush_response__descriptor) \ - , 0 } - - -/* GPSPMFlushRequest methods */ -void gpspm_flush_request__init - (GPSPMFlushRequest *message); -size_t gpspm_flush_request__get_packed_size - (const GPSPMFlushRequest *message); -size_t gpspm_flush_request__pack - (const GPSPMFlushRequest *message, - uint8_t *out); -size_t gpspm_flush_request__pack_to_buffer - (const GPSPMFlushRequest *message, - ProtobufCBuffer *buffer); -GPSPMFlushRequest * - gpspm_flush_request__unpack - (ProtobufCAllocator *allocator, - size_t len, - const uint8_t *data); -void gpspm_flush_request__free_unpacked - (GPSPMFlushRequest *message, - ProtobufCAllocator *allocator); -/* GPSPMFlushResponse methods */ -void gpspm_flush_response__init - (GPSPMFlushResponse *message); -size_t gpspm_flush_response__get_packed_size - (const GPSPMFlushResponse *message); -size_t gpspm_flush_response__pack - (const GPSPMFlushResponse *message, - uint8_t *out); -size_t gpspm_flush_response__pack_to_buffer - (const GPSPMFlushResponse *message, - ProtobufCBuffer *buffer); -GPSPMFlushResponse * - gpspm_flush_response__unpack - (ProtobufCAllocator *allocator, - size_t len, - const uint8_t *data); -void gpspm_flush_response__free_unpacked - (GPSPMFlushResponse *message, - ProtobufCAllocator *allocator); -/* --- per-message closures --- */ - -typedef void (*GPSPMFlushRequest_Closure) - (const GPSPMFlushRequest *message, - void *closure_data); -typedef void 
(*GPSPMFlushResponse_Closure) - (const GPSPMFlushResponse *message, - void *closure_data); - -/* --- services --- */ - - -/* --- descriptors --- */ - -extern const ProtobufCMessageDescriptor gpspm_flush_request__descriptor; -extern const ProtobufCMessageDescriptor gpspm_flush_response__descriptor; - -PROTOBUF_C__END_DECLS - - -#endif /* PROTOBUF_C_GPSPM_5fflush_2eproto__INCLUDED */ diff --git a/engines/librpma_gpspm_flush.proto b/engines/librpma_gpspm_flush.proto deleted file mode 100644 index 91765a7fb3..0000000000 --- a/engines/librpma_gpspm_flush.proto +++ /dev/null @@ -1,15 +0,0 @@ -syntax = "proto2"; - -message GPSPM_flush_request { - /* an offset of a region to be flushed within its memory registration */ - required fixed64 offset = 1; - /* a length of a region to be flushed */ - required fixed64 length = 2; - /* a user-defined operation context */ - required fixed64 op_context = 3; -} - -message GPSPM_flush_response { - /* the operation context of a completed request */ - required fixed64 op_context = 1; -} diff --git a/engines/libzbc.c b/engines/libzbc.c index 2bc2c7e0e4..0fa6bfd168 100644 --- a/engines/libzbc.c +++ b/engines/libzbc.c @@ -68,18 +68,9 @@ static int libzbc_open_dev(struct thread_data *td, struct fio_file *f, if (!read_only) flags |= O_RDWR; } else if (td_read(td)) { - if (f->filetype == FIO_TYPE_CHAR && !read_only) - flags |= O_RDWR; - else flags |= O_RDONLY; } - if (td->o.oatomic) { - td_verror(td, EINVAL, "libzbc does not support O_ATOMIC"); - log_err("%s: libzbc does not support O_ATOMIC\n", f->file_name); - return -EINVAL; - } - ld = calloc(1, sizeof(*ld)); if (!ld) return -ENOMEM; @@ -332,6 +323,66 @@ static int libzbc_reset_wp(struct thread_data *td, struct fio_file *f, return -ret; } +static int libzbc_move_zone_wp(struct thread_data *td, struct fio_file *f, + struct zbd_zone *z, uint64_t length, + const char *buf) +{ + struct libzbc_data *ld = td->io_ops_data; + uint64_t sector = z->wp >> 9; + size_t count = length >> 9; + struct 
zbc_errno err; + int ret; + + assert(ld); + assert(ld->zdev); + assert(buf); + + ret = zbc_pwrite(ld->zdev, buf, count, sector); + if (ret == count) + return 0; + + zbc_errno(ld->zdev, &err); + td_verror(td, errno, "zbc_write for write pointer move failed"); + if (err.sk) + log_err("%s: wp move failed %s:%s\n", + f->file_name, + zbc_sk_str(err.sk), zbc_asc_ascq_str(err.asc_ascq)); + return -ret; +} + +static int libzbc_finish_zone(struct thread_data *td, struct fio_file *f, + uint64_t offset, uint64_t length) +{ + struct libzbc_data *ld = td->io_ops_data; + uint64_t sector = offset >> 9; + unsigned int nr_zones; + struct zbc_errno err; + int i, ret; + + assert(ld); + assert(ld->zdev); + + nr_zones = (length + td->o.zone_size - 1) / td->o.zone_size; + assert(nr_zones > 0); + + for (i = 0; i < nr_zones; i++, sector += td->o.zone_size >> 9) { + ret = zbc_finish_zone(ld->zdev, sector, 0); + if (ret) + goto err; + } + + return 0; + +err: + zbc_errno(ld->zdev, &err); + td_verror(td, errno, "zbc_finish_zone failed"); + if (err.sk) + log_err("%s: finish zone failed %s:%s\n", + f->file_name, + zbc_sk_str(err.sk), zbc_asc_ascq_str(err.asc_ascq)); + return -ret; +} + static int libzbc_get_max_open_zones(struct thread_data *td, struct fio_file *f, unsigned int *max_open_zones) { @@ -433,7 +484,9 @@ FIO_STATIC struct ioengine_ops ioengine = { .get_zoned_model = libzbc_get_zoned_model, .report_zones = libzbc_report_zones, .reset_wp = libzbc_reset_wp, + .move_zone_wp = libzbc_move_zone_wp, .get_max_open_zones = libzbc_get_max_open_zones, + .finish_zone = libzbc_finish_zone, .queue = libzbc_queue, .flags = FIO_SYNCIO | FIO_NOEXTEND | FIO_RAWIO, }; diff --git a/engines/mmap.c b/engines/mmap.c index 55ba1ab36c..1585d7238f 100644 --- a/engines/mmap.c +++ b/engines/mmap.c @@ -15,7 +15,7 @@ #include "../verify.h" /* - * Limits us to 1GiB of mapped files in total + * Limits us to 1GiB of mapped files in total on 32-bit architectures */ #define MMAP_TOTAL_SZ (1 * 1024 * 1024 * 1024UL) @@ 
-53,6 +53,7 @@ static bool fio_madvise_file(struct thread_data *td, struct fio_file *f, size_t length) { + int flags; struct fio_mmap_data *fmd = FILE_ENG_DATA(f); #ifdef CONFIG_HAVE_THP struct mmap_options *o = td->eo; @@ -65,16 +66,20 @@ static bool fio_madvise_file(struct thread_data *td, struct fio_file *f, if (!td->o.fadvise_hint) return true; - if (!td_random(td)) { - if (posix_madvise(fmd->mmap_ptr, length, POSIX_MADV_SEQUENTIAL) < 0) { - td_verror(td, errno, "madvise"); - return false; - } - } else { - if (posix_madvise(fmd->mmap_ptr, length, POSIX_MADV_RANDOM) < 0) { - td_verror(td, errno, "madvise"); - return false; - } + if (td->o.fadvise_hint == F_ADV_TYPE) + flags = td_random(td) ? POSIX_MADV_RANDOM : POSIX_MADV_SEQUENTIAL; + else if (td->o.fadvise_hint == F_ADV_RANDOM) + flags = POSIX_MADV_RANDOM; + else if (td->o.fadvise_hint == F_ADV_SEQUENTIAL) + flags = POSIX_MADV_SEQUENTIAL; + else { + log_err("fio: unknown madvise type %d\n", td->o.fadvise_hint); + return false; + } + + if (posix_madvise(fmd->mmap_ptr, length, flags) < 0) { + td_verror(td, errno, "madvise"); + return false; } return true; @@ -152,11 +157,8 @@ static int fio_mmapio_prep_limited(struct thread_data *td, struct io_u *io_u) return EIO; } - fmd->mmap_sz = mmap_map_size; - if (fmd->mmap_sz > f->io_size) - fmd->mmap_sz = f->io_size; - fmd->mmap_off = io_u->offset; + fmd->mmap_sz = io_u->buflen; return fio_mmap_file(td, f, fmd->mmap_sz, fmd->mmap_off); } @@ -172,14 +174,14 @@ static int fio_mmapio_prep_full(struct thread_data *td, struct io_u *io_u) if (fio_file_partial_mmap(f)) return EINVAL; - if (io_u->offset != (size_t) io_u->offset || - f->io_size != (size_t) f->io_size) { + + if (sizeof(size_t) < 8 && f->io_size > mmap_map_size) { fio_file_set_partial_mmap(f); return EINVAL; } fmd->mmap_sz = f->io_size; - fmd->mmap_off = 0; + fmd->mmap_off = f->file_offset; ret = fio_mmap_file(td, f, fmd->mmap_sz, fmd->mmap_off); if (ret) @@ -218,8 +220,7 @@ static int fio_mmapio_prep(struct 
thread_data *td, struct io_u *io_u) } done: - io_u->mmap_data = fmd->mmap_ptr + io_u->offset - fmd->mmap_off - - f->file_offset; + io_u->mmap_data = fmd->mmap_ptr + io_u->offset - fmd->mmap_off; return 0; } diff --git a/engines/nbd.c b/engines/nbd.c index b0ba75e694..7c2d5f4ba6 100644 --- a/engines/nbd.c +++ b/engines/nbd.c @@ -52,7 +52,7 @@ static struct fio_option options[] = { }, }; -/* Alocates nbd_data. */ +/* Allocates nbd_data. */ static int nbd_setup(struct thread_data *td) { struct nbd_data *nbd_data; diff --git a/engines/net.c b/engines/net.c index c6cec5845a..29150bb348 100644 --- a/engines/net.c +++ b/engines/net.c @@ -18,6 +18,16 @@ #include #include +#ifdef CONFIG_VSOCK +#include +#else +struct sockaddr_vm { +}; +#ifndef AF_VSOCK +#define AF_VSOCK -1 +#endif +#endif + #include "../fio.h" #include "../verify.h" #include "../optgroup.h" @@ -30,6 +40,7 @@ struct netio_data { struct sockaddr_in addr; struct sockaddr_in6 addr6; struct sockaddr_un addr_un; + struct sockaddr_vm addr_vm; uint64_t udp_send_seq; uint64_t udp_recv_seq; }; @@ -69,6 +80,7 @@ enum { FIO_TYPE_UNIX = 3, FIO_TYPE_TCP_V6 = 4, FIO_TYPE_UDP_V6 = 5, + FIO_TYPE_VSOCK_STREAM = 6, }; static int str_hostname_cb(void *data, const char *input); @@ -126,6 +138,10 @@ static struct fio_option options[] = { .oval = FIO_TYPE_UNIX, .help = "UNIX domain socket", }, + { .ival = "vsock", + .oval = FIO_TYPE_VSOCK_STREAM, + .help = "Virtual socket", + }, }, .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_NETIO, @@ -223,6 +239,11 @@ static inline int is_ipv6(struct netio_options *o) return o->proto == FIO_TYPE_UDP_V6 || o->proto == FIO_TYPE_TCP_V6; } +static inline int is_vsock(struct netio_options *o) +{ + return o->proto == FIO_TYPE_VSOCK_STREAM; +} + static int set_window_size(struct thread_data *td, int fd) { #ifdef CONFIG_NET_WINDOWSIZE @@ -732,6 +753,9 @@ static int fio_netio_connect(struct thread_data *td, struct fio_file *f) } else if (o->proto == FIO_TYPE_UNIX) { domain = AF_UNIX; type = 
SOCK_STREAM; + } else if (is_vsock(o)) { + domain = AF_VSOCK; + type = SOCK_STREAM; } else { log_err("fio: bad network type %d\n", o->proto); f->fd = -1; @@ -809,7 +833,14 @@ static int fio_netio_connect(struct thread_data *td, struct fio_file *f) close(f->fd); return 1; } + } else if (is_vsock(o)) { + socklen_t len = sizeof(nd->addr_vm); + if (connect(f->fd, (struct sockaddr *) &nd->addr_vm, len) < 0) { + td_verror(td, errno, "connect"); + close(f->fd); + return 1; + } } else { struct sockaddr_un *addr = &nd->addr_un; socklen_t len; @@ -849,6 +880,9 @@ static int fio_netio_accept(struct thread_data *td, struct fio_file *f) if (o->proto == FIO_TYPE_TCP) { socklen = sizeof(nd->addr); f->fd = accept(nd->listenfd, (struct sockaddr *) &nd->addr, &socklen); + } else if (is_vsock(o)) { + socklen = sizeof(nd->addr_vm); + f->fd = accept(nd->listenfd, (struct sockaddr *) &nd->addr_vm, &socklen); } else { socklen = sizeof(nd->addr6); f->fd = accept(nd->listenfd, (struct sockaddr *) &nd->addr6, &socklen); @@ -890,6 +924,9 @@ static void fio_netio_send_close(struct thread_data *td, struct fio_file *f) if (is_ipv6(o)) { to = (struct sockaddr *) &nd->addr6; len = sizeof(nd->addr6); + } else if (is_vsock(o)) { + to = NULL; + len = 0; } else { to = (struct sockaddr *) &nd->addr; len = sizeof(nd->addr); @@ -960,6 +997,9 @@ static int fio_netio_send_open(struct thread_data *td, struct fio_file *f) if (is_ipv6(o)) { len = sizeof(nd->addr6); to = (struct sockaddr *) &nd->addr6; + } else if (is_vsock(o)) { + len = sizeof(nd->addr_vm); + to = (struct sockaddr *) &nd->addr_vm; } else { len = sizeof(nd->addr); to = (struct sockaddr *) &nd->addr; @@ -1023,13 +1063,17 @@ static int fio_fill_addr(struct thread_data *td, const char *host, int af, memset(&hints, 0, sizeof(hints)); - if (is_tcp(o)) + if (is_tcp(o) || is_vsock(o)) hints.ai_socktype = SOCK_STREAM; else hints.ai_socktype = SOCK_DGRAM; if (is_ipv6(o)) hints.ai_family = AF_INET6; +#ifdef CONFIG_VSOCK + else if (is_vsock(o)) + 
hints.ai_family = AF_VSOCK; +#endif else hints.ai_family = AF_INET; @@ -1110,12 +1154,50 @@ static int fio_netio_setup_connect_unix(struct thread_data *td, return 0; } +static int fio_netio_setup_connect_vsock(struct thread_data *td, + const char *host, unsigned short port) +{ +#ifdef CONFIG_VSOCK + struct netio_data *nd = td->io_ops_data; + struct sockaddr_vm *addr = &nd->addr_vm; + int cid; + + if (!host) { + log_err("fio: connect with no host to connect to.\n"); + if (td_read(td)) + log_err("fio: did you forget to set 'listen'?\n"); + + td_verror(td, EINVAL, "no hostname= set"); + return 1; + } + + addr->svm_family = AF_VSOCK; + addr->svm_port = port; + + if (host) { + cid = atoi(host); + if (cid < 0 || cid > UINT32_MAX) { + log_err("fio: invalid CID %d\n", cid); + return 1; + } + addr->svm_cid = cid; + } + + return 0; +#else + td_verror(td, -EINVAL, "vsock not supported"); + return 1; +#endif +} + static int fio_netio_setup_connect(struct thread_data *td) { struct netio_options *o = td->eo; if (is_udp(o) || is_tcp(o)) return fio_netio_setup_connect_inet(td, td->o.filename,o->port); + else if (is_vsock(o)) + return fio_netio_setup_connect_vsock(td, td->o.filename, o->port); else return fio_netio_setup_connect_unix(td, td->o.filename); } @@ -1268,6 +1350,47 @@ static int fio_netio_setup_listen_inet(struct thread_data *td, short port) return 0; } +static int fio_netio_setup_listen_vsock(struct thread_data *td, short port, int type) +{ +#ifdef CONFIG_VSOCK + struct netio_data *nd = td->io_ops_data; + struct sockaddr_vm *addr = &nd->addr_vm; + int fd, opt; + socklen_t len; + + fd = socket(AF_VSOCK, type, 0); + if (fd < 0) { + td_verror(td, errno, "socket"); + return 1; + } + + opt = 1; + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (void *) &opt, sizeof(opt)) < 0) { + td_verror(td, errno, "setsockopt"); + close(fd); + return 1; + } + + len = sizeof(*addr); + + nd->addr_vm.svm_family = AF_VSOCK; + nd->addr_vm.svm_cid = VMADDR_CID_ANY; + nd->addr_vm.svm_port = port; + 
+ if (bind(fd, (struct sockaddr *) addr, len) < 0) { + td_verror(td, errno, "bind"); + close(fd); + return 1; + } + + nd->listenfd = fd; + return 0; +#else + td_verror(td, -EINVAL, "vsock not supported"); + return -1; +#endif +} + static int fio_netio_setup_listen(struct thread_data *td) { struct netio_data *nd = td->io_ops_data; @@ -1276,6 +1399,8 @@ static int fio_netio_setup_listen(struct thread_data *td) if (is_udp(o) || is_tcp(o)) ret = fio_netio_setup_listen_inet(td, o->port); + else if (is_vsock(o)) + ret = fio_netio_setup_listen_vsock(td, o->port, SOCK_STREAM); else ret = fio_netio_setup_listen_unix(td, td->o.filename); @@ -1311,6 +1436,9 @@ static int fio_netio_init(struct thread_data *td) if (o->proto == FIO_TYPE_UNIX && o->port) { log_err("fio: network IO port not valid with unix socket\n"); return 1; + } else if (is_vsock(o) && !o->port) { + log_err("fio: network IO requires port for vsock\n"); + return 1; } else if (o->proto != FIO_TYPE_UNIX && !o->port) { log_err("fio: network IO requires port for tcp or udp\n"); return 1; @@ -1318,7 +1446,7 @@ static int fio_netio_init(struct thread_data *td) o->port += td->subjob_number; - if (!is_tcp(o)) { + if (!is_tcp(o) && !is_vsock(o)) { if (o->listen) { log_err("fio: listen only valid for TCP proto IO\n"); return 1; @@ -1370,9 +1498,7 @@ static int fio_netio_setup(struct thread_data *td) } if (!td->io_ops_data) { - nd = malloc(sizeof(*nd)); - - memset(nd, 0, sizeof(*nd)); + nd = calloc(1, sizeof(*nd)); nd->listenfd = -1; nd->pipes[0] = nd->pipes[1] = -1; td->io_ops_data = nd; diff --git a/engines/nfs.c b/engines/nfs.c index 21be88334d..13b55038cc 100644 --- a/engines/nfs.c +++ b/engines/nfs.c @@ -16,10 +16,17 @@ enum nfs_op_type { struct fio_libnfs_options { struct nfs_context *context; char *nfs_url; - unsigned int queue_depth; /* nfs_callback needs this info, but doesn't have fio td structure to pull it from */ + /* nfs_callback needs this info, but doesn't have fio td structure to + * pull it from + */ + 
unsigned int queue_depth; + /* the following implement a circular queue of outstanding IOs */ - int outstanding_events; /* IOs issued to libnfs, that have not returned yet */ - int prev_requested_event_index; /* event last returned via fio_libnfs_event */ + + /* IOs issued to libnfs, that have not returned yet */ + int outstanding_events; + /* event last returned via fio_libnfs_event */ + int prev_requested_event_index; int next_buffered_event; /* round robin-pointer within events[] */ int buffered_event_count; /* IOs completed by libnfs, waiting for FIO */ int free_event_buffer_index; /* next free buffer */ @@ -33,11 +40,12 @@ struct nfs_data { static struct fio_option options[] = { { - .name = "nfs_url", - .lname = "nfs_url", - .type = FIO_OPT_STR_STORE, - .help = "URL in libnfs format, eg nfs:///path[?arg=val[&arg=val]*]", - .off1 = offsetof(struct fio_libnfs_options, nfs_url), + .name = "nfs_url", + .lname = "nfs_url", + .type = FIO_OPT_STR_STORE, + .help = "URL in libnfs format, eg nfs:///path[?arg=val[&arg=val]*]", + .off1 = offsetof(struct fio_libnfs_options, nfs_url), .category = FIO_OPT_C_ENGINE, .group = __FIO_OPT_G_NFS, }, @@ -50,44 +58,53 @@ static struct io_u *fio_libnfs_event(struct thread_data *td, int event) { struct fio_libnfs_options *o = td->eo; struct io_u *io_u = o->events[o->next_buffered_event]; + assert(o->events[o->next_buffered_event]); o->events[o->next_buffered_event] = NULL; o->next_buffered_event = (o->next_buffered_event + 1) % td->o.iodepth; + /* validate our state machine */ assert(o->buffered_event_count); o->buffered_event_count--; assert(io_u); + /* assert that fio_libnfs_event is being called in sequential fashion */ assert(event == 0 || o->prev_requested_event_index + 1 == event); - if (o->buffered_event_count == 0) { + if (o->buffered_event_count == 0) o->prev_requested_event_index = -1; - } else { + else o->prev_requested_event_index = event; - } return io_u; } -static int nfs_event_loop(struct thread_data *td, bool flush) { 
+/* + * fio core logic seems to stop calling this event-loop if we ever return with + * 0 events + */ +#define SHOULD_WAIT(td, o, flush) \ + ((o)->outstanding_events == (td)->o.iodepth || \ + (flush && (o)->outstanding_events)) + +static int nfs_event_loop(struct thread_data *td, bool flush) +{ struct fio_libnfs_options *o = td->eo; struct pollfd pfds[1]; /* nfs:0 */ + /* we already have stuff queued for fio, no need to waste cpu on poll() */ if (o->buffered_event_count) return o->buffered_event_count; - /* fio core logic seems to stop calling this event-loop if we ever return with 0 events */ - #define SHOULD_WAIT() (o->outstanding_events == td->o.iodepth || (flush && o->outstanding_events)) do { - int timeout = SHOULD_WAIT() ? -1 : 0; + int timeout = SHOULD_WAIT(td, o, flush) ? -1 : 0; int ret = 0; + pfds[0].fd = nfs_get_fd(o->context); pfds[0].events = nfs_which_events(o->context); ret = poll(&pfds[0], 1, timeout); if (ret < 0) { - if (errno == EINTR || errno == EAGAIN) { + if (errno == EINTR || errno == EAGAIN) continue; - } - log_err("nfs: failed to poll events: %s.\n", - strerror(errno)); + log_err("nfs: failed to poll events: %s\n", strerror(errno)); break; } @@ -96,27 +113,30 @@ static int nfs_event_loop(struct thread_data *td, bool flush) { log_err("nfs: socket is in an unrecoverable error state.\n"); break; } - } while (SHOULD_WAIT()); + } while (SHOULD_WAIT(td, o, flush)); + return o->buffered_event_count; -#undef SHOULD_WAIT } static int fio_libnfs_getevents(struct thread_data *td, unsigned int min, - unsigned int max, const struct timespec *t) + unsigned int max, const struct timespec *t) { return nfs_event_loop(td, false); } static void nfs_callback(int res, struct nfs_context *nfs, void *data, - void *private_data) + void *private_data) { struct io_u *io_u = private_data; struct nfs_data *nfs_data = io_u->file->engine_data; struct fio_libnfs_options *o = nfs_data->options; if (res < 0) { - log_err("Failed NFS operation(code:%d): %s\n", res, 
nfs_get_error(o->context)); + log_err("Failed NFS operation(code:%d): %s\n", res, + nfs_get_error(o->context)); io_u->error = -res; - /* res is used for read math below, don't wanna pass negative there */ + /* res is used for read math below, don't want to pass negative + * there + */ res = 0; } else if (io_u->ddir == DDIR_READ) { memcpy(io_u->buf, data, res); @@ -133,42 +153,58 @@ static void nfs_callback(int res, struct nfs_context *nfs, void *data, o->buffered_event_count++; } -static int queue_write(struct fio_libnfs_options *o, struct io_u *io_u) { +static int queue_write(struct fio_libnfs_options *o, struct io_u *io_u) +{ struct nfs_data *nfs_data = io_u->engine_data; + +#ifdef LIBNFS_API_V2 return nfs_pwrite_async(o->context, nfs_data->nfsfh, - io_u->offset, io_u->buflen, io_u->buf, nfs_callback, - io_u); + io_u->buf, io_u->buflen, io_u->offset, + nfs_callback, io_u); +#else + return nfs_pwrite_async(o->context, nfs_data->nfsfh, io_u->offset, + io_u->buflen, io_u->buf, nfs_callback, io_u); +#endif } -static int queue_read(struct fio_libnfs_options *o, struct io_u *io_u) { +static int queue_read(struct fio_libnfs_options *o, struct io_u *io_u) +{ struct nfs_data *nfs_data = io_u->engine_data; - return nfs_pread_async(o->context, nfs_data->nfsfh, io_u->offset, io_u->buflen, nfs_callback, io_u); + +#ifdef LIBNFS_API_V2 + return nfs_pread_async(o->context, nfs_data->nfsfh, + io_u->buf, io_u->buflen, io_u->offset, + nfs_callback, io_u); +#else + return nfs_pread_async(o->context, nfs_data->nfsfh, io_u->offset, + io_u->buflen, nfs_callback, io_u); +#endif } static enum fio_q_status fio_libnfs_queue(struct thread_data *td, - struct io_u *io_u) + struct io_u *io_u) { struct nfs_data *nfs_data = io_u->file->engine_data; struct fio_libnfs_options *o = nfs_data->options; struct nfs_context *nfs = o->context; - int err; enum fio_q_status ret = FIO_Q_QUEUED; + int err; io_u->engine_data = nfs_data; - switch(io_u->ddir) { - case DDIR_WRITE: - err = queue_write(o, io_u); - 
break; - case DDIR_READ: - err = queue_read(o, io_u); - break; - case DDIR_TRIM: - log_err("nfs: trim is not supported"); - err = -1; - break; - default: - log_err("nfs: unhandled io %d\n", io_u->ddir); - err = -1; + switch (io_u->ddir) { + case DDIR_WRITE: + err = queue_write(o, io_u); + break; + case DDIR_READ: + err = queue_read(o, io_u); + break; + case DDIR_TRIM: + log_err("nfs: trim is not supported"); + err = -1; + break; + default: + log_err("nfs: unhandled io %d\n", io_u->ddir); + err = -1; } if (err) { log_err("nfs: Failed to queue nfs op: %s\n", nfs_get_error(nfs)); @@ -195,13 +231,12 @@ static int do_mount(struct thread_data *td, const char *url) return 0; options->context = nfs_init_context(); - if (options->context == NULL) { + if (!options->context) { log_err("nfs: failed to init nfs context\n"); return -1; } - options->events = malloc(event_size); - memset(options->events, 0, event_size); + options->events = calloc(1, event_size); options->prev_requested_event_index = -1; options->queue_depth = td->o.iodepth; @@ -219,7 +254,9 @@ static int do_mount(struct thread_data *td, const char *url) static int fio_libnfs_setup(struct thread_data *td) { - /* Using threads with libnfs causes fio to hang on exit, lower performance */ + /* Using threads with libnfs causes fio to hang on exit, lower + * performance + */ td->o.use_thread = 0; return 0; } @@ -227,6 +264,7 @@ static int fio_libnfs_setup(struct thread_data *td) static void fio_libnfs_cleanup(struct thread_data *td) { struct fio_libnfs_options *o = td->eo; + nfs_umount(o->context); nfs_destroy_context(o->context); free(o->events); @@ -234,10 +272,10 @@ static void fio_libnfs_cleanup(struct thread_data *td) static int fio_libnfs_open(struct thread_data *td, struct fio_file *f) { - int ret; struct fio_libnfs_options *options = td->eo; struct nfs_data *nfs_data = NULL; int flags = 0; + int ret; if (!options->nfs_url) { log_err("nfs: nfs_url is a required parameter\n"); @@ -246,23 +284,24 @@ static int 
fio_libnfs_open(struct thread_data *td, struct fio_file *f) ret = do_mount(td, options->nfs_url); - if (ret != 0) { - log_err("nfs: Failed to mount %s with code %d: %s\n", options->nfs_url, ret, nfs_get_error(options->context)); + if (ret) { + log_err("nfs: Failed to mount %s with code %d: %s\n", + options->nfs_url, ret, nfs_get_error(options->context)); return ret; } - nfs_data = malloc(sizeof(struct nfs_data)); - memset(nfs_data, 0, sizeof(struct nfs_data)); + nfs_data = calloc(1, sizeof(struct nfs_data)); nfs_data->options = options; - if (td->o.td_ddir == TD_DDIR_WRITE) { + if (td_write(td)) flags |= O_CREAT | O_RDWR; - } else { + else flags |= O_RDWR; - } + ret = nfs_open(options->context, f->file_name, flags, &nfs_data->nfsfh); - if (ret != 0) - log_err("Failed to open %s: %s\n", f->file_name, nfs_get_error(options->context)); + if (ret) + log_err("Failed to open %s: %s\n", f->file_name, + nfs_get_error(options->context)); f->engine_data = nfs_data; return ret; } @@ -272,22 +311,16 @@ static int fio_libnfs_close(struct thread_data *td, struct fio_file *f) struct nfs_data *nfs_data = f->engine_data; struct fio_libnfs_options *o = nfs_data->options; int ret = 0; + if (nfs_data->nfsfh) ret = nfs_close(o->context, nfs_data->nfsfh); + free(nfs_data); f->engine_data = NULL; return ret; } -/* - * Hook for writing out outstanding data. 
- */ -static int fio_libnfs_commit(struct thread_data *td) { - nfs_event_loop(td, true); - return 0; -} - -struct ioengine_ops ioengine = { +static struct ioengine_ops ioengine = { .name = "nfs", .version = FIO_IOOPS_VERSION, .setup = fio_libnfs_setup, @@ -297,8 +330,7 @@ struct ioengine_ops ioengine = { .cleanup = fio_libnfs_cleanup, .open_file = fio_libnfs_open, .close_file = fio_libnfs_close, - .commit = fio_libnfs_commit, - .flags = FIO_DISKLESSIO | FIO_NOEXTEND | FIO_NODISKUTIL, + .flags = FIO_DISKLESSIO | FIO_NOEXTEND | FIO_NODISKUTIL, .options = options, .option_struct_size = sizeof(struct fio_libnfs_options), }; diff --git a/engines/null.c b/engines/null.c index 4cc0102b99..7236ec9488 100644 --- a/engines/null.c +++ b/engines/null.c @@ -6,7 +6,8 @@ * * It also can act as external C++ engine - compiled with: * - * g++ -O2 -g -shared -rdynamic -fPIC -o cpp_null null.c -DFIO_EXTERNAL_ENGINE + * g++ -O2 -g -shared -rdynamic -fPIC -o cpp_null null.c \ + * -include ../config-host.h -DFIO_EXTERNAL_ENGINE * * to test it execute: * @@ -43,9 +44,28 @@ static int null_getevents(struct null_data *nd, unsigned int min_events, return ret; } +static void null_queued(struct thread_data *td, struct null_data *nd) +{ + struct timespec now; + + if (!fio_fill_issue_time(td)) + return; + + fio_gettime(&now, NULL); + + for (int i = 0; i < nd->queued; i++) { + struct io_u *io_u = nd->io_us[i]; + + memcpy(&io_u->issue_time, &now, sizeof(now)); + io_u_queued(td, io_u); + } +} + static int null_commit(struct thread_data *td, struct null_data *nd) { if (!nd->events) { + null_queued(td, nd); + #ifndef FIO_EXTERNAL_ENGINE io_u_mark_submit(td, nd->queued); #endif @@ -86,16 +106,18 @@ static void null_cleanup(struct null_data *nd) static struct null_data *null_init(struct thread_data *td) { - struct null_data *nd = (struct null_data *) malloc(sizeof(*nd)); + struct null_data *nd; + nd = malloc(sizeof(*nd)); memset(nd, 0, sizeof(*nd)); if (td->o.iodepth != 1) { - nd->io_us = (struct io_u 
**) malloc(td->o.iodepth * sizeof(struct io_u *)); - memset(nd->io_us, 0, td->o.iodepth * sizeof(struct io_u *)); + nd->io_us = calloc(td->o.iodepth, sizeof(struct io_u *)); + td->io_ops->flags |= FIO_ASYNCIO_SETS_ISSUE_TIME; } else td->io_ops->flags |= FIO_SYNCIO; + td_set_ioengine_flags(td); return nd; } @@ -201,7 +223,7 @@ struct NullData { return null_commit(td, impl_); } - int fio_null_queue(struct thread_data *td, struct io_u *io_u) + fio_q_status fio_null_queue(struct thread_data *td, struct io_u *io_u) { return null_queue(td, impl_, io_u); } @@ -233,7 +255,7 @@ static int fio_null_commit(struct thread_data *td) return NullData::get(td)->fio_null_commit(td); } -static int fio_null_queue(struct thread_data *td, struct io_u *io_u) +static fio_q_status fio_null_queue(struct thread_data *td, struct io_u *io_u) { return NullData::get(td)->fio_null_queue(td, io_u); } diff --git a/engines/nvme.c b/engines/nvme.c new file mode 100644 index 0000000000..528b2a29ae --- /dev/null +++ b/engines/nvme.c @@ -0,0 +1,906 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * nvme structure declarations and helper functions for the + * io_uring_cmd engine. 
+ */ + +#include "nvme.h" +#include "../crc/crc-t10dif.h" +#include "../crc/crc64.h" + +static void fio_nvme_generate_pi_16b_guard(struct nvme_data *data, + struct io_u *io_u, + struct nvme_cmd_ext_io_opts *opts) +{ + struct nvme_pi_data *pi_data = io_u->engine_data; + struct nvme_16b_guard_pif *pi; + unsigned char *buf = io_u->xfer_buf; + unsigned char *md_buf = io_u->mmap_data; + __u64 slba = get_slba(data, io_u->offset); + __u32 nlb = get_nlb(data, io_u->xfer_buflen) + 1; + __u32 lba_num = 0; + __u16 guard = 0; + + if (data->pi_loc) { + if (data->lba_ext) + pi_data->interval = data->lba_ext - data->ms; + else + pi_data->interval = 0; + } else { + if (data->lba_ext) + pi_data->interval = data->lba_ext - sizeof(struct nvme_16b_guard_pif); + else + pi_data->interval = data->ms - sizeof(struct nvme_16b_guard_pif); + } + + if (io_u->ddir != DDIR_WRITE) + return; + + while (lba_num < nlb) { + if (data->lba_ext) + pi = (struct nvme_16b_guard_pif *)(buf + pi_data->interval); + else + pi = (struct nvme_16b_guard_pif *)(md_buf + pi_data->interval); + + if (opts->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) { + if (data->lba_ext) { + guard = fio_crc_t10dif(0, buf, pi_data->interval); + } else { + guard = fio_crc_t10dif(0, buf, data->lba_size); + guard = fio_crc_t10dif(guard, md_buf, pi_data->interval); + } + pi->guard = cpu_to_be16(guard); + } + + if (opts->io_flags & NVME_IO_PRINFO_PRCHK_APP) + pi->apptag = cpu_to_be16(pi_data->apptag); + + if (opts->io_flags & NVME_IO_PRINFO_PRCHK_REF) { + switch (data->pi_type) { + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + pi->srtag = cpu_to_be32((__u32)slba + lba_num); + break; + case NVME_NS_DPS_PI_TYPE3: + break; + } + } + if (data->lba_ext) { + buf += data->lba_ext; + } else { + buf += data->lba_size; + md_buf += data->ms; + } + lba_num++; + } +} + +static int fio_nvme_verify_pi_16b_guard(struct nvme_data *data, + struct io_u *io_u) +{ + struct nvme_pi_data *pi_data = io_u->engine_data; + struct nvme_16b_guard_pif *pi; + 
struct fio_file *f = io_u->file; + unsigned char *buf = io_u->xfer_buf; + unsigned char *md_buf = io_u->mmap_data; + __u64 slba = get_slba(data, io_u->offset); + __u32 nlb = get_nlb(data, io_u->xfer_buflen) + 1; + __u32 lba_num = 0; + __u16 unmask_app, unmask_app_exp, guard = 0; + + while (lba_num < nlb) { + if (data->lba_ext) + pi = (struct nvme_16b_guard_pif *)(buf + pi_data->interval); + else + pi = (struct nvme_16b_guard_pif *)(md_buf + pi_data->interval); + + if (data->pi_type == NVME_NS_DPS_PI_TYPE3) { + if (pi->apptag == NVME_PI_APP_DISABLE && + pi->srtag == NVME_PI_REF_DISABLE) + goto next; + } else if (data->pi_type == NVME_NS_DPS_PI_TYPE1 || + data->pi_type == NVME_NS_DPS_PI_TYPE2) { + if (pi->apptag == NVME_PI_APP_DISABLE) + goto next; + } + + if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) { + if (data->lba_ext) { + guard = fio_crc_t10dif(0, buf, pi_data->interval); + } else { + guard = fio_crc_t10dif(0, buf, data->lba_size); + guard = fio_crc_t10dif(guard, md_buf, pi_data->interval); + } + if (be16_to_cpu(pi->guard) != guard) { + log_err("%s: Guard compare error: LBA: %llu Expected=%x, Actual=%x\n", + f->file_name, (unsigned long long)slba, + guard, be16_to_cpu(pi->guard)); + return -EIO; + } + } + + if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_APP) { + unmask_app = be16_to_cpu(pi->apptag) & pi_data->apptag_mask; + unmask_app_exp = pi_data->apptag & pi_data->apptag_mask; + if (unmask_app != unmask_app_exp) { + log_err("%s: APPTAG compare error: LBA: %llu Expected=%x, Actual=%x\n", + f->file_name, (unsigned long long)slba, + unmask_app_exp, unmask_app); + return -EIO; + } + } + + if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_REF) { + switch (data->pi_type) { + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + if (be32_to_cpu(pi->srtag) != + ((__u32)slba + lba_num)) { + log_err("%s: REFTAG compare error: LBA: %llu Expected=%x, Actual=%x\n", + f->file_name, (unsigned long long)slba, + (__u32)slba + lba_num, + be32_to_cpu(pi->srtag)); + return 
-EIO; + } + break; + case NVME_NS_DPS_PI_TYPE3: + break; + } + } +next: + if (data->lba_ext) { + buf += data->lba_ext; + } else { + buf += data->lba_size; + md_buf += data->ms; + } + lba_num++; + } + + return 0; +} + +static void fio_nvme_generate_pi_64b_guard(struct nvme_data *data, + struct io_u *io_u, + struct nvme_cmd_ext_io_opts *opts) +{ + struct nvme_pi_data *pi_data = io_u->engine_data; + struct nvme_64b_guard_pif *pi; + unsigned char *buf = io_u->xfer_buf; + unsigned char *md_buf = io_u->mmap_data; + uint64_t guard = 0; + __u64 slba = get_slba(data, io_u->offset); + __u32 nlb = get_nlb(data, io_u->xfer_buflen) + 1; + __u32 lba_num = 0; + + if (data->pi_loc) { + if (data->lba_ext) + pi_data->interval = data->lba_ext - data->ms; + else + pi_data->interval = 0; + } else { + if (data->lba_ext) + pi_data->interval = data->lba_ext - sizeof(struct nvme_64b_guard_pif); + else + pi_data->interval = data->ms - sizeof(struct nvme_64b_guard_pif); + } + + if (io_u->ddir != DDIR_WRITE) + return; + + while (lba_num < nlb) { + if (data->lba_ext) + pi = (struct nvme_64b_guard_pif *)(buf + pi_data->interval); + else + pi = (struct nvme_64b_guard_pif *)(md_buf + pi_data->interval); + + if (opts->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) { + if (data->lba_ext) { + guard = fio_crc64_nvme(0, buf, pi_data->interval); + } else { + guard = fio_crc64_nvme(0, buf, data->lba_size); + guard = fio_crc64_nvme(guard, md_buf, pi_data->interval); + } + pi->guard = cpu_to_be64(guard); + } + + if (opts->io_flags & NVME_IO_PRINFO_PRCHK_APP) + pi->apptag = cpu_to_be16(pi_data->apptag); + + if (opts->io_flags & NVME_IO_PRINFO_PRCHK_REF) { + switch (data->pi_type) { + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + put_unaligned_be48(slba + lba_num, pi->srtag); + break; + case NVME_NS_DPS_PI_TYPE3: + break; + } + } + if (data->lba_ext) { + buf += data->lba_ext; + } else { + buf += data->lba_size; + md_buf += data->ms; + } + lba_num++; + } +} + +static int 
fio_nvme_verify_pi_64b_guard(struct nvme_data *data, + struct io_u *io_u) +{ + struct nvme_pi_data *pi_data = io_u->engine_data; + struct nvme_64b_guard_pif *pi; + struct fio_file *f = io_u->file; + unsigned char *buf = io_u->xfer_buf; + unsigned char *md_buf = io_u->mmap_data; + __u64 slba = get_slba(data, io_u->offset); + __u64 ref, ref_exp, guard = 0; + __u32 nlb = get_nlb(data, io_u->xfer_buflen) + 1; + __u32 lba_num = 0; + __u16 unmask_app, unmask_app_exp; + + while (lba_num < nlb) { + if (data->lba_ext) + pi = (struct nvme_64b_guard_pif *)(buf + pi_data->interval); + else + pi = (struct nvme_64b_guard_pif *)(md_buf + pi_data->interval); + + if (data->pi_type == NVME_NS_DPS_PI_TYPE3) { + if (pi->apptag == NVME_PI_APP_DISABLE && + fio_nvme_pi_ref_escape(pi->srtag)) + goto next; + } else if (data->pi_type == NVME_NS_DPS_PI_TYPE1 || + data->pi_type == NVME_NS_DPS_PI_TYPE2) { + if (pi->apptag == NVME_PI_APP_DISABLE) + goto next; + } + + if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) { + if (data->lba_ext) { + guard = fio_crc64_nvme(0, buf, pi_data->interval); + } else { + guard = fio_crc64_nvme(0, buf, data->lba_size); + guard = fio_crc64_nvme(guard, md_buf, pi_data->interval); + } + if (be64_to_cpu((uint64_t)pi->guard) != guard) { + log_err("%s: Guard compare error: LBA: %llu Expected=%llx, Actual=%llx\n", + f->file_name, (unsigned long long)slba, + guard, be64_to_cpu((uint64_t)pi->guard)); + return -EIO; + } + } + + if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_APP) { + unmask_app = be16_to_cpu(pi->apptag) & pi_data->apptag_mask; + unmask_app_exp = pi_data->apptag & pi_data->apptag_mask; + if (unmask_app != unmask_app_exp) { + log_err("%s: APPTAG compare error: LBA: %llu Expected=%x, Actual=%x\n", + f->file_name, (unsigned long long)slba, + unmask_app_exp, unmask_app); + return -EIO; + } + } + + if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_REF) { + switch (data->pi_type) { + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + ref = 
get_unaligned_be48(pi->srtag); + ref_exp = (slba + lba_num) & ((1ULL << 48) - 1); + if (ref != ref_exp) { + log_err("%s: REFTAG compare error: LBA: %llu Expected=%llx, Actual=%llx\n", + f->file_name, (unsigned long long)slba, + ref_exp, ref); + return -EIO; + } + break; + case NVME_NS_DPS_PI_TYPE3: + break; + } + } +next: + if (data->lba_ext) { + buf += data->lba_ext; + } else { + buf += data->lba_size; + md_buf += data->ms; + } + lba_num++; + } + + return 0; +} +static void fio_nvme_uring_cmd_trim_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u, + struct nvme_dsm *dsm) +{ + struct nvme_data *data = FILE_ENG_DATA(io_u->file); + struct trim_range *range; + uint8_t *buf_point; + int i; + + cmd->opcode = nvme_cmd_dsm; + cmd->nsid = data->nsid; + cmd->cdw11 = NVME_ATTRIBUTE_DEALLOCATE; + cmd->addr = (__u64) (uintptr_t) (&dsm->range[0]); + + if (dsm->nr_ranges == 1) { + dsm->range[0].slba = get_slba(data, io_u->offset); + /* nlb is a 1-based value for deallocate */ + dsm->range[0].nlb = get_nlb(data, io_u->xfer_buflen) + 1; + cmd->cdw10 = 0; + cmd->data_len = sizeof(struct nvme_dsm_range); + } else { + buf_point = io_u->xfer_buf; + for (i = 0; i < io_u->number_trim; i++) { + range = (struct trim_range *)buf_point; + dsm->range[i].slba = get_slba(data, range->start); + /* nlb is a 1-based value for deallocate */ + dsm->range[i].nlb = get_nlb(data, range->len) + 1; + buf_point += sizeof(struct trim_range); + } + cmd->cdw10 = io_u->number_trim - 1; + cmd->data_len = io_u->number_trim * sizeof(struct nvme_dsm_range); + } +} + +int fio_nvme_uring_cmd_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u, + struct iovec *iov, struct nvme_dsm *dsm, + uint8_t read_opcode, uint8_t write_opcode, + unsigned int cdw12_flags) +{ + struct nvme_data *data = FILE_ENG_DATA(io_u->file); + __u64 slba; + __u32 nlb; + + memset(cmd, 0, sizeof(struct nvme_uring_cmd)); + + switch (io_u->ddir) { + case DDIR_READ: + cmd->opcode = read_opcode; + break; + case DDIR_WRITE: + cmd->opcode = 
write_opcode; + break; + case DDIR_TRIM: + fio_nvme_uring_cmd_trim_prep(cmd, io_u, dsm); + return 0; + case DDIR_SYNC: + case DDIR_DATASYNC: + cmd->opcode = nvme_cmd_flush; + cmd->nsid = data->nsid; + return 0; + default: + return -ENOTSUP; + } + + slba = get_slba(data, io_u->offset); + nlb = get_nlb(data, io_u->xfer_buflen); + + /* cdw10 and cdw11 represent starting lba */ + cmd->cdw10 = slba & 0xffffffff; + cmd->cdw11 = slba >> 32; + /* cdw12 represent number of lba's for read/write */ + cmd->cdw12 = nlb | (io_u->dtype << 20) | cdw12_flags; + cmd->cdw13 = io_u->dspec << 16; + if (iov) { + iov->iov_base = io_u->xfer_buf; + iov->iov_len = io_u->xfer_buflen; + cmd->addr = (__u64)(uintptr_t)iov; + cmd->data_len = 1; + } else { + /* no buffer for write zeroes */ + if (cmd->opcode != nvme_cmd_write_zeroes) + cmd->addr = (__u64)(uintptr_t)io_u->xfer_buf; + else + cmd->addr = (__u64)(uintptr_t)NULL; + cmd->data_len = io_u->xfer_buflen; + } + if (data->lba_shift && data->ms) { + cmd->metadata = (__u64)(uintptr_t)io_u->mmap_data; + cmd->metadata_len = (nlb + 1) * data->ms; + } + cmd->nsid = data->nsid; + return 0; +} + +void fio_nvme_generate_guard(struct io_u *io_u, struct nvme_cmd_ext_io_opts *opts) +{ + struct nvme_data *data = FILE_ENG_DATA(io_u->file); + + if (data->pi_type && !(opts->io_flags & NVME_IO_PRINFO_PRACT)) { + if (data->guard_type == NVME_NVM_NS_16B_GUARD) + fio_nvme_generate_pi_16b_guard(data, io_u, opts); + else if (data->guard_type == NVME_NVM_NS_64B_GUARD) + fio_nvme_generate_pi_64b_guard(data, io_u, opts); + } +} + +void fio_nvme_pi_fill(struct nvme_uring_cmd *cmd, struct io_u *io_u, + struct nvme_cmd_ext_io_opts *opts) +{ + struct nvme_data *data = FILE_ENG_DATA(io_u->file); + __u64 slba; + + slba = get_slba(data, io_u->offset); + cmd->cdw12 |= opts->io_flags; + + fio_nvme_generate_guard(io_u, opts); + + switch (data->pi_type) { + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + switch (data->guard_type) { + case NVME_NVM_NS_16B_GUARD: + if 
(opts->io_flags & NVME_IO_PRINFO_PRCHK_REF) + cmd->cdw14 = (__u32)slba; + break; + case NVME_NVM_NS_64B_GUARD: + if (opts->io_flags & NVME_IO_PRINFO_PRCHK_REF) { + cmd->cdw14 = (__u32)slba; + cmd->cdw3 = ((slba >> 32) & 0xffff); + } + break; + default: + break; + } + if (opts->io_flags & NVME_IO_PRINFO_PRCHK_APP) + cmd->cdw15 = (opts->apptag_mask << 16 | opts->apptag); + break; + case NVME_NS_DPS_PI_TYPE3: + if (opts->io_flags & NVME_IO_PRINFO_PRCHK_APP) + cmd->cdw15 = (opts->apptag_mask << 16 | opts->apptag); + break; + case NVME_NS_DPS_PI_NONE: + break; + } +} + +int fio_nvme_pi_verify(struct nvme_data *data, struct io_u *io_u) +{ + int ret = 0; + + switch (data->guard_type) { + case NVME_NVM_NS_16B_GUARD: + ret = fio_nvme_verify_pi_16b_guard(data, io_u); + break; + case NVME_NVM_NS_64B_GUARD: + ret = fio_nvme_verify_pi_64b_guard(data, io_u); + break; + default: + break; + } + + return ret; +} + +static int nvme_identify(int fd, __u32 nsid, enum nvme_identify_cns cns, + enum nvme_csi csi, void *data) +{ + struct nvme_passthru_cmd cmd = { + .opcode = nvme_admin_identify, + .nsid = nsid, + .addr = (__u64)(uintptr_t)data, + .data_len = NVME_IDENTIFY_DATA_SIZE, + .cdw10 = cns, + .cdw11 = csi << NVME_IDENTIFY_CSI_SHIFT, + .timeout_ms = NVME_DEFAULT_IOCTL_TIMEOUT, + }; + + return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd); +} + +int fio_nvme_get_info(struct fio_file *f, __u64 *nlba, __u32 pi_act, + struct nvme_data *data) +{ + struct nvme_id_ns ns; + struct nvme_id_ctrl ctrl; + struct nvme_nvm_id_ns nvm_ns; + int namespace_id; + int fd, err; + __u32 format_idx, elbaf; + + if (f->filetype != FIO_TYPE_CHAR) { + log_err("ioengine io_uring_cmd only works with nvme ns " + "generic char devices (/dev/ngXnY)\n"); + return 1; + } + + fd = open(f->file_name, O_RDONLY); + if (fd < 0) + return -errno; + + namespace_id = ioctl(fd, NVME_IOCTL_ID); + if (namespace_id < 0) { + err = -errno; + log_err("%s: failed to fetch namespace-id\n", f->file_name); + goto out; + } + + err = 
nvme_identify(fd, 0, NVME_IDENTIFY_CNS_CTRL, NVME_CSI_NVM, &ctrl); + if (err) { + log_err("%s: failed to fetch identify ctrl\n", f->file_name); + goto out; + } + + /* + * Identify namespace to get namespace-id, namespace size in LBA's + * and LBA data size. + */ + err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_NS, + NVME_CSI_NVM, &ns); + if (err) { + log_err("%s: failed to fetch identify namespace\n", + f->file_name); + goto out; + } + + data->nsid = namespace_id; + + /* + * 16 or 64 as maximum number of supported LBA formats. + * From flbas bit 0-3 indicates lsb and bit 5-6 indicates msb + * of the format index used to format the namespace. + */ + if (ns.nlbaf < 16) + format_idx = ns.flbas & 0xf; + else + format_idx = (ns.flbas & 0xf) + (((ns.flbas >> 5) & 0x3) << 4); + + data->lba_size = 1 << ns.lbaf[format_idx].ds; + data->ms = le16_to_cpu(ns.lbaf[format_idx].ms); + + /* Check for end to end data protection support */ + if (data->ms && (ns.dps & NVME_NS_DPS_PI_MASK)) + data->pi_type = (ns.dps & NVME_NS_DPS_PI_MASK); + + if (!data->pi_type) + goto check_elba; + + if (ctrl.ctratt & NVME_CTRL_CTRATT_ELBAS) { + err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_CSI_NS, + NVME_CSI_NVM, &nvm_ns); + if (err) { + log_err("%s: failed to fetch identify nvm namespace\n", + f->file_name); + goto out; + } + + elbaf = le32_to_cpu(nvm_ns.elbaf[format_idx]); + + /* Currently we don't support storage tags */ + if (elbaf & NVME_ID_NS_NVM_STS_MASK) { + log_err("%s: Storage tag not supported\n", + f->file_name); + err = -ENOTSUP; + goto out; + } + + data->guard_type = (elbaf >> NVME_ID_NS_NVM_GUARD_SHIFT) & + NVME_ID_NS_NVM_GUARD_MASK; + + /* No 32 bit guard, as storage tag is mandatory for it */ + switch (data->guard_type) { + case NVME_NVM_NS_16B_GUARD: + data->pi_size = sizeof(struct nvme_16b_guard_pif); + break; + case NVME_NVM_NS_64B_GUARD: + data->pi_size = sizeof(struct nvme_64b_guard_pif); + break; + default: + break; + } + } else { + data->guard_type = 
NVME_NVM_NS_16B_GUARD; + data->pi_size = sizeof(struct nvme_16b_guard_pif); + } + + /* + * when PRACT bit is set to 1, and metadata size is equal to protection + * information size, controller inserts and removes PI for write and + * read commands respectively. + */ + if (pi_act && data->ms == data->pi_size) + data->ms = 0; + + data->pi_loc = (ns.dps & NVME_NS_DPS_PI_FIRST); + +check_elba: + /* + * Bit 4 for flbas indicates if metadata is transferred at the end of + * logical block creating an extended LBA. + */ + if (data->ms && ((ns.flbas >> 4) & 0x1)) + data->lba_ext = data->lba_size + data->ms; + else + data->lba_shift = ilog2(data->lba_size); + + *nlba = ns.nsze; + +out: + close(fd); + return err; +} + +int fio_nvme_get_zoned_model(struct thread_data *td, struct fio_file *f, + enum zbd_zoned_model *model) +{ + struct nvme_data *data = FILE_ENG_DATA(f); + struct nvme_id_ns ns; + struct nvme_passthru_cmd cmd; + int fd, ret = 0; + + if (f->filetype != FIO_TYPE_CHAR) + return -EINVAL; + + /* File is not yet opened */ + fd = open(f->file_name, O_RDONLY | O_LARGEFILE); + if (fd < 0) + return -errno; + + /* Using nvme_id_ns for data as sizes are same */ + ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_CTRL, + NVME_CSI_ZNS, &ns); + if (ret) { + *model = ZBD_NONE; + goto out; + } + + memset(&cmd, 0, sizeof(struct nvme_passthru_cmd)); + + /* Using nvme_id_ns for data as sizes are same */ + ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS, + NVME_CSI_ZNS, &ns); + if (ret) { + *model = ZBD_NONE; + goto out; + } + + *model = ZBD_HOST_MANAGED; +out: + close(fd); + return 0; +} + +static int nvme_report_zones(int fd, __u32 nsid, __u64 slba, __u32 zras_feat, + __u32 data_len, void *data) +{ + struct nvme_passthru_cmd cmd = { + .opcode = nvme_zns_cmd_mgmt_recv, + .nsid = nsid, + .addr = (__u64)(uintptr_t)data, + .data_len = data_len, + .cdw10 = slba & 0xffffffff, + .cdw11 = slba >> 32, + .cdw12 = (data_len >> 2) - 1, + .cdw13 = NVME_ZNS_ZRA_REPORT_ZONES 
| zras_feat, + .timeout_ms = NVME_DEFAULT_IOCTL_TIMEOUT, + }; + + return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd); +} + +int fio_nvme_report_zones(struct thread_data *td, struct fio_file *f, + uint64_t offset, struct zbd_zone *zbdz, + unsigned int nr_zones) +{ + struct nvme_data *data = FILE_ENG_DATA(f); + struct nvme_zone_report *zr; + struct nvme_zns_id_ns zns_ns; + struct nvme_id_ns ns; + unsigned int i = 0, j, zones_fetched = 0; + unsigned int max_zones, zones_chunks = 1024; + int fd, ret = 0; + __u32 zr_len; + __u64 zlen; + + /* File is not yet opened */ + fd = open(f->file_name, O_RDONLY | O_LARGEFILE); + if (fd < 0) + return -errno; + + zones_fetched = 0; + zr_len = sizeof(*zr) + (zones_chunks * sizeof(struct nvme_zns_desc)); + zr = calloc(1, zr_len); + if (!zr) { + close(fd); + return -ENOMEM; + } + + ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_NS, + NVME_CSI_NVM, &ns); + if (ret) { + log_err("%s: nvme_identify_ns failed, err=%d\n", f->file_name, + ret); + goto out; + } + + ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS, + NVME_CSI_ZNS, &zns_ns); + if (ret) { + log_err("%s: nvme_zns_identify_ns failed, err=%d\n", + f->file_name, ret); + goto out; + } + zlen = zns_ns.lbafe[ns.flbas & 0x0f].zsze << data->lba_shift; + + max_zones = (f->real_file_size - offset) / zlen; + if (max_zones < nr_zones) + nr_zones = max_zones; + + if (nr_zones < zones_chunks) + zones_chunks = nr_zones; + + while (zones_fetched < nr_zones) { + if (zones_fetched + zones_chunks >= nr_zones) { + zones_chunks = nr_zones - zones_fetched; + zr_len = sizeof(*zr) + (zones_chunks * sizeof(struct nvme_zns_desc)); + } + ret = nvme_report_zones(fd, data->nsid, offset >> data->lba_shift, + NVME_ZNS_ZRAS_FEAT_ERZ, zr_len, (void *)zr); + if (ret) { + log_err("%s: nvme_zns_report_zones failed, err=%d\n", + f->file_name, ret); + goto out; + } + + /* Transform the zone-report */ + for (j = 0; j < zr->nr_zones; j++, i++) { + struct nvme_zns_desc *desc = (struct nvme_zns_desc 
*)&(zr->entries[j]); + + zbdz[i].start = desc->zslba << data->lba_shift; + zbdz[i].len = zlen; + zbdz[i].wp = desc->wp << data->lba_shift; + zbdz[i].capacity = desc->zcap << data->lba_shift; + + /* Zone Type is stored in first 4 bits. */ + switch (desc->zt & 0x0f) { + case NVME_ZONE_TYPE_SEQWRITE_REQ: + zbdz[i].type = ZBD_ZONE_TYPE_SWR; + break; + default: + log_err("%s: invalid type for zone at offset %llu.\n", + f->file_name, (unsigned long long) desc->zslba); + ret = -EIO; + goto out; + } + + /* Zone State is stored in last 4 bits. */ + switch (desc->zs >> 4) { + case NVME_ZNS_ZS_EMPTY: + zbdz[i].cond = ZBD_ZONE_COND_EMPTY; + break; + case NVME_ZNS_ZS_IMPL_OPEN: + zbdz[i].cond = ZBD_ZONE_COND_IMP_OPEN; + break; + case NVME_ZNS_ZS_EXPL_OPEN: + zbdz[i].cond = ZBD_ZONE_COND_EXP_OPEN; + break; + case NVME_ZNS_ZS_CLOSED: + zbdz[i].cond = ZBD_ZONE_COND_CLOSED; + break; + case NVME_ZNS_ZS_FULL: + zbdz[i].cond = ZBD_ZONE_COND_FULL; + break; + case NVME_ZNS_ZS_READ_ONLY: + case NVME_ZNS_ZS_OFFLINE: + default: + /* Treat all these conditions as offline (don't use!) */ + zbdz[i].cond = ZBD_ZONE_COND_OFFLINE; + zbdz[i].wp = zbdz[i].start; + } + } + zones_fetched += zr->nr_zones; + offset += zr->nr_zones * zlen; + } + + ret = zones_fetched; +out: + free(zr); + close(fd); + + return ret; +} + +int fio_nvme_reset_wp(struct thread_data *td, struct fio_file *f, + uint64_t offset, uint64_t length) +{ + struct nvme_data *data = FILE_ENG_DATA(f); + unsigned int nr_zones; + unsigned long long zslba; + int i, fd, ret = 0; + + /* If the file is not yet opened, open it for this function. 
*/ + fd = f->fd; + if (fd < 0) { + fd = open(f->file_name, O_RDWR | O_LARGEFILE); + if (fd < 0) + return -errno; + } + + zslba = offset >> data->lba_shift; + nr_zones = (length + td->o.zone_size - 1) / td->o.zone_size; + + for (i = 0; i < nr_zones; i++, zslba += (td->o.zone_size >> data->lba_shift)) { + struct nvme_passthru_cmd cmd = { + .opcode = nvme_zns_cmd_mgmt_send, + .nsid = data->nsid, + .cdw10 = zslba & 0xffffffff, + .cdw11 = zslba >> 32, + .cdw13 = NVME_ZNS_ZSA_RESET, + .addr = (__u64)(uintptr_t)NULL, + .data_len = 0, + .timeout_ms = NVME_DEFAULT_IOCTL_TIMEOUT, + }; + + ret = ioctl(fd, NVME_IOCTL_IO_CMD, &cmd); + } + + if (f->fd < 0) + close(fd); + return -ret; +} + +int fio_nvme_get_max_open_zones(struct thread_data *td, struct fio_file *f, + unsigned int *max_open_zones) +{ + struct nvme_data *data = FILE_ENG_DATA(f); + struct nvme_zns_id_ns zns_ns; + int fd, ret = 0; + + fd = open(f->file_name, O_RDONLY | O_LARGEFILE); + if (fd < 0) + return -errno; + + ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS, + NVME_CSI_ZNS, &zns_ns); + if (ret) { + log_err("%s: nvme_zns_identify_ns failed, err=%d\n", + f->file_name, ret); + goto out; + } + + *max_open_zones = zns_ns.mor + 1; +out: + close(fd); + return ret; +} + +static inline int nvme_fdp_reclaim_unit_handle_status(int fd, __u32 nsid, + __u32 data_len, void *data) +{ + struct nvme_passthru_cmd cmd = { + .opcode = nvme_cmd_io_mgmt_recv, + .nsid = nsid, + .addr = (__u64)(uintptr_t)data, + .data_len = data_len, + .cdw10 = 1, + .cdw11 = (data_len >> 2) - 1, + }; + + return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd); +} + +int fio_nvme_iomgmt_ruhs(struct thread_data *td, struct fio_file *f, + struct nvme_fdp_ruh_status *ruhs, __u32 bytes) +{ + struct nvme_data *data = FILE_ENG_DATA(f); + int fd, ret; + + fd = open(f->file_name, O_RDONLY | O_LARGEFILE); + if (fd < 0) + return -errno; + + ret = nvme_fdp_reclaim_unit_handle_status(fd, data->nsid, bytes, ruhs); + if (ret) { + log_err("%s: 
nvme_fdp_reclaim_unit_handle_status failed, err=%d\n", + f->file_name, ret); + errno = ENOTSUP; + } else + errno = 0; + + ret = -errno; + close(fd); + return ret; +} diff --git a/engines/nvme.h b/engines/nvme.h new file mode 100644 index 0000000000..4371eb5b3b --- /dev/null +++ b/engines/nvme.h @@ -0,0 +1,497 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * nvme structure declarations and helper functions for the + * io_uring_cmd engine. + */ + +#ifndef FIO_NVME_H +#define FIO_NVME_H + +#include +#include "../fio.h" + +/* + * If the uapi headers installed on the system lacks nvme uring command + * support, use the local version to prevent compilation issues. + */ +#ifndef CONFIG_NVME_URING_CMD +struct nvme_uring_cmd { + __u8 opcode; + __u8 flags; + __u16 rsvd1; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u32 cdw10; + __u32 cdw11; + __u32 cdw12; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u32 timeout_ms; + __u32 rsvd2; +}; + +#define NVME_URING_CMD_IO _IOWR('N', 0x80, struct nvme_uring_cmd) +#define NVME_URING_CMD_IO_VEC _IOWR('N', 0x81, struct nvme_uring_cmd) +#endif /* CONFIG_NVME_URING_CMD */ + +#define NVME_DEFAULT_IOCTL_TIMEOUT 0 +#define NVME_IDENTIFY_DATA_SIZE 4096 +#define NVME_IDENTIFY_CSI_SHIFT 24 +#define NVME_NQN_LENGTH 256 + +#define NVME_PI_APP_DISABLE 0xFFFF +#define NVME_PI_REF_DISABLE 0xFFFFFFFF + +#define NVME_ZNS_ZRA_REPORT_ZONES 0 +#define NVME_ZNS_ZRAS_FEAT_ERZ (1 << 16) +#define NVME_ZNS_ZSA_RESET 0x4 +#define NVME_ZONE_TYPE_SEQWRITE_REQ 0x2 + +#define NVME_ATTRIBUTE_DEALLOCATE (1 << 2) + +enum nvme_identify_cns { + NVME_IDENTIFY_CNS_NS = 0x00, + NVME_IDENTIFY_CNS_CTRL = 0x01, + NVME_IDENTIFY_CNS_CSI_NS = 0x05, + NVME_IDENTIFY_CNS_CSI_CTRL = 0x06, +}; + +enum nvme_csi { + NVME_CSI_NVM = 0, + NVME_CSI_KV = 1, + NVME_CSI_ZNS = 2, +}; + +enum nvme_admin_opcode { + nvme_admin_identify = 0x06, +}; + +enum nvme_io_opcode { + nvme_cmd_flush = 0x00, + nvme_cmd_write = 0x01, + 
nvme_cmd_read = 0x02, + nvme_cmd_write_uncor = 0x04, + nvme_cmd_compare = 0x05, + nvme_cmd_write_zeroes = 0x08, + nvme_cmd_dsm = 0x09, + nvme_cmd_verify = 0x0c, + nvme_cmd_io_mgmt_recv = 0x12, + nvme_zns_cmd_mgmt_send = 0x79, + nvme_zns_cmd_mgmt_recv = 0x7a, +}; + +enum nvme_zns_zs { + NVME_ZNS_ZS_EMPTY = 0x1, + NVME_ZNS_ZS_IMPL_OPEN = 0x2, + NVME_ZNS_ZS_EXPL_OPEN = 0x3, + NVME_ZNS_ZS_CLOSED = 0x4, + NVME_ZNS_ZS_READ_ONLY = 0xd, + NVME_ZNS_ZS_FULL = 0xe, + NVME_ZNS_ZS_OFFLINE = 0xf, +}; + +enum nvme_id_ctrl_ctratt { + NVME_CTRL_CTRATT_ELBAS = 1 << 15, +}; + +enum { + NVME_ID_NS_NVM_STS_MASK = 0x7f, + NVME_ID_NS_NVM_GUARD_SHIFT = 7, + NVME_ID_NS_NVM_GUARD_MASK = 0x3, +}; + +enum { + NVME_NVM_NS_16B_GUARD = 0, + NVME_NVM_NS_32B_GUARD = 1, + NVME_NVM_NS_64B_GUARD = 2, +}; + +struct nvme_data { + __u32 nsid; + __u32 lba_shift; + __u32 lba_size; + __u32 lba_ext; + __u16 ms; + __u16 pi_size; + __u8 pi_type; + __u8 guard_type; + __u8 pi_loc; +}; + +enum nvme_id_ns_dps { + NVME_NS_DPS_PI_NONE = 0, + NVME_NS_DPS_PI_TYPE1 = 1, + NVME_NS_DPS_PI_TYPE2 = 2, + NVME_NS_DPS_PI_TYPE3 = 3, + NVME_NS_DPS_PI_MASK = 7 << 0, + NVME_NS_DPS_PI_FIRST = 1 << 3, +}; + +enum nvme_io_control_flags { + NVME_IO_PRINFO_PRCHK_REF = 1U << 26, + NVME_IO_PRINFO_PRCHK_APP = 1U << 27, + NVME_IO_PRINFO_PRCHK_GUARD = 1U << 28, + NVME_IO_PRINFO_PRACT = 1U << 29, +}; + +struct nvme_pi_data { + __u32 interval; + __u32 io_flags; + __u16 apptag; + __u16 apptag_mask; +}; + +struct nvme_lbaf { + __le16 ms; + __u8 ds; + __u8 rp; +}; + +/* 16 bit guard protection Information format */ +struct nvme_16b_guard_pif { + __be16 guard; + __be16 apptag; + __be32 srtag; +}; + +/* 64 bit guard protection Information format */ +struct nvme_64b_guard_pif { + __be64 guard; + __be16 apptag; + __u8 srtag[6]; +}; + +struct nvme_id_ns { + __le64 nsze; + __le64 ncap; + __le64 nuse; + __u8 nsfeat; + __u8 nlbaf; + __u8 flbas; + __u8 mc; + __u8 dpc; + __u8 dps; + __u8 nmic; + __u8 rescap; + __u8 fpi; + __u8 dlfeat; + __le16 nawun; + 
__le16 nawupf; + __le16 nacwu; + __le16 nabsn; + __le16 nabo; + __le16 nabspf; + __le16 noiob; + __u8 nvmcap[16]; + __le16 npwg; + __le16 npwa; + __le16 npdg; + __le16 npda; + __le16 nows; + __le16 mssrl; + __le32 mcl; + __u8 msrc; + __u8 rsvd81[11]; + __le32 anagrpid; + __u8 rsvd96[3]; + __u8 nsattr; + __le16 nvmsetid; + __le16 endgid; + __u8 nguid[16]; + __u8 eui64[8]; + struct nvme_lbaf lbaf[64]; + __u8 vs[3712]; +}; + +struct nvme_id_psd { + __le16 mp; + __u8 rsvd2; + __u8 flags; + __le32 enlat; + __le32 exlat; + __u8 rrt; + __u8 rrl; + __u8 rwt; + __u8 rwl; + __le16 idlp; + __u8 ips; + __u8 rsvd19; + __le16 actp; + __u8 apws; + __u8 rsvd23[9]; +}; + +struct nvme_id_ctrl { + __le16 vid; + __le16 ssvid; + char sn[20]; + char mn[40]; + char fr[8]; + __u8 rab; + __u8 ieee[3]; + __u8 cmic; + __u8 mdts; + __le16 cntlid; + __le32 ver; + __le32 rtd3r; + __le32 rtd3e; + __le32 oaes; + __le32 ctratt; + __le16 rrls; + __u8 rsvd102[9]; + __u8 cntrltype; + __u8 fguid[16]; + __le16 crdt1; + __le16 crdt2; + __le16 crdt3; + __u8 rsvd134[119]; + __u8 nvmsr; + __u8 vwci; + __u8 mec; + __le16 oacs; + __u8 acl; + __u8 aerl; + __u8 frmw; + __u8 lpa; + __u8 elpe; + __u8 npss; + __u8 avscc; + __u8 apsta; + __le16 wctemp; + __le16 cctemp; + __le16 mtfa; + __le32 hmpre; + __le32 hmmin; + __u8 tnvmcap[16]; + __u8 unvmcap[16]; + __le32 rpmbs; + __le16 edstt; + __u8 dsto; + __u8 fwug; + __le16 kas; + __le16 hctma; + __le16 mntmt; + __le16 mxtmt; + __le32 sanicap; + __le32 hmminds; + __le16 hmmaxd; + __le16 nsetidmax; + __le16 endgidmax; + __u8 anatt; + __u8 anacap; + __le32 anagrpmax; + __le32 nanagrpid; + __le32 pels; + __le16 domainid; + __u8 rsvd358[10]; + __u8 megcap[16]; + __u8 rsvd384[128]; + __u8 sqes; + __u8 cqes; + __le16 maxcmd; + __le32 nn; + __le16 oncs; + __le16 fuses; + __u8 fna; + __u8 vwc; + __le16 awun; + __le16 awupf; + __u8 icsvscc; + __u8 nwpc; + __le16 acwu; + __le16 ocfs; + __le32 sgls; + __le32 mnan; + __u8 maxdna[16]; + __le32 maxcna; + __u8 rsvd564[204]; + char 
subnqn[NVME_NQN_LENGTH]; + __u8 rsvd1024[768]; + + /* Fabrics Only */ + __le32 ioccsz; + __le32 iorcsz; + __le16 icdoff; + __u8 fcatt; + __u8 msdbd; + __le16 ofcs; + __u8 dctype; + __u8 rsvd1807[241]; + + struct nvme_id_psd psd[32]; + __u8 vs[1024]; +}; + +struct nvme_nvm_id_ns { + __le64 lbstm; + __u8 pic; + __u8 rsvd9[3]; + __le32 elbaf[64]; + __u8 rsvd268[3828]; +}; + +static inline int ilog2(uint32_t i) +{ + int log = -1; + + while (i) { + i >>= 1; + log++; + } + return log; +} + +struct nvme_zns_lbafe { + __le64 zsze; + __u8 zdes; + __u8 rsvd9[7]; +}; + +struct nvme_zns_id_ns { + __le16 zoc; + __le16 ozcs; + __le32 mar; + __le32 mor; + __le32 rrl; + __le32 frl; + __le32 rrl1; + __le32 rrl2; + __le32 rrl3; + __le32 frl1; + __le32 frl2; + __le32 frl3; + __le32 numzrwa; + __le16 zrwafg; + __le16 zrwasz; + __u8 zrwacap; + __u8 rsvd53[2763]; + struct nvme_zns_lbafe lbafe[64]; + __u8 vs[256]; +}; + +struct nvme_zns_desc { + __u8 zt; + __u8 zs; + __u8 za; + __u8 zai; + __u8 rsvd4[4]; + __le64 zcap; + __le64 zslba; + __le64 wp; + __u8 rsvd32[32]; +}; + +struct nvme_zone_report { + __le64 nr_zones; + __u8 rsvd8[56]; + struct nvme_zns_desc entries[]; +}; + +struct nvme_fdp_ruh_status_desc { + __u16 pid; + __u16 ruhid; + __u32 earutr; + __u64 ruamw; + __u8 rsvd16[16]; +}; + +struct nvme_fdp_ruh_status { + __u8 rsvd0[14]; + __le16 nruhsd; + struct nvme_fdp_ruh_status_desc ruhss[]; +}; + +struct nvme_dsm_range { + __le32 cattr; + __le32 nlb; + __le64 slba; +}; + +struct nvme_dsm { + __u32 nr_ranges; + struct nvme_dsm_range range[]; +}; + +struct nvme_cmd_ext_io_opts { + __u32 io_flags; + __u16 apptag; + __u16 apptag_mask; +}; + +int fio_nvme_iomgmt_ruhs(struct thread_data *td, struct fio_file *f, + struct nvme_fdp_ruh_status *ruhs, __u32 bytes); + +int fio_nvme_get_info(struct fio_file *f, __u64 *nlba, __u32 pi_act, + struct nvme_data *data); + +int fio_nvme_uring_cmd_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u, + struct iovec *iov, struct nvme_dsm *dsm, + uint8_t 
read_opcode, uint8_t write_opcode, + unsigned int cdw12_flags); + +void fio_nvme_pi_fill(struct nvme_uring_cmd *cmd, struct io_u *io_u, + struct nvme_cmd_ext_io_opts *opts); + +void fio_nvme_generate_guard(struct io_u *io_u, struct nvme_cmd_ext_io_opts *opts); + +int fio_nvme_pi_verify(struct nvme_data *data, struct io_u *io_u); + +int fio_nvme_get_zoned_model(struct thread_data *td, struct fio_file *f, + enum zbd_zoned_model *model); + +int fio_nvme_report_zones(struct thread_data *td, struct fio_file *f, + uint64_t offset, struct zbd_zone *zbdz, + unsigned int nr_zones); + +int fio_nvme_reset_wp(struct thread_data *td, struct fio_file *f, + uint64_t offset, uint64_t length); + +int fio_nvme_get_max_open_zones(struct thread_data *td, struct fio_file *f, + unsigned int *max_open_zones); + +static inline void put_unaligned_be48(__u64 val, __u8 *p) +{ + *p++ = val >> 40; + *p++ = val >> 32; + *p++ = val >> 24; + *p++ = val >> 16; + *p++ = val >> 8; + *p++ = val; +} + +static inline __u64 get_unaligned_be48(__u8 *p) +{ + return (__u64)p[0] << 40 | (__u64)p[1] << 32 | (__u64)p[2] << 24 | + p[3] << 16 | p[4] << 8 | p[5]; +} + +static inline bool fio_nvme_pi_ref_escape(__u8 *reftag) +{ + __u8 ref_esc[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + + return memcmp(reftag, ref_esc, sizeof(ref_esc)) == 0; +} + +static inline __u64 get_slba(struct nvme_data *data, __u64 offset) +{ + if (data->lba_ext) + return offset / data->lba_ext; + + return offset >> data->lba_shift; +} + +static inline __u32 get_nlb(struct nvme_data *data, __u64 len) +{ + if (data->lba_ext) + return len / data->lba_ext - 1; + + return (len >> data->lba_shift) - 1; +} + +#endif diff --git a/engines/pmemblk.c b/engines/pmemblk.c deleted file mode 100644 index fc6358e8e1..0000000000 --- a/engines/pmemblk.c +++ /dev/null @@ -1,448 +0,0 @@ -/* - * pmemblk: IO engine that uses PMDK libpmemblk to read and write data - * - * Copyright (C) 2016 Hewlett Packard Enterprise Development LP - * - * This program is free 
software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License, - * version 2 as published by the Free Software Foundation.. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the Free - * Software Foundation, Inc., 51 Franklin Street, Fifth Floor, - * Boston, MA 02110-1301, USA. - */ - -/* - * pmemblk engine - * - * IO engine that uses libpmemblk to read and write data - * - * To use: - * ioengine=pmemblk - * - * Other relevant settings: - * thread=1 REQUIRED - * iodepth=1 - * direct=1 - * unlink=1 - * filename=/mnt/pmem0/fiotestfile,BSIZE,FSIZEMiB - * - * thread must be set to 1 for pmemblk as multiple processes cannot - * open the same block pool file. - * - * iodepth should be set to 1 as pmemblk is always synchronous. - * Use numjobs to scale up. - * - * direct=1 is implied as pmemblk is always direct. A warning message - * is printed if this is not specified. - * - * unlink=1 removes the block pool file after testing, and is optional. - * - * The pmem device must have a DAX-capable filesystem and be mounted - * with DAX enabled. filename must point to a file on that filesystem. - * - * Example: - * mkfs.xfs /dev/pmem0 - * mkdir /mnt/pmem0 - * mount -o dax /dev/pmem0 /mnt/pmem0 - * - * When specifying the filename, if the block pool file does not already - * exist, then the pmemblk engine creates the pool file if you specify - * the block and file sizes. BSIZE is the block size in bytes. - * FSIZEMB is the pool file size in MiB. - * - * See examples/pmemblk.fio for more. 
- * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../fio.h" - -/* - * libpmemblk - */ -typedef struct fio_pmemblk_file *fio_pmemblk_file_t; - -struct fio_pmemblk_file { - fio_pmemblk_file_t pmb_next; - char *pmb_filename; - uint64_t pmb_refcnt; - PMEMblkpool *pmb_pool; - size_t pmb_bsize; - size_t pmb_nblocks; -}; - -static fio_pmemblk_file_t Cache; - -static pthread_mutex_t CacheLock = PTHREAD_MUTEX_INITIALIZER; - -#define PMB_CREATE (0x0001) /* should create file */ - -fio_pmemblk_file_t fio_pmemblk_cache_lookup(const char *filename) -{ - fio_pmemblk_file_t i; - - for (i = Cache; i != NULL; i = i->pmb_next) - if (!strcmp(filename, i->pmb_filename)) - return i; - - return NULL; -} - -static void fio_pmemblk_cache_insert(fio_pmemblk_file_t pmb) -{ - pmb->pmb_next = Cache; - Cache = pmb; -} - -static void fio_pmemblk_cache_remove(fio_pmemblk_file_t pmb) -{ - fio_pmemblk_file_t i; - - if (pmb == Cache) { - Cache = Cache->pmb_next; - pmb->pmb_next = NULL; - return; - } - - for (i = Cache; i != NULL; i = i->pmb_next) - if (pmb == i->pmb_next) { - i->pmb_next = i->pmb_next->pmb_next; - pmb->pmb_next = NULL; - return; - } -} - -/* - * to control block size and gross file size at the libpmemblk - * level, we allow the block size and file size to be appended - * to the file name: - * - * path[,bsize,fsizemib] - * - * note that we do not use the fio option "filesize" to dictate - * the file size because we can only give libpmemblk the gross - * file size, which is different from the net or usable file - * size (which is probably what fio wants). - * - * the final path without the parameters is returned in ppath. - * the block size and file size are returned in pbsize and fsize. - * - * note that the user specifies the file size in MiB, but - * we return bytes from here. 
- */ -static void pmb_parse_path(const char *pathspec, char **ppath, uint64_t *pbsize, - uint64_t *pfsize) -{ - char *path; - char *s; - uint64_t bsize; - uint64_t fsizemib; - - path = strdup(pathspec); - if (!path) { - *ppath = NULL; - return; - } - - /* extract sizes, if given */ - s = strrchr(path, ','); - if (s && (fsizemib = strtoull(s + 1, NULL, 10))) { - *s = 0; - s = strrchr(path, ','); - if (s && (bsize = strtoull(s + 1, NULL, 10))) { - *s = 0; - *ppath = path; - *pbsize = bsize; - *pfsize = fsizemib << 20; - return; - } - } - - /* size specs not found */ - strcpy(path, pathspec); - *ppath = path; - *pbsize = 0; - *pfsize = 0; -} - -static fio_pmemblk_file_t pmb_open(const char *pathspec, int flags) -{ - fio_pmemblk_file_t pmb; - char *path = NULL; - uint64_t bsize = 0; - uint64_t fsize = 0; - - pmb_parse_path(pathspec, &path, &bsize, &fsize); - if (!path) - return NULL; - - pthread_mutex_lock(&CacheLock); - - pmb = fio_pmemblk_cache_lookup(path); - if (!pmb) { - pmb = malloc(sizeof(*pmb)); - if (!pmb) - goto error; - - /* try opening existing first, create it if needed */ - pmb->pmb_pool = pmemblk_open(path, bsize); - if (!pmb->pmb_pool && (errno == ENOENT) && - (flags & PMB_CREATE) && (0 < fsize) && (0 < bsize)) { - pmb->pmb_pool = - pmemblk_create(path, bsize, fsize, 0644); - } - if (!pmb->pmb_pool) { - log_err("pmemblk: unable to open pmemblk pool file %s (%s)\n", - path, strerror(errno)); - goto error; - } - - pmb->pmb_filename = path; - pmb->pmb_next = NULL; - pmb->pmb_refcnt = 0; - pmb->pmb_bsize = pmemblk_bsize(pmb->pmb_pool); - pmb->pmb_nblocks = pmemblk_nblock(pmb->pmb_pool); - - fio_pmemblk_cache_insert(pmb); - } else { - free(path); - } - - pmb->pmb_refcnt += 1; - - pthread_mutex_unlock(&CacheLock); - - return pmb; - -error: - if (pmb) { - if (pmb->pmb_pool) - pmemblk_close(pmb->pmb_pool); - pmb->pmb_pool = NULL; - pmb->pmb_filename = NULL; - free(pmb); - } - if (path) - free(path); - - pthread_mutex_unlock(&CacheLock); - return NULL; -} - 
-static void pmb_close(fio_pmemblk_file_t pmb, const bool keep) -{ - pthread_mutex_lock(&CacheLock); - - pmb->pmb_refcnt--; - - if (!keep && !pmb->pmb_refcnt) { - pmemblk_close(pmb->pmb_pool); - pmb->pmb_pool = NULL; - free(pmb->pmb_filename); - pmb->pmb_filename = NULL; - fio_pmemblk_cache_remove(pmb); - free(pmb); - } - - pthread_mutex_unlock(&CacheLock); -} - -static int pmb_get_flags(struct thread_data *td, uint64_t *pflags) -{ - static int thread_warned = 0; - static int odirect_warned = 0; - - uint64_t flags = 0; - - if (!td->o.use_thread) { - if (!thread_warned) { - thread_warned = 1; - log_err("pmemblk: must set thread=1 for pmemblk engine\n"); - } - return 1; - } - - if (!td->o.odirect && !odirect_warned) { - odirect_warned = 1; - log_info("pmemblk: direct == 0, but pmemblk is always direct\n"); - } - - if (td->o.allow_create) - flags |= PMB_CREATE; - - (*pflags) = flags; - return 0; -} - -static int fio_pmemblk_open_file(struct thread_data *td, struct fio_file *f) -{ - uint64_t flags = 0; - fio_pmemblk_file_t pmb; - - if (pmb_get_flags(td, &flags)) - return 1; - - pmb = pmb_open(f->file_name, flags); - if (!pmb) - return 1; - - FILE_SET_ENG_DATA(f, pmb); - return 0; -} - -static int fio_pmemblk_close_file(struct thread_data fio_unused *td, - struct fio_file *f) -{ - fio_pmemblk_file_t pmb = FILE_ENG_DATA(f); - - if (pmb) - pmb_close(pmb, false); - - FILE_SET_ENG_DATA(f, NULL); - return 0; -} - -static int fio_pmemblk_get_file_size(struct thread_data *td, struct fio_file *f) -{ - uint64_t flags = 0; - fio_pmemblk_file_t pmb = FILE_ENG_DATA(f); - - if (fio_file_size_known(f)) - return 0; - - if (!pmb) { - if (pmb_get_flags(td, &flags)) - return 1; - pmb = pmb_open(f->file_name, flags); - if (!pmb) - return 1; - } - - f->real_file_size = pmb->pmb_bsize * pmb->pmb_nblocks; - - fio_file_set_size_known(f); - - if (!FILE_ENG_DATA(f)) - pmb_close(pmb, true); - - return 0; -} - -static enum fio_q_status fio_pmemblk_queue(struct thread_data *td, - struct io_u 
*io_u) -{ - struct fio_file *f = io_u->file; - fio_pmemblk_file_t pmb = FILE_ENG_DATA(f); - - unsigned long long off; - unsigned long len; - void *buf; - - fio_ro_check(td, io_u); - - switch (io_u->ddir) { - case DDIR_READ: - case DDIR_WRITE: - off = io_u->offset; - len = io_u->xfer_buflen; - - io_u->error = EINVAL; - if (off % pmb->pmb_bsize) - break; - if (len % pmb->pmb_bsize) - break; - if ((off + len) / pmb->pmb_bsize > pmb->pmb_nblocks) - break; - - io_u->error = 0; - buf = io_u->xfer_buf; - off /= pmb->pmb_bsize; - len /= pmb->pmb_bsize; - while (0 < len) { - if (io_u->ddir == DDIR_READ && - 0 != pmemblk_read(pmb->pmb_pool, buf, off)) { - io_u->error = errno; - break; - } else if (0 != pmemblk_write(pmb->pmb_pool, buf, off)) { - io_u->error = errno; - break; - } - buf += pmb->pmb_bsize; - off++; - len--; - } - off *= pmb->pmb_bsize; - len *= pmb->pmb_bsize; - io_u->resid = io_u->xfer_buflen - (off - io_u->offset); - break; - case DDIR_SYNC: - case DDIR_DATASYNC: - case DDIR_SYNC_FILE_RANGE: - /* we're always sync'd */ - io_u->error = 0; - break; - default: - io_u->error = EINVAL; - break; - } - - return FIO_Q_COMPLETED; -} - -static int fio_pmemblk_unlink_file(struct thread_data *td, struct fio_file *f) -{ - char *path = NULL; - uint64_t bsize = 0; - uint64_t fsize = 0; - - /* - * we need our own unlink in case the user has specified - * the block and file sizes in the path name. we parse - * the file_name to determine the file name we actually used. 
- */ - - pmb_parse_path(f->file_name, &path, &bsize, &fsize); - if (!path) - return ENOENT; - - unlink(path); - free(path); - return 0; -} - -FIO_STATIC struct ioengine_ops ioengine = { - .name = "pmemblk", - .version = FIO_IOOPS_VERSION, - .queue = fio_pmemblk_queue, - .open_file = fio_pmemblk_open_file, - .close_file = fio_pmemblk_close_file, - .get_file_size = fio_pmemblk_get_file_size, - .unlink_file = fio_pmemblk_unlink_file, - .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOEXTEND | FIO_NODISKUTIL, -}; - -static void fio_init fio_pmemblk_register(void) -{ - register_ioengine(&ioengine); -} - -static void fio_exit fio_pmemblk_unregister(void) -{ - unregister_ioengine(&ioengine); -} diff --git a/engines/posixaio.c b/engines/posixaio.c index 135d088c7a..2d0ac9fcc3 100644 --- a/engines/posixaio.c +++ b/engines/posixaio.c @@ -27,18 +27,6 @@ static unsigned long long ts_utime_since_now(const struct timespec *start) return utime_since(start, &now); } -static int fio_posixaio_cancel(struct thread_data fio_unused *td, - struct io_u *io_u) -{ - struct fio_file *f = io_u->file; - int r = aio_cancel(f->fd, &io_u->aiocb); - - if (r == AIO_ALLDONE || r == AIO_CANCELED) - return 0; - - return 1; -} - static int fio_posixaio_prep(struct thread_data fio_unused *td, struct io_u *io_u) { @@ -197,11 +185,9 @@ static void fio_posixaio_cleanup(struct thread_data *td) static int fio_posixaio_init(struct thread_data *td) { - struct posixaio_data *pd = malloc(sizeof(*pd)); - - memset(pd, 0, sizeof(*pd)); - pd->aio_events = malloc(td->o.iodepth * sizeof(struct io_u *)); - memset(pd->aio_events, 0, td->o.iodepth * sizeof(struct io_u *)); + struct posixaio_data *pd; + pd = calloc(1, sizeof(*pd)); + pd->aio_events = calloc(td->o.iodepth, sizeof(struct io_u *)); td->io_ops_data = pd; return 0; @@ -214,7 +200,6 @@ static struct ioengine_ops ioengine = { .init = fio_posixaio_init, .prep = fio_posixaio_prep, .queue = fio_posixaio_queue, - .cancel = fio_posixaio_cancel, .getevents = 
fio_posixaio_getevents, .event = fio_posixaio_event, .cleanup = fio_posixaio_cleanup, diff --git a/engines/rados.c b/engines/rados.c index 23e62c4c45..d0d15c5b54 100644 --- a/engines/rados.c +++ b/engines/rados.c @@ -37,6 +37,7 @@ struct rados_options { char *cluster_name; char *pool_name; char *client_name; + char *conf; int busy_poll; int touch_objects; }; @@ -69,6 +70,16 @@ static struct fio_option options[] = { .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_RBD, }, + { + .name = "conf", + .lname = "ceph configuration file path", + .type = FIO_OPT_STR_STORE, + .help = "Path of the ceph configuration file", + .off1 = offsetof(struct rados_options, conf), + .def = "/etc/ceph/ceph.conf", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RBD, + }, { .name = "busy_poll", .lname = "busy poll mode", @@ -151,7 +162,7 @@ static int _fio_rados_connect(struct thread_data *td) char *client_name = NULL; /* - * If we specify cluser name, the rados_create2 + * If we specify cluster name, the rados_create2 * will not assume 'client.'. 
name is considered * as a full type.id namestr */ @@ -184,7 +195,7 @@ static int _fio_rados_connect(struct thread_data *td) goto failed_early; } - r = rados_conf_read_file(rados->cluster, NULL); + r = rados_conf_read_file(rados->cluster, o->conf); if (r < 0) { log_err("rados_conf_read_file failed.\n"); goto failed_early; diff --git a/engines/rbd.c b/engines/rbd.c index c6203d4c2a..ab4b679b2b 100644 --- a/engines/rbd.c +++ b/engines/rbd.c @@ -40,6 +40,8 @@ struct rbd_options { char *pool_name; char *client_name; int busy_poll; + char *encryption_format; + char *encryption_passphrase; }; static struct fio_option options[] = { @@ -89,6 +91,24 @@ static struct fio_option options[] = { .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_RBD, }, + { + .name = "rbd_encryption_format", + .lname = "RBD Encryption Format", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct rbd_options, encryption_format), + .help = "RBD Encryption Format (luks1, luks2)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RBD, + }, + { + .name = "rbd_encryption_passphrase", + .lname = "RBD Encryption Passphrase", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct rbd_options, encryption_passphrase), + .help = "Passphrase for unlocking the RBD image", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_RBD, + }, { .name = NULL, }, @@ -134,6 +154,65 @@ static int _fio_setup_rbd_data(struct thread_data *td, } +#ifdef CONFIG_RBD_ENCRYPTION +static bool _fio_rbd_setup_encryption(struct rbd_data *rbd, struct rbd_options *options) +{ + rbd_encryption_format_t fmt; + void *opts_ptr = NULL; + size_t opts_size = 0; + int r; + + rbd_encryption_luks1_format_options_t luks1_opts; + rbd_encryption_luks2_format_options_t luks2_opts; + + if (!options->encryption_format) + return true; // No encryption requested + + if (!options->encryption_passphrase) { + log_err("rbd_encryption_passphrase is required when a rbd_encryption_format is specified.\n"); + return false; + } + + if 
(!strcmp(options->encryption_format, "luks2")) { + fmt = RBD_ENCRYPTION_FORMAT_LUKS2; + memset(&luks2_opts, 0, sizeof(luks2_opts)); + luks2_opts.passphrase = options->encryption_passphrase; + luks2_opts.passphrase_size = strlen(options->encryption_passphrase); + opts_ptr = &luks2_opts; + opts_size = sizeof(luks2_opts); + } else if (!strcmp(options->encryption_format, "luks1")) { + fmt = RBD_ENCRYPTION_FORMAT_LUKS1; + memset(&luks1_opts, 0, sizeof(luks1_opts)); + luks1_opts.passphrase = options->encryption_passphrase; + luks1_opts.passphrase_size = strlen(options->encryption_passphrase); + opts_ptr = &luks1_opts; + opts_size = sizeof(luks1_opts); + } else { + log_err("rbd_encryption_load failed. Unknown rbd_encryption_format: %s\n", options->encryption_format); + return false; + } + r = rbd_encryption_load(rbd->image, fmt, opts_ptr, opts_size); + if (r < 0) { + log_err("rbd_encryption_load failed.\n"); + return false; + } + return true; +} +#else +static bool _fio_rbd_setup_encryption(struct rbd_data *rbd, struct rbd_options *options) +{ + if (options->encryption_format) { + int major, minor, extra; + rbd_version(&major, &minor, &extra); + + log_err("rbd encryption requested but not supported by this librbd version (%d.%d.%d).\n", + major, minor, extra); + return false; + } + return true; +} +#endif + #ifdef CONFIG_RBD_POLL static bool _fio_rbd_setup_poll(struct rbd_data *rbd) { @@ -173,7 +252,7 @@ static int _fio_rbd_connect(struct thread_data *td) char *client_name = NULL; /* - * If we specify cluser name, the rados_create2 + * If we specify cluster name, the rados_create2 * will not assume 'client.'. 
name is considered * as a full type.id namestr */ @@ -251,12 +330,15 @@ static int _fio_rbd_connect(struct thread_data *td) } } + if (!_fio_rbd_setup_encryption(rbd, o)) + goto failed_post_open; + if (!_fio_rbd_setup_poll(rbd)) - goto failed_poll; + goto failed_post_open; return 0; -failed_poll: +failed_post_open: rbd_close(rbd->image); rbd->image = NULL; failed_open: @@ -633,7 +715,7 @@ static int fio_rbd_setup(struct thread_data *td) /* taken from "net" engine. Pretend we deal with files, * even if we do not have any ideas about files. - * The size of the RBD is set instead of a artificial file. + * The size of the RBD is set instead of an artificial file. */ if (!td->files_index) { add_file(td, td->o.filename ? : "rbd", 0, 0); diff --git a/engines/rdma.c b/engines/rdma.c index f447186981..07336f3b88 100644 --- a/engines/rdma.c +++ b/engines/rdma.c @@ -276,7 +276,6 @@ static int cq_event_handler(struct thread_data *td, enum ibv_wc_opcode opcode) int i; while ((ret = ibv_poll_cq(rd->cq, 1, &wc)) == 1) { - ret = 0; compevnum++; if (wc.status) { @@ -832,6 +831,12 @@ static void fio_rdmaio_queued(struct thread_data *td, struct io_u **io_us, memcpy(&io_u->issue_time, &now, sizeof(now)); io_u_queued(td, io_u); } + + /* + * only used for iolog + */ + if (td->o.read_iolog_file) + memcpy(&td->last_issue, &now, sizeof(now)); } static int fio_rdmaio_commit(struct thread_data *td) @@ -850,8 +855,6 @@ static int fio_rdmaio_commit(struct thread_data *td) ret = fio_rdmaio_send(td, io_us, rd->io_u_queued_nr); else if (!rd->is_client) ret = fio_rdmaio_recv(td, io_us, rd->io_u_queued_nr); - else - ret = 0; /* must be a SYNC */ if (ret > 0) { fio_rdmaio_queued(td, io_us, ret); @@ -1194,7 +1197,7 @@ static int check_set_rlimits(struct thread_data *td) static int compat_options(struct thread_data *td) { - // The original RDMA engine had an ugly / seperator + // The original RDMA engine had an ugly / separator // on the filename for it's options. 
This function // retains backwards compatibility with it. Note we do not // support setting the bindname option is this legacy mode. @@ -1290,23 +1293,18 @@ static int fio_rdmaio_init(struct thread_data *td) if ((rd->rdma_protocol == FIO_RDMA_MEM_WRITE) || (rd->rdma_protocol == FIO_RDMA_MEM_READ)) { - rd->rmt_us = - malloc(FIO_RDMA_MAX_IO_DEPTH * sizeof(struct remote_u)); - memset(rd->rmt_us, 0, - FIO_RDMA_MAX_IO_DEPTH * sizeof(struct remote_u)); + rd->rmt_us = calloc(FIO_RDMA_MAX_IO_DEPTH, + sizeof(struct remote_u)); rd->rmt_nr = 0; } - rd->io_us_queued = malloc(td->o.iodepth * sizeof(struct io_u *)); - memset(rd->io_us_queued, 0, td->o.iodepth * sizeof(struct io_u *)); + rd->io_us_queued = calloc(td->o.iodepth, sizeof(struct io_u *)); rd->io_u_queued_nr = 0; - rd->io_us_flight = malloc(td->o.iodepth * sizeof(struct io_u *)); - memset(rd->io_us_flight, 0, td->o.iodepth * sizeof(struct io_u *)); + rd->io_us_flight = calloc(td->o.iodepth, sizeof(struct io_u *)); rd->io_u_flight_nr = 0; - rd->io_us_completed = malloc(td->o.iodepth * sizeof(struct io_u *)); - memset(rd->io_us_completed, 0, td->o.iodepth * sizeof(struct io_u *)); + rd->io_us_completed = calloc(td->o.iodepth, sizeof(struct io_u *)); rd->io_u_completed_nr = 0; if (td_read(td)) { /* READ as the server */ @@ -1333,8 +1331,7 @@ static int fio_rdmaio_post_init(struct thread_data *td) for (i = 0; i < td->io_u_freelist.nr; i++) { struct io_u *io_u = td->io_u_freelist.io_us[i]; - io_u->engine_data = malloc(sizeof(struct rdma_io_u_data)); - memset(io_u->engine_data, 0, sizeof(struct rdma_io_u_data)); + io_u->engine_data = calloc(1, sizeof(struct rdma_io_u_data)); ((struct rdma_io_u_data *)io_u->engine_data)->wr_id = i; io_u->mr = ibv_reg_mr(rd->pd, io_u->buf, max_bs, @@ -1380,10 +1377,8 @@ static int fio_rdmaio_setup(struct thread_data *td) } if (!td->io_ops_data) { - rd = malloc(sizeof(*rd)); - - memset(rd, 0, sizeof(*rd)); - init_rand_seed(&rd->rand_state, (unsigned int) GOLDEN_RATIO_PRIME, 0); + rd = 
calloc(1, sizeof(*rd)); + init_rand_seed(&rd->rand_state, (unsigned int) GOLDEN_RATIO_64, 0); td->io_ops_data = rd; } @@ -1404,7 +1399,8 @@ FIO_STATIC struct ioengine_ops ioengine = { .cleanup = fio_rdmaio_cleanup, .open_file = fio_rdmaio_open_file, .close_file = fio_rdmaio_close_file, - .flags = FIO_DISKLESSIO | FIO_UNIDIR | FIO_PIPEIO, + .flags = FIO_DISKLESSIO | FIO_UNIDIR | FIO_PIPEIO | + FIO_ASYNCIO_SETS_ISSUE_TIME, .options = options, .option_struct_size = sizeof(struct rdmaio_options), }; diff --git a/engines/sg.c b/engines/sg.c index 1c0193840d..7912e9c893 100644 --- a/engines/sg.c +++ b/engines/sg.c @@ -66,8 +66,13 @@ enum { FIO_SG_WRITE = 1, - FIO_SG_WRITE_VERIFY = 2, - FIO_SG_WRITE_SAME = 3 + FIO_SG_WRITE_VERIFY, + FIO_SG_WRITE_SAME, + FIO_SG_WRITE_SAME_NDOB, + FIO_SG_WRITE_STREAM, + FIO_SG_VERIFY_BYTCHK_00, + FIO_SG_VERIFY_BYTCHK_01, + FIO_SG_VERIFY_BYTCHK_11, }; struct sg_options { @@ -76,6 +81,7 @@ struct sg_options { unsigned int readfua; unsigned int writefua; unsigned int write_mode; + uint16_t stream_id; }; static struct fio_option options[] = { @@ -120,18 +126,58 @@ static struct fio_option options[] = { .oval = FIO_SG_WRITE, .help = "Issue standard SCSI WRITE commands", }, - { .ival = "verify", + { .ival = "write_and_verify", .oval = FIO_SG_WRITE_VERIFY, .help = "Issue SCSI WRITE AND VERIFY commands", }, - { .ival = "same", + { .ival = "verify", + .oval = FIO_SG_WRITE_VERIFY, + .help = "Issue SCSI WRITE AND VERIFY commands. This " + "option is deprecated. Use write_and_verify instead.", + }, + { .ival = "write_same", .oval = FIO_SG_WRITE_SAME, .help = "Issue SCSI WRITE SAME commands", }, + { .ival = "same", + .oval = FIO_SG_WRITE_SAME, + .help = "Issue SCSI WRITE SAME commands. This " + "option is deprecated. 
Use write_same instead.", + }, + { .ival = "write_same_ndob", + .oval = FIO_SG_WRITE_SAME_NDOB, + .help = "Issue SCSI WRITE SAME(16) commands with NDOB flag set", + }, + { .ival = "verify_bytchk_00", + .oval = FIO_SG_VERIFY_BYTCHK_00, + .help = "Issue SCSI VERIFY commands with BYTCHK set to 00", + }, + { .ival = "verify_bytchk_01", + .oval = FIO_SG_VERIFY_BYTCHK_01, + .help = "Issue SCSI VERIFY commands with BYTCHK set to 01", + }, + { .ival = "verify_bytchk_11", + .oval = FIO_SG_VERIFY_BYTCHK_11, + .help = "Issue SCSI VERIFY commands with BYTCHK set to 11", + }, + { .ival = "write_stream", + .oval = FIO_SG_WRITE_STREAM, + .help = "Issue SCSI WRITE STREAM(16) commands", + }, }, .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_SG, }, + { + .name = "stream_id", + .lname = "stream id for WRITE STREAM(16) commands", + .type = FIO_OPT_INT, + .off1 = offsetof(struct sg_options, stream_id), + .help = "Stream ID for WRITE STREAM(16) commands", + .def = "0", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_SG, + }, { .name = NULL, }, @@ -171,6 +217,11 @@ struct sgio_data { #endif }; +static inline uint16_t sgio_get_be16(uint8_t *buf) +{ + return be16_to_cpu(*((uint16_t *) buf)); +} + static inline uint32_t sgio_get_be32(uint8_t *buf) { return be32_to_cpu(*((uint32_t *) buf)); @@ -502,9 +553,9 @@ static enum fio_q_status fio_sgio_doio(struct thread_data *td, } static void fio_sgio_rw_lba(struct sg_io_hdr *hdr, unsigned long long lba, - unsigned long long nr_blocks) + unsigned long long nr_blocks, bool override16) { - if (lba < MAX_10B_LBA) { + if (lba < MAX_10B_LBA && !override16) { sgio_set_be32((uint32_t) lba, &hdr->cmdp[2]); sgio_set_be16((uint16_t) nr_blocks, &hdr->cmdp[7]); } else { @@ -545,7 +596,7 @@ static int fio_sgio_prep(struct thread_data *td, struct io_u *io_u) if (o->readfua) hdr->cmdp[1] |= 0x08; - fio_sgio_rw_lba(hdr, lba, nr_blocks); + fio_sgio_rw_lba(hdr, lba, nr_blocks, false); } else if (io_u->ddir == DDIR_WRITE) { sgio_hdr_init(sd, hdr, io_u, 1); @@ 
-576,9 +627,46 @@ static int fio_sgio_prep(struct thread_data *td, struct io_u *io_u) else hdr->cmdp[0] = 0x93; // write same(16) break; + case FIO_SG_WRITE_SAME_NDOB: + hdr->cmdp[0] = 0x93; // write same(16) + hdr->cmdp[1] |= 0x1; // no data output buffer + hdr->dxfer_len = 0; + break; + case FIO_SG_WRITE_STREAM: + hdr->cmdp[0] = 0x9a; // write stream (16) + if (o->writefua) + hdr->cmdp[1] |= 0x08; + sgio_set_be64(lba, &hdr->cmdp[2]); + sgio_set_be16((uint16_t) io_u->file->engine_pos, &hdr->cmdp[10]); + sgio_set_be16((uint16_t) nr_blocks, &hdr->cmdp[12]); + break; + case FIO_SG_VERIFY_BYTCHK_00: + if (lba < MAX_10B_LBA) + hdr->cmdp[0] = 0x2f; // VERIFY(10) + else + hdr->cmdp[0] = 0x8f; // VERIFY(16) + hdr->dxfer_len = 0; + break; + case FIO_SG_VERIFY_BYTCHK_01: + if (lba < MAX_10B_LBA) + hdr->cmdp[0] = 0x2f; // VERIFY(10) + else + hdr->cmdp[0] = 0x8f; // VERIFY(16) + hdr->cmdp[1] |= 0x02; // BYTCHK = 01b + break; + case FIO_SG_VERIFY_BYTCHK_11: + if (lba < MAX_10B_LBA) + hdr->cmdp[0] = 0x2f; // VERIFY(10) + else + hdr->cmdp[0] = 0x8f; // VERIFY(16) + hdr->cmdp[1] |= 0x06; // BYTCHK = 11b + hdr->dxfer_len = sd->bs; + break; }; - fio_sgio_rw_lba(hdr, lba, nr_blocks); + if (o->write_mode != FIO_SG_WRITE_STREAM) + fio_sgio_rw_lba(hdr, lba, nr_blocks, + o->write_mode == FIO_SG_WRITE_SAME_NDOB); } else if (io_u->ddir == DDIR_TRIM) { struct sgio_trim *st; @@ -970,9 +1058,60 @@ static int fio_sgio_type_check(struct thread_data *td, struct fio_file *f) return 0; } +static int fio_sgio_stream_control(struct fio_file *f, bool open_stream, uint16_t *stream_id) +{ + struct sg_io_hdr hdr; + unsigned char cmd[16]; + unsigned char sb[64]; + unsigned char buf[8]; + int ret; + + memset(&hdr, 0, sizeof(hdr)); + memset(cmd, 0, sizeof(cmd)); + memset(sb, 0, sizeof(sb)); + memset(buf, 0, sizeof(buf)); + + hdr.interface_id = 'S'; + hdr.cmdp = cmd; + hdr.cmd_len = 16; + hdr.sbp = sb; + hdr.mx_sb_len = sizeof(sb); + hdr.timeout = SCSI_TIMEOUT_MS; + hdr.cmdp[0] = 0x9e; + 
hdr.dxfer_direction = SG_DXFER_FROM_DEV; + hdr.dxferp = buf; + hdr.dxfer_len = sizeof(buf); + sgio_set_be32(sizeof(buf), &hdr.cmdp[10]); + + if (open_stream) + hdr.cmdp[1] = 0x34; + else { + hdr.cmdp[1] = 0x54; + sgio_set_be16(*stream_id, &hdr.cmdp[4]); + } + + ret = ioctl(f->fd, SG_IO, &hdr); + + if (ret < 0) + return ret; + + if (hdr.info & SG_INFO_CHECK) + return 1; + + if (open_stream) { + *stream_id = sgio_get_be16(&buf[4]); + dprint(FD_FILE, "sgio_stream_control: opened stream %u\n", (unsigned int) *stream_id); + assert(*stream_id != 0); + } else + dprint(FD_FILE, "sgio_stream_control: closed stream %u\n", (unsigned int) *stream_id); + + return 0; +} + static int fio_sgio_open(struct thread_data *td, struct fio_file *f) { struct sgio_data *sd = td->io_ops_data; + struct sg_options *o = td->eo; int ret; ret = generic_open_file(td, f); @@ -984,14 +1123,38 @@ static int fio_sgio_open(struct thread_data *td, struct fio_file *f) return ret; } + if (o->write_mode == FIO_SG_WRITE_STREAM) { + if (o->stream_id) + f->engine_pos = o->stream_id; + else { + ret = fio_sgio_stream_control(f, true, (uint16_t *) &f->engine_pos); + if (ret) + return ret; + } + } + return 0; } +static int fio_sgio_close(struct thread_data *td, struct fio_file *f) +{ + struct sg_options *o = td->eo; + int ret; + + if (!o->stream_id && o->write_mode == FIO_SG_WRITE_STREAM) { + ret = fio_sgio_stream_control(f, false, (uint16_t *) &f->engine_pos); + if (ret) + return ret; + } + + return generic_close_file(td, f); +} + /* * Build an error string with details about the driver, host or scsi * error contained in the sg header Caller will use as necessary. */ -static char *fio_sgio_errdetails(struct io_u *io_u) +static char *fio_sgio_errdetails(struct thread_data *td, struct io_u *io_u) { struct sg_io_hdr *hdr = &io_u->hdr; #define MAXERRDETAIL 1024 @@ -1168,10 +1331,12 @@ static char *fio_sgio_errdetails(struct io_u *io_u) strlcat(msg, ". 
", MAXERRDETAIL); } if (hdr->sb_len_wr) { + const uint8_t *const sbp = hdr->sbp; + snprintf(msgchunk, MAXMSGCHUNK, "Sense Data (%d bytes):", hdr->sb_len_wr); strlcat(msg, msgchunk, MAXERRDETAIL); for (i = 0; i < hdr->sb_len_wr; i++) { - snprintf(msgchunk, MAXMSGCHUNK, " %02x", hdr->sbp[i]); + snprintf(msgchunk, MAXMSGCHUNK, " %02x", sbp[i]); strlcat(msg, msgchunk, MAXERRDETAIL); } strlcat(msg, ". ", MAXERRDETAIL); @@ -1261,9 +1426,9 @@ static struct ioengine_ops ioengine = { .event = fio_sgio_event, .cleanup = fio_sgio_cleanup, .open_file = fio_sgio_open, - .close_file = generic_close_file, + .close_file = fio_sgio_close, .get_file_size = fio_sgio_get_file_size, - .flags = FIO_SYNCIO | FIO_RAWIO, + .flags = FIO_SYNCIO | FIO_RAWIO | FIO_RO_NEEDS_RW_OPEN, .options = options, .option_struct_size = sizeof(struct sg_options) }; diff --git a/engines/skeleton_external.c b/engines/skeleton_external.c index cff83a10ef..f2b3fce950 100644 --- a/engines/skeleton_external.c +++ b/engines/skeleton_external.c @@ -71,15 +71,6 @@ static int fio_skeleton_getevents(struct thread_data *td, unsigned int min, return 0; } -/* - * The ->cancel() hook attempts to cancel the io_u. Only relevant for - * async io engines, and need not be supported. - */ -static int fio_skeleton_cancel(struct thread_data *td, struct io_u *io_u) -{ - return 0; -} - /* * The ->queue() hook is responsible for initiating io on the io_u * being passed in. 
If the io engine is a synchronous one, io may complete @@ -214,7 +205,6 @@ struct ioengine_ops ioengine = { .init = fio_skeleton_init, .prep = fio_skeleton_prep, .queue = fio_skeleton_queue, - .cancel = fio_skeleton_cancel, .getevents = fio_skeleton_getevents, .event = fio_skeleton_event, .cleanup = fio_skeleton_cleanup, diff --git a/engines/solarisaio.c b/engines/solarisaio.c index 21e95935b2..e179c0a100 100644 --- a/engines/solarisaio.c +++ b/engines/solarisaio.c @@ -19,12 +19,6 @@ struct solarisaio_data { unsigned int max_depth; }; -static int fio_solarisaio_cancel(struct thread_data fio_unused *td, - struct io_u *io_u) -{ - return aiocancel(&io_u->resultp); -} - static int fio_solarisaio_prep(struct thread_data fio_unused *td, struct io_u *io_u) { @@ -185,8 +179,9 @@ static void fio_solarisaio_init_sigio(void) static int fio_solarisaio_init(struct thread_data *td) { - struct solarisaio_data *sd = malloc(sizeof(*sd)); unsigned int max_depth; + struct solarisaio_data *sd; + sd = calloc(1, sizeof(*sd)); max_depth = td->o.iodepth; if (max_depth > MAXASYNCHIO) { @@ -195,9 +190,7 @@ static int fio_solarisaio_init(struct thread_data *td) max_depth); } - memset(sd, 0, sizeof(*sd)); - sd->aio_events = malloc(max_depth * sizeof(struct io_u *)); - memset(sd->aio_events, 0, max_depth * sizeof(struct io_u *)); + sd->aio_events = calloc(max_depth, sizeof(struct io_u *)); sd->max_depth = max_depth; #ifdef USE_SIGNAL_COMPLETIONS @@ -214,7 +207,6 @@ static struct ioengine_ops ioengine = { .init = fio_solarisaio_init, .prep = fio_solarisaio_prep, .queue = fio_solarisaio_queue, - .cancel = fio_solarisaio_cancel, .getevents = fio_solarisaio_getevents, .event = fio_solarisaio_event, .cleanup = fio_solarisaio_cleanup, diff --git a/engines/sync.c b/engines/sync.c index 339ba99970..89466ca596 100644 --- a/engines/sync.c +++ b/engines/sync.c @@ -70,7 +70,7 @@ static struct fio_option options[] = { .lname = "Uncached", .type = FIO_OPT_INT, .off1 = offsetof(struct psyncv2_options, 
uncached), - .help = "Use RWF_UNCACHED for buffered read/writes", + .help = "Use RWF_DONTCACHE for buffered read/writes", .category = FIO_OPT_C_ENGINE, .group = FIO_OPT_G_INVALID, }, @@ -173,7 +173,7 @@ static enum fio_q_status fio_pvsyncio2_queue(struct thread_data *td, (rand_between(&sd->rand_state, 1, 100) <= o->hipri_percentage)) flags |= RWF_HIPRI; if (!td->o.odirect && o->uncached) - flags |= RWF_UNCACHED; + flags |= RWF_DONTCACHE; if (o->nowait) flags |= RWF_NOWAIT; @@ -182,9 +182,11 @@ static enum fio_q_status fio_pvsyncio2_queue(struct thread_data *td, if (io_u->ddir == DDIR_READ) ret = preadv2(f->fd, iov, 1, io_u->offset, flags); - else if (io_u->ddir == DDIR_WRITE) + else if (io_u->ddir == DDIR_WRITE) { + if (td->o.oatomic) + flags |= RWF_ATOMIC; ret = pwritev2(f->fd, iov, 1, io_u->offset, flags); - else if (io_u->ddir == DDIR_TRIM) { + } else if (io_u->ddir == DDIR_TRIM) { do_io_u_trim(td, io_u); return FIO_Q_COMPLETED; } else @@ -402,8 +404,7 @@ static int fio_vsyncio_init(struct thread_data *td) { struct syncio_data *sd; - sd = malloc(sizeof(*sd)); - memset(sd, 0, sizeof(*sd)); + sd = calloc(1, sizeof(*sd)); sd->last_offset = -1ULL; sd->iovecs = malloc(td->o.iodepth * sizeof(struct iovec)); sd->io_us = malloc(td->o.iodepth * sizeof(struct io_u *)); @@ -484,7 +485,8 @@ static struct ioengine_ops ioengine_pvrw2 = { .open_file = generic_open_file, .close_file = generic_close_file, .get_file_size = generic_get_file_size, - .flags = FIO_SYNCIO, + .flags = FIO_SYNCIO | + FIO_ATOMICWRITES, .options = options, .option_struct_size = sizeof(struct psyncv2_options), }; diff --git a/engines/windowsaio.c b/engines/windowsaio.c index 9868e816ad..6681f8bbab 100644 --- a/engines/windowsaio.c +++ b/engines/windowsaio.c @@ -11,6 +11,7 @@ #include #include "../fio.h" +#include "../optgroup.h" typedef BOOL (WINAPI *CANCELIOEX)(HANDLE hFile, LPOVERLAPPED lpOverlapped); @@ -35,6 +36,26 @@ struct thread_ctx { struct windowsaio_data *wd; }; +struct windowsaio_options { + 
struct thread_data *td; + unsigned int no_completion_thread; +}; + +static struct fio_option options[] = { + { + .name = "no_completion_thread", + .lname = "No completion polling thread", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct windowsaio_options, no_completion_thread), + .help = "Use to avoid separate completion polling thread", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_WINDOWSAIO, + }, + { + .name = NULL, + }, +}; + static DWORD WINAPI IoCompletionRoutine(LPVOID lpParameter); static int fio_windowsaio_init(struct thread_data *td) @@ -80,6 +101,7 @@ static int fio_windowsaio_init(struct thread_data *td) struct thread_ctx *ctx; struct windowsaio_data *wd; HANDLE hFile; + struct windowsaio_options *o = td->eo; hFile = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0); if (hFile == INVALID_HANDLE_VALUE) { @@ -91,29 +113,30 @@ static int fio_windowsaio_init(struct thread_data *td) wd->iothread_running = TRUE; wd->iocp = hFile; - if (!rc) - ctx = malloc(sizeof(struct thread_ctx)); + if (o->no_completion_thread == 0) { + if (!rc) + ctx = malloc(sizeof(struct thread_ctx)); - if (!rc && ctx == NULL) { - log_err("windowsaio: failed to allocate memory for thread context structure\n"); - CloseHandle(hFile); - rc = 1; - } + if (!rc && ctx == NULL) { + log_err("windowsaio: failed to allocate memory for thread context structure\n"); + CloseHandle(hFile); + rc = 1; + } - if (!rc) { - DWORD threadid; + if (!rc) { + DWORD threadid; - ctx->iocp = hFile; - ctx->wd = wd; - wd->iothread = CreateThread(NULL, 0, IoCompletionRoutine, ctx, 0, &threadid); - if (!wd->iothread) - log_err("windowsaio: failed to create io completion thread\n"); - else if (fio_option_is_set(&td->o, cpumask)) - fio_setaffinity(threadid, td->o.cpumask); + ctx->iocp = hFile; + ctx->wd = wd; + wd->iothread = CreateThread(NULL, 0, IoCompletionRoutine, ctx, 0, &threadid); + if (!wd->iothread) + log_err("windowsaio: failed to create io completion thread\n"); + else if 
(fio_option_is_set(&td->o, cpumask)) + fio_setaffinity(threadid, td->o.cpumask); + } + if (rc || wd->iothread == NULL) + rc = 1; } - - if (rc || wd->iothread == NULL) - rc = 1; } return rc; @@ -225,7 +248,7 @@ static int fio_windowsaio_open_file(struct thread_data *td, struct fio_file *f) log_err("fio: unknown fadvise type %d\n", td->o.fadvise_hint); } - if (!td_write(td) || read_only) + if ((!td_write(td) && !(td->flags & TD_F_SYNCS)) || read_only) access = GENERIC_READ; else access = (GENERIC_READ | GENERIC_WRITE); @@ -302,9 +325,63 @@ static struct io_u* fio_windowsaio_event(struct thread_data *td, int event) return wd->aio_events[event]; } -static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min, - unsigned int max, - const struct timespec *t) +/* dequeue completion entries directly (no separate completion thread) */ +static int fio_windowsaio_getevents_nothread(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + struct windowsaio_data *wd = td->io_ops_data; + unsigned int dequeued = 0; + struct io_u *io_u; + DWORD start_count = 0; + DWORD end_count = 0; + DWORD mswait = 250; + struct fio_overlapped *fov; + + if (t != NULL) { + mswait = (t->tv_sec * 1000) + (t->tv_nsec / 1000000); + start_count = GetTickCount(); + end_count = start_count + (t->tv_sec * 1000) + (t->tv_nsec / 1000000); + } + + do { + BOOL ret; + OVERLAPPED *ovl; + + ULONG entries = min(16, max-dequeued); + OVERLAPPED_ENTRY oe[16]; + ret = GetQueuedCompletionStatusEx(wd->iocp, oe, 16, &entries, mswait, 0); + if (ret && entries) { + int entry_num; + + for (entry_num=0; entry_numio_u; + + if (ovl->Internal == ERROR_SUCCESS) { + io_u->resid = io_u->xfer_buflen - ovl->InternalHigh; + io_u->error = 0; + } else { + io_u->resid = io_u->xfer_buflen; + io_u->error = win_to_posix_error(GetLastError()); + } + + fov->io_complete = FALSE; + wd->aio_events[dequeued] = io_u; + dequeued++; + } + } + + if (dequeued >= min || + (t != NULL && 
timeout_expired(start_count, end_count))) + break; + } while (1); + return dequeued; +} + +/* dequeue completion entries created by the separate IoCompletionRoutine thread */ +static int fio_windowaio_getevents_thread(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) { struct windowsaio_data *wd = td->io_ops_data; unsigned int dequeued = 0; @@ -334,7 +411,6 @@ static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min, wd->aio_events[dequeued] = io_u; dequeued++; } - } if (dequeued >= min) break; @@ -353,6 +429,16 @@ static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min, return dequeued; } +static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min, + unsigned int max, const struct timespec *t) +{ + struct windowsaio_options *o = td->eo; + + if (o->no_completion_thread) + return fio_windowsaio_getevents_nothread(td, min, max, t); + return fio_windowaio_getevents_thread(td, min, max, t); +} + static enum fio_q_status fio_windowsaio_queue(struct thread_data *td, struct io_u *io_u) { @@ -484,6 +570,8 @@ static struct ioengine_ops ioengine = { .get_file_size = generic_get_file_size, .io_u_init = fio_windowsaio_io_u_init, .io_u_free = fio_windowsaio_io_u_free, + .options = options, + .option_struct_size = sizeof(struct windowsaio_options), }; static void fio_init fio_windowsaio_register(void) diff --git a/engines/xnvme.c b/engines/xnvme.c new file mode 100644 index 0000000000..5f1af78d3d --- /dev/null +++ b/engines/xnvme.c @@ -0,0 +1,1394 @@ +/* + * fio xNVMe IO Engine + * + * IO engine using the xNVMe C API. 
+ * + * See: http://xnvme.io/ + * + * SPDX-License-Identifier: Apache-2.0 + */ +#include +#include +#include +#include "fio.h" +#include "verify.h" +#include "zbd_types.h" +#include "dataplacement.h" +#include "optgroup.h" + +static pthread_mutex_t g_serialize = PTHREAD_MUTEX_INITIALIZER; + +struct xnvme_fioe_fwrap { + /* fio file representation */ + struct fio_file *fio_file; + + /* xNVMe device handle */ + struct xnvme_dev *dev; + /* xNVMe device geometry */ + const struct xnvme_geo *geo; + + struct xnvme_queue *queue; + + uint32_t ssw; + uint32_t lba_nbytes; + uint32_t md_nbytes; + uint32_t lba_pow2; + + uint8_t _pad[16]; +}; +XNVME_STATIC_ASSERT(sizeof(struct xnvme_fioe_fwrap) == 64, "Incorrect size") + +struct xnvme_fioe_data { + /* I/O completion queue */ + struct io_u **iocq; + + /* # of iocq entries; incremented via getevents()/cb_pool() */ + uint64_t completed; + + /* + * # of errors; incremented when observed on completion via + * getevents()/cb_pool() + */ + uint64_t ecount; + + /* Controller which device/file to select */ + int32_t prev; + int32_t cur; + + /* Number of devices/files for which open() has been called */ + int64_t nopen; + /* Number of devices/files allocated in files[] */ + uint64_t nallocated; + + struct iovec *iovec; + struct iovec *md_iovec; + + struct xnvme_fioe_fwrap files[]; +}; +XNVME_STATIC_ASSERT(sizeof(struct xnvme_fioe_data) == 64, "Incorrect size") + +struct xnvme_fioe_request { + /* Context for NVMe PI */ + struct xnvme_pi_ctx pi_ctx; + + /* Separate metadata buffer pointer */ + void *md_buf; +}; + +struct xnvme_fioe_options { + void *padding; + unsigned int hipri; + unsigned int sqpoll_thread; + unsigned int xnvme_dev_nsid; + unsigned int xnvme_iovec; + unsigned int md_per_io_size; + unsigned int pi_act; + unsigned int apptag; + unsigned int apptag_mask; + unsigned int prchk; + char *xnvme_be; + char *xnvme_mem; + char *xnvme_async; + char *xnvme_sync; + char *xnvme_admin; + char *xnvme_dev_subnqn; +}; + +static int 
str_pi_chk_cb(void *data, const char *str) +{ + struct xnvme_fioe_options *o = data; + + if (strstr(str, "GUARD") != NULL) + o->prchk = XNVME_PI_FLAGS_GUARD_CHECK; + if (strstr(str, "REFTAG") != NULL) + o->prchk |= XNVME_PI_FLAGS_REFTAG_CHECK; + if (strstr(str, "APPTAG") != NULL) + o->prchk |= XNVME_PI_FLAGS_APPTAG_CHECK; + + return 0; +} + +static struct fio_option options[] = { + { + .name = "hipri", + .lname = "High Priority", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct xnvme_fioe_options, hipri), + .help = "Use polled IO completions", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_XNVME, + }, + { + .name = "sqthread_poll", + .lname = "Kernel SQ thread polling", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct xnvme_fioe_options, sqpoll_thread), + .help = "Offload submission/completion to kernel thread", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_XNVME, + }, + { + .name = "xnvme_be", + .lname = "xNVMe Backend", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct xnvme_fioe_options, xnvme_be), + .help = "Select xNVMe backend [spdk,linux,fbsd]", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_XNVME, + }, + { + .name = "xnvme_mem", + .lname = "xNVMe Memory Backend", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct xnvme_fioe_options, xnvme_mem), + .help = "Select xNVMe memory backend", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_XNVME, + }, + { + .name = "xnvme_async", + .lname = "xNVMe Asynchronous command-interface", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct xnvme_fioe_options, xnvme_async), + .help = "Select xNVMe async. interface: " + "[emu,thrpool,io_uring,io_uring_cmd,libaio,posix,vfio,nil]", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_XNVME, + }, + { + .name = "xnvme_sync", + .lname = "xNVMe Synchronous. command-interface", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct xnvme_fioe_options, xnvme_sync), + .help = "Select xNVMe sync. 
interface: [nvme,psync,block]", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_XNVME, + }, + { + .name = "xnvme_admin", + .lname = "xNVMe Admin command-interface", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct xnvme_fioe_options, xnvme_admin), + .help = "Select xNVMe admin. cmd-interface: [nvme,block]", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_XNVME, + }, + { + .name = "xnvme_dev_nsid", + .lname = "xNVMe Namespace-Identifier, for user-space NVMe driver", + .type = FIO_OPT_INT, + .off1 = offsetof(struct xnvme_fioe_options, xnvme_dev_nsid), + .help = "xNVMe Namespace-Identifier, for user-space NVMe driver", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_XNVME, + }, + { + .name = "xnvme_dev_subnqn", + .lname = "Subsystem nqn for Fabrics", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct xnvme_fioe_options, xnvme_dev_subnqn), + .help = "Subsystem NQN for Fabrics", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_XNVME, + }, + { + .name = "xnvme_iovec", + .lname = "Vectored IOs", + .type = FIO_OPT_STR_SET, + .off1 = offsetof(struct xnvme_fioe_options, xnvme_iovec), + .help = "Send vectored IOs", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_XNVME, + }, + { + .name = "md_per_io_size", + .lname = "Separate Metadata Buffer Size per I/O", + .type = FIO_OPT_INT, + .off1 = offsetof(struct xnvme_fioe_options, md_per_io_size), + .def = "0", + .help = "Size of separate metadata buffer per I/O (Default: 0)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_XNVME, + }, + { + .name = "pi_act", + .lname = "Protection Information Action", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct xnvme_fioe_options, pi_act), + .def = "1", + .help = "Protection Information Action bit (pi_act=1 or pi_act=0)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_XNVME, + }, + { + .name = "pi_chk", + .lname = "Protection Information Check", + .type = FIO_OPT_STR_STORE, + .def = NULL, + .help = "Control of Protection Information Checking 
(pi_chk=GUARD,REFTAG,APPTAG)", + .cb = str_pi_chk_cb, + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_XNVME, + }, + { + .name = "apptag", + .lname = "Application Tag used in Protection Information", + .type = FIO_OPT_INT, + .off1 = offsetof(struct xnvme_fioe_options, apptag), + .def = "0x1234", + .help = "Application Tag used in Protection Information field (Default: 0x1234)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_XNVME, + }, + { + .name = "apptag_mask", + .lname = "Application Tag Mask", + .type = FIO_OPT_INT, + .off1 = offsetof(struct xnvme_fioe_options, apptag_mask), + .def = "0xffff", + .help = "Application Tag Mask used with Application Tag (Default: 0xffff)", + .category = FIO_OPT_C_ENGINE, + .group = FIO_OPT_G_XNVME, + }, + + { + .name = NULL, + }, +}; + +static void cb_pool(struct xnvme_cmd_ctx *ctx, void *cb_arg) +{ + struct io_u *io_u = cb_arg; + struct xnvme_fioe_data *xd = io_u->mmap_data; + struct xnvme_fioe_request *fio_req = io_u->engine_data; + struct xnvme_fioe_fwrap *fwrap = &xd->files[io_u->file->fileno]; + bool pi_act = (fio_req->pi_ctx.pi_flags >> 3); + int err; + + if (xnvme_cmd_ctx_cpl_status(ctx)) { + xnvme_cmd_ctx_pr(ctx, XNVME_PR_DEF); + xd->ecount += 1; + io_u->error = EIO; + } + + if (!io_u->error && fwrap->geo->pi_type && (io_u->ddir == DDIR_READ) && !pi_act) { + err = xnvme_pi_verify(&fio_req->pi_ctx, io_u->xfer_buf, + fio_req->md_buf, io_u->xfer_buflen / fwrap->lba_nbytes); + if (err) { + xd->ecount += 1; + io_u->error = EIO; + } + } + + xd->iocq[xd->completed++] = io_u; + xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx); +} + +static struct xnvme_opts xnvme_opts_from_fioe(struct thread_data *td) +{ + struct xnvme_fioe_options *o = td->eo; + struct xnvme_opts opts = xnvme_opts_default(); + + opts.nsid = o->xnvme_dev_nsid; + opts.subnqn = o->xnvme_dev_subnqn; + opts.be = o->xnvme_be; + opts.mem = o->xnvme_mem; + opts.async = o->xnvme_async; + opts.sync = o->xnvme_sync; + opts.admin = o->xnvme_admin; + + opts.poll_io = 
o->hipri; + opts.poll_sq = o->sqpoll_thread; + + opts.direct = td->o.odirect; + + return opts; +} + +static void _dev_close(struct thread_data *td, struct xnvme_fioe_fwrap *fwrap) +{ + if (fwrap->dev) + xnvme_queue_term(fwrap->queue); + + xnvme_dev_close(fwrap->dev); + + memset(fwrap, 0, sizeof(*fwrap)); +} + +static void xnvme_fioe_cleanup(struct thread_data *td) +{ + struct xnvme_fioe_data *xd = NULL; + int err; + + if (!td->io_ops_data) + return; + + xd = td->io_ops_data; + + err = pthread_mutex_lock(&g_serialize); + if (err) + log_err("ioeng->cleanup(): pthread_mutex_lock(), err(%d)\n", err); + /* NOTE: not returning here */ + + for (uint64_t i = 0; i < xd->nallocated; ++i) + _dev_close(td, &xd->files[i]); + + if (!err) { + err = pthread_mutex_unlock(&g_serialize); + if (err) + log_err("ioeng->cleanup(): pthread_mutex_unlock(), err(%d)\n", err); + } + + free(xd->iocq); + free(xd->iovec); + free(xd->md_iovec); + free(xd); + td->io_ops_data = NULL; +} + +static int _verify_options(struct thread_data *td, struct fio_file *f, + struct xnvme_fioe_fwrap *fwrap) +{ + struct xnvme_fioe_options *o = td->eo; + unsigned int correct_md_size; + + for_each_rw_ddir(ddir) { + if (td->o.min_bs[ddir] % fwrap->lba_nbytes || td->o.max_bs[ddir] % fwrap->lba_nbytes) { + if (!fwrap->lba_pow2) { + log_err("ioeng->_verify_options(%s): block size must be a multiple of %u " + "(LBA data size + Metadata size)\n", f->file_name, fwrap->lba_nbytes); + } else { + log_err("ioeng->_verify_options(%s): block size must be a multiple of LBA data size\n", + f->file_name); + } + return 1; + } + if (ddir == DDIR_TRIM) + continue; + + correct_md_size = (td->o.max_bs[ddir] / fwrap->lba_nbytes) * fwrap->md_nbytes; + if (fwrap->md_nbytes && fwrap->lba_pow2 && (o->md_per_io_size < correct_md_size)) { + log_err("ioeng->_verify_options(%s): md_per_io_size should be at least %u bytes\n", + f->file_name, correct_md_size); + return 1; + } + } + + /* + * For extended logical block sizes we cannot use verify 
when + * end to end data protection checks are enabled, as the PI + * section of data buffer conflicts with verify. + */ + if (fwrap->md_nbytes && fwrap->geo->pi_type && !fwrap->lba_pow2 && + td->o.verify != VERIFY_NONE) { + log_err("ioeng->_verify_options(%s): for extended LBA, verify cannot be used when E2E data protection is enabled\n", + f->file_name); + return 1; + } + + return 0; +} + +/** + * Helper function setting up device handles as addressed by the naming + * convention of the given `fio_file` filename. + * + * Checks thread-options for explicit control of asynchronous implementation via + * the ``--xnvme_async={thrpool,emu,posix,io_uring,libaio,nil}``. + */ +static int _dev_open(struct thread_data *td, struct fio_file *f) +{ + struct xnvme_opts opts = xnvme_opts_from_fioe(td); + struct xnvme_fioe_options *o = td->eo; + struct xnvme_fioe_data *xd = td->io_ops_data; + struct xnvme_fioe_fwrap *fwrap; + int flags = 0; + int err; + + if (f->fileno > (int)xd->nallocated) { + log_err("ioeng->_dev_open(%s): invalid assumption\n", f->file_name); + return 1; + } + + fwrap = &xd->files[f->fileno]; + + err = pthread_mutex_lock(&g_serialize); + if (err) { + log_err("ioeng->_dev_open(%s): pthread_mutex_lock(), err(%d)\n", f->file_name, + err); + return -err; + } + + fwrap->dev = xnvme_dev_open(f->file_name, &opts); + if (!fwrap->dev) { + log_err("ioeng->_dev_open(%s): xnvme_dev_open(), err(%d)\n", f->file_name, errno); + goto failure; + } + fwrap->geo = xnvme_dev_get_geo(fwrap->dev); + + if (xnvme_queue_init(fwrap->dev, td->o.iodepth, flags, &(fwrap->queue))) { + log_err("ioeng->_dev_open(%s): xnvme_queue_init(), err(?)\n", f->file_name); + goto failure; + } + xnvme_queue_set_cb(fwrap->queue, cb_pool, NULL); + + fwrap->ssw = xnvme_dev_get_ssw(fwrap->dev); + fwrap->lba_nbytes = fwrap->geo->lba_nbytes; + fwrap->md_nbytes = fwrap->geo->nbytes_oob; + + if (fwrap->geo->lba_extended) + fwrap->lba_pow2 = 0; + else + fwrap->lba_pow2 = 1; + + /* + * When PI action is set and 
PI size is equal to metadata size, the + * controller inserts/removes PI. So update the LBA data and metadata + * sizes accordingly. + */ + if (o->pi_act && fwrap->geo->pi_type && + fwrap->geo->nbytes_oob == xnvme_pi_size(fwrap->geo->pi_format)) { + if (fwrap->geo->lba_extended) { + fwrap->lba_nbytes -= fwrap->geo->nbytes_oob; + fwrap->lba_pow2 = 1; + } + fwrap->md_nbytes = 0; + } + + if (_verify_options(td, f, fwrap)) { + td_verror(td, EINVAL, "_dev_open"); + goto failure; + } + + fwrap->fio_file = f; + fwrap->fio_file->filetype = FIO_TYPE_BLOCK; + fwrap->fio_file->real_file_size = fwrap->geo->tbytes; + fio_file_set_size_known(fwrap->fio_file); + + err = pthread_mutex_unlock(&g_serialize); + if (err) + log_err("ioeng->_dev_open(%s): pthread_mutex_unlock(), err(%d)\n", f->file_name, + err); + + return 0; + +failure: + xnvme_queue_term(fwrap->queue); + xnvme_dev_close(fwrap->dev); + + err = pthread_mutex_unlock(&g_serialize); + if (err) + log_err("ioeng->_dev_open(%s): pthread_mutex_unlock(), err(%d)\n", f->file_name, + err); + + return 1; +} + +static int xnvme_fioe_init(struct thread_data *td) +{ + struct xnvme_fioe_data *xd = NULL; + struct xnvme_fioe_options *o = td->eo; + struct fio_file *f; + unsigned int i; + + if (!td->o.use_thread) { + log_err("ioeng->init(): --thread=1 is required\n"); + return 1; + } + + /* Allocate xd and iocq */ + xd = calloc(1, sizeof(*xd) + sizeof(*xd->files) * td->o.nr_files); + if (!xd) { + log_err("ioeng->init(): !calloc(), err(%d)\n", errno); + return 1; + } + + xd->iocq = calloc(td->o.iodepth, sizeof(struct io_u *)); + if (!xd->iocq) { + free(xd); + log_err("ioeng->init(): !calloc(xd->iocq), err(%d)\n", errno); + return 1; + } + + if (o->xnvme_iovec) { + xd->iovec = calloc(td->o.iodepth, sizeof(*xd->iovec)); + if (!xd->iovec) { + free(xd->iocq); + free(xd); + log_err("ioeng->init(): !calloc(xd->iovec), err(%d)\n", errno); + return 1; + } + } + + if (o->xnvme_iovec && o->md_per_io_size) { + xd->md_iovec = calloc(td->o.iodepth, 
sizeof(*xd->md_iovec)); + if (!xd->md_iovec) { + free(xd->iocq); + free(xd->iovec); + free(xd); + log_err("ioeng->init(): !calloc(xd->md_iovec), err(%d)\n", errno); + return 1; + } + } + + xd->prev = -1; + td->io_ops_data = xd; + + for_each_file(td, f, i) + { + if (_dev_open(td, f)) { + /* + * Note: We are not freeing xd, iocq, iovec and md_iovec. + * This will be done as part of cleanup routine. + */ + log_err("ioeng->init(): failed; _dev_open(%s)\n", f->file_name); + return 1; + } + + ++(xd->nallocated); + } + + if (xd->nallocated != td->o.nr_files) { + log_err("ioeng->init(): failed; nallocated != td->o.nr_files\n"); + return 1; + } + + return 0; +} + +/* NOTE: using the first device for buffer-allocators) */ +static int xnvme_fioe_iomem_alloc(struct thread_data *td, size_t total_mem) +{ + struct xnvme_fioe_data *xd = td->io_ops_data; + struct xnvme_fioe_fwrap *fwrap = &xd->files[0]; + + if (!fwrap->dev) { + log_err("ioeng->iomem_alloc(): failed; no dev-handle\n"); + return 1; + } + + td->orig_buffer = xnvme_buf_alloc(fwrap->dev, total_mem); + + return td->orig_buffer == NULL; +} + +/* NOTE: using the first device for buffer-allocators) */ +static void xnvme_fioe_iomem_free(struct thread_data *td) +{ + struct xnvme_fioe_data *xd = NULL; + struct xnvme_fioe_fwrap *fwrap = NULL; + + if (!td->io_ops_data) + return; + + xd = td->io_ops_data; + fwrap = &xd->files[0]; + + if (!fwrap->dev) { + log_err("ioeng->iomem_free(): failed no dev-handle\n"); + return; + } + + xnvme_buf_free(fwrap->dev, td->orig_buffer); +} + +static int xnvme_fioe_io_u_init(struct thread_data *td, struct io_u *io_u) +{ + struct xnvme_fioe_request *fio_req; + struct xnvme_fioe_options *o = td->eo; + struct xnvme_fioe_data *xd = td->io_ops_data; + struct xnvme_fioe_fwrap *fwrap = &xd->files[0]; + + if (!fwrap->dev) { + log_err("ioeng->io_u_init(): failed; no dev-handle\n"); + return 1; + } + + io_u->mmap_data = td->io_ops_data; + io_u->engine_data = NULL; + + fio_req = calloc(1, sizeof(*fio_req)); 
+ if (!fio_req) { + log_err("ioeng->io_u_init(): !calloc(fio_req), err(%d)\n", errno); + return 1; + } + + if (o->md_per_io_size) { + fio_req->md_buf = xnvme_buf_alloc(fwrap->dev, o->md_per_io_size); + if (!fio_req->md_buf) { + free(fio_req); + return 1; + } + } + + io_u->engine_data = fio_req; + + return 0; +} + +static void xnvme_fioe_io_u_free(struct thread_data *td, struct io_u *io_u) +{ + struct xnvme_fioe_data *xd = NULL; + struct xnvme_fioe_fwrap *fwrap = NULL; + struct xnvme_fioe_request *fio_req = NULL; + + if (!td->io_ops_data) + return; + + xd = td->io_ops_data; + fwrap = &xd->files[0]; + + if (!fwrap->dev) { + log_err("ioeng->io_u_free(): failed no dev-handle\n"); + return; + } + + fio_req = io_u->engine_data; + if (fio_req->md_buf) + xnvme_buf_free(fwrap->dev, fio_req->md_buf); + + free(fio_req); + + io_u->mmap_data = NULL; +} + +static struct io_u *xnvme_fioe_event(struct thread_data *td, int event) +{ + struct xnvme_fioe_data *xd = td->io_ops_data; + + assert(event >= 0); + assert((unsigned)event < xd->completed); + + return xd->iocq[event]; +} + +static int xnvme_fioe_getevents(struct thread_data *td, unsigned int min, unsigned int max, + const struct timespec *t) +{ + struct xnvme_fioe_data *xd = td->io_ops_data; + struct xnvme_fioe_fwrap *fwrap = NULL; + int nfiles = xd->nallocated; + int err = 0; + + if (xd->prev != -1 && ++xd->prev < nfiles) { + fwrap = &xd->files[xd->prev]; + xd->cur = xd->prev; + } + + xd->completed = 0; + for (;;) { + if (fwrap == NULL || xd->cur == nfiles) { + fwrap = &xd->files[0]; + xd->cur = 0; + } + + while (fwrap != NULL && xd->cur < nfiles && err >= 0) { + err = xnvme_queue_poke(fwrap->queue, max - xd->completed); + if (err < 0) { + switch (err) { + case -EBUSY: + case -EAGAIN: + usleep(1); + break; + + default: + log_err("ioeng->getevents(): unhandled IO error\n"); + assert(false); + return 0; + } + } + if (xd->completed >= min) { + xd->prev = xd->cur; + return xd->completed; + } + xd->cur++; + fwrap = 
&xd->files[xd->cur]; + + if (err < 0) { + switch (err) { + case -EBUSY: + case -EAGAIN: + usleep(1); + break; + } + } + } + } + + xd->cur = 0; + + return xd->completed; +} + +static enum fio_q_status xnvme_fioe_queue(struct thread_data *td, struct io_u *io_u) +{ + struct xnvme_fioe_data *xd = td->io_ops_data; + struct xnvme_fioe_options *o = td->eo; + struct xnvme_fioe_fwrap *fwrap; + struct xnvme_cmd_ctx *ctx; + struct xnvme_fioe_request *fio_req = io_u->engine_data; + uint32_t nsid; + uint64_t slba; + uint16_t nlb; + int err; + bool vectored_io = ((struct xnvme_fioe_options *)td->eo)->xnvme_iovec; + uint32_t dir = io_u->dtype; + + fio_ro_check(td, io_u); + + fwrap = &xd->files[io_u->file->fileno]; + nsid = xnvme_dev_get_nsid(fwrap->dev); + + if (fwrap->lba_pow2) { + slba = io_u->offset >> fwrap->ssw; + nlb = (io_u->xfer_buflen >> fwrap->ssw) - 1; + } else { + slba = io_u->offset / fwrap->lba_nbytes; + nlb = (io_u->xfer_buflen / fwrap->lba_nbytes) - 1; + } + + ctx = xnvme_queue_get_cmd_ctx(fwrap->queue); + ctx->async.cb_arg = io_u; + + ctx->cmd.common.nsid = nsid; + ctx->cmd.nvm.slba = slba; + ctx->cmd.nvm.nlb = nlb; + if (dir) { + ctx->cmd.nvm.dtype = io_u->dtype; + ctx->cmd.nvm.cdw13.dspec = io_u->dspec; + } + + switch (io_u->ddir) { + case DDIR_READ: + ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_READ; + break; + + case DDIR_WRITE: + ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_WRITE; + break; + + default: + log_err("ioeng->queue(): ENOSYS: %u\n", io_u->ddir); + xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx); + + io_u->error = ENOSYS; + assert(false); + return FIO_Q_COMPLETED; + } + + if (fwrap->geo->pi_type && !o->pi_act) { + err = xnvme_pi_ctx_init(&fio_req->pi_ctx, fwrap->lba_nbytes, + fwrap->geo->nbytes_oob, fwrap->geo->lba_extended, + fwrap->geo->pi_loc, fwrap->geo->pi_type, + (o->pi_act << 3 | o->prchk), slba, o->apptag_mask, + o->apptag, fwrap->geo->pi_format); + if (err) { + log_err("ioeng->queue(): err: '%d'\n", err); + + 
xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx); + + io_u->error = abs(err); + return FIO_Q_COMPLETED; + } + + if (io_u->ddir == DDIR_WRITE) + xnvme_pi_generate(&fio_req->pi_ctx, io_u->xfer_buf, fio_req->md_buf, + nlb + 1); + } + + if (fwrap->geo->pi_type) + ctx->cmd.nvm.prinfo = (o->pi_act << 3 | o->prchk); + + switch (fwrap->geo->pi_type) { + case XNVME_PI_TYPE1: + case XNVME_PI_TYPE2: + switch (fwrap->geo->pi_format) { + case XNVME_SPEC_NVM_NS_16B_GUARD: + if (o->prchk & XNVME_PI_FLAGS_REFTAG_CHECK) + ctx->cmd.nvm.ilbrt = (uint32_t)slba; + break; + case XNVME_SPEC_NVM_NS_64B_GUARD: + if (o->prchk & XNVME_PI_FLAGS_REFTAG_CHECK) { + ctx->cmd.nvm.ilbrt = (uint32_t)slba; + ctx->cmd.common.cdw03 = ((slba >> 32) & 0xffff); + } + break; + default: + break; + } + if (o->prchk & XNVME_PI_FLAGS_APPTAG_CHECK) { + ctx->cmd.nvm.lbat = o->apptag; + ctx->cmd.nvm.lbatm = o->apptag_mask; + } + break; + case XNVME_PI_TYPE3: + if (o->prchk & XNVME_PI_FLAGS_APPTAG_CHECK) { + ctx->cmd.nvm.lbat = o->apptag; + ctx->cmd.nvm.lbatm = o->apptag_mask; + } + break; + case XNVME_PI_DISABLE: + break; + } + + if (vectored_io) { + xd->iovec[io_u->index].iov_base = io_u->xfer_buf; + xd->iovec[io_u->index].iov_len = io_u->xfer_buflen; + if (fwrap->md_nbytes && fwrap->lba_pow2) { + xd->md_iovec[io_u->index].iov_base = fio_req->md_buf; + xd->md_iovec[io_u->index].iov_len = fwrap->md_nbytes * (nlb + 1); + err = xnvme_cmd_passv(ctx, &xd->iovec[io_u->index], 1, io_u->xfer_buflen, + &xd->md_iovec[io_u->index], 1, + fwrap->md_nbytes * (nlb + 1)); + } else { + err = xnvme_cmd_passv(ctx, &xd->iovec[io_u->index], 1, io_u->xfer_buflen, + NULL, 0, 0); + } + } else { + if (fwrap->md_nbytes && fwrap->lba_pow2) + err = xnvme_cmd_pass(ctx, io_u->xfer_buf, io_u->xfer_buflen, + fio_req->md_buf, fwrap->md_nbytes * (nlb + 1)); + else + err = xnvme_cmd_pass(ctx, io_u->xfer_buf, io_u->xfer_buflen, NULL, 0); + } + switch (err) { + case 0: + return FIO_Q_QUEUED; + + case -EBUSY: + case -EAGAIN: + 
xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx); + return FIO_Q_BUSY; + + default: + log_err("ioeng->queue(): err: '%d'\n", err); + + xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx); + + io_u->error = abs(err); + assert(false); + return FIO_Q_COMPLETED; + } +} + +static int xnvme_fioe_close(struct thread_data *td, struct fio_file *f) +{ + struct xnvme_fioe_data *xd = td->io_ops_data; + + dprint(FD_FILE, "xnvme close %s -- nopen: %ld\n", f->file_name, xd->nopen); + + --(xd->nopen); + + return 0; +} + +static int xnvme_fioe_open(struct thread_data *td, struct fio_file *f) +{ + struct xnvme_fioe_data *xd = td->io_ops_data; + + dprint(FD_FILE, "xnvme open %s -- nopen: %ld\n", f->file_name, xd->nopen); + + if (f->fileno > (int)xd->nallocated) { + log_err("ioeng->open(): f->fileno > xd->nallocated; invalid assumption\n"); + return 1; + } + if (xd->files[f->fileno].fio_file != f) { + log_err("ioeng->open(): fio_file != f; invalid assumption\n"); + return 1; + } + + ++(xd->nopen); + + return 0; +} + +static int xnvme_fioe_invalidate(struct thread_data *td, struct fio_file *f) +{ + /* Consider only doing this with be:spdk */ + return 0; +} + +static int xnvme_fioe_get_max_open_zones(struct thread_data *td, struct fio_file *f, + unsigned int *max_open_zones) +{ + struct xnvme_opts opts = xnvme_opts_from_fioe(td); + struct xnvme_dev *dev; + const struct xnvme_spec_znd_idfy_ns *zns; + int err = 0, err_lock; + + if (f->filetype != FIO_TYPE_FILE && f->filetype != FIO_TYPE_BLOCK && + f->filetype != FIO_TYPE_CHAR) { + log_info("ioeng->get_max_open_zoned(): ignoring filetype: %d\n", f->filetype); + return 0; + } + err_lock = pthread_mutex_lock(&g_serialize); + if (err_lock) { + log_err("ioeng->get_max_open_zones(): pthread_mutex_lock(), err(%d)\n", err_lock); + return -err_lock; + } + + dev = xnvme_dev_open(f->file_name, &opts); + if (!dev) { + log_err("ioeng->get_max_open_zones(): xnvme_dev_open(), err(%d)\n", err_lock); + err = -errno; + goto exit; + } + if 
(xnvme_dev_get_geo(dev)->type != XNVME_GEO_ZONED) { + errno = EINVAL; + err = -errno; + goto exit; + } + + zns = (void *)xnvme_dev_get_ns_css(dev); + if (!zns) { + log_err("ioeng->get_max_open_zones(): xnvme_dev_get_ns_css(), err(%d)\n", errno); + err = -errno; + goto exit; + } + + /* + * intentional overflow as the value is zero-based and NVMe + * defines 0xFFFFFFFF as unlimited thus overflowing to 0 which + * is how fio indicates unlimited and otherwise just converting + * to one-based. + */ + *max_open_zones = zns->mor + 1; + +exit: + xnvme_dev_close(dev); + err_lock = pthread_mutex_unlock(&g_serialize); + if (err_lock) + log_err("ioeng->get_max_open_zones(): pthread_mutex_unlock(), err(%d)\n", + err_lock); + + return err; +} + +/** + * Currently, this function is called before of I/O engine initialization, so, + * we cannot consult the file-wrapping done when 'fioe' initializes. + * Instead we just open based on the given filename. + * + * TODO: unify the different setup methods, consider keeping the handle around, + * and consider how to support the --be option in this usecase + */ +static int xnvme_fioe_get_zoned_model(struct thread_data *td, struct fio_file *f, + enum zbd_zoned_model *model) +{ + struct xnvme_opts opts = xnvme_opts_from_fioe(td); + struct xnvme_dev *dev; + int err = 0, err_lock; + + if (f->filetype != FIO_TYPE_FILE && f->filetype != FIO_TYPE_BLOCK && + f->filetype != FIO_TYPE_CHAR) { + log_info("ioeng->get_zoned_model(): ignoring filetype: %d\n", f->filetype); + return -EINVAL; + } + + err = pthread_mutex_lock(&g_serialize); + if (err) { + log_err("ioeng->get_zoned_model(): pthread_mutex_lock(), err(%d)\n", err); + return -err; + } + + dev = xnvme_dev_open(f->file_name, &opts); + if (!dev) { + log_err("ioeng->get_zoned_model(): xnvme_dev_open(%s) failed, errno: %d\n", + f->file_name, errno); + err = -errno; + goto exit; + } + + switch (xnvme_dev_get_geo(dev)->type) { + case XNVME_GEO_UNKNOWN: + dprint(FD_ZBD, "%s: got 'unknown', assigning 
ZBD_NONE\n", f->file_name); + *model = ZBD_NONE; + break; + + case XNVME_GEO_CONVENTIONAL: + dprint(FD_ZBD, "%s: got 'conventional', assigning ZBD_NONE\n", f->file_name); + *model = ZBD_NONE; + break; + + case XNVME_GEO_ZONED: + dprint(FD_ZBD, "%s: got 'zoned', assigning ZBD_HOST_MANAGED\n", f->file_name); + *model = ZBD_HOST_MANAGED; + break; + + default: + dprint(FD_ZBD, "%s: hit-default, assigning ZBD_NONE\n", f->file_name); + *model = ZBD_NONE; + errno = EINVAL; + err = -errno; + break; + } + +exit: + xnvme_dev_close(dev); + + err_lock = pthread_mutex_unlock(&g_serialize); + if (err_lock) + log_err("ioeng->get_zoned_model(): pthread_mutex_unlock(), err(%d)\n", err_lock); + + return err; +} + +/** + * Fills the given ``zbdz`` with at most ``nr_zones`` zone-descriptors. + * + * The implementation converts the NVMe Zoned Command Set log-pages for Zone + * descriptors into the Linux Kernel Zoned Block Report format. + * + * NOTE: This function is called before I/O engine initialization, that is, + * before ``_dev_open`` has been called and file-wrapping is setup. Thus is has + * to do the ``_dev_open`` itself, and shut it down again once it is done + * retrieving the log-pages and converting them to the report format. 
+ * + * TODO: unify the different setup methods, consider keeping the handle around, + * and consider how to support the --async option in this usecase + */ +static int xnvme_fioe_report_zones(struct thread_data *td, struct fio_file *f, uint64_t offset, + struct zbd_zone *zbdz, unsigned int nr_zones) +{ + struct xnvme_opts opts = xnvme_opts_from_fioe(td); + const struct xnvme_spec_znd_idfy_lbafe *lbafe = NULL; + struct xnvme_dev *dev = NULL; + const struct xnvme_geo *geo = NULL; + struct xnvme_znd_report *rprt = NULL; + uint32_t ssw; + uint64_t slba; + unsigned int limit = 0; + int err = 0, err_lock; + + dprint(FD_ZBD, "%s: report_zones() offset: %zu, nr_zones: %u\n", f->file_name, offset, + nr_zones); + + err = pthread_mutex_lock(&g_serialize); + if (err) { + log_err("ioeng->report_zones(%s): pthread_mutex_lock(), err(%d)\n", f->file_name, + err); + return -err; + } + + dev = xnvme_dev_open(f->file_name, &opts); + if (!dev) { + log_err("ioeng->report_zones(%s): xnvme_dev_open(), err(%d)\n", f->file_name, + errno); + /* report the failure; with err left at 0 the function returned 0 zones as success */ + err = -errno; + goto exit; + } + + geo = xnvme_dev_get_geo(dev); + ssw = xnvme_dev_get_ssw(dev); + lbafe = xnvme_znd_dev_get_lbafe(dev); + + limit = nr_zones > geo->nzone ? 
geo->nzone : nr_zones; + + dprint(FD_ZBD, "%s: limit: %u\n", f->file_name, limit); + + slba = ((offset >> ssw) / geo->nsect) * geo->nsect; + + rprt = xnvme_znd_report_from_dev(dev, slba, limit, 0); + if (!rprt) { + log_err("ioeng->report_zones(%s): xnvme_znd_report_from_dev(), err(%d)\n", + f->file_name, errno); + err = -errno; + goto exit; + } + if (rprt->nentries != limit) { + log_err("ioeng->report_zones(%s): nentries != nr_zones\n", f->file_name); + /* must be negative: the function returns the zone count on success, so a positive err reads as a count */ + err = -EIO; + goto exit; + } + if (offset > geo->tbytes) { + log_err("ioeng->report_zones(%s): out-of-bounds\n", f->file_name); + /* fail instead of returning 'limit' for descriptors that were never converted */ + err = -EINVAL; + goto exit; + } + + /* Transform the zone-report */ + for (uint32_t idx = 0; idx < rprt->nentries; ++idx) { + struct xnvme_spec_znd_descr *descr = XNVME_ZND_REPORT_DESCR(rprt, idx); + + zbdz[idx].start = descr->zslba << ssw; + zbdz[idx].len = lbafe->zsze << ssw; + zbdz[idx].capacity = descr->zcap << ssw; + zbdz[idx].wp = descr->wp << ssw; + + switch (descr->zt) { + case XNVME_SPEC_ZND_TYPE_SEQWR: + zbdz[idx].type = ZBD_ZONE_TYPE_SWR; + break; + + default: + log_err("ioeng->report_zones(%s): invalid type for zone at offset(%zu)\n", + f->file_name, zbdz[idx].start); + err = -EIO; + goto exit; + } + + switch (descr->zs) { + case XNVME_SPEC_ZND_STATE_EMPTY: + zbdz[idx].cond = ZBD_ZONE_COND_EMPTY; + break; + case XNVME_SPEC_ZND_STATE_IOPEN: + zbdz[idx].cond = ZBD_ZONE_COND_IMP_OPEN; + break; + case XNVME_SPEC_ZND_STATE_EOPEN: + zbdz[idx].cond = ZBD_ZONE_COND_EXP_OPEN; + break; + case XNVME_SPEC_ZND_STATE_CLOSED: + zbdz[idx].cond = ZBD_ZONE_COND_CLOSED; + break; + case XNVME_SPEC_ZND_STATE_FULL: + zbdz[idx].cond = ZBD_ZONE_COND_FULL; + break; + + case XNVME_SPEC_ZND_STATE_RONLY: + case XNVME_SPEC_ZND_STATE_OFFLINE: + default: + zbdz[idx].cond = ZBD_ZONE_COND_OFFLINE; + break; + } + } + +exit: + xnvme_buf_virt_free(rprt); + + xnvme_dev_close(dev); + + err_lock = pthread_mutex_unlock(&g_serialize); + if (err_lock) + log_err("ioeng->report_zones(): pthread_mutex_unlock(), err: %d\n", err_lock); + + 
dprint(FD_ZBD, "err: %d, nr_zones: %d\n", err, (int)nr_zones); + + return err ? err : (int)limit; +} + +/** + * NOTE: This function may get called before I/O engine initialization, that is, + * before ``_dev_open`` has been called and file-wrapping is setup. In such + * case it has to do ``_dev_open`` itself, and shut it down again once it is + * done resetting write pointer of zones. + */ +static int xnvme_fioe_reset_wp(struct thread_data *td, struct fio_file *f, uint64_t offset, + uint64_t length) +{ + struct xnvme_opts opts = xnvme_opts_from_fioe(td); + struct xnvme_fioe_data *xd = NULL; + struct xnvme_fioe_fwrap *fwrap = NULL; + struct xnvme_dev *dev = NULL; + const struct xnvme_geo *geo = NULL; + uint64_t first, last; + uint32_t ssw; + uint32_t nsid; + int err = 0, err_lock; + + if (td->io_ops_data) { + xd = td->io_ops_data; + fwrap = &xd->files[f->fileno]; + + assert(fwrap->dev); + assert(fwrap->geo); + + dev = fwrap->dev; + geo = fwrap->geo; + ssw = fwrap->ssw; + } else { + err = pthread_mutex_lock(&g_serialize); + if (err) { + log_err("ioeng->reset_wp(): pthread_mutex_lock(), err(%d)\n", err); + return -err; + } + + dev = xnvme_dev_open(f->file_name, &opts); + if (!dev) { + log_err("ioeng->reset_wp(): xnvme_dev_open(%s) failed, errno(%d)\n", + f->file_name, errno); + /* propagate the open failure; err was still 0 here, so the caller saw success */ + err = -errno; + goto exit; + } + geo = xnvme_dev_get_geo(dev); + ssw = xnvme_dev_get_ssw(dev); + } + + nsid = xnvme_dev_get_nsid(dev); + + first = ((offset >> ssw) / geo->nsect) * geo->nsect; + last = (((offset + length) >> ssw) / geo->nsect) * geo->nsect; + dprint(FD_ZBD, "first: 0x%lx, last: 0x%lx\n", first, last); + + for (uint64_t zslba = first; zslba < last; zslba += geo->nsect) { + struct xnvme_cmd_ctx ctx = xnvme_cmd_ctx_from_dev(dev); + + if (zslba >= (geo->nsect * geo->nzone)) { + log_err("ioeng->reset_wp(): out-of-bounds\n"); + err = 0; + break; + } + + err = xnvme_znd_mgmt_send(&ctx, nsid, zslba, false, + XNVME_SPEC_ZND_CMD_MGMT_SEND_RESET, 0x0, NULL); + if (err || xnvme_cmd_ctx_cpl_status(&ctx)) { 
+ err = err ? err : -EIO; + log_err("ioeng->reset_wp(): err(%d), sc(%d)", err, ctx.cpl.status.sc); + goto exit; + } + } + +exit: + if (!td->io_ops_data) { + xnvme_dev_close(dev); + + err_lock = pthread_mutex_unlock(&g_serialize); + if (err_lock) + log_err("ioeng->reset_wp(): pthread_mutex_unlock(), err(%d)\n", err_lock); + } + + return err; +} + +static int xnvme_fioe_fetch_ruhs(struct thread_data *td, struct fio_file *f, + struct fio_ruhs_info *fruhs_info) +{ + struct xnvme_opts opts = xnvme_opts_from_fioe(td); + struct xnvme_dev *dev; + struct xnvme_spec_ruhs *ruhs; + struct xnvme_cmd_ctx ctx; + uint32_t ruhs_nbytes, nr_ruhs; + uint32_t nsid; + int err = 0, err_lock; + + if (f->filetype != FIO_TYPE_CHAR && f->filetype != FIO_TYPE_FILE) { + log_err("ioeng->fdp_ruhs(): ignoring filetype: %d\n", f->filetype); + return -EINVAL; + } + + err = pthread_mutex_lock(&g_serialize); + if (err) { + log_err("ioeng->fdp_ruhs(): pthread_mutex_lock(), err(%d)\n", err); + return -err; + } + + dev = xnvme_dev_open(f->file_name, &opts); + if (!dev) { + log_err("ioeng->fdp_ruhs(): xnvme_dev_open(%s) failed, errno: %d\n", + f->file_name, errno); + err = -errno; + goto exit; + } + + nr_ruhs = fruhs_info->nr_ruhs; + ruhs_nbytes = sizeof(*ruhs) + (fruhs_info->nr_ruhs * sizeof(struct xnvme_spec_ruhs_desc)); + ruhs = xnvme_buf_alloc(dev, ruhs_nbytes); + if (!ruhs) { + err = -errno; + goto exit; + } + memset(ruhs, 0, ruhs_nbytes); + + ctx = xnvme_cmd_ctx_from_dev(dev); + nsid = xnvme_dev_get_nsid(dev); + + err = xnvme_nvm_mgmt_recv(&ctx, nsid, XNVME_SPEC_IO_MGMT_RECV_RUHS, 0, ruhs, ruhs_nbytes); + + if (err || xnvme_cmd_ctx_cpl_status(&ctx)) { + err = err ? 
err : -EIO; + log_err("ioeng->fdp_ruhs(): err(%d), sc(%d)\n", err, ctx.cpl.status.sc); + goto free_buffer; + } + + fruhs_info->nr_ruhs = ruhs->nruhsd; + for (uint32_t idx = 0; idx < nr_ruhs; ++idx) { + fruhs_info->plis[idx] = le16_to_cpu(ruhs->desc[idx].pi); + } + +free_buffer: + xnvme_buf_free(dev, ruhs); +exit: + xnvme_dev_close(dev); + + err_lock = pthread_mutex_unlock(&g_serialize); + if (err_lock) + log_err("ioeng->fdp_ruhs(): pthread_mutex_unlock(), err(%d)\n", err_lock); + + return err; +} + +/** + * Record the device capacity as the size of the given file. + * + * Does nothing when the size of 'f' is already known. Otherwise the device is + * opened while holding g_serialize, geo->tbytes is stored in f->real_file_size + * and the size is marked as known; when td->o.zone_mode is ZONE_MODE_ZBD the + * filetype is additionally set to FIO_TYPE_BLOCK. + * + * Returns 0 on success, or a negated error code when taking the lock or + * opening the device fails. + */ +static int xnvme_fioe_get_file_size(struct thread_data *td, struct fio_file *f) +{ + struct xnvme_opts opts = xnvme_opts_from_fioe(td); + struct xnvme_dev *dev; + int ret = 0, err; + + if (fio_file_size_known(f)) + return 0; + + ret = pthread_mutex_lock(&g_serialize); + if (ret) { + log_err("ioeng->get_file_size(): pthread_mutex_lock(), err(%d)\n", ret); + return -ret; + } + + dev = xnvme_dev_open(f->file_name, &opts); + if (!dev) { + log_err("%s: failed retrieving device handle, errno: %d\n", f->file_name, errno); + ret = -errno; + goto exit; + } + + f->real_file_size = xnvme_dev_get_geo(dev)->tbytes; + fio_file_set_size_known(f); + + if (td->o.zone_mode == ZONE_MODE_ZBD) + f->filetype = FIO_TYPE_BLOCK; + +exit: + xnvme_dev_close(dev); + err = pthread_mutex_unlock(&g_serialize); + if (err) + log_err("ioeng->get_file_size(): pthread_mutex_unlock(), err(%d)\n", err); + + return ret; +} + +FIO_STATIC struct ioengine_ops ioengine = { + .name = "xnvme", + .version = FIO_IOOPS_VERSION, + .options = options, + .option_struct_size = sizeof(struct xnvme_fioe_options), + .flags = FIO_DISKLESSIO | FIO_NODISKUTIL | FIO_NOEXTEND | FIO_MEMALIGN | FIO_RAWIO, + + .cleanup = xnvme_fioe_cleanup, + .init = xnvme_fioe_init, + + .iomem_free = xnvme_fioe_iomem_free, + .iomem_alloc = xnvme_fioe_iomem_alloc, + + .io_u_free = xnvme_fioe_io_u_free, + .io_u_init = xnvme_fioe_io_u_init, + + .event = xnvme_fioe_event, + .getevents = xnvme_fioe_getevents, + .queue = xnvme_fioe_queue, + + .close_file = 
xnvme_fioe_close, + .open_file = xnvme_fioe_open, + .get_file_size = xnvme_fioe_get_file_size, + + .invalidate = xnvme_fioe_invalidate, + .get_max_open_zones = xnvme_fioe_get_max_open_zones, + .get_zoned_model = xnvme_fioe_get_zoned_model, + .report_zones = xnvme_fioe_report_zones, + .reset_wp = xnvme_fioe_reset_wp, + + .fdp_fetch_ruhs = xnvme_fioe_fetch_ruhs, +}; + +static void fio_init fio_xnvme_register(void) +{ + register_ioengine(&ioengine); +} + +static void fio_exit fio_xnvme_unregister(void) +{ + unregister_ioengine(&ioengine); +} diff --git a/eta.c b/eta.c index ea1781f3b7..c6e3cffb81 100644 --- a/eta.c +++ b/eta.c @@ -3,6 +3,7 @@ */ #include #include +#include #ifdef CONFIG_VALGRIND_DEV #include #else @@ -214,8 +215,9 @@ static unsigned long thread_eta(struct thread_data *td) perc = td->o.rwmix[DDIR_WRITE]; bytes_total += (bytes_total * perc) / 100; - } else + } else { bytes_total <<= 1; + } } if (td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING) { @@ -227,8 +229,9 @@ static unsigned long thread_eta(struct thread_data *td) perc = (double) bytes_done / (double) bytes_total; if (perc > 1.0) perc = 1.0; - } else + } else { perc = 0.0; + } if (td->o.time_based) { if (timeout) { @@ -271,12 +274,11 @@ static unsigned long thread_eta(struct thread_data *td) uint64_t ramp_time = td->o.ramp_time; t_eta = __timeout + start_delay; - if (!td->ramp_time_over) { + if (in_ramp_period(td)) t_eta += ramp_time; - } t_eta /= 1000000ULL; - if ((td->runstate == TD_RAMP) && in_ramp_time(td)) { + if ((td->runstate == TD_RAMP) && in_ramp_period(td)) { unsigned long ramp_left; ramp_left = mtime_since_now(&td->epoch); @@ -374,14 +376,30 @@ bool eta_time_within_slack(unsigned int time) return time > ((eta_interval_msec * 95) / 100); } +/* + * These are the conditions under which we might be able to skip the eta + * calculation. 
+ */ +static bool skip_eta(void) +{ + if (!(output_format & FIO_OUTPUT_NORMAL) && f_out == stdout) + return true; + if (temp_stall_ts || eta_print == FIO_ETA_NEVER) + return true; + if (!isatty(STDOUT_FILENO) && eta_print != FIO_ETA_ALWAYS) + return true; + + return false; +} + /* * Print status of the jobs we know about. This includes rate estimates, * ETA, thread state, etc. */ -bool calc_thread_status(struct jobs_eta *je, int force) +static bool calc_thread_status(struct jobs_eta *je, int force) { - struct thread_data *td; - int i, unified_rw_rep; + int unified_rw_rep; + bool any_td_in_ramp; uint64_t rate_time, disp_time, bw_avg_time, *eta_secs; unsigned long long io_bytes[DDIR_RWDIR_CNT] = {}; unsigned long long io_iops[DDIR_RWDIR_CNT] = {}; @@ -392,14 +410,12 @@ bool calc_thread_status(struct jobs_eta *je, int force) static unsigned long long disp_io_iops[DDIR_RWDIR_CNT]; static struct timespec rate_prev_time, disp_prev_time; - if (!force) { - if (!(output_format & FIO_OUTPUT_NORMAL) && - f_out == stdout) - return false; - if (temp_stall_ts || eta_print == FIO_ETA_NEVER) - return false; + bool ret = true; - if (!isatty(STDOUT_FILENO) && (eta_print != FIO_ETA_ALWAYS)) + if (!force && skip_eta()) { + if (write_bw_log) + ret = false; + else return false; } @@ -408,18 +424,18 @@ bool calc_thread_status(struct jobs_eta *je, int force) if (!ddir_rw_sum(disp_io_bytes)) fill_start_time(&disp_prev_time); - eta_secs = malloc(thread_number * sizeof(uint64_t)); - memset(eta_secs, 0, thread_number * sizeof(uint64_t)); + eta_secs = calloc(thread_number, sizeof(uint64_t)); je->elapsed_sec = (mtime_since_genesis() + 999) / 1000; bw_avg_time = ULONG_MAX; unified_rw_rep = 0; - for_each_td(td, i) { + for_each_td(td) { unified_rw_rep += td->o.unified_rw_rep; if (is_power_of_2(td->o.kb_base)) je->is_pow2 = 1; je->unit_base = td->o.unit_base; + je->sig_figs = td->o.sig_figs; if (td->o.bw_avg_time < bw_avg_time) bw_avg_time = td->o.bw_avg_time; if (td->runstate == TD_RUNNING || 
td->runstate == TD_VERIFYING @@ -456,9 +472,9 @@ bool calc_thread_status(struct jobs_eta *je, int force) je->nr_pending++; if (je->elapsed_sec >= 3) - eta_secs[i] = thread_eta(td); + eta_secs[__td_index] = thread_eta(td); else - eta_secs[i] = INT_MAX; + eta_secs[__td_index] = INT_MAX; check_str_update(td); @@ -475,26 +491,26 @@ bool calc_thread_status(struct jobs_eta *je, int force) } } } - } + } end_for_each(); if (exitall_on_terminate) { je->eta_sec = INT_MAX; - for_each_td(td, i) { - if (eta_secs[i] < je->eta_sec) - je->eta_sec = eta_secs[i]; - } + for_each_td_index() { + if (eta_secs[__td_index] < je->eta_sec) + je->eta_sec = eta_secs[__td_index]; + } end_for_each(); } else { unsigned long eta_stone = 0; je->eta_sec = 0; - for_each_td(td, i) { + for_each_td(td) { if ((td->runstate == TD_NOT_CREATED) && td->o.stonewall) - eta_stone += eta_secs[i]; + eta_stone += eta_secs[__td_index]; else { - if (eta_secs[i] > je->eta_sec) - je->eta_sec = eta_secs[i]; + if (eta_secs[__td_index] > je->eta_sec) + je->eta_sec = eta_secs[__td_index]; } - } + } end_for_each(); je->eta_sec += eta_stone; } @@ -503,7 +519,11 @@ bool calc_thread_status(struct jobs_eta *je, int force) fio_gettime(&now, NULL); rate_time = mtime_since(&rate_prev_time, &now); - if (write_bw_log && rate_time > bw_avg_time && !in_ramp_time(td)) { + any_td_in_ramp = false; + for_each_td(td) { + any_td_in_ramp |= in_ramp_period(td); + } end_for_each(); + if (write_bw_log && rate_time > bw_avg_time && !any_td_in_ramp) { calc_rate(unified_rw_rep, rate_time, io_bytes, rate_io_bytes, je->rate); memcpy(&rate_prev_time, &now, sizeof(now)); @@ -529,7 +549,7 @@ bool calc_thread_status(struct jobs_eta *je, int force) je->nr_threads = thread_number; update_condensed_str(__run_str, run_str); memcpy(je->run_str, run_str, strlen(run_str)); - return true; + return ret; } static int gen_eta_str(struct jobs_eta *je, char *p, size_t left, @@ -600,9 +620,9 @@ void display_thread_status(struct jobs_eta *je) char *tr, *mr; mr = 
num2str(je->m_rate[0] + je->m_rate[1] + je->m_rate[2], - je->sig_figs, 0, je->is_pow2, N2S_BYTEPERSEC); + je->sig_figs, 1, je->is_pow2, N2S_BYTEPERSEC); tr = num2str(je->t_rate[0] + je->t_rate[1] + je->t_rate[2], - je->sig_figs, 0, je->is_pow2, N2S_BYTEPERSEC); + je->sig_figs, 1, je->is_pow2, N2S_BYTEPERSEC); p += sprintf(p, ", %s-%s", mr, tr); free(tr); @@ -686,10 +706,9 @@ struct jobs_eta *get_jobs_eta(bool force, size_t *size) return NULL; *size = sizeof(*je) + THREAD_RUNSTR_SZ + 8; - je = malloc(*size); + je = calloc(1, *size); if (!je) return NULL; - memset(je, 0, *size); if (!calc_thread_status(je, force)) { free(je); @@ -706,10 +725,10 @@ void print_thread_status(void) size_t size; je = get_jobs_eta(false, &size); - if (je) + if (je) { display_thread_status(je); - - free(je); + free(je); + } } void print_status_init(int thr_number) diff --git a/example_latency_steadystate.fio b/example_latency_steadystate.fio new file mode 100644 index 0000000000..b769ad1509 --- /dev/null +++ b/example_latency_steadystate.fio @@ -0,0 +1,47 @@ +# Example FIO job file demonstrating latency steady state detection +# This example shows how to use FIO's latency steady state detection +# to automatically terminate workloads when latency stabilizes +# +# Based on SNIA SSD Performance Test Specification requirements: +# - Steady state is achieved when latency measurements don't change more than +# 20% for 5 measurement windows and remain within 5% of a line with 10% slope +# - This example uses more conservative 5% deviation threshold for demonstration + +[global] +# Basic I/O parameters +ioengine=libaio +iodepth=32 +bs=4k +direct=1 +rw=randread +numjobs=1 +time_based=1 +runtime=3600 # Max runtime: 1 hour (will terminate early if steady state reached) + +# Steady state detection parameters +steadystate=lat:5% # Stop when latency mean deviation < 5% of average +steadystate_duration=300 # Use 5-minute rolling window for measurements +steadystate_ramp_time=60 # Wait 1 minute before 
starting measurements +steadystate_check_interval=10 # Take measurements every 10 seconds + +# Output options +write_lat_log=lat_steadystate +log_avg_msec=10000 # Log average latency every 10 seconds + +[latency_steady_test] +filename=/dev/nvme3n1 +size=10G + +# Alternative steady state configurations (uncomment to try): + +# Use slope-based detection instead of deviation: +# steadystate=lat_slope:0.1% + +# More aggressive detection (faster convergence): +# steadystate=lat:2% +# steadystate_duration=120 # 2-minute window +# steadystate_check_interval=5 # Check every 5 seconds + +# More conservative detection (slower convergence): +# steadystate=lat:10% +# steadystate_duration=600 # 10-minute window diff --git a/examples/atomic-verify.fio b/examples/atomic-verify.fio new file mode 100644 index 0000000000..17bcd89f86 --- /dev/null +++ b/examples/atomic-verify.fio @@ -0,0 +1,36 @@ +# Data verification with atomic writes +# +# Some background on atomic writes: +# +# The main selling point of atomic writes is that it is guaranteed writes +# to storage will not be torn for a power failure or kernel crash. + +# Another aspect of atomic writes is that they handle racing writes and +# reads, such that a read racing with a write will see all the data from +# the write or none. Well, SCSI and NVMe guarantee this if using +# RWF_ATOMIC, but it is not formally stated as a feature of RWF_ATOMIC. +# +# Fio verify mode can be used to prove that atomic writes can make "safe" +# racing reads and writes. This is done by having many jobs in a xsum verify +# mode. In this way, xsums should be correct, although a job may be +# reading a data block written by another job; however +# verify_write_sequence must be disabled, as it cannot be helped that data +# blocks will be out of sequence with many jobs. 
+# +# Atomic write limits: +# For a block device, the max block size for atomic=1 is in +# /sys/block/sdXXX/queue/atomic_write_unit_max_bytes +# or this value can also be read with a statx syscall on the bdev file. + +[write-and-verify] +rw=randwrite +bs=4k +direct=1 +ioengine=libaio +iodepth=16 +verify=crc64 +atomic=1 +verify_write_sequence=0 +numjobs=10 +# Use /dev/XXX or filename +filename=/dev/XXX diff --git a/examples/cmdprio-bssplit.fio b/examples/cmdprio-bssplit.fio index 47e9a79060..ee202d74f1 100644 --- a/examples/cmdprio-bssplit.fio +++ b/examples/cmdprio-bssplit.fio @@ -1,17 +1,79 @@ ; Randomly read/write a block device file at queue depth 16. -; 40 % of read IOs are 64kB and 60% are 1MB. 100% of writes are 1MB. -; 100% of the 64kB reads are executed at the highest priority and -; all other IOs executed without a priority set. [global] filename=/dev/sda direct=1 write_lat_log=prio-run.log log_prio=1 - -[randrw] rw=randrw -bssplit=64k/40:1024k/60,1024k/100 ioengine=libaio iodepth=16 + +; Simple cmdprio_bssplit format. All non-zero percentage entries will +; use the same prio class and prio level defined by the cmdprio_class +; and cmdprio options. +[cmdprio] +; 40% of read I/Os are 64kB and 60% are 1MB. 100% of writes are 1MB. +; 100% of the 64kB reads are executed with prio class 1 and prio level 0. +; All other I/Os are executed without a priority set. +bssplit=64k/40:1024k/60,1024k/100 cmdprio_bssplit=64k/100:1024k/0,1024k/0 cmdprio_class=1 +cmdprio=0 + +; Advanced cmdprio_bssplit format. Each non-zero percentage entry can +; use a different prio class and prio level (appended to each entry). +[cmdprio-adv] +; 40% of read I/Os are 64kB and 60% are 1MB. 100% of writes are 1MB. +; 25% of the 64kB reads are executed with prio class 1 and prio level 1, +; 75% of the 64kB reads are executed with prio class 3 and prio level 2. +; All other I/Os are executed without a priority set. 
+stonewall +bssplit=64k/40:1024k/60,1024k/100 +cmdprio_bssplit=64k/25/1/1:64k/75/3/2:1024k/0,1024k/0 + +; Identical to the previous example, but with a default priority defined. +[cmdprio-adv-def] +; 40% of read I/Os are 64kB and 60% are 1MB. 100% of writes are 1MB. +; 25% of the 64kB reads are executed with prio class 1 and prio level 1, +; 75% of the 64kB reads are executed with prio class 3 and prio level 2. +; All other I/Os are executed with prio class 2 and prio level 7. +stonewall +prioclass=2 +prio=7 +bssplit=64k/40:1024k/60,1024k/100 +cmdprio_bssplit=64k/25/1/1:64k/75/3/2:1024k/0,1024k/0 + +; Example of how to use cmdprio_bssplit with Command Duration Limits (CDL) +; using I/O priority hints. The drive has to support CDL, and CDL has to be +; enabled in sysfs, otherwise the hints will not be sent down to the drive. +[cmdprio-hints] +; 40% of the I/Os are 1MB reads and 60% of the I/Os are 2MB reads. +; +; 10% of the 1MB reads are executed with prio class 2 (Best Effort), +; prio level 0, and prio hint 1. Prio hint 1 means CDL descriptor 1. +; Since 40% of read I/Os are 1MB, and 10% of the 1MB I/Os use CDL desc 1, +; this means that 4% of all the issued I/O will use this configuration. +; +; 30% of the 1MB reads are executed with prio class 2 (Best Effort), +; prio level 0, and prio hint 2. Prio hint 2 means CDL descriptor 2. +; Since 40% of read I/Os are 1MB, and 30% of the 1MB I/Os use CDL desc 2, +; this means that 12% of all the issued I/O will use this configuration. +; +; 60% of the 1MB reads are executed with prio class 2 (Best Effort), +; prio level 0, and prio hint 0. Prio hint 0 means no hint. +; Since 40% of read I/Os are 1MB, and 60% of the 1MB I/Os use no hint, +; this means that 24% of all the issued I/O will use this configuration. +; +; 10% of the 2MB reads are executed with prio class 2 (Best Effort), +; prio level 0, and prio hint 3. Prio hint 3 means CDL descriptor 3. 
+; Since 60% of read I/Os are 2MB, and 10% of the 2MB I/Os use CDL desc 3, +; this means that 6% of all the issued I/O will use this configuration. +; +; 90% of the 2MB reads are executed with prio class 2 (Best Effort), +; prio level 0, and prio hint 0. Prio hint 0 means no hint. +; Since 60% of read I/Os are 2MB, and 90% of the 2MB I/Os use no hint, +; this means that 54% of all the issued I/O will use this configuration. +stonewall +rw=randread +bssplit=1M/40:2M/60 +cmdprio_bssplit=1M/10/2/0/1:1M/30/2/0/2:1M/60/2/0/0:2M/10/2/0/3:2M/90/2/0/0 diff --git a/examples/cmdprio-bssplit.png b/examples/cmdprio-bssplit.png index a0bb3ff439..83a5570bc1 100644 Binary files a/examples/cmdprio-bssplit.png and b/examples/cmdprio-bssplit.png differ diff --git a/examples/dedupe-global.fio b/examples/dedupe-global.fio new file mode 100644 index 0000000000..edaaad55b7 --- /dev/null +++ b/examples/dedupe-global.fio @@ -0,0 +1,57 @@ +# Writing to 2 files that share the duplicate blocks. +# The dedupe working set is spread uniformly such that when +# each of the jobs choose to perform a dedup operation they will +# regenerate a buffer from the global space. +# If you test the dedup ratio on either file by itself the result +# is likely lower than if you test the ratio of the two files combined. +# +# Use `./t/fio-dedupe -C 1 -c 1 -b 4096` to test the total +# data reduction ratio. 
+# +# +# Full example of test: +# $ ./fio ./examples/dedupe-global.fio +# +# Checking ratio on a and b individually: +# $ ./t/fio-dedupe a.0.0 -C 1 -c 1 -b 4096 +# +# $ Extents=25600, Unique extents=16817 Duplicated extents=5735 +# $ De-dupe ratio: 1:0.52 +# $ De-dupe working set at least: 22.40% +# $ Fio setting: dedupe_percentage=34 +# $ Unique capacity 33MB +# +# ./t/fio-dedupe b.0.0 -C 1 -c 1 -b 4096 +# $ Extents=25600, Unique extents=17009 Duplicated extents=5636 +# $ De-dupe ratio: 1:0.51 +# $ De-dupe working set at least: 22.02% +# $ Fio setting: dedupe_percentage=34 +# $ Unique capacity 34MB +# +# Combining files: +# $ cat a.0.0 > c.0.0 +# $ cat b.0.0 >> c.0.0 +# +# Checking data reduction ratio on combined file: +# $ ./t/fio-dedupe c.0.0 -C 1 -c 1 -b 4096 +# $ Extents=51200, Unique extents=25747 Duplicated extents=11028 +# $ De-dupe ratio: 1:0.99 +# $ De-dupe working set at least: 21.54% +# $ Fio setting: dedupe_percentage=50 +# $ Unique capacity 51MB +# +[global] +ioengine=libaio +iodepth=256 +size=100m +dedupe_mode=working_set +dedupe_global=1 +dedupe_percentage=50 +blocksize=4k +rw=write +buffer_compress_percentage=50 +dedupe_working_set_percentage=50 + +[a] + +[b] diff --git a/examples/dedupe-global.png b/examples/dedupe-global.png new file mode 100644 index 0000000000..fd4602e315 Binary files /dev/null and b/examples/dedupe-global.png differ diff --git a/examples/dircreate-ioengine.fio b/examples/dircreate-ioengine.fio new file mode 100644 index 0000000000..c89d9e4d00 --- /dev/null +++ b/examples/dircreate-ioengine.fio @@ -0,0 +1,25 @@ +# Example dircreate job +# +# create_on_open is needed so that the open happens during the run and not the +# setup. +# +# openfiles needs to be set so that you do not exceed the maximum allowed open +# files. +# +# filesize needs to be set to a non zero value so fio will actually run, but the +# IO will not really be done and the write latency numbers will only reflect the +# open times. 
+[global] +create_on_open=1 +nrfiles=30 +ioengine=dircreate +fallocate=none +filesize=4k +openfiles=1 + +[t0] +[t1] +[t2] +[t3] +[t4] +[t5] diff --git a/examples/dircreate-ioengine.png b/examples/dircreate-ioengine.png new file mode 100644 index 0000000000..da1a8c40a0 Binary files /dev/null and b/examples/dircreate-ioengine.png differ diff --git a/examples/dirdelete-ioengine.fio b/examples/dirdelete-ioengine.fio new file mode 100644 index 0000000000..4e5b1e2c7b --- /dev/null +++ b/examples/dirdelete-ioengine.fio @@ -0,0 +1,18 @@ +# Example dirdelete job + +# 'dirdelete' engine only does 'rmdir(dirname)'. +# 'filesize' must be set, then directories will be created at setup stage. +# 'unlink' is better set to 0, since the directory is deleted in measurement. +# Options that disable completion latency output, such as 'disable_clat' and 'gtod_reduce', must not be set. +[global] +ioengine=dirdelete +filesize=4k +nrfiles=200 +unlink=0 + +[t0] +[t1] +[t2] +[t3] +[t4] +[t5] diff --git a/examples/dirdelete-ioengine.png b/examples/dirdelete-ioengine.png new file mode 100644 index 0000000000..af2461952d Binary files /dev/null and b/examples/dirdelete-ioengine.png differ diff --git a/examples/dirstat-ioengine.fio b/examples/dirstat-ioengine.fio new file mode 100644 index 0000000000..1322dd28fa --- /dev/null +++ b/examples/dirstat-ioengine.fio @@ -0,0 +1,18 @@ +# Example dirstat job + +# 'dirstat' engine only does 'stat(dirname)', file will not be open(). +# 'filesize' must be set, then files will be created at setup stage. 
+ +[global] +ioengine=dirstat +numjobs=10 +filesize=4k +nrfiles=5 +thread + +[t0] +[t1] +[t2] +[t3] +[t4] +[t5] diff --git a/examples/dirstat-ioengine.png b/examples/dirstat-ioengine.png new file mode 100644 index 0000000000..14b948ba53 Binary files /dev/null and b/examples/dirstat-ioengine.png differ diff --git a/examples/disk-zone-profile.fio b/examples/disk-zone-profile.fio index 96e5669556..577820ebe7 100644 --- a/examples/disk-zone-profile.fio +++ b/examples/disk-zone-profile.fio @@ -1,4 +1,4 @@ -; Read disk in zones of 128m/2g, generating a plot of that afterwards +; Read disk in zones of 256m/2g. Generating a plot of that afterwards ; should give a nice picture of the zoning of this drive [global] @@ -7,8 +7,11 @@ direct=1 rw=read ioengine=libaio iodepth=2 +zonemode=strided zonesize=256m zoneskip=2g -write_bw_log -[/dev/sdb] +[disk-zone-profile] +filename=/dev/sdb +write_bw_log +log_offset=1 diff --git a/examples/enospc-pressure.fio b/examples/enospc-pressure.fio index ca9d8f7a7a..fa404fd505 100644 --- a/examples/enospc-pressure.fio +++ b/examples/enospc-pressure.fio @@ -35,8 +35,8 @@ bs=4k rw=randtrim filename=raicer -# Verifier thread continiously write to newly allcated blocks -# and veryfy written content +# Verifier thread continuously writes to newly allcated blocks +# and verifies written content [aio-dio-verifier] create_on_open=1 verify=crc32c-intel diff --git a/examples/falloc.fio b/examples/falloc.fio index fadf132169..5a3e88b81e 100644 --- a/examples/falloc.fio +++ b/examples/falloc.fio @@ -29,7 +29,7 @@ rw=randtrim numjobs=2 filename=fragmented_file -## Mesure IO performance on fragmented file +## Measure IO performance on fragmented file [sequential aio-dio write] stonewall ioengine=libaio diff --git a/examples/http-s3-crypto.fio b/examples/http-s3-crypto.fio new file mode 100644 index 0000000000..2403746edc --- /dev/null +++ b/examples/http-s3-crypto.fio @@ -0,0 +1,38 @@ +# Example test for the HTTP engine's S3 support against Amazon AWS. 
+# Obviously, you have to adjust the S3 credentials; for this example, +# they're passed in via the environment. +# And you can set the SSE Customer Key and Algorithm to test Server +# Side Encryption. +# + +[global] +ioengine=http +name=test +direct=1 +filename=/larsmb-fio-test/object +http_verbose=0 +https=on +http_mode=s3 +http_s3_key=${S3_KEY} +http_s3_keyid=${S3_ID} +http_host=s3.eu-central-1.amazonaws.com +http_s3_region=eu-central-1 +http_s3_sse_customer_key=${SSE_KEY} +http_s3_sse_customer_algorithm=AES256 +group_reporting + +# With verify, this both writes and reads the object +[create] +rw=write +bs=4k +size=64k +io_size=4k +verify=sha256 + +[trim] +stonewall +rw=trim +bs=4k +size=64k +io_size=4k + diff --git a/examples/http-s3-crypto.png b/examples/http-s3-crypto.png new file mode 100644 index 0000000000..b452cf4541 Binary files /dev/null and b/examples/http-s3-crypto.png differ diff --git a/examples/http-s3-storage-class.fio b/examples/http-s3-storage-class.fio new file mode 100644 index 0000000000..9ee23837df --- /dev/null +++ b/examples/http-s3-storage-class.fio @@ -0,0 +1,37 @@ +# Example test for the HTTP engine's S3 support against Amazon AWS. +# Obviously, you have to adjust the S3 credentials; for this example, +# they're passed in via the environment. +# And here add storage class parameter, you can set normal test for +# STANDARD and compression test for another storage class. 
+# + +[global] +ioengine=http +name=test +direct=1 +filename=/larsmb-fio-test/object +http_verbose=0 +https=on +http_mode=s3 +http_s3_key=${S3_KEY} +http_s3_keyid=${S3_ID} +http_host=s3.eu-central-1.amazonaws.com +http_s3_region=eu-central-1 +http_s3_storage_class=${STORAGE_CLASS} +group_reporting + +# With verify, this both writes and reads the object +[create] +rw=write +bs=4k +size=64k +io_size=4k +verify=sha256 + +[trim] +stonewall +rw=trim +bs=4k +size=64k +io_size=4k + diff --git a/examples/http-s3-storage-class.png b/examples/http-s3-storage-class.png new file mode 100644 index 0000000000..b893a4eb28 Binary files /dev/null and b/examples/http-s3-storage-class.png differ diff --git a/examples/http-s3.fio b/examples/http-s3.fio index 2dcae364a9..043426baef 100644 --- a/examples/http-s3.fio +++ b/examples/http-s3.fio @@ -1,19 +1,36 @@ # Example test for the HTTP engine's S3 support against Amazon AWS. # Obviously, you have to adjust the S3 credentials; for this example, # they're passed in via the environment. +# For non-AWS S3 implementations, refer to your S3 vendor's region +# settings. Note that the region value appears twice, in http_host and +# http_s3_region. +# This example uses virtual-hosted-style requests: +# https://bucket-name.s3.region-code.amazonaws.com/k/e.y +# For path-style, prefix the key with the bucket name in the filename +# so that filename=/bucket-name/k/e.y: +# https://s3.region-code.amazonaws.com/bucket-name/k/e.y # +# IMPORTANT: filename needs to begin with a '/': +# FIO formats the url as `"http://%s%s", o->host, object`, so if +# filename does not begin with a '/' DNS will fail. For example, if +# http_host=amazonaws.com and filename=k/1, URL will be set to +# amazonaws.comk/1 and curl will attempt to resolve amazonaws.comk +# which will fail. + +# Reference for Virtual-hosted-style vs. 
Path-style URLs: +# https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html [global] ioengine=http name=test direct=1 -filename=/larsmb-fio-test/object +filename=/k/e.y http_verbose=0 https=on http_mode=s3 http_s3_key=${S3_KEY} http_s3_keyid=${S3_ID} -http_host=s3.eu-central-1.amazonaws.com +http_host=bucket-name.s3.eu-central-1.amazonaws.com http_s3_region=eu-central-1 group_reporting diff --git a/examples/libblkio-io_uring.fio b/examples/libblkio-io_uring.fio new file mode 100644 index 0000000000..40f625cfa6 --- /dev/null +++ b/examples/libblkio-io_uring.fio @@ -0,0 +1,29 @@ +; Benchmark accessing a regular file or block device using libblkio. +; +; Replace "/dev/nvme0n1" below with the path to your file or device, or override +; it by passing the '--libblkio_path=...' flag to fio. +; +; In the example below, the two subjobs of "job-B" *and* the single subjob of +; "job-C" will share a single libblkio instance, and "job-A" will use a separate +; libblkio instance. +; +; For information on libblkio, see: https://gitlab.com/libblkio/libblkio + +[global] +ioengine=libblkio +libblkio_driver=io_uring +libblkio_path=/dev/nvme0n1 ; REPLACE THIS WITH THE RIGHT PATH +rw=randread +blocksize=4k +direct=1 +time_based=1 +runtime=10s + +[job-A] + +[job-B] +numjobs=2 ; run two copies of this job simultaneously +thread=1 ; have each copy run as a separate thread in the *same* process + +[job-C] +thread=1 ; have the job run as a thread in the *same* process as "job-B" diff --git a/examples/libblkio-io_uring.png b/examples/libblkio-io_uring.png new file mode 100644 index 0000000000..1bc6cc9874 Binary files /dev/null and b/examples/libblkio-io_uring.png differ diff --git a/examples/libblkio-virtio-blk-vfio-pci.fio b/examples/libblkio-virtio-blk-vfio-pci.fio new file mode 100644 index 0000000000..024224a6ad --- /dev/null +++ b/examples/libblkio-virtio-blk-vfio-pci.fio @@ -0,0 +1,29 @@ +; Benchmark accessing a PCI virtio-blk device using libblkio. 
+; +; Replace "/sys/bus/pci/devices/0000:00:01.0" below with the path to your +; device's sysfs directory, or override it by passing the '--libblkio_path=...' +; flag to fio. +; +; In the example below, the two subjobs of "job-B" *and* the single subjob of +; "job-C" will share a single libblkio instance, and "job-A" will use a separate +; libblkio instance. +; +; For information on libblkio, see: https://gitlab.com/libblkio/libblkio + +[global] +ioengine=libblkio +libblkio_driver=virtio-blk-vfio-pci +libblkio_path=/sys/bus/pci/devices/0000:00:01.0 ; REPLACE THIS WITH THE RIGHT PATH +rw=randread +blocksize=4k +time_based=1 +runtime=10s + +[job-A] + +[job-B] +numjobs=2 ; run two copies of this job simultaneously +thread=1 ; have each copy run as a separate thread in the *same* process + +[job-C] +thread=1 ; have the job run as a thread in the *same* process as "job-B" diff --git a/examples/libblkio-virtio-blk-vfio-pci.png b/examples/libblkio-virtio-blk-vfio-pci.png new file mode 100644 index 0000000000..8a670cc280 Binary files /dev/null and b/examples/libblkio-virtio-blk-vfio-pci.png differ diff --git a/examples/librpma_apm-client.fio b/examples/librpma_apm-client.fio deleted file mode 100644 index 82a5d20cb5..0000000000 --- a/examples/librpma_apm-client.fio +++ /dev/null @@ -1,24 +0,0 @@ -# Example of the librpma_apm_client job - -[global] -ioengine=librpma_apm_client -create_serialize=0 # (required) forces specific initiation sequence -serverip=[serverip] #IP address the server is listening on -port=7204 # port(s) the server will listen on, will be used -thread - -# The client will get a remote memory region description after establishing -# a connection. 
- -[client] -numjobs=1 # number of parallel connections -group_reporting=1 -sync=1 # 1 is the best for latency measurements, 0 for bandwidth -iodepth=2 # total number of ious -iodepth_batch_submit=1 # number of ious to be submitted at once -rw=write # read/write/randread/randwrite/readwrite/rw -rwmixread=70 # % of a mixed workload that should be reads -blocksize=4KiB -ramp_time=15s # gives some time to stabilize the workload -time_based -runtime=60s # run the workload for the specified period of time diff --git a/examples/librpma_apm-client.png b/examples/librpma_apm-client.png deleted file mode 100644 index 2fe02cdfdc..0000000000 Binary files a/examples/librpma_apm-client.png and /dev/null differ diff --git a/examples/librpma_apm-server.fio b/examples/librpma_apm-server.fio deleted file mode 100644 index 062b5215d2..0000000000 --- a/examples/librpma_apm-server.fio +++ /dev/null @@ -1,26 +0,0 @@ -# Example of the librpma_apm_server job - -[global] -ioengine=librpma_apm_server -create_serialize=0 # (required) forces specific initiation sequence -kb_base=1000 # turn on the straight units handling (non-compatibility mode) -serverip=[serverip] # IP address to listen on -port=7204 # port(s) the server jobs will listen on, ports will be used -thread - -# The server side spawns one thread for each expected connection from -# the client-side, opens and registers the range dedicated for this thread -# (a workspace) from the provided memory. -# Each of the server threads accepts a connection on the dedicated port -# (different for each and every working thread) and waits for it to end up, -# and closes itself. 
- -[server] -# set to 1 (true) ONLY when Direct Write to PMem from the remote host is possible -# (https://pmem.io/rpma/documentation/basic-direct-write-to-pmem.html) -direct_write_to_pmem=0 - -numjobs=1 # number of expected incomming connections -size=100MiB # size of workspace for a single connection -filename=malloc # device dax or an existing fsdax file or "malloc" for allocation from DRAM -# filename=/dev/dax1.0 diff --git a/examples/librpma_apm-server.png b/examples/librpma_apm-server.png deleted file mode 100644 index f78ae02e87..0000000000 Binary files a/examples/librpma_apm-server.png and /dev/null differ diff --git a/examples/librpma_gpspm-client.fio b/examples/librpma_gpspm-client.fio deleted file mode 100644 index 843382df66..0000000000 --- a/examples/librpma_gpspm-client.fio +++ /dev/null @@ -1,23 +0,0 @@ -# Example of the librpma_gpspm_client job - -[global] -ioengine=librpma_gpspm_client -create_serialize=0 # (required) forces specific initiation sequence -serverip=[serverip] #IP address the server is listening on -port=7204 # port(s) the server will listen on, will be used -thread - -# The client will get a remote memory region description after establishing -# a connection. 
- -[client] -numjobs=1 # number of parallel connections -group_reporting=1 -sync=1 # 1 is the best for latency measurements, 0 for bandwidth -iodepth=2 # total number of ious -iodepth_batch_submit=1 # number of ious to be submitted at once -rw=write # write/randwrite -blocksize=4KiB -ramp_time=15s # gives some time to stabilize the workload -time_based -runtime=60s # run the workload for the specified period of time diff --git a/examples/librpma_gpspm-client.png b/examples/librpma_gpspm-client.png deleted file mode 100644 index 0c975a275a..0000000000 Binary files a/examples/librpma_gpspm-client.png and /dev/null differ diff --git a/examples/librpma_gpspm-server.fio b/examples/librpma_gpspm-server.fio deleted file mode 100644 index 67e92a28ad..0000000000 --- a/examples/librpma_gpspm-server.fio +++ /dev/null @@ -1,33 +0,0 @@ -# Example of the librpma_gpspm_server job - -[global] -ioengine=librpma_gpspm_server -create_serialize=0 # (required) forces specific initiation sequence -kb_base=1000 # turn on the straight units handling (non-compatibility mode) -serverip=[serverip] #IP address to listen on -port=7204 # port(s) the server jobs will listen on, ports will be used -thread - -# The server side spawns one thread for each expected connection from -# the client-side, opens and registers the range dedicated for this thread -# (a workspace) from the provided memory. -# Each of the server threads accepts a connection on the dedicated port -# (different for each and every working thread), accepts and executes flush -# requests, and sends back a flush response for each of the requests. -# When the client is done it sends the termination notice to the server's thread. - -[server] -# set to 1 (true) ONLY when Direct Write to PMem from the remote host is possible -# (https://pmem.io/rpma/documentation/basic-direct-write-to-pmem.html) -direct_write_to_pmem=0 -# set to 0 (false) to wait for completion instead of busy-wait polling completion. 
-busy_wait_polling=1 -numjobs=1 # number of expected incomming connections -iodepth=2 # number of parallel GPSPM requests -size=100MiB # size of workspace for a single connection -filename=malloc # device dax or an existing fsdax file or "malloc" for allocation from DRAM -# filename=/dev/dax1.0 - -# The client will terminate the server when the client will end up its job. -time_based -runtime=365d diff --git a/examples/librpma_gpspm-server.png b/examples/librpma_gpspm-server.png deleted file mode 100644 index 56124533da..0000000000 Binary files a/examples/librpma_gpspm-server.png and /dev/null differ diff --git a/examples/nbd.fio b/examples/nbd.fio index 6900ebe7f0..31629fad70 100644 --- a/examples/nbd.fio +++ b/examples/nbd.fio @@ -1,21 +1,25 @@ -# To use fio to test nbdkit: +# To use fio to test nbdkit + RAM disk: # -# nbdkit -U - memory size=256M --run 'export unixsocket; fio examples/nbd.fio' +# nbdkit -U - memory size=256M --run 'export uri; fio examples/nbd.fio' # -# To use fio to test qemu-nbd: +# To use fio to test nbdkit + local file: # -# rm -f /tmp/disk.img /tmp/socket -# truncate -s 256M /tmp/disk.img -# export unixsocket=/tmp/socket -# qemu-nbd -t -k $unixsocket -f raw /tmp/disk.img & -# fio examples/nbd.fio -# killall qemu-nbd +# rm -f /var/tmp/disk.img +# truncate -s 256M /var/tmp/disk.img +# nbdkit -U - file /var/tmp/disk.img --run 'export uri; fio examples/nbd.fio' +# +# To use fio to test qemu-nbd + local file: +# +# rm -f /var/tmp/disk.img /var/tmp/socket +# truncate -s 256M /var/tmp/disk.img +# export uri='nbd+unix:///?socket=/var/tmp/socket' +# qemu-nbd -t -k /var/tmp/socket -f raw /var/tmp/disk.img & +# fio examples/nbd.fio +# killall qemu-nbd [global] ioengine=nbd -uri=nbd+unix:///?socket=${unixsocket} -# Starting from nbdkit 1.14 the following will work: -#uri=${uri} +uri=${uri} rw=randrw time_based runtime=60 diff --git a/examples/nbd.png b/examples/nbd.png index e3bcf61058..3a933c9ba0 100644 Binary files a/examples/nbd.png and 
b/examples/nbd.png differ diff --git a/examples/netio_vsock.fio b/examples/netio_vsock.fio new file mode 100644 index 0000000000..8c328f7dd3 --- /dev/null +++ b/examples/netio_vsock.fio @@ -0,0 +1,22 @@ +# Example network vsock job, just defines two clients that send/recv data +[global] +ioengine=net + +port=8888 +protocol=vsock +bs=4k +size=100g + +#set the below option to enable end-to-end data integrity tests +#verify=md5 + +[receiver] +listen +rw=read + +[sender] +# 1 (VMADDR_CID_LOCAL) is the well-known address +# for local communication (loopback) +hostname=1 +startdelay=1 +rw=write diff --git a/examples/netio_vsock.png b/examples/netio_vsock.png new file mode 100644 index 0000000000..01aadde556 Binary files /dev/null and b/examples/netio_vsock.png differ diff --git a/examples/netio_vsock_receiver.fio b/examples/netio_vsock_receiver.fio new file mode 100644 index 0000000000..e2a00c4d79 --- /dev/null +++ b/examples/netio_vsock_receiver.fio @@ -0,0 +1,14 @@ +# Example network vsock job, just defines a receiver +[global] +ioengine=net +port=8888 +protocol=vsock +bs=4k +size=100g + +#set the below option to enable end-to-end data integrity tests +#verify=md5 + +[receiver] +listen +rw=read diff --git a/examples/netio_vsock_receiver.png b/examples/netio_vsock_receiver.png new file mode 100644 index 0000000000..524a7a1c95 Binary files /dev/null and b/examples/netio_vsock_receiver.png differ diff --git a/examples/netio_vsock_sender.fio b/examples/netio_vsock_sender.fio new file mode 100644 index 0000000000..2451d99005 --- /dev/null +++ b/examples/netio_vsock_sender.fio @@ -0,0 +1,17 @@ +# Example network vsock job, just defines a sender +[global] +ioengine=net +port=8888 +protocol=vsock +bs=4k +size=100g + +#set the below option to enable end-to-end data integrity tests +#verify=md5 + +[sender] +# set the 'hostname' option to the CID of the listening domain +hostname=3 +startdelay=1 +rw=write + diff --git a/examples/netio_vsock_sender.png 
b/examples/netio_vsock_sender.png new file mode 100644 index 0000000000..75802aafe4 Binary files /dev/null and b/examples/netio_vsock_sender.png differ diff --git a/examples/pmemblk.fio b/examples/pmemblk.fio deleted file mode 100644 index 59bb2a8a5a..0000000000 --- a/examples/pmemblk.fio +++ /dev/null @@ -1,71 +0,0 @@ -[global] -bs=1m -ioengine=pmemblk -norandommap -time_based -runtime=30 -group_reporting -disable_lat=1 -disable_slat=1 -disable_clat=1 -clat_percentiles=0 -cpus_allowed_policy=split - -# For the pmemblk engine: -# -# IOs always complete immediately -# IOs are always direct -# Must use threads -# -iodepth=1 -direct=1 -thread -numjobs=16 -# -# Unlink can be used to remove the files when done, but if you are -# using serial runs with stonewall, and you want the files to be created -# only once and unlinked only at the very end, then put the unlink=1 -# in the last group. This is the method demonstrated here. -# -# Note that if you have a read-only group and if the files will be -# newly created, then all of the data will read back as zero and the -# read will be optimized, yielding performance that is different from -# that of reading non-zero blocks (or unoptimized zero blocks). -# -unlink=0 -# -# The pmemblk engine does IO to files in a DAX-mounted filesystem. -# The filesystem should be created on an NVDIMM (e.g /dev/pmem0) -# and then mounted with the '-o dax' option. Note that the engine -# accesses the underlying NVDIMM directly, bypassing the kernel block -# layer, so the usual filesystem/disk performance monitoring tools such -# as iostat will not provide useful data. -# -# Here we specify a test file on each of two NVDIMMs. The first -# number after the file name is the block size in bytes (4096 bytes -# in this example). The second number is the size of the file to -# create in MiB (1 GiB in this example); note that the actual usable -# space available to fio will be less than this as libpmemblk requires -# some space for metadata. 
-# -# Currently, the minimum block size is 512 bytes and the minimum file -# size is about 17 MiB (these are libpmemblk requirements). -# -# While both files in this example have the same block size and file -# size, this is not required. -# -filename=/pmem0/fio-test,4096,1024 -#filename=/pmem1/fio-test,4096,1024 - -[pmemblk-write] -rw=randwrite -stonewall - -[pmemblk-read] -rw=randread -stonewall -# -# We're done, so unlink the file: -# -unlink=1 - diff --git a/examples/pmemblk.png b/examples/pmemblk.png deleted file mode 100644 index 250e254b72..0000000000 Binary files a/examples/pmemblk.png and /dev/null differ diff --git a/examples/rados.fio b/examples/rados.fio index 035cbff4ab..dd86f354c8 100644 --- a/examples/rados.fio +++ b/examples/rados.fio @@ -14,6 +14,7 @@ ioengine=rados clientname=admin pool=rados +conf=/etc/ceph/ceph.conf busy_poll=0 rw=randwrite bs=4k diff --git a/examples/rand-zones.fio b/examples/rand-zones.fio index 169137d493..10e717278f 100644 --- a/examples/rand-zones.fio +++ b/examples/rand-zones.fio @@ -21,6 +21,6 @@ random_distribution=zoned:50/5:30/15:20/ # The above applies to all of reads/writes/trims. If we wanted to do # something differently for writes, let's say 50% for the first 10% # and 50% for the remaining 90%, we could do it by adding a new section -# after a a comma. +# after a comma. # random_distribution=zoned:50/5:30/15:20/,50/10:50/90 diff --git a/examples/sg_verify-fail.fio b/examples/sg_verify-fail.fio new file mode 100644 index 0000000000..64feece3bc --- /dev/null +++ b/examples/sg_verify-fail.fio @@ -0,0 +1,48 @@ +# +# ********************************** +# * !!THIS IS A DESTRUCTIVE TEST!! 
* +# * IF NOT CHANGED THIS TEST WILL * +# * DESTROY DATA ON /dev/sdb * +# ********************************** +# +# Test SCSI VERIFY commands issued via the sg ioengine +# The jobs with fail in the name should produce errors +# +# job description +# precon precondition the device by writing with a known +# pattern +# verify01 verify each block one at a time by comparing to known +# pattern +# verify01-fail verifying one too many blocks should produce a failure +# verify11-one_ios verify all 20 blocks by sending only 512 bytes +# verify11-fail verifying beyond the preconditioned region should +# produce a failure + +[global] +filename=/dev/sdb +buffer_pattern=0x01 +ioengine=sg +rw=write +bs=512 +number_ios=20 +stonewall + +[precon] + +[verify01] +sg_write_mode=verify_bytchk_01 +number_ios=20 + +[verify01-fail] +sg_write_mode=verify_bytchk_01 +number_ios=21 + +[verify11-one_ios] +sg_write_mode=verify_bytchk_11 +number_ios=1 +bs=10240 + +[verify11-fail] +sg_write_mode=verify_bytchk_11 +number_ios=1 +bs=10752 diff --git a/examples/sg_verify-fail.png b/examples/sg_verify-fail.png new file mode 100644 index 0000000000..516e2d4061 Binary files /dev/null and b/examples/sg_verify-fail.png differ diff --git a/examples/sg_verify.fio b/examples/sg_verify.fio new file mode 100644 index 0000000000..6db0dd0a62 --- /dev/null +++ b/examples/sg_verify.fio @@ -0,0 +1,57 @@ +# +# ********************************** +# * !!THIS IS A DESTRUCTIVE TEST!! 
* +# * IF NOT CHANGED THIS TEST WILL * +# * DESTROY DATA ON /dev/sdb * +# ********************************** +# +# Test SCSI VERIFY commands issued via the sg ioengine +# All of the jobs below should complete without error +# +# job description +# precon precondition the device by writing with a known +# pattern +# verify00 verify written data on medium only +# verify01 verify each block one at a time by comparing to known +# pattern +# verify01-two_ios verify same data but with only two VERIFY operations +# verify11 verify each block one at a time +# verify11-five_ios verify data with five IOs, four blocks at a time, +# sending 512 bytes for each IO +# verify11-one_ios verify all 20 blocks by sending only 512 bytes +# + +[global] +filename=/dev/sdb +buffer_pattern=0x01 +ioengine=sg +rw=write +bs=512 +number_ios=20 +stonewall + +[precon] + +[verify00] +sg_write_mode=verify_bytchk_00 + +[verify01] +sg_write_mode=verify_bytchk_01 + +[verify01-two_ios] +sg_write_mode=verify_bytchk_01 +bs=5120 +number_ios=2 + +[verify11] +sg_write_mode=verify_bytchk_11 + +[verify11-five_ios] +sg_write_mode=verify_bytchk_11 +bs=2048 +number_ios=5 + +[verify11-one_ios] +sg_write_mode=verify_bytchk_11 +bs=10240 +number_ios=1 diff --git a/examples/sg_verify.png b/examples/sg_verify.png new file mode 100644 index 0000000000..f244a74890 Binary files /dev/null and b/examples/sg_verify.png differ diff --git a/examples/sg_write_same_ndob.fio b/examples/sg_write_same_ndob.fio new file mode 100644 index 0000000000..fb0473196b --- /dev/null +++ b/examples/sg_write_same_ndob.fio @@ -0,0 +1,44 @@ +# +# ********************************** +# * !!THIS IS A DESTRUCTIVE TEST!! 
* +# * IF NOT CHANGED THIS TEST WILL * +# * DESTROY DATA ON /dev/sdb * +# ********************************** +# +# Test WRITE SAME commands with the NDOB flag set +# issued via the sg ioengine +# All of the jobs below should complete without error +# except the last one +# +# job description +# precon Precondition the device by writing 20 blocks with a +# known pattern +# write_same_ndob Write 19 sectors of all zeroes with the NDOB flag set +# verify-pass Verify 19 blocks of all zeroes +# verify-fail Verify 20 blocks of all zeroes. This should fail. +# + +[global] +filename=/dev/sdb +buffer_pattern=0x01 +ioengine=sg +rw=write +bs=512 +stonewall + +[precon] +number_ios=20 + +[write_same_ndob] +sg_write_mode=write_same_ndob +number_ios=19 + +[verify-pass] +sg_write_mode=verify_bytchk_01 +buffer_pattern=0x00 +number_ios=19 + +[verify-fail] +sg_write_mode=verify_bytchk_01 +buffer_pattern=0x00 +number_ios=20 diff --git a/examples/sg_write_same_ndob.png b/examples/sg_write_same_ndob.png new file mode 100644 index 0000000000..8b76fc6c76 Binary files /dev/null and b/examples/sg_write_same_ndob.png differ diff --git a/examples/sprandom.fio b/examples/sprandom.fio new file mode 100644 index 0000000000..b94b226eb2 --- /dev/null +++ b/examples/sprandom.fio @@ -0,0 +1,41 @@ +; (SPRandom) SanDisk Random preconditioning example +; Requirements +; 1. Single file +; 2. Single job (numjobs=1) +; 3. Assumes norandommap=1 +; 4. Assumes random_generator=lfsr +; +; FIO_BS should be set to driver indirection unit (IU) size. +; IU is the smallest unit of data that can be mapped from a LBA +; on the host to a physical location on the SSD's flash memory. 
+; +; Basic execution example, run with io_uring +; env FIO_BS=4096 \ +; fio --filename=/dev/nvme0n1 --ioengine=io_uring examples/sprandom.fio +; +; Enable debug output for the 'sprandom' module +; env FIO_BS=4096 \ +; fio --debug=sprandom --filename=/dev/nvme0n1 examples/sprandom.fio +; +; Set over-provisioning according to vendor recommendation (21%) +; env FIO_BS=4096 \ +; fio --spr_op=0.21 --filename=/dev/nvme0n1 examples/sprandom.fio +; +; For large devices it is better to use more regions, to increase precision +; and reduce memory allocation. The allocation is proportional to the region size. +; env FIO_BS=4096 \ +; fio --spr_num_regions=400 --filename=/dev/nvme0n1 examples/sprandom.fio +; +[global] +ioengine=libaio +rw=randwrite +bs=${FIO_BS} +blockalign=${FIO_BS} +direct=1 +norandommap=1 +iodepth=64 +[preconditioning] +sprandom=1 +spr_op=0.15 +spr_num_regions=100 + diff --git a/examples/test.png b/examples/test.png deleted file mode 100644 index 6be500293c..0000000000 Binary files a/examples/test.png and /dev/null differ diff --git a/examples/uring-cmd-fdp.fio b/examples/uring-cmd-fdp.fio new file mode 100644 index 0000000000..55d741d3f5 --- /dev/null +++ b/examples/uring-cmd-fdp.fio @@ -0,0 +1,37 @@ +# io_uring_cmd I/O engine for nvme-ns generic character device with FDP enabled +# This assumes the namespace is already configured with FDP support and has at +# least 8 available reclaim units. +# +# Each job targets different ranges of LBAs with different placement +# identifiers, and has different write intensity. 
+ +[global] +filename=/dev/ng0n1 +ioengine=io_uring_cmd +cmd_type=nvme +iodepth=32 +bs=4K +fdp=1 +time_based=1 +runtime=1000 + +[write-heavy] +rw=randrw +rwmixwrite=90 +fdp_pli=0,1,2,3 +offset=0% +size=30% + +[write-mid] +rw=randrw +rwmixwrite=30 +fdp_pli=4,5 +offset=30% +size=30% + +[write-light] +rw=randrw +rwmixwrite=10 +fdp_pli=6 +offset=60% +size=30% diff --git a/examples/uring-cmd-fdp.png b/examples/uring-cmd-fdp.png new file mode 100644 index 0000000000..251f4fe34a Binary files /dev/null and b/examples/uring-cmd-fdp.png differ diff --git a/examples/uring-cmd-ng.fio b/examples/uring-cmd-ng.fio new file mode 100644 index 0000000000..b2888a0035 --- /dev/null +++ b/examples/uring-cmd-ng.fio @@ -0,0 +1,25 @@ +# io_uring_cmd I/O engine for nvme-ns generic character device + +[global] +filename=/dev/ng0n1 +ioengine=io_uring_cmd +cmd_type=nvme +size=1G +iodepth=32 +bs=4K +thread=1 +stonewall=1 + +[rand-write] +rw=randwrite +sqthread_poll=1 + +[rand-read] +rw=randread + +[write-opts] +rw=write +sqthread_poll=1 +sqthread_poll_cpu=0 +nonvectored=1 +registerfiles=1 diff --git a/examples/uring-cmd-ng.png b/examples/uring-cmd-ng.png new file mode 100644 index 0000000000..cd2ff16249 Binary files /dev/null and b/examples/uring-cmd-ng.png differ diff --git a/examples/uring-cmd-pi-ext.fio b/examples/uring-cmd-pi-ext.fio new file mode 100644 index 0000000000..e22ec06243 --- /dev/null +++ b/examples/uring-cmd-pi-ext.fio @@ -0,0 +1,31 @@ +# Protection information test with io_uring_cmd I/O engine for nvme-ns generic +# character device. +# +# This requires nvme device to be formatted with extended LBA data size and +# protection information enabled. This can be done with nvme-cli utility. +# Replace bs below with the correct extended LBA size. +# +# First we sequentially write to the device, without protection information +# action being set. FIO will generate and send necessary protection +# information data as per the protection information check option. 
Later on we +# sequentially read and verify the device returned protection information data. +# +[global] +filename=/dev/ng0n1 +ioengine=io_uring_cmd +cmd_type=nvme +size=1G +iodepth=32 +bs=4160 +pi_act=0 +pi_chk=GUARD,APPTAG,REFTAG +apptag=0x0888 +apptag_mask=0xFFFF +thread=1 +stonewall=1 + +[write] +rw=write + +[read] +rw=read diff --git a/examples/uring-cmd-pi-ext.png b/examples/uring-cmd-pi-ext.png new file mode 100644 index 0000000000..a102fc1a7a Binary files /dev/null and b/examples/uring-cmd-pi-ext.png differ diff --git a/examples/uring-cmd-pi-sb.fio b/examples/uring-cmd-pi-sb.fio new file mode 100644 index 0000000000..b201a7ce00 --- /dev/null +++ b/examples/uring-cmd-pi-sb.fio @@ -0,0 +1,32 @@ +# Protection information test with io_uring_cmd I/O engine for nvme-ns generic +# character device. +# +# This requires nvme device to be formatted with separate metadata buffer and +# protection information enabled. This can be done with nvme-cli utility. +# Replace md_per_io_size as per the required metadata buffer size for each IO. +# +# First we sequentially write to the device, without protection information +# action being set. FIO will generate and send necessary protection +# information data as per the protection information check option. Later on we +# sequentially read and verify the device returned protection information data. 
+# +[global] +filename=/dev/ng0n1 +ioengine=io_uring_cmd +cmd_type=nvme +size=1G +iodepth=32 +bs=4096 +md_per_io_size=64 +pi_act=0 +pi_chk=GUARD,APPTAG,REFTAG +apptag=0x0888 +apptag_mask=0xFFFF +thread=1 +stonewall=1 + +[write] +rw=write + +[read] +rw=read diff --git a/examples/uring-cmd-pi-sb.png b/examples/uring-cmd-pi-sb.png new file mode 100644 index 0000000000..dcdda8cdab Binary files /dev/null and b/examples/uring-cmd-pi-sb.png differ diff --git a/examples/uring-cmd-trim-multi-range.fio b/examples/uring-cmd-trim-multi-range.fio new file mode 100644 index 0000000000..b376481bbb --- /dev/null +++ b/examples/uring-cmd-trim-multi-range.fio @@ -0,0 +1,21 @@ +# Multi-range trim command test with io_uring_cmd I/O engine for nvme-ns +# generic character device. +# +[global] +filename=/dev/ng0n1 +ioengine=io_uring_cmd +cmd_type=nvme +size=10M +iodepth=32 +thread=1 +stonewall=1 + +[write_bs] +bs=4096 +rw=randtrim +num_range=8 + +[write_bssplit] +bssplit=4k/10:64k/50:32k/40 +rw=trim +num_range=8 diff --git a/examples/uring-cmd-trim-multi-range.png b/examples/uring-cmd-trim-multi-range.png new file mode 100644 index 0000000000..c3ffd54640 Binary files /dev/null and b/examples/uring-cmd-trim-multi-range.png differ diff --git a/examples/uring-cmd-zoned.fio b/examples/uring-cmd-zoned.fio new file mode 100644 index 0000000000..89be61beae --- /dev/null +++ b/examples/uring-cmd-zoned.fio @@ -0,0 +1,35 @@ +# io_uring_cmd I/O engine for nvme-ns generic zoned character device +# +# NOTE: +# Regular writes against a zone should be limited to QD1, as the device can +# reorder the requests. +# +# As the passthrough path does not use an IO scheduler (such as mq-deadline), +# the queue depth should be limited to 1 to avoid invalid zone writes. 
+ +[global] +filename=/dev/ng0n1 +ioengine=io_uring_cmd +cmd_type=nvme +zonemode=zbd +size=1G +iodepth=1 +bs=256K +verify=crc32c +stonewall=1 + +[rand-write] +rw=randwrite + +[write-opts] +rw=write +registerfiles=1 +sqthread_poll=1 +sqthread_poll_cpu=0 + +[randwrite-opts] +rw=randwrite +sqthread_poll=1 +sqthread_poll_cpu=0 +nonvectored=1 +registerfiles=1 diff --git a/examples/uring-cmd-zoned.png b/examples/uring-cmd-zoned.png new file mode 100644 index 0000000000..a3dd199dc5 Binary files /dev/null and b/examples/uring-cmd-zoned.png differ diff --git a/examples/xnvme-compare.fio b/examples/xnvme-compare.fio new file mode 100644 index 0000000000..b89dfdf4db --- /dev/null +++ b/examples/xnvme-compare.fio @@ -0,0 +1,72 @@ +; Compare fio IO engines with a random-read workload using BS=4k at QD=1 +; +; README +; +; This job-file is intended to be used as: +; +; # Use the built-in io_uring engine to get baseline numbers +; fio examples/xnvme-compare.fio \ +; --section=default \ +; --ioengine=io_uring \ +; --sqthread_poll=1 \ +; --filename=/dev/nvme0n1 +; +; # Use the xNVMe io-engine engine with Linux backend and io_uring async. impl. +; fio examples/xnvme-compare.fio \ +; --section=default \ +; --ioengine=xnvme \ +; --sqthread_poll=1 \ +; --xnvme_async=io_uring \ +; --filename=/dev/nvme0n1 +; +; # Use the xNVMe io-engine engine with Linux backend and libaio async. impl. +; fio examples/xnvme-compare.fio \ +; --section=default \ +; --ioengine=xnvme \ +; --xnvme_async=libaio \ +; --filename=/dev/nvme0n1 +; +; # Use the xNVMe io-engine engine with SPDK backend, note that you have to set the Namespace-id +; fio examples/xnvme-compare.fio \ +; --section=default \ +; --ioengine=xnvme \ +; --xnvme_dev_nsid=1 \ +; --filename=0000\\:01\\:00.0 +; +; NOTE: The URI encoded in the filename above, the ":" must be escaped. 
+; +; On the command-line using two "\\": +; +; --filename=0000\\:01\\:00.0 +; +; Within a fio-script using a single "\": +; +; filename=0000\:01\:00.0 +; +; NOTE: If you want to override the default bs, iodepth, and workload, then +; invoke it as: +; +; FIO_BS="512" FIO_RW="verify" FIO_IODEPTH=16 fio examples/xnvme-compare.fio \ +; --section=override +; +[global] +rw=randread +size=12G +iodepth=1 +bs=4K +direct=1 +thread=1 +time_based=1 +runtime=7 +ramp_time=3 +norandommap=1 + +; Avoid accidentally creating device files; e.g. "/dev/nvme0n1", "/dev/nullb0" +allow_file_create=0 + +[default] + +[override] +rw=${FIO_RW} +iodepth=${FIO_IODEPTH} +bs=${FIO_BS} diff --git a/examples/xnvme-compare.png b/examples/xnvme-compare.png new file mode 100644 index 0000000000..2af92f6299 Binary files /dev/null and b/examples/xnvme-compare.png differ diff --git a/examples/xnvme-fdp.fio b/examples/xnvme-fdp.fio new file mode 100644 index 0000000000..c50959f1f3 --- /dev/null +++ b/examples/xnvme-fdp.fio @@ -0,0 +1,56 @@ +; README +; +; This job-file is intended to be used either as: +; +; # Use the xNVMe io-engine engine io_uring_cmd async. impl. +; fio examples/xnvme-fdp.fio \ +; --section=default \ +; --ioengine=xnvme \ +; --xnvme_async=io_uring_cmd \ +; --filename=/dev/ng0n1 +; +; # Use the xNVMe io-engine engine with nvme sync. impl. +; fio examples/xnvme-fdp.fio \ +; --section=default \ +; --ioengine=xnvme \ +; --xnvme_sync=nvme \ +; --filename=/dev/ng0n1 +; +; # Use the xNVMe io-engine engine with SPDK backend, note that you have to set the Namespace-id +; fio examples/xnvme-fdp.fio \ +; --section=default \ +; --ioengine=xnvme \ +; --xnvme_dev_nsid=1 \ +; --filename=0000\\:01\\:00.0 +; +; NOTE: The URI encoded in the filename above, the ":" must be escaped. 
+; +; On the command-line using two "\\": +; +; --filename=0000\\:01\\:00.0 +; +; Within a fio-script using a single "\": +; +; filename=0000\:01\:00.0 +; +; NOTE: If you want to override the default bs, iodepth, and workload, then +; invoke it as: +; +; FIO_BS="512" FIO_RW="read" FIO_IODEPTH=16 fio examples/xnvme-fdp.fio \ +; --section=override --ioengine=xnvme --xnvme_sync=nvme --filename=/dev/ng0n1 +; +[global] +rw=randwrite +size=2M +iodepth=1 +bs=4K +thread=1 +fdp=1 +fdp_pli=4,5 + +[default] + +[override] +rw=${FIO_RW} +iodepth=${FIO_IODEPTH} +bs=${FIO_BS} diff --git a/examples/xnvme-fdp.png b/examples/xnvme-fdp.png new file mode 100644 index 0000000000..7f80274197 Binary files /dev/null and b/examples/xnvme-fdp.png differ diff --git a/examples/xnvme-pi.fio b/examples/xnvme-pi.fio new file mode 100644 index 0000000000..ca8c0101ae --- /dev/null +++ b/examples/xnvme-pi.fio @@ -0,0 +1,53 @@ +; README +; +; This job-file is intended to be used either as: +; +; # Use the xNVMe io-engine engine io_uring_cmd async. impl. +; fio examples/xnvme-pi.fio \ +; --ioengine=xnvme \ +; --xnvme_async=io_uring_cmd \ +; --filename=/dev/ng0n1 +; +; # Use the xNVMe io-engine engine with nvme sync. impl. +; fio examples/xnvme-pi.fio \ +; --ioengine=xnvme \ +; --xnvme_sync=nvme \ +; --filename=/dev/ng0n1 +; +; # Use the xNVMe io-engine engine with SPDK backend, note that you have to set the Namespace-id +; fio examples/xnvme-pi.fio \ +; --ioengine=xnvme \ +; --xnvme_dev_nsid=1 \ +; --filename=0000\\:01\\:00.0 +; +; NOTE: The URI encoded in the filename above, the ":" must be escaped. +; +; On the command-line using two "\\": +; +; --filename=0000\\:01\\:00.0 +; +; Within a fio-script using a single "\": +; +; filename=0000\:01\:00.0 +; +; NOTE: This example configuration assumes that the NVMe device is formatted +; with a separate metadata buffer. If you want to run on an extended LBA format +; update the "bs" accordingly. 
+; +[global] +size=100M +iodepth=16 +bs=4K +md_per_io_size=64 +pi_act=0 +pi_chk=GUARD,APPTAG,REFTAG +apptag=0x0234 +apptag_mask=0xFFFF +thread=1 +stonewall=1 + +[write] +rw=write + +[read] +rw=read diff --git a/examples/xnvme-pi.png b/examples/xnvme-pi.png new file mode 100644 index 0000000000..def7e68087 Binary files /dev/null and b/examples/xnvme-pi.png differ diff --git a/examples/xnvme-zoned.fio b/examples/xnvme-zoned.fio new file mode 100644 index 0000000000..1344f9a1c8 --- /dev/null +++ b/examples/xnvme-zoned.fio @@ -0,0 +1,87 @@ +; Running xNVMe/fio on a Zoned Device +; +; Writes 1GB at QD1 using 4K BS and verifies it. +; +; README +; +; This job-file is intended to be used as: +; +; # Use the built-in io_uring engine to get baseline numbers +; fio examples/xnvme-zoned.fio \ +; --section=default \ +; --ioengine=io_uring \ +; --sqthread_poll=1 \ +; --filename=/dev/nvme0n1 +; +; # Use the xNVMe io-engine engine with Linux backend and io_uring async. impl. +; fio examples/xnvme-zoned.fio \ +; --section=default \ +; --ioengine=xnvme \ +; --sqthread_poll=1 \ +; --xnvme_async=io_uring \ +; --filename=/dev/nvme0n1 +; +; # Use the xNVMe io-engine engine with Linux backend and libaio async. impl. +; fio examples/xnvme-zoned.fio \ +; --section=default \ +; --ioengine=xnvme \ +; --xnvme_async=libaio \ +; --filename=/dev/nvme0n1 +; +; # Use the xNVMe io-engine engine with SPDK backend, note that you have to set the Namespace-id +; fio examples/xnvme-zoned.fio \ +; --section=default \ +; --ioengine=xnvme \ +; --xnvme_dev_nsid=1 \ +; --filename=0000\\:01\\:00.0 +; +; NOTE: The URI encoded in the filename above, the ":" must be escaped. 
+; +; On the command-line using two "\\": +; +; --filename=0000\\:01\\:00.0 +; +; Within a fio-script using a single "\": +; +; filename=0000\:01\:00.0 +; +; NOTE: If you want to override the default bs, iodepth, and workload, then +; invoke it as: +; +; FIO_BS="512" FIO_RW="verify" FIO_IODEPTH=16 fio examples/xnvme-zoned.fio \ +; --section=override +; +; To reset all zones on the device to EMPTY state aka. wipe the entire device. +; +; # zoned mgmt-reset /dev/nvme0n2 --slba 0x0 --all +; +[global] +zonemode=zbd +rw=write +size=1G +iodepth=1 +bs=4K +direct=1 +thread=1 +ramp_time=1 +norandommap=1 +verify=crc32c +; Avoid accidentally creating device files; e.g. "/dev/nvme0n1", "/dev/nullb0" +allow_file_create=0 +; +; NOTE: If fio complains about zone-size, then run: +; +; # zoned info /dev/nvme0n1 +; +; The command will provide the values you need, then in the fio-script define: +; +; zonesize=nsect * nbytes +; +;zonesize= + +[default] + +[override] +rw=${FIO_RW} +iodepth=${FIO_IODEPTH} +bs=${FIO_BS} diff --git a/examples/xnvme-zoned.png b/examples/xnvme-zoned.png new file mode 100644 index 0000000000..2f85074084 Binary files /dev/null and b/examples/xnvme-zoned.png differ diff --git a/examples/zbd-rand-write-trim-gc.fio b/examples/zbd-rand-write-trim-gc.fio new file mode 100644 index 0000000000..139d2c43fc --- /dev/null +++ b/examples/zbd-rand-write-trim-gc.fio @@ -0,0 +1,43 @@ +; Using the libaio ioengine, random write to a (zoned) block device. Write +; target zones are chosen randomly among the first 128 zones starting from +; device offset corresponding to the 524th zone of the device (524 x 256 MB). +; For first 3 seconds, run only random write. After that, run random write job +; and garbage collection simulation job in parallel. The garbage collection +; simulation job runs trim workload to reset the 128 zones randomly. Use flow +; option to make the zone resets happen every 128 blocks writes by the other +; job. This example does not specify max_open_zones. 
The limit of maximum +; open zones is obtained from the target block device. + +[global] +group_reporting +zonemode=zbd +zonesize=256M +direct=1 +time_based +runtime=30 + +filename=/dev/sdb +offset=524z + +[warmup] +rw=randwrite +bs=2M +size=128z +ioengine=libaio +runtime=3 + +[wjob] +wait_for=warmup +rw=randwrite +bs=2M +size=128z +ioengine=libaio +flow=128 + +[trimjob] +wait_for=warmup +rw=randtrim +bs=256M +size=128z +ioengine=psync +flow=1 diff --git a/examples/zbd-rand-write-trim-gc.png b/examples/zbd-rand-write-trim-gc.png new file mode 100644 index 0000000000..f58dd412f8 Binary files /dev/null and b/examples/zbd-rand-write-trim-gc.png differ diff --git a/examples/zbd-rand-write-zone-reset-gc.fio b/examples/zbd-rand-write-zone-reset-gc.fio new file mode 100644 index 0000000000..8f77baf392 --- /dev/null +++ b/examples/zbd-rand-write-zone-reset-gc.fio @@ -0,0 +1,27 @@ +; Using the psync ioengine, random write to a (zoned) block device. Write +; target zones are chosen randomly among the first 8 zones starting from device +; offset corresponding to the 524th zone of the device (524 x 256 MB). Simulate +; garbage collection operation using zone_reset_threshold and +; zone_reset_frequency options. A zone reset is issued after every 8 (= 1 / +; 0.125) write requests while the written bytes exceed 70% of the 8 zones. This +; example does not specify max_open_zones. The limit of maximum open zones is +; obtained from the target block device. 
+ +[global] +name=zbd-rand-write-gc +group_reporting +rw=randwrite +zonemode=zbd +zonesize=256M +bs=32M +direct=1 +time_based +runtime=40 + +[dev1] +filename=/dev/sdb +size=8z +offset=524z +ioengine=psync +zone_reset_threshold=0.7 +zone_reset_frequency=0.125 diff --git a/examples/zbd-rand-write-zone-reset-gc.png b/examples/zbd-rand-write-zone-reset-gc.png new file mode 100644 index 0000000000..b10acc807f Binary files /dev/null and b/examples/zbd-rand-write-zone-reset-gc.png differ diff --git a/examples/zbd-rand-write.fio b/examples/zbd-rand-write.fio index 46cddd0609..9494a583dd 100644 --- a/examples/zbd-rand-write.fio +++ b/examples/zbd-rand-write.fio @@ -1,4 +1,4 @@ -; Using the libaio ioengine, random write to a (zoned) block device, +; Using the psync ioengine, random write to a (zoned) block device, ; writing at most 32 zones at a time. Target zones are chosen randomly ; and writes directed at the write pointer of the chosen zones diff --git a/file.h b/file.h index faf65a2a01..f400155fd9 100644 --- a/file.h +++ b/file.h @@ -12,6 +12,7 @@ /* Forward declarations */ struct zoned_block_device_info; +struct fdp_ruh_info; /* * The type of object we are working on @@ -101,6 +102,9 @@ struct fio_file { uint64_t file_offset; uint64_t io_size; + struct fio_ruhs_info *ruhs_info; + struct fio_ruhs_scheme *ruhs_scheme; + /* * Zoned block device information. See also zonemode=zbd. 
*/ @@ -109,6 +113,9 @@ struct fio_file { uint32_t min_zone; /* inclusive */ uint32_t max_zone; /* exclusive */ + /* SP Random Info */ + struct sprandom_info *spr_info; + /* * Track last end and last start of IO for a given data direction */ @@ -119,19 +126,14 @@ struct fio_file { uint64_t last_write; /* - * Tracks the last iodepth number of completed writes, if data - * verification is enabled + * For use by the io engine to store offset */ - uint64_t *last_write_comp; - unsigned int last_write_idx; + uint64_t engine_pos; /* - * For use by the io engine for offset or private data storage + * For use by the io engine for private data storage */ - union { - uint64_t engine_pos; - void *engine_data; - }; + void *engine_data; /* * if io is protected by a semaphore, this is set diff --git a/filesetup.c b/filesetup.c index fb556d8444..a766c39a6b 100644 --- a/filesetup.c +++ b/filesetup.c @@ -15,6 +15,7 @@ #include "lib/axmap.h" #include "rwlock.h" #include "zbd.h" +#include "sprandom.h" #ifdef CONFIG_LINUX_FALLOCATE #include @@ -303,13 +304,12 @@ static bool pre_read_file(struct thread_data *td, struct fio_file *f) if (bs > left) bs = left; - b = malloc(bs); + b = calloc(1, bs); if (!b) { td_verror(td, errno, "malloc"); ret = false; goto error; } - memset(b, 0, bs); if (lseek(f->fd, f->file_offset, SEEK_SET) < 0) { td_verror(td, errno, "lseek"); @@ -737,21 +737,11 @@ int generic_open_file(struct thread_data *td, struct fio_file *f) f_out = stderr; } - if (td_trim(td)) - goto skip_flags; if (td->o.odirect) flags |= OS_O_DIRECT; - if (td->o.oatomic) { - if (!FIO_O_ATOMIC) { - td_verror(td, EINVAL, "OS does not support atomic IO"); - return 1; - } - flags |= OS_O_DIRECT | FIO_O_ATOMIC; - } flags |= td->o.sync_io; if (td->o.create_on_open && td->o.allow_create) flags |= O_CREAT; -skip_flags: if (f->filetype != FIO_TYPE_FILE) flags |= FIO_O_NOATIME; @@ -760,6 +750,11 @@ int generic_open_file(struct thread_data *td, struct fio_file *f) if (!read_only) flags |= O_RDWR; + if 
(td->o.verify_only) { + flags &= ~O_RDWR; + flags |= O_RDONLY; + } + if (f->filetype == FIO_TYPE_FILE && td->o.allow_create) flags |= O_CREAT; @@ -768,7 +763,7 @@ int generic_open_file(struct thread_data *td, struct fio_file *f) else from_hash = file_lookup_open(f, flags); } else if (td_read(td)) { - if (f->filetype == FIO_TYPE_CHAR && !read_only) + if (td_ioengine_flagged(td, FIO_RO_NEEDS_RW_OPEN) && !read_only) flags |= O_RDWR; else flags |= O_RDONLY; @@ -1008,7 +1003,8 @@ uint64_t get_start_offset(struct thread_data *td, struct fio_file *f) /* * Find longest path component that exists and return its length */ -int longest_existing_path(char *path) { +static int longest_existing_path(const char *path) +{ char buf[PATH_MAX]; bool done; char *buf_pos; @@ -1394,17 +1390,21 @@ int setup_files(struct thread_data *td) if (err) goto err_out; - /* - * iolog already set the total io size, if we read back - * stored entries. - */ - if (!o->read_iolog_file) { - if (o->io_size) - td->total_io_size = o->io_size * o->loops; - else - td->total_io_size = o->size * o->loops; + if (td->o.sprandom) { + if (td->o.nr_files != 1) { + log_err("fio: SPRandom supports only one file"); + goto err_out; + } + err = sprandom_init(td, td->files[0]); + if (err) + goto err_out; } + if (o->io_size) + td->total_io_size = o->io_size * o->loops; + else + td->total_io_size = o->size * o->loops; + done: if (td->o.zone_mode == ZONE_MODE_ZBD) { err = zbd_setup_files(td); @@ -1417,6 +1417,12 @@ int setup_files(struct thread_data *td) td_restore_runstate(td, old_state); + if (td->o.dp_type != FIO_DP_NONE) { + err = dp_init(td); + if (err) + goto err_out; + } + return 0; err_offset: @@ -1452,9 +1458,8 @@ static void __init_rand_distribution(struct thread_data *td, struct fio_file *f) nranges = (fsize + range_size - 1ULL) / range_size; - seed = jhash(f->file_name, strlen(f->file_name), 0) * td->thread_number; - if (!td->o.rand_repeatable) - seed = td->rand_seeds[4]; + seed = jhash(f->file_name, 
strlen(f->file_name), 0) * td->thread_number * + td->rand_seeds[FIO_RAND_BLOCK_OFF]; if (td->o.random_distribution == FIO_RAND_DIST_ZIPF) zipf_init(&f->zipf, nranges, td->o.zipf_theta.u.f, td->o.random_center.u.f, seed); @@ -1486,7 +1491,7 @@ static bool init_rand_distribution(struct thread_data *td) /* * Check if the number of blocks exceeds the randomness capability of - * the selected generator. Tausworthe is 32-bit, the others are fullly + * the selected generator. Tausworthe is 32-bit, the others are fully * 64-bit capable. */ static int check_rand_gen_limits(struct thread_data *td, struct fio_file *f, @@ -1510,6 +1515,7 @@ static int check_rand_gen_limits(struct thread_data *td, struct fio_file *f, "random_generator= option to get rid of this " "warning.\n"); td->o.random_generator = FIO_RAND_GEN_TAUSWORTHE64; + init_rand_offset_seed(td); return 0; } @@ -1594,6 +1600,10 @@ void fio_file_free(struct fio_file *f) { if (fio_file_axmap(f)) axmap_free(f->io_axmap); + if (f->ruhs_info) + sfree(f->ruhs_info); + if (f->spr_info) + sprandom_free(f->spr_info); if (!fio_file_smalloc(f)) { free(f->file_name); free(f); @@ -1627,6 +1637,7 @@ void close_and_free_files(struct thread_data *td) } zbd_close_file(f); + fdp_free_ruhs_info(f); fio_file_free(f); } @@ -1837,7 +1848,10 @@ int add_file(struct thread_data *td, const char *fname, int numjob, int inc) /* can't handle smalloc failure from here */ assert(f->file_name); - get_file_type(f); + if (td->o.filetype) + f->filetype = td->o.filetype; + else + get_file_type(f); switch (td->o.file_lock_mode) { case FILE_LOCK_NONE: @@ -2031,11 +2045,12 @@ void dup_files(struct thread_data *td, struct thread_data *org) if (!org->files) return; - td->files = malloc(org->files_index * sizeof(f)); + td->files = calloc(org->files_index, sizeof(f)); if (td->o.file_lock_mode != FILE_LOCK_NONE) td->file_locks = malloc(org->files_index); + assert(org->files_index >= org->o.nr_files); for_each_file(org, f, i) { struct fio_file *__f; diff --git 
a/fio.1 b/fio.1 index a3ebb67d36..bc3efa5f13 100644 --- a/fio.1 +++ b/fio.1 @@ -1,4 +1,4 @@ -.TH fio 1 "August 2017" "User Manual" +.TH fio 1 "May 2025" "User Manual" .SH NAME fio \- flexible I/O tester .SH SYNOPSIS @@ -67,8 +67,8 @@ List all commands defined by \fIioengine\fR, or print help for \fIcommand\fR defined by \fIioengine\fR. If no \fIioengine\fR is given, list all available ioengines. .TP -.BI \-\-showcmd \fR=\fPjobfile -Convert \fIjobfile\fR to a set of command\-line options. +.BI \-\-showcmd +Convert given \fIjobfile\fRs to a set of command\-line options. .TP .BI \-\-readonly Turn on safety read\-only checks, preventing writes and trims. The \fB\-\-readonly\fR @@ -292,7 +292,7 @@ For Zone Block Device Mode: .RS .P .PD 0 -z means Zone +z means Zone .P .PD .RE @@ -471,10 +471,12 @@ See \fB\-\-max\-jobs\fR. Default: 1. .SS "Time related parameters" .TP .BI runtime \fR=\fPtime -Tell fio to terminate processing after the specified period of time. It -can be quite hard to determine for how long a specified job will run, so -this parameter is handy to cap the total runtime to a given time. When -the unit is omitted, the value is interpreted in seconds. +Limit runtime. The test will run until it completes the configured I/O +workload or until it has run for this specified amount of time, whichever +occurs first. It can be quite hard to determine for how long a specified +job will run, so this parameter is handy to cap the total runtime to a +given time. When the unit is omitted, the value is interpreted in +seconds. .TP .BI time_based If set, fio will run for the duration of the \fBruntime\fR specified @@ -495,6 +497,15 @@ thus it will increase the total runtime if a special timeout or \fBruntime\fR is specified. When the unit is omitted, the value is given in seconds. .TP +.BI ramp_size \fR=\fPsize +If set, fio will wait until the job does given amount of IO before +logging any performance numbers. 
When \fBgroup_reporting\fR is enabled, +the logging starts when all jobs in the group together perform the given +amount of IO. Similarly to \fBramp_time\fR this is useful for letting +performance settle before logging results and will increase the total +runtime if a special timeout or \fBruntime\fR is specified. When +the unit is omitted, the value is given in bytes. +.TP .BI clocksource \fR=\fPstr Use the given clocksource as the base of timing. The supported options are: .RS @@ -535,6 +546,10 @@ copy that segment, instead of entering the kernel with a \fBgettimeofday\fR\|(2) call. The CPU set aside for doing these time calls will be excluded from other uses. Fio will manually clear it from the CPU mask of other jobs. +.TP +.BI job_start_clock_id \fR=\fPint +The clock_id passed to the call to \fBclock_gettime\fR used to record job_start +in the \fBjson\fR output format. Default is 0, or CLOCK_REALTIME. .SS "Target file/device" .TP .BI directory \fR=\fPstr @@ -569,7 +584,7 @@ by this option will be \fBsize\fR divided by number of files unless an explicit size is specified by \fBfilesize\fR. .RS .P -Each colon in the wanted path must be escaped with a '\\' +Each colon in the wanted path must be escaped with a '\e' character. For instance, if the path is `/dev/dsk/foo@3,0:c' then you would use `filename=/dev/dsk/foo@3,0\\:c' and if the path is `F:\\filename' then you would use `filename=F\\:\\filename'. @@ -579,6 +594,11 @@ the first device, `\\\\.\\PhysicalDrive1' for the second etc. Note: Windows and FreeBSD prevent write access to areas of the disk containing in-use data (e.g. filesystems). .P +For HTTP and S3 access, specify a valid URL path or S3 key, respectively. +A filename for path-style S3 includes a bucket name (`/bucket/k/e.y') +while a virtual-hosted-style S3 filename (`/k/e.y') does not because its +bucket name is specified in \fBhttp_host\fR. +.P The filename `\-' is a reserved name, meaning *stdin* or *stdout*. 
Which of the two depends on the read/write direction set. .RE @@ -624,8 +644,32 @@ To avoid collisions between networked clients, fio defaults to prefixing any generated filenames (with a directory specified) with the source of the client connecting. To disable this behavior, set this option to 0. .TP +.BI filetype \fR=\fPstr +Assume that all files defined in a job are of this type. By default fio will do +\fBstat\fR\|(2) for each file to know its file type. For huge filesets it might +be a bottleneck, so the option can be used to skip the huge number of syscalls. +The file types are: +.RS +.RS +.TP +.B none +Unset. The default. +.TP +.B file +Regular file. +.TP +.B block +Block device file. +.TP +.B char +Char device file. +.RE +.RE +.TP .BI opendir \fR=\fPstr -Recursively open any files below directory \fIstr\fR. +Recursively open any files below directory \fIstr\fR. This accepts only a +single directory and unlike related options, colons appearing in the path must +not be escaped. .TP .BI lockfile \fR=\fPstr Fio defaults to not locking any files before it does I/O to them. If a file @@ -741,12 +785,12 @@ same data multiple times. Thus it will not work on non-seekable I/O engines (e.g. network, splice). Default: false. .TP .BI unlink \fR=\fPbool -Unlink the job files when done. Not the default, as repeated runs of that +Unlink (delete) the job files when done. Not the default, as repeated runs of that job would then waste time recreating the file set again and again. Default: false. .TP .BI unlink_each_loop \fR=\fPbool -Unlink job files after each iteration or loop. Default: false. +Unlink (delete) job files after each iteration or loop. Default: false. .TP .BI zonemode \fR=\fPstr Accepted values are: @@ -828,30 +872,65 @@ numbers fio only reads beyond the write pointer if explicitly told to do so. Default: false. 
.TP .BI max_open_zones \fR=\fPint -When running a random write test across an entire drive many more zones will be -open than in a typical application workload. Hence this command line option -that allows to limit the number of open zones. The number of open zones is -defined as the number of zones to which write commands are issued by all -threads/processes. +When a zone of a zoned block device is partially written (i.e. not all sectors +of the zone have been written), the zone is in one of three +conditions: 'implicit open', 'explicit open' or 'closed'. Zoned block devices +may have a limit called 'max_open_zones' (same name as the parameter) on the +total number of zones that can simultaneously be in the 'implicit open' +or 'explicit open' conditions. Zoned block devices may have another limit +called 'max_active_zones', on the total number of zones that can simultaneously +be in the three conditions. The \fBmax_open_zones\fR parameter limits +the number of zones to which write commands are issued by all fio jobs, that is, +limits the number of zones that will be in the conditions. When the device has +the max_open_zones limit and does not have the max_active_zones limit, the +\fBmax_open_zones\fR parameter limits the number of zones in the two open +conditions up to the limit. In this case, fio includes zones in the two open +conditions to the write target zones at fio start. When the device has both the +max_open_zones and the max_active_zones limits, the \fBmax_open_zones\fR +parameter limits the number of zones in the three conditions up to the limit. +In this case, fio includes zones in the three conditions to the write target +zones at fio start. + +This parameter is relevant only if the \fBzonemode=zbd\fR is used. The default +value is always equal to the max_open_zones limit of the target zoned block +device and a value higher than this limit cannot be specified by users unless +the option \fBignore_zone_limits\fR is specified. 
When \fBignore_zone_limits\fR +is specified or the target device does not have the max_open_zones limit, +\fBmax_open_zones\fR can specify 0 to disable any limit on the number of zones +that can be simultaneously written to by all jobs. .TP .BI job_max_open_zones \fR=\fPint -Limit on the number of simultaneously opened zones per single thread/process. +In the same manner as \fBmax_open_zones\fR, limit the number of open zones per +fio job, that is, the number of zones that a single job can simultaneously write +to. A value of zero indicates no limit. Default: zero. .TP .BI ignore_zone_limits \fR=\fPbool -If this isn't set, fio will query the max open zones limit from the zoned block -device, and exit if the specified \fBmax_open_zones\fR value is larger than the -limit reported by the device. Default: false. +If this option is used, fio will ignore the maximum number of open zones limit +of the zoned block device in use, thus allowing the option \fBmax_open_zones\fR +value to be larger than the device reported limit. Default: false. .TP .BI zone_reset_threshold \fR=\fPfloat -A number between zero and one that indicates the ratio of logical blocks with -data to the total number of logical blocks in the test above which zones -should be reset periodically. +A number between zero and one that indicates the ratio of written bytes in the +zones with write pointers in the IO range to the size of the IO range. When +current ratio is above this ratio, zones are reset periodically as +\fBzone_reset_frequency\fR specifies. If there are multiple jobs when using this +option, the IO range for all write jobs has to be the same. .TP .BI zone_reset_frequency \fR=\fPfloat A number between zero and one that indicates how often a zone reset should be issued if the zone reset threshold has been exceeded. A zone reset is submitted after each (1 / zone_reset_frequency) write requests. This and the previous parameter can be used to simulate garbage collection activity. 
+.TP +.BI recover_zbd_write_error \fR=\fPbool +If this option is specified together with the option \fBcontinue_on_error\fR, +fio checks the write pointer positions after failed writes to sequential write +required zones, then moves the write pointers so that the next writes do not +fail due to partial writes and unexpected write pointer positions. If +\fBcontinue_on_error\fR is not specified, fio errors out. When the writes are +asynchronous, moving the write pointer fills blocks with zeros, which breaks +verify data, so fio also errors out if an asynchronous IO engine and a +\fBverify\fR workload are specified. Default: false. .SS "I/O type" .TP @@ -860,11 +939,6 @@ If value is true, use non-buffered I/O. This is usually O_DIRECT. Note that OpenBSD and ZFS on Solaris don't support direct I/O. On Windows the synchronous ioengines don't support direct I/O. Default: false. .TP -.BI atomic \fR=\fPbool -If value is true, attempt to use atomic direct I/O. Atomic writes are -guaranteed to be stable once acknowledged by the operating system. Only -Linux supports O_ATOMIC right now. -.TP .BI buffered \fR=\fPbool If value is true, use buffered I/O. This is the opposite of the \fBdirect\fR option. Defaults to true. @@ -900,7 +974,15 @@ Random mixed reads and writes. .TP .B trimwrite Sequential trim+write sequences. Blocks will be trimmed first, -then the same blocks will be written to. +then the same blocks will be written to. So if `io_size=64K' is specified, +Fio will trim a total of 64K bytes and also write 64K bytes on the same +trimmed blocks. This behaviour will be consistent with `number_ios' or +other Fio options limiting the total bytes or number of I/O's. +.TP +.B randtrimwrite +Like +.B trimwrite , +but uses random offsets rather than sequential writes. .RE .P Fio defaults to read if the option is not specified. For the mixed I/O @@ -914,7 +996,9 @@ modifier with a value of 8. 
If the suffix is used with a sequential I/O pattern, then the `<nr>' value specified will be added to the generated offset for each I/O turning sequential I/O into sequential I/O with holes. For instance, using `rw=write:4k' will skip 4k for every write. Also see -the \fBrw_sequencer\fR option. +the \fBrw_sequencer\fR option. If this is used with \fBverify\fR then the +\fBverify_header_seed\fR option will be disabled, unless it is explicitly +enabled. .RE .TP .BI rw_sequencer \fR=\fPstr @@ -933,12 +1017,45 @@ Generate the same offset. .P \fBsequential\fR is only useful for random I/O, where fio would normally generate a new random offset for every I/O. If you append e.g. 8 to randread, -you would get a new random offset for every 8 I/Os. The result would be a -seek for only every 8 I/Os, instead of for every I/O. Use `rw=randread:8' -to specify that. As sequential I/O is already sequential, setting -\fBsequential\fR for that would not result in any differences. \fBidentical\fR -behaves in a similar fashion, except it sends the same offset 8 number of -times before generating a new offset. +i.e. `rw=randread:8' you would get a new random offset for every 8 I/Os. The +result would be a sequence of 8 sequential offsets with a random starting +point. However this behavior may change if a sequential I/O reaches end of the +file. As sequential I/O is already sequential, setting \fBsequential\fR for +that would not result in any difference. \fBidentical\fR behaves in a similar +fashion, except it sends the same offset 8 number of times before generating a +new offset. +.P +Example #1: +.RS +.P +.PD 0 +rw=randread:8 +.P +rw_sequencer=sequential +.P +bs=4k +.PD +.RE +.P +The generated sequence of offsets will look like this: +4k, 8k, 12k, 16k, 20k, 24k, 28k, 32k, 92k, 96k, 100k, 104k, 108k, 112k, 116k, +120k, 48k, 52k ... 
+.P +Example #2: +.RS +.P +.PD 0 +rw=randread:8 +.P +rw_sequencer=identical +.P +bs=4k +.PD +.RE +.P +The generated sequence of offsets will look like this: +4k, 4k, 4k, 4k, 4k, 4k, 4k, 4k, 92k, 92k, 92k, 92k, 92k, 92k, 92k, 92k, 48k, +48k, 48k ... .RE .TP .BI unified_rw_reporting \fR=\fPstr @@ -969,12 +1086,11 @@ Alias for \fBboth\fR. .RE .TP .BI randrepeat \fR=\fPbool -Seed the random number generator used for random I/O patterns in a -predictable way so the pattern is repeatable across runs. Default: true. +Seed all random number generators in a predictable way so the pattern is +repeatable across runs. Default: true. .TP .BI allrandrepeat \fR=\fPbool -Seed all random number generators in a predictable way so results are -repeatable across runs. Default: false. +Alias for \fBrandrepeat\fR. Default: true. .TP .BI randseed \fR=\fPint Seed the random number generators based on this seed value, to be able to @@ -1045,6 +1161,11 @@ Advise using FADV_SEQUENTIAL. .TP .B random Advise using FADV_RANDOM. +.TP +.B noreuse +Advise using FADV_NOREUSE. This may be a no-op on older Linux +kernels. Since Linux 6.3, it provides a hint to the LRU algorithm. +See the \fBposix_fadvise\fR\|(2) man page. .RE .RE .TP @@ -1083,7 +1204,7 @@ provided. Data before the given offset will not be touched. This effectively caps the file size at `real_size \- offset'. Can be combined with \fBsize\fR to constrain the start and end range of the I/O workload. A percentage can be specified by a number between 1 and 100 followed by '%', -for example, `offset=20%' to specify 20%. In ZBD mode, value can be set as +for example, `offset=20%' to specify 20%. In ZBD mode, value can be set as number of zones using 'z'. .TP .BI offset_align \fR=\fPint @@ -1099,7 +1220,7 @@ specified). This option is useful if there are several jobs which are intended to operate on a file in parallel disjoint segments, with even spacing between the starting points. Percentages can be used for this option. 
If a percentage is given, the generated offset will be aligned to the minimum -\fBblocksize\fR or to the value of \fBoffset_align\fR if provided.In ZBD mode, value +\fBblocksize\fR or to the value of \fBoffset_align\fR if provided.In ZBD mode, value can be set as number of zones using 'z'. .TP .BI number_ios \fR=\fPint @@ -1122,7 +1243,7 @@ see \fBend_fsync\fR and \fBfsync_on_close\fR. .TP .BI fdatasync \fR=\fPint Like \fBfsync\fR but uses \fBfdatasync\fR\|(2) to only sync data and -not metadata blocks. In Windows, FreeBSD, DragonFlyBSD or OSX there is no +not metadata blocks. In Windows, DragonFlyBSD or OSX there is no \fBfdatasync\fR\|(2) so this falls back to using \fBfsync\fR\|(2). Defaults to 0, which means fio does not periodically issue and wait for a data-only sync to complete. @@ -1216,12 +1337,12 @@ map. For the \fBnormal\fR distribution, a normal (Gaussian) deviation is supplied as a value between 0 and 100. .P The second, optional float is allowed for \fBpareto\fR, \fBzipf\fR and \fBnormal\fR -distributions. It allows to set base of distribution in non-default place, giving +distributions. It allows one to set base of distribution in non-default place, giving more control over most probable outcome. This value is in range [0-1] which maps linearly to range of possible random values. Defaults are: random for \fBpareto\fR and \fBzipf\fR, and 0.5 for \fBnormal\fR. If you wanted to use \fBzipf\fR with a `theta` of 1.2 centered on 1/4 of allowed value range, -you would use `random_distibution=zipf:1.2:0.25`. +you would use `random_distribution=zipf:1.2:0.25`. .P For a \fBzoned\fR distribution, fio supports specifying percentages of I/O access that should fall within what range of the file or device. For @@ -1288,11 +1409,11 @@ Normally fio will cover every block of the file when doing random I/O. If this option is given, fio will just get a new random offset without looking at past I/O history. 
This means that some blocks may not be read or written, and that some blocks may be read/written more than once. If this option is -used with \fBverify\fR and multiple blocksizes (via \fBbsrange\fR), +used with \fBverify\fR then \fBverify_header_seed\fR will be disabled. If this +option is used with \fBverify\fR and multiple blocksizes (via \fBbsrange\fR), only intact blocks are verified, i.e., partially-overwritten blocks are -ignored. With an async I/O engine and an I/O depth > 1, it is possible for -the same block to be overwritten, which can cause verification errors. Either -do not use norandommap in this case, or also use the lfsr random generator. +ignored. With an async I/O engine and an I/O depth > 1, header write sequence +number verification will be disabled. See \fBverify_write_sequence\fR. .TP .BI softrandommap \fR=\fPbool See \fBnorandommap\fR. If fio runs with the random block map enabled and @@ -1326,6 +1447,52 @@ multiple times. The default value is \fBtausworthe\fR, unless the required space exceeds 2^32 blocks. If it does, then \fBtausworthe64\fR is selected automatically. .RE +.TP +.B sprandom=bool +SPRandom is a method designed to rapidly precondition SSDs for +steady-state random write workloads. It divides the device into +equally sized regions and writes the device's entire physical capacity +once, selecting offsets so that the regions have a distribution of +invalid blocks matching the distribution that occurs at steady state. +Default: false. + +It uses \fBrandom_generator=lfsr\fR, which fio will set by default. +Selecting any other random generator will result in an error. +.TP +.B spr_num_regions=int +See +.BR sprandom . +Specifies the number of regions used for SPRandom. Default=100 +.P +.RS +For large devices it is better to use more regions, to increase precision +and reduce memory allocation. The allocation is proportional to the region size. +.RE +.TP +.B spr_cs=int +See +.BR sprandom . 
+Define a cache size in bytes, as specified by the SSD manufacturer. +.P +.RS +When this is non-zero, delay invalidating writes by one region in order +to make sure that all original writes from a region are flushed from +cache before the later invalidating writes are sent to the device. +This deferral prevents the original write and the later invalidating +write from being present in the device's cache at the same time which +would allow the device to ignore the original write and prevent +sprandom from achieving its target validity fractions. The actual +cache size is used to ensure that the number of regions is not set +so large that the size of a region is smaller than the device cache. +The default is 0. +.RE +.TP +.B spr_op=float +See +.BR sprandom . +Over-provisioning ratio in the range (0, 1), as specified by the SSD manufacturer. +The default is 0.15. +.RE .SS "Block size" .TP .BI blocksize \fR=\fPint[,int][,int] "\fR,\fB bs" \fR=\fPint[,int][,int] @@ -1359,7 +1526,7 @@ described in \fBblocksize\fR. Example: .RS .RS .P -bsrange=1k\-4k,2k\-8k +bsrange=1k\-4k,2k\-8k or bsrange=1k:4k,2k:8k .RE .RE .TP @@ -1541,7 +1708,6 @@ Note that by using \fBworking_set\fR the dedupe percentage will converge to the desired over time while \fBrepeat\fR maintains the desired percentage throughout the job. .RE -.RE .TP .BI dedupe_working_set_percentage \fR=\fPint If \fBdedupe_mode\fR is set to \fBworking_set\fR, then this controls @@ -1553,6 +1719,15 @@ Note that \fBsize\fR needs to be explicitly provided and only 1 file per job is supported .RE .TP +.BI dedupe_global \fR=\fPbool +This controls whether the deduplication buffers will be shared amongst +all jobs that have this option set. The buffers are spread evenly between +participating jobs. +.P +.RS +Note that \fBdedupe_mode\fR must be set to \fBworking_set\fR for this to work. 
+Can be used in combination with compression +.TP .BI invalidate \fR=\fPbool Invalidate the buffer/page cache parts of the files to be used prior to starting I/O if the platform and file type support it. Defaults to true. @@ -1622,11 +1797,11 @@ multiplied by the I/O depth given. Note that for \fBshmhuge\fR and \fBmmaphuge\fR to work, the system must have free huge pages allocated. This can normally be checked and set by reading/writing `/proc/sys/vm/nr_hugepages' on a Linux system. Fio assumes a huge page -is 4MiB in size. So to calculate the number of huge pages you need for a -given job file, add up the I/O depth of all jobs (normally one unless -\fBiodepth\fR is used) and multiply by the maximum bs set. Then divide -that number by the huge page size. You can see the size of the huge pages in -`/proc/meminfo'. If no huge pages are allocated by having a non-zero +is 2 or 4MiB in size depending on the platform. So to calculate the number of +huge pages you need for a given job file, add up the I/O depth of all jobs +(normally one unless \fBiodepth\fR is used) and multiply by the maximum bs set. +Then divide that number by the huge page size. You can see the size of the huge +pages in `/proc/meminfo'. If no huge pages are allocated by having a non-zero number in `nr_hugepages', using \fBmmaphuge\fR or \fBshmhuge\fR will fail. Also see \fBhugepage\-size\fR. .P @@ -1646,10 +1821,11 @@ of subsequent I/O memory buffers is the sum of the \fBiomem_align\fR and \fBbs\fR used. .TP .BI hugepage\-size \fR=\fPint -Defines the size of a huge page. Must at least be equal to the system -setting, see `/proc/meminfo'. Defaults to 4MiB. Should probably -always be a multiple of megabytes, so using `hugepage\-size=Xm' is the -preferred way to set this to avoid setting a non-pow-2 bad value. +Defines the size of a huge page. Must at least be equal to the system setting, +see `/proc/meminfo' and `/sys/kernel/mm/hugepages/'. Defaults to 2 or 4MiB +depending on the platform. 
Should probably always be a multiple of megabytes, +so using `hugepage\-size=Xm' is the preferred way to set this to avoid setting +a non-pow-2 bad value. .TP .BI lockmem \fR=\fPint Pin the specified amount of memory with \fBmlock\fR\|(2). Can be used to @@ -1658,8 +1834,11 @@ simulate a smaller amount of memory. The amount specified is per worker. .TP .BI size \fR=\fPint[%|z] The total size of file I/O for each thread of this job. Fio will run until -this many bytes has been transferred, unless runtime is limited by other options -(such as \fBruntime\fR, for instance, or increased/decreased by \fBio_size\fR). +this many bytes has been transferred, unless runtime is altered by other means +such as (1) \fBruntime\fR, (2) \fBio_size\fR, (3) \fBnumber_ios\fR, (4) +gaps/holes while doing I/O's such as `rw=read:16K', or (5) sequential I/O +reaching end of the file which is possible when \fBpercentage_random\fR is +less than 100. Fio will divide this size between the available files determined by options such as \fBnrfiles\fR, \fBfilename\fR, unless \fBfilesize\fR is specified by the job. If the result of division happens to be 0, the size is @@ -1668,7 +1847,7 @@ If this option is not specified, fio will use the full size of the given files or devices. If the files do not exist, size must be given. It is also possible to give size as a percentage between 1 and 100. If `size=20%' is given, fio will use 20% of the full size of the given files or devices. In ZBD mode, -size can be given in units of number of zones using 'z'. Can be combined with \fBoffset\fR to +size can be given in units of number of zones using 'z'. Can be combined with \fBoffset\fR to constrain the start and end range that I/O will be done within. .TP .BI io_size \fR=\fPint[%|z] "\fR,\fB io_limit" \fR=\fPint[%|z] @@ -1686,10 +1865,10 @@ also be set as number of zones using 'z'. .TP .BI filesize \fR=\fPirange(int) Individual file sizes. 
May be a range, in which case fio will select sizes -for files at random within the given range and limited to \fBsize\fR in -total (if that is given). If not given, each created file is the same size. -This option overrides \fBsize\fR in terms of file size, which means -this value is used as a fixed size or possible range of each file. +for files at random within the given range. If not given, each created file +is the same size. This option overrides \fBsize\fR in terms of file size, +i.e. \fBsize\fR becomes merely the default for \fBio_size\fR (and +has no effect at all if \fBio_size\fR is set explicitly). .TP .BI file_append \fR=\fPbool Perform I/O after the end of the file. Normally fio will operate within the @@ -1706,8 +1885,9 @@ started on the result. .SS "I/O engine" .TP .BI ioengine \fR=\fPstr -Defines how the job issues I/O to the file. The following types are defined: -.RS +fio supports 2 kinds of performance measurement: I/O and file/directory operation. + +I/O engines define how the job issues I/O to the file. The following types are defined: .RS .TP .B sync @@ -1729,6 +1909,15 @@ Basic \fBpreadv\fR\|(2) or \fBpwritev\fR\|(2) I/O. .B pvsync2 Basic \fBpreadv2\fR\|(2) or \fBpwritev2\fR\|(2) I/O. .TP +.B io_uring +Fast Linux native asynchronous I/O. Supports async IO +for both direct and buffered IO. +This engine defines engine specific options. +.TP +.B io_uring_cmd +Fast Linux native asynchronous I/O for passthrough commands. +This engine defines engine specific options. +.TP .B libaio Linux native asynchronous I/O. Note that Linux may only support queued behavior with non-buffered I/O (set `direct=1' or @@ -1791,15 +1980,14 @@ A job never finishes unless there is at least one non-cpuio job. .RS .P .PD 0 -\fBcpuload\fR\=85 will cause that job to do nothing but burn 85% of the CPU. +\fBcpuload\fR=85 will cause that job to do nothing but burn 85% of the CPU.
In case of SMP machines, use \fBnumjobs=\fR\ to get desired CPU usage, as the cpuload only loads a single CPU at the desired rate. .P -\fBcpumode\fR\=qsort replace the default noop instructions loop +\fBcpumode\fR=qsort replace the default noop instructions loop by a qsort algorithm to consume more energy. - -.P +.PD .RE .TP .B rdma @@ -1881,11 +2069,6 @@ e.g., on NAND, writing sequentially to erase blocks and discarding before overwriting. The \fBtrimwrite\fR mode works well for this constraint. .TP -.B pmemblk -Read and write using filesystem DAX to a file on a filesystem -mounted with DAX on a persistent memory device through the PMDK -libpmemblk library. -.TP .B dev\-dax Read and write using device DAX to a persistent memory device (e.g., /dev/dax0.0) through the PMDK libpmem library. @@ -1897,21 +2080,6 @@ ioengine `foo.o' in `/tmp'. The path can be either absolute or relative. See `engines/skeleton_external.c' in the fio source for details of writing an external I/O engine. .TP -.B filecreate -Simply create the files and do no I/O to them. You still need to set -\fBfilesize\fR so that all the accounting still occurs, but no actual I/O will be -done other than creating the file. -.TP -.B filestat -Simply do stat() and do no I/O to the file. You need to set 'filesize' -and 'nrfiles', so that files will be created. -This engine is to measure file lookup and meta data access. -.TP -.B filedelete -Simply delete files by unlink() and do no I/O to the file. You need to set 'filesize' -and 'nrfiles', so that files will be created. -This engine is to measure file delete. -.TP .B libpmem Read and write using mmap I/O to a file on a filesystem mounted with DAX on a persistent memory device through the PMDK @@ -1956,6 +2124,67 @@ via kernel NFS. .TP .B exec Execute 3rd party tools. Could be used to perform monitoring during jobs runtime. +.TP +.B xnvme +I/O engine using the xNVMe C API, for NVMe devices. 
The xnvme engine provides +flexibility to access GNU/Linux Kernel NVMe driver via libaio, IOCTLs, io_uring, +the SPDK NVMe driver, or your own custom NVMe driver. The xnvme engine includes +engine specific options. (See \fIhttps://xnvme.io/\fR). +.TP +.B libblkio +Use the libblkio library (\fIhttps://gitlab.com/libblkio/libblkio\fR). The +specific driver to use must be set using \fBlibblkio_driver\fR. If +\fBmem\fR/\fBiomem\fR is not specified, memory allocation is delegated to +libblkio (and so is guaranteed to work with the selected driver). One libblkio +instance is used per process, so all jobs setting option \fBthread\fR will share +a single instance (with one queue per thread) and must specify compatible +options. Note that some drivers don't allow several instances to access the same +device or file simultaneously, but allow it for threads. +.RE +.P +File/directory operation engines define how the job operates on files or directories. +The following types are defined: +.RS +.TP +.B filecreate +Simply create the files and do no I/O to them. You still need to +set \fBfilesize\fP so that all the accounting still occurs, but no +actual I/O will be done other than creating the file. +Example job file: filecreate-ioengine.fio. +.TP +.B filestat +Simply do stat() and do no I/O to the file. You need to set \fBfilesize\fP +and \fBnrfiles\fP, so that files will be created. +This engine is to measure file lookup and meta data access. +Example job file: filestat-ioengine.fio. +.TP +.B filedelete +Simply delete the files by unlink() and do no I/O to them. You need to set \fBfilesize\fP +and \fBnrfiles\fP, so that the files will be created. +This engine is to measure file delete. +Example job file: filedelete-ioengine.fio. +.TP +.B dircreate +Simply create the directories and do no I/O to them. You still need to +set \fBfilesize\fP so that all the accounting still occurs, but no +actual I/O will be done other than creating the directories.
+Example job file: dircreate-ioengine.fio. +.TP +.B dirstat +Simply do stat() and do no I/O to the directories. You need to set \fBfilesize\fP +and \fBnrfiles\fP, so that directories will be created. +This engine is to measure directory lookup and meta data access. +Example job file: dirstat-ioengine.fio. +.TP +.B dirdelete +Simply delete the directories by rmdir() and do no I/O to them. You need to set \fBfilesize\fP +and \fBnrfiles\fP, so that the directories will be created. +This engine is to measure directory delete. +.P +For file and directory operation engines, there is no I/O throughput, so the statistics \ +data in the report have different meanings. The meaningful output indexes are: \fBiops\fP and \fBclat\fP. \ +\fBbw\fP is meaningless. Refer to section: "Interpreting the output" for more details. +.RE .SS "I/O engine specific parameters" In addition, there are some parameters which are only valid when a specific \fBioengine\fR is in use. These are used identically to normal parameters, @@ -1980,6 +2209,14 @@ is set, this defaults to the highest priority class. A single value applies to reads and writes. Comma-separated values may be specified for reads and writes. See man \fBionice\fR\|(1). See also the \fBprioclass\fR option. .TP +.BI (io_uring,libaio)cmdprio_hint \fR=\fPint[,int] +Set the I/O priority hint to use for I/Os that must be issued with a +priority when \fBcmdprio_percentage\fR or \fBcmdprio_bssplit\fR is set. +If not specified when \fBcmdprio_percentage\fR or \fBcmdprio_bssplit\fR +is set, this defaults to 0 (no hint). A single value applies to reads and +writes. Comma-separated values may be specified for reads and writes. +See also the \fBpriohint\fR option. +.TP .BI (io_uring,libaio)cmdprio \fR=\fPint[,int] Set the I/O priority value to use for I/Os that must be issued with a priority when \fBcmdprio_percentage\fR or \fBcmdprio_bssplit\fR is set.
@@ -1995,40 +2232,90 @@ To get a finer control over I/O priority, this option allows specifying the percentage of IOs that must have a priority set depending on the block size of the IO. This option is useful only when used together with the option \fBbssplit\fR, that is, multiple different block sizes are used for reads and -writes. The format for this option is the same as the format of the -\fBbssplit\fR option, with the exception that values for trim IOs are -ignored. This option is mutually exclusive with the \fBcmdprio_percentage\fR -option. +writes. +.RS +.P +The first accepted format for this option is the same as the format of the +\fBbssplit\fR option: +.RS +.P +cmdprio_bssplit=blocksize/percentage:blocksize/percentage +.RE +.P +In this case, each entry will use the priority class, priority level and +priority hint defined by the options \fBcmdprio_class\fR, \fBcmdprio\fR +and \fBcmdprio_hint\fR respectively. +.P +The second accepted format for this option is: +.RS +.P +cmdprio_bssplit=blocksize/percentage/class/level:blocksize/percentage/class/level +.RE +.P +In this case, the priority class and priority level are defined inside each +entry. In comparison with the first accepted format, the second accepted format +does not restrict all entries to have the same priority class and priority +level. +.P +The third accepted format for this option is: +.RS +.P +cmdprio_bssplit=blocksize/percentage/class/level/hint:... +.RE +.P +This is an extension of the second accepted format that allows one to also +specify a priority hint. +.P +For all formats, only the read and write data directions are supported; values +for trim IOs are ignored. This option is mutually exclusive with the +\fBcmdprio_percentage\fR option. +.RE .TP -.BI (io_uring)fixedbufs +.BI (io_uring,io_uring_cmd)fixedbufs If fio is asked to do direct IO, then Linux will map pages for each IO call, and release them when IO is done. If this option is set, the pages are pre-mapped before IO is started.
This eliminates the need to map and release for each IO. This is more efficient, and reduces the IO latency as well. .TP -.BI (io_uring)hipri +.BI (io_uring,io_uring_cmd)nonvectored \fR=\fPint +With this option, fio will use non-vectored read/write commands, where address +must contain the address directly. Default is -1. +.TP +.BI (io_uring,io_uring_cmd)force_async +Normal operation for io_uring is to try and issue an sqe as non-blocking first, +and if that fails, execute it in an async manner. With this option set to N, +fio will ask for every Nth sqe to be issued in an async manner. Default +is 0. +.TP +.BI (io_uring,io_uring_cmd,xnvme)hipri If this option is set, fio will attempt to use polled IO completions. Normal IO completions generate interrupts to signal the completion of IO, polled completions do not. Hence they are require active reaping by the application. The benefits are more efficient IO for high IOPS scenarios, and lower latencies for low queue depth IO. .TP -.BI (io_uring)registerfiles +.BI (io_uring,io_uring_cmd)registerfiles With this option, fio registers the set of files being used with the kernel. This avoids the overhead of managing file counts in the kernel, making the submission and completion part more lightweight. Required for the below sqthread_poll option. .TP -.BI (io_uring)sqthread_poll +.BI (io_uring,io_uring_cmd,xnvme)sqthread_poll Normally fio will submit IO by issuing a system call to notify the kernel of available items in the SQ ring. If this option is set, the act of submitting IO will be done by a polling thread in the kernel. This frees up cycles for fio, at -the cost of using more CPU in the system. +the cost of using more CPU in the system. As submission is just the time it +takes to fill in the sqe entries and any syscall required to wake up the idle +kernel thread, fio will not report submission latencies.
.TP -.BI (io_uring)sqthread_poll_cpu +.BI (io_uring,io_uring_cmd)sqthread_poll_cpu \fR=\fPint When `sqthread_poll` is set, this option provides a way to define which CPU should be used for the polling thread. .TP +.BI (io_uring_cmd)cmd_type \fR=\fPstr +Specifies the type of uring passthrough command to be used. Supported +value is nvme. Default is nvme. +.TP .BI (libaio)userspace_reap Normally, with the libaio engine in use, fio will use the \fBio_getevents\fR\|(3) system call to reap newly returned events. With @@ -2044,7 +2331,7 @@ than normal. When hipri is set this determines the probability of a pvsync2 I/O being high priority. The default is 100%. .TP -.BI (pvsync2,libaio,io_uring)nowait +.BI (pvsync2,libaio,io_uring,io_uring_cmd)nowait \fR=\fPbool By default if a request cannot be executed immediately (e.g. resource starvation, waiting on locks) it is queued and the initiating process will be blocked until the required resource becomes free. @@ -2060,6 +2347,162 @@ cached data. Currently the RWF_NOWAIT flag does not supported for cached write. For direct I/O, requests will only succeed if cache invalidation isn't required, file blocks are fully allocated and the disk request could be issued immediately. .TP +.BI (pvsync2,io_uring)uncached \fR=\fPint +This option will perform buffered IO without retaining data in the +page cache after the operation completes. + +Reads work like a normal buffered read but pages are evicted immediately +after data is copied to userspace. Writes work like buffered writes but +a writeback is initiated before the syscall returns. Pages are evicted +once the writeback completes. + +This option sets the RWF_UNCACHED flag (supported from the 6.14 Linux kernel) on +a per-IO basis. 
+.TP +.BI (pvsync2,libaio,io_uring)atomic \fR=\fPbool +This option means that writes are issued with torn-write protection, meaning +that for a power fail or kernel crash, all or none of the data from the write +will be stored, but never a mix of old and new data. Torn-write protection is +also known as atomic writes. + +This option sets the RWF_ATOMIC flag (supported from the 6.11 Linux kernel) on +a per-IO basis. + +Writes with RWF_ATOMIC set will be rejected by the kernel when the file does +not support torn-write protection. To learn a file's torn-write limits, issue +statx with STATX_WRITE_ATOMIC. +.TP +.BI (io_uring_cmd,xnvme)fdp \fR=\fPbool +Enable Flexible Data Placement mode for write commands. +.TP +.BI (io_uring_cmd,xnvme)dataplacement \fR=\fPstr +Specifies the data placement directive type to use for write commands. The +following types are supported: +.RS +.RS +.TP +.B none +Do not use a data placement directive. This is the default. +.TP +.B fdp +Use Flexible Data placement directives for write commands. This is equivalent +to specifying \fBfdp\fR=1. +.TP +.B streams +Use Streams directives for write commands. +.RE +.RE +.TP +.BI (io_uring_cmd,xnvme)plid_select=str, fdp_pli_select \fR=\fPstr +Defines how fio decides which placement ID to use next. The following types +are defined: +.RS +.RS +.TP +.B random +Choose a placement ID at random (uniform). +.TP +.B roundrobin +Round robin over available placement IDs. This is the default. +.TP +.B scheme +Choose a placement ID (index) based on the scheme file defined by +the option \fBdp_scheme\fP. +.RE +.P +The available placement ID (indices) are defined by \fBplids\fR or +\fBfdp_pli\fR option except for the case of \fBscheme\fP. +.RE +.TP +.BI (io_uring_cmd,xnvme)plids=str, fdp_pli \fR=\fPstr +Select which Placement ID Indices (FDP) or Placement IDs (streams) this job is +allowed to use for writes. This option accepts a comma-separated list of values +or ranges (e.g., 1,2-4,5,6-8). 
+ +For FDP by default, the job will cycle through all available Placement IDs, so +use this option to be selective. The values specified here are array indices +for the list of placement IDs returned by the nvme-cli command `nvme fdp +status'. If you want fio to use FDP placement identifiers only at indices 0, 2 +and 5, set `plids=0,2,5'. + +For streams this should be a list of Stream IDs. +.TP +.BI (io_uring_cmd,xnvme)\fR\fBdp_scheme\fP=str +Defines which placement ID (index) is to be selected based on the offset (LBA) range. +The file should contain one or more scheme entries in the following format: +.sp +.RS +.RS +0, 10737418240, 0 +.br +10737418240, 21474836480, 1 +.br +21474836480, 32212254720, 2 +.br +\&... +.RE +.sp +Each line, a scheme entry, contains start offset, end offset, and placement ID +(index) separated by comma(,). If the write offset is within the range of a certain +scheme entry (start offset ≤ offset < end offset), the corresponding placement ID +(index) will be selected. If the write offset belongs to multiple scheme entries, +the first matched scheme entry will be applied. If the offset is not within the range +of any scheme entry, the dspec field will be set to 0, the default RUH. (Caution: In case of +multiple devices in a job, all devices of the job will be affected by the scheme. If +this option is specified, the option \fBplids\fP or \fBfdp_pli\fP will be ignored.) +.RE +.TP +.BI (io_uring_cmd,xnvme)md_per_io_size \fR=\fPint +Size in bytes for separate metadata buffer per IO. For io_uring_cmd these +buffers are allocated using malloc regardless of what is set for \fBiomem\fR. +Default: 0. +.TP +.BI (io_uring_cmd,xnvme)pi_act \fR=\fPint +Action to take when nvme namespace is formatted with protection information. +If this is set to 1 and namespace is formatted with metadata size equal to +protection information size, fio won't use separate metadata buffer or extended +logical block.
If this is set to 1 and namespace is formatted with metadata +size greater than protection information size, fio will not generate or verify +the protection information portion of metadata for write or read case +respectively. If this is set to 0, fio generates protection information for +write case and verifies for read case. Default: 1. + +For 16 bit CRC generation fio will use isa-l if available otherwise it will +use the default slower generator. +(see: https://github.com/intel/isa-l) +.TP +.BI (io_uring_cmd,xnvme)pi_chk \fR=\fPstr[,str][,str] +Controls the protection information check. This can take one or more of these +values. Default: none. +.RS +.RS +.TP +.B GUARD +Enables protection information checking of guard field. +.TP +.B REFTAG +Enables protection information checking of logical block reference tag field. +.TP +.B APPTAG +Enables protection information checking of application tag field. +.RE +.RE +.TP +.BI (io_uring_cmd,xnvme)apptag \fR=\fPint +Specifies logical block application tag value, if namespace is formatted to use +end to end protection information. Default: 0x1234. +.TP +.BI (io_uring_cmd,xnvme)apptag_mask \fR=\fPint +Specifies logical block application tag mask value, if namespace is formatted +to use end to end protection information. Default: 0xffff. +.TP +.BI (io_uring_cmd)num_range \fR=\fPint +For trim command this will be the number of ranges to trim per I/O request. +The number of logical blocks per range is determined by the \fBbs\fR option +which should be a multiple of logical block size. This cannot be used with +read or write. Note that setting this option > 1, \fBlog_offset\fR will not be +able to log all the offsets. Default: 1. +.TP .BI (cpuio)cpuload \fR=\fPint Attempt to use the specified percentage of CPU cycles. This is a mandatory option when using cpuio I/O engine. @@ -2067,6 +2510,19 @@ option when using cpuio I/O engine. .BI (cpuio)cpuchunks \fR=\fPint Split the load into cycles of the given time. In microseconds. 
.TP +.BI (cpuio)cpumode \fR=\fPstr +Specify how to stress the CPU. It can take these two values: +.RS +.RS +.TP +.B noop +This is the default and directs the CPU to execute noop instructions. +.TP +.B qsort +Replace the default noop instructions with a qsort algorithm to consume more energy. +.RE +.RE +.TP .BI (cpuio)exit_on_io_done \fR=\fPbool Detect when I/O threads are done, then exit. .TP @@ -2082,7 +2538,7 @@ The TCP or UDP port to bind to or connect to. If this is used with this will be the starting port number since fio will use a range of ports. .TP -.BI (rdma,librpma_*)port \fR=\fPint +.BI (rdma)port \fR=\fPint The port to use for RDMA-CM communication. This should be the same value on the client and the server side. .TP @@ -2091,16 +2547,6 @@ The hostname or IP address to use for TCP, UDP or RDMA-CM based I/O. If the job is a TCP listener or UDP reader, the hostname is not used and must be omitted unless it is a valid UDP multicast address. .TP -.BI (librpma_*)serverip \fR=\fPstr -The IP address to be used for RDMA-CM based I/O. -.TP -.BI (librpma_*_server)direct_write_to_pmem \fR=\fPbool -Set to 1 only when Direct Write to PMem from the remote host is possible. Otherwise, set to 0. -.TP -.BI (librpma_*_server)busy_wait_polling \fR=\fPbool -Set to 0 to wait for completion instead of busy-wait polling completion. -Default: 1. -.TP .BI (netsplice,net)interface \fR=\fPstr The IP address of the network interface used to send or receive UDP multicast. @@ -2130,11 +2576,16 @@ User datagram protocol V6. .TP .B unix UNIX domain socket. +.TP +.B vsock +VSOCK protocol. .RE .P -When the protocol is TCP or UDP, the port must also be given, as well as the -hostname if the job is a TCP listener or UDP reader. For unix sockets, the +When the protocol is TCP, UDP or VSOCK, the port must also be given, as well as the +hostname if the job is a TCP or VSOCK listener or UDP reader. For unix sockets, the normal \fBfilename\fR option should be used and the port is invalid. 
+When the protocol is VSOCK, the \fBhostname\fR is the CID of the remote VM. + .RE .TP .BI (netsplice,net)listen @@ -2191,6 +2642,10 @@ Ceph cluster. If the \fBclustername\fR is specified, the \fBclientname\fR shall the full *type.id* string. If no type. prefix is given, fio will add 'client.' by default. .TP +.BI (rados)conf \fR=\fPstr +Specifies the configuration path of ceph cluster, so conf file does not +have to be /etc/ceph/ceph.conf. +.TP .BI (rbd,rados)busy_poll \fR=\fPbool Poll store instead of waiting for completion. Usually this provides better throughput at cost of higher(up to 100%) CPU utilization. @@ -2200,9 +2655,24 @@ During initialization, touch (create if do not exist) all objects (files). Touching all objects affects ceph caches and likely impacts test results. Enabled by default. .TP +.BI (rbd)rbd_encryption_format \fR=\fPstr +Specifies the encryption format of the RBD image. Supported values are +`luks1' and `luks2'. If set, \fBrbd_encryption_passphrase\fR must +also be specified. Note that the image must have been previously formatted +using `rbd encryption format '; the fio rbd engine will only +attempt to load the encryption context, not format the image. +The RBD encryption feature is disabled by default. +Support for this feature requires librbd version 16.2 (Ceph Pacific) or later. +.TP +.BI (rbd)rbd_encryption_passphrase \fR=\fPstr +The passphrase used to unlock the encrypted RBD image. Required if +\fBrbd_encryption_format\fR is set. +.TP .BI (http)http_host \fR=\fPstr -Hostname to connect to. For S3, this could be the bucket name. Default -is \fBlocalhost\fR +Hostname to connect to. HTTP port 80 is used automatically when the value +of the \fBhttps\fP parameter is \fBoff\fP, and HTTPS port 443 if it is \fBon\fP. +A virtual-hosted-style S3 hostname starts with a bucket name, while a +path-style S3 hostname does not. Default is \fBlocalhost\fR. .TP .BI (http)http_user \fR=\fPstr Username for HTTP authentication. 
@@ -2211,8 +2681,8 @@ Username for HTTP authentication. Password for HTTP authentication. .TP .BI (http)https \fR=\fPstr -Whether to use HTTPS instead of plain HTTP. \fRon\fP enables HTTPS; -\fRinsecure\fP will enable HTTPS, but disable SSL peer verification (use +Whether to use HTTPS instead of plain HTTP. \fBon\fP enables HTTPS; +\fBinsecure\fP will enable HTTPS, but disable SSL peer verification (use with caution!). Default is \fBoff\fR. .TP .BI (http)http_mode \fR=\fPstr @@ -2228,6 +2698,18 @@ The S3 secret key. .BI (http)http_s3_keyid \fR=\fPstr The S3 key/access id. .TP +.BI (http)http_s3_security_token \fR=\fPstr +The S3 security token. +.TP +.BI (http)http_s3_sse_customer_key \fR=\fPstr +The encryption customer key in SSE server side. +.TP +.BI (http)http_s3_sse_customer_algorithm \fR=\fPstr +The encryption customer algorithm in SSE server side. Default is \fBAES256\fR +.TP +.BI (http)http_s3_storage_class \fR=\fPstr +Which storage class to access. User-customizable settings. Default is \fBSTANDARD\fR +.TP .BI (http)http_swift_auth_token \fR=\fPstr The Swift auth token. See the example configuration file on how to retrieve this. @@ -2237,6 +2719,26 @@ Enable verbose requests from libcurl. Useful for debugging. 1 turns on verbose logging from libcurl, 2 additionally enables HTTP IO tracing. Default is \fB0\fR .TP +.BI (http)http_object_mode \fR=\fPstr +How to structure objects for HTTP IO: block or range. Default is \fBblock\fR. +.RS +.RS +.TP +.B block +One object is created for every block. The HTTP engine treats \fBblocksize\fR +as the size of the object to read or write, and appends the block start/end +offsets to the \fBfilename\fR to create the target object path. Reads and +writes operate on whole objects at a time. +.TP +.B range +One object is created for every file. The object path is the filename directly +for both read and write I/O. 
For read requests, the \fBblocksize\fR and +\fBoffset\fR will be used to set the "Range" header on read requests to issue +partial reads of the object. For write requests, blocksize is used to set the +size of the object, the same as in \fBblock\fR mode. +.RE +.RE +.TP .BI (mtd)skip_bad \fR=\fPbool Skip operations against known bad blocks. .TP @@ -2275,16 +2777,50 @@ that "owns" the device also needs to support hipri (also known as iopoll and mq_poll). The MegaRAID driver is an example of a SCSI LLD. Default: clear (0) which does normal (interrupted based) IO. .TP -.BI (sg)readfua \fR=\fPbool +.BI (sg, io_uring_cmd)readfua \fR=\fPbool With readfua option set to 1, read operations include the force unit access (fua) flag. Default: 0. .TP -.BI (sg)writefua \fR=\fPbool +.BI (sg, io_uring_cmd)writefua \fR=\fPbool With writefua option set to 1, write operations include the force unit access (fua) flag. Default: 0. .TP +.BI (io_uring_cmd)write_mode \fR=\fPstr +Specifies the type of write operation. Defaults to 'write'. +.RS +.RS +.TP +.B write +Use Write commands for write operations +.TP +.B uncor +Use Write Uncorrectable commands for write operations +.TP +.B zeroes +Use Write Zeroes commands for write operations +.TP +.B verify +Use Verify commands for write operations +.RE +.RE +.TP +.BI (io_uring_cmd)verify_mode \fR=\fPstr +Specifies the type of command to be used in the verification phase. Defaults to 'read'. +.RS +.RS +.TP +.B read +Use Read commands for data verification +.TP +.B compare +Use Compare commands for data verification. This option is only valid with +specific pattern(s), which means it *must* be given with `verify=pattern` and +`verify_pattern=`. +.RE +.RE +.TP .BI (sg)sg_write_mode \fR=\fPstr -Specify the type of write commands to issue. This option can take three +Specify the type of write commands to issue. 
This option can take multiple values: .RS .RS @@ -2292,12 +2828,15 @@ values: .B write (default) Write opcodes are issued as usual .TP +.B write_and_verify +Issue WRITE AND VERIFY commands. The BYTCHK bit is set to 00b. This directs the +device to carry out a medium verification with no data comparison for the data +that was written. The writefua option is ignored with this selection. +.TP .B verify -Issue WRITE AND VERIFY commands. The BYTCHK bit is set to 0. This -directs the device to carry out a medium verification with no data -comparison. The writefua option is ignored with this selection. +This option is deprecated. Use write_and_verify instead. .TP -.B same +.B write_same Issue WRITE SAME commands. This transfers a single block to the device and writes this same block of data to a contiguous sequence of LBAs beginning at the specified offset. fio's block size parameter @@ -2308,9 +2847,43 @@ blocksize=8k will write 16 sectors with each command. fio will still generate 8k of data for each command but only the first 512 bytes will be used and transferred to the device. The writefua option is ignored with this selection. +.TP +.B same +This option is deprecated. Use write_same instead. +.TP +.B write_same_ndob +Issue WRITE SAME(16) commands as above but with the No Data Output +Buffer (NDOB) bit set. No data will be transferred to the device with +this bit set. Data written will be a pre-determined pattern such as +all zeroes. +.TP +.B write_stream +Issue WRITE STREAM(16) commands. Use the stream_id option to specify +the stream identifier. +.TP +.B verify_bytchk_00 +Issue VERIFY commands with BYTCHK set to 00. This directs the device to carry +out a medium verification with no data comparison. +.TP +.B verify_bytchk_01 +Issue VERIFY commands with BYTCHK set to 01. This directs the device to +compare the data on the device with the data transferred to the device. +.TP +.B verify_bytchk_11 +Issue VERIFY commands with BYTCHK set to 11.
This transfers a single block to +the device and compares the contents of this block with the data on the device +beginning at the specified offset. fio's block size parameter specifies the +total amount of data compared with this command. However, only one block +(sector) worth of data is transferred to the device. This is similar to the +WRITE SAME command except that data is compared instead of written. .RE .RE .TP +.BI (sg)stream_id \fR=\fPint +Set the stream identifier for WRITE STREAM commands. If this is set to 0 (which is not +a valid stream identifier) fio will open a stream and then close it when done. Default +is 0. +.TP .BI (nbd)uri \fR=\fPstr Specify the NBD URI of the server to test. The string is a standard NBD URI (see @@ -2362,11 +2935,11 @@ Specify the label or UUID of the DAOS pool to connect to. Specify the label or UUID of the DAOS container to open. .TP .BI (dfs)chunk_size -Specificy a different chunk size (in bytes) for the dfs file. +Specify a different chunk size (in bytes) for the dfs file. Use DAOS container's chunk size by default. .TP .BI (dfs)object_class -Specificy a different object class for the dfs file. +Specify a different object class for the dfs file. Use DAOS container's object class by default. .TP .BI (nfs)nfs_url @@ -2395,8 +2968,177 @@ replaced by the name of the job .BI (exec)grace_time\fR=\fPint Defines the time between the SIGTERM and SIGKILL signals. Default is 1 second. .TP -.BI (exec)std_redirect\fR=\fbool +.BI (exec)std_redirect\fR=\fPbool If set, stdout and stderr streams are redirected to files named from the job name. Default is true. +.TP +.BI (xnvme)xnvme_async\fR=\fPstr +Select the xnvme async command interface. This can take these values. +.RS +.RS +.TP +.B emu +This is default and use to emulate asynchronous I/O by using a single thread to +create a queue pair on top of a synchronous I/O interface using the NVMe driver +IOCTL. 
+.TP +.BI thrpool +Emulate an asynchronous I/O interface with a pool of userspace threads on top +of a synchronous I/O interface using the NVMe driver IOCTL. By default four +threads are used. +.TP +.BI io_uring +Linux native asynchronous I/O interface which supports both direct and buffered +I/O. +.TP +.BI libaio +Use Linux aio for Asynchronous I/O +.TP +.BI posix +Use the posix asynchronous I/O interface to perform one or more I/O operations +asynchronously. +.TP +.BI vfio +Use the user-space VFIO-based backend, implemented using libvfn instead of +SPDK. +.TP +.BI nil +Do not transfer any data; just pretend to. This is mainly used for +introspective performance evaluation. +.RE +.RE +.TP +.BI (xnvme)xnvme_sync\fR=\fPstr +Select the xnvme synchronous command interface. This can take these values. +.RS +.RS +.TP +.B nvme +This is default and uses Linux NVMe Driver ioctl() for synchronous I/O. +.TP +.BI psync +This supports regular as well as vectored pread() and pwrite() commands. +.TP +.BI block +This is the same as psync except that it also supports zone management +commands using Linux block layer IOCTLs. +.RE +.RE +.TP +.BI (xnvme)xnvme_admin\fR=\fPstr +Select the xnvme admin command interface. This can take these values. +.RS +.RS +.TP +.B nvme +This is default and uses Linux NVMe Driver ioctl() for admin commands. +.TP +.BI block +Use Linux Block Layer ioctl() and sysfs for admin commands. +.RE +.RE +.TP +.BI (xnvme)xnvme_dev_nsid\fR=\fPint +xnvme namespace identifier for userspace NVMe driver SPDK or vfio. +.TP +.BI (xnvme)xnvme_dev_subnqn\fR=\fPstr +Sets the subsystem NQN for fabrics. This is for xNVMe to utilize a fabrics +target with multiple systems. +.TP +.BI (xnvme)xnvme_mem\fR=\fPstr +Select the xnvme memory backend. This can take these values. +.RS +.RS +.TP +.B posix +This is the default posix memory backend for linux NVMe driver. +.TP +.BI hugepage +Use hugepages, instead of existing posix memory backend. The memory backend +uses hugetlbfs. 
This requires users to allocate hugepages, mount hugetlbfs and +set an environment variable for XNVME_HUGETLB_PATH. +.TP +.BI spdk +Uses SPDK's memory allocator. +.TP +.BI vfio +Uses libvfn's memory allocator. This also specifies the use of libvfn backend +instead of SPDK. +.RE +.RE +.TP +.BI (xnvme)xnvme_iovec +If this option is set, xnvme will use vectored read/write commands. +.TP +.BI (libblkio)libblkio_driver \fR=\fPstr +The libblkio driver to use. Different drivers access devices through different +underlying interfaces. Available drivers depend on the libblkio version in use +and are listed at \fIhttps://libblkio.gitlab.io/libblkio/blkio.html#drivers\fR +.TP +.BI (libblkio)libblkio_path \fR=\fPstr +Sets the value of the driver-specific "path" property before connecting the +libblkio instance, which identifies the target device or file on which to +perform I/O. Its exact semantics are driver-dependent and not all drivers may +support it; see \fIhttps://libblkio.gitlab.io/libblkio/blkio.html#drivers\fR +.TP +.BI (libblkio)libblkio_pre_connect_props \fR=\fPstr +A colon-separated list of additional libblkio properties to be set after +creating but before connecting the libblkio instance. Each property must have +the format \fB<name>=<value>\fR. Colons can be escaped as \fB\\:\fR. These are +set after the engine sets any other properties, so those can be overridden. +Available properties depend on the libblkio version in use and are listed at +\fIhttps://libblkio.gitlab.io/libblkio/blkio.html#properties\fR +.TP +.BI (libblkio)libblkio_num_entries \fR=\fPint +Sets the value of the driver-specific "num-entries" property before starting the +libblkio instance. Its exact semantics are driver-dependent and not all drivers +may support it; see \fIhttps://libblkio.gitlab.io/libblkio/blkio.html#drivers\fR +.TP +.BI (libblkio)libblkio_queue_size \fR=\fPint +Sets the value of the driver-specific "queue-size" property before starting the +libblkio instance.
Its exact semantics are driver-dependent and not all drivers +may support it; see \fIhttps://libblkio.gitlab.io/libblkio/blkio.html#drivers\fR +.TP +.BI (libblkio)libblkio_pre_start_props \fR=\fPstr +A colon-separated list of additional libblkio properties to be set after +connecting but before starting the libblkio instance. Each property must have +the format \fB<name>=<value>\fR. Colons can be escaped as \fB\\:\fR. These are +set after the engine sets any other properties, so those can be overridden. +Available properties depend on the libblkio version in use and are listed at +\fIhttps://libblkio.gitlab.io/libblkio/blkio.html#properties\fR +.TP +.BI (libblkio)hipri +Use poll queues. This is incompatible with \fBlibblkio_wait_mode=eventfd\fR and +\fBlibblkio_force_enable_completion_eventfd\fR. +.TP +.BI (libblkio)libblkio_vectored +Submit vectored read and write requests. +.TP +.BI (libblkio)libblkio_write_zeroes_on_trim +Submit trims as "write zeroes" requests instead of discard requests. +.TP +.BI (libblkio)libblkio_wait_mode \fR=\fPstr +How to wait for completions: +.RS +.RS +.TP +.B block \fR(default) +Use a blocking call to \fBblkioq_do_io()\fR. +.TP +.B eventfd +Use a blocking call to \fBread()\fR on the completion eventfd. +.TP +.B loop +Use a busy loop with a non-blocking call to \fBblkioq_do_io()\fR. +.RE +.RE +.TP +.BI (libblkio)libblkio_force_enable_completion_eventfd +Enable the queue's completion eventfd even when unused. This may impact +performance. The default is to enable it only if +\fBlibblkio_wait_mode=eventfd\fR. +.TP +.BI (windowsaio)no_completion_thread +Avoid using a separate thread for completion polling. .SS "I/O depth" .TP .BI iodepth \fR=\fPint @@ -2497,6 +3239,13 @@ reporting if I/O gets backed up on the device side (the coordinated omission problem). Note that this option cannot reliably be used with async IO engines.
.SS "I/O rate" .TP +.BI thinkcycles \fR=\fPint +Stall the job for the specified number of cycles after an I/O has completed before +issuing the next. May be used to simulate processing being done by an application. +This is not taken into account for the time to be waited on for \fBthinktime\fR. +Might not have any effect on some platforms; this can be checked by trying a setting +with a high enough amount of thinkcycles. +.TP .BI thinktime \fR=\fPtime Stall the job for the specified period of time after an I/O has completed before issuing the next. May be used to simulate processing being done by an application. @@ -2578,6 +3327,10 @@ By default, fio will attempt to catch up to the specified rate setting, if any kind of thinktime setting was used. If this option is set, then fio will ignore the thinktime and continue doing IO at the specified rate, instead of entering a catch-up mode after thinktime is done. +.TP +.BI rate_cycle \fR=\fPint +Average bandwidth for \fBrate_min\fR and \fBrate_iops_min\fR over this number +of milliseconds. Defaults to 1000. .SS "I/O latency" .TP .BI latency_target \fR=\fPtime @@ -2607,16 +3360,13 @@ If set, fio will exit the job with an ETIMEDOUT error if it exceeds this maximum latency. When the unit is omitted, the value is interpreted in microseconds. Comma-separated values may be specified for reads, writes, and trims as described in \fBblocksize\fR. -.TP -.BI rate_cycle \fR=\fPint -Average bandwidth for \fBrate\fR and \fBrate_min\fR over this number -of milliseconds. Defaults to 1000. .SS "I/O replay" .TP .BI write_iolog \fR=\fPstr Write the issued I/O patterns to the specified file. See \fBread_iolog\fR. Specify a separate file for each job, otherwise the -iologs will be interspersed and the file may be corrupt. +iologs will be interspersed and the file may be corrupt. This file will be +opened in append mode.
.TP .BI read_iolog \fR=\fPstr Open an iolog with the specified filename and replay the I/O patterns it @@ -2722,6 +3472,10 @@ limitations. First, the waitee must be defined prior to the waiter job (meaning no forward references). Second, if a job is being referenced as a waitee, it must have a unique name (no duplicate waitees). .TP +.BI comm \fR=\fPstr +Set the job process comm to the specified string. See man \fBprctl\fR\|(2). +Note: This option is currently supported only on Linux. +.TP .BI nice \fR=\fPint Run the job with the given nice value. See man \fBnice\fR\|(2). .\" ignore blank line here from HOWTO as it looks normal without it @@ -2742,6 +3496,15 @@ Set the I/O priority class. See man \fBionice\fR\|(1). For per-command priority setting, see the I/O engine specific `cmdprio_percentage` and `cmdprio_class` options. .TP +.BI priohint \fR=\fPint +Set the I/O priority hint. This is only applicable to platforms that support +I/O priority classes and to devices with features controlled through priority +hints, e.g. block devices supporting command duration limits, or CDL. CDL is a +way to indicate the desired maximum latency of I/Os so that the device can +optimize its internal command scheduling according to the latency limits +indicated by the user. For per-I/O priority hint setting, see the I/O engine +specific \fBcmdprio_hint\fR option. +.TP .BI cpus_allowed \fR=\fPstr Controls the same options as \fBcpumask\fR, but accepts a textual specification of the permitted CPUs instead and CPUs are indexed from 0. So @@ -2911,7 +3674,10 @@ Do not perform specified workload, only verify data still matches previous invocation of this workload. This option allows one to check data multiple times at a later date without overwriting it. This option makes sense only for workloads that write data, and does not support workloads with the -\fBtime_based\fR option set. +\fBtime_based\fR option set.
Options \fBverify_write_sequence\fR and +\fBverify_header_seed\fR will be disabled in this mode, unless they are +explicitly enabled. The writes reported in the output when this option is +specified are phantom writes, since no writes are actually issued. .TP .BI do_verify \fR=\fPbool Run the verify phase after a write phase. Only valid if \fBverify\fR is @@ -2922,8 +3688,9 @@ If writing to a file, fio can verify the file contents after each iteration of the job. Each verification method also implies verification of special header, which is written to the beginning of each block. This header also includes meta information, like offset of the block, block number, timestamp -when block was written, etc. \fBverify\fR can be combined with -\fBverify_pattern\fR option. The allowed values are: +when block was written, initial seed value used to generate the buffer +contents, etc. \fBverify\fR can be combined with \fBverify_pattern\fR option. +The allowed values are: .RS .RS .TP @@ -2994,26 +3761,37 @@ Verify a strict pattern. Normally fio includes a header with some basic information and checksumming, but if this option is set, only the specific pattern set with \fBverify_pattern\fR is verified. .TP +.B pattern_hdr +Verify a pattern in conjunction with a header. +.TP .B null Only pretend to verify. Useful for testing internals with `ioengine=null', not for much else. .RE .P This option can be used for repeated burn\-in tests of a system to make sure -that the written data is also correctly read back. If the data direction -given is a read or random read, fio will assume that it should verify a -previously written file. If the data direction includes any form of write, -the verify will be of the newly written data. +that the written data is also correctly read back. +.P +If the data direction given is a read or random read, fio will assume that it +should verify a previously written file. In this scenario fio will not verify +the block number written in the header. 
The header seed won't be verified, +unless it's explicitly requested by setting the \fBverify_header_seed\fR option. +Note in this scenario the header seed check will only work if the read +invocation exactly matches the original write invocation. +.P +If the data direction includes any form of write, the verify will be of the +newly written data. .P To avoid false verification errors, do not use the norandommap option when verifying data with async I/O engines and I/O depths > 1. Or use the norandommap and the lfsr random generator together to avoid writing to the -same offset with muliple outstanding I/Os. +same offset with multiple outstanding I/Os. .RE .TP .BI verify_offset \fR=\fPint Swap the verification header with data somewhere else in the block before -writing. It is swapped back before verifying. +writing. It is swapped back before verifying. This should be within the range +of \fBverify_interval\fR. .TP .BI verify_interval \fR=\fPint Write the verification header at a finer granularity than the @@ -3043,6 +3821,13 @@ verify_pattern=0xff%o"abcd"\-12 .RE .RE .TP +.BI verify_pattern_interval \fR=\fPbool +Recreate an instance of the \fBverify_pattern\fR every +\fBverify_pattern_interval\fR bytes. This is only useful when +\fBverify_pattern\fR contains the %o format specifier and can be used to speed +up the process of writing each block on a device with its offset. Default: +0 (disabled). +.TP .BI verify_fatal \fR=\fPbool Normally fio will keep checking the entire contents before quitting on a block verification failure. If this option is set, fio will exit the job on @@ -3106,6 +3891,26 @@ far it should verify. Without this information, fio will run a full verification pass, according to the settings in the job file used. Default false. .TP +.BI experimental_verify \fR=\fPbool +Enable experimental verification. Standard verify records I/O metadata for +later use during the verification phase.
Experimental verify instead resets the +file after the write phase and then replays I/Os for the verification phase. +.TP +.BI verify_write_sequence \fR=\fPbool +Verify the header write sequence number. In a scenario with multiple jobs, +verification of the write sequence number may fail. Disabling this option +will mean that write sequence number checking is skipped. Doing that can be +useful for testing atomic writes, as it means that checksum verification can +still be attempted. For when \fBatomic\fR is enabled, checksum verification +is expected to succeed (while write sequence checking can still fail). +.TP +.BI verify_header_seed \fR=\fPbool +Verify the header seed value which was used to generate the buffer contents. +In certain scenarios with read / verify only workloads, when \fBnorandommap\fR +is enabled, with offset modifiers (refer options \fBreadwrite\fR and +\fBrw_sequencer\fR), etc verification of header seed may fail. Disabling this +option will mean that header seed checking is skipped. Defaults to true. +.TP .BI trim_percentage \fR=\fPint Number of verify blocks to discard/trim. .TP @@ -3117,9 +3922,6 @@ Verify that trim/discarded blocks are returned as zeros. .TP .BI trim_backlog_batch \fR=\fPint Trim this number of I/O blocks. -.TP -.BI experimental_verify \fR=\fPbool -Enable experimental verification. .SS "Steady state" .TP .BI steadystate \fR=\fPstr:float "\fR,\fP ss" \fR=\fPstr:float @@ -3165,19 +3967,28 @@ slope. Stop the job if the slope falls below the specified limit. .TP .BI steadystate_duration \fR=\fPtime "\fR,\fP ss_dur" \fR=\fPtime A rolling window of this duration will be used to judge whether steady state -has been reached. Data will be collected once per second. The default is 0 -which disables steady state detection. When the unit is omitted, the -value is interpreted in seconds. +has been reached. Data will be collected every \fBss_interval\fR. The default +is 0 which disables steady state detection. 
When the unit is omitted, the value +is interpreted in seconds. .TP .BI steadystate_ramp_time \fR=\fPtime "\fR,\fP ss_ramp" \fR=\fPtime Allow the job to run for the specified duration before beginning data collection for checking the steady state job termination criterion. The default is 0. When the unit is omitted, the value is interpreted in seconds. +.TP +.BI steadystate_check_interval \fR=\fPtime "\fR,\fP ss_interval" \fR=\fPtime +The values during the rolling window will be collected with a period of this +value. If \fBss_interval\fR is 30s and \fBss_dur\fR is 300s, 10 measurements +will be taken. Default is 1s but that might not converge, especially for slower +devices, so set this accordingly. When the unit is omitted, the value is +interpreted in seconds. .SS "Measurements and reporting" .TP .BI per_job_logs \fR=\fPbool -If set, this generates bw/clat/iops log with per file private filenames. If -not set, jobs with identical names will share the log filename. Default: +If set to true, fio generates bw/clat/iops logs with per job unique filenames. +If set to false, jobs with identical names will share a log filename. Note that +when this option is set to false log files will be opened in append mode and if +log files already exist the previous contents will not be overwritten. Default: true. .TP .BI group_reporting @@ -3188,6 +3999,19 @@ quickly becomes unwieldy. To see the final report per-group instead of per-job, use \fBgroup_reporting\fR. Jobs in a file will be part of the same reporting group, unless if separated by a \fBstonewall\fR, or by using \fBnew_group\fR. +.RS +.P +NOTE: When \fBgroup_reporting\fR is used along with \fBjson\fR output, there +are certain per-job properties which can be different between jobs but do not +have a natural group-level equivalent. Examples include \fBkb_base\fR, +\fBunit_base\fR, \fBsig_figs\fR, \fBthread_number\fR, \fBpid\fR, and +\fBjob_start\fR.
For these properties, the values for the first job are +recorded for the group. +.P +Also, options like \fBpercentile_list\fR and \fBunified_rw_reporting\fR should +be consistent among the jobs in a reporting group. Having options like these +vary across the jobs in a reporting group is an unsupported configuration. +.RE .TP .BI new_group Start a new reporting group. See: \fBgroup_reporting\fR. If not given, @@ -3258,12 +4082,14 @@ resulting in more precise time-related I/O statistics. Also see \fBlog_avg_msec\fR as well. Defaults to 1024. .TP .BI log_avg_msec \fR=\fPint -By default, fio will log an entry in the iops, latency, or bw log for every -I/O that completes. When writing to the disk log, that can quickly grow to a -very large size. Setting this option makes fio average the each log entry -over the specified period of time, reducing the resolution of the log. See -\fBlog_max_value\fR as well. Defaults to 0, logging all entries. -Also see \fBLOG FILE FORMATS\fR section. +By default, fio will log an entry in the iops, latency, or bw log for every I/O +that completes. When writing to the disk log, that can quickly grow to a very +large size. Setting this option directs fio to instead record an average over +the specified duration for each log entry, reducing the resolution of the log. +When the job completes, fio will flush any accumulated latency log data, so the +final log interval may not match the value specified by this option and there +may even be duplicate timestamps. See \fBlog_window_value\fR as well. Defaults +to 0, logging entries for each I/O. Also see \fBLOG FILE FORMATS\fR section. .TP .BI log_hist_msec \fR=\fPint Same as \fBlog_avg_msec\fR, but logs entries for completion latency @@ -3280,10 +4106,28 @@ the histogram logs enabled with \fBlog_hist_msec\fR. For each increment in coarseness, fio outputs half as many bins. Defaults to 0, for which histogram logs contain 1216 latency bins. See \fBLOG FILE FORMATS\fR section. 
.TP -.BI log_max_value \fR=\fPbool -If \fBlog_avg_msec\fR is set, fio logs the average over that window. If -you instead want to log the maximum value, set this option to 1. Defaults to -0, meaning that averaged values are logged. +.BI log_window_value \fR=\fPstr "\fR,\fP log_max_value" \fR=\fPstr +If \fBlog_avg_msec\fR is set, fio by default logs the average over that window. +This option determines whether fio logs the average, maximum or both the +values over the window. This only affects the latency logging, as both average +and maximum values for iops or bw log will be same. Accepted values are: +.RS +.TP +.B avg +Log average value over the window. The default. +.TP +.B max +Log maximum value in the window. +.TP +.B both +Log both average and maximum value over the window. +.TP +.B 0 +Backward-compatible alias for \fBavg\fR. +.TP +.B 1 +Backward-compatible alias for \fBmax\fR. +.RE .TP .BI log_offset \fR=\fPbool If this is set, the iolog options will include the byte offset for the I/O @@ -3291,9 +4135,16 @@ entry as well as the other data values. Defaults to 0 meaning that offsets are not present in logs. Also see \fBLOG FILE FORMATS\fR section. .TP .BI log_prio \fR=\fPbool -If this is set, the iolog options will include the I/O priority for the I/O -entry as well as the other data values. Defaults to 0 meaning that -I/O priorities are not present in logs. Also see \fBLOG FILE FORMATS\fR section. +If this is set, the `Command priority` field in \fBLOG FILE FORMATS\fR +shows the priority value and the IO priority class of the command. +Otherwise, the field shows if the command has the highest RT priority +class or not. Also see \fBLOG FILE FORMATS\fR section. +.TP +.BI log_issue_time \fR=\fPbool +If this is set, the iolog options will include the command issue time for the +I/O entry as well as the other data values. Defaults to 0 meaning that command +issue times are not present in logs. Also see \fBLOG FILE FORMATS\fR section. 
+This option shall be set together with \fBwrite_lat_log\fR and \fBlog_offset\fR. .TP .BI log_compression \fR=\fPint If this is set, fio will compress the I/O logs as it goes, to keep the @@ -3319,10 +4170,19 @@ decompressed with fio, using the \fB\-\-inflate\-log\fR command line parameter. The files will be stored with a `.fz' suffix. .TP .BI log_unix_epoch \fR=\fPbool -If set, fio will log Unix timestamps to the log files produced by enabling -write_type_log for each log type, instead of the default zero-based +Backward-compatible alias for \fBlog_alternate_epoch\fR. +.TP +.BI log_alternate_epoch \fR=\fPbool +If set, fio will log timestamps based on the epoch used by the clock specified +in the \fBlog_alternate_epoch_clock_id\fR option, to the log files produced by +enabling write_type_log for each log type, instead of the default zero-based timestamps. .TP +.BI log_alternate_epoch_clock_id \fR=\fPint +Specifies the clock_id to be used by clock_gettime to obtain the alternate +epoch if \fBlog_alternate_epoch\fR is true. Otherwise has no effect. Default +value is 0, or CLOCK_REALTIME. +.TP .BI block_error_percentiles \fR=\fPbool If set, record errors in trim block-sized units from writes and trims and output a histogram of how many trims it took to get to errors, and what kind @@ -3400,6 +4260,16 @@ EILSEQ) until the runtime is exceeded or the I/O size specified is completed. If this option is used, there are two more stats that are appended, the total error count and the first error. The error field given in the stats is the first error that was hit during the run. +.RS +.P +Note: a write error from the device may go unnoticed by fio when using buffered +IO, as the write() (or similar) system call merely dirties the kernel pages, +unless `sync' or `direct' is used. Device IO errors occur when the dirty data is +actually written out to disk. If fully sync writes aren't desirable, `fsync' or +`fdatasync' can be used as well. 
This is specific to writes, as reads are always +synchronous. +.RS +.P +The allowed values are: .RS .RS @@ -3668,21 +4538,63 @@ submission to completion of the I/O pieces. For sync I/O, clat will usually be equal (or very close) to 0, as the time from submit to complete is basically just CPU time (I/O has already been done, see slat explanation). + +For file and directory operation engines, \fBclat\fP denotes the time +to complete one file or directory operation. +.RS +.TP +\fBfilecreate engine\fP:\tthe time cost to create a new file +.TP +\fBfilestat engine\fP:\tthe time cost to look up an existing file +.TP +\fBfiledelete engine\fP:\tthe time cost to delete a file +.TP +\fBdircreate engine\fP:\tthe time cost to create a new directory +.TP +\fBdirstat engine\fP:\tthe time cost to look up an existing directory +.TP +\fBdirdelete engine\fP:\tthe time cost to delete a directory +.RE .TP .B lat Total latency. Same names as slat and clat, this denotes the time from when fio created the I/O unit to completion of the I/O operation. .TP .B bw -Bandwidth statistics based on samples. Same names as the xlat stats, -but also includes the number of samples taken (\fIsamples\fR) and an -approximate percentage of total aggregate bandwidth this thread -received in its group (\fIper\fR). This last value is only really -useful if the threads in this group are on the same disk, since they -are then competing for disk access. +Bandwidth statistics based on measurements from discrete intervals. Fio +continuously monitors bytes transferred and I/O operations completed. By default +fio calculates bandwidth in each half-second interval (see \fBbwavgtime\fR) +and reports descriptive statistics for the measurements here. Same names as the +xlat stats, but also includes the number of samples taken (\fIsamples\fR) and an +approximate percentage of total aggregate bandwidth this thread received in its +group (\fIper\fR).
This last value is only really useful if the threads in this +group are on the same disk, since they are then competing for disk access. + +For file and directory operation engines, \fBbw\fR is meaningless. .TP .B iops -IOPS statistics based on samples. Same names as \fBbw\fR. +IOPS statistics based on measurements from discrete intervals. +For details see the description for \fBbw\fR above. See +\fBiopsavgtime\fR to control the duration of the intervals. +Same values reported here as for \fBbw\fR except for percentage. + +For file and directory operation engines, \fBiops\fP is the most +fundamental index to denote the performance. +It means how many files or directories can be operated per second. +.RS +.TP +\fBfilecreate engine\fP:\tnumber of files can be created per second +.TP +\fBfilestat engine\fP:\tnumber of files can be looked up per second +.TP +\fBfiledelete engine\fP:\tnumber of files can be deleted per second +.TP +\fBdircreate engine\fP:\tnumber of directories can be created per second +.TP +\fBdirstat engine\fP:\tnumber of directories can be looked up per second +.TP +\fBdirdelete engine\fP:\tnumber of directories can be deleted per second +.RE .TP .B lat (nsec/usec/msec) The distribution of I/O completion latencies. This is the time from when @@ -3759,7 +4671,7 @@ They will look like this: .P .nf Disk stats (read/write): - sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00% + sda: ios=16398/16511, sectors=32321/65472, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00% .fi .P Each value is printed for both reads and writes, with reads first. The @@ -3987,7 +4899,7 @@ This format is not supported in fio versions >= 1.20\-rc3. .TP .B Trace file format v2 The second version of the trace file format was added in fio version 1.17. It -allows to access more then one file per trace and has a bigger set of possible +allows one to access more than one file per trace and has a bigger set of possible file actions. 
.RS .P @@ -4032,7 +4944,9 @@ given in bytes. The `action' can be one of these: .TP .B wait Wait for `offset' microseconds. Everything below 100 is discarded. -The time is relative to the previous `wait' statement. +The time is relative to the previous `wait' statement. Note that action `wait` +is not allowed as of version 3, as the same behavior can be achieved using +timestamps. .TP .B read Read `length' bytes beginning from `offset'. @@ -4050,6 +4964,37 @@ Write `length' bytes beginning from `offset'. Trim the given file from the given `offset' for `length' bytes. .RE .RE +.RE +.TP +.B Trace file format v3 +The third version of the trace file format was added in fio version 3.31. It +forces each action to have a timestamp associated with it. +.RS +.P +The first line of the trace file has to be: +.RS +.P +"fio version 3 iolog" +.RE +.P +Following this can be lines in two different formats, which are described below. +.P +.B +The file management format: +.RS +timestamp filename action +.RE +.P +.B +The file I/O action format: +.RS +timestamp filename action offset length +.RE +.P +The `timestamp` is relative to the beginning of the run (ie starts at 0). The +`filename`, `action`, `offset` and `length` are identical to version 2, except +that version 3 does not allow the `wait` action. +.RE .SH I/O REPLAY \- MERGING TRACES Colocation is a common practice used to get the most out of a machine. Knowing which workloads play nicely with each other and which ones don't is @@ -4139,7 +5084,6 @@ running as a server backend, it will send the job states back to the client for safe storage, then execute the remote trigger, if specified. If a local trigger is specified, the server will still send back the write state, but the client will then execute the trigger. -.RE .P .B Verification trigger example .RS @@ -4194,7 +5138,7 @@ and IOPS. 
The logs share a common format, which looks like this: .RS .P time (msec), value, data direction, block size (bytes), offset (bytes), -command priority +command priority, issue time (nsec) .RE .P `Time' for the log entry is always in milliseconds. The `value' logged depends @@ -4237,12 +5181,26 @@ number with the lowest 13 bits indicating the priority value (\fBprio\fR and \fBcmdprio\fR options) and the highest 3 bits indicating the IO priority class (\fBprioclass\fR and \fBcmdprio_class\fR options). .P +The entry's `issue time` is the command issue time in nanoseconds. The logging +of the issue time can be toggled with \fBlog_issue_time\fR. This field has valid +values in completion latency log file (clat), or submit latency log file (slat). +The field has value 0 in other log files. +.P Fio defaults to logging every individual I/O but when windowed logging is set -through \fBlog_avg_msec\fR, either the average (by default) or the maximum -(\fBlog_max_value\fR is set) `value' seen over the specified period of time -is recorded. Each `data direction' seen within the window period will aggregate -its values in a separate row. Further, when using windowed logging the `block -size' and `offset' entries will always contain 0. +through \fBlog_avg_msec\fR, either the average (by default), the maximum +(\fBlog_window_value\fR is set to max) `value' seen over the specified period of +time, or both the average `value' and maximum `value1' (\fBlog_window_value\fR is +set to both) is recorded. The log file format when both the values are reported +takes this form: +.RS +.P +time (msec), value, value1, data direction, block size (bytes), offset (bytes), +command priority, issue time (nsec) +.RE +.P +Each `data direction' seen within the window period will aggregate its values +in a separate row. Further, when using windowed logging the `block size', +`offset' and `issue time` entries will always contain 0. 
.SH CLIENT / SERVER Normally fio is invoked as a stand-alone application on the machine where the I/O workload should be generated. However, the backend and frontend of fio can @@ -4292,6 +5250,9 @@ is the connect string, and `remote\-args' and `job file(s)' are sent to the server. The `server' string follows the same format as it does on the server side, to allow IP/hostname/socket and port strings. .P +Note that all job options must be defined in job files when running fio as a +client. Any job options specified in `remote\-args' will be ignored. +.P Fio can connect to multiple servers this way: .RS .P @@ -4345,8 +5306,17 @@ containing two hostnames `h1' and `h2' with IP addresses 192.168.10.120 and .PD .RE .P +This behavior can be disabled by the \fBunique_filename\fR option. +.P Terse output in client/server mode will differ slightly from what is produced when fio is run in stand-alone mode. See the terse output section for details. +.P +Also, if one fio invocation runs workloads on multiple servers, fio will +provide at the end an aggregate summary report for all workloads. This +aggregate summary report assumes that options affecting reporting like +\fBunified_rw_reporting\fR and \fBpercentile_list\fR are identical across all +the jobs summarized. Having different values for these options is an +unsupported configuration. .SH AUTHORS .B fio was written by Jens Axboe . @@ -4361,7 +5331,7 @@ Report bugs to the \fBfio\fR mailing list . .br See \fBREPORTING\-BUGS\fR. .P -\fBREPORTING\-BUGS\fR: \fIhttp://git.kernel.dk/cgit/fio/plain/REPORTING\-BUGS\fR +\fBREPORTING\-BUGS\fR: \fIhttps://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git/tree/REPORTING\-BUGS\fR .SH "SEE ALSO" For further documentation see \fBHOWTO\fR and \fBREADME\fR. .br @@ -4369,6 +5339,6 @@ Sample jobfiles are available in the `examples/' directory. .br These are typically located under `/usr/share/doc/fio'. 
.P -\fBHOWTO\fR: \fIhttp://git.kernel.dk/cgit/fio/plain/HOWTO\fR +\fBHOWTO\fR: \fIhttps://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git/tree/HOWTO\fR .br -\fBREADME\fR: \fIhttp://git.kernel.dk/cgit/fio/plain/README\fR +\fBREADME\fR: \fIhttps://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git/tree/README.rst\fR diff --git a/fio.c b/fio.c index f19db1be6f..1394a51d94 100644 --- a/fio.c +++ b/fio.c @@ -27,15 +27,9 @@ int main(int argc, char *argv[], char *envp[]) { int ret = 1; - compiletime_assert(TD_NR <= TD_ENG_FLAG_SHIFT, "TD_ENG_FLAG_SHIFT"); - if (initialize_fio(envp)) return 1; -#if !defined(CONFIG_GETTIMEOFDAY) && !defined(CONFIG_CLOCK_GETTIME) -#error "No available clock source!" -#endif - if (fio_server_create_sk_key()) goto done; diff --git a/fio.h b/fio.h index 6bb21ebb2a..65c68d4bba 100644 --- a/fio.h +++ b/fio.h @@ -71,6 +71,16 @@ struct fio_sem; +#define MAX_TRIM_RANGE 256 + +/* + * Range for trim command + */ +struct trim_range { + unsigned long long start; + unsigned long long len; +}; + /* * offset generator types */ @@ -97,6 +107,7 @@ enum { __TD_F_MMAP_KEEP, __TD_F_DIRS_CREATED, __TD_F_CHECK_RATE, + __TD_F_SYNCS, __TD_F_LAST, /* not a real bit, keep last */ }; @@ -118,6 +129,7 @@ enum { TD_F_MMAP_KEEP = 1U << __TD_F_MMAP_KEEP, TD_F_DIRS_CREATED = 1U << __TD_F_DIRS_CREATED, TD_F_CHECK_RATE = 1U << __TD_F_CHECK_RATE, + TD_F_SYNCS = 1U << __TD_F_SYNCS, }; enum { @@ -142,6 +154,8 @@ enum { FIO_RAND_POISSON3_OFF, FIO_RAND_PRIO_CMDS, FIO_RAND_DEDUPE_WORKING_SET_IX, + FIO_RAND_FDP_OFF, + FIO_RAND_SPRANDOM_OFF, FIO_RAND_NR_OFFS, }; @@ -161,6 +175,7 @@ enum { F_ADV_TYPE, F_ADV_RANDOM, F_ADV_SEQUENTIAL, + F_ADV_NOREUSE, }; /* @@ -182,7 +197,7 @@ struct zone_split_index { */ struct thread_data { struct flist_head opt_list; - unsigned long flags; + unsigned long long flags; struct thread_options o; void *eo; pthread_t thread; @@ -244,8 +259,9 @@ struct thread_data { size_t orig_buffer_size; volatile int runstate; volatile bool terminate; - bool 
last_was_sync; - enum fio_ddir last_ddir; + + enum fio_ddir last_ddir_completed; + enum fio_ddir last_ddir_issued; int mmapfd; @@ -256,8 +272,10 @@ struct thread_data { struct frand_state bsrange_state[DDIR_RWDIR_CNT]; struct frand_state verify_state; + struct frand_state verify_state_last_do_io; struct frand_state trim_state; struct frand_state delay_state; + struct frand_state fdp_state; struct frand_state buf_state; struct frand_state buf_state_prev; @@ -267,14 +285,16 @@ struct thread_data { struct frand_state prio_state; struct frand_state dedupe_working_set_index_state; struct frand_state *dedupe_working_set_states; + struct frand_state sprandom_state; unsigned long long num_unique_pages; struct zone_split_index **zone_state_index; - unsigned int num_open_zones; + unsigned int num_write_zones; unsigned int verify_batch; unsigned int trim_batch; + bool trim_verify; struct thread_io_list *vstate; @@ -335,10 +355,10 @@ struct thread_data { */ uint64_t rate_bps[DDIR_RWDIR_CNT]; uint64_t rate_next_io_time[DDIR_RWDIR_CNT]; - unsigned long long rate_bytes[DDIR_RWDIR_CNT]; - unsigned long rate_blocks[DDIR_RWDIR_CNT]; + unsigned long long last_rate_check_bytes[DDIR_RWDIR_CNT]; + unsigned long last_rate_check_blocks[DDIR_RWDIR_CNT]; unsigned long long rate_io_issue_bytes[DDIR_RWDIR_CNT]; - struct timespec lastrate[DDIR_RWDIR_CNT]; + struct timespec last_rate_check_time[DDIR_RWDIR_CNT]; int64_t last_usec[DDIR_RWDIR_CNT]; struct frand_state poisson_state[DDIR_RWDIR_CNT]; @@ -354,9 +374,17 @@ struct thread_data { * Issue side */ uint64_t io_issues[DDIR_RWDIR_CNT]; + uint64_t verify_read_issues; uint64_t io_issue_bytes[DDIR_RWDIR_CNT]; uint64_t loops; + /* + * Keep track of inflight write sequence numbers (numberio) which are used to save verify state. 
+ */ + uint64_t *inflight_numberio; + unsigned int next_inflight_numberio_idx; + uint64_t inflight_issued; + /* * Completions */ @@ -368,26 +396,28 @@ struct thread_data { uint64_t zone_bytes; struct fio_sem *sem; uint64_t bytes_done[DDIR_RWDIR_CNT]; + uint64_t bytes_verified; uint64_t *thinktime_blocks_counter; struct timespec last_thinktime; - uint64_t last_thinktime_blocks; + int64_t last_thinktime_blocks; /* - * State for random io, a bitmap of blocks done vs not done + * State for random offsets */ - struct frand_state random_state; + struct frand_state offset_state; struct timespec start; /* start of this loop */ struct timespec epoch; /* time job was started */ - unsigned long long unix_epoch; /* Time job was started, unix epoch based. */ + unsigned long long alternate_epoch; /* Time job was started, as clock_gettime(log_alternate_epoch_clock_id) */ + unsigned long long job_start; /* Time job was started, as clock_gettime(job_start_clock_id) */ struct timespec last_issue; long time_offset; struct timespec ts_cache; struct timespec terminate_time; unsigned int ts_cache_nr; unsigned int ts_cache_mask; - bool ramp_time_over; + unsigned int ramp_period_state; /* * Time since last latency_window was started @@ -428,9 +458,13 @@ struct thread_data { struct flist_head io_log_list; FILE *io_log_rfile; unsigned int io_log_blktrace; + unsigned int io_log_blktrace_swap; + unsigned long long io_log_last_ttime; + struct timespec io_log_start_time; unsigned int io_log_current; unsigned int io_log_checkmark; unsigned int io_log_highmark; + unsigned int io_log_version; struct timespec io_log_highmark_time; /* @@ -508,10 +542,9 @@ enum { if ((td)->error) \ break; \ (td)->error = ____e; \ - if (!(td)->first_error) \ - nowarn_snprintf(td->verror, sizeof(td->verror), \ - "file:%s:%d, func=%s, error=%s", \ - __FILE__, __LINE__, (func), (msg)); \ + nowarn_snprintf(td->verror, sizeof(td->verror), \ + "file:%s:%d, func=%s, error=%s", \ + __FILE__, __LINE__, (func), (msg)); \ } 
while (0) @@ -593,12 +626,27 @@ extern bool eta_time_within_slack(unsigned int time); static inline void fio_ro_check(const struct thread_data *td, struct io_u *io_u) { assert(!(io_u->ddir == DDIR_WRITE && !td_write(td)) && - !(io_u->ddir == DDIR_TRIM && !td_trim(td))); + !(io_u->ddir == DDIR_TRIM && !(td_trim(td) || td->trim_verify))); + + /* + * The last line above allows trim operations during trim/verify + * workloads. For these workloads we cannot simply set the trim bit for + * the thread's ddir because then fio would assume that + * ddir={trimwrite, randtrimwrite}. + */ +} + +static inline bool multi_range_trim(struct thread_data *td, struct io_u *io_u) +{ + if (io_u->ddir == DDIR_TRIM && td->o.num_range > 1) + return true; + + return false; } static inline bool should_fsync(struct thread_data *td) { - if (td->last_was_sync) + if (ddir_sync(td->last_ddir_issued)) return false; if (td_write(td) || td->o.override_sync) return true; @@ -628,7 +676,7 @@ extern void fio_options_dup_and_init(struct option *); extern char *fio_option_dup_subs(const char *); extern void fio_options_mem_dupe(struct thread_data *); extern void td_fill_rand_seeds(struct thread_data *); -extern void td_fill_verify_state_seed(struct thread_data *); +extern void init_rand_offset_seed(struct thread_data *); extern void add_job_opts(const char **, int); extern int ioengine_load(struct thread_data *); extern bool parse_dryrun(void); @@ -676,13 +724,13 @@ enum { TD_NR, }; -#define TD_ENG_FLAG_SHIFT 17 -#define TD_ENG_FLAG_MASK ((1U << 17) - 1) +#define TD_ENG_FLAG_SHIFT (__TD_F_LAST) +#define TD_ENG_FLAG_MASK ((1ULL << (__TD_F_LAST)) - 1) static inline void td_set_ioengine_flags(struct thread_data *td) { td->flags = (~(TD_ENG_FLAG_MASK << TD_ENG_FLAG_SHIFT) & td->flags) | - (td->io_ops->flags << TD_ENG_FLAG_SHIFT); + ((unsigned long long)td->io_ops->flags << TD_ENG_FLAG_SHIFT); } static inline bool td_ioengine_flagged(struct thread_data *td, @@ -742,17 +790,48 @@ extern void
lat_target_check(struct thread_data *); extern void lat_target_init(struct thread_data *); extern void lat_target_reset(struct thread_data *); +/* + * Inflight log + */ +extern void log_inflight(struct thread_data *, struct io_u *); +extern void invalidate_inflight(struct thread_data *, struct io_u *); +extern void clear_inflight(struct thread_data *); + /* * Iterates all threads/processes within all the defined jobs + * Usage: + * for_each_td(var_name_for_td) { + * << body of your loop >> + * Note: internally-scoped loop index available as __td_index + * } end_for_each() */ -#define for_each_td(td, i) \ - for ((i) = 0, (td) = &segments[0].threads[0]; (i) < (int) thread_number; (i)++, (td) = tnumber_to_td((i))) +#define for_each_td(td) \ +{ \ + int __td_index; \ + struct thread_data *(td); \ + for (__td_index = 0, (td) = &segments[0].threads[0];\ + __td_index < (int) thread_number; __td_index++, (td) = tnumber_to_td(__td_index)) +#define for_each_td_index() \ +{ \ + int __td_index; \ + for (__td_index = 0; __td_index < (int) thread_number; __td_index++) +#define end_for_each() } + #define for_each_file(td, f, i) \ if ((td)->files_index) \ for ((i) = 0, (f) = (td)->files[0]; \ (i) < (td)->o.nr_files && ((f) = (td)->files[i]) != NULL; \ (i)++) +static inline bool fio_offset_overlap_risk(struct thread_data *td) +{ + if (td->o.norandommap || td->o.softrandommap || + td->o.ddir_seq_add || (td->o.ddir_seq_nr > 1)) + return true; + + return false; +} + static inline bool fio_fill_issue_time(struct thread_data *td) { if (td->o.read_iolog_file || diff --git a/fio_sem.h b/fio_sem.h index a796ddd74d..a06f6eb792 100644 --- a/fio_sem.h +++ b/fio_sem.h @@ -21,8 +21,10 @@ enum { extern int __fio_sem_init(struct fio_sem *, int); extern struct fio_sem *fio_sem_init(int); +extern struct fio_sem *fio_shared_sem_init(int); extern void __fio_sem_remove(struct fio_sem *); extern void fio_sem_remove(struct fio_sem *); +extern void fio_shared_sem_remove(struct fio_sem *); extern void
fio_sem_up(struct fio_sem *); extern void fio_sem_down(struct fio_sem *); extern bool fio_sem_down_trylock(struct fio_sem *); diff --git a/fio_shared_sem.c b/fio_shared_sem.c new file mode 100644 index 0000000000..bc26bbe719 --- /dev/null +++ b/fio_shared_sem.c @@ -0,0 +1,42 @@ +/* + * Separate out the two helper functions for fio_sem from "fio_sem.c". + * These two functions depend on fio shared memory. Other fio_sem + * functions in "fio_sem.c" are used for fio shared memory. This file + * separation is required to avoid build failures caused by circular + * dependency. + */ + +#include + +#include "fio_sem.h" +#include "smalloc.h" + +/* + * Allocate and initialize fio_sem lock object in the same manner as + * fio_sem_init(), except the lock object is allocated from the fio + * shared memory. This allows the parent process to free the lock + * allocated by child processes. + */ +struct fio_sem *fio_shared_sem_init(int value) +{ + struct fio_sem *sem; + + sem = smalloc(sizeof(struct fio_sem)); + if (!sem) + return NULL; + + if (!__fio_sem_init(sem, value)) + return sem; + + fio_shared_sem_remove(sem); + return NULL; +} + +/* + * Free the fio_sem lock object allocated by fio_shared_sem_init(). 
+ */ +void fio_shared_sem_remove(struct fio_sem *sem) +{ + __fio_sem_remove(sem); + sfree(sem); +} diff --git a/fio_time.h b/fio_time.h index b3bbd4c011..ef107c50dc 100644 --- a/fio_time.h +++ b/fio_time.h @@ -8,6 +8,10 @@ /* IWYU pragma: end_exports */ #include "lib/types.h" +#define RAMP_PERIOD_CHECK_MSEC 1000 + +extern bool ramp_period_enabled; + struct thread_data; extern uint64_t ntime_since(const struct timespec *, const struct timespec *); extern uint64_t ntime_since_now(const struct timespec *); @@ -22,14 +26,17 @@ extern uint64_t time_since_now(const struct timespec *); extern uint64_t time_since_genesis(void); extern uint64_t mtime_since_genesis(void); extern uint64_t utime_since_genesis(void); +extern void cycles_spin(unsigned int); extern uint64_t usec_spin(unsigned int); extern uint64_t usec_sleep(struct thread_data *, unsigned long); extern void fill_start_time(struct timespec *); extern void set_genesis_time(void); -extern bool ramp_time_over(struct thread_data *); -extern bool in_ramp_time(struct thread_data *); +extern int ramp_period_check(void); +extern bool ramp_period_over(struct thread_data *); +extern bool in_ramp_period(struct thread_data *); +extern int td_ramp_period_init(struct thread_data *); extern void fio_time_init(void); extern void timespec_add_msec(struct timespec *, unsigned int); -extern void set_epoch_time(struct thread_data *, int); +extern void set_epoch_time(struct thread_data *, clockid_t, clockid_t); #endif diff --git a/gclient.c b/gclient.c index e0e0e7bf92..73f64b3b87 100644 --- a/gclient.c +++ b/gclient.c @@ -292,7 +292,7 @@ static void gfio_thread_status_op(struct fio_client *client, if (sum_stat_clients == 1) return; - sum_thread_stats(&client_ts, &p->ts, sum_stat_nr == 1); + sum_thread_stats(&client_ts, &p->ts); sum_group_stats(&client_gs, &p->rs); client_ts.members++; @@ -553,12 +553,15 @@ static void gfio_quit_op(struct fio_client *client, struct fio_net_cmd *cmd) } static struct thread_options 
*gfio_client_add_job(struct gfio_client *gc, - struct thread_options_pack *top) + struct thread_options_pack *top, size_t top_sz) { struct gfio_client_options *gco; gco = calloc(1, sizeof(*gco)); - convert_thread_options_to_cpu(&gco->o, top); + if (convert_thread_options_to_cpu(&gco->o, top, top_sz)) { + dprint(FD_NET, "client: failed parsing add_job command\n"); + return NULL; + } INIT_FLIST_HEAD(&gco->list); flist_add_tail(&gco->list, &gc->o_list); gc->o_list_nr = 1; @@ -577,7 +580,10 @@ static void gfio_add_job_op(struct fio_client *client, struct fio_net_cmd *cmd) p->thread_number = le32_to_cpu(p->thread_number); p->groupid = le32_to_cpu(p->groupid); - o = gfio_client_add_job(gc, &p->top); + o = gfio_client_add_job(gc, &p->top, + cmd->pdu_len - offsetof(struct cmd_add_job_pdu, top)); + if (o == NULL) + return; gdk_threads_enter(); @@ -1155,21 +1161,18 @@ static void gfio_show_clat_percentiles(struct gfio_client *gc, #define GFIO_CLAT 1 #define GFIO_SLAT 2 #define GFIO_LAT 4 -#define GFIO_HILAT 8 -#define GFIO_LOLAT 16 static void gfio_show_ddir_status(struct gfio_client *gc, GtkWidget *mbox, struct group_run_stats *rs, struct thread_stat *ts, int ddir) { const char *ddir_label[3] = { "Read", "Write", "Trim" }; - const char *hilat, *lolat; GtkWidget *frame, *label, *box, *vbox, *main_vbox; - unsigned long long min[5], max[5]; + unsigned long long min[3], max[3]; unsigned long runt; unsigned long long bw, iops; unsigned int flags = 0; - double mean[5], dev[5]; + double mean[3], dev[3]; char *io_p, *io_palt, *bw_p, *bw_palt, *iops_p; char tmp[128]; int i2p; @@ -1268,14 +1271,6 @@ static void gfio_show_ddir_status(struct gfio_client *gc, GtkWidget *mbox, flags |= GFIO_CLAT; if (calc_lat(&ts->lat_stat[ddir], &min[2], &max[2], &mean[2], &dev[2])) flags |= GFIO_LAT; - if (calc_lat(&ts->clat_high_prio_stat[ddir], &min[3], &max[3], &mean[3], &dev[3])) { - flags |= GFIO_HILAT; - if (calc_lat(&ts->clat_low_prio_stat[ddir], &min[4], &max[4], &mean[4], &dev[4])) - flags |= 
GFIO_LOLAT; - /* we only want to print low priority statistics if other IOs were - * submitted with the priority bit set - */ - } if (flags) { frame = gtk_frame_new("Latency"); @@ -1284,24 +1279,12 @@ static void gfio_show_ddir_status(struct gfio_client *gc, GtkWidget *mbox, vbox = gtk_vbox_new(FALSE, 3); gtk_container_add(GTK_CONTAINER(frame), vbox); - if (ts->lat_percentiles) { - hilat = "High priority total latency"; - lolat = "Low priority total latency"; - } else { - hilat = "High priority completion latency"; - lolat = "Low priority completion latency"; - } - if (flags & GFIO_SLAT) gfio_show_lat(vbox, "Submission latency", min[0], max[0], mean[0], dev[0]); if (flags & GFIO_CLAT) gfio_show_lat(vbox, "Completion latency", min[1], max[1], mean[1], dev[1]); if (flags & GFIO_LAT) gfio_show_lat(vbox, "Total latency", min[2], max[2], mean[2], dev[2]); - if (flags & GFIO_HILAT) - gfio_show_lat(vbox, hilat, min[3], max[3], mean[3], dev[3]); - if (flags & GFIO_LOLAT) - gfio_show_lat(vbox, lolat, min[4], max[4], mean[4], dev[4]); } if (ts->slat_percentiles && flags & GFIO_SLAT) @@ -1309,40 +1292,16 @@ static void gfio_show_ddir_status(struct gfio_client *gc, GtkWidget *mbox, ts->io_u_plat[FIO_SLAT][ddir], ts->slat_stat[ddir].samples, "Submission"); - if (ts->clat_percentiles && flags & GFIO_CLAT) { + if (ts->clat_percentiles && flags & GFIO_CLAT) gfio_show_clat_percentiles(gc, main_vbox, ts, ddir, ts->io_u_plat[FIO_CLAT][ddir], ts->clat_stat[ddir].samples, "Completion"); - if (!ts->lat_percentiles) { - if (flags & GFIO_HILAT) - gfio_show_clat_percentiles(gc, main_vbox, ts, ddir, - ts->io_u_plat_high_prio[ddir], - ts->clat_high_prio_stat[ddir].samples, - "High priority completion"); - if (flags & GFIO_LOLAT) - gfio_show_clat_percentiles(gc, main_vbox, ts, ddir, - ts->io_u_plat_low_prio[ddir], - ts->clat_low_prio_stat[ddir].samples, - "Low priority completion"); - } - } - if (ts->lat_percentiles && flags & GFIO_LAT) { + if (ts->lat_percentiles && flags & GFIO_LAT) 
gfio_show_clat_percentiles(gc, main_vbox, ts, ddir, ts->io_u_plat[FIO_LAT][ddir], ts->lat_stat[ddir].samples, "Total"); - if (flags & GFIO_HILAT) - gfio_show_clat_percentiles(gc, main_vbox, ts, ddir, - ts->io_u_plat_high_prio[ddir], - ts->clat_high_prio_stat[ddir].samples, - "High priority total"); - if (flags & GFIO_LOLAT) - gfio_show_clat_percentiles(gc, main_vbox, ts, ddir, - ts->io_u_plat_low_prio[ddir], - ts->clat_low_prio_stat[ddir].samples, - "Low priority total"); - } free(io_p); free(bw_p); diff --git a/gettime.c b/gettime.c index 099e9d9f6c..b19d39208a 100644 --- a/gettime.c +++ b/gettime.c @@ -136,20 +136,10 @@ int fio_get_mono_time(struct timespec *ts) { int ret; -#ifdef CONFIG_CLOCK_GETTIME #if defined(CONFIG_CLOCK_MONOTONIC) ret = clock_gettime(CLOCK_MONOTONIC, ts); #else ret = clock_gettime(CLOCK_REALTIME, ts); -#endif -#else - struct timeval tv; - - ret = gettimeofday(&tv, NULL); - if (ret == 0) { - ts->tv_sec = tv.tv_sec; - ts->tv_nsec = tv.tv_usec * 1000; - } #endif assert(ret <= 0); return ret; @@ -168,7 +158,6 @@ static void __fio_gettime(struct timespec *tp) break; } #endif -#ifdef CONFIG_CLOCK_GETTIME case CS_CGETTIME: { if (fio_get_mono_time(tp) < 0) { log_err("fio: fio_get_mono_time() fails\n"); @@ -176,7 +165,6 @@ static void __fio_gettime(struct timespec *tp) } break; } -#endif #ifdef ARCH_HAVE_CPU_CLOCK case CS_CPUCLOCK: { uint64_t nsecs, t, multiples; @@ -313,7 +301,7 @@ static int calibrate_cpu_clock(void) max_ticks = MAX_CLOCK_SEC * cycles_per_msec * 1000ULL; max_mult = ULLONG_MAX / max_ticks; - dprint(FD_TIME, "\n\nmax_ticks=%llu, __builtin_clzll=%d, " + dprint(FD_TIME, "max_ticks=%llu, __builtin_clzll=%d, " "max_mult=%llu\n", max_ticks, __builtin_clzll(max_ticks), max_mult); @@ -335,7 +323,7 @@ static int calibrate_cpu_clock(void) /* * Find the greatest power of 2 clock ticks that is less than the - * ticks in MAX_CLOCK_SEC_2STAGE + * ticks in MAX_CLOCK_SEC */ max_cycles_shift = max_cycles_mask = 0; tmp = MAX_CLOCK_SEC * 1000ULL * 
cycles_per_msec; @@ -431,22 +419,22 @@ void fio_clock_init(void) uint64_t ntime_since(const struct timespec *s, const struct timespec *e) { - int64_t sec, nsec; + int64_t sec, nsec; - sec = e->tv_sec - s->tv_sec; - nsec = e->tv_nsec - s->tv_nsec; - if (sec > 0 && nsec < 0) { - sec--; - nsec += 1000000000LL; - } + sec = e->tv_sec - s->tv_sec; + nsec = e->tv_nsec - s->tv_nsec; + if (sec > 0 && nsec < 0) { + sec--; + nsec += 1000000000LL; + } /* * time warp bug on some kernels? */ - if (sec < 0 || (sec == 0 && nsec < 0)) - return 0; + if (sec < 0 || (sec == 0 && nsec < 0)) + return 0; - return nsec + (sec * 1000000000LL); + return nsec + (sec * 1000000000LL); } uint64_t ntime_since_now(const struct timespec *s) @@ -623,7 +611,7 @@ static void *clock_thread_fn(void *data) seq = *t->seq; if (seq == UINT_MAX) break; - __sync_synchronize(); + tsc_barrier(); tsc = get_cpu_clock(); } while (seq != atomic32_compare_and_swap(t->seq, seq, seq + 1)); @@ -671,7 +659,7 @@ static int clock_cmp(const void *p1, const void *p2) int fio_monotonic_clocktest(int debug) { struct clock_thread *cthreads; - unsigned int seen_cpus, nr_cpus = cpus_online(); + unsigned int seen_cpus, nr_cpus = cpus_configured(); struct clock_entry *entries; unsigned long nr_entries, tentries, failed = 0; struct clock_entry *prev, *this; diff --git a/gfio.c b/gfio.c index 22c5314d3d..b97337fc6d 100644 --- a/gfio.c +++ b/gfio.c @@ -730,8 +730,7 @@ static struct gui_entry *alloc_new_gui_entry(struct gui *ui) { struct gui_entry *ge; - ge = malloc(sizeof(*ge)); - memset(ge, 0, sizeof(*ge)); + ge = calloc(1, sizeof(*ge)); ge->state = GE_STATE_NEW; ge->ui = ui; return ge; @@ -1243,7 +1242,7 @@ static void about_dialog(GtkWidget *w, gpointer data) "program-name", "gfio", "comments", "Gtk2 UI for fio", "license", license_trans, - "website", "http://git.kernel.dk/cgit/fio/", + "website", "https://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git/", "authors", authors, "version", fio_version_string, "copyright", "© 
2012-2017 Jens Axboe ", diff --git a/goptions.h b/goptions.h index a225a8d1b6..0361750946 100644 --- a/goptions.h +++ b/goptions.h @@ -1,6 +1,8 @@ #ifndef GFIO_OPTIONS_H #define GFIO_OPTIONS_H +#include + void gopt_get_options_window(GtkWidget *window, struct gfio_client *gc); void gopt_init(void); void gopt_exit(void); diff --git a/graph.c b/graph.c index 7a174170c7..3d2b6c96dd 100644 --- a/graph.c +++ b/graph.c @@ -713,8 +713,7 @@ static void graph_label_add_value(struct graph_label *i, void *value, struct graph *g = i->parent; struct graph_value *x; - x = malloc(sizeof(*x)); - memset(x, 0, sizeof(*x)); + x = calloc(1, sizeof(*x)); INIT_FLIST_HEAD(&x->alias); INIT_FLIST_HEAD(&x->list); flist_add_tail(&x->list, &i->value_list); @@ -999,7 +998,7 @@ const char *graph_find_tooltip(struct graph *g, int ix, int iy) ydiff = fabs(yval - y); /* - * zero delta, or within or match critera, break + * zero delta, or within or match criteria, break */ if (ydiff < best_delta) { best_delta = ydiff; diff --git a/hash.h b/hash.h index 2c04bc2969..51f0706e2c 100644 --- a/hash.h +++ b/hash.h @@ -9,32 +9,6 @@ (C) 2002 William Lee Irwin III, IBM */ /* - * Knuth recommends primes in approximately golden ratio to the maximum - * integer representable by a machine word for multiplicative hashing. - * Chuck Lever verified the effectiveness of this technique: - * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf - * - * These primes are chosen to be bit-sparse, that is operations on - * them can use shifts and additions instead of multiplications for - * machines where multiplications are slow. - */ - -#if BITS_PER_LONG == 32 -/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ -#define GOLDEN_RATIO_PRIME 0x9e370001UL -#elif BITS_PER_LONG == 64 -/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ -#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL -#else -#error Define GOLDEN_RATIO_PRIME for your wordsize. 
-#endif - -/* - * The above primes are actively bad for hashing, since they are - * too sparse. The 32-bit one is mostly ok, the 64-bit one causes - * real problems. Besides, the "prime" part is pointless for the - * multiplicative hash. - * * Although a random odd number will do, it turns out that the golden * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice * properties. @@ -142,20 +116,20 @@ static inline uint32_t jhash(const void *key, uint32_t length, uint32_t initval) /* Last block: affect all 32 bits of (c) */ /* All the case statements fall through */ switch (length) { - case 12: c += (uint32_t) k[11] << 24; fallthrough; - case 11: c += (uint32_t) k[10] << 16; fallthrough; - case 10: c += (uint32_t) k[9] << 8; fallthrough; - case 9: c += k[8]; fallthrough; - case 8: b += (uint32_t) k[7] << 24; fallthrough; - case 7: b += (uint32_t) k[6] << 16; fallthrough; - case 6: b += (uint32_t) k[5] << 8; fallthrough; - case 5: b += k[4]; fallthrough; - case 4: a += (uint32_t) k[3] << 24; fallthrough; - case 3: a += (uint32_t) k[2] << 16; fallthrough; - case 2: a += (uint32_t) k[1] << 8; fallthrough; + case 12: c += (uint32_t) k[11] << 24; fio_fallthrough; + case 11: c += (uint32_t) k[10] << 16; fio_fallthrough; + case 10: c += (uint32_t) k[9] << 8; fio_fallthrough; + case 9: c += k[8]; fio_fallthrough; + case 8: b += (uint32_t) k[7] << 24; fio_fallthrough; + case 7: b += (uint32_t) k[6] << 16; fio_fallthrough; + case 6: b += (uint32_t) k[5] << 8; fio_fallthrough; + case 5: b += k[4]; fio_fallthrough; + case 4: a += (uint32_t) k[3] << 24; fio_fallthrough; + case 3: a += (uint32_t) k[2] << 16; fio_fallthrough; + case 2: a += (uint32_t) k[1] << 8; fio_fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); - fallthrough; + fio_fallthrough; case 0: /* Nothing left to add */ break; } diff --git a/helper_thread.c b/helper_thread.c index b9b83db305..88614e58e5 100644 --- a/helper_thread.c +++ b/helper_thread.c @@ -1,4 +1,7 @@ +#include #include +#include 
+#include #include #ifdef CONFIG_HAVE_TIMERFD_CREATE #include @@ -103,13 +106,14 @@ static int read_from_pipe(int fd, void *buf, size_t len) static void block_signals(void) { -#ifdef HAVE_PTHREAD_SIGMASK +#ifdef CONFIG_PTHREAD_SIGMASK sigset_t sigmask; + int ret; + ret = pthread_sigmask(SIG_UNBLOCK, NULL, &sigmask); assert(ret == 0); ret = pthread_sigmask(SIG_BLOCK, &sigmask, NULL); - assert(ret == 0); #endif } @@ -122,7 +126,10 @@ static void submit_action(enum action a) return; ret = write_to_pipe(helper_data->pipe[1], &data, sizeof(data)); - assert(ret == 1); + if (ret != 1) { + log_err("failed to write action into pipe, err %i:%s", errno, strerror(errno)); + assert(0); + } } void helper_reset(void) @@ -154,7 +161,6 @@ void helper_thread_exit(void) return; helper_data->exit = 1; - submit_action(A_EXIT); pthread_join(helper_data->thread, NULL); } @@ -281,10 +287,16 @@ static void *helper_thread_main(void *data) }, { .name = "steadystate", - .interval_ms = steadystate_enabled ? STEADYSTATE_MSEC : + .interval_ms = steadystate_enabled ? ss_check_interval : 0, .func = steadystate_check, - } + }, + { + .name = "ramp_period", + .interval_ms = ramp_period_enabled ? 
+ RAMP_PERIOD_CHECK_MSEC : 0, + .func = ramp_period_check, + }, }; struct timespec ts; long clk_tck; @@ -412,6 +424,8 @@ int helper_thread_create(struct fio_sem *startup_sem, struct sk_out *sk_out) int ret; hd = scalloc(1, sizeof(*hd)); + if (!hd) + return 1; setup_disk_util(); steadystate_setup(); diff --git a/helper_thread.h b/helper_thread.h index d7df6c4d80..1c8167e83b 100644 --- a/helper_thread.h +++ b/helper_thread.h @@ -1,6 +1,11 @@ #ifndef FIO_HELPER_THREAD_H #define FIO_HELPER_THREAD_H +#include + +struct fio_sem; +struct sk_out; + extern void helper_reset(void); extern void helper_do_stat(void); extern bool helper_should_exit(void); diff --git a/idletime.c b/idletime.c index fc1df8e9d0..90ed77ea6e 100644 --- a/idletime.c +++ b/idletime.c @@ -189,7 +189,7 @@ void fio_idle_prof_init(void) pthread_condattr_t cattr; struct idle_prof_thread *ipt; - ipc.nr_cpus = cpus_online(); + ipc.nr_cpus = cpus_configured(); ipc.status = IDLE_PROF_STATUS_OK; if (ipc.opt == IDLE_PROF_OPT_NONE) diff --git a/init.c b/init.c index 5f069d9a5b..130158cbd3 100644 --- a/init.c +++ b/init.c @@ -224,6 +224,13 @@ static struct option l_opts[FIO_NR_OPTIONS] = { .has_arg = optional_argument, .val = 'S', }, +#ifdef WIN32 + { + .name = (char *) "server-internal", + .has_arg = required_argument, + .val = 'N', + }, +#endif { .name = (char *) "daemonize", .has_arg = required_argument, .val = 'D', @@ -605,12 +612,32 @@ static int fixup_options(struct thread_data *td) struct thread_options *o = &td->o; int ret = 0; - if (read_only && (td_write(td) || td_trim(td))) { + /* + * Denote whether we are verifying trims. Now we only have to check a + * single variable instead of having to check all three options. 
+ */ + td->trim_verify = o->verify && o->trim_backlog && o->trim_percentage; + dprint(FD_VERIFY, "td->trim_verify=%d\n", td->trim_verify); + + if (read_only && (td_write(td) || td_trim(td) || td->trim_verify)) { log_err("fio: trim and write operations are not allowed" " with the --readonly parameter.\n"); ret |= 1; } + if (td_trimwrite(td) && o->num_range > 1) { + log_err("fio: trimwrite cannot be used with multiple" + " ranges.\n"); + ret |= 1; + } + + if (td_trim(td) && o->num_range > 1 && + !td_ioengine_flagged(td, FIO_MULTI_RANGE_TRIM)) { + log_err("fio: can't use multiple ranges with IO engine %s\n", + td->io_ops->name); + ret |= 1; + } + #ifndef CONFIG_PSHARED if (!o->use_thread) { log_info("fio: this platform does not support process shared" @@ -663,6 +690,36 @@ static int fixup_options(struct thread_data *td) if (o->zone_mode == ZONE_MODE_STRIDED && !o->zone_range) o->zone_range = o->zone_size; + /* + * SPRandom Requires: random write, random_generator=lfsr, norandommap=1 + */ + if (o->sprandom) { + if (td_write(td) && td_random(td)) { + if (fio_option_is_set(o, random_generator)) { + if (o->random_generator != FIO_RAND_GEN_LFSR) { + log_err("fio: sprandom requires random_generator=lfsr\n"); + ret |= 1; + } + } else { + log_info("fio: sprandom sets random_generator=lfsr\n"); + o->random_generator = FIO_RAND_GEN_LFSR; + } + if (fio_option_is_set(o, norandommap)) { + if (o->norandommap == 0) { + log_err("fio: sprandom requires norandommap=1\n"); + ret |= 1; + } + /* if == 1, OK */ + } else { + log_info("fio: sprandom sets norandommap=1\n"); + o->norandommap = 1; + } + } else { + log_err("fio: sprandom requires random write, random_generator=lfsr, norandommap=1\n"); + ret |= 1; + } + } + /* * Reads can do overwrites, we always need to pre-create the file */ @@ -833,6 +890,59 @@ static int fixup_options(struct thread_data *td) (o->max_bs[DDIR_WRITE] % o->verify_interval)) o->verify_interval = gcd(o->min_bs[DDIR_WRITE], o->max_bs[DDIR_WRITE]); + + if 
(o->verify_only) { + if (!fio_option_is_set(o, verify_write_sequence)) + o->verify_write_sequence = 0; + + if (!fio_option_is_set(o, verify_header_seed)) + o->verify_header_seed = 0; + } + + if (o->norandommap && !td_ioengine_flagged(td, FIO_SYNCIO) && + o->iodepth > 1) { + /* + * Disable write sequence checks with norandommap and + * iodepth > 1. + * Unless we were explicitly asked to enable it. + */ + if (!fio_option_is_set(o, verify_write_sequence)) + o->verify_write_sequence = 0; + } + + /* + * Verify header should not be offset beyond the verify + * interval. + */ + if (o->verify_offset + sizeof(struct verify_header) > + o->verify_interval) { + log_err("fio: cannot offset verify header beyond the " + "verify interval.\n"); + ret |= 1; + } + + /* + * Disable rand_seed check when we have verify_backlog, + * zone reset frequency for zonemode=zbd, or if we are using + * an RB tree for IO history logs. + * Unless we were explicitly asked to enable it. + */ + if (!td_write(td) || (td->flags & TD_F_VER_BACKLOG) || + o->zrf.u.f || fio_offset_overlap_risk(td)) { + if (!fio_option_is_set(o, verify_header_seed)) + o->verify_header_seed = 0; + } + } + + if (td->o.oatomic) { + if (!td_ioengine_flagged(td, FIO_ATOMICWRITES)) { + log_err("fio: engine does not support atomic writes\n"); + td->o.oatomic = 0; + ret |= 1; + } + + if (!td_write(td)) + td->o.oatomic = 0; } if (o->pre_read) { @@ -909,12 +1019,6 @@ static int fixup_options(struct thread_data *td) ret |= 1; } - /* - * O_ATOMIC implies O_DIRECT - */ - if (o->oatomic) - o->odirect = 1; - /* * If randseed is set, that overrides randrepeat */ @@ -950,13 +1054,16 @@ static int fixup_options(struct thread_data *td) if (o->disable_slat) o->slat_percentiles = 0; - /* - * Fix these up to be nsec internally - */ - for_each_rw_ddir(ddir) - o->max_latency[ddir] *= 1000ULL; + /* Do this only for the parent job */ + if (!td->subjob_number) { + /* + * Fix these up to be nsec internally + */ + for_each_rw_ddir(ddir) + 
o->max_latency[ddir] *= 1000ULL; - o->latency_target *= 1000ULL; + o->latency_target *= 1000ULL; + } /* * Dedupe working set verifications @@ -980,6 +1087,33 @@ static int fixup_options(struct thread_data *td) } } + for_each_td(td2) { + if (td->o.ss_check_interval != td2->o.ss_check_interval) { + log_err("fio: conflicting ss_check_interval: %llu and %llu, must be globally equal\n", + td->o.ss_check_interval, td2->o.ss_check_interval); + ret |= 1; + } + } end_for_each(); + if (td->o.ss_dur && td->o.ss_check_interval / 1000L < 1000) { + log_err("fio: ss_check_interval must be at least 1s\n"); + ret |= 1; + + } + if (td->o.ss_dur && (td->o.ss_dur % td->o.ss_check_interval != 0 || td->o.ss_dur <= td->o.ss_check_interval)) { + log_err("fio: ss_duration %lluus must be multiple of ss_check_interval %lluus\n", + td->o.ss_dur, td->o.ss_check_interval); + ret |= 1; + } + + if (td->o.fdp) { + if (fio_option_is_set(&td->o, dp_type) && + (td->o.dp_type == FIO_DP_STREAMS || td->o.dp_type == FIO_DP_NONE)) { + log_err("fio: fdp=1 is not compatible with dataplacement={streams, none}\n"); + ret |= 1; + } else { + td->o.dp_type = FIO_DP_FDP; + } + } return ret; } @@ -1000,7 +1134,12 @@ static void init_rand_file_service(struct thread_data *td) } } -void td_fill_verify_state_seed(struct thread_data *td) +/* + * Separate initialization of the random generator for offsets in case we need + * to re-initialize it if we discover later on that the combination of filesize + * and block size exceeds the limits of the default random generator. 
+ */ +void init_rand_offset_seed(struct thread_data *td) { bool use64; @@ -1009,16 +1148,21 @@ void td_fill_verify_state_seed(struct thread_data *td) else use64 = false; - init_rand_seed(&td->verify_state, td->rand_seeds[FIO_RAND_VER_OFF], - use64); + init_rand_seed(&td->offset_state, td->rand_seeds[FIO_RAND_BLOCK_OFF], use64); } -static void td_fill_rand_seeds_internal(struct thread_data *td, bool use64) +void td_fill_rand_seeds(struct thread_data *td) { uint64_t read_seed = td->rand_seeds[FIO_RAND_BS_OFF]; uint64_t write_seed = td->rand_seeds[FIO_RAND_BS1_OFF]; uint64_t trim_seed = td->rand_seeds[FIO_RAND_BS2_OFF]; int i; + bool use64; + + if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64) + use64 = true; + else + use64 = false; /* * trimwrite is special in that we need to generate the same @@ -1036,7 +1180,8 @@ static void td_fill_rand_seeds_internal(struct thread_data *td, bool use64) init_rand_seed(&td->bsrange_state[DDIR_WRITE], write_seed, use64); init_rand_seed(&td->bsrange_state[DDIR_TRIM], trim_seed, use64); - td_fill_verify_state_seed(td); + init_rand_seed(&td->verify_state, td->rand_seeds[FIO_RAND_VER_OFF], + use64); init_rand_seed(&td->rwmix_state, td->rand_seeds[FIO_RAND_MIX_OFF], false); if (td->o.file_service_type == FIO_FSERVICE_RANDOM) @@ -1045,7 +1190,7 @@ static void td_fill_rand_seeds_internal(struct thread_data *td, bool use64) init_rand_file_service(td); init_rand_seed(&td->file_size_state, td->rand_seeds[FIO_RAND_FILE_SIZE_OFF], use64); - init_rand_seed(&td->trim_state, td->rand_seeds[FIO_RAND_TRIM_OFF], use64); + init_rand_seed(&td->trim_state, td->rand_seeds[FIO_RAND_TRIM_OFF], false); init_rand_seed(&td->delay_state, td->rand_seeds[FIO_RAND_START_DELAY], use64); init_rand_seed(&td->poisson_state[0], td->rand_seeds[FIO_RAND_POISSON_OFF], 0); init_rand_seed(&td->poisson_state[1], td->rand_seeds[FIO_RAND_POISSON2_OFF], 0); @@ -1055,42 +1200,49 @@ static void td_fill_rand_seeds_internal(struct thread_data *td, bool use64) 
init_rand_seed(&td->prio_state, td->rand_seeds[FIO_RAND_PRIO_CMDS], false); init_rand_seed(&td->dedupe_working_set_index_state, td->rand_seeds[FIO_RAND_DEDUPE_WORKING_SET_IX], use64); - if (!td_random(td)) - return; - - if (td->o.rand_repeatable) - td->rand_seeds[FIO_RAND_BLOCK_OFF] = FIO_RANDSEED * td->thread_number; - - init_rand_seed(&td->random_state, td->rand_seeds[FIO_RAND_BLOCK_OFF], use64); + init_rand_offset_seed(td); for (i = 0; i < DDIR_RWDIR_CNT; i++) { struct frand_state *s = &td->seq_rand_state[i]; init_rand_seed(s, td->rand_seeds[FIO_RAND_SEQ_RAND_READ_OFF], false); } + + init_rand_seed(&td->buf_state, td->rand_seeds[FIO_RAND_BUF_OFF], use64); + frand_copy(&td->buf_state_prev, &td->buf_state); + + init_rand_seed(&td->fdp_state, td->rand_seeds[FIO_RAND_FDP_OFF], false); + init_rand_seed(&td->sprandom_state, td->rand_seeds[FIO_RAND_SPRANDOM_OFF], false); } -void td_fill_rand_seeds(struct thread_data *td) +static int setup_random_seeds(struct thread_data *td) { - bool use64; - - if (td->o.allrand_repeatable) { - unsigned int i; + uint64_t seed; + unsigned int i; - for (i = 0; i < FIO_RAND_NR_OFFS; i++) - td->rand_seeds[i] = FIO_RANDSEED * td->thread_number - + i; + if (!td->o.rand_repeatable && !fio_option_is_set(&td->o, rand_seed)) { + int ret = init_random_seeds(td->rand_seeds, sizeof(td->rand_seeds)); + dprint(FD_RANDOM, "using system RNG for random seeds\n"); + if (ret) + return ret; + } else { + seed = td->o.rand_seed; + for (i = 0; i < 4; i++) + seed *= 0x9e370001UL; + + for (i = 0; i < FIO_RAND_NR_OFFS; i++) { + td->rand_seeds[i] = seed * td->thread_number + i; + seed *= 0x9e370001UL; + } } - if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64) - use64 = true; - else - use64 = false; + dprint(FD_RANDOM, "FIO_RAND_NR_OFFS=%d\n", FIO_RAND_NR_OFFS); + for (int i = 0; i < FIO_RAND_NR_OFFS; i++) + dprint(FD_RANDOM, "rand_seeds[%d]=%" PRIu64 "\n", i, td->rand_seeds[i]); - td_fill_rand_seeds_internal(td, use64); + td_fill_rand_seeds(td); - 
init_rand_seed(&td->buf_state, td->rand_seeds[FIO_RAND_BUF_OFF], use64); - frand_copy(&td->buf_state_prev, &td->buf_state); + return 0; } /* @@ -1226,31 +1378,6 @@ static void init_flags(struct thread_data *td) } } -static int setup_random_seeds(struct thread_data *td) -{ - uint64_t seed; - unsigned int i; - - if (!td->o.rand_repeatable && !fio_option_is_set(&td->o, rand_seed)) { - int ret = init_random_seeds(td->rand_seeds, sizeof(td->rand_seeds)); - if (!ret) - td_fill_rand_seeds(td); - return ret; - } - - seed = td->o.rand_seed; - for (i = 0; i < 4; i++) - seed *= 0x9e370001UL; - - for (i = 0; i < FIO_RAND_NR_OFFS; i++) { - td->rand_seeds[i] = seed * td->thread_number + i; - seed *= 0x9e370001UL; - } - - td_fill_rand_seeds(td); - return 0; -} - enum { FPRE_NONE = 0, FPRE_JOBNAME, @@ -1404,15 +1531,14 @@ static void gen_log_name(char *name, size_t size, const char *logtype, static int check_waitees(char *waitee) { - struct thread_data *td; - int i, ret = 0; + int ret = 0; - for_each_td(td, i) { + for_each_td(td) { if (td->subjob_number) continue; ret += !strcmp(td->o.name, waitee); - } + } end_for_each(); return ret; } @@ -1445,6 +1571,23 @@ static bool wait_for_ok(const char *jobname, struct thread_options *o) return true; } +static int verify_per_group_options(struct thread_data *td, const char *jobname) +{ + for_each_td(td2) { + if (td->groupid != td2->groupid) + continue; + + if (td->o.stats && + td->o.lat_percentiles != td2->o.lat_percentiles) { + log_err("fio: lat_percentiles in job: %s differs from group\n", + jobname); + return 1; + } + } end_for_each(); + + return 0; +} + /* * Treat an empty log file name the same as a one not given */ @@ -1514,7 +1657,7 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num, if (fixup_options(td)) goto err; - if (init_dedupe_working_set_seeds(td)) + if (!td->o.dedupe_global && init_dedupe_working_set_seeds(td, 0)) goto err; /* @@ -1549,7 +1692,14 @@ static int add_job(struct thread_data *td, 
const char *jobname, int job_add_num, td->ts.sig_figs = o->sig_figs; init_thread_stat_min_vals(&td->ts); - td->ddir_seq_nr = o->ddir_seq_nr; + + /* + * td->ddir_seq_nr needs to be initialized to 1, NOT o->ddir_seq_nr, + * so that get_next_offset gets a new random offset the first time it + * is called, instead of keeping an initial offset of 0 for the first + * nr-1 calls + */ + td->ddir_seq_nr = 1; if ((o->stonewall || o->new_group) && prev_group_jobs) { prev_group_jobs = 0; @@ -1563,9 +1713,16 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num, td->groupid = groupid; prev_group_jobs++; + if (td->o.group_reporting && prev_group_jobs > 1 && + verify_per_group_options(td, jobname)) + goto err; + if (setup_rate(td)) goto err; + if (td_ramp_period_init(td)) + goto err; + if (o->write_lat_log) { struct log_params p = { .td = td, @@ -1575,29 +1732,44 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num, .log_type = IO_LOG_TYPE_LAT, .log_offset = o->log_offset, .log_prio = o->log_prio, + .log_issue_time = o->log_issue_time, .log_gz = o->log_gz, .log_gz_store = o->log_gz_store, }; const char *pre = make_log_name(o->lat_log_file, o->name); const char *suf; + if (o->log_issue_time && !o->log_offset) { + log_err("fio: log_issue_time option requires write_lat_log and log_offset options\n"); + goto err; + } + if (p.log_gz_store) suf = "log.fz"; else suf = "log"; - gen_log_name(logname, sizeof(logname), "lat", pre, - td->thread_number, suf, o->per_job_logs); - setup_log(&td->lat_log, &p, logname); + if (!o->disable_lat) { + gen_log_name(logname, sizeof(logname), "lat", pre, + td->thread_number, suf, o->per_job_logs); + setup_log(&td->lat_log, &p, logname); + } - gen_log_name(logname, sizeof(logname), "slat", pre, - td->thread_number, suf, o->per_job_logs); - setup_log(&td->slat_log, &p, logname); + if (!o->disable_slat) { + gen_log_name(logname, sizeof(logname), "slat", pre, + td->thread_number, suf, 
o->per_job_logs); + setup_log(&td->slat_log, &p, logname); + } - gen_log_name(logname, sizeof(logname), "clat", pre, - td->thread_number, suf, o->per_job_logs); - setup_log(&td->clat_log, &p, logname); + if (!o->disable_clat) { + gen_log_name(logname, sizeof(logname), "clat", pre, + td->thread_number, suf, o->per_job_logs); + setup_log(&td->clat_log, &p, logname); + } + } else if (o->log_issue_time) { + log_err("fio: log_issue_time option requires write_lat_log and log_offset options\n"); + goto err; } if (o->write_hist_log) { @@ -1609,6 +1781,7 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num, .log_type = IO_LOG_TYPE_HIST, .log_offset = o->log_offset, .log_prio = o->log_prio, + .log_issue_time = o->log_issue_time, .log_gz = o->log_gz, .log_gz_store = o->log_gz_store, }; @@ -1616,7 +1789,7 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num, const char *suf; #ifndef CONFIG_ZLIB - if (td->client_type) { + if (is_backend) { log_err("fio: --write_hist_log requires zlib in client/server mode\n"); goto err; } @@ -1641,6 +1814,7 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num, .log_type = IO_LOG_TYPE_BW, .log_offset = o->log_offset, .log_prio = o->log_prio, + .log_issue_time = o->log_issue_time, .log_gz = o->log_gz, .log_gz_store = o->log_gz_store, }; @@ -1673,6 +1847,7 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num, .log_type = IO_LOG_TYPE_IOPS, .log_offset = o->log_offset, .log_prio = o->log_prio, + .log_issue_time = o->log_issue_time, .log_gz = o->log_gz, .log_gz_store = o->log_gz_store, }; @@ -1920,8 +2095,7 @@ static int __parse_jobs_ini(struct thread_data *td, * it's really 256 + small bit, 280 should suffice */ if (!nested) { - name = malloc(280); - memset(name, 0, 280); + name = calloc(1, 280); } opts = NULL; @@ -2141,6 +2315,10 @@ static int __parse_jobs_ini(struct thread_data *td, i++; } + free(job_sections); + job_sections = 
NULL; + nr_job_sections = 0; + free(opts); out: free(string); @@ -2221,7 +2399,7 @@ static void usage(const char *name) printf(" --minimal\t\tMinimal (terse) output\n"); printf(" --output-format=type\tOutput format (terse,json,json+,normal)\n"); printf(" --terse-version=type\tSet terse version output format" - " (default 3, or 2 or 4)\n"); + " (default 3, or 2 or 4 or 5)\n"); printf(" --version\t\tPrint version info and exit\n"); printf(" --help\t\tPrint this page\n"); printf(" --cpuclock-test\tPerform test/validation of CPU clock\n"); @@ -2343,6 +2521,10 @@ const struct debug_level debug_levels[] = { .help = "Zoned Block Device logging", .shift = FD_ZBD, }, + { .name = "sprandom", + .help = "SPRandom logging", + .shift = FD_SPRANDOM, + }, { .name = NULL, }, }; @@ -2762,6 +2944,15 @@ int parse_cmd_line(int argc, char *argv[], int client_type) break; ret = fio_cmd_ioengine_option_parse(td, opt, val); + + if (ret) { + if (td) { + put_job(td); + td = NULL; + } + do_exit++; + exit_val = 1; + } break; } case 'w': @@ -2789,6 +2980,12 @@ int parse_cmd_line(int argc, char *argv[], int client_type) exit_val = 1; #endif break; +#ifdef WIN32 + case 'N': + did_arg = true; + fio_server_internal_set(optarg); + break; +#endif case 'D': if (pid_file) free(pid_file); @@ -2936,7 +3133,7 @@ int parse_cmd_line(int argc, char *argv[], int client_type) log_err("%s: unrecognized option '%s'\n", argv[0], argv[optind - 1]); show_closest_option(argv[optind - 1]); - fallthrough; + fio_fallthrough; default: do_exit++; exit_val = 1; diff --git a/io_ddir.h b/io_ddir.h index 296a9d04ac..280c1e796a 100644 --- a/io_ddir.h +++ b/io_ddir.h @@ -11,6 +11,7 @@ enum fio_ddir { DDIR_WAIT, DDIR_LAST, DDIR_INVAL = -1, + DDIR_TIMEOUT = -2, DDIR_RWDIR_CNT = 3, DDIR_RWDIR_SYNC_CNT = 4, @@ -41,6 +42,7 @@ enum td_ddir { TD_DDIR_RANDRW = TD_DDIR_RW | TD_DDIR_RAND, TD_DDIR_RANDTRIM = TD_DDIR_TRIM | TD_DDIR_RAND, TD_DDIR_TRIMWRITE = TD_DDIR_TRIM | TD_DDIR_WRITE, + TD_DDIR_RANDTRIMWRITE = TD_DDIR_RANDTRIM | 
TD_DDIR_WRITE, }; #define td_read(td) ((td)->o.td_ddir & TD_DDIR_READ) @@ -51,6 +53,8 @@ enum td_ddir { #define file_randommap(td, f) (!(td)->o.norandommap && fio_file_axmap((f))) #define td_trimwrite(td) (((td)->o.td_ddir & TD_DDIR_TRIMWRITE) \ == TD_DDIR_TRIMWRITE) +#define td_randtrimwrite(td) (((td)->o.td_ddir & TD_DDIR_RANDTRIMWRITE) \ + == TD_DDIR_RANDTRIMWRITE) static inline int ddir_sync(enum fio_ddir ddir) { @@ -67,7 +71,8 @@ static inline const char *ddir_str(enum td_ddir ddir) { static const char *__str[] = { NULL, "read", "write", "rw", "rand", "randread", "randwrite", "randrw", - "trim", NULL, "trimwrite", NULL, "randtrim" }; + "trim", NULL, "trimwrite", NULL, "randtrim", + NULL, "randtrimwrite" }; return __str[ddir]; } diff --git a/io_u.c b/io_u.c index 3c72d63d0d..653a700c83 100644 --- a/io_u.c +++ b/io_u.c @@ -11,6 +11,7 @@ #include "lib/pow2.h" #include "minmax.h" #include "zbd.h" +#include "sprandom.h" struct io_completion_data { int nr; /* input */ @@ -84,6 +85,22 @@ static uint64_t last_block(struct thread_data *td, struct fio_file *f, return max_blocks; } + +static int __get_next_rand_offset_sprandom(struct thread_data *td, struct fio_file *f, + enum fio_ddir ddir, uint64_t *b, + uint64_t lastb) +{ + assert(ddir == DDIR_WRITE); + + /* SP RANDOM writes all addresses once */ + if (sprandom_get_next_offset(f->spr_info, f, b)) { + dprint(FD_SPRANDOM, "sprandom is done\n"); + td->done = 1; + return 1; + } + return 0; +} + static int __get_next_rand_offset(struct thread_data *td, struct fio_file *f, enum fio_ddir ddir, uint64_t *b, uint64_t lastb) @@ -93,11 +110,11 @@ static int __get_next_rand_offset(struct thread_data *td, struct fio_file *f, if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE || td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64) { - r = __rand(&td->random_state); + r = __rand(&td->offset_state); dprint(FD_RANDOM, "off rand %llu\n", (unsigned long long) r); - *b = lastb * (r / (rand_max(&td->random_state) + 1.0)); + *b = lastb 
* (r / (rand_max(&td->offset_state) + 1.0)); } else { uint64_t off = 0; @@ -277,7 +294,9 @@ static int __get_next_rand_offset_zoned(struct thread_data *td, static int get_next_rand_offset(struct thread_data *td, struct fio_file *f, enum fio_ddir ddir, uint64_t *b) { - if (td->o.random_distribution == FIO_RAND_DIST_RANDOM) { + if (td->o.sprandom && ddir == DDIR_WRITE) { + return __get_next_rand_offset_sprandom(td, f, ddir, b, 0); + } else if (td->o.random_distribution == FIO_RAND_DIST_RANDOM) { uint64_t lastb; lastb = last_block(td, f, ddir); @@ -355,11 +374,22 @@ static int get_next_seq_offset(struct thread_data *td, struct fio_file *f, * and invalidate the cache, if we need to. */ if (f->last_pos[ddir] >= f->io_size + get_start_offset(td, f) && - o->time_based) { + o->time_based && o->nr_files == 1) { f->last_pos[ddir] = f->file_offset; loop_cache_invalidate(td, f); } + /* + * If we reach the end for a rw-io-size based run, reset us back to 0 + * and invalidate the cache, if we need to. + */ + if (td_rw(td) && o->io_size > o->size) { + if (f->last_pos[ddir] >= f->io_size + get_start_offset(td, f)) { + f->last_pos[ddir] = f->file_offset; + loop_cache_invalidate(td, f); + } + } + if (f->last_pos[ddir] < f->real_file_size) { uint64_t pos; @@ -417,7 +447,13 @@ static int get_next_block(struct thread_data *td, struct io_u *io_u, b = offset = -1ULL; - if (rw_seq) { + if (td_randtrimwrite(td) && ddir == DDIR_WRITE) { + /* don't mark randommap for these writes */ + io_u_set(td, io_u, IO_U_F_BUSY_OK); + offset = f->last_start[DDIR_TRIM] - f->file_offset; + *is_random = true; + ret = 0; + } else if (rw_seq) { if (td_random(td)) { if (should_do_random(td, ddir)) { ret = get_next_rand_block(td, f, ddir, &b); @@ -507,6 +543,24 @@ static int get_next_offset(struct thread_data *td, struct io_u *io_u, return 1; } + /* + * For randtrimwrite, we decide whether to issue a trim or a write + * based on whether the offsets for the most recent trim and write + * operations match. 
If they don't match that means we just issued a + * new trim and the next operation should be a write. If they *do* + * match that means we just completed a trim+write pair and the next + * command should be a trim. + * + * This works fine for sequential workloads but for random workloads + * it's possible to complete a trim+write pair and then have the next + * randomly generated offset match the previous offset. If that happens + * we need to alter the offset for the last write operation in order + * to ensure that we issue a write operation the next time through. + */ + if (td_randtrimwrite(td) && ddir == DDIR_TRIM && + f->last_start[DDIR_TRIM] == io_u->offset) + f->last_start[DDIR_WRITE]--; + io_u->verify_offset = io_u->offset; return 0; } @@ -530,6 +584,12 @@ static unsigned long long get_next_buflen(struct thread_data *td, struct io_u *i assert(ddir_rw(ddir)); + if (td_randtrimwrite(td) && ddir == DDIR_WRITE) { + struct fio_file *f = io_u->file; + + return f->last_pos[DDIR_TRIM] - f->last_start[DDIR_TRIM]; + } + if (td->o.bs_is_seq_rand) ddir = is_random ? DDIR_WRITE : DDIR_READ; @@ -687,7 +747,7 @@ static enum fio_ddir rate_ddir(struct thread_data *td, enum fio_ddir ddir) * check if the usec is capable of taking negative values */ if (now > td->o.timeout) { - ddir = DDIR_INVAL; + ddir = DDIR_TIMEOUT; return ddir; } usec = td->o.timeout - now; @@ -696,7 +756,7 @@ static enum fio_ddir rate_ddir(struct thread_data *td, enum fio_ddir ddir) now = utime_since_now(&td->epoch); if ((td->o.timeout && (now > td->o.timeout)) || td->terminate) - ddir = DDIR_INVAL; + ddir = DDIR_TIMEOUT; return ddir; } @@ -714,7 +774,7 @@ static enum fio_ddir get_rw_ddir(struct thread_data *td) * See if it's time to fsync/fdatasync/sync_file_range first, * and if not then move on to check regular I/Os. 
*/ - if (should_fsync(td)) { + if (should_fsync(td) && td->last_ddir_issued == DDIR_WRITE) { if (td->o.fsync_blocks && td->io_issues[DDIR_WRITE] && !(td->io_issues[DDIR_WRITE] % td->o.fsync_blocks)) return DDIR_SYNC; @@ -755,7 +815,15 @@ static enum fio_ddir get_rw_ddir(struct thread_data *td) else ddir = DDIR_INVAL; - td->rwmix_ddir = rate_ddir(td, ddir); + if (!should_check_rate(td)) { + /* + * avoid time-consuming call to utime_since_now() if rate checking + * isn't being used. this improves IOPs 50%. See: + * https://github.com/axboe/fio/issues/1501#issuecomment-1418327049 + */ + td->rwmix_ddir = ddir; + } else + td->rwmix_ddir = rate_ddir(td, ddir); return td->rwmix_ddir; } @@ -766,9 +834,9 @@ static void set_rw_ddir(struct thread_data *td, struct io_u *io_u) if (td->o.zone_mode == ZONE_MODE_ZBD) ddir = zbd_adjust_ddir(td, io_u, ddir); - if (td_trimwrite(td)) { + if (td_trimwrite(td) && !ddir_sync(ddir)) { struct fio_file *f = io_u->file; - if (f->last_pos[DDIR_WRITE] == f->last_pos[DDIR_TRIM]) + if (f->last_start[DDIR_WRITE] == f->last_start[DDIR_TRIM]) ddir = DDIR_TRIM; else ddir = DDIR_WRITE; @@ -820,9 +888,16 @@ void put_io_u(struct thread_data *td, struct io_u *io_u) __td_io_u_unlock(td); } +static inline void io_u_clear_inflight_flags(struct thread_data *td, + struct io_u *io_u) +{ + io_u_clear(td, io_u, IO_U_F_FLIGHT | IO_U_F_BUSY_OK | + IO_U_F_PATTERN_DONE); +} + void clear_io_u(struct thread_data *td, struct io_u *io_u) { - io_u_clear(td, io_u, IO_U_F_FLIGHT); + io_u_clear_inflight_flags(td, io_u); put_io_u(td, io_u); } @@ -902,6 +977,65 @@ static void setup_strided_zone_mode(struct thread_data *td, struct io_u *io_u) fio_file_reset(td, f); } +static int fill_multi_range_io_u(struct thread_data *td, struct io_u *io_u) +{ + bool is_random; + uint64_t buflen, i = 0; + struct trim_range *range; + struct fio_file *f = io_u->file; + uint8_t *buf; + + buf = io_u->buf; + buflen = 0; + + while (i < td->o.num_range) { + range = (struct trim_range *)buf; + if 
(get_next_offset(td, io_u, &is_random)) { + dprint(FD_IO, "io_u %p, failed getting offset\n", + io_u); + break; + } + + io_u->buflen = get_next_buflen(td, io_u, is_random); + if (!io_u->buflen) { + dprint(FD_IO, "io_u %p, failed getting buflen\n", io_u); + break; + } + + if (io_u->offset + io_u->buflen > io_u->file->real_file_size) { + dprint(FD_IO, "io_u %p, off=0x%llx + len=0x%llx exceeds file size=0x%llx\n", + io_u, + (unsigned long long) io_u->offset, io_u->buflen, + (unsigned long long) io_u->file->real_file_size); + break; + } + + range->start = io_u->offset; + range->len = io_u->buflen; + buflen += io_u->buflen; + f->last_start[io_u->ddir] = io_u->offset; + f->last_pos[io_u->ddir] = io_u->offset + range->len; + + buf += sizeof(struct trim_range); + i++; + + if (td_random(td) && file_randommap(td, io_u->file)) + mark_random_map(td, io_u, io_u->offset, io_u->buflen); + dprint_io_u(io_u, "fill"); + } + if (buflen) { + /* + * Set buffer length as overall trim length for this IO, and + * tell the ioengine about the number of ranges to be trimmed. + */ + io_u->buflen = buflen; + io_u->number_trim = i; + return 0; + } + + return 1; +} + static int fill_io_u(struct thread_data *td, struct io_u *io_u) { bool is_random; @@ -913,7 +1047,7 @@ static int fill_io_u(struct thread_data *td, struct io_u *io_u) set_rw_ddir(td, io_u); - if (io_u->ddir == DDIR_INVAL) { + if (io_u->ddir == DDIR_INVAL || io_u->ddir == DDIR_TIMEOUT) { dprint(FD_IO, "invalid direction received ddir = %d", io_u->ddir); return 1; } @@ -928,28 +1062,38 @@ static int fill_io_u(struct thread_data *td, struct io_u *io_u) else if (td->o.zone_mode == ZONE_MODE_ZBD) setup_zbd_zone_mode(td, io_u); - /* - * No log, let the seq/rand engine retrieve the next buflen and - * position. 
- */ - if (get_next_offset(td, io_u, &is_random)) { - dprint(FD_IO, "io_u %p, failed getting offset\n", io_u); - return 1; - } + if (multi_range_trim(td, io_u)) { + if (fill_multi_range_io_u(td, io_u)) + return 1; + } else { + /* + * No log, let the seq/rand engine retrieve the next buflen and + * position. + */ + if (get_next_offset(td, io_u, &is_random)) { + dprint(FD_IO, "io_u %p, failed getting offset\n", io_u); + return 1; + } - io_u->buflen = get_next_buflen(td, io_u, is_random); - if (!io_u->buflen) { - dprint(FD_IO, "io_u %p, failed getting buflen\n", io_u); - return 1; + io_u->buflen = get_next_buflen(td, io_u, is_random); + if (!io_u->buflen) { + dprint(FD_IO, "io_u %p, failed getting buflen\n", io_u); + return 1; + } } - offset = io_u->offset; + if (td->o.zone_mode == ZONE_MODE_ZBD) { ret = zbd_adjust_block(td, io_u); - if (ret == io_u_eof) + if (ret == io_u_eof) { + dprint(FD_IO, "zbd_adjust_block() returned io_u_eof\n"); return 1; + } } + if (td->o.dp_type != FIO_DP_NONE) + dp_fill_dspec_data(td, io_u); + if (io_u->offset + io_u->buflen > io_u->file->real_file_size) { dprint(FD_IO, "io_u %p, off=0x%llx + len=0x%llx exceeds file size=0x%llx\n", io_u, @@ -961,11 +1105,12 @@ static int fill_io_u(struct thread_data *td, struct io_u *io_u) /* * mark entry before potentially trimming io_u */ - if (td_random(td) && file_randommap(td, io_u->file)) + if (!multi_range_trim(td, io_u) && td_random(td) && file_randommap(td, io_u->file)) io_u->buflen = mark_random_map(td, io_u, offset, io_u->buflen); out: - dprint_io_u(io_u, "fill"); + if (!multi_range_trim(td, io_u)) + dprint_io_u(io_u, "fill"); io_u->verify_offset = io_u->offset; td->zone_bytes += io_u->buflen; return 0; @@ -993,7 +1138,7 @@ static void __io_u_mark_map(uint64_t *map, unsigned int nr) break; case 1 ... 4: idx = 1; - fallthrough; + fio_fallthrough; case 0: break; } @@ -1035,7 +1180,7 @@ void io_u_mark_depth(struct thread_data *td, unsigned int nr) break; case 2 ... 
3: idx = 1; - fallthrough; + fio_fallthrough; case 1: break; } @@ -1076,7 +1221,7 @@ static void io_u_mark_lat_nsec(struct thread_data *td, unsigned long long nsec) break; case 2 ... 3: idx = 1; - fallthrough; + fio_fallthrough; case 0 ... 1: break; } @@ -1118,7 +1263,7 @@ static void io_u_mark_lat_usec(struct thread_data *td, unsigned long long usec) break; case 2 ... 3: idx = 1; - fallthrough; + fio_fallthrough; case 0 ... 1: break; } @@ -1166,7 +1311,7 @@ static void io_u_mark_lat_msec(struct thread_data *td, unsigned long long msec) break; case 2 ... 3: idx = 1; - fallthrough; + fio_fallthrough; case 0 ... 1: break; } @@ -1327,8 +1472,8 @@ static struct fio_file *__get_next_file(struct thread_data *td) if (td->o.file_service_type == FIO_FSERVICE_SEQ) goto out; if (td->file_service_left) { - td->file_service_left--; - goto out; + td->file_service_left--; + goto out; } } @@ -1376,6 +1521,10 @@ static long set_io_u_file(struct thread_data *td, struct io_u *io_u) put_file_log(td, f); td_io_close_file(td, f); io_u->file = NULL; + + if (io_u->ddir == DDIR_TIMEOUT) + return 1; + if (td->o.file_service_type & __FIO_FSERVICE_NONUNIFORM) fio_file_reset(td, f); else { @@ -1570,7 +1719,6 @@ struct io_u *__get_io_u(struct thread_data *td) { const bool needs_lock = td_async_processing(td); struct io_u *io_u = NULL; - int ret; if (td->stop_io) return NULL; @@ -1582,6 +1730,10 @@ struct io_u *__get_io_u(struct thread_data *td) if (!io_u_rempty(&td->io_u_requeues)) { io_u = io_u_rpop(&td->io_u_requeues); io_u->resid = 0; + if (io_u->file && td->runstate == TD_FSYNCING) { + put_file_log(td, io_u->file); + io_u->file = NULL; + } } else if (!queue_full(td)) { io_u = io_u_qpop(&td->io_u_freelist); @@ -1595,7 +1747,7 @@ struct io_u *__get_io_u(struct thread_data *td) assert(io_u->flags & IO_U_F_FREE); io_u_clear(td, io_u, IO_U_F_FREE | IO_U_F_NO_FILE_PUT | IO_U_F_TRIMMED | IO_U_F_BARRIER | - IO_U_F_VER_LIST | IO_U_F_HIGH_PRIO); + IO_U_F_VER_LIST); io_u->error = 0; io_u->acct_ddir = 
-1; @@ -1604,14 +1756,16 @@ struct io_u *__get_io_u(struct thread_data *td) io_u_set(td, io_u, IO_U_F_IN_CUR_DEPTH); io_u->ipo = NULL; } else if (td_async_processing(td)) { + int ret; /* * We ran out, wait for async verify threads to finish and * return one */ assert(!(td->flags & TD_F_CHILD)); ret = pthread_cond_wait(&td->free_cond, &td->io_u_lock); - assert(ret == 0); - if (!td->error) + if (fio_unlikely(ret != 0)) { + td->error = errno; + } else if (!td->error) goto again; } @@ -1625,20 +1779,26 @@ static bool check_get_trim(struct thread_data *td, struct io_u *io_u) { if (!(td->flags & TD_F_TRIM_BACKLOG)) return false; - if (!td->trim_entries) + if (!td->trim_entries) { + td->trim_batch = 0; return false; + } if (td->trim_batch) { td->trim_batch--; if (get_next_trim(td, io_u)) return true; + else + td->trim_batch = 0; } else if (!(td->io_hist_len % td->o.trim_backlog) && - td->last_ddir != DDIR_READ) { - td->trim_batch = td->o.trim_batch; - if (!td->trim_batch) - td->trim_batch = td->o.trim_backlog; - if (get_next_trim(td, io_u)) + td->last_ddir_completed != DDIR_TRIM) { + if (get_next_trim(td, io_u)) { + td->trim_batch = td->o.trim_batch; + if (!td->trim_batch) + td->trim_batch = td->o.trim_backlog; + td->trim_batch--; return true; + } } return false; @@ -1655,7 +1815,7 @@ static bool check_get_verify(struct thread_data *td, struct io_u *io_u) if (td->verify_batch) get_verify = 1; else if (!(td->io_hist_len % td->o.verify_backlog) && - td->last_ddir != DDIR_READ) { + td->last_ddir_completed != DDIR_READ) { td->verify_batch = td->o.verify_batch; if (!td->verify_batch) td->verify_batch = td->o.verify_backlog; @@ -1766,7 +1926,7 @@ struct io_u *get_io_u(struct thread_data *td) assert(fio_file_open(f)); - if (ddir_rw(io_u->ddir)) { + if (ddir_rw(io_u->ddir) && !multi_range_trim(td, io_u)) { if (!io_u->buflen && !td_ioengine_flagged(td, FIO_NOIO)) { dprint(FD_IO, "get_io_u: zero buflen on %p\n", io_u); goto err_put; @@ -1782,8 +1942,9 @@ struct io_u 
*get_io_u(struct thread_data *td) io_u->buflen); } else if ((td->flags & TD_F_SCRAMBLE_BUFFERS) && !(td->flags & TD_F_COMPRESS) && - !(td->flags & TD_F_DO_VERIFY)) + !(td->flags & TD_F_DO_VERIFY)) { do_scramble = 1; + } } else if (io_u->ddir == DDIR_READ) { /* * Reset the buf_filled parameters so next time if the @@ -1803,6 +1964,7 @@ struct io_u *get_io_u(struct thread_data *td) * Remember the issuing context priority. The IO engine may change this. */ io_u->ioprio = td->ioprio; + io_u->clat_prio_index = 0; out: assert(io_u->file); if (!td_io_prep(td, io_u)) { @@ -1823,25 +1985,36 @@ struct io_u *get_io_u(struct thread_data *td) static void __io_u_log_error(struct thread_data *td, struct io_u *io_u) { enum error_type_bit eb = td_error_type(io_u->ddir, io_u->error); + bool non_fatal_error = td_non_fatal_error(td, eb, io_u->error); - if (td_non_fatal_error(td, eb, io_u->error) && !td->o.error_dump) + /* + * Non-fatal errors (errors that should be ignored), are normally not + * dumped to the log, unless td->o.error_dump. Regardless, non-fatal + * errors should never call td_verror() to set td->error. + */ + if (non_fatal_error && !td->o.error_dump) return; log_err("fio: io_u error%s%s: %s: %s offset=%llu, buflen=%llu\n", io_u->file ? " on file " : "", io_u->file ? io_u->file->file_name : "", - strerror(io_u->error), + (io_u->flags & IO_U_F_DEVICE_ERROR) ? 
+ "Device-specific error" : strerror(io_u->error), io_ddir_name(io_u->ddir), io_u->offset, io_u->xfer_buflen); + zbd_log_err(td, io_u); + if (td->io_ops->errdetails) { - char *err = td->io_ops->errdetails(io_u); + char *err = td->io_ops->errdetails(td, io_u); - log_err("fio: %s\n", err); - free(err); + if (err) { + log_err("fio: %s\n", err); + free(err); + } } - if (!td->error) + if (!td->error && !non_fatal_error) td_verror(td, io_u->error, "io_u error"); } @@ -1888,8 +2061,7 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u, unsigned long long tnsec; tnsec = ntime_since(&io_u->start_time, &icd->time); - add_lat_sample(td, idx, tnsec, bytes, io_u->offset, - io_u->ioprio, io_u_is_high_prio(io_u)); + add_lat_sample(td, idx, tnsec, bytes, io_u); if (td->flags & TD_F_PROFILE_OPS) { struct prof_io_ops *ops = &td->prof_io_ops; @@ -1910,8 +2082,7 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u, if (ddir_rw(idx)) { if (!td->o.disable_clat) { - add_clat_sample(td, idx, llnsec, bytes, io_u->offset, - io_u->ioprio, io_u_is_high_prio(io_u)); + add_clat_sample(td, idx, llnsec, bytes, io_u); io_u_mark_latency(td, llnsec); } @@ -1930,8 +2101,6 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u, static void file_log_write_comp(const struct thread_data *td, struct fio_file *f, uint64_t offset, unsigned int bytes) { - int idx; - if (!f) return; @@ -1939,19 +2108,11 @@ static void file_log_write_comp(const struct thread_data *td, struct fio_file *f f->first_write = offset; if (f->last_write == -1ULL || ((offset + bytes) > f->last_write)) f->last_write = offset + bytes; - - if (!f->last_write_comp) - return; - - idx = f->last_write_idx++; - f->last_write_comp[idx] = offset; - if (f->last_write_idx == td->o.iodepth) - f->last_write_idx = 0; } static bool should_account(struct thread_data *td) { - return ramp_time_over(td) && (td->runstate == TD_RUNNING || + return ramp_period_over(td) && 
(td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING); } @@ -1965,7 +2126,13 @@ static void io_completed(struct thread_data *td, struct io_u **io_u_ptr, dprint_io_u(io_u, "complete"); assert(io_u->flags & IO_U_F_FLIGHT); - io_u_clear(td, io_u, IO_U_F_FLIGHT | IO_U_F_BUSY_OK); + io_u_clear_inflight_flags(td, io_u); + invalidate_inflight(td, io_u); + + if (td->o.zone_mode == ZONE_MODE_ZBD && td->o.recover_zbd_write_error && + io_u->error && io_u->ddir == DDIR_WRITE && + !td_ioengine_flagged(td, FIO_SYNCIO)) + zbd_recover_write_error(td, io_u); /* * Mark IO ok to verify @@ -1983,7 +2150,8 @@ static void io_completed(struct thread_data *td, struct io_u **io_u_ptr, } if (ddir_sync(ddir)) { - td->last_was_sync = true; + if (io_u->error) + goto error; if (f) { f->first_write = -1ULL; f->last_write = -1ULL; @@ -1993,8 +2161,7 @@ static void io_completed(struct thread_data *td, struct io_u **io_u_ptr, return; } - td->last_was_sync = false; - td->last_ddir = ddir; + td->last_ddir_completed = ddir; if (!io_u->error && ddir_rw(ddir)) { unsigned long long bytes = io_u->xfer_buflen - io_u->resid; @@ -2038,6 +2205,7 @@ static void io_completed(struct thread_data *td, struct io_u **io_u_ptr, icd->error = ret; } } else if (io_u->error) { +error: icd->error = io_u->error; io_u_log_error(td, io_u); } @@ -2090,13 +2258,27 @@ static void ios_completed(struct thread_data *td, } } +static void io_u_update_bytes_done(struct thread_data *td, + struct io_completion_data *icd) +{ + int ddir; + + if (td->runstate == TD_VERIFYING) { + td->bytes_verified += icd->bytes_done[DDIR_READ]; + if (td_write(td)) + return; + } + + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) + td->bytes_done[ddir] += icd->bytes_done[ddir]; +} + /* * Complete a single io_u for the sync engines. 
*/ int io_u_sync_complete(struct thread_data *td, struct io_u *io_u) { struct io_completion_data icd; - int ddir; init_icd(td, &icd, 1); io_completed(td, &io_u, &icd); @@ -2109,8 +2291,7 @@ int io_u_sync_complete(struct thread_data *td, struct io_u *io_u) return -1; } - for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) - td->bytes_done[ddir] += icd.bytes_done[ddir]; + io_u_update_bytes_done(td, &icd); return 0; } @@ -2122,7 +2303,7 @@ int io_u_queued_complete(struct thread_data *td, int min_evts) { struct io_completion_data icd; struct timespec *tvp = NULL; - int ret, ddir; + int ret; struct timespec ts = { .tv_sec = 0, .tv_nsec = 0, }; dprint(FD_IO, "io_u_queued_complete: min=%d\n", min_evts); @@ -2148,8 +2329,7 @@ int io_u_queued_complete(struct thread_data *td, int min_evts) return -1; } - for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) - td->bytes_done[ddir] += icd.bytes_done[ddir]; + io_u_update_bytes_done(td, &icd); return ret; } @@ -2159,16 +2339,10 @@ int io_u_queued_complete(struct thread_data *td, int min_evts) */ void io_u_queued(struct thread_data *td, struct io_u *io_u) { - if (!td->o.disable_slat && ramp_time_over(td) && td->o.stats) { - unsigned long slat_time; - - slat_time = ntime_since(&io_u->start_time, &io_u->issue_time); - + if (!td->o.disable_slat && ramp_period_over(td) && td->o.stats) { if (td->parent) td = td->parent; - - add_slat_sample(td, io_u->ddir, slat_time, io_u->xfer_buflen, - io_u->offset, io_u->ioprio); + add_slat_sample(td, io_u); } } @@ -2296,7 +2470,11 @@ int do_io_u_sync(const struct thread_data *td, struct io_u *io_u) int ret; if (io_u->ddir == DDIR_SYNC) { +#ifdef CONFIG_FCNTL_SYNC + ret = fcntl(io_u->file->fd, F_FULLFSYNC); +#else ret = fsync(io_u->file->fd); +#endif } else if (io_u->ddir == DDIR_DATASYNC) { #ifdef CONFIG_FDATASYNC ret = fdatasync(io_u->file->fd); @@ -2317,7 +2495,7 @@ int do_io_u_sync(const struct thread_data *td, struct io_u *io_u) return ret; } -int do_io_u_trim(const struct thread_data *td, struct io_u *io_u) 
+int do_io_u_trim(struct thread_data *td, struct io_u *io_u) { #ifndef FIO_HAVE_TRIM io_u->error = EINVAL; diff --git a/io_u.h b/io_u.h index bdbac52577..68771ebab8 100644 --- a/io_u.h +++ b/io_u.h @@ -21,7 +21,9 @@ enum { IO_U_F_TRIMMED = 1 << 5, IO_U_F_BARRIER = 1 << 6, IO_U_F_VER_LIST = 1 << 7, - IO_U_F_HIGH_PRIO = 1 << 8, + IO_U_F_PATTERN_DONE = 1 << 8, + IO_U_F_DEVICE_ERROR = 1 << 9, + IO_U_F_VER_IN_DEV = 1 << 10, /* Verify data in device */ }; /* @@ -44,12 +46,18 @@ struct io_u { /* * Write generation */ - unsigned short numberio; + uint64_t numberio; /* * IO priority. */ unsigned short ioprio; + unsigned short clat_prio_index; + + /* + * number of trim ranges for this IO. + */ + unsigned int number_trim; /* * Allocated/set buffer and length @@ -82,14 +90,16 @@ struct io_u { unsigned long long resid; unsigned int error; + int inflight_idx; + /* * io engine private data */ union { unsigned int index; unsigned int seen; - void *engine_data; }; + void *engine_data; union { struct flist_head verify_list; @@ -103,8 +113,7 @@ struct io_u { * @success == true means that the I/O operation has been queued or * completed successfully. 
*/ - void (*zbd_queue_io)(struct thread_data *td, struct io_u *, int q, - bool success); + void (*zbd_queue_io)(struct thread_data *td, struct io_u *, int *q); /* * ZBD mode zbd_put_io callback: called in after completion of an I/O @@ -117,6 +126,9 @@ struct io_u { */ int (*end_io)(struct thread_data *, struct io_u **); + uint32_t dtype; + uint32_t dspec; + union { #ifdef CONFIG_LIBAIO struct iocb iocb; @@ -135,6 +147,7 @@ struct io_u { #endif void *mmap_data; }; + void *pi_attr; }; /* @@ -158,7 +171,7 @@ void io_u_mark_submit(struct thread_data *, unsigned int); bool queue_full(const struct thread_data *); int do_io_u_sync(const struct thread_data *, struct io_u *); -int do_io_u_trim(const struct thread_data *, struct io_u *); +int do_io_u_trim(struct thread_data *, struct io_u *); #ifdef FIO_INC_DEBUG static inline void dprint_io_u(struct io_u *io_u, const char *p) @@ -193,6 +206,5 @@ static inline enum fio_ddir acct_ddir(struct io_u *io_u) td_flags_clear((td), &(io_u->flags), (val)) #define io_u_set(td, io_u, val) \ td_flags_set((td), &(io_u)->flags, (val)) -#define io_u_is_high_prio(io_u) (io_u->flags & IO_U_F_HIGH_PRIO) #endif diff --git a/ioengines.c b/ioengines.c index d08a511a06..9f75e66c6d 100644 --- a/ioengines.c +++ b/ioengines.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "fio.h" #include "diskutil.h" @@ -24,6 +25,13 @@ static FLIST_HEAD(engine_list); +static inline bool async_ioengine_sync_trim(struct thread_data *td, + struct io_u *io_u) +{ + return td_ioengine_flagged(td, FIO_ASYNCIO_SYNC_TRIM) && + io_u->ddir == DDIR_TRIM; +} + static bool check_engine_ops(struct thread_data *td, struct ioengine_ops *ops) { if (ops->version != FIO_IOOPS_VERSION) { @@ -223,6 +231,8 @@ struct ioengine_ops *load_ioengine(struct thread_data *td) */ void free_ioengine(struct thread_data *td) { + assert(td != NULL && td->io_ops != NULL); + dprint(FD_IO, "free ioengine %s\n", td->io_ops->name); if (td->eo && td->io_ops->options) { @@ -333,8 +343,13 @@ 
enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u) * flag is now set */ if (td_offload_overlap(td)) { - int res = pthread_mutex_unlock(&overlap_check); - assert(res == 0); + int res; + + res = pthread_mutex_unlock(&overlap_check); + if (fio_unlikely(res != 0)) { + log_err("failed to unlock overlap check mutex, err: %i:%s", errno, strerror(errno)); + abort(); + } } assert(fio_file_open(io_u->file)); @@ -348,17 +363,17 @@ enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u) io_u->resid = 0; if (td_ioengine_flagged(td, FIO_SYNCIO) || - (td_ioengine_flagged(td, FIO_ASYNCIO_SYNC_TRIM) && - io_u->ddir == DDIR_TRIM)) { - if (fio_fill_issue_time(td)) + async_ioengine_sync_trim(td, io_u)) { + if (fio_fill_issue_time(td)) { fio_gettime(&io_u->issue_time, NULL); - /* - * only used for iolog - */ - if (td->o.read_iolog_file) - memcpy(&td->last_issue, &io_u->issue_time, - sizeof(io_u->issue_time)); + /* + * only used for iolog + */ + if (td->o.read_iolog_file) + memcpy(&td->last_issue, &io_u->issue_time, + sizeof(io_u->issue_time)); + } } @@ -371,14 +386,16 @@ enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u) } ret = td->io_ops->queue(td, io_u); - zbd_queue_io_u(td, io_u, ret); + zbd_queue_io_u(td, io_u, &ret); unlock_file(td, io_u->file); - if (ret == FIO_Q_BUSY && ddir_rw(ddir)) { - td->io_issues[ddir]--; - td->io_issue_bytes[ddir] -= buflen; - td->rate_io_issue_bytes[ddir] -= buflen; + if (ret == FIO_Q_BUSY) { + if (ddir_rw(ddir)) { + td->io_issues[ddir]--; + td->io_issue_bytes[ddir] -= buflen; + td->rate_io_issue_bytes[ddir] -= buflen; + } io_u_clear(td, io_u, IO_U_F_FLIGHT); } @@ -421,6 +438,8 @@ enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u) io_u_mark_depth(td, 1); td->ts.total_io_u[io_u->ddir]++; } + + td->last_ddir_issued = ddir; } else if (ret == FIO_Q_QUEUED) { td->io_u_queued++; @@ -430,20 +449,23 @@ enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u) if 
(td->io_u_queued >= td->o.iodepth_batch) td_io_commit(td); + + td->last_ddir_issued = ddir; } if (!td_ioengine_flagged(td, FIO_SYNCIO) && - (!td_ioengine_flagged(td, FIO_ASYNCIO_SYNC_TRIM) || - io_u->ddir != DDIR_TRIM)) { - if (fio_fill_issue_time(td)) + !async_ioengine_sync_trim(td, io_u)) { + if (fio_fill_issue_time(td) && + !td_ioengine_flagged(td, FIO_ASYNCIO_SETS_ISSUE_TIME)) { fio_gettime(&io_u->issue_time, NULL); - /* - * only used for iolog - */ - if (td->o.read_iolog_file) - memcpy(&td->last_issue, &io_u->issue_time, - sizeof(io_u->issue_time)); + /* + * only used for iolog + */ + if (td->o.read_iolog_file) + memcpy(&td->last_issue, &io_u->issue_time, + sizeof(io_u->issue_time)); + } } return ret; @@ -555,6 +577,10 @@ int td_io_open_file(struct thread_data *td, struct fio_file *f) flags = POSIX_FADV_RANDOM; else if (td->o.fadvise_hint == F_ADV_SEQUENTIAL) flags = POSIX_FADV_SEQUENTIAL; +#ifdef POSIX_FADV_NOREUSE + else if (td->o.fadvise_hint == F_ADV_NOREUSE) + flags = POSIX_FADV_NOREUSE; +#endif else { log_err("fio: unknown fadvise type %d\n", td->o.fadvise_hint); @@ -570,19 +596,21 @@ int td_io_open_file(struct thread_data *td, struct fio_file *f) if (fio_option_is_set(&td->o, write_hint) && (f->filetype == FIO_TYPE_BLOCK || f->filetype == FIO_TYPE_FILE)) { uint64_t hint = td->o.write_hint; - int cmd; + int res; /* - * For direct IO, we just need/want to set the hint on - * the file descriptor. For buffered IO, we need to set - * it on the inode. + * For direct IO, set the hint on the file descriptor if that is + * supported. Otherwise set it on the inode. For buffered IO, we + * need to set it on the inode. 
*/ - if (td->o.odirect) - cmd = F_SET_FILE_RW_HINT; - else - cmd = F_SET_RW_HINT; - - if (fcntl(f->fd, cmd, &hint) < 0) { + if (td->o.odirect) { + res = fcntl(f->fd, F_SET_FILE_RW_HINT, &hint); + if (res < 0) + res = fcntl(f->fd, F_SET_RW_HINT, &hint); + } else { + res = fcntl(f->fd, F_SET_RW_HINT, &hint); + } + if (res < 0) { td_verror(td, errno, "fcntl write hint"); goto err; } diff --git a/ioengines.h b/ioengines.h index b3f755b477..3d220a73ca 100644 --- a/ioengines.h +++ b/ioengines.h @@ -7,8 +7,9 @@ #include "flist.h" #include "io_u.h" #include "zbd_types.h" +#include "dataplacement.h" -#define FIO_IOOPS_VERSION 30 +#define FIO_IOOPS_VERSION 39 #ifndef CONFIG_DYNAMIC_ENGINES #define FIO_STATIC static @@ -39,8 +40,7 @@ struct ioengine_ops { int (*commit)(struct thread_data *); int (*getevents)(struct thread_data *, unsigned int, unsigned int, const struct timespec *); struct io_u *(*event)(struct thread_data *, int); - char *(*errdetails)(struct io_u *); - int (*cancel)(struct thread_data *, struct io_u *); + char *(*errdetails)(struct thread_data *, struct io_u *); void (*cleanup)(struct thread_data *); int (*open_file)(struct thread_data *, struct fio_file *); int (*close_file)(struct thread_data *, struct fio_file *); @@ -59,30 +59,70 @@ struct ioengine_ops { uint64_t, struct zbd_zone *, unsigned int); int (*reset_wp)(struct thread_data *, struct fio_file *, uint64_t, uint64_t); + int (*move_zone_wp)(struct thread_data *, struct fio_file *, + struct zbd_zone *, uint64_t, const char *); int (*get_max_open_zones)(struct thread_data *, struct fio_file *, unsigned int *); + int (*get_max_active_zones)(struct thread_data *, struct fio_file *, + unsigned int *); + int (*finish_zone)(struct thread_data *, struct fio_file *, + uint64_t, uint64_t); + int (*fdp_fetch_ruhs)(struct thread_data *, struct fio_file *, + struct fio_ruhs_info *); int option_struct_size; struct fio_option *options; }; +enum { + __FIO_SYNCIO = 0, /* io engine has synchronous ->queue */ + 
__FIO_RAWIO, /* some sort of direct/raw io */ + __FIO_DISKLESSIO, /* no disk involved */ + __FIO_NOEXTEND, /* engine can't extend file */ + __FIO_NODISKUTIL, /* diskutil can't handle filename */ + __FIO_UNIDIR, /* engine is uni-directional */ + __FIO_NOIO, /* thread does only pseudo IO */ + __FIO_PIPEIO, /* input/output no seekable */ + __FIO_BARRIER, /* engine supports barriers */ + __FIO_MEMALIGN, /* engine wants aligned memory */ + __FIO_BIT_BASED, /* engine uses a bit base (e.g. uses Kbit as opposed to + KB) */ + __FIO_FAKEIO, /* engine pretends to do IO */ + __FIO_NOSTATS, /* don't do IO stats */ + __FIO_NOFILEHASH, /* doesn't hash the files for lookup later. */ + __FIO_ASYNCIO_SYNC_TRIM, /* io engine has async ->queue except for trim */ + __FIO_NO_OFFLOAD, /* no async offload */ + __FIO_ASYNCIO_SETS_ISSUE_TIME, /* async ioengine with commit function that sets + issue_time */ + __FIO_SKIPPABLE_IOMEM_ALLOC, /* skip iomem_alloc & iomem_free if job sets mem/iomem */ + __FIO_RO_NEEDS_RW_OPEN, /* open files in rw mode even if we have a read job; only + affects ioengines using generic_open_file */ + __FIO_MULTI_RANGE_TRIM, /* ioengine supports trim with more than one range */ + __FIO_ATOMICWRITES, /* ioengine supports atomic writes */ + __FIO_IOENGINE_F_LAST, /* not a real bit; used to count number of bits */ +}; + enum fio_ioengine_flags { - FIO_SYNCIO = 1 << 0, /* io engine has synchronous ->queue */ - FIO_RAWIO = 1 << 1, /* some sort of direct/raw io */ - FIO_DISKLESSIO = 1 << 2, /* no disk involved */ - FIO_NOEXTEND = 1 << 3, /* engine can't extend file */ - FIO_NODISKUTIL = 1 << 4, /* diskutil can't handle filename */ - FIO_UNIDIR = 1 << 5, /* engine is uni-directional */ - FIO_NOIO = 1 << 6, /* thread does only pseudo IO */ - FIO_PIPEIO = 1 << 7, /* input/output no seekable */ - FIO_BARRIER = 1 << 8, /* engine supports barriers */ - FIO_MEMALIGN = 1 << 9, /* engine wants aligned memory */ - FIO_BIT_BASED = 1 << 10, /* engine uses a bit base (e.g. 
uses Kbit as opposed to KB) */ - FIO_FAKEIO = 1 << 11, /* engine pretends to do IO */ - FIO_NOSTATS = 1 << 12, /* don't do IO stats */ - FIO_NOFILEHASH = 1 << 13, /* doesn't hash the files for lookup later. */ - FIO_ASYNCIO_SYNC_TRIM - = 1 << 14, /* io engine has async ->queue except for trim */ - FIO_NO_OFFLOAD = 1 << 15, /* no async offload */ + FIO_SYNCIO = 1 << __FIO_SYNCIO, + FIO_RAWIO = 1 << __FIO_RAWIO, + FIO_DISKLESSIO = 1 << __FIO_DISKLESSIO, + FIO_NOEXTEND = 1 << __FIO_NOEXTEND, + FIO_NODISKUTIL = 1 << __FIO_NODISKUTIL, + FIO_UNIDIR = 1 << __FIO_UNIDIR, + FIO_NOIO = 1 << __FIO_NOIO, + FIO_PIPEIO = 1 << __FIO_PIPEIO, + FIO_BARRIER = 1 << __FIO_BARRIER, + FIO_MEMALIGN = 1 << __FIO_MEMALIGN, + FIO_BIT_BASED = 1 << __FIO_BIT_BASED, + FIO_FAKEIO = 1 << __FIO_FAKEIO, + FIO_NOSTATS = 1 << __FIO_NOSTATS, + FIO_NOFILEHASH = 1 << __FIO_NOFILEHASH, + FIO_ASYNCIO_SYNC_TRIM = 1 << __FIO_ASYNCIO_SYNC_TRIM, + FIO_NO_OFFLOAD = 1 << __FIO_NO_OFFLOAD, + FIO_ASYNCIO_SETS_ISSUE_TIME = 1 << __FIO_ASYNCIO_SETS_ISSUE_TIME, + FIO_SKIPPABLE_IOMEM_ALLOC = 1 << __FIO_SKIPPABLE_IOMEM_ALLOC, + FIO_RO_NEEDS_RW_OPEN = 1 << __FIO_RO_NEEDS_RW_OPEN, + FIO_MULTI_RANGE_TRIM = 1 << __FIO_MULTI_RANGE_TRIM, + FIO_ATOMICWRITES = 1 << __FIO_ATOMICWRITES, }; /* diff --git a/iolog.c b/iolog.c index 1aeb7a76b2..dcf6083c7d 100644 --- a/iolog.c +++ b/iolog.c @@ -31,6 +31,7 @@ static int iolog_flush(struct io_log *log); static const char iolog_ver2[] = "fio version 2 iolog"; +static const char iolog_ver3[] = "fio version 3 iolog"; void queue_io_piece(struct thread_data *td, struct io_piece *ipo) { @@ -40,18 +41,24 @@ void queue_io_piece(struct thread_data *td, struct io_piece *ipo) void log_io_u(const struct thread_data *td, const struct io_u *io_u) { + struct timespec now; + if (!td->o.write_iolog_file) return; - fprintf(td->iolog_f, "%s %s %llu %llu\n", io_u->file->file_name, - io_ddir_name(io_u->ddir), - io_u->offset, io_u->buflen); + fio_gettime(&now, NULL); + fprintf(td->iolog_f, "%llu %s %s %llu 
%llu\n", + (unsigned long long) utime_since_now(&td->io_log_start_time), + io_u->file->file_name, io_ddir_name(io_u->ddir), io_u->offset, + io_u->buflen); + } void log_file(struct thread_data *td, struct fio_file *f, enum file_log_act what) { const char *act[] = { "add", "open", "close" }; + struct timespec now; assert(what < 3); @@ -65,15 +72,18 @@ void log_file(struct thread_data *td, struct fio_file *f, if (!td->iolog_f) return; - fprintf(td->iolog_f, "%s %s\n", f->file_name, act[what]); + fio_gettime(&now, NULL); + fprintf(td->iolog_f, "%llu %s %s\n", + (unsigned long long) utime_since_now(&td->io_log_start_time), + f->file_name, act[what]); } static void iolog_delay(struct thread_data *td, unsigned long delay) { uint64_t usec = utime_since_now(&td->last_issue); unsigned long orig_delay = delay; - uint64_t this_delay; struct timespec ts; + int ret = 0; if (delay < td->time_offset) { td->time_offset = 0; @@ -87,13 +97,15 @@ static void iolog_delay(struct thread_data *td, unsigned long delay) delay -= usec; fio_gettime(&ts, NULL); - while (delay && !td->terminate) { - this_delay = delay; - if (this_delay > 500000) - this_delay = 500000; - usec_sleep(td, this_delay); - delay -= this_delay; + while (delay && !td->terminate) { + ret = io_u_queued_complete(td, 0); + if (ret < 0) + td_verror(td, -ret, "io_u_queued_complete"); + if (td->flags & TD_F_REGROW_LOGS) + regrow_logs(td); + if (utime_since_now(&ts) > delay) + break; } usec = utime_since_now(&ts); @@ -116,6 +128,10 @@ static int ipo_special(struct thread_data *td, struct io_piece *ipo) f = td->files[ipo->fileno]; + if (ipo->delay) + iolog_delay(td, ipo->delay); + if (fio_fill_issue_time(td)) + fio_gettime(&td->last_issue, NULL); switch (ipo->file_action) { case FIO_LOG_OPEN_FILE: if (td->o.replay_redirect && fio_file_open(f)) { @@ -124,8 +140,17 @@ static int ipo_special(struct thread_data *td, struct io_piece *ipo) break; } ret = td_io_open_file(td, f); - if (!ret) + if (!ret) { + if (td->o.dp_type != 
FIO_DP_NONE) { + int dp_init_ret = dp_init(td); + + if (dp_init_ret != 0) { + td_verror(td, abs(dp_init_ret), "dp_init"); + return -1; + } + } break; + } td_verror(td, ret, "iolog open file"); return -1; case FIO_LOG_CLOSE_FILE: @@ -134,6 +159,11 @@ static int ipo_special(struct thread_data *td, struct io_piece *ipo) case FIO_LOG_UNLINK_FILE: td_io_unlink_file(td, f); break; + case FIO_LOG_ADD_FILE: + /* + * Nothing to do + */ + break; default: log_err("fio: bad file action %d\n", ipo->file_action); break; @@ -142,7 +172,25 @@ static int ipo_special(struct thread_data *td, struct io_piece *ipo) return 1; } -static bool read_iolog2(struct thread_data *td); +static bool read_iolog(struct thread_data *td); + +unsigned long long delay_since_ttime(const struct thread_data *td, + unsigned long long time) +{ + double tmp; + double scale; + const unsigned long long *last_ttime = &td->io_log_last_ttime; + + if (!*last_ttime || td->o.no_stall || time < *last_ttime) + return 0; + else if (td->o.replay_time_scale == 100) + return time - *last_ttime; + + + scale = (double) 100.0 / (double) td->o.replay_time_scale; + tmp = time - *last_ttime; + return tmp * scale; +} int read_iolog_get(struct thread_data *td, struct io_u *io_u) { @@ -152,10 +200,15 @@ int read_iolog_get(struct thread_data *td, struct io_u *io_u) while (!flist_empty(&td->io_log_list)) { int ret; - if (!td->io_log_blktrace && td->o.read_iolog_chunked) { + if (td->o.read_iolog_chunked) { if (td->io_log_checkmark == td->io_log_current) { - if (!read_iolog2(td)) - return 1; + if (td->io_log_blktrace) { + if (!read_blktrace(td)) + return 1; + } else { + if (!read_iolog(td)) + return 1; + } } td->io_log_current--; } @@ -183,6 +236,9 @@ int read_iolog_get(struct thread_data *td, struct io_u *io_u) io_u->buflen, io_u->file->file_name); if (ipo->delay) iolog_delay(td, ipo->delay); + + if (td->o.dp_type != FIO_DP_NONE) + dp_fill_dspec_data(td, io_u); } else { elapsed = mtime_since_genesis(); if (ipo->delay > elapsed) @@ 
-245,11 +301,12 @@ void log_io_piece(struct thread_data *td, struct io_u *io_u) } /* - * Only sort writes if we don't have a random map in which case we need - * to check for duplicate blocks and drop the old one, which we rely on - * the rb insert/lookup for handling. + * Sort writes if we don't have a random map in which case we need to + * check for duplicate blocks and drop the old one, which we rely on + * the rb insert/lookup for handling. Sort writes if we have offset + * modifier which can also create duplicate blocks. */ - if (file_randommap(td, ipo->file)) { + if (!fio_offset_overlap_risk(td)) { INIT_FLIST_HEAD(&ipo->list); flist_add_tail(&ipo->list, &td->io_hist_list); ipo->flags |= IP_F_ONLIST; @@ -355,7 +412,7 @@ void write_iolog_close(struct thread_data *td) td->iolog_buf = NULL; } -static int64_t iolog_items_to_fetch(struct thread_data *td) +int64_t iolog_items_to_fetch(struct thread_data *td) { struct timespec now; uint64_t elapsed; @@ -383,20 +440,27 @@ static int64_t iolog_items_to_fetch(struct thread_data *td) return items_to_fetch; } +#define io_act(_td, _r) (((_td)->io_log_version == 3 && (r) == 5) || \ + ((_td)->io_log_version == 2 && (r) == 4)) +#define file_act(_td, _r) (((_td)->io_log_version == 3 && (r) == 3) || \ + ((_td)->io_log_version == 2 && (r) == 2)) + /* - * Read version 2 iolog data. It is enhanced to include per-file logging, + * Read version 2 and 3 iolog data. It is enhanced to include per-file logging, * syncs, etc. 
*/ -static bool read_iolog2(struct thread_data *td) +static bool read_iolog(struct thread_data *td) { unsigned long long offset; unsigned int bytes; - int reads, writes, waits, fileno = 0, file_action = 0; /* stupid gcc */ + unsigned long long delay = 0; + int reads, writes, trims, waits, fileno = 0, file_action = 0; /* stupid gcc */ char *rfname, *fname, *act; char *str, *p; enum fio_ddir rw; bool realloc = false; int64_t items_to_fetch = 0; + int syncs; if (td->o.read_iolog_chunked) { items_to_fetch = iolog_items_to_fetch(td); @@ -412,40 +476,62 @@ static bool read_iolog2(struct thread_data *td) rfname = fname = malloc(256+16); act = malloc(256+16); - reads = writes = waits = 0; + syncs = reads = writes = trims = waits = 0; while ((p = fgets(str, 4096, td->io_log_rfile)) != NULL) { struct io_piece *ipo; int r; + unsigned long long ttime; - r = sscanf(p, "%256s %256s %llu %u", rfname, act, &offset, - &bytes); + if (td->io_log_version == 3) { + r = sscanf(p, "%llu %256s %256s %llu %u", &ttime, rfname, act, + &offset, &bytes); + delay = delay_since_ttime(td, ttime); + td->io_log_last_ttime = ttime; + /* + * "wait" is not allowed with version 3 + */ + if (!strcmp(act, "wait")) { + log_err("iolog: ignoring wait command with" + " version 3 for file %s\n", fname); + continue; + } + } else /* version 2 */ + r = sscanf(p, "%256s %256s %llu %u", rfname, act, &offset, &bytes); if (td->o.replay_redirect) fname = td->o.replay_redirect; - if (r == 4) { + if (io_act(td, r)) { /* * Check action first */ if (!strcmp(act, "wait")) rw = DDIR_WAIT; - else if (!strcmp(act, "read")) + else if (!strcmp(act, "read")) { + if (td->o.replay_skip & (1u << DDIR_READ)) + continue; rw = DDIR_READ; - else if (!strcmp(act, "write")) + } else if (!strcmp(act, "write")) { + if (td->o.replay_skip & (1u << DDIR_WRITE)) + continue; rw = DDIR_WRITE; - else if (!strcmp(act, "sync")) + } else if (!strcmp(act, "sync")) { + if (td->o.replay_skip & (1u << DDIR_SYNC)) + continue; rw = DDIR_SYNC; - else if 
(!strcmp(act, "datasync")) + } else if (!strcmp(act, "datasync")) rw = DDIR_DATASYNC; - else if (!strcmp(act, "trim")) + else if (!strcmp(act, "trim")) { + if (td->o.replay_skip & (1u << DDIR_TRIM)) + continue; rw = DDIR_TRIM; - else { + } else { log_err("fio: bad iolog file action: %s\n", act); continue; } fileno = get_fileno(td, fname); - } else if (r == 2) { + } else if (file_act(td, r)) { rw = DDIR_INVAL; if (!strcmp(act, "add")) { if (td->o.replay_redirect && @@ -456,7 +542,6 @@ static bool read_iolog2(struct thread_data *td) fileno = add_file(td, fname, td->subjob_number, 1); file_action = FIO_LOG_ADD_FILE; } - continue; } else if (!strcmp(act, "open")) { fileno = get_fileno(td, fname); file_action = FIO_LOG_OPEN_FILE; @@ -469,7 +554,7 @@ static bool read_iolog2(struct thread_data *td) continue; } } else { - log_err("bad iolog2: %s\n", p); + log_err("bad iolog%d: %s\n", td->io_log_version, p); continue; } @@ -482,12 +567,21 @@ static bool read_iolog2(struct thread_data *td) if (read_only) continue; writes++; + } else if (rw == DDIR_TRIM) { + /* + * Don't add a trim for ro mode + */ + if (read_only) + continue; + trims++; } else if (rw == DDIR_WAIT) { if (td->o.no_stall) continue; waits++; } else if (rw == DDIR_INVAL) { - } else if (!ddir_sync(rw)) { + } else if (ddir_sync(rw)) { + syncs++; + } else { log_err("bad ddir: %d\n", rw); continue; } @@ -498,6 +592,8 @@ static bool read_iolog2(struct thread_data *td) ipo = calloc(1, sizeof(*ipo)); init_ipo(ipo); ipo->ddir = rw; + if (td->io_log_version == 3) + ipo->delay = delay; if (rw == DDIR_WAIT) { ipo->delay = offset; } else { @@ -542,6 +638,8 @@ static bool read_iolog2(struct thread_data *td) " read-only\n", td->o.name, writes); writes = 0; } + if (syncs) + td->flags |= TD_F_SYNCS; if (td->o.read_iolog_chunked) { if (td->io_log_current == 0) { @@ -552,19 +650,22 @@ static bool read_iolog2(struct thread_data *td) { io_u_quiesce(td); free_io_mem(td); - init_io_u_buffers(td); + if (init_io_u_buffers(td)) + return 
false; } return true; } - if (!reads && !writes && !waits) + if (!reads && !writes && !waits && !trims) return false; - else if (reads && !writes) - td->o.td_ddir = TD_DDIR_READ; - else if (!reads && writes) - td->o.td_ddir = TD_DDIR_WRITE; - else - td->o.td_ddir = TD_DDIR_RW; + + td->o.td_ddir = 0; + if (reads) + td->o.td_ddir |= TD_DDIR_READ; + if (writes) + td->o.td_ddir |= TD_DDIR_WRITE; + if (trims) + td->o.td_ddir |= TD_DDIR_TRIM; return true; } @@ -626,8 +727,6 @@ static bool init_iolog_read(struct thread_data *td, char *fname) } else f = fopen(fname, "r"); - free(fname); - if (!f) { perror("fopen read iolog"); return false; @@ -642,18 +741,22 @@ static bool init_iolog_read(struct thread_data *td, char *fname) } /* - * version 2 of the iolog stores a specific string as the + * versions 2 and 3 of the iolog store a specific string as the * first line, check for that */ - if (!strncmp(iolog_ver2, buffer, strlen(iolog_ver2))) { - free_release_files(td); - td->io_log_rfile = f; - return read_iolog2(td); + if (!strncmp(iolog_ver2, buffer, strlen(iolog_ver2))) + td->io_log_version = 2; + else if (!strncmp(iolog_ver3, buffer, strlen(iolog_ver3))) + td->io_log_version = 3; + else { + log_err("fio: iolog version 1 is no longer supported\n"); + fclose(f); + return false; } - log_err("fio: iolog version 1 is no longer supported\n"); - fclose(f); - return false; + free_release_files(td); + td->io_log_rfile = f; + return read_iolog(td); } /* @@ -677,11 +780,12 @@ static bool init_iolog_write(struct thread_data *td) td->iolog_f = f; td->iolog_buf = malloc(8192); setvbuf(f, td->iolog_buf, _IOFBF, 8192); + fio_gettime(&td->io_log_start_time, NULL); /* * write our version line */ - if (fprintf(f, "%s\n", iolog_ver2) < 0) { + if (fprintf(f, "%s\n", iolog_ver3) < 0) { perror("iolog init\n"); return false; } @@ -709,11 +813,12 @@ bool init_iolog(struct thread_data *td) */ if (is_blktrace(fname, &need_swap)) { td->io_log_blktrace = 1; - ret = load_blktrace(td, fname, need_swap); 
+ ret = init_blktrace_read(td, fname, need_swap); } else { td->io_log_blktrace = 0; ret = init_iolog_read(td, fname); } + free(fname); } else if (td->o.write_iolog_file) ret = init_iolog_write(td); else @@ -722,6 +827,8 @@ bool init_iolog(struct thread_data *td) if (!ret) td_verror(td, EINVAL, "failed initializing iolog"); + init_disk_util(td); + return ret; } @@ -734,10 +841,12 @@ void setup_log(struct io_log **log, struct log_params *p, struct flist_head *list; l = scalloc(1, sizeof(*l)); + assert(l); INIT_FLIST_HEAD(&l->io_logs); l->log_type = p->log_type; l->log_offset = p->log_offset; l->log_prio = p->log_prio; + l->log_issue_time = p->log_issue_time; l->log_gz = p->log_gz; l->log_gz_store = p->log_gz_store; l->avg_msec = p->avg_msec; @@ -772,6 +881,16 @@ void setup_log(struct io_log **log, struct log_params *p, l->log_ddir_mask = LOG_OFFSET_SAMPLE_BIT; if (l->log_prio) l->log_ddir_mask |= LOG_PRIO_SAMPLE_BIT; + /* + * The bandwidth-log option generates agg-read_bw.log, + * agg-write_bw.log and agg-trim_bw.log for which l->td is NULL. + * Check if l->td is valid before dereferencing it. 
+ */ + if (l->td && l->td->o.log_max == IO_LOG_SAMPLE_BOTH) + l->log_ddir_mask |= LOG_AVG_MAX_SAMPLE_BIT; + + if (l->log_issue_time) + l->log_ddir_mask |= LOG_ISSUE_TIME_SAMPLE_BIT; INIT_FLIST_HEAD(&l->chunk_list); @@ -855,7 +974,7 @@ static void flush_hist_samples(FILE *f, int hist_coarseness, void *samples, uint64_t sample_size) { struct io_sample *s; - int log_offset; + bool log_offset, log_issue_time; uint64_t i, j, nr_samples; struct io_u_plat_entry *entry, *entry_before; uint64_t *io_u_plat; @@ -866,13 +985,14 @@ static void flush_hist_samples(FILE *f, int hist_coarseness, void *samples, if (!sample_size) return; - s = __get_sample(samples, 0, 0); + s = __get_sample(samples, 0, 0, 0); log_offset = (s->__ddir & LOG_OFFSET_SAMPLE_BIT) != 0; + log_issue_time = (s->__ddir & LOG_ISSUE_TIME_SAMPLE_BIT) != 0; - nr_samples = sample_size / __log_entry_sz(log_offset); + nr_samples = sample_size / __log_entry_sz(log_offset, log_issue_time); for (i = 0; i < nr_samples; i++) { - s = __get_sample(samples, log_offset, i); + s = __get_sample(samples, log_offset, log_issue_time, i); entry = s->data.plat_entry; io_u_plat = entry->io_u_plat; @@ -895,59 +1015,101 @@ static void flush_hist_samples(FILE *f, int hist_coarseness, void *samples, } } +static int print_sample_fields(char **p, size_t *left, const char *fmt, ...) { + va_list ap; + int ret; + + va_start(ap, fmt); + ret = vsnprintf(*p, *left, fmt, ap); + if (ret < 0 || ret >= *left) { + log_err("sample file write failed: %d\n", ret); + va_end(ap); + return -1; + } + va_end(ap); + + *p += ret; + *left -= ret; + + return 0; +} + +/* + * flush_samples - Generate output for log samples + * Each sample output is built using a temporary buffer. 
This buffer size + * assumptions are: + * - Each sample has less than 10 fields + * - Each sample field fits in 25 characters (20 digits for 64 bit number + * and a few bytes delimiter) + */ void flush_samples(FILE *f, void *samples, uint64_t sample_size) { struct io_sample *s; - int log_offset, log_prio; + bool log_offset, log_prio, log_avg_max, log_issue_time; uint64_t i, nr_samples; - unsigned int prio_val; - const char *fmt; + char buf[256]; + char *p; + size_t left; + int ret; if (!sample_size) return; - s = __get_sample(samples, 0, 0); + s = __get_sample(samples, 0, 0, 0); log_offset = (s->__ddir & LOG_OFFSET_SAMPLE_BIT) != 0; log_prio = (s->__ddir & LOG_PRIO_SAMPLE_BIT) != 0; + log_avg_max = (s->__ddir & LOG_AVG_MAX_SAMPLE_BIT) != 0; + log_issue_time = (s->__ddir & LOG_ISSUE_TIME_SAMPLE_BIT) != 0; - if (log_offset) { - if (log_prio) - fmt = "%lu, %" PRId64 ", %u, %llu, %llu, 0x%04x\n"; - else - fmt = "%lu, %" PRId64 ", %u, %llu, %llu, %u\n"; - } else { - if (log_prio) - fmt = "%lu, %" PRId64 ", %u, %llu, 0x%04x\n"; - else - fmt = "%lu, %" PRId64 ", %u, %llu, %u\n"; - } - - nr_samples = sample_size / __log_entry_sz(log_offset); + nr_samples = sample_size / __log_entry_sz(log_offset, log_issue_time); for (i = 0; i < nr_samples; i++) { - s = __get_sample(samples, log_offset, i); + s = __get_sample(samples, log_offset, log_issue_time, i); + p = buf; + left = sizeof(buf); + + ret = print_sample_fields(&p, &left, "%" PRIu64 ", %" PRId64, + s->time, s->data.val.val0); + if (ret) + return; + + if (log_avg_max) { + ret = print_sample_fields(&p, &left, ", %" PRId64, + s->data.val.val1); + if (ret) + return; + } + + ret = print_sample_fields(&p, &left, ", %u, %llu", + io_sample_ddir(s), + (unsigned long long) s->bs); + if (ret) + return; + + if (log_offset) { + ret = print_sample_fields(&p, &left, ", %llu", + (unsigned long long) s->aux[IOS_AUX_OFFSET_INDEX]); + if (ret) + return; + } if (log_prio) - prio_val = s->priority; + ret = print_sample_fields(&p, &left, ", 
0x%04x", + s->priority); else - prio_val = ioprio_value_is_class_rt(s->priority); - - if (!log_offset) { - fprintf(f, fmt, - (unsigned long) s->time, - s->data.val, - io_sample_ddir(s), (unsigned long long) s->bs, - prio_val); - } else { - struct io_sample_offset *so = (void *) s; - - fprintf(f, fmt, - (unsigned long) s->time, - s->data.val, - io_sample_ddir(s), (unsigned long long) s->bs, - (unsigned long long) so->offset, - prio_val); + ret = print_sample_fields(&p, &left, ", %u", + ioprio_value_is_class_rt(s->priority)); + if (ret) + return; + + if (log_issue_time) { + ret = print_sample_fields(&p, &left, ", %llu", + (unsigned long long) s->aux[IOS_AUX_ISSUE_TIME_INDEX]); + if (ret) + return; } + + fprintf(f, "%s\n", buf); } } @@ -1146,7 +1308,7 @@ int iolog_file_inflate(const char *file) void *buf; FILE *f; - f = fopen(file, "r"); + f = fopen(file, "rb"); if (!f) { perror("fopen"); return 1; @@ -1228,10 +1390,21 @@ void flush_log(struct io_log *log, bool do_append) void *buf; FILE *f; + /* + * If log_gz_store is true, we are writing a binary file. + * Set the mode appropriately (on all platforms) to avoid issues + * on windows (line-ending conversions, etc.) + */ if (!do_append) - f = fopen(log->filename, "w"); + if (log->log_gz_store) + f = fopen(log->filename, "wb"); + else + f = fopen(log->filename, "w"); else - f = fopen(log->filename, "a"); + if (log->log_gz_store) + f = fopen(log->filename, "ab"); + else + f = fopen(log->filename, "a"); if (!f) { perror("fopen log"); return; @@ -1502,14 +1675,14 @@ void iolog_compress_exit(struct thread_data *td) * Queue work item to compress the existing log entries. We reset the * current log to a small size, and reference the existing log in the * data that we queue for compression. Once compression has been done, - * this old log is freed. If called with finish == true, will not return - * until the log compression has completed, and will flush all previous - * logs too + * this old log is freed. 
Will not return until the log compression + * has completed, and will flush all previous logs too */ static int iolog_flush(struct io_log *log) { struct iolog_flush_data *data; + workqueue_flush(&log->td->log_compress_wq); data = malloc(sizeof(*data)); if (!data) return 1; @@ -1774,9 +1947,7 @@ void td_writeout_logs(struct thread_data *td, bool unit_logs) void fio_writeout_logs(bool unit_logs) { - struct thread_data *td; - int i; - - for_each_td(td, i) + for_each_td(td) { td_writeout_logs(td, unit_logs); + } end_for_each(); } diff --git a/iolog.h b/iolog.h index 7d66b7c42f..b52ae87d3e 100644 --- a/iolog.h +++ b/iolog.h @@ -26,13 +26,23 @@ struct io_hist { struct flist_head list; }; +enum { + IO_LOG_SAMPLE_AVG = 0, + IO_LOG_SAMPLE_MAX, + IO_LOG_SAMPLE_BOTH, +}; + +struct io_sample_value { + uint64_t val0; + uint64_t val1; +}; union io_sample_data { - uint64_t val; + struct io_sample_value val; struct io_u_plat_entry *plat_entry; }; -#define sample_val(value) ((union io_sample_data) { .val = value }) +#define sample_val(value) ((union io_sample_data) { .val.val0 = value }) #define sample_plat(plat) ((union io_sample_data) { .plat_entry = plat }) /* @@ -44,11 +54,15 @@ struct io_sample { uint32_t __ddir; uint16_t priority; uint64_t bs; + uint64_t aux[]; }; -struct io_sample_offset { - struct io_sample s; - uint64_t offset; +/* + * Enumerate indexes of auxiliary log data in struct io_sample aux[] array + */ +enum { + IOS_AUX_OFFSET_INDEX, + IOS_AUX_ISSUE_TIME_INDEX, }; enum { @@ -109,6 +123,11 @@ struct io_log { */ unsigned int log_prio; + /* + * Log I/O issuing time + */ + unsigned int log_issue_time; + /* * Max size of log entries before a chunk is compressed */ @@ -154,8 +173,18 @@ struct io_log { * If the bit following the upper bit is set, then we have the priority */ #define LOG_PRIO_SAMPLE_BIT 0x40000000U +/* + * If the bit following the priority sample bit is set, we report both avg and max + */ +#define LOG_AVG_MAX_SAMPLE_BIT 0x20000000U +/* + * If the bit
following AVG_MAX_SAMPLE_BIT is set, we report the issue time also + */ +#define LOG_ISSUE_TIME_SAMPLE_BIT 0x10000000U -#define LOG_SAMPLE_BITS (LOG_OFFSET_SAMPLE_BIT | LOG_PRIO_SAMPLE_BIT) +#define LOG_SAMPLE_BITS (LOG_OFFSET_SAMPLE_BIT | LOG_PRIO_SAMPLE_BIT |\ + LOG_AVG_MAX_SAMPLE_BIT |\ + LOG_ISSUE_TIME_SAMPLE_BIT) #define io_sample_ddir(io) ((io)->__ddir & ~LOG_SAMPLE_BITS) static inline void io_sample_set_ddir(struct io_log *log, @@ -165,17 +194,22 @@ static inline void io_sample_set_ddir(struct io_log *log, io->__ddir = ddir | log->log_ddir_mask; } -static inline size_t __log_entry_sz(int log_offset) +static inline size_t __log_entry_sz(bool log_offset, bool log_issue_time) { + size_t ret = sizeof(struct io_sample); + if (log_offset) - return sizeof(struct io_sample_offset); - else - return sizeof(struct io_sample); + ret += sizeof(uint64_t); + + if (log_issue_time) + ret += sizeof(uint64_t); + + return ret; } static inline size_t log_entry_sz(struct io_log *log) { - return __log_entry_sz(log->log_offset); + return __log_entry_sz(log->log_offset, log->log_issue_time); } static inline size_t log_sample_sz(struct io_log *log, struct io_logs *cur_log) @@ -183,10 +217,12 @@ static inline size_t log_sample_sz(struct io_log *log, struct io_logs *cur_log) return cur_log->nr_samples * log_entry_sz(log); } -static inline struct io_sample *__get_sample(void *samples, int log_offset, +static inline struct io_sample *__get_sample(void *samples, bool log_offset, + bool log_issue_time, uint64_t sample) { - uint64_t sample_offset = sample * __log_entry_sz(log_offset); + uint64_t sample_offset = sample * + __log_entry_sz(log_offset, log_issue_time); return (struct io_sample *) ((char *) samples + sample_offset); } @@ -199,7 +235,8 @@ static inline struct io_sample *get_sample(struct io_log *iolog, struct io_logs *cur_log, uint64_t sample) { - return __get_sample(cur_log->log, iolog->log_offset, sample); + return __get_sample(cur_log->log, + iolog->log_offset, 
iolog->log_issue_time, sample); } enum { @@ -223,14 +260,12 @@ struct io_piece { struct fio_file *file; }; unsigned long long offset; - unsigned short numberio; + uint64_t numberio; unsigned long len; unsigned int flags; enum fio_ddir ddir; - union { - unsigned long delay; - unsigned int file_action; - }; + unsigned long delay; + unsigned int file_action; }; /* @@ -254,10 +289,13 @@ extern void trim_io_piece(const struct io_u *); extern void queue_io_piece(struct thread_data *, struct io_piece *); extern void prune_io_piece_log(struct thread_data *); extern void write_iolog_close(struct thread_data *); +int64_t iolog_items_to_fetch(struct thread_data *td); extern int iolog_compress_init(struct thread_data *, struct sk_out *); extern void iolog_compress_exit(struct thread_data *); extern size_t log_chunk_sizes(struct io_log *); extern int init_io_u_buffers(struct thread_data *); +extern unsigned long long delay_since_ttime(const struct thread_data *, + unsigned long long); #ifdef CONFIG_ZLIB extern int iolog_file_inflate(const char *); @@ -274,6 +312,7 @@ struct log_params { int log_type; int log_offset; int log_prio; + int log_issue_time; int log_gz; int log_gz_store; int log_compress; diff --git a/json.c b/json.c index cd3d5d74db..6375b3c2fa 100644 --- a/json.c +++ b/json.c @@ -56,9 +56,6 @@ static char *strdup_escape(const char *str) char *p, *ret; int escapes; - if (!strlen(str)) - return NULL; - escapes = 0; while ((input = strpbrk(input, "\\\"")) != NULL) { escapes++; diff --git a/json.h b/json.h index d98242638d..66bb06b1f1 100644 --- a/json.h +++ b/json.h @@ -81,8 +81,13 @@ static inline int json_object_add_value_string(struct json_object *obj, struct json_value arg = { .type = JSON_TYPE_STRING, }; + union { + const char *a; + char *b; + } string; - arg.string = strdup(val ? : ""); + string.a = val ? 
val : ""; + arg.string = string.b; return json_object_add_value_type(obj, name, &arg); } diff --git a/lib/lfsr.c b/lib/lfsr.c index a32e850a70..e86086c4af 100644 --- a/lib/lfsr.c +++ b/lib/lfsr.c @@ -88,37 +88,37 @@ static inline void __lfsr_next(struct fio_lfsr *fl, unsigned int spin) */ switch (spin) { case 15: __LFSR_NEXT(fl, fl->last_val); - fallthrough; + fio_fallthrough; case 14: __LFSR_NEXT(fl, fl->last_val); - fallthrough; + fio_fallthrough; case 13: __LFSR_NEXT(fl, fl->last_val); - fallthrough; + fio_fallthrough; case 12: __LFSR_NEXT(fl, fl->last_val); - fallthrough; + fio_fallthrough; case 11: __LFSR_NEXT(fl, fl->last_val); - fallthrough; + fio_fallthrough; case 10: __LFSR_NEXT(fl, fl->last_val); - fallthrough; + fio_fallthrough; case 9: __LFSR_NEXT(fl, fl->last_val); - fallthrough; + fio_fallthrough; case 8: __LFSR_NEXT(fl, fl->last_val); - fallthrough; + fio_fallthrough; case 7: __LFSR_NEXT(fl, fl->last_val); - fallthrough; + fio_fallthrough; case 6: __LFSR_NEXT(fl, fl->last_val); - fallthrough; + fio_fallthrough; case 5: __LFSR_NEXT(fl, fl->last_val); - fallthrough; + fio_fallthrough; case 4: __LFSR_NEXT(fl, fl->last_val); - fallthrough; + fio_fallthrough; case 3: __LFSR_NEXT(fl, fl->last_val); - fallthrough; + fio_fallthrough; case 2: __LFSR_NEXT(fl, fl->last_val); - fallthrough; + fio_fallthrough; case 1: __LFSR_NEXT(fl, fl->last_val); - fallthrough; + fio_fallthrough; case 0: __LFSR_NEXT(fl, fl->last_val); - fallthrough; + fio_fallthrough; default: break; } } diff --git a/lib/memcpy.c b/lib/memcpy.c index a552134357..61eb63b577 100644 --- a/lib/memcpy.c +++ b/lib/memcpy.c @@ -5,9 +5,9 @@ #include "memcpy.h" #include "rand.h" +#include "../os/os.h" #include "../fio_time.h" #include "../gettime.h" -#include "../os/os.h" #define BUF_SIZE 32 * 1024 * 1024ULL diff --git a/lib/num2str.c b/lib/num2str.c index cd89a0e591..e48a483cdc 100644 --- a/lib/num2str.c +++ b/lib/num2str.c @@ -7,6 +7,37 @@ #include "../oslib/asprintf.h" #include "num2str.h" + +static 
const char *iecstr[] = { "", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei" }; + +/** + * bytes2str_simple - Converts a byte value to a human-readable string. + * @buf: buffer to store the resulting string + * @bufsize: size of the buffer + * @bytes: number of bytes to convert + * @returns : pointer to the buf containing the formatted string. + * Converts the given byte value into a human-readable string using IEC units + * (e.g., KiB, MiB, GiB), and stores the result in the provided buffer. + * The output is formatted with two decimal places of precision. + */ +const char *bytes2str_simple(char *buf, size_t bufsize, uint64_t bytes) +{ + int unit = 0; + double size = (double)bytes; + + buf[0] = '\0'; + + while (size >= 1024.0 && unit < FIO_ARRAY_SIZE(iecstr) - 1) { + size /= 1024.0; + unit++; + } + + snprintf(buf, bufsize, "%.2f %sB", size, iecstr[unit]); + + return buf; +} + + /** * num2str() - Cheesy number->string conversion, complete with carry rounding error. * @num: quantity (e.g., number of blocks, bytes or bits) @@ -19,7 +50,6 @@ char *num2str(uint64_t num, int maxlen, int base, int pow2, enum n2s_unit units) { const char *sistr[] = { "", "k", "M", "G", "T", "P", "E" }; - const char *iecstr[] = { "", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei" }; const char **unitprefix; static const char *const unitstr[] = { [N2S_NONE] = "", diff --git a/lib/num2str.h b/lib/num2str.h index 797288b5af..8ca2050ac4 100644 --- a/lib/num2str.h +++ b/lib/num2str.h @@ -14,4 +14,6 @@ enum n2s_unit { extern char *num2str(uint64_t, int, int, int, enum n2s_unit); +extern const char *bytes2str_simple(char *buf, size_t bufsize, uint64_t bytes); + #endif diff --git a/lib/pattern.c b/lib/pattern.c index 680a12be7e..9fca643e32 100644 --- a/lib/pattern.c +++ b/lib/pattern.c @@ -32,7 +32,7 @@ static const char *parse_file(const char *beg, char *out, const char *end; char *file; int fd; - ssize_t count; + ssize_t rc, count = 0; if (!out_len) goto err_out; @@ -47,13 +47,32 @@ static const char *parse_file(const 
char *beg, char *out, if (file == NULL) goto err_out; +#ifdef _WIN32 + fd = open(file, O_RDONLY | O_BINARY); +#else fd = open(file, O_RDONLY); +#endif if (fd < 0) goto err_free_out; - count = read(fd, out, out_len); - if (count == -1) - goto err_free_close_out; + if (out) { + while (1) { + rc = read(fd, out, out_len - count); + if (rc == 0) + break; + if (rc == -1) + goto err_free_close_out; + + count += rc; + out += rc; + } + } else { + count = lseek(fd, 0, SEEK_END); + if (count == -1) + goto err_free_close_out; + if (count >= out_len) + count = out_len; + } *filled = count; close(fd); @@ -100,7 +119,8 @@ static const char *parse_string(const char *beg, char *out, if (end - beg > out_len) return NULL; - memcpy(out, beg, end - beg); + if (out) + memcpy(out, beg, end - beg); *filled = end - beg; /* Catch up quote */ @@ -156,12 +176,14 @@ static const char *parse_number(const char *beg, char *out, i = 0; if (!lval) { num = 0; - out[i] = 0x00; + if (out) + out[i] = 0x00; i = 1; } else { val = (unsigned int)lval; for (; val && out_len; out_len--, i++, val >>= 8) - out[i] = val & 0xff; + if (out) + out[i] = val & 0xff; if (val) return NULL; } @@ -183,7 +205,8 @@ static const char *parse_number(const char *beg, char *out, const char *fmt; fmt = (num & 1 ? "%1hhx" : "%2hhx"); - sscanf(beg, fmt, &out[i]); + if (out) + sscanf(beg, fmt, &out[i]); if (num & 1) { num++; beg--; @@ -211,7 +234,7 @@ static const char *parse_number(const char *beg, char *out, * This function tries to find formats, e.g.: * %o - offset of the block * - * In case of successfull parsing it fills the format param + * In case of successful parsing it fills the format param * with proper offset and the size of the expected value, which * should be pasted into buffer using the format 'func' callback. 
* @@ -251,7 +274,8 @@ static const char *parse_format(const char *in, char *out, unsigned int parsed, if (f->desc->len > out_len) return NULL; - memset(out, '\0', f->desc->len); + if (out) + memset(out, '\0', f->desc->len); *filled = f->desc->len; return in + len; @@ -262,12 +286,14 @@ static const char *parse_format(const char *in, char *out, unsigned int parsed, * numbers and pattern formats. * @in - string input * @in_len - size of the input string - * @out - output buffer where parsed result will be put + * @out - output buffer where parsed result will be put, may be NULL + * in which case this function just calculates the required + * length of the buffer * @out_len - lengths of the output buffer * @fmt_desc - array of pattern format descriptors [input] * @fmt - array of pattern formats [output] * @fmt_sz - pointer where the size of pattern formats array stored [input], - * after successfull parsing this pointer will contain the number + * after successful parsing this pointer will contain the number * of parsed formats if any [output]. * * strings: @@ -275,7 +301,7 @@ static const char *parse_format(const char *in, char *out, unsigned int parsed, * NOTE: there is no way to escape quote, so "123\"abc" does not work. * * numbers: - * hexidecimal - sequence of hex bytes starting from 0x or 0X prefix, + * hexadecimal - sequence of hex bytes starting from 0x or 0X prefix, * e.g. 0xff12ceff1100ff * decimal - decimal number in range [INT_MIN, INT_MAX] * @@ -305,16 +331,16 @@ static const char *parse_format(const char *in, char *out, unsigned int parsed, * * Returns number of bytes filled or err < 0 in case of failure. 
*/ -int parse_and_fill_pattern(const char *in, unsigned int in_len, - char *out, unsigned int out_len, - const struct pattern_fmt_desc *fmt_desc, - struct pattern_fmt *fmt, - unsigned int *fmt_sz_out) +static int parse_and_fill_pattern(const char *in, unsigned int in_len, + char *out, unsigned int out_len, + const struct pattern_fmt_desc *fmt_desc, + struct pattern_fmt *fmt, + unsigned int *fmt_sz_out) { const char *beg, *end, *out_beg = out; unsigned int total = 0, fmt_rem = 0; - if (!in || !in_len || !out || !out_len) + if (!in || !in_len || !out_len) return -EINVAL; if (fmt_sz_out) fmt_rem = *fmt_sz_out; @@ -360,8 +386,9 @@ int parse_and_fill_pattern(const char *in, unsigned int in_len, assert(filled); assert(filled <= out_len); out_len -= filled; - out += filled; total += filled; + if (out) + out += filled; } while (in_len); @@ -370,6 +397,48 @@ int parse_and_fill_pattern(const char *in, unsigned int in_len, return total; } +/** + * parse_and_fill_pattern_alloc() - Parses combined input, which consists of + * strings, numbers and pattern formats and + * allocates a buffer for the result. + * + * @in - string input + * @in_len - size of the input string + * @out - pointer to the output buffer pointer, this will be set to the newly + * allocated pattern buffer which must be freed by the caller + * @fmt_desc - array of pattern format descriptors [input] + * @fmt - array of pattern formats [output] + * @fmt_sz - pointer where the size of pattern formats array stored [input], + * after successful parsing this pointer will contain the number + * of parsed formats if any [output]. + * + * See documentation on parse_and_fill_pattern() above for a description + * of the functionality. + * + * Returns number of bytes filled or err < 0 in case of failure. 
+ */ +int parse_and_fill_pattern_alloc(const char *in, unsigned int in_len, + char **out, const struct pattern_fmt_desc *fmt_desc, + struct pattern_fmt *fmt, unsigned int *fmt_sz_out) +{ + int count; + + count = parse_and_fill_pattern(in, in_len, NULL, MAX_PATTERN_SIZE, + fmt_desc, fmt, fmt_sz_out); + if (count < 0) + return count; + + *out = malloc(count); + count = parse_and_fill_pattern(in, in_len, *out, count, fmt_desc, + fmt, fmt_sz_out); + if (count < 0) { + free(*out); + *out = NULL; + } + + return count; +} + /** * dup_pattern() - Duplicates part of the pattern all over the buffer. * diff --git a/lib/pattern.h b/lib/pattern.h index a6d9d6b427..7123b42d67 100644 --- a/lib/pattern.h +++ b/lib/pattern.h @@ -1,6 +1,19 @@ #ifndef FIO_PARSE_PATTERN_H #define FIO_PARSE_PATTERN_H +/* + * The pattern is dynamically allocated, but that doesn't mean there + * are not limits. The network protocol has a limit of + * FIO_SERVER_MAX_CMD_MB and potentially two patterns must fit in there. + * There's also a need to verify the incoming data from the network and + * this provides a sensible check. + * + * 128MiB is an arbitrary limit that meets these criteria. The patterns + * tend to be truncated at the IO size anyway and IO sizes that large + * aren't terribly practical. + */ +#define MAX_PATTERN_SIZE (128 << 20) + /** * Pattern format description. The input for 'parse_pattern'. 
* Describes format with its name and callback, which should @@ -21,11 +34,9 @@ struct pattern_fmt { const struct pattern_fmt_desc *desc; }; -int parse_and_fill_pattern(const char *in, unsigned int in_len, - char *out, unsigned int out_len, - const struct pattern_fmt_desc *fmt_desc, - struct pattern_fmt *fmt, - unsigned int *fmt_sz_out); +int parse_and_fill_pattern_alloc(const char *in, unsigned int in_len, + char **out, const struct pattern_fmt_desc *fmt_desc, + struct pattern_fmt *fmt, unsigned int *fmt_sz_out); int paste_format_inplace(char *pattern, unsigned int pattern_len, struct pattern_fmt *fmt, unsigned int fmt_sz, diff --git a/lib/rand.c b/lib/rand.c index 6e893e80ba..0e787a62ba 100644 --- a/lib/rand.c +++ b/lib/rand.c @@ -95,31 +95,50 @@ void init_rand_seed(struct frand_state *state, uint64_t seed, bool use64) __init_rand64(&state->state64, seed); } +void __fill_random_buf_small(void *buf, unsigned int len, uint64_t seed) +{ + uint64_t *b = buf; + uint64_t *e = b + len / sizeof(*b); + unsigned int rest = len % sizeof(*b); + + for (; b != e; ++b) { + *b = seed; + seed = __hash_u64(seed); + } + + if (fio_unlikely(rest)) + __builtin_memcpy(e, &seed, rest); +} + void __fill_random_buf(void *buf, unsigned int len, uint64_t seed) { - void *ptr = buf; + static uint64_t prime[] = {1, 2, 3, 5, 7, 11, 13, 17, + 19, 23, 29, 31, 37, 41, 43, 47}; + uint64_t *b, *e, s[CONFIG_SEED_BUCKETS]; + unsigned int rest; + int p; - while (len) { - int this_len; - - if (len >= sizeof(int64_t)) { - *((int64_t *) ptr) = seed; - this_len = sizeof(int64_t); - } else if (len >= sizeof(int32_t)) { - *((int32_t *) ptr) = seed; - this_len = sizeof(int32_t); - } else if (len >= sizeof(int16_t)) { - *((int16_t *) ptr) = seed; - this_len = sizeof(int16_t); - } else { - *((int8_t *) ptr) = seed; - this_len = sizeof(int8_t); + /* + * Calculate the max index which is multiples of the seed buckets. 
+ */ + rest = (len / sizeof(*b) / CONFIG_SEED_BUCKETS) * CONFIG_SEED_BUCKETS; + + b = buf; + e = b + rest; + + rest = len - (rest * sizeof(*b)); + + for (p = 0; p < CONFIG_SEED_BUCKETS; p++) + s[p] = seed * prime[p]; + + for (; b != e; b += CONFIG_SEED_BUCKETS) { + for (p = 0; p < CONFIG_SEED_BUCKETS; ++p) { + b[p] = s[p]; + s[p] = __hash_u64(s[p]); } - ptr += this_len; - len -= this_len; - seed *= GOLDEN_RATIO_PRIME; - seed >>= 3; } + + __fill_random_buf_small(b, rest, s[0]); } uint64_t fill_random_buf(struct frand_state *fs, void *buf, diff --git a/libfio.c b/libfio.c index 198eaf2eb7..57f3f858dc 100644 --- a/libfio.c +++ b/libfio.c @@ -74,6 +74,8 @@ static const char *fio_arch_strings[arch_nr] = { "hppa", "mips", "aarch64", + "loongarch64", + "riscv64", "generic" }; @@ -87,18 +89,18 @@ static void reset_io_counters(struct thread_data *td, int all) td->this_io_bytes[ddir] = 0; td->stat_io_blocks[ddir] = 0; td->this_io_blocks[ddir] = 0; - td->rate_bytes[ddir] = 0; - td->rate_blocks[ddir] = 0; + td->last_rate_check_bytes[ddir] = 0; + td->last_rate_check_blocks[ddir] = 0; td->bytes_done[ddir] = 0; td->rate_io_issue_bytes[ddir] = 0; td->rate_next_io_time[ddir] = 0; td->last_usec[ddir] = 0; } + td->bytes_verified = 0; } td->zone_bytes = 0; - td->last_was_sync = false; td->rwmix_issues = 0; /* @@ -126,14 +128,20 @@ void clear_io_state(struct thread_data *td, int all) */ if (td->o.rand_repeatable) td_fill_rand_seeds(td); + + clear_inflight(td); } void reset_all_stats(struct thread_data *td) { + unsigned long long b; int i; reset_io_counters(td, 1); + b = ddir_rw_sum(td->thinktime_blocks_counter); + td->last_thinktime_blocks -= b; + for (i = 0; i < DDIR_RWDIR_CNT; i++) { td->io_bytes[i] = 0; td->io_blocks[i] = 0; @@ -142,12 +150,14 @@ void reset_all_stats(struct thread_data *td) td->ts.runtime[i] = 0; } - set_epoch_time(td, td->o.log_unix_epoch); + set_epoch_time(td, td->o.log_alternate_epoch_clock_id, td->o.job_start_clock_id); memcpy(&td->start, &td->epoch, 
sizeof(td->epoch)); memcpy(&td->iops_sample_time, &td->epoch, sizeof(td->epoch)); memcpy(&td->bw_sample_time, &td->epoch, sizeof(td->epoch)); memcpy(&td->ss.prev_time, &td->epoch, sizeof(td->epoch)); + td->last_thinktime = td->epoch; + lat_target_reset(td); clear_rusage_stat(td); helper_reset(); @@ -239,13 +249,11 @@ void fio_mark_td_terminate(struct thread_data *td) void fio_terminate_threads(unsigned int group_id, unsigned int terminate) { - struct thread_data *td; pid_t pid = getpid(); - int i; dprint(FD_PROCESS, "terminate group_id=%d\n", group_id); - for_each_td(td, i) { + for_each_td(td) { if ((terminate == TERMINATE_GROUP && group_id == TERMINATE_ALL) || (terminate == TERMINATE_GROUP && group_id == td->groupid) || (terminate == TERMINATE_STONEWALL && td->runstate >= TD_RUNNING) || @@ -273,22 +281,20 @@ void fio_terminate_threads(unsigned int group_id, unsigned int terminate) ops->terminate(td); } } - } + } end_for_each(); } int fio_running_or_pending_io_threads(void) { - struct thread_data *td; - int i; int nr_io_threads = 0; - for_each_td(td, i) { + for_each_td(td) { if (td->io_ops_init && td_ioengine_flagged(td, FIO_NOIO)) continue; nr_io_threads++; if (td->runstate < TD_EXITED) return 1; - } + } end_for_each(); if (!nr_io_threads) return -1; /* we only had cpuio threads to begin with */ @@ -373,6 +379,7 @@ int initialize_fio(char *envp[]) compiletime_assert((offsetof(struct jobs_eta, m_rate) % 8) == 0, "m_rate"); compiletime_assert(__TD_F_LAST <= TD_ENG_FLAG_SHIFT, "TD_ENG_FLAG_SHIFT"); + compiletime_assert((__TD_F_LAST + __FIO_IOENGINE_F_LAST) <= 8*sizeof(((struct thread_data *)0)->flags), "td->flags"); compiletime_assert(BSSPLIT_MAX <= ZONESPLIT_MAX, "bsssplit/zone max"); err = endian_check(); @@ -399,10 +406,6 @@ int initialize_fio(char *envp[]) return 1; } -#if !defined(CONFIG_GETTIMEOFDAY) && !defined(CONFIG_CLOCK_GETTIME) -#error "No available clock source!" 
-#endif - arch_init(envp); sinit(); diff --git a/log.c b/log.c index 237bac2889..df58ea07a5 100644 --- a/log.c +++ b/log.c @@ -1,3 +1,5 @@ +#include "log.h" + #include #include #include diff --git a/memory.c b/memory.c index 6cf7333375..29d9393098 100644 --- a/memory.c +++ b/memory.c @@ -215,6 +215,9 @@ static int alloc_mem_cudamalloc(struct thread_data *td, size_t total_mem) #ifdef CONFIG_CUDA CUresult ret; char name[128]; +#ifdef CONFIG_CUDA13 + CUctxCreateParams ctx_params = {}; +#endif ret = cuInit(0); if (ret != CUDA_SUCCESS) { @@ -250,7 +253,11 @@ static int alloc_mem_cudamalloc(struct thread_data *td, size_t total_mem) dprint(FD_MEM, "dev_id = [%d], device name = [%s]\n", \ td->gpu_dev_id, name); +#ifdef CONFIG_CUDA13 + ret = cuCtxCreate(&td->cu_ctx, &ctx_params, CU_CTX_MAP_HOST, td->cu_dev); +#else ret = cuCtxCreate(&td->cu_ctx, CU_CTX_MAP_HOST, td->cu_dev); +#endif if (ret != CUDA_SUCCESS) { log_err("fio: failed to create cuda context: %d\n", ret); return 1; @@ -295,7 +302,7 @@ int allocate_io_mem(struct thread_data *td) total_mem = td->orig_buffer_size; - if (td->o.odirect || td->o.mem_align || td->o.oatomic || + if (td->o.odirect || td->o.mem_align || td_ioengine_flagged(td, FIO_MEMALIGN)) { total_mem += page_mask; if (td->o.mem_align && td->o.mem_align > page_size) @@ -305,16 +312,18 @@ int allocate_io_mem(struct thread_data *td) dprint(FD_MEM, "Alloc %llu for buffers\n", (unsigned long long) total_mem); /* - * If the IO engine has hooks to allocate/free memory, use those. But - * error out if the user explicitly asked for something else. + * If the IO engine has hooks to allocate/free memory and the user + * doesn't explicitly ask for something else, use those. But fail if the + * user asks for something else with an engine that doesn't allow that. 
*/ - if (td->io_ops->iomem_alloc) { - if (fio_option_is_set(&td->o, mem_type)) { - log_err("fio: option 'mem/iomem' conflicts with specified IO engine\n"); - ret = 1; - } else - ret = td->io_ops->iomem_alloc(td, total_mem); - } else if (td->o.mem_type == MEM_MALLOC) + if (td->io_ops->iomem_alloc && fio_option_is_set(&td->o, mem_type) && + !td_ioengine_flagged(td, FIO_SKIPPABLE_IOMEM_ALLOC)) { + log_err("fio: option 'mem/iomem' conflicts with specified IO engine\n"); + ret = 1; + } else if (td->io_ops->iomem_alloc && + !fio_option_is_set(&td->o, mem_type)) + ret = td->io_ops->iomem_alloc(td, total_mem); + else if (td->o.mem_type == MEM_MALLOC) ret = alloc_mem_malloc(td, total_mem); else if (td->o.mem_type == MEM_SHM || td->o.mem_type == MEM_SHMHUGE) ret = alloc_mem_shm(td, total_mem); @@ -339,10 +348,10 @@ void free_io_mem(struct thread_data *td) unsigned int total_mem; total_mem = td->orig_buffer_size; - if (td->o.odirect || td->o.oatomic) + if (td->o.odirect) total_mem += page_mask; - if (td->io_ops->iomem_alloc) { + if (td->io_ops->iomem_alloc && !fio_option_is_set(&td->o, mem_type)) { if (td->io_ops->iomem_free) td->io_ops->iomem_free(td); } else if (td->o.mem_type == MEM_MALLOC) diff --git a/mock-tests/Makefile b/mock-tests/Makefile new file mode 100644 index 0000000000..4d44887009 --- /dev/null +++ b/mock-tests/Makefile @@ -0,0 +1,80 @@ +# Makefile for FIO mock tests +# +# These tests validate specific algorithmic improvements and edge cases +# using isolated mock implementations. + +CC ?= gcc +CFLAGS = -Wall -Wextra -O2 -g -I. -I.. 
-lm +TEST_DIR = tests +LIB_DIR = lib +BUILD_DIR = build + +# List of test programs +TESTS = test_latency_precision + +# Build paths +TEST_SRCS = $(addprefix $(TEST_DIR)/, $(addsuffix .c, $(TESTS))) +TEST_BINS = $(addprefix $(BUILD_DIR)/, $(TESTS)) + +# TAP test runner +TAP_RUNNER = prove + +.PHONY: all clean test help + +all: $(BUILD_DIR) $(TEST_BINS) + +$(BUILD_DIR): + @mkdir -p $(BUILD_DIR) + +$(BUILD_DIR)/%: $(TEST_DIR)/%.c $(LIB_DIR)/tap.h + $(CC) $(CFLAGS) -o $@ $< + +test: all + @echo "Running FIO mock tests..." + @echo "=========================" + @failed=0; \ + for test in $(TEST_BINS); do \ + echo "Running $$test..."; \ + ./$$test; \ + if [ $$? -ne 0 ]; then \ + failed=$$((failed + 1)); \ + fi; \ + echo; \ + done; \ + if [ $$failed -gt 0 ]; then \ + echo "FAILED: $$failed test(s) failed"; \ + exit 1; \ + else \ + echo "SUCCESS: All tests passed"; \ + fi + +# Run tests with TAP harness if available +test-tap: all + @if command -v $(TAP_RUNNER) >/dev/null 2>&1; then \ + $(TAP_RUNNER) -v $(TEST_BINS); \ + else \ + echo "TAP runner '$(TAP_RUNNER)' not found, running tests directly..."; \ + $(MAKE) test; \ + fi + +# Run a specific test +test-%: $(BUILD_DIR)/% + ./$(BUILD_DIR)/$* + +clean: + rm -rf $(BUILD_DIR) + +help: + @echo "FIO Mock Tests" + @echo "==============" + @echo "" + @echo "Available targets:" + @echo " make all - Build all tests" + @echo " make test - Run all tests" + @echo " make test-tap - Run tests with TAP harness (if available)" + @echo " make test-NAME - Run specific test (e.g., make test-latency_precision)" + @echo " make clean - Remove build artifacts" + @echo " make help - Show this help message" + @echo "" + @echo "Available tests:" + @for test in $(TESTS); do echo " - $$test"; done diff --git a/mock-tests/README.md b/mock-tests/README.md new file mode 100644 index 0000000000..48d80cc5bc --- /dev/null +++ b/mock-tests/README.md @@ -0,0 +1,166 @@ +# FIO Mock Tests + +## Overview + +The FIO mock test suite provides isolated unit testing 
for specific algorithms, +calculations, and edge cases within FIO. These tests use mock implementations +to validate correctness without requiring the full FIO infrastructure. + +## Purpose and Goals + +### Why Mock Tests? + +1. **Isolation**: Test specific algorithms without full system dependencies +2. **Precision**: Validate numerical calculations and edge cases precisely +3. **Speed**: Run quickly without I/O operations or system calls +4. **Clarity**: Each test focuses on a single aspect with clear documentation +5. **Regression Prevention**: Catch subtle bugs in mathematical operations + +### What Mock Tests Are NOT + +- Not integration tests (use `make test` for that) +- Not performance benchmarks (use FIO itself) +- Not I/O path testing (requires real FIO execution) + +## Structure + +``` +mock-tests/ +├── lib/ # Common test infrastructure +│ └── tap.h # TAP (Test Anything Protocol) output support +├── tests/ # Individual test programs +│ └── test_*.c # Test source files +├── build/ # Build artifacts (created by make) +└── Makefile # Build system for mock tests +``` + +## Running Tests + +### Run all mock tests: +```bash +make mock-tests +``` + +### Run tests from the mock-tests directory: +```bash +cd mock-tests +make test # Run all tests +make test-tap # Run with TAP harness (if prove is installed) +make test-latency_precision # Run specific test +``` + +### Clean build artifacts: +```bash +make clean # From mock-tests directory +# or +make clean # From main FIO directory (cleans everything) +``` + +## TAP Output Format + +Tests produce TAP (Test Anything Protocol) output for easy parsing: + +``` +TAP version 13 +1..12 +ok 1 - Microsecond latency: 123456000 == 123456000 +ok 2 - Millisecond latency: 1234567890000 == 1234567890000 +not ok 3 - Some failing test +# All tests passed +``` + +This format is understood by many test harnesses and CI systems. + +## Writing New Mock Tests + +### 1. 
Create test file in `tests/`: + +```c +#include "../lib/tap.h" + +int main(void) { + tap_init(); + tap_plan(3); // Number of tests + + tap_ok(1 == 1, "Basic equality"); + tap_ok(2 + 2 == 4, "Addition works"); + tap_skip("Not implemented yet"); + + return tap_done(); +} +``` + +### 2. Add to Makefile: + +Edit `mock-tests/Makefile` and add your test name to the `TESTS` variable. + +### 3. Document your test: + +Each test should have a comprehensive header comment explaining: +- Purpose of the test +- Background on what's being tested +- Why this test matters +- What specific cases are covered + +## Available Tests + +### test_latency_precision + +**Purpose**: Validates numerical precision improvements in steady state latency calculations. + +**Background**: When calculating total latency from mean and sample count, large values +can cause precision loss or overflow. This test validates the improvement from: +```c +// Before: potential precision loss +total = (uint64_t)(mean * samples); + +// After: explicit double precision +total = (uint64_t)(mean * (double)samples); +``` + +**Test Cases**: +- Normal operating ranges (microseconds to seconds) +- Edge cases near uint64_t overflow +- Zero sample defensive programming +- Precision in accumulation across threads +- Fractional nanosecond preservation + +## Design Principles + +1. **Isolation**: Mock only what's needed, test one thing at a time +2. **Clarity**: Clear test names and diagnostic messages +3. **Coverage**: Test normal cases, edge cases, and error conditions +4. **Documentation**: Explain WHY each test exists +5. 
**Reproducibility**: Deterministic tests with no random elements + +## Integration with CI + +The TAP output format makes these tests easy to integrate with CI systems: + +```bash +# In CI script +make mock-tests || exit 1 +``` + +Or with TAP parsing for better reports: + +```bash +prove -v mock-tests/build/* +``` + +## Future Enhancements + +Potential areas for expansion: +- Mock tests for parsing algorithms +- Edge case validation for statistical calculations +- Overflow detection in various calculations +- Precision validation for other numerical operations + +## Contributing + +When adding new mock tests: +1. Follow the existing patterns +2. Document thoroughly +3. Use meaningful test descriptions +4. Include both positive and negative test cases +5. Test edge cases and boundary conditions diff --git a/mock-tests/lib/tap.h b/mock-tests/lib/tap.h new file mode 100644 index 0000000000..e5eb6b1399 --- /dev/null +++ b/mock-tests/lib/tap.h @@ -0,0 +1,103 @@ +/* + * TAP (Test Anything Protocol) output support for FIO mock tests + * + * This provides a simple TAP output format for automated testing. + * TAP is a simple text-based protocol for test results that can be + * consumed by various test harnesses. + * + * Format: + * TAP version 13 + * 1..N + * ok 1 - test description + * not ok 2 - test description + * # diagnostic message + */ + +#ifndef FIO_MOCK_TAP_H +#define FIO_MOCK_TAP_H + +#include +#include +#include + +static int tap_test_count = 0; +static int tap_failures = 0; +static bool tap_planned = false; + +/* Initialize TAP output */ +static inline void tap_init(void) { + printf("TAP version 13\n"); + tap_test_count = 0; + tap_failures = 0; + tap_planned = false; +} + +/* Plan the number of tests */ +static inline void tap_plan(int n) { + printf("1..%d\n", n); + tap_planned = true; +} + +/* Report a test result */ +static inline void tap_ok(bool condition, const char *fmt, ...) 
{ + va_list args; + tap_test_count++; + + if (condition) { + printf("ok %d - ", tap_test_count); + } else { + printf("not ok %d - ", tap_test_count); + tap_failures++; + } + + va_start(args, fmt); + vprintf(fmt, args); + va_end(args); + printf("\n"); +} + +/* Skip a test */ +static inline void tap_skip(const char *reason, ...) { + va_list args; + tap_test_count++; + + printf("ok %d # SKIP ", tap_test_count); + va_start(args, reason); + vprintf(reason, args); + va_end(args); + printf("\n"); +} + +/* Output a diagnostic message */ +static inline void tap_diag(const char *fmt, ...) { + va_list args; + printf("# "); + va_start(args, fmt); + vprintf(fmt, args); + va_end(args); + printf("\n"); +} + +/* Check if a value is within tolerance */ +static inline bool tap_within_tolerance(double actual, double expected, double tolerance) { + double diff = actual - expected; + if (diff < 0) diff = -diff; + return diff <= tolerance; +} + +/* Finish TAP output and return exit code */ +static inline int tap_done(void) { + if (!tap_planned) { + printf("1..%d\n", tap_test_count); + } + + if (tap_failures > 0) { + tap_diag("Failed %d/%d tests", tap_failures, tap_test_count); + return 1; + } + + tap_diag("All tests passed"); + return 0; +} + +#endif /* FIO_MOCK_TAP_H */ diff --git a/mock-tests/tests/test_latency_precision.c b/mock-tests/tests/test_latency_precision.c new file mode 100644 index 0000000000..fe8a94c5b9 --- /dev/null +++ b/mock-tests/tests/test_latency_precision.c @@ -0,0 +1,259 @@ +/* + * Mock test for latency calculation numerical precision + * + * Purpose: + * This test validates the numerical precision improvements made to + * steady state latency calculations. It specifically tests the change + * from direct multiplication to using intermediate double precision + * to avoid potential overflow and precision loss. 
+ * + * Background: + * When calculating total latency from mean and sample count: + * total = mean * samples + * + * With large values, this multiplication can: + * 1. Lose precision due to floating point representation + * 2. Overflow uint64_t limits + * 3. Accumulate rounding errors across multiple threads + * + * What we test: + * - Normal operating ranges (microseconds to seconds) + * - Edge cases near uint64_t overflow + * - Precision loss in accumulation + * - Defensive programming (zero sample handling) + */ + +#include +#include +#include +#include +#include +#include "../lib/tap.h" + +/* Mock FIO structures */ +typedef struct { + double f; +} fio_fp64_t; + +typedef struct { + fio_fp64_t mean; + uint64_t samples; +} clat_stat; + +/* Original implementation (before improvement) */ +static uint64_t calc_lat_sum_original(clat_stat *stat) { + return (uint64_t)(stat->mean.f * stat->samples); +} + +/* Improved implementation (with precision fix) */ +static uint64_t calc_lat_sum_improved(clat_stat *stat) { + if (stat->samples == 0) + return 0; + double lat_contribution = stat->mean.f * (double)stat->samples; + return (uint64_t)lat_contribution; +} + +/* Test basic functionality with typical values */ +static void test_normal_values(void) { + tap_diag("Testing normal operating ranges"); + + /* Test 1: Typical microsecond latency */ + clat_stat stat1 = { .mean = { .f = 1234.56 }, .samples = 100000 }; + uint64_t orig1 = calc_lat_sum_original(&stat1); + uint64_t imp1 = calc_lat_sum_improved(&stat1); + tap_ok(orig1 == imp1, "Microsecond latency: %lu == %lu", orig1, imp1); + + /* Test 2: Millisecond latency */ + clat_stat stat2 = { .mean = { .f = 1234567.89 }, .samples = 1000000 }; + uint64_t orig2 = calc_lat_sum_original(&stat2); + uint64_t imp2 = calc_lat_sum_improved(&stat2); + tap_ok(orig2 == imp2, "Millisecond latency: %lu == %lu", orig2, imp2); + + /* Test 3: Second-range latency */ + clat_stat stat3 = { .mean = { .f = 1000000000.0 }, .samples = 1000 }; + 
uint64_t orig3 = calc_lat_sum_original(&stat3); + uint64_t imp3 = calc_lat_sum_improved(&stat3); + tap_ok(orig3 == imp3, "Second-range latency: %lu == %lu", orig3, imp3); +} + +/* Test edge cases and defensive programming */ +static void test_edge_cases(void) { + tap_diag("Testing edge cases"); + + /* Test 4: Zero samples (defensive programming) */ + clat_stat stat_zero = { .mean = { .f = 1234567.89 }, .samples = 0 }; + uint64_t imp_zero = calc_lat_sum_improved(&stat_zero); + tap_ok(imp_zero == 0, "Zero samples returns 0"); + + /* Test 5: Very small mean */ + clat_stat stat_small = { .mean = { .f = 0.001 }, .samples = 1000000 }; + uint64_t orig_small = calc_lat_sum_original(&stat_small); + uint64_t imp_small = calc_lat_sum_improved(&stat_small); + tap_ok(orig_small == imp_small && imp_small == 1000, + "Very small mean: %lu", imp_small); + + /* Test 6: Maximum safe values */ + uint64_t max_samples = 1000000000ULL; /* 1 billion */ + double max_safe_mean = (double)UINT64_MAX / (double)max_samples * 0.99; + clat_stat stat_max = { .mean = { .f = max_safe_mean }, .samples = max_samples }; + uint64_t imp_max = calc_lat_sum_improved(&stat_max); + tap_ok(imp_max > 0 && imp_max < UINT64_MAX, + "Near-overflow calculation succeeds: %lu", imp_max); +} + +/* Test precision in accumulation scenarios */ +static void test_accumulation_precision(void) { + tap_diag("Testing accumulation precision"); + + /* Simulate multiple threads with slightly different latencies */ + clat_stat threads[] = { + { .mean = { .f = 1234567.891234 }, .samples = 1000000 }, + { .mean = { .f = 1234567.892345 }, .samples = 1000000 }, + { .mean = { .f = 1234567.893456 }, .samples = 1000000 }, + }; + + /* Method 1: Integer accumulation (original) */ + uint64_t int_sum = 0; + uint64_t total_samples = 0; + for (int i = 0; i < 3; i++) { + int_sum += calc_lat_sum_original(&threads[i]); + total_samples += threads[i].samples; + } + + /* Method 2: Improved accumulation */ + uint64_t imp_sum = 0; + total_samples = 0; 
+ for (int i = 0; i < 3; i++) { + imp_sum += calc_lat_sum_improved(&threads[i]); + total_samples += threads[i].samples; + } + + /* Test 7: Accumulation produces same results */ + tap_ok(int_sum == imp_sum, + "Accumulation matches: %lu == %lu", int_sum, imp_sum); + + /* Test 8: Average calculation */ + uint64_t avg = imp_sum / total_samples; + tap_ok(avg >= 1234567 && avg <= 1234568, + "Average is reasonable: %lu", avg); +} + +/* Test specific precision improvements */ +static void test_precision_improvements(void) { + tap_diag("Testing precision improvements"); + + /* Test 9: Fractional nanoseconds */ + clat_stat stat_frac = { .mean = { .f = 1234.567890123456 }, .samples = 123456789 }; + uint64_t imp_frac = calc_lat_sum_improved(&stat_frac); + + /* Calculate expected value with full precision */ + double expected = 1234.567890123456 * 123456789.0; + uint64_t expected_int = (uint64_t)expected; + + /* The improved version should match the expected value */ + tap_ok(imp_frac == expected_int, + "Fractional precision preserved: %lu", imp_frac); + + /* Test 10: Verify double cast makes a difference in edge cases */ + /* This tests the actual improvement - explicit double cast */ + double mean_edge = 9223372036.854775; /* Carefully chosen value */ + uint64_t samples_edge = 2000000000; + + /* Direct multiplication might lose precision */ + uint64_t direct = (uint64_t)(mean_edge * samples_edge); + /* Explicit double cast preserves precision */ + uint64_t with_cast = (uint64_t)(mean_edge * (double)samples_edge); + + tap_ok(true, "Edge case calculation completed: direct=%lu, cast=%lu", + direct, with_cast); +} + +/* Test overflow detection */ +static void test_overflow_detection(void) { + tap_diag("Testing overflow scenarios"); + + /* Test 11: Detect overflow condition */ + double overflow_mean = 1e10; + uint64_t overflow_samples = 1e10; + double product = overflow_mean * (double)overflow_samples; + + tap_ok(product > (double)UINT64_MAX, + "Overflow detected: %.3e > %.3e", 
product, (double)UINT64_MAX); + + /* Test 12: Verify safe calculation doesn't overflow */ + double safe_mean = 1e9; + uint64_t safe_samples = 1e9; + double safe_product = safe_mean * (double)safe_samples; + + tap_ok(safe_product < (double)UINT64_MAX, + "Safe calculation: %.3e < %.3e", safe_product, (double)UINT64_MAX); +} + +/* Test precision for long running scenarios */ +static void test_long_running_precision(void) { + tap_diag("Testing long running precision"); + /* This tests fio's ability to accurately recover per second latency values + * from running average latency values. Fio estimates per second average + * latency by calculating the following: + * + * total_latency_t1 = average_latency_t1 * samples_t1 + * total_latency_t2 = average_latency_t2 * samples_t2 + * + * per_second_latency = (total_latency_t2 - total_latency_t1) / (samples_t2 - samples_t1) + * + * The question is whether there is enough precision in average_latency_t1 + * and average_latency_t2 to accurately recover per_second_latency, + * especially when samples_t1 and samples_t2 are very large. 
+ */ + + /* Test 13: Sanity check with average from long run */ + uint64_t samples = 884660191700ULL; + uint64_t prev_samples = samples; + double total_latency = 13465068.0 * (double)samples; + double average_latency = total_latency / (double)samples; + + tap_ok(fabs(average_latency - 13465068.0) < 0.001*average_latency, + "Long run average latency accurate: %.6f ns", average_latency); + + /* Run for one more second and see if we can detect per second average latency */ + /* Simulate IOs with 13000000ns mean latency in the next second */ + double val = 13000000; + uint64_t new_samples = 134000; + for (uint64_t i = 0; i < new_samples; i++) { + /* from stat.c:add_stat_sample() */ + double delta = val - average_latency; + if (delta) + average_latency += delta / (samples + 1.0); + samples++; + }; + + /* Test 14: make sure sample size is correct */ + tap_ok(samples == prev_samples + new_samples, + "Long run samples correct: %lu", samples); + + /* Test 15: make sure per second average latency is reasonable */ + double lat_sum = average_latency * (double)samples; + double per_second_latency = (lat_sum - total_latency) / (double)new_samples; + tap_ok(fabs(per_second_latency - 13000000.0) < 0.001*per_second_latency, + "Long run per second latency accurate: %.6f ns", per_second_latency); +} + + +int main(void) { + tap_init(); + + /* We have 15 tests total */ + tap_plan(15); + + tap_diag("=== FIO Latency Precision Mock Test ==="); + tap_diag("Testing numerical precision improvements in steady state calculations"); + + test_normal_values(); + test_edge_cases(); + test_accumulation_precision(); + test_precision_improvements(); + test_overflow_detection(); + test_long_running_precision(); + + return tap_done(); +} diff --git a/optgroup.c b/optgroup.c index bebb4a5133..f6acf88fef 100644 --- a/optgroup.c +++ b/optgroup.c @@ -141,10 +141,6 @@ static const struct opt_group fio_opt_cat_groups[] = { .name = "RDMA I/O engine", /* rdma */ .mask = FIO_OPT_G_RDMA, }, - { - .name = 
"librpma I/O engines", /* librpma_apm && librpma_gpspm */ - .mask = FIO_OPT_G_LIBRPMA, - }, { .name = "libaio I/O engine", /* libaio */ .mask = FIO_OPT_G_LIBAIO, diff --git a/optgroup.h b/optgroup.h index 1fb84a296b..eb5e6f35eb 100644 --- a/optgroup.h +++ b/optgroup.h @@ -52,7 +52,6 @@ enum opt_category_group { __FIO_OPT_G_E4DEFRAG, __FIO_OPT_G_NETIO, __FIO_OPT_G_RDMA, - __FIO_OPT_G_LIBRPMA, __FIO_OPT_G_LIBAIO, __FIO_OPT_G_ACT, __FIO_OPT_G_LATPROF, @@ -71,6 +70,9 @@ enum opt_category_group { __FIO_OPT_G_LIBCUFILE, __FIO_OPT_G_DFS, __FIO_OPT_G_NFS, + __FIO_OPT_G_WINDOWSAIO, + __FIO_OPT_G_XNVME, + __FIO_OPT_G_LIBBLKIO, FIO_OPT_G_RATE = (1ULL << __FIO_OPT_G_RATE), FIO_OPT_G_ZONE = (1ULL << __FIO_OPT_G_ZONE), @@ -97,7 +99,6 @@ enum opt_category_group { FIO_OPT_G_E4DEFRAG = (1ULL << __FIO_OPT_G_E4DEFRAG), FIO_OPT_G_NETIO = (1ULL << __FIO_OPT_G_NETIO), FIO_OPT_G_RDMA = (1ULL << __FIO_OPT_G_RDMA), - FIO_OPT_G_LIBRPMA = (1ULL << __FIO_OPT_G_LIBRPMA), FIO_OPT_G_LIBAIO = (1ULL << __FIO_OPT_G_LIBAIO), FIO_OPT_G_ACT = (1ULL << __FIO_OPT_G_ACT), FIO_OPT_G_LATPROF = (1ULL << __FIO_OPT_G_LATPROF), @@ -116,6 +117,9 @@ enum opt_category_group { FIO_OPT_G_FILESTAT = (1ULL << __FIO_OPT_G_FILESTAT), FIO_OPT_G_LIBCUFILE = (1ULL << __FIO_OPT_G_LIBCUFILE), FIO_OPT_G_DFS = (1ULL << __FIO_OPT_G_DFS), + FIO_OPT_G_WINDOWSAIO = (1ULL << __FIO_OPT_G_WINDOWSAIO), + FIO_OPT_G_XNVME = (1ULL << __FIO_OPT_G_XNVME), + FIO_OPT_G_LIBBLKIO = (1ULL << __FIO_OPT_G_LIBBLKIO), }; extern const struct opt_group *opt_group_from_mask(uint64_t *mask); diff --git a/options.c b/options.c index 102bcf5661..f592bc24d9 100644 --- a/options.c +++ b/options.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -251,6 +252,114 @@ int str_split_parse(struct thread_data *td, char *str, return ret; } +static int fio_fdp_cmp(const void *p1, const void *p2) +{ + const uint16_t *t1 = p1; + const uint16_t *t2 = p2; + + return *t1 - *t2; +} + +static int str_fdp_pli_cb(void *data, const char *input) +{ + 
struct thread_data *td = cb_data_to_td(data); + char *str, *p, *id1; + int i = 0, ret = 0; + + if (!input) + return 1; + + p = str = strdup(input); + strip_blank_front(&str); + strip_blank_end(str); + + while ((id1 = strsep(&str, ",")) != NULL) { + char *str2, *id2; + unsigned int start, end; + + if (!strlen(id1)) + break; + + str2 = id1; + end = -1; + while ((id2 = strsep(&str2, "-")) != NULL) { + if (!strlen(id2)) + break; + + end = strtoull(id2, NULL, 0); + } + + start = strtoull(id1, NULL, 0); + if (end == -1) + end = start; + if (start > end) { + ret = 1; + break; + } + + while (start <= end) { + if (i >= FIO_MAX_DP_IDS) { + log_err("fio: only %d IDs supported\n", FIO_MAX_DP_IDS); + ret = 1; + break; + } + if (start > 0xFFFF) { + log_err("Placement IDs cannot exceed 0xFFFF\n"); + ret = 1; + break; + } + td->o.dp_ids[i++] = start++; + } + + if (ret) + break; + } + + free(p); + + qsort(td->o.dp_ids, i, sizeof(*td->o.dp_ids), fio_fdp_cmp); + td->o.dp_nr_ids = i; + + return ret; +} + +/* str_dp_scheme_cb() is a callback function for parsing the fdp_scheme option + This function validates the fdp_scheme filename. 
*/ +static int str_dp_scheme_cb(void *data, const char *input) +{ + struct thread_data *td = cb_data_to_td(data); + struct stat sb; + char *filename; + int ret = 0; + + if (parse_dryrun()) + return 0; + + filename = strdup(td->o.dp_scheme_file); + strip_blank_front(&filename); + strip_blank_end(filename); + + strcpy(td->o.dp_scheme_file, filename); + + if (lstat(filename, &sb) < 0){ + ret = errno; + log_err("fio: lstat() error related to %s\n", filename); + td_verror(td, ret, "lstat"); + goto out; + } + + if (!S_ISREG(sb.st_mode)) { + ret = errno; + log_err("fio: %s is not a file\n", filename); + td_verror(td, ret, "S_ISREG"); + goto out; + } + +out: + free(filename); + return ret; +} + static int str_bssplit_cb(void *data, const char *input) { struct thread_data *td = cb_data_to_td(data); @@ -278,6 +387,135 @@ static int str_bssplit_cb(void *data, const char *input) return ret; } +static int parse_cmdprio_bssplit_entry(struct thread_options *o, + struct split_prio *entry, char *str) +{ + int matches = 0; + char *bs_str = NULL; + long long bs_val; + unsigned int perc = 0, class, level, hint; + + /* + * valid entry formats: + * bs/ - %s/ - set perc to 0, prio to -1. + * bs/perc - %s/%u - set prio to -1. 
+ * bs/perc/class/level - %s/%u/%u/%u + * bs/perc/class/level/hint - %s/%u/%u/%u/%u + */ + matches = sscanf(str, "%m[^/]/%u/%u/%u/%u", + &bs_str, &perc, &class, &level, &hint); + if (matches < 1) { + log_err("fio: invalid cmdprio_bssplit format\n"); + return 1; + } + + if (str_to_decimal(bs_str, &bs_val, 1, o, 0, 0)) { + log_err("fio: split conversion failed\n"); + free(bs_str); + return 1; + } + free(bs_str); + + entry->bs = bs_val; + entry->perc = min(perc, 100u); + entry->prio = -1; + switch (matches) { + case 1: /* bs/ case */ + case 2: /* bs/perc case */ + break; + case 4: /* bs/perc/class/level case */ + case 5: /* bs/perc/class/level/hint case */ + class = min(class, (unsigned int) IOPRIO_MAX_PRIO_CLASS); + level = min(level, (unsigned int) IOPRIO_MAX_PRIO); + if (matches == 5) + hint = min(hint, (unsigned int) IOPRIO_MAX_PRIO_HINT); + else + hint = 0; + entry->prio = ioprio_value(class, level, hint); + break; + default: + log_err("fio: invalid cmdprio_bssplit format\n"); + return 1; + } + + return 0; +} + +/* + * Returns a negative integer if the first argument should be before the second + * argument in the sorted list. A positive integer if the first argument should + * be after the second argument in the sorted list. A zero if they are equal. 
+ */ +static int fio_split_prio_cmp(const void *p1, const void *p2) +{ + const struct split_prio *tmp1 = p1; + const struct split_prio *tmp2 = p2; + + if (tmp1->bs > tmp2->bs) + return 1; + if (tmp1->bs < tmp2->bs) + return -1; + return 0; +} + +int split_parse_prio_ddir(struct thread_options *o, struct split_prio **entries, + int *nr_entries, char *str) +{ + struct split_prio *tmp_entries; + unsigned int nr_bssplits; + char *str_cpy, *p, *fname; + + /* strsep modifies the string, dup it so that we can use strsep twice */ + p = str_cpy = strdup(str); + if (!p) + return 1; + + nr_bssplits = 0; + while ((fname = strsep(&str_cpy, ":")) != NULL) { + if (!strlen(fname)) + break; + nr_bssplits++; + } + free(p); + + if (nr_bssplits > BSSPLIT_MAX) { + log_err("fio: too many cmdprio_bssplit entries\n"); + return 1; + } + + tmp_entries = calloc(nr_bssplits, sizeof(*tmp_entries)); + if (!tmp_entries) + return 1; + + nr_bssplits = 0; + while ((fname = strsep(&str, ":")) != NULL) { + struct split_prio *entry; + + if (!strlen(fname)) + break; + + entry = &tmp_entries[nr_bssplits]; + + if (parse_cmdprio_bssplit_entry(o, entry, fname)) { + log_err("fio: failed to parse cmdprio_bssplit entry\n"); + free(tmp_entries); + return 1; + } + + /* skip zero perc entries, they provide no useful information */ + if (entry->perc) + nr_bssplits++; + } + + qsort(tmp_entries, nr_bssplits, sizeof(*tmp_entries), + fio_split_prio_cmp); + + *entries = tmp_entries; + *nr_entries = nr_bssplits; + + return 0; +} + static int str2error(char *str) { const char *err[] = { "EPERM", "ENOENT", "ESRCH", "EINTR", "EIO", @@ -330,7 +568,11 @@ static int ignore_error_type(struct thread_data *td, enum error_type_bit etype, if (fname[0] == 'E') { error[i] = str2error(fname); } else { - error[i] = atoi(fname); + int base = 10; + if (!strncmp(fname, "0x", 2) || + !strncmp(fname, "0X", 2)) + base = 16; + error[i] = strtol(fname, NULL, base); if (error[i] < 0) error[i] = -error[i]; } @@ -364,6 +606,8 @@ static int 
str_replay_skip_cb(void *data, const char *input) if (parse_dryrun()) return 0; + if (!input) + return 1; p = str = strdup(input); @@ -402,6 +646,8 @@ static int str_ignore_error_cb(void *data, const char *input) if (parse_dryrun()) return 0; + if (!input) + return 1; p = str = strdup(input); @@ -438,9 +684,21 @@ static int str_rw_cb(void *data, const char *str) if (!nr) return 0; - if (td_random(td)) - o->ddir_seq_nr = atoi(nr); - else { + if (td_random(td)) { + long long val; + + if (str_to_decimal(nr, &val, 1, o, 0, 0)) { + log_err("fio: randrw postfix parsing failed\n"); + free(nr); + return 1; + } + if ((val <= 0) || (val > UINT_MAX)) { + log_err("fio: randrw postfix parsing out of range\n"); + free(nr); + return 1; + } + o->ddir_seq_nr = (unsigned int) val; + } else { long long val; if (str_to_decimal(nr, &val, 1, o, 0, 0)) { @@ -477,7 +735,7 @@ static int fio_clock_source_cb(void *data, const char *str) return 0; } -static int str_rwmix_read_cb(void *data, unsigned long long *val) +static int str_rwmix_read_cb(void *data, long long *val) { struct thread_data *td = cb_data_to_td(data); @@ -486,7 +744,7 @@ static int str_rwmix_read_cb(void *data, unsigned long long *val) return 0; } -static int str_rwmix_write_cb(void *data, unsigned long long *val) +static int str_rwmix_write_cb(void *data, long long *val) { struct thread_data *td = cb_data_to_td(data); @@ -505,7 +763,7 @@ static int str_exitall_cb(void) int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu_index) { unsigned int i, index, cpus_in_mask; - const long max_cpu = cpus_online(); + const long max_cpu = cpus_configured(); cpus_in_mask = fio_cpu_count(mask); if (!cpus_in_mask) @@ -544,7 +802,7 @@ static int str_cpumask_cb(void *data, unsigned long long *val) return 1; } - max_cpu = cpus_online(); + max_cpu = cpus_configured(); for (i = 0; i < sizeof(int) * 8; i++) { if ((1 << i) & *val) { @@ -580,7 +838,7 @@ static int set_cpus_allowed(struct thread_data *td, os_cpu_mask_t *mask, 
strip_blank_front(&str); strip_blank_end(str); - max_cpu = cpus_online(); + max_cpu = cpus_configured(); while ((cpu = strsep(&str, ",")) != NULL) { char *str2, *cpu2; @@ -633,6 +891,8 @@ static int str_cpus_allowed_cb(void *data, const char *input) if (parse_dryrun()) return 0; + if (!input) + return 1; return set_cpus_allowed(td, &td->o.cpumask, input); } @@ -643,6 +903,8 @@ static int str_verify_cpus_allowed_cb(void *data, const char *input) if (parse_dryrun()) return 0; + if (!input) + return 1; return set_cpus_allowed(td, &td->o.verify_cpumask, input); } @@ -654,6 +916,8 @@ static int str_log_cpus_allowed_cb(void *data, const char *input) if (parse_dryrun()) return 0; + if (!input) + return 1; return set_cpus_allowed(td, &td->o.log_gz_cpumask, input); } @@ -698,6 +962,8 @@ static int str_numa_mpol_cb(void *data, char *input) if (parse_dryrun()) return 0; + if (!input) + return 1; nodelist = strchr(input, ':'); if (nodelist) { @@ -1110,6 +1376,13 @@ static int str_random_distribution_cb(void *data, const char *str) return 0; } +static bool is_valid_steadystate(unsigned int state) +{ + return (state == FIO_SS_IOPS || state == FIO_SS_IOPS_SLOPE || + state == FIO_SS_BW || state == FIO_SS_BW_SLOPE || + state == FIO_SS_LAT || state == FIO_SS_LAT_SLOPE); +} + static int str_steadystate_cb(void *data, const char *str) { struct thread_data *td = cb_data_to_td(data); @@ -1118,8 +1391,7 @@ static int str_steadystate_cb(void *data, const char *str) char *pct; long long ll; - if (td->o.ss_state != FIO_SS_IOPS && td->o.ss_state != FIO_SS_IOPS_SLOPE && - td->o.ss_state != FIO_SS_BW && td->o.ss_state != FIO_SS_BW_SLOPE) { + if (!is_valid_steadystate(td->o.ss_state)) { /* should be impossible to get here */ log_err("fio: unknown steady state criterion\n"); return 1; @@ -1163,6 +1435,21 @@ static int str_steadystate_cb(void *data, const char *str) return 0; td->o.ss_limit.u.f = val; + } else if (td->o.ss_state & FIO_SS_LAT) { + long long tns; + if (check_str_time(nr, &tns, 0)) 
{ + log_err("fio: steadystate latency threshold parsing failed\n"); + free(nr); + return 1; + } + + dprint(FD_PARSE, "set steady state latency threshold to %lld nsec\n", tns); + free(nr); + if (parse_dryrun()) + return 0; + + td->o.ss_limit.u.f = (double) tns; + } else { /* bandwidth criterion */ if (str_to_decimal(nr, &ll, 1, td, 0, 0)) { log_err("fio: steadystate BW threshold postfix parsing failed\n"); @@ -1244,7 +1531,7 @@ int get_max_str_idx(char *input) } /* - * Returns the directory at the index, indexes > entires will be + * Returns the directory at the index, indexes > entries will be * assigned via modulo division of the index */ int set_name_idx(char *target, size_t tlen, char *input, int index, @@ -1295,6 +1582,9 @@ static int str_filename_cb(void *data, const char *input) struct thread_data *td = cb_data_to_td(data); char *fname, *str, *p; + if (!input) + return 1; + p = str = strdup(input); strip_blank_front(&str); @@ -1365,9 +1655,12 @@ static int str_buffer_pattern_cb(void *data, const char *input) struct thread_data *td = cb_data_to_td(data); int ret; + if (!input) + return 1; + /* FIXME: for now buffer pattern does not support formats */ - ret = parse_and_fill_pattern(input, strlen(input), td->o.buffer_pattern, - MAX_PATTERN_SIZE, NULL, NULL, NULL); + ret = parse_and_fill_pattern_alloc(input, strlen(input), + &td->o.buffer_pattern, NULL, NULL, NULL); if (ret < 0) return 1; @@ -1414,10 +1707,13 @@ static int str_verify_pattern_cb(void *data, const char *input) struct thread_data *td = cb_data_to_td(data); int ret; + if (!input) + return 1; + td->o.verify_fmt_sz = FIO_ARRAY_SIZE(td->o.verify_fmt); - ret = parse_and_fill_pattern(input, strlen(input), td->o.verify_pattern, - MAX_PATTERN_SIZE, fmt_desc, - td->o.verify_fmt, &td->o.verify_fmt_sz); + ret = parse_and_fill_pattern_alloc(input, strlen(input), + &td->o.verify_pattern, fmt_desc, td->o.verify_fmt, + &td->o.verify_fmt_sz); if (ret < 0) return 1; @@ -1438,7 +1734,7 @@ static int 
str_gtod_reduce_cb(void *data, int *il) int val = *il; /* - * Only modfiy options if gtod_reduce==1 + * Only modify options if gtod_reduce==1 * Otherwise leave settings alone. */ if (val) { @@ -1455,7 +1751,7 @@ static int str_gtod_reduce_cb(void *data, int *il) return 0; } -static int str_offset_cb(void *data, unsigned long long *__val) +static int str_offset_cb(void *data, long long *__val) { struct thread_data *td = cb_data_to_td(data); unsigned long long v = *__val; @@ -1476,7 +1772,7 @@ static int str_offset_cb(void *data, unsigned long long *__val) return 0; } -static int str_offset_increment_cb(void *data, unsigned long long *__val) +static int str_offset_increment_cb(void *data, long long *__val) { struct thread_data *td = cb_data_to_td(data); unsigned long long v = *__val; @@ -1497,7 +1793,7 @@ static int str_offset_increment_cb(void *data, unsigned long long *__val) return 0; } -static int str_size_cb(void *data, unsigned long long *__val) +static int str_size_cb(void *data, long long *__val) { struct thread_data *td = cb_data_to_td(data); unsigned long long v = *__val; @@ -1541,7 +1837,7 @@ static int str_io_size_cb(void *data, unsigned long long *__val) return 0; } -static int str_zoneskip_cb(void *data, unsigned long long *__val) +static int str_zoneskip_cb(void *data, long long *__val) { struct thread_data *td = cb_data_to_td(data); unsigned long long v = *__val; @@ -1703,6 +1999,22 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_FILE, .group = FIO_OPT_G_FILENAME, }, + { + .name = "filetype", + .lname = "file_type", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, filetype), + .help = "Assume all files defined in a job are of this type", + .def = "none", + .group = FIO_OPT_G_IO_BASIC, + .category = FIO_OPT_C_FILE, + .posval = { + { .ival = "none", .oval = 0 }, + { .ival = "file", .oval = FIO_TYPE_FILE }, + { .ival = "block", .oval = FIO_TYPE_BLOCK }, + { .ival = "char", .oval = FIO_TYPE_CHAR }, + }, + }, { 
.name = "directory", .lname = "Directory", @@ -1825,6 +2137,10 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .oval = TD_DDIR_TRIMWRITE, .help = "Trim and write mix, trims preceding writes" }, + { .ival = "randtrimwrite", + .oval = TD_DDIR_RANDTRIMWRITE, + .help = "Randomly trim and write mix, trims preceding writes" + }, }, }, { @@ -1937,16 +2253,6 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .help = "RDMA IO engine", }, #endif -#ifdef CONFIG_LIBRPMA_APM - { .ival = "librpma_apm", - .help = "librpma IO engine in APM mode", - }, -#endif -#ifdef CONFIG_LIBRPMA_GPSPM - { .ival = "librpma_gpspm", - .help = "librpma IO engine in GPSPM mode", - }, -#endif #ifdef CONFIG_LINUX_EXT4_MOVE_EXTENT { .ival = "e4defrag", .help = "ext4 defrag engine", @@ -1970,12 +2276,6 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .help = "Hadoop Distributed Filesystem (HDFS) engine" }, #endif -#ifdef CONFIG_PMEMBLK - { .ival = "pmemblk", - .help = "PMDK libpmemblk based IO engine", - }, - -#endif #ifdef CONFIG_IME { .ival = "ime_psync", .help = "DDN's IME synchronous IO engine", @@ -2018,10 +2318,15 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .help = "DAOS File System (dfs) IO engine", }, #endif -#ifdef CONFIG_NFS +#ifdef CONFIG_LIBNFS { .ival = "nfs", .help = "NFS IO engine", }, +#endif +#ifdef CONFIG_LIBXNVME + { .ival = "xnvme", + .help = "XNVME IO engine", + }, #endif }, }, @@ -2222,6 +2527,17 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_INVALID, }, + { + .name = "num_range", + .lname = "Number of ranges", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, num_range), + .maxval = MAX_TRIM_RANGE, + .help = "Number of ranges for trim command", + .def = "1", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, { .name = "bs", .lname = "Block size", @@ -2311,6 +2627,7 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { }, { .name = "randrepeat", + .alias = "allrandrepeat", .lname = "Random 
repeatable", .type = FIO_OPT_BOOL, .off1 = offsetof(struct thread_options, rand_repeatable), @@ -2356,6 +2673,56 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_RANDOM, }, + { + .name = "sprandom", + .lname = "Sandisk Pseudo Random Preconditioning", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, sprandom), + .help = "Set up Sandisk Pseudo Random Preconditioning", + .parent = "rw", + .hide = 1, + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RANDOM, + }, + { + .name = "spr_num_regions", + .lname = "SPRandom number of regions", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, spr_num_regions), + .help = "Number of regions for sprandom", + .parent = "sprandom", + .hide = 1, + .def = "100", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RANDOM, + }, + { + .name = "spr_op", + .lname = "SPRandom Over provisioning", + .type = FIO_OPT_FLOAT_LIST, + .off1 = offsetof(struct thread_options, spr_over_provisioning), + .help = "Over provisioning ratio for SPRandom", + .parent = "sprandom", + .maxlen = 1, + .hide = 1, + .def = "0.15", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RANDOM, + }, + { + .name = "spr_cs", + .lname = "SPRandom Device cache size", + .type = FIO_OPT_ULL, + .off1 = offsetof(struct thread_options, spr_cache_size), + .help = "Cache Size in bytes for SPRandom", + .parent = "sprandom", + .maxlen = 1, + .hide = 1, + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_RANDOM, + }, { .name = "random_generator", .lname = "Random Generator", @@ -2440,16 +2807,6 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_RANDOM, }, - { - .name = "allrandrepeat", - .lname = "All Random Repeat", - .type = FIO_OPT_BOOL, - .off1 = offsetof(struct thread_options, allrand_repeatable), - .help = "Use repeatable random numbers for everything", - .def = "0", - .category = FIO_OPT_C_IO, - .group = FIO_OPT_G_RANDOM, - }, { .name = 
"nrfiles", .lname = "Number of files", @@ -2587,6 +2944,12 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .oval = F_ADV_SEQUENTIAL, .help = "Advise using FADV_SEQUENTIAL", }, +#ifdef POSIX_FADV_NOREUSE + { .ival = "noreuse", + .oval = F_ADV_NOREUSE, + .help = "Advise using FADV_NOREUSE", + }, +#endif }, .help = "Use fadvise() to advise the kernel on IO pattern", .def = "1", @@ -2674,6 +3037,7 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_IO_TYPE, }, +#ifdef FIO_HAVE_RWF_ATOMIC { .name = "atomic", .lname = "Atomic I/O", @@ -2684,6 +3048,7 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_IO_TYPE, }, +#endif { .name = "buffered", .lname = "Buffered I/O", @@ -2783,6 +3148,16 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_RUNTIME, }, + { + .name = "ramp_size", + .lname = "Ramp size", + .type = FIO_OPT_STR_VAL, + .off1 = offsetof(struct thread_options, ramp_size), + .minval = 1, + .help = "Amount of data transferred before measuring performance", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_RUNTIME, + }, { .name = "clocksource", .lname = "Clock source", @@ -2799,12 +3174,10 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .help = "Use gettimeofday(2) for timing", }, #endif -#ifdef CONFIG_CLOCK_GETTIME { .ival = "clock_gettime", .oval = CS_CGETTIME, .help = "Use clock_gettime(2) for timing", }, -#endif #ifdef ARCH_HAVE_CPU_CLOCK { .ival = "cpu", .oval = CS_CPUCLOCK, @@ -2963,6 +3336,10 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .oval = VERIFY_PATTERN_NO_HDR, .help = "Verify strict pattern", }, + { .ival = "pattern_hdr", + .oval = VERIFY_PATTERN, + .help = "Verify pattern with header", + }, { .ival = "null", .oval = VERIFY_NULL, @@ -3033,6 +3410,18 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_VERIFY, }, + { + .name = "verify_pattern_interval", + .lname 
= "Running verify pattern", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, verify_pattern_interval), + .def = "0", + .help = "Re-create verify pattern every N bytes", + .parent = "verify", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, { .name = "verify_fatal", .lname = "Verify fatal", @@ -3143,6 +3532,28 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_VERIFY, }, + { + .name = "verify_write_sequence", + .lname = "Verify write sequence number", + .off1 = offsetof(struct thread_options, verify_write_sequence), + .type = FIO_OPT_BOOL, + .def = "1", + .help = "Verify header write sequence number", + .parent = "verify", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, + { + .name = "verify_header_seed", + .lname = "Verify header seed", + .off1 = offsetof(struct thread_options, verify_header_seed), + .type = FIO_OPT_BOOL, + .def = "1", + .help = "Verify the header seed used to generate the buffer contents", + .parent = "verify", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_VERIFY, + }, #ifdef FIO_HAVE_TRIM { .name = "trim_percentage", @@ -3467,7 +3878,7 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .lname = "Per device/file maximum number of open zones", .type = FIO_OPT_INT, .off1 = offsetof(struct thread_options, max_open_zones), - .maxval = ZBD_MAX_OPEN_ZONES, + .maxval = ZBD_MAX_WRITE_ZONES, .help = "Limit on the number of simultaneously opened sequential write zones with zonemode=zbd", .def = "0", .category = FIO_OPT_C_IO, @@ -3478,7 +3889,7 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .lname = "Job maximum number of open zones", .type = FIO_OPT_INT, .off1 = offsetof(struct thread_options, job_max_open_zones), - .maxval = ZBD_MAX_OPEN_ZONES, + .maxval = ZBD_MAX_WRITE_ZONES, .help = "Limit on the number of simultaneously opened sequential write zones with zonemode=zbd by one thread/process", .def = "0", .category = FIO_OPT_C_IO, @@ -3518,6 +3929,99 
@@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_ZONE, }, + { + .name = "recover_zbd_write_error", + .lname = "Recover write errors when zonemode=zbd is set", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, recover_zbd_write_error), + .def = 0, + .help = "Continue writes for sequential write required zones after recovering write errors with care for partial write pointer move", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_ZONE, + }, + { + .name = "fdp", + .lname = "Flexible data placement", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, fdp), + .help = "Use Data placement directive (FDP)", + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "dataplacement", + .alias = "data_placement", + .lname = "Data Placement interface", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, dp_type), + .help = "Data Placement interface to use", + .def = "none", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + .posval = { + { .ival = "none", + .oval = FIO_DP_NONE, + .help = "Do not specify a data placement interface", + }, + { .ival = "fdp", + .oval = FIO_DP_FDP, + .help = "Use Flexible Data Placement interface", + }, + { .ival = "streams", + .oval = FIO_DP_STREAMS, + .help = "Use Streams interface", + }, + }, + }, + { + .name = "plid_select", + .alias = "fdp_pli_select", + .lname = "Data Placement ID selection strategy", + .type = FIO_OPT_STR, + .off1 = offsetof(struct thread_options, dp_id_select), + .help = "Strategy for selecting next Data Placement ID", + .def = "roundrobin", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + .posval = { + { .ival = "random", + .oval = FIO_DP_RANDOM, + .help = "Choose a Placement ID at random (uniform)", + }, + { .ival = "roundrobin", + .oval = FIO_DP_RR, + .help = "Round robin select Placement IDs", + }, + { .ival = "scheme", + .oval = FIO_DP_SCHEME, + .help = "Use a scheme(based on 
LBA) to select Placement IDs", + }, + }, + }, + { + .name = "plids", + .alias = "fdp_pli", + .lname = "Stream IDs/Data Placement ID indices", + .type = FIO_OPT_STR, + .cb = str_fdp_pli_cb, + .off1 = offsetof(struct thread_options, dp_ids), + .help = "Sets which Data Placement ids to use (defaults to all for FDP)", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "dp_scheme", + .lname = "Data Placement Scheme", + .type = FIO_OPT_STR_STORE, + .cb = str_dp_scheme_cb, + .off1 = offsetof(struct thread_options, dp_scheme_file), + .maxlen = PATH_MAX, + .help = "scheme file that specifies offset-RUH mapping", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, { .name = "lockmem", .lname = "Lock memory", @@ -3564,6 +4068,17 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_RWMIX, }, +#ifdef CONFIG_LINUX + { + .name = "comm", + .lname = "Job process comm", + .type = FIO_OPT_STR_STORE, + .off1 = offsetof(struct thread_options, comm), + .help = "Process comm of this job", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_DESC, + }, +#endif { .name = "nice", .lname = "Nice", @@ -3614,6 +4129,18 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_CRED, }, + { + .name = "priohint", + .lname = "I/O nice priority hint", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, ioprio_hint), + .help = "Set job IO priority hint", + .minval = IOPRIO_MIN_PRIO_HINT, + .maxval = IOPRIO_MAX_PRIO_HINT, + .interval = 1, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_CRED, + }, #else { .name = "prioclass", @@ -3621,6 +4148,12 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .type = FIO_OPT_UNSUPPORTED, .help = "Your platform does not support IO priority classes", }, + { + .name = "priohint", + .lname = "I/O nice priority hint", + .type = FIO_OPT_UNSUPPORTED, + .help = "Your platform does not support IO priority hints", + }, 
#endif { .name = "thinktime", @@ -3646,6 +4179,18 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_THINKTIME, }, + { + .name = "thinkcycles", + .lname = "Think cycles", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, thinkcycles), + .help = "Spin for a constant amount of cycles between requests", + .def = "0", + .parent = "thinktime", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_THINKTIME, + }, { .name = "thinktime_blocks", .lname = "Thinktime blocks", @@ -4280,6 +4825,8 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .name = "log_hist_coarseness", .lname = "Histogram logs coarseness", .type = FIO_OPT_INT, + .maxval = 6, + .minval = 0, .off1 = offsetof(struct thread_options, log_hist_coarseness), .help = "Integer in range [0,6]. Higher coarseness outputs" " fewer histogram bins per sample. The number of bins for" @@ -4299,14 +4846,38 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .group = FIO_OPT_G_INVALID, }, { - .name = "log_max_value", - .lname = "Log maximum instead of average", - .type = FIO_OPT_BOOL, + .name = "log_window_value", + .alias = "log_max_value", + .lname = "Log maximum, average or both values", + .type = FIO_OPT_STR, .off1 = offsetof(struct thread_options, log_max), - .help = "Log max sample in a window instead of average", - .def = "0", + .help = "Log max, average or both sample in a window", + .def = "avg", .category = FIO_OPT_C_LOG, .group = FIO_OPT_G_INVALID, + .posval = { + { .ival = "avg", + .oval = IO_LOG_SAMPLE_AVG, + .help = "Log average value over the window", + }, + { .ival = "max", + .oval = IO_LOG_SAMPLE_MAX, + .help = "Log maximum value in the window", + }, + { .ival = "both", + .oval = IO_LOG_SAMPLE_BOTH, + .help = "Log both average and maximum values over the window" + }, + /* Compatibility with former boolean values */ + { .ival = "0", + .oval = IO_LOG_SAMPLE_AVG, + .help = "Alias for 'avg'", + }, + { .ival = "1", + .oval = IO_LOG_SAMPLE_MAX, + 
.help = "Alias for 'max'", + }, + }, }, { .name = "log_offset", @@ -4328,6 +4899,16 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_LOG, .group = FIO_OPT_G_INVALID, }, + { + .name = "log_issue_time", + .lname = "Log IO issue time", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, log_issue_time), + .help = "Include IO issue time for each log entry", + .def = "0", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, #ifdef CONFIG_ZLIB { .name = "log_compression", @@ -4384,11 +4965,21 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { }, #endif { - .name = "log_unix_epoch", - .lname = "Log epoch unix", + .name = "log_alternate_epoch", + .alias = "log_unix_epoch", + .lname = "Log epoch alternate", .type = FIO_OPT_BOOL, - .off1 = offsetof(struct thread_options, log_unix_epoch), - .help = "Use Unix time in log files", + .off1 = offsetof(struct thread_options, log_alternate_epoch), + .help = "Use alternate epoch time in log files. Uses the same epoch as that is used by clock_gettime with specified log_alternate_epoch_clock_id.", + .category = FIO_OPT_C_LOG, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "log_alternate_epoch_clock_id", + .lname = "Log alternate epoch clock_id", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, log_alternate_epoch_clock_id), + .help = "If log_alternate_epoch is true, this option specifies the clock_id from clock_gettime whose epoch should be used. If log_alternate_epoch is false, this option has no effect. 
Default value is 0, or CLOCK_REALTIME", .category = FIO_OPT_C_LOG, .group = FIO_OPT_G_INVALID, }, @@ -4525,6 +5116,16 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_IO_BUF, }, + { + .name = "dedupe_global", + .lname = "Global deduplication", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, dedupe_global), + .help = "Share deduplication buffers across jobs", + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_IO_BUF, + }, { .name = "dedupe_mode", .lname = "Dedupe mode", @@ -4707,6 +5308,16 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_CLOCK, }, + { + .name = "job_start_clock_id", + .lname = "Job start clock_id", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, job_start_clock_id), + .help = "The clock_id passed to the call to clock_gettime used to record job_start in the json output format. Default is 0, or CLOCK_REALTIME", + .verify = gtod_cpu_verify, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_CLOCK, + }, { .name = "unified_rw_reporting", .lname = "Unified RW Reporting", @@ -4994,6 +5605,14 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .oval = FIO_SS_BW_SLOPE, .help = "slope calculated from bandwidth measurements", }, + { .ival = "lat", + .oval = FIO_SS_LAT, + .help = "maximum mean deviation of latency measurements", + }, + { .ival = "lat_slope", + .oval = FIO_SS_LAT_SLOPE, + .help = "slope calculated from latency measurements", + }, }, .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_RUNTIME, @@ -5026,6 +5645,20 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_GENERAL, .group = FIO_OPT_G_RUNTIME, }, + { + .name = "steadystate_check_interval", + .lname = "Steady state check interval", + .alias = "ss_interval", + .parent = "steadystate", + .type = FIO_OPT_STR_VAL_TIME, + .off1 = offsetof(struct thread_options, ss_check_interval), + .help = "Polling interval for the steady state 
check (too low means steadystate will not converge)", + .def = "1", + .is_seconds = 1, + .is_time = 1, + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_RUNTIME, + }, { .name = NULL, }, @@ -5146,7 +5779,7 @@ void fio_keywords_init(void) sprintf(buf, "%llu", mb_memory); fio_keywords[1].replace = strdup(buf); - l = cpus_online(); + l = cpus_configured(); sprintf(buf, "%lu", l); fio_keywords[2].replace = strdup(buf); } diff --git a/os/linux/io_uring.h b/os/linux/io_uring.h index c45b5e9a93..cadcadaecf 100644 --- a/os/linux/io_uring.h +++ b/os/linux/io_uring.h @@ -22,6 +22,10 @@ struct io_uring_sqe { union { __u64 off; /* offset into file */ __u64 addr2; + struct { + __u32 cmd_op; + __u32 __pad1; + }; }; union { __u64 addr; /* pointer to buffer or iovecs */ @@ -45,6 +49,7 @@ struct io_uring_sqe { __u32 rename_flags; __u32 unlink_flags; __u32 hardlink_flags; + __u32 uring_cmd_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -60,9 +65,34 @@ struct io_uring_sqe { __s32 splice_fd_in; __u32 file_index; }; - __u64 __pad2[2]; + union { + struct { + __u64 addr3; + __u64 __pad2[1]; + }; + struct { + __u64 attr_ptr; /* pointer to attribute information */ + __u64 attr_type_mask; /* bit mask of attributes */ + }; + /* + * If the ring is initialized with IORING_SETUP_SQE128, then + * this field is used for 80 bytes of arbitrary command data + */ + __u8 cmd[0]; + }; }; +/* sqe->attr_type_mask flags */ +#define IORING_RW_ATTR_FLAG_PI (1U << 0) +/* PI attribute information */ +struct io_uring_attr_pi { + __u16 flags; + __u16 app_tag; + __u32 len; + __u64 addr; + __u64 seed; + __u64 rsvd; +}; enum { IOSQE_FIXED_FILE_BIT, IOSQE_IO_DRAIN_BIT, @@ -70,6 +100,7 @@ enum { IOSQE_IO_HARDLINK_BIT, IOSQE_ASYNC_BIT, IOSQE_BUFFER_SELECT_BIT, + IOSQE_CQE_SKIP_SUCCESS_BIT, }; /* @@ -87,6 +118,8 @@ enum { #define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT) /* select buffer from sqe->buf_group */ #define IOSQE_BUFFER_SELECT 
(1U << IOSQE_BUFFER_SELECT_BIT) +/* don't post CQE if request succeeded */ +#define IOSQE_CQE_SKIP_SUCCESS (1U << IOSQE_CQE_SKIP_SUCCESS_BIT) /* * io_uring_setup() flags @@ -98,6 +131,62 @@ enum { #define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */ #define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */ #define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */ +#define IORING_SETUP_SUBMIT_ALL (1U << 7) /* continue submit on error */ +/* + * Cooperative task running. When requests complete, they often require + * forcing the submitter to transition to the kernel to complete. If this + * flag is set, work will be done when the task transitions anyway, rather + * than force an inter-processor interrupt reschedule. This avoids interrupting + * a task running in userspace, and saves an IPI. + */ +#define IORING_SETUP_COOP_TASKRUN (1U << 8) +/* + * If COOP_TASKRUN is set, get notified if task work is available for + * running and a kernel transition would be needed to run it. This sets + * IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN. + */ +#define IORING_SETUP_TASKRUN_FLAG (1U << 9) + +#define IORING_SETUP_SQE128 (1U << 10) /* SQEs are 128 byte */ +#define IORING_SETUP_CQE32 (1U << 11) /* CQEs are 32 byte */ + +/* + * Only one task is allowed to submit requests + */ +#define IORING_SETUP_SINGLE_ISSUER (1U << 12) + +/* + * Defer running task work to get events. + * Rather than running bits of task work whenever the task transitions + * try to do it just before it is needed. + */ +#define IORING_SETUP_DEFER_TASKRUN (1U << 13) + +/* + * Application provides the memory for the rings + */ +#define IORING_SETUP_NO_MMAP (1U << 14) + +/* + * Register the ring fd in itself for use with + * IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather + * than an fd. + */ +#define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15) + +/* + * Removes indirection through the SQ index array. 
+ */ +#define IORING_SETUP_NO_SQARRAY (1U << 16) + +/* Use hybrid poll in iopoll process */ +#define IORING_SETUP_HYBRID_IOPOLL (1U << 17) + +/* + * Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have + * IORING_CQE_F_32 set in cqe->flags. + */ +#define IORING_SETUP_CQE_MIXED (1U << 18) enum { IORING_OP_NOP, @@ -140,11 +229,26 @@ enum { IORING_OP_MKDIRAT, IORING_OP_SYMLINKAT, IORING_OP_LINKAT, + IORING_OP_MSG_RING, + IORING_OP_FSETXATTR, + IORING_OP_SETXATTR, + IORING_OP_FGETXATTR, + IORING_OP_GETXATTR, + IORING_OP_SOCKET, + IORING_OP_URING_CMD, + /* this goes last, obviously */ IORING_OP_LAST, }; +/* + * sqe->uring_cmd_flags + * IORING_URING_CMD_FIXED use registered buffer; pass this flag + * along with setting sqe->buf_index. + */ +#define IORING_URING_CMD_FIXED (1U << 0) + /* * sqe->fsync_flags */ @@ -182,6 +286,11 @@ enum { #define IORING_POLL_UPDATE_EVENTS (1U << 1) #define IORING_POLL_UPDATE_USER_DATA (1U << 2) +#define IORING_NOP_INJECT_RESULT (1U << 0) +#define IORING_NOP_FILE (1U << 1) +#define IORING_NOP_FIXED_FILE (1U << 2) +#define IORING_NOP_FIXED_BUFFER (1U << 3) + /* * IO completion data structure (Completion Queue Entry) */ @@ -189,6 +298,12 @@ struct io_uring_cqe { __u64 user_data; /* sqe->data submission passed back */ __s32 res; /* result code for this event */ __u32 flags; + + /* + * If the ring is initialized with IORING_SETUP_CQE32, then this field + * contains 16-bytes of padding, doubling the size of the CQE.
+ */ + __u64 big_cqe[]; }; /* @@ -254,10 +369,12 @@ struct io_cqring_offsets { /* * io_uring_enter(2) flags */ -#define IORING_ENTER_GETEVENTS (1U << 0) -#define IORING_ENTER_SQ_WAKEUP (1U << 1) -#define IORING_ENTER_SQ_WAIT (1U << 2) -#define IORING_ENTER_EXT_ARG (1U << 3) +#define IORING_ENTER_GETEVENTS (1U << 0) +#define IORING_ENTER_SQ_WAKEUP (1U << 1) +#define IORING_ENTER_SQ_WAIT (1U << 2) +#define IORING_ENTER_EXT_ARG (1U << 3) +#define IORING_ENTER_REGISTERED_RING (1U << 4) +#define IORING_ENTER_NO_IOWAIT (1U << 7) /* * Passed in for io_uring_setup(2). Copied back with updated info on success @@ -289,6 +406,8 @@ struct io_uring_params { #define IORING_FEAT_EXT_ARG (1U << 8) #define IORING_FEAT_NATIVE_WORKERS (1U << 9) #define IORING_FEAT_RSRC_TAGS (1U << 10) +#define IORING_FEAT_CQE_SKIP (1U << 11) +#define IORING_FEAT_NO_IOWAIT (1U << 17) /* * io_uring_register(2) opcodes and arguments @@ -321,6 +440,10 @@ enum { /* set/get max number of io-wq workers */ IORING_REGISTER_IOWQ_MAX_WORKERS = 19, + /* register/unregister io_uring fd with the ring */ + IORING_REGISTER_RING_FDS = 20, + IORING_UNREGISTER_RING_FDS = 21, + /* this goes last */ IORING_REGISTER_LAST }; diff --git a/os/mac/posix.c b/os/mac/posix.c new file mode 100644 index 0000000000..421b226054 --- /dev/null +++ b/os/mac/posix.c @@ -0,0 +1,99 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "../../log.h" + +#include "posix.h" + +#define MMAP_CHUNK_SIZE (16LL * 1024 * 1024 * 1024) + +/* + * NB: performance of discard_pages() will be slower under Rosetta. 
+ */ +static int discard_pages(int fd, off_t offset, off_t len) +{ + /* Align offset and len to page size */ + long pagesize = sysconf(_SC_PAGESIZE); + long offset_pad = offset % pagesize; + offset -= offset_pad; + len += offset_pad; + len = (len + pagesize - 1) & -pagesize; + + while (len > 0) { + int saved_errno; + size_t mmap_len = MIN(MMAP_CHUNK_SIZE, len); + void *addr = mmap(0, mmap_len, PROT_NONE, MAP_SHARED, fd, + offset); + + if (addr == MAP_FAILED) { + saved_errno = errno; + log_err("discard_pages: failed to mmap (%s), " + "offset = %llu, len = %zu\n", + strerror(errno), offset, mmap_len); + return saved_errno; + } + + if (msync(addr, mmap_len, MS_INVALIDATE)) { + saved_errno = errno; + log_err("discard_pages: msync failed to free cache " + "pages\n"); + + if (munmap(addr, mmap_len) < 0) + log_err("discard_pages: munmap failed (%s)\n", + strerror(errno)); + return saved_errno; + } + + if (munmap(addr, mmap_len) < 0) { + saved_errno = errno; + log_err("discard_pages: munmap failed (%s), " + "len = %zu)\n", strerror(errno), mmap_len); + return saved_errno; + } + + len -= mmap_len; + offset += mmap_len; + } + + return 0; +} + +static inline int set_readhead(int fd, bool enabled) { + int ret; + + ret = fcntl(fd, F_RDAHEAD, enabled ? 
1 : 0); + if (ret == -1) { + ret = errno; + } + + return ret; +} + +int posix_fadvise(int fd, off_t offset, off_t len, int advice) +{ + int ret; + + switch(advice) { + case POSIX_FADV_NORMAL: + ret = 0; + break; + case POSIX_FADV_RANDOM: + ret = set_readhead(fd, false); + break; + case POSIX_FADV_SEQUENTIAL: + ret = set_readhead(fd, true); + break; + case POSIX_FADV_DONTNEED: + ret = discard_pages(fd, offset, len); + break; + default: + ret = EINVAL; + } + + return ret; +} diff --git a/os/mac/posix.h b/os/mac/posix.h new file mode 100644 index 0000000000..6ef7854a9e --- /dev/null +++ b/os/mac/posix.h @@ -0,0 +1,11 @@ +#ifndef FIO_MAC_POSIX_H +#define FIO_MAC_POSIX_H + +#define POSIX_FADV_NORMAL (0) +#define POSIX_FADV_RANDOM (1) +#define POSIX_FADV_SEQUENTIAL (2) +#define POSIX_FADV_DONTNEED (4) + +extern int posix_fadvise(int fd, off_t offset, off_t len, int advice); + +#endif diff --git a/os/os-android.h b/os/os-android.h deleted file mode 100644 index 10c51b8318..0000000000 --- a/os/os-android.h +++ /dev/null @@ -1,316 +0,0 @@ -#ifndef FIO_OS_ANDROID_H -#define FIO_OS_ANDROID_H - -#define FIO_OS os_android - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "./os-linux-syscall.h" -#include "../file.h" - -#ifndef __has_builtin // Optional of course. - #define __has_builtin(x) 0 // Compatibility with non-clang compilers. 
-#endif - -#define FIO_HAVE_DISK_UTIL -#define FIO_HAVE_IOSCHED_SWITCH -#define FIO_HAVE_IOPRIO -#define FIO_HAVE_IOPRIO_CLASS -#define FIO_HAVE_ODIRECT -#define FIO_HAVE_HUGETLB -#define FIO_HAVE_BLKTRACE -#define FIO_HAVE_CL_SIZE -#define FIO_HAVE_CGROUPS -#define FIO_HAVE_FS_STAT -#define FIO_HAVE_TRIM -#define FIO_HAVE_GETTID -#define FIO_USE_GENERIC_INIT_RANDOM_STATE -#define FIO_HAVE_E4_ENG -#define FIO_HAVE_BYTEORDER_FUNCS -#define FIO_HAVE_MMAP_HUGE -#define FIO_NO_HAVE_SHM_H - -#define OS_MAP_ANON MAP_ANONYMOUS - -#ifndef POSIX_MADV_DONTNEED -#define posix_madvise madvise -#define POSIX_MADV_DONTNEED MADV_DONTNEED -#define POSIX_MADV_SEQUENTIAL MADV_SEQUENTIAL -#define POSIX_MADV_RANDOM MADV_RANDOM -#endif - -#ifdef MADV_REMOVE -#define FIO_MADV_FREE MADV_REMOVE -#endif -#ifndef MAP_HUGETLB -#define MAP_HUGETLB 0x40000 /* arch specific */ -#endif - -#ifdef CONFIG_PTHREAD_GETAFFINITY -#define FIO_HAVE_GET_THREAD_AFFINITY -#define fio_get_thread_affinity(mask) \ - pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) -#endif - -#ifndef CONFIG_NO_SHM -/* - * Bionic doesn't support SysV shared memeory, so implement it using ashmem - */ -#include -#include -#include -#include -#if __ANDROID_API__ >= __ANDROID_API_O__ -#include -#else -#define ASHMEM_DEVICE "/dev/ashmem" -#endif -#define shmid_ds shmid64_ds -#define SHM_HUGETLB 04000 - -static inline int shmctl(int __shmid, int __cmd, struct shmid_ds *__buf) -{ - int ret=0; - if (__cmd == IPC_RMID) - { - int length = ioctl(__shmid, ASHMEM_GET_SIZE, NULL); - struct ashmem_pin pin = {0 , length}; - ret = ioctl(__shmid, ASHMEM_UNPIN, &pin); - close(__shmid); - } - return ret; -} - -#if __ANDROID_API__ >= __ANDROID_API_O__ -static inline int shmget(key_t __key, size_t __size, int __shmflg) -{ - char keybuf[11]; - - sprintf(keybuf, "%d", __key); - - return ASharedMemory_create(keybuf, __size + sizeof(uint64_t)); -} -#else -static inline int shmget(key_t __key, size_t __size, int __shmflg) -{ - int fd,ret; - 
char keybuf[11]; - - fd = open(ASHMEM_DEVICE, O_RDWR); - if (fd < 0) - return fd; - - sprintf(keybuf,"%d",__key); - ret = ioctl(fd, ASHMEM_SET_NAME, keybuf); - if (ret < 0) - goto error; - - /* Stores size in first 8 bytes, allocate extra space */ - ret = ioctl(fd, ASHMEM_SET_SIZE, __size + sizeof(uint64_t)); - if (ret < 0) - goto error; - - return fd; - -error: - close(fd); - return ret; -} -#endif - -static inline void *shmat(int __shmid, const void *__shmaddr, int __shmflg) -{ - size_t size = ioctl(__shmid, ASHMEM_GET_SIZE, NULL); - /* Needs to be 8-byte aligned to prevent SIGBUS on 32-bit ARM */ - uint64_t *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, __shmid, 0); - /* Save size at beginning of buffer, for use with munmap */ - *ptr = size; - return ptr + 1; -} - -static inline int shmdt (const void *__shmaddr) -{ - /* Find mmap size which we stored at the beginning of the buffer */ - uint64_t *ptr = (uint64_t *)__shmaddr - 1; - size_t size = *ptr; - return munmap(ptr, size); -} -#endif - -#define SPLICE_DEF_SIZE (64*1024) - -enum { - IOPRIO_CLASS_NONE, - IOPRIO_CLASS_RT, - IOPRIO_CLASS_BE, - IOPRIO_CLASS_IDLE, -}; - -enum { - IOPRIO_WHO_PROCESS = 1, - IOPRIO_WHO_PGRP, - IOPRIO_WHO_USER, -}; - -#define IOPRIO_BITS 16 -#define IOPRIO_CLASS_SHIFT 13 - -#define IOPRIO_MIN_PRIO 0 /* highest priority */ -#define IOPRIO_MAX_PRIO 7 /* lowest priority */ - -#define IOPRIO_MIN_PRIO_CLASS 0 -#define IOPRIO_MAX_PRIO_CLASS 3 - -static inline int ioprio_value(int ioprio_class, int ioprio) -{ - /* - * If no class is set, assume BE - */ - if (!ioprio_class) - ioprio_class = IOPRIO_CLASS_BE; - - return (ioprio_class << IOPRIO_CLASS_SHIFT) | ioprio; -} - -static inline bool ioprio_value_is_class_rt(unsigned int priority) -{ - return (priority >> IOPRIO_CLASS_SHIFT) == IOPRIO_CLASS_RT; -} - -static inline int ioprio_set(int which, int who, int ioprio_class, int ioprio) -{ - return syscall(__NR_ioprio_set, which, who, - ioprio_value(ioprio_class, ioprio)); -} - 
-#ifndef BLKGETSIZE64 -#define BLKGETSIZE64 _IOR(0x12,114,size_t) -#endif - -#ifndef BLKFLSBUF -#define BLKFLSBUF _IO(0x12,97) -#endif - -#ifndef BLKDISCARD -#define BLKDISCARD _IO(0x12,119) -#endif - -static inline int blockdev_invalidate_cache(struct fio_file *f) -{ - return ioctl(f->fd, BLKFLSBUF); -} - -static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) -{ - if (!ioctl(f->fd, BLKGETSIZE64, bytes)) - return 0; - - return errno; -} - -static inline unsigned long long os_phys_mem(void) -{ - long pagesize, pages; - - pagesize = sysconf(_SC_PAGESIZE); - pages = sysconf(_SC_PHYS_PAGES); - if (pages == -1 || pagesize == -1) - return 0; - - return (unsigned long long) pages * (unsigned long long) pagesize; -} - -#ifdef O_NOATIME -#define FIO_O_NOATIME O_NOATIME -#else -#define FIO_O_NOATIME 0 -#endif - -/* Check for GCC or Clang byte swap intrinsics */ -#if (__has_builtin(__builtin_bswap16) && __has_builtin(__builtin_bswap32) \ - && __has_builtin(__builtin_bswap64)) || (__GNUC__ > 4 \ - || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) /* fio_swapN */ -#define fio_swap16(x) __builtin_bswap16(x) -#define fio_swap32(x) __builtin_bswap32(x) -#define fio_swap64(x) __builtin_bswap64(x) -#else -#include -#define fio_swap16(x) bswap_16(x) -#define fio_swap32(x) bswap_32(x) -#define fio_swap64(x) bswap_64(x) -#endif /* fio_swapN */ - -#define CACHE_LINE_FILE \ - "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size" - -static inline int arch_cache_line_size(void) -{ - char size[32]; - int fd, ret; - - fd = open(CACHE_LINE_FILE, O_RDONLY); - if (fd < 0) - return -1; - - ret = read(fd, size, sizeof(size)); - - close(fd); - - if (ret <= 0) - return -1; - else - return atoi(size); -} - -static inline unsigned long long get_fs_free_size(const char *path) -{ - unsigned long long ret; - struct statfs s; - - if (statfs(path, &s) < 0) - return -1ULL; - - ret = s.f_bsize; - ret *= (unsigned long long) s.f_bfree; - return ret; -} - -static inline int 
os_trim(struct fio_file *f, unsigned long long start, - unsigned long long len) -{ - uint64_t range[2]; - - range[0] = start; - range[1] = len; - - if (!ioctl(f->fd, BLKDISCARD, range)) - return 0; - - return errno; -} - -#ifdef CONFIG_SCHED_IDLE -static inline int fio_set_sched_idle(void) -{ - struct sched_param p = { .sched_priority = 0, }; - return sched_setscheduler(gettid(), SCHED_IDLE, &p); -} -#endif - -#ifndef RWF_UNCACHED -#define RWF_UNCACHED 0x00000040 -#endif - -#endif diff --git a/os/os-ashmem.h b/os/os-ashmem.h new file mode 100644 index 0000000000..80eab7c4e1 --- /dev/null +++ b/os/os-ashmem.h @@ -0,0 +1,84 @@ +#ifndef CONFIG_NO_SHM +/* + * Bionic doesn't support SysV shared memory, so implement it using ashmem + */ +#include +#include +#include +#include +#ifdef CONFIG_ASHAREDMEMORY_CREATE +#include +#else +#define ASHMEM_DEVICE "/dev/ashmem" +#endif +#define shmid_ds shmid64_ds +#define SHM_HUGETLB 04000 + +static inline int shmctl(int __shmid, int __cmd, struct shmid_ds *__buf) +{ + int ret=0; + if (__cmd == IPC_RMID) + { + int length = ioctl(__shmid, ASHMEM_GET_SIZE, NULL); + struct ashmem_pin pin = {0 , length}; + ret = ioctl(__shmid, ASHMEM_UNPIN, &pin); + close(__shmid); + } + return ret; +} + +#ifdef CONFIG_ASHAREDMEMORY_CREATE +static inline int shmget(key_t __key, size_t __size, int __shmflg) +{ + char keybuf[11]; + + sprintf(keybuf, "%d", __key); + + return ASharedMemory_create(keybuf, __size + sizeof(uint64_t)); +} +#else +static inline int shmget(key_t __key, size_t __size, int __shmflg) +{ + int fd,ret; + char keybuf[11]; + + fd = open(ASHMEM_DEVICE, O_RDWR); + if (fd < 0) + return fd; + + sprintf(keybuf,"%d",__key); + ret = ioctl(fd, ASHMEM_SET_NAME, keybuf); + if (ret < 0) + goto error; + + /* Stores size in first 8 bytes, allocate extra space */ + ret = ioctl(fd, ASHMEM_SET_SIZE, __size + sizeof(uint64_t)); + if (ret < 0) + goto error; + + return fd; + +error: + close(fd); + return ret; +} +#endif + +static inline void *shmat(int 
__shmid, const void *__shmaddr, int __shmflg) +{ + size_t size = ioctl(__shmid, ASHMEM_GET_SIZE, NULL); + /* Needs to be 8-byte aligned to prevent SIGBUS on 32-bit ARM */ + uint64_t *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, __shmid, 0); + /* Save size at beginning of buffer, for use with munmap */ + *ptr = size; + return ptr + 1; +} + +static inline int shmdt (const void *__shmaddr) +{ + /* Find mmap size which we stored at the beginning of the buffer */ + uint64_t *ptr = (uint64_t *)__shmaddr - 1; + size_t size = *ptr; + return munmap(ptr, size); +} +#endif diff --git a/os/os-dragonfly.h b/os/os-dragonfly.h index 5b37a37e19..4ce7253956 100644 --- a/os/os-dragonfly.h +++ b/os/os-dragonfly.h @@ -171,10 +171,12 @@ static inline int fio_getaffinity(int pid, os_cpu_mask_t *mask) * ioprio_set() with 4 arguments, so define fio's ioprio_set() as a macro. * Note that there is no idea of class within ioprio_set(2) unlike Linux. */ -#define ioprio_value(ioprio_class, ioprio) (ioprio) -#define ioprio_set(which, who, ioprio_class, ioprio) \ +#define ioprio_value(ioprio_class, ioprio, ioprio_hint) (ioprio) +#define ioprio_set(which, who, ioprio_class, ioprio, ioprio_hint) \ ioprio_set(which, who, ioprio) +#define ioprio(ioprio) (ioprio) + static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) { struct partinfo pi; diff --git a/os/os-hpux.h b/os/os-hpux.h index a80cb2bc47..9f3d76f507 100644 --- a/os/os-hpux.h +++ b/os/os-hpux.h @@ -88,9 +88,9 @@ static inline unsigned long long os_phys_mem(void) return ret; } -#define FIO_HAVE_CPU_ONLINE_SYSCONF +#define FIO_HAVE_CPU_CONF_SYSCONF -static inline unsigned int cpus_online(void) +static inline unsigned int cpus_configured(void) { return mpctl(MPC_GETNUMSPUS, 0, NULL); } diff --git a/os/os-linux-syscall.h b/os/os-linux-syscall.h index c399b2fa99..626330adde 100644 --- a/os/os-linux-syscall.h +++ b/os/os-linux-syscall.h @@ -270,6 +270,29 @@ #define __NR_ioprio_get 31 #endif +/* Linux syscalls 
for loongarch64 */ +#elif defined(ARCH_LOONGARCH64_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 30 +#define __NR_ioprio_get 31 +#endif + +#ifndef __NR_fadvise64 +#define __NR_fadvise64 223 +#endif + +#ifndef __NR_sys_splice +#define __NR_sys_splice 76 +#define __NR_sys_tee 77 +#define __NR_sys_vmsplice 75 +#endif + +/* Linux syscalls for riscv64 */ +#elif defined(ARCH_RISCV64_H) +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 30 +#define __NR_ioprio_get 31 +#endif #else #warning "Unknown architecture" #endif diff --git a/os/os-linux.h b/os/os-linux.h index 3001140ca4..6157e0e0b3 100644 --- a/os/os-linux.h +++ b/os/os-linux.h @@ -1,7 +1,11 @@ #ifndef FIO_OS_LINUX_H #define FIO_OS_LINUX_H +#ifdef __ANDROID__ +#define FIO_OS os_android +#else #define FIO_OS os_linux +#endif #include #include @@ -17,6 +21,11 @@ #include #include #include +#include +#ifdef __ANDROID__ +#include "os-ashmem.h" +#define FIO_NO_HAVE_SHM_H +#endif #ifdef ARCH_HAVE_CRC_CRYPTO #include @@ -50,8 +59,10 @@ #define FIO_HAVE_TRIM #define FIO_HAVE_GETTID #define FIO_USE_GENERIC_INIT_RANDOM_STATE +#define FIO_HAVE_BYTEORDER_FUNCS #define FIO_HAVE_PWRITEV2 #define FIO_HAVE_SHM_ATTACH_REMOVED +#define FIO_HAVE_RWF_ATOMIC #ifdef MAP_HUGETLB #define FIO_HAVE_MMAP_HUGE @@ -81,8 +92,8 @@ typedef cpu_set_t os_cpu_mask_t; pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) #endif -#define fio_cpu_clear(mask, cpu) (void) CPU_CLR((cpu), (mask)) -#define fio_cpu_set(mask, cpu) (void) CPU_SET((cpu), (mask)) +#define fio_cpu_clear(mask, cpu) CPU_CLR((cpu), (mask)) +#define fio_cpu_set(mask, cpu) CPU_SET((cpu), (mask)) #define fio_cpu_isset(mask, cpu) (CPU_ISSET((cpu), (mask)) != 0) #define fio_cpu_count(mask) CPU_COUNT((mask)) @@ -115,13 +126,24 @@ enum { #define IOPRIO_BITS 16 #define IOPRIO_CLASS_SHIFT 13 +#define IOPRIO_HINT_BITS 10 +#define IOPRIO_HINT_SHIFT 3 + #define IOPRIO_MIN_PRIO 0 /* highest priority */ #define IOPRIO_MAX_PRIO 7 /* lowest priority */ #define IOPRIO_MIN_PRIO_CLASS 0 
#define IOPRIO_MAX_PRIO_CLASS 3 -static inline int ioprio_value(int ioprio_class, int ioprio) +#define IOPRIO_MIN_PRIO_HINT 0 +#define IOPRIO_MAX_PRIO_HINT ((1 << IOPRIO_HINT_BITS) - 1) + +#define ioprio_class(ioprio) ((ioprio) >> IOPRIO_CLASS_SHIFT) +#define ioprio(ioprio) ((ioprio) & IOPRIO_MAX_PRIO) +#define ioprio_hint(ioprio) \ + (((ioprio) >> IOPRIO_HINT_SHIFT) & IOPRIO_MAX_PRIO_HINT) + +static inline int ioprio_value(int ioprio_class, int ioprio, int ioprio_hint) { /* * If no class is set, assume BE @@ -129,18 +151,21 @@ static inline int ioprio_value(int ioprio_class, int ioprio) if (!ioprio_class) ioprio_class = IOPRIO_CLASS_BE; - return (ioprio_class << IOPRIO_CLASS_SHIFT) | ioprio; + return (ioprio_class << IOPRIO_CLASS_SHIFT) | + (ioprio_hint << IOPRIO_HINT_SHIFT) | + ioprio; } static inline bool ioprio_value_is_class_rt(unsigned int priority) { - return (priority >> IOPRIO_CLASS_SHIFT) == IOPRIO_CLASS_RT; + return ioprio_class(priority) == IOPRIO_CLASS_RT; } -static inline int ioprio_set(int which, int who, int ioprio_class, int ioprio) +static inline int ioprio_set(int which, int who, int ioprio_class, int ioprio, + int ioprio_hint) { return syscall(__NR_ioprio_set, which, who, - ioprio_value(ioprio_class, ioprio)); + ioprio_value(ioprio_class, ioprio, ioprio_hint)); } #ifndef CONFIG_HAVE_GETTID @@ -195,12 +220,6 @@ static inline unsigned long long os_phys_mem(void) #define FIO_O_NOATIME 0 #endif -#ifdef O_ATOMIC -#define OS_O_ATOMIC O_ATOMIC -#else -#define OS_O_ATOMIC 040000000 -#endif - #ifdef MADV_REMOVE #define FIO_MADV_FREE MADV_REMOVE #endif @@ -241,14 +260,6 @@ static inline int arch_cache_line_size(void) return atoi(size); } -#ifdef __powerpc64__ -#define FIO_HAVE_CPU_ONLINE_SYSCONF -static inline unsigned int cpus_online(void) -{ - return sysconf(_SC_NPROCESSORS_CONF); -} -#endif - static inline unsigned long long get_fs_free_size(const char *path) { unsigned long long ret; @@ -318,8 +329,12 @@ static inline int fio_set_sched_idle(void) 
#define RWF_NOWAIT 0x00000008 #endif -#ifndef RWF_UNCACHED -#define RWF_UNCACHED 0x00000040 +#ifndef RWF_ATOMIC +#define RWF_ATOMIC 0x00000040 +#endif + +#ifndef RWF_DONTCACHE +#define RWF_DONTCACHE 0x00000080 #endif #ifndef RWF_WRITE_LIFE_SHIFT diff --git a/os/os-mac.h b/os/os-mac.h index ec2cc1e555..4e96228a34 100644 --- a/os/os-mac.h +++ b/os/os-mac.h @@ -14,12 +14,16 @@ #include #include +#include "../arch/arch.h" #include "../file.h" +#include "mac/posix.h" + #define FIO_USE_GENERIC_INIT_RANDOM_STATE #define FIO_HAVE_GETTID #define FIO_HAVE_CHARDEV_SIZE #define FIO_HAVE_NATIVE_FALLOCATE +#define FIO_HAVE_CPU_HAS #define OS_MAP_ANON MAP_ANON @@ -33,10 +37,6 @@ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) #endif -#ifndef CONFIG_CLOCKID_T -typedef unsigned int clockid_t; -#endif - #define FIO_OS_DIRECTIO static inline int fio_set_odirect(struct fio_file *f) { @@ -106,4 +106,14 @@ static inline bool fio_fallocate(struct fio_file *f, uint64_t offset, uint64_t l return false; } +static inline bool os_cpu_has(cpu_features feature) +{ + /* just check for arm on OSX for now, we know that has it */ + if (feature != CPU_ARM64_CRC32C) + return false; + return FIO_ARCH == arch_aarch64; +} + #endif + +#define CONFIG_POSIX_FADVISE diff --git a/os/os-netbsd.h b/os/os-netbsd.h index 624c7fa509..b553a4300b 100644 --- a/os/os-netbsd.h +++ b/os/os-netbsd.h @@ -13,7 +13,7 @@ #include #include -/* XXX hack to avoid confilcts between rbtree.h and */ +/* XXX hack to avoid conflicts between rbtree.h and */ #undef rb_node #undef rb_left #undef rb_right diff --git a/os/os-qnx.h b/os/os-qnx.h new file mode 100755 index 0000000000..8ae9695ec8 --- /dev/null +++ b/os/os-qnx.h @@ -0,0 +1,105 @@ +#ifndef FIO_OS_QNX_H +#define FIO_OS_QNX_H + +#define FIO_OS os_qnx +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* XXX hack to avoid conflicts between rbtree.h and */ +#undef RB_BLACK +#undef RB_RED +#undef RB_ROOT 
+ +#include "../file.h" + +/* QNX is not supporting SA_RESTART. Use SA_NOCLDSTOP instead of it */ +#ifndef SA_RESTART +#define SA_RESTART SA_NOCLDSTOP +#endif + +#define FIO_NO_HAVE_SHM_H + +typedef uint64_t __u64; +typedef unsigned int __u32; + +#define FIO_USE_GENERIC_INIT_RANDOM_STATE +#define FIO_HAVE_FS_STAT +#define FIO_HAVE_GETTID + +#define OS_MAP_ANON MAP_ANON + +#ifndef PTHREAD_STACK_MIN +#define PTHREAD_STACK_MIN 4096 +#endif + +#define fio_swap16(x) swap16(x) +#define fio_swap32(x) swap32(x) +#define fio_swap64(x) swap64(x) + +#ifdef CONFIG_PTHREAD_GETAFFINITY +#define FIO_HAVE_GET_THREAD_AFFINITY +#define fio_get_thread_affinity(mask) \ + pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask)) +#endif + +static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) +{ + struct stat statbuf; + + if (fstat(f->fd, &statbuf) == -1) { + *bytes = 0; + return errno; + } + + *bytes = (unsigned long long)(statbuf.st_blocksize * statbuf.st_nblocks); + return 0; +} + +static inline int blockdev_invalidate_cache(struct fio_file *f) +{ + return ENOTSUP; +} + +static inline unsigned long long os_phys_mem(void) +{ + uint64_t mem = 0; + const char *const strings = SYSPAGE_ENTRY(strings)->data; + const struct asinfo_entry *const begin = SYSPAGE_ENTRY(asinfo); + const struct asinfo_entry *const end = begin + SYSPAGE_ENTRY_SIZE(asinfo) / SYSPAGE_ELEMENT_SIZE(asinfo); + + assert(SYSPAGE_ELEMENT_SIZE(asinfo) == sizeof(struct asinfo_entry)); + + for (const struct asinfo_entry *e = begin; e < end; ++e) { + if (!strcmp(strings + e->name, "ram")) + mem += e->end - e->start + 1; + } + return mem; +} + +static inline unsigned long long get_fs_free_size(const char *path) +{ + unsigned long long ret; + struct statvfs s; + + if (statvfs(path, &s) < 0) + return -1ULL; + + ret = s.f_frsize; + ret *= (unsigned long long) s.f_bfree; + return ret; +} + +#ifdef MADV_FREE +#define FIO_MADV_FREE MADV_FREE +#endif + +#endif diff --git a/os/os-solaris.h 
b/os/os-solaris.h index ea1f081c89..60d4c1eca4 100644 --- a/os/os-solaris.h +++ b/os/os-solaris.h @@ -119,7 +119,7 @@ static inline int fio_set_odirect(struct fio_file *f) static inline bool fio_cpu_isset(os_cpu_mask_t *mask, int cpu) { - const unsigned int max_cpus = sysconf(_SC_NPROCESSORS_ONLN); + const unsigned int max_cpus = sysconf(_SC_NPROCESSORS_CONF); unsigned int num_cpus; processorid_t *cpus; bool ret; diff --git a/os/os-windows.h b/os/os-windows.h index 59da9dba1a..909c12e357 100644 --- a/os/os-windows.h +++ b/os/os-windows.h @@ -44,7 +44,7 @@ #define fio_swap64(x) _byteswap_uint64(x) #define _SC_PAGESIZE 0x1 -#define _SC_NPROCESSORS_ONLN 0x2 +#define _SC_NPROCESSORS_CONF 0x2 #define _SC_PHYS_PAGES 0x4 #define SA_RESTART 0 @@ -106,10 +106,11 @@ int fdatasync(int fildes); int lstat(const char * path, struct stat * buf); uid_t geteuid(void); char* ctime_r(const time_t *t, char *buf); -int nanosleep(const struct timespec *rqtp, struct timespec *rmtp); ssize_t pread(int fildes, void *buf, size_t nbyte, off_t offset); ssize_t pwrite(int fildes, const void *buf, size_t nbyte, off_t offset); +HANDLE windows_handle_connection(HANDLE hjob, int sk); +HANDLE windows_create_job(void); static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes) { @@ -217,9 +218,6 @@ static inline int fio_mkdir(const char *path, mode_t mode) { return 0; } -#define FIO_HAVE_CPU_ONLINE_SYSCONF -unsigned int cpus_online(void); - int first_set_cpu(os_cpu_mask_t *cpumask); int fio_setaffinity(int pid, os_cpu_mask_t cpumask); int fio_cpuset_init(os_cpu_mask_t *mask); diff --git a/os/os.h b/os/os.h index 5965d7b806..0736f8a1b8 100644 --- a/os/os.h +++ b/os/os.h @@ -24,6 +24,7 @@ enum { os_windows, os_android, os_dragonfly, + os_qnx, os_nr, }; @@ -33,14 +34,14 @@ typedef enum { } cpu_features; /* IWYU pragma: begin_exports */ -#if defined(__ANDROID__) -#include "os-android.h" -#elif defined(__linux__) +#if defined(__linux__) #include "os-linux.h" #elif defined(__FreeBSD__) 
#include "os-freebsd.h" #elif defined(__OpenBSD__) #include "os-openbsd.h" +#elif defined(__QNX__) +#include "os-qnx.h" #elif defined(__NetBSD__) #include "os-netbsd.h" #elif defined(__sun__) @@ -118,11 +119,20 @@ extern int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu); #endif #ifndef FIO_HAVE_IOPRIO_CLASS +#define ioprio_class(prio) 0 #define ioprio_value_is_class_rt(prio) (false) +#define IOPRIO_MIN_PRIO_CLASS 0 +#define IOPRIO_MAX_PRIO_CLASS 0 +#define ioprio_hint(prio) 0 +#define IOPRIO_MIN_PRIO_HINT 0 +#define IOPRIO_MAX_PRIO_HINT 0 #endif #ifndef FIO_HAVE_IOPRIO -#define ioprio_value(prioclass, prio) (0) -#define ioprio_set(which, who, prioclass, prio) (0) +#define ioprio_value(prioclass, prio, priohint) (0) +#define ioprio(ioprio) 0 +#define ioprio_set(which, who, prioclass, prio, priohint) (0) +#define IOPRIO_MIN_PRIO 0 +#define IOPRIO_MAX_PRIO 0 #endif #ifndef FIO_HAVE_ODIRECT @@ -131,12 +141,6 @@ extern int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu); #define OS_O_DIRECT O_DIRECT #endif -#ifdef OS_O_ATOMIC -#define FIO_O_ATOMIC OS_O_ATOMIC -#else -#define FIO_O_ATOMIC 0 -#endif - #ifndef FIO_HAVE_HUGETLB #define SHM_HUGETLB 0 #define MAP_HUGETLB 0 @@ -170,11 +174,7 @@ extern int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu); #endif #ifndef FIO_PREFERRED_CLOCK_SOURCE -#ifdef CONFIG_CLOCK_GETTIME #define FIO_PREFERRED_CLOCK_SOURCE CS_CGETTIME -#else -#define FIO_PREFERRED_CLOCK_SOURCE CS_GTOD -#endif #endif #ifndef CONFIG_SOCKLEN_T @@ -350,10 +350,12 @@ static inline unsigned long long get_fs_free_size(const char *path) } #endif -#ifndef FIO_HAVE_CPU_ONLINE_SYSCONF -static inline unsigned int cpus_online(void) +#ifndef FIO_HAVE_CPU_CONF_SYSCONF +static inline unsigned int cpus_configured(void) { - return sysconf(_SC_NPROCESSORS_ONLN); + int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + + return nr_cpus >= 1 ? 
nr_cpus : 1; } #endif @@ -361,7 +363,7 @@ static inline unsigned int cpus_online(void) #ifdef FIO_HAVE_CPU_AFFINITY static inline int CPU_COUNT(os_cpu_mask_t *mask) { - int max_cpus = cpus_online(); + int max_cpus = cpus_configured(); int nr_cpus, i; for (i = 0, nr_cpus = 0; i < max_cpus; i++) diff --git a/os/windows/cpu-affinity.c b/os/windows/cpu-affinity.c index 7601970fc7..8f3d6a76b4 100644 --- a/os/windows/cpu-affinity.c +++ b/os/windows/cpu-affinity.c @@ -2,12 +2,6 @@ #include -/* Return all processors regardless of processor group */ -unsigned int cpus_online(void) -{ - return GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); -} - static void print_mask(os_cpu_mask_t *cpumask) { for (int i = 0; i < FIO_CPU_MASK_ROWS; i++) diff --git a/os/windows/dlls.c b/os/windows/dlls.c index 774b1c612f..ffedfa1e8f 100644 --- a/os/windows/dlls.c +++ b/os/windows/dlls.c @@ -11,12 +11,18 @@ void os_clk_tck(long *clk_tck) */ unsigned long minRes, maxRes, curRes; HMODULE lib; - FARPROC queryTimer; - FARPROC setTimer; + NTSTATUS NTAPI (*queryTimer) + (OUT PULONG MinimumResolution, + OUT PULONG MaximumResolution, + OUT PULONG CurrentResolution); + NTSTATUS NTAPI (*setTimer) + (IN ULONG DesiredResolution, + IN BOOLEAN SetResolution, + OUT PULONG CurrentResolution); if (!(lib = LoadLibrary(TEXT("ntdll.dll"))) || - !(queryTimer = GetProcAddress(lib, "NtQueryTimerResolution")) || - !(setTimer = GetProcAddress(lib, "NtSetTimerResolution"))) { + !(queryTimer = (void *)GetProcAddress(lib, "NtQueryTimerResolution")) || + !(setTimer = (void *)GetProcAddress(lib, "NtSetTimerResolution"))) { dprint(FD_HELPERTHREAD, "Failed to load ntdll library, set to lower bound 64 Hz\n"); *clk_tck = 64; @@ -30,4 +36,4 @@ void os_clk_tck(long *clk_tck) setTimer(maxRes, 1, &curRes); *clk_tck = (long) (10000000L / maxRes); } -} \ No newline at end of file +} diff --git a/os/windows/examples.wxs b/os/windows/examples.wxs index 9308ba8be8..d70c77133f 100755 --- a/os/windows/examples.wxs +++ 
b/os/windows/examples.wxs @@ -125,9 +125,6 @@ - - - @@ -212,7 +209,6 @@ - diff --git a/os/windows/install.wxs b/os/windows/install.wxs index 7773bb3b86..d1b89dbccb 100755 --- a/os/windows/install.wxs +++ b/os/windows/install.wxs @@ -33,13 +33,13 @@ - + - + @@ -100,7 +100,7 @@ - + fio@vger.kernel.org http://www.spinics.net/lists/fio/ https://bluestop.org/fio/ diff --git a/os/windows/posix.c b/os/windows/posix.c index 09c2e4a785..ca3ee389df 100644 --- a/os/windows/posix.c +++ b/os/windows/posix.c @@ -216,10 +216,18 @@ long sysconf(int name) MEMORYSTATUSEX status; switch (name) { - case _SC_NPROCESSORS_ONLN: - val = GetNumLogicalProcessors(); + case _SC_NPROCESSORS_CONF: + /* + * Using GetMaximumProcessorCount introduces a problem in + * gettime.c because Windows does not have + * fio_get_thread_affinity. Log sample (see #1479): + * + * CPU mask contains processor beyond last active processor index (2) + * clock setaffinity failed: No error + */ + val = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); if (val == -1) - log_err("sysconf(_SC_NPROCESSORS_ONLN) failed\n"); + log_err("sysconf(_SC_NPROCESSORS_CONF) failed\n"); break; @@ -289,7 +297,7 @@ void Time_tToSystemTime(time_t dosTime, SYSTEMTIME *systemTime) LONGLONG jan1970; SYSTEMTIME tempSystemTime; - jan1970 = Int32x32To64(dosTime, 10000000) + 116444736000000000; + jan1970 = (dosTime * 10000000LL) + 116444736000000000LL; utcFT.dwLowDateTime = (DWORD)jan1970; utcFT.dwHighDateTime = jan1970 >> 32; @@ -537,48 +545,9 @@ int fcntl(int fildes, int cmd, ...) return 0; } -/* - * Get the value of a local clock source. - * This implementation supports 2 clocks: CLOCK_MONOTONIC provides high-accuracy - * relative time, while CLOCK_REALTIME provides a low-accuracy wall time. 
- */ -int clock_gettime(clockid_t clock_id, struct timespec *tp) -{ - int rc = 0; - - if (clock_id == CLOCK_MONOTONIC) { - static LARGE_INTEGER freq = {{0,0}}; - LARGE_INTEGER counts; - uint64_t t; - - QueryPerformanceCounter(&counts); - if (freq.QuadPart == 0) - QueryPerformanceFrequency(&freq); - - tp->tv_sec = counts.QuadPart / freq.QuadPart; - /* Get the difference between the number of ns stored - * in 'tv_sec' and that stored in 'counts' */ - t = tp->tv_sec * freq.QuadPart; - t = counts.QuadPart - t; - /* 't' now contains the number of cycles since the last second. - * We want the number of nanoseconds, so multiply out by 1,000,000,000 - * and then divide by the frequency. */ - t *= 1000000000; - tp->tv_nsec = t / freq.QuadPart; - } else if (clock_id == CLOCK_REALTIME) { - /* clock_gettime(CLOCK_REALTIME,...) is just an alias for gettimeofday with a - * higher-precision field. */ - struct timeval tv; - gettimeofday(&tv, NULL); - tp->tv_sec = tv.tv_sec; - tp->tv_nsec = tv.tv_usec * 1000; - } else { - errno = EINVAL; - rc = -1; - } - - return rc; -} +#ifndef CLOCK_MONOTONIC_RAW +#define CLOCK_MONOTONIC_RAW 4 +#endif int mlock(const void * addr, size_t len) { @@ -817,18 +786,24 @@ ssize_t pwrite(int fildes, const void *buf, size_t nbyte, off_t offset) { int64_t pos = _telli64(fildes); - ssize_t len = _write(fildes, buf, nbyte); + ssize_t len; + _lseeki64(fildes, offset, SEEK_SET); + len = _write(fildes, buf, nbyte); _lseeki64(fildes, pos, SEEK_SET); + return len; } ssize_t pread(int fildes, void *buf, size_t nbyte, off_t offset) { int64_t pos = _telli64(fildes); - ssize_t len = read(fildes, buf, nbyte); + ssize_t len; + _lseeki64(fildes, offset, SEEK_SET); + len = read(fildes, buf, nbyte); _lseeki64(fildes, pos, SEEK_SET); + return len; } @@ -860,10 +835,12 @@ ssize_t writev(int fildes, const struct iovec *iov, int iovcnt) return bytes_written; } +#ifndef _WIN32 long long strtoll(const char *restrict str, char **restrict endptr, int base) { return _strtoi64(str, 
endptr, base); } +#endif int poll(struct pollfd fds[], nfds_t nfds, int timeout) { @@ -884,10 +861,9 @@ int poll(struct pollfd fds[], nfds_t nfds, int timeout) FD_ZERO(&exceptfds); for (i = 0; i < nfds; i++) { - if (fds[i].fd == INVALID_SOCKET) { - fds[i].revents = 0; + fds[i].revents = 0; + if (fds[i].fd == INVALID_SOCKET) continue; - } if (fds[i].events & POLLIN) FD_SET(fds[i].fd, &readfds); @@ -917,34 +893,6 @@ int poll(struct pollfd fds[], nfds_t nfds, int timeout) return rc; } -int nanosleep(const struct timespec *rqtp, struct timespec *rmtp) -{ - struct timespec tv; - DWORD ms_remaining; - DWORD ms_total = (rqtp->tv_sec * 1000) + (rqtp->tv_nsec / 1000000.0); - - if (ms_total == 0) - ms_total = 1; - - ms_remaining = ms_total; - - /* Since Sleep() can sleep for less than the requested time, add a loop to - ensure we only return after the requested length of time has elapsed */ - do { - fio_gettime(&tv, NULL); - Sleep(ms_remaining); - ms_remaining = ms_total - mtime_since_now(&tv); - } while (ms_remaining > 0 && ms_remaining < ms_total); - - /* this implementation will never sleep for less than the requested time */ - if (rmtp != NULL) { - rmtp->tv_sec = 0; - rmtp->tv_nsec = 0; - } - - return 0; -} - DIR *opendir(const char *dirname) { struct dirent_ctx *dc = NULL; @@ -1026,3 +974,174 @@ in_addr_t inet_network(const char *cp) hbo = ((nbo & 0xFF) << 24) + ((nbo & 0xFF00) << 8) + ((nbo & 0xFF0000) >> 8) + ((nbo & 0xFF000000) >> 24); return hbo; } + +static HANDLE create_named_pipe(char *pipe_name, int wait_connect_time) +{ + HANDLE hpipe; + + hpipe = CreateNamedPipe ( + pipe_name, + PIPE_ACCESS_DUPLEX, + PIPE_WAIT | PIPE_TYPE_BYTE, + 1, 0, 0, wait_connect_time, NULL); + + if (hpipe == INVALID_HANDLE_VALUE) { + log_err("ConnectNamedPipe failed (%lu).\n", GetLastError()); + return INVALID_HANDLE_VALUE; + } + + if (!ConnectNamedPipe(hpipe, NULL)) { + log_err("ConnectNamedPipe failed (%lu).\n", GetLastError()); + CloseHandle(hpipe); + return INVALID_HANDLE_VALUE; + } 
+ + return hpipe; +} + +static BOOL windows_create_process(PROCESS_INFORMATION *pi, const char *args, HANDLE *hjob) +{ + LPSTR this_cmd_line = GetCommandLine(); + LPSTR new_process_cmd_line = malloc((strlen(this_cmd_line)+strlen(args)) * sizeof(char *)); + STARTUPINFO si = {0}; + DWORD flags = 0; + + strcpy(new_process_cmd_line, this_cmd_line); + strcat(new_process_cmd_line, args); + + si.cb = sizeof(si); + memset(pi, 0, sizeof(*pi)); + + if ((hjob != NULL) && (*hjob != INVALID_HANDLE_VALUE)) + flags = CREATE_SUSPENDED | CREATE_BREAKAWAY_FROM_JOB; + + flags |= CREATE_NEW_CONSOLE; + + if( !CreateProcess( NULL, + new_process_cmd_line, + NULL, /* Process handle not inherited */ + NULL, /* Thread handle not inherited */ + TRUE, /* no handle inheritance */ + flags, + NULL, /* Use parent's environment block */ + NULL, /* Use parent's starting directory */ + &si, + pi ) + ) + { + log_err("CreateProcess failed (%lu).\n", GetLastError() ); + free(new_process_cmd_line); + return 1; + } + if ((hjob != NULL) && (*hjob != INVALID_HANDLE_VALUE)) { + BOOL ret = AssignProcessToJobObject(*hjob, pi->hProcess); + if (!ret) { + log_err("AssignProcessToJobObject failed (%lu).\n", GetLastError() ); + return 1; + } + + ResumeThread(pi->hThread); + } + + free(new_process_cmd_line); + return 0; +} + +HANDLE windows_create_job(void) +{ + JOBOBJECT_EXTENDED_LIMIT_INFORMATION jeli = { 0 }; + BOOL success; + HANDLE hjob = CreateJobObject(NULL, NULL); + + jeli.BasicLimitInformation.LimitFlags = JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE; + success = SetInformationJobObject(hjob, JobObjectExtendedLimitInformation, &jeli, sizeof(jeli)); + if ( success == 0 ) { + log_err( "SetInformationJobObject failed: error %lu\n", GetLastError() ); + return INVALID_HANDLE_VALUE; + } + return hjob; +} + +/* wait for a child process to either exit or connect to a child */ +static bool monitor_process_till_connect(PROCESS_INFORMATION *pi, HANDLE *hpipe) +{ + bool connected = FALSE; + bool process_alive = TRUE; + char 
buffer[32] = {0}; + DWORD bytes_read; + + do { + DWORD exit_code; + GetExitCodeProcess(pi->hProcess, &exit_code); + if (exit_code != STILL_ACTIVE) { + dprint(FD_PROCESS, "process %u exited %d\n", GetProcessId(pi->hProcess), exit_code); + break; + } + + memset(buffer, 0, sizeof(buffer)); + ReadFile(*hpipe, &buffer, sizeof(buffer) - 1, &bytes_read, NULL); + if (bytes_read && strstr(buffer, "connected")) { + dprint(FD_PROCESS, "process %u connected to client\n", GetProcessId(pi->hProcess)); + connected = TRUE; + } + usleep(10*1000); + } while (process_alive && !connected); + return connected; +} + +/*create a process with --server-internal to emulate fork() */ +HANDLE windows_handle_connection(HANDLE hjob, int sk) +{ + char pipe_name[64] = "\\\\.\\pipe\\fiointernal-"; + char args[128] = " --server-internal="; + PROCESS_INFORMATION pi; + HANDLE hpipe = INVALID_HANDLE_VALUE; + WSAPROTOCOL_INFO protocol_info; + HANDLE ret; + + sprintf(pipe_name+strlen(pipe_name), "%d", GetCurrentProcessId()); + sprintf(args+strlen(args), "%s", pipe_name); + + if (windows_create_process(&pi, args, &hjob) != 0) + return INVALID_HANDLE_VALUE; + else + ret = pi.hProcess; + + /* duplicate socket and write the protocol_info to pipe so child can + * duplicate the communication socket */ + if (WSADuplicateSocket(sk, GetProcessId(pi.hProcess), &protocol_info)) { + log_err("WSADuplicateSocket failed (%lu).\n", GetLastError()); + ret = INVALID_HANDLE_VALUE; + goto cleanup; + } + + /* make a pipe with a unique name based upon processid */ + hpipe = create_named_pipe(pipe_name, 1000); + if (hpipe == INVALID_HANDLE_VALUE) { + ret = INVALID_HANDLE_VALUE; + goto cleanup; + } + + if (!WriteFile(hpipe, &protocol_info, sizeof(protocol_info), NULL, NULL)) { + log_err("WriteFile failed (%lu).\n", GetLastError()); + ret = INVALID_HANDLE_VALUE; + goto cleanup; + } + + dprint(FD_PROCESS, "process %d created child process %u\n", GetCurrentProcessId(), GetProcessId(pi.hProcess)); + + /* monitor the process until 
it either exits or connects. This level + * doesnt care which of those occurs because the result is that it + * needs to loop around and create another child process to monitor */ + if (!monitor_process_till_connect(&pi, &hpipe)) + ret = INVALID_HANDLE_VALUE; + +cleanup: + /* close the handles and pipes because this thread is done monitoring them */ + if (ret == INVALID_HANDLE_VALUE) + CloseHandle(pi.hProcess); + CloseHandle(pi.hThread); + DisconnectNamedPipe(hpipe); + CloseHandle(hpipe); + return ret; +} diff --git a/os/windows/posix.h b/os/windows/posix.h index 02a9075be2..afb00d5a36 100644 --- a/os/windows/posix.h +++ b/os/windows/posix.h @@ -3,7 +3,6 @@ typedef int clockid_t; -extern int clock_gettime(clockid_t clock_id, struct timespec *tp); extern int inet_aton(const char *, struct in_addr *); extern int win_to_posix_error(DWORD winerr); diff --git a/os/windows/posix/include/syslog.h b/os/windows/posix/include/syslog.h index b8582e9540..03a04f69f8 100644 --- a/os/windows/posix/include/syslog.h +++ b/os/windows/posix/include/syslog.h @@ -1,7 +1,7 @@ #ifndef SYSLOG_H #define SYSLOG_H -int syslog(); +int syslog(int priority, const char *format, ...); #define LOG_INFO 0x1 #define LOG_ERROR 0x2 diff --git a/oslib/blkzoned.h b/oslib/blkzoned.h index 719b041d12..a8e4a94809 100644 --- a/oslib/blkzoned.h +++ b/oslib/blkzoned.h @@ -16,8 +16,16 @@ extern int blkzoned_report_zones(struct thread_data *td, struct zbd_zone *zones, unsigned int nr_zones); extern int blkzoned_reset_wp(struct thread_data *td, struct fio_file *f, uint64_t offset, uint64_t length); +extern int blkzoned_move_zone_wp(struct thread_data *td, struct fio_file *f, + struct zbd_zone *z, uint64_t length, + const char *buf); extern int blkzoned_get_max_open_zones(struct thread_data *td, struct fio_file *f, unsigned int *max_open_zones); +extern int blkzoned_get_max_active_zones(struct thread_data *td, + struct fio_file *f, + unsigned int *max_active_zones); +extern int blkzoned_finish_zone(struct 
thread_data *td, struct fio_file *f, + uint64_t offset, uint64_t length); #else /* * Define stubs for systems that do not have zoned block device support. @@ -46,11 +54,29 @@ static inline int blkzoned_reset_wp(struct thread_data *td, struct fio_file *f, { return -EIO; } +static inline int blkzoned_move_zone_wp(struct thread_data *td, + struct fio_file *f, struct zbd_zone *z, + uint64_t length, const char *buf) +{ + return -EIO; +} static inline int blkzoned_get_max_open_zones(struct thread_data *td, struct fio_file *f, unsigned int *max_open_zones) { return -EIO; } +static inline int blkzoned_get_max_active_zones(struct thread_data *td, + struct fio_file *f, + unsigned int *max_open_zones) +{ + return -EIO; +} +static inline int blkzoned_finish_zone(struct thread_data *td, + struct fio_file *f, + uint64_t offset, uint64_t length) +{ + return -EIO; +} #endif #endif /* FIO_BLKZONED_H */ diff --git a/oslib/libmtd.h b/oslib/libmtd.h index a0c90dcb9d..668e77981f 100644 --- a/oslib/libmtd.h +++ b/oslib/libmtd.h @@ -256,7 +256,7 @@ int mtd_mark_bad(const struct mtd_dev_info *mtd, int fd, int eb); * @mtd: MTD device description object * @fd: MTD device node file descriptor * @eb: eraseblock to read from - * @offs: offset withing the eraseblock to read from + * @offs: offset within the eraseblock to read from * @buf: buffer to read data to * @len: how many bytes to read * @@ -273,7 +273,7 @@ int mtd_read(const struct mtd_dev_info *mtd, int fd, int eb, int offs, * @mtd: MTD device description object * @fd: MTD device node file descriptor * @eb: eraseblock to write to - * @offs: offset withing the eraseblock to write to + * @offs: offset within the eraseblock to write to * @data: data buffer to write * @len: how many data bytes to write * @oob: OOB buffer to write @@ -329,7 +329,7 @@ int mtd_write_oob(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, * @mtd: MTD device description object * @fd: MTD device node file descriptor * @eb: eraseblock to write to - * @offs: 
offset withing the eraseblock to write to + * @offs: offset within the eraseblock to write to * @img_name: the file to write * * This function writes an image @img_name the MTD device defined by @mtd. @eb diff --git a/oslib/linux-blkzoned.c b/oslib/linux-blkzoned.c index 185bd5011b..c45ef623de 100644 --- a/oslib/linux-blkzoned.c +++ b/oslib/linux-blkzoned.c @@ -22,6 +22,10 @@ #include "zbd_types.h" #include +#ifndef BLKFINISHZONE +#define BLKFINISHZONE _IOW(0x12, 136, struct blk_zone_range) +#endif +#include /* * If the uapi headers installed on the system lacks zone capacity support, @@ -183,6 +187,29 @@ int blkzoned_get_max_open_zones(struct thread_data *td, struct fio_file *f, return 0; } +int blkzoned_get_max_active_zones(struct thread_data *td, struct fio_file *f, + unsigned int *max_active_zones) +{ + char *max_active_str; + + if (f->filetype != FIO_TYPE_BLOCK) + return -EIO; + + max_active_str = blkzoned_get_sysfs_attr(f->file_name, "queue/max_active_zones"); + if (!max_active_str) { + *max_active_zones = 0; + return 0; + } + + dprint(FD_ZBD, "%s: max active zones supported by device: %s\n", + f->file_name, max_active_str); + *max_active_zones = atoll(max_active_str); + + free(max_active_str); + + return 0; +} + static uint64_t zone_capacity(struct blk_zone_report *hdr, struct blk_zone *blkz) { @@ -216,6 +243,8 @@ int blkzoned_report_zones(struct thread_data *td, struct fio_file *f, hdr->sector = offset >> 9; ret = ioctl(fd, BLKREPORTZONE, hdr); if (ret) { + log_err("%s: BLKREPORTZONE ioctl failed, ret=%d, err=%d.\n", + f->file_name, ret, -errno); ret = -errno; goto out; } @@ -308,3 +337,66 @@ int blkzoned_reset_wp(struct thread_data *td, struct fio_file *f, return ret; } + +int blkzoned_finish_zone(struct thread_data *td, struct fio_file *f, + uint64_t offset, uint64_t length) +{ + struct blk_zone_range zr = { + .sector = offset >> 9, + .nr_sectors = length >> 9, + }; + int fd, ret = 0; + + /* If the file is not yet opened, open it for this function. 
*/ + fd = f->fd; + if (fd < 0) { + fd = open(f->file_name, O_RDWR | O_LARGEFILE); + if (fd < 0) + return -errno; + } + + if (ioctl(fd, BLKFINISHZONE, &zr) < 0) { + ret = -errno; + /* + * Kernels older than 5.5 lack BLKFINISHZONE and fail with ENOTTY; + * they only support devices that close zones automatically, so + * treat that case as success. ret is -errno, hence -ENOTTY. + */ + if (ret == -ENOTTY) + ret = 0; + } + + if (f->fd < 0) + close(fd); + + return ret; +} + +int blkzoned_move_zone_wp(struct thread_data *td, struct fio_file *f, + struct zbd_zone *z, uint64_t length, const char *buf) +{ + int fd, ret = 0; + + /* If the file is not yet open, open it for this function */ + fd = f->fd; + if (fd < 0) { + fd = open(f->file_name, O_WRONLY | O_DIRECT); + if (fd < 0) + return -errno; + } + + /* If write data is not provided, fill zero to move the write pointer */ + if (!buf) { + if (fallocate(fd, FALLOC_FL_ZERO_RANGE, z->wp, length) < 0) + ret = -errno; + goto out; + } + if (pwrite(fd, buf, length, z->wp) < 0) + ret = -errno; + +out: + if (f->fd < 0) + close(fd); + + return ret; +} diff --git a/oslib/linux-dev-lookup.c b/oslib/linux-dev-lookup.c index 1dda93f2a0..4335faf99b 100644 --- a/oslib/linux-dev-lookup.c +++ b/oslib/linux-dev-lookup.c @@ -16,6 +16,16 @@ int blktrace_lookup_device(const char *redirect, char *path, unsigned int maj, int found = 0; DIR *D; + /* + * If replay_redirect is set then always return this device + * upon lookup which overrides the device lookup based on + * major minor in the actual blktrace + */ + if (redirect) { + strcpy(path, redirect); + return 1; + } + D = opendir(path); if (!D) return 0; @@ -44,17 +54,6 @@ int blktrace_lookup_device(const char *redirect, char *path, unsigned int maj, if (!S_ISBLK(st.st_mode)) continue; - /* - * If replay_redirect is set then always return this device - * upon lookup which overrides the device lookup based on - * major minor in the actual blktrace - */ - if (redirect) { - strcpy(path, redirect); - found = 1; - break; - } - 
if (maj == major(st.st_rdev) && min == minor(st.st_rdev)) { strcpy(path, full_path); found = 1; diff --git a/parse.c b/parse.c index d086ee488f..5bb55bffac 100644 --- a/parse.c +++ b/parse.c @@ -480,14 +480,17 @@ static size_t opt_len(const char *str) char delimiter[] = {',', ':'}; char *postfix; unsigned int i; + size_t candidate_len; + size_t prefix_len = strlen(str); for (i = 0; i < FIO_ARRAY_SIZE(delimiter); i++) { postfix = strchr(str, delimiter[i]); - if (postfix) - return (int)(postfix - str); + candidate_len = postfix ? (size_t)(postfix - str) : prefix_len; /* no pointer math on NULL */ + if (candidate_len < prefix_len) + prefix_len = candidate_len; } - return strlen(str); + return prefix_len; } static int str_match_len(const struct value_pair *vp, const char *str) @@ -601,7 +604,7 @@ static int __handle_option(const struct fio_option *o, const char *ptr, } case FIO_OPT_STR_VAL_TIME: is_time = 1; - fallthrough; + fio_fallthrough; case FIO_OPT_ULL: case FIO_OPT_INT: case FIO_OPT_STR_VAL: @@ -817,6 +820,8 @@ static int __handle_option(const struct fio_option *o, const char *ptr, if (o->off1) { cp = td_var(data, o, o->off1); + if (*cp) + free(*cp); *cp = strdup(ptr); if (strlen(ptr) > o->maxlen - 1) { log_err("value exceeds max length of %d\n", @@ -978,7 +983,7 @@ static int __handle_option(const struct fio_option *o, const char *ptr, } case FIO_OPT_DEPRECATED: ret = 1; - fallthrough; + fio_fallthrough; case FIO_OPT_SOFT_DEPRECATED: log_info("Option %s is deprecated\n", o->name); break; diff --git a/parse.h b/parse.h index d68484eaf0..806a76ee09 100644 --- a/parse.h +++ b/parse.h @@ -32,7 +32,7 @@ enum fio_opt_type { */ struct value_pair { const char *ival; /* string option */ - unsigned long long oval;/* output value */ + unsigned long long oval; /* output value */ const char *help; /* help text for sub option */ int orval; /* OR value */ void *cb; /* sub-option callback */ diff --git a/pcbuf.h b/pcbuf.h new file mode 100644 index 0000000000..df23b2334a --- /dev/null +++ b/pcbuf.h @@ -0,0 +1,211 @@ 
+/** + * SPDX-License-Identifier: GPL-2.0 only + * + * Copyright (c) 2025 Sandisk Corporation or its affiliates. + */ +/** + * Two-phase circular buffer implementation for producer/consumer separation. + * + * This header defines the data structures and inline functions for a two-phase + * circular buffer, allowing staged writes and explicit commit of data batches. + * Useful for double-buffered systems or scenarios requiring controlled visibility + * of produced data to consumers. + */ +#ifndef PHASE_CIRCULAR_BUFFER_H +#define PHASE_CIRCULAR_BUFFER_H + +#include +#include +#include +#include +#include + +/** + * struct pc_buf - Two-phase circular buffer. + * @commit_head: Index of the next committed element in the buffer (visible to consumer). + * @staging_head: Index of the next staged (but not yet committed) element (written by producer). + * @read_tail: Index of the next element to be read by the consumer. + * @capacity: Total capacity of the buffer (number of elements). + * @buffer: Buffer data. + * + * This structure implements a two-phase circular buffer, where data is first staged + * by advancing @staging_head, and only becomes visible to the consumer when @commit_head + * is explicitly updated. This allows for controlled commit of data batches, useful in + * double-buffered systems or producer/consumer separation. + */ +struct pc_buf { + uint64_t commit_head; + uint64_t staging_head; + uint64_t read_tail; + uint64_t capacity; + uint64_t buffer[]; +}; + +/** + * pcb_alloc - Allocate and initialize buffer. + * @capacity: Number of elements the buffer can hold. + * + * Returns a pointer to the allocated buffer, or NULL on failure. 
+ */ +static inline struct pc_buf *pcb_alloc(uint64_t capacity) +{ + size_t size = sizeof(struct pc_buf) + sizeof(uint64_t) * capacity; + struct pc_buf *cb = (struct pc_buf *)malloc(size); + + if (!cb) + return NULL; + cb->commit_head = 0; + cb->staging_head = 0; + cb->read_tail = 0; + cb->capacity = capacity; + return cb; +} + +/** + * pcb_is_empty - Check if the buffer is empty. + * @cb: pointer to the pc_buf structure. + * + * Returns true if the buffer has no committed data. + */ +static inline bool pcb_is_empty(const struct pc_buf *cb) +{ + return cb->read_tail == cb->commit_head; +} + +/** + * pcb_is_full - Check if the buffer is full. + * @cb: pointer to the pc_buf structure. + * + * Returns true if the buffer cannot accept more staged data. + */ + +static inline bool pcb_is_full(const struct pc_buf *cb) +{ + return ((cb->staging_head + 1) % cb->capacity) == cb->read_tail; +} + +/** + * pcb_push_staged - Push a value into the staged buffer. + * @cb: pointer to the pc_buf structure. + * @value: value to be staged. + * + * Returns true if the value was successfully staged, false if the buffer is full. + */ +static inline bool pcb_push_staged(struct pc_buf *cb, uint64_t value) +{ + if (pcb_is_full(cb)) + return false; + + cb->buffer[cb->staging_head] = value; + cb->staging_head = (cb->staging_head + 1) % cb->capacity; + return true; +} + +/** + * pcb_commit - Commit the staged data to make it visible to consumers. + * @cb: pointer to the pc_buf structure. + * + * Updates the commit head to the current staging head, making + * all staged data visible to consumers. It should be called after staging data. + */ +static inline void pcb_commit(struct pc_buf *cb) +{ + cb->commit_head = cb->staging_head; +} + +/** + * pcb_pop - Pop a value from the committed buffer. + * @cb: pointer to the pc_buf structure. + * @out: pointer to the variable to store the popped value. + * + * Returns true if a value was successfully popped, false if the buffer is empty. 
+ */ +static inline bool pcb_pop(struct pc_buf *cb, uint64_t *out) +{ + if (pcb_is_empty(cb)) + return false; + + *out = cb->buffer[cb->read_tail]; + cb->read_tail = (cb->read_tail + 1) % cb->capacity; + return true; +} + +/** + * pcb_print_committed - Print the contents of the committed buffer. + * @cb: pointer to the pc_buf structure. + * + * This function prints all committed data in the buffer. + */ +static inline void pcb_print_committed(const struct pc_buf *cb) +{ + uint64_t i = cb->read_tail; + + printf("Committed buffer: "); + while (i != cb->commit_head) { + printf("%" PRIu64 " ", cb->buffer[i]); + i = (i + 1) % cb->capacity; + } + printf("\n"); +} + +/** + * pcb_print_staged - Print the contents of the staged buffer. + * @cb: pointer to the pc_buf structure. + * + * This function prints all staged data that has not yet been committed. + */ +static inline void pcb_print_staged(const struct pc_buf *cb) +{ + uint64_t i = cb->commit_head; + + printf("Staged (not visible yet): "); + while (i != cb->staging_head) { + printf("%" PRIu64 " ", cb->buffer[i]); + i = (i + 1) % cb->capacity; + } + printf("\n"); +} + +/** + * pcb_committed_size - Get the size of committed data in the buffer. + * @cb: pointer to the pc_buf structure. + * + * Returns the number of elements that have been committed and are visible to consumers. + */ +static inline uint64_t pcb_committed_size(const struct pc_buf *cb) +{ + if (cb->commit_head >= cb->read_tail) + return cb->commit_head - cb->read_tail; + else + return cb->capacity - cb->read_tail + cb->commit_head; +} + +/** + * pcb_staged_size - Get the size of staged data in the buffer. + * @cb: pointer to the pc_buf structure. + * + * Returns the number of elements that have been staged but not yet committed. 
+ */ +static inline uint64_t pcb_staged_size(const struct pc_buf *cb) +{ + if (cb->staging_head >= cb->commit_head) + return cb->staging_head - cb->commit_head; + else + return cb->capacity - cb->commit_head + cb->staging_head; +} + +/** + * pcb_space_available - Check if there is space available for staging. + * @cb: pointer to the pc_buf structure. + * + * Returns true if there is space available for staging new data, false if the buffer is full. + */ +static inline bool pcb_space_available(const struct pc_buf *cb) +{ + uint64_t used = pcb_committed_size(cb) + pcb_staged_size(cb); + /* keep 1 slot reserved to distinguish full from empty */ + return used < (cb->capacity - 1); +} + +#endif /* PHASE_CIRCULAR_BUFFER_H */ + diff --git a/rate-submit.c b/rate-submit.c index 13dbe7a2e9..92be3df75e 100644 --- a/rate-submit.c +++ b/rate-submit.c @@ -5,6 +5,9 @@ * */ #include +#include +#include + #include "fio.h" #include "ioengines.h" #include "lib/getrusage.h" @@ -12,8 +15,7 @@ static void check_overlap(struct io_u *io_u) { - int i, res; - struct thread_data *td; + int res; /* * Allow only one thread to check for overlap at a time to prevent two @@ -28,10 +30,13 @@ static void check_overlap(struct io_u *io_u) * threads as they assess overlap. 
*/ res = pthread_mutex_lock(&overlap_check); - assert(res == 0); + if (fio_unlikely(res != 0)) { + log_err("failed to lock overlap check mutex, err: %i:%s\n", res, strerror(res)); + abort(); + } retry: - for_each_td(td, i) { + for_each_td(td) { if (td->runstate <= TD_SETTING_UP || td->runstate >= TD_FINISHING || !td->o.serialize_overlap || @@ -42,11 +47,17 @@ static void check_overlap(struct io_u *io_u) continue; res = pthread_mutex_unlock(&overlap_check); - assert(res == 0); + if (fio_unlikely(res != 0)) { + log_err("failed to unlock overlap check mutex, err: %i:%s\n", res, strerror(res)); + abort(); + } res = pthread_mutex_lock(&overlap_check); - assert(res == 0); + if (fio_unlikely(res != 0)) { + log_err("failed to lock overlap check mutex, err: %i:%s\n", res, strerror(res)); + abort(); + } goto retry; - } + } end_for_each(); } static int io_workqueue_fn(struct submit_worker *sw, @@ -154,6 +165,7 @@ static int io_workqueue_init_worker_fn(struct submit_worker *sw) dup_files(td, parent); td->eo = parent->eo; fio_options_mem_dupe(td); + td->iolog_f = parent->iolog_f; if (ioengine_load(td)) goto err; @@ -173,7 +185,7 @@ static int io_workqueue_init_worker_fn(struct submit_worker *sw) if (td->io_ops->post_init && td->io_ops->post_init(td)) goto err_io_init; - set_epoch_time(td, td->o.log_unix_epoch); + set_epoch_time(td, td->o.log_alternate_epoch_clock_id, td->o.job_start_clock_id); fio_getrusage(&td->ru_start); clear_io_state(td, 1); @@ -195,7 +207,16 @@ static void io_workqueue_exit_worker_fn(struct submit_worker *sw, struct thread_data *td = sw->priv; (*sum_cnt)++; - sum_thread_stats(&sw->wq->td->ts, &td->ts, *sum_cnt == 1); + + /* + * io_workqueue_update_acct_fn() doesn't support per prio stats, and + * even if it did, offload can't be used with all async IO engines. + * If group reporting is set in the parent td, the group result + * generated by __show_run_stats() can still contain multiple prios + * from different offloaded jobs. 
+ */ + sw->wq->td->ts.disable_prio_stat = 1; + sum_thread_stats(&sw->wq->td->ts, &td->ts); fio_options_free(td); close_and_free_files(td); @@ -254,6 +275,8 @@ static void sum_ddir(struct thread_data *dst, struct thread_data *src, sum_val(&dst->this_io_blocks[ddir], &src->this_io_blocks[ddir]); sum_val(&dst->this_io_bytes[ddir], &src->this_io_bytes[ddir]); sum_val(&dst->bytes_done[ddir], &src->bytes_done[ddir]); + if (ddir == DDIR_READ) + sum_val(&dst->bytes_verified, &src->bytes_verified); pthread_double_unlock(&dst->io_wq.stat_lock, &src->io_wq.stat_lock); } diff --git a/server.c b/server.c index 90c52e01ac..cde7fdf30c 100644 --- a/server.c +++ b/server.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -63,12 +64,28 @@ static char me[128]; static pthread_key_t sk_out_key; +#ifdef WIN32 +static char *fio_server_pipe_name = NULL; +static HANDLE hjob = INVALID_HANDLE_VALUE; +struct ffi_element { + union { + pthread_t thread; + HANDLE hProcess; + }; + bool is_thread; +}; +#endif + struct fio_fork_item { struct flist_head list; int exitval; int signal; int exited; +#ifdef WIN32 + struct ffi_element element; +#else pid_t pid; +#endif }; struct cmd_reply { @@ -250,6 +267,28 @@ static int fio_send_data(int sk, const void *p, unsigned int len) return fio_sendv_data(sk, &iov, 1); } +bool fio_server_poll_fd(int fd, short events, int timeout) +{ + struct pollfd pfd = { + .fd = fd, + .events = events, + }; + int ret; + + ret = poll(&pfd, 1, timeout); + if (ret < 0) { + if (errno == EINTR) + return false; + log_err("fio: poll: %s\n", strerror(errno)); + return false; + } else if (!ret) { + return false; + } + if (pfd.revents & events) + return true; + return false; +} + static int fio_recv_data(int sk, void *buf, unsigned int len, bool wait) { int flags; @@ -651,6 +690,63 @@ static int fio_net_queue_stop(int error, int signal) return fio_net_send_ack(NULL, error, signal); } +#ifdef WIN32 +static void fio_server_add_fork_item(struct ffi_element 
*element, struct flist_head *list) +{ + struct fio_fork_item *ffi; + + ffi = malloc(sizeof(*ffi)); + ffi->exitval = 0; + ffi->signal = 0; + ffi->exited = 0; + ffi->element = *element; + flist_add_tail(&ffi->list, list); +} + +static void fio_server_add_conn_pid(struct flist_head *conn_list, HANDLE hProcess) +{ + struct ffi_element element = {.hProcess = hProcess, .is_thread=FALSE}; + dprint(FD_NET, "server: forked off connection job (pid=%u)\n", (int) GetProcessId(hProcess)); + + fio_server_add_fork_item(&element, conn_list); +} + +static void fio_server_add_job_pid(struct flist_head *job_list, pthread_t thread) +{ + struct ffi_element element = {.thread = thread, .is_thread=TRUE}; + dprint(FD_NET, "server: forked off job job (tid=%u)\n", (int) element.thread); + fio_server_add_fork_item(&element, job_list); +} + +static void fio_server_check_fork_item(struct fio_fork_item *ffi) +{ + int ret; + + if (ffi->element.is_thread) { + + ret = pthread_kill(ffi->element.thread, 0); + if (ret) { + void *rev_val = NULL; /* pthread_join stores a full pointer-sized value */ + pthread_join(ffi->element.thread, &rev_val); /*if the thread is dead, then join it to get status*/ + + ffi->exitval = (int) (intptr_t) rev_val; + if (ffi->exitval) + log_err("thread (tid=%u) exited with %x\n", (int) ffi->element.thread, (int) ffi->exitval); + dprint(FD_PROCESS, "thread (tid=%u) exited with %x\n", (int) ffi->element.thread, (int) ffi->exitval); + ffi->exited = 1; + } + } else { + DWORD exit_val; + GetExitCodeProcess(ffi->element.hProcess, &exit_val); + + if (exit_val != STILL_ACTIVE) { + dprint(FD_PROCESS, "process %u exited with %d\n", GetProcessId(ffi->element.hProcess), exit_val); + ffi->exited = 1; + ffi->exitval = exit_val; + } + } +} +#else static void fio_server_add_fork_item(pid_t pid, struct flist_head *list) { struct fio_fork_item *ffi; @@ -698,10 +794,21 @@ static void fio_server_check_fork_item(struct fio_fork_item *ffi) } } } +#endif static void fio_server_fork_item_done(struct fio_fork_item *ffi, bool stop) { +#ifdef WIN32 + if (ffi->element.is_thread) 
+ dprint(FD_NET, "tid %u exited, sig=%u, exitval=%d\n", (int) ffi->element.thread, ffi->signal, ffi->exitval); + else { + dprint(FD_NET, "pid %u exited, sig=%u, exitval=%d\n", (int) GetProcessId(ffi->element.hProcess), ffi->signal, ffi->exitval); + CloseHandle(ffi->element.hProcess); + ffi->element.hProcess = INVALID_HANDLE_VALUE; + } +#else dprint(FD_NET, "pid %u exited, sig=%u, exitval=%d\n", (int) ffi->pid, ffi->signal, ffi->exitval); +#endif /* * Fold STOP and QUIT... @@ -762,27 +869,62 @@ static int handle_load_file_cmd(struct fio_net_cmd *cmd) return 0; } -static int handle_run_cmd(struct sk_out *sk_out, struct flist_head *job_list, - struct fio_net_cmd *cmd) +#ifdef WIN32 +static void *fio_backend_thread(void *data) { - pid_t pid; int ret; + struct sk_out *sk_out = (struct sk_out *) data; sk_out_assign(sk_out); + ret = fio_backend(sk_out); + sk_out_drop(); + + pthread_exit((void*) (intptr_t) ret); + return NULL; +} +#endif + +static int handle_run_cmd(struct sk_out *sk_out, struct flist_head *job_list, + struct fio_net_cmd *cmd) +{ + int ret; + fio_time_init(); set_genesis_time(); - pid = fork(); - if (pid) { - fio_server_add_job_pid(job_list, pid); - return 0; +#ifdef WIN32 + { + pthread_t thread; + /* both this thread and backend_thread call sk_out_assign() to double increment + * the ref count. 
This ensures struct is valid until both threads are done with it + */ + sk_out_assign(sk_out); + ret = pthread_create(&thread, NULL, fio_backend_thread, sk_out); + if (ret) { + log_err("pthread_create: %s\n", strerror(ret)); + return ret; + } + + fio_server_add_job_pid(job_list, thread); + return ret; } +#else + { + pid_t pid; + sk_out_assign(sk_out); + pid = fork(); + if (pid) { + fio_server_add_job_pid(job_list, pid); + return 0; + } - ret = fio_backend(sk_out); - free_threads_shm(); - sk_out_drop(); - _exit(ret); + ret = fio_backend(sk_out); + free_threads_shm(); + sk_out_drop(); + _exit(ret); + } +#endif } static int handle_job_cmd(struct fio_net_cmd *cmd) @@ -858,7 +1000,7 @@ static int handle_probe_cmd(struct fio_net_cmd *cmd) .os = FIO_OS, .arch = FIO_ARCH, .bpp = sizeof(void *), - .cpus = __cpu_to_le32(cpus_online()), + .cpus = __cpu_to_le32(cpus_configured()), }; dprint(FD_NET, "server: sending probe reply\n"); @@ -941,6 +1083,7 @@ static int handle_update_job_cmd(struct fio_net_cmd *cmd) struct cmd_add_job_pdu *pdu = (struct cmd_add_job_pdu *) cmd->payload; struct thread_data *td; uint32_t tnumber; + int ret; tnumber = le32_to_cpu(pdu->thread_number); @@ -952,8 +1095,9 @@ static int handle_update_job_cmd(struct fio_net_cmd *cmd) } td = tnumber_to_td(tnumber); - convert_thread_options_to_cpu(&td->o, &pdu->top); - send_update_job_reply(cmd->tag, 0); + ret = convert_thread_options_to_cpu(&td->o, &pdu->top, + cmd->pdu_len - offsetof(struct cmd_add_job_pdu, top)); + send_update_job_reply(cmd->tag, ret); return 0; } @@ -1182,7 +1326,7 @@ static int handle_xmits(struct sk_out *sk_out) sk_unlock(sk_out); while (!flist_empty(&list)) { - entry = flist_entry(list.next, struct sk_entry, list); + entry = flist_first_entry(&list, struct sk_entry, list); flist_del(&entry->list); ret += handle_sk_entry(sk_out, entry); } @@ -1238,7 +1382,8 @@ static int handle_connection(struct sk_out *sk_out) if (ret < 0) break; - cmd = fio_net_recv_cmd(sk_out->sk, true); + if 
(pfd.revents & POLLIN) + cmd = fio_net_recv_cmd(sk_out->sk, true); if (!cmd) { ret = -1; break; @@ -1300,6 +1445,73 @@ static int get_my_addr_str(int sk) return 0; } +#ifdef WIN32 +static int handle_connection_process(void) +{ + WSAPROTOCOL_INFO protocol_info; + DWORD bytes_read; + HANDLE hpipe; + int sk; + struct sk_out *sk_out; + int ret; + char *msg = (char *) "connected"; + + log_info("server enter accept loop. ProcessID %d\n", GetCurrentProcessId()); + + hpipe = CreateFile( + fio_server_pipe_name, + GENERIC_READ | GENERIC_WRITE, + 0, NULL, + OPEN_EXISTING, + 0, NULL); + + if (hpipe == INVALID_HANDLE_VALUE) { + log_err("couldnt open pipe %s error %lu\n", + fio_server_pipe_name, GetLastError()); + return -1; + } + + if (!ReadFile(hpipe, &protocol_info, sizeof(protocol_info), &bytes_read, NULL)) { + log_err("couldnt read pi from pipe %s error %lu\n", fio_server_pipe_name, + GetLastError()); + } + + if (use_ipv6) /* use protocol_info to create a duplicate of parents socket */ + sk = WSASocket(AF_INET6, SOCK_STREAM, 0, &protocol_info, 0, 0); + else + sk = WSASocket(AF_INET, SOCK_STREAM, 0, &protocol_info, 0, 0); + + sk_out = scalloc(1, sizeof(*sk_out)); + if (!sk_out) { + CloseHandle(hpipe); + close(sk); + return -1; + } + + sk_out->sk = sk; + sk_out->hProcess = INVALID_HANDLE_VALUE; + INIT_FLIST_HEAD(&sk_out->list); + __fio_sem_init(&sk_out->lock, FIO_SEM_UNLOCKED); + __fio_sem_init(&sk_out->wait, FIO_SEM_LOCKED); + __fio_sem_init(&sk_out->xmit, FIO_SEM_UNLOCKED); + + get_my_addr_str(sk); + + if (!WriteFile(hpipe, msg, strlen(msg), NULL, NULL)) { + log_err("couldnt write pipe\n"); + close(sk); + return -1; + } + CloseHandle(hpipe); + + sk_out_assign(sk_out); + + ret = handle_connection(sk_out); + __sk_out_drop(sk_out); + return ret; +} +#endif + static int accept_loop(int listen_sk) { struct sockaddr_in addr; @@ -1317,8 +1529,11 @@ static int accept_loop(int listen_sk) struct sk_out *sk_out; const char *from; char buf[64]; +#ifdef WIN32 + HANDLE hProcess; +#else 
pid_t pid; - +#endif pfd.fd = listen_sk; pfd.events = POLLIN; do { @@ -1376,6 +1591,13 @@ static int accept_loop(int listen_sk) __fio_sem_init(&sk_out->wait, FIO_SEM_LOCKED); __fio_sem_init(&sk_out->xmit, FIO_SEM_UNLOCKED); +#ifdef WIN32 + hProcess = windows_handle_connection(hjob, sk); + if (hProcess == INVALID_HANDLE_VALUE) + return -1; + sk_out->hProcess = hProcess; + fio_server_add_conn_pid(&conn_list, hProcess); +#else pid = fork(); if (pid) { close(sk); @@ -1392,6 +1614,7 @@ static int accept_loop(int listen_sk) */ sk_out_assign(sk_out); handle_connection(sk_out); +#endif } return exitval; @@ -1465,8 +1688,11 @@ void fio_server_send_ts(struct thread_stat *ts, struct group_run_stats *rs) { struct cmd_ts_pdu p; int i, j, k; - void *ss_buf; - uint64_t *ss_iops, *ss_bw; + size_t clat_prio_stats_extra_size = 0; + size_t ss_extra_size = 0; + size_t extended_buf_size = 0; + void *extended_buf; + void *extended_buf_wp; dprint(FD_NET, "server sending end stats\n"); @@ -1480,9 +1706,12 @@ void fio_server_send_ts(struct thread_stat *ts, struct group_run_stats *rs) p.ts.error = cpu_to_le32(ts->error); p.ts.thread_number = cpu_to_le32(ts->thread_number); p.ts.groupid = cpu_to_le32(ts->groupid); + p.ts.job_start = cpu_to_le64(ts->job_start); p.ts.pid = cpu_to_le32(ts->pid); p.ts.members = cpu_to_le32(ts->members); p.ts.unified_rw_rep = cpu_to_le32(ts->unified_rw_rep); + p.ts.ioprio = cpu_to_le32(ts->ioprio); + p.ts.disable_prio_stat = cpu_to_le32(ts->disable_prio_stat); for (i = 0; i < DDIR_RWDIR_CNT; i++) { convert_io_stat(&p.ts.clat_stat[i], &ts->clat_stat[i]); @@ -1541,7 +1770,6 @@ void fio_server_send_ts(struct thread_stat *ts, struct group_run_stats *rs) p.ts.total_submit = cpu_to_le64(ts->total_submit); p.ts.total_complete = cpu_to_le64(ts->total_complete); - p.ts.nr_zone_resets = cpu_to_le64(ts->nr_zone_resets); for (i = 0; i < DDIR_RWDIR_CNT; i++) { p.ts.io_bytes[i] = cpu_to_le64(ts->io_bytes[i]); @@ -1555,6 +1783,9 @@ void fio_server_send_ts(struct thread_stat 
*ts, struct group_run_stats *rs) p.ts.kb_base = cpu_to_le32(ts->kb_base); p.ts.unit_base = cpu_to_le32(ts->unit_base); + p.ts.nr_zone_resets = cpu_to_le64(ts->nr_zone_resets); + p.ts.count_zone_resets = cpu_to_le16(ts->count_zone_resets); + p.ts.latency_depth = cpu_to_le32(ts->latency_depth); p.ts.latency_target = cpu_to_le64(ts->latency_target); p.ts.latency_window = cpu_to_le64(ts->latency_window); @@ -1577,38 +1808,96 @@ void fio_server_send_ts(struct thread_stat *ts, struct group_run_stats *rs) p.ts.cachehit = cpu_to_le64(ts->cachehit); p.ts.cachemiss = cpu_to_le64(ts->cachemiss); + convert_gs(&p.rs, rs); + for (i = 0; i < DDIR_RWDIR_CNT; i++) { - for (j = 0; j < FIO_IO_U_PLAT_NR; j++) { - p.ts.io_u_plat_high_prio[i][j] = cpu_to_le64(ts->io_u_plat_high_prio[i][j]); - p.ts.io_u_plat_low_prio[i][j] = cpu_to_le64(ts->io_u_plat_low_prio[i][j]); + if (ts->nr_clat_prio[i]) + clat_prio_stats_extra_size += ts->nr_clat_prio[i] * sizeof(*ts->clat_prio[i]); + } + extended_buf_size += clat_prio_stats_extra_size; + + dprint(FD_NET, "ts->ss_state = %d\n", ts->ss_state); + if (ts->ss_state & FIO_SS_DATA) + ss_extra_size = 3 * ts->ss_dur * sizeof(uint64_t); + + extended_buf_size += ss_extra_size; + if (!extended_buf_size) { + fio_net_queue_cmd(FIO_NET_CMD_TS, &p, sizeof(p), NULL, SK_F_COPY); + return; + } + + extended_buf_size += sizeof(p); + extended_buf = calloc(1, extended_buf_size); + if (!extended_buf) { + log_err("fio: failed to allocate FIO_NET_CMD_TS buffer\n"); + return; + } + + memcpy(extended_buf, &p, sizeof(p)); + extended_buf_wp = (struct cmd_ts_pdu *)extended_buf + 1; + + if (clat_prio_stats_extra_size) { + for (i = 0; i < DDIR_RWDIR_CNT; i++) { + struct clat_prio_stat *prio = (struct clat_prio_stat *) extended_buf_wp; + + for (j = 0; j < ts->nr_clat_prio[i]; j++) { + for (k = 0; k < FIO_IO_U_PLAT_NR; k++) + prio->io_u_plat[k] = + cpu_to_le64(ts->clat_prio[i][j].io_u_plat[k]); + convert_io_stat(&prio->clat_stat, + &ts->clat_prio[i][j].clat_stat); + prio->ioprio = 
cpu_to_le32(ts->clat_prio[i][j].ioprio); + prio++; + } + + if (ts->nr_clat_prio[i]) { + uint64_t offset = (char *)extended_buf_wp - (char *)extended_buf; + struct cmd_ts_pdu *ptr = extended_buf; + + ptr->ts.clat_prio_offset[i] = cpu_to_le64(offset); + ptr->ts.nr_clat_prio[i] = cpu_to_le32(ts->nr_clat_prio[i]); + } + + extended_buf_wp = prio; } - convert_io_stat(&p.ts.clat_high_prio_stat[i], &ts->clat_high_prio_stat[i]); - convert_io_stat(&p.ts.clat_low_prio_stat[i], &ts->clat_low_prio_stat[i]); } - convert_gs(&p.rs, rs); + if (ss_extra_size) { + uint64_t *ss_iops, *ss_bw, *ss_lat; + uint64_t offset; + struct cmd_ts_pdu *ptr = extended_buf; - dprint(FD_NET, "ts->ss_state = %d\n", ts->ss_state); - if (ts->ss_state & FIO_SS_DATA) { dprint(FD_NET, "server sending steadystate ring buffers\n"); - ss_buf = malloc(sizeof(p) + 2*ts->ss_dur*sizeof(uint64_t)); + /* ss iops */ + ss_iops = (uint64_t *) extended_buf_wp; + for (i = 0; i < ts->ss_dur; i++) + ss_iops[i] = cpu_to_le64(ts->ss_iops_data[i]); - memcpy(ss_buf, &p, sizeof(p)); + offset = (char *)extended_buf_wp - (char *)extended_buf; + ptr->ts.ss_iops_data_offset = cpu_to_le64(offset); + extended_buf_wp = ss_iops + (int) ts->ss_dur; - ss_iops = (uint64_t *) ((struct cmd_ts_pdu *)ss_buf + 1); - ss_bw = ss_iops + (int) ts->ss_dur; - for (i = 0; i < ts->ss_dur; i++) { - ss_iops[i] = cpu_to_le64(ts->ss_iops_data[i]); + /* ss bw */ + ss_bw = extended_buf_wp; + for (i = 0; i < ts->ss_dur; i++) ss_bw[i] = cpu_to_le64(ts->ss_bw_data[i]); - } - fio_net_queue_cmd(FIO_NET_CMD_TS, ss_buf, sizeof(p) + 2*ts->ss_dur*sizeof(uint64_t), NULL, SK_F_COPY); + offset = (char *)extended_buf_wp - (char *)extended_buf; + ptr->ts.ss_bw_data_offset = cpu_to_le64(offset); + extended_buf_wp = ss_bw + (int) ts->ss_dur; + + /* ss lat */ + ss_lat = extended_buf_wp; + for (i = 0; i < ts->ss_dur; i++) + ss_lat[i] = cpu_to_le64(ts->ss_lat_data[i]); - free(ss_buf); + offset = (char *)extended_buf_wp - (char *)extended_buf; + ptr->ts.ss_lat_data_offset = 
cpu_to_le64(offset); } - else - fio_net_queue_cmd(FIO_NET_CMD_TS, &p, sizeof(p), NULL, SK_F_COPY); + + fio_net_queue_cmd(FIO_NET_CMD_TS, extended_buf, extended_buf_size, NULL, SK_F_COPY); + free(extended_buf); } void fio_server_send_gs(struct group_run_stats *rs) @@ -1981,6 +2270,7 @@ int fio_send_iolog(struct thread_data *td, struct io_log *log, const char *name) .thread_number = cpu_to_le32(td->thread_number), .log_type = cpu_to_le32(log->log_type), .log_hist_coarseness = cpu_to_le32(log->hist_coarseness), + .per_job_logs = cpu_to_le32(td->o.per_job_logs), }; struct sk_entry *first; struct flist_head *entry; @@ -2009,15 +2299,20 @@ int fio_send_iolog(struct thread_data *td, struct io_log *log, const char *name) struct io_sample *s = get_sample(log, cur_log, i); s->time = cpu_to_le64(s->time); - s->data.val = cpu_to_le64(s->data.val); + if (log->log_type != IO_LOG_TYPE_HIST) { + s->data.val.val0 = cpu_to_le64(s->data.val.val0); + s->data.val.val1 = cpu_to_le64(s->data.val.val1); + } s->__ddir = __cpu_to_le32(s->__ddir); s->bs = cpu_to_le64(s->bs); - if (log->log_offset) { - struct io_sample_offset *so = (void *) s; + if (log->log_offset) + s->aux[IOS_AUX_OFFSET_INDEX] = + cpu_to_le64(s->aux[IOS_AUX_OFFSET_INDEX]); - so->offset = cpu_to_le64(so->offset); - } + if (log->log_issue_time) + s->aux[IOS_AUX_ISSUE_TIME_INDEX] = + cpu_to_le64(s->aux[IOS_AUX_ISSUE_TIME_INDEX]); } } @@ -2047,22 +2342,29 @@ int fio_send_iolog(struct thread_data *td, struct io_log *log, const char *name) void fio_server_send_add_job(struct thread_data *td) { - struct cmd_add_job_pdu pdu = { - .thread_number = cpu_to_le32(td->thread_number), - .groupid = cpu_to_le32(td->groupid), - }; + struct cmd_add_job_pdu *pdu; + size_t cmd_sz = offsetof(struct cmd_add_job_pdu, top) + + thread_options_pack_size(&td->o); - convert_thread_options_to_net(&pdu.top, &td->o); + pdu = malloc(cmd_sz); + pdu->thread_number = cpu_to_le32(td->thread_number); + pdu->groupid = cpu_to_le32(td->groupid); - 
fio_net_queue_cmd(FIO_NET_CMD_ADD_JOB, &pdu, sizeof(pdu), NULL, - SK_F_COPY); + convert_thread_options_to_net(&pdu->top, &td->o); + + fio_net_queue_cmd(FIO_NET_CMD_ADD_JOB, pdu, cmd_sz, NULL, SK_F_COPY); + free(pdu); } void fio_server_send_start(struct thread_data *td) { struct sk_out *sk_out = pthread_getspecific(sk_out_key); - assert(sk_out->sk != -1); + /* + * pthread_getspecific() yields NULL when no value is bound to the key, + * so test the pointer itself before reading ->sk; otherwise the + * diagnostic below could never be printed for that failure. + */ + if (!sk_out || sk_out->sk == -1) { + log_err("pthread getting specific for key failed, sk_out %p, sk %i, err: %i:%s\n", + sk_out, sk_out ? sk_out->sk : -1, errno, strerror(errno)); + abort(); + } fio_net_queue_cmd(FIO_NET_CMD_SERVER_START, NULL, 0, NULL, SK_F_SIMPLE); } @@ -2489,12 +2791,25 @@ static int fio_server(void) if (fio_handle_server_arg()) return -1; + set_sig_handlers(); + +#ifdef WIN32 + /* if this is a child process, go handle the connection */ + if (fio_server_pipe_name != NULL) { + ret = handle_connection_process(); + return ret; + } + + /* job to link child processes so they terminate together */ + hjob = windows_create_job(); + if (hjob == INVALID_HANDLE_VALUE) + return -1; +#endif + sk = fio_init_server_connection(); if (sk < 0) return -1; - set_sig_handlers(); - ret = accept_loop(sk); close(sk); @@ -2635,3 +2950,10 @@ void fio_server_set_arg(const char *arg) { fio_server_arg = strdup(arg); } + +#ifdef WIN32 +void fio_server_internal_set(const char *arg) +{ + fio_server_pipe_name = strdup(arg); +} +#endif diff --git a/server.h b/server.h index 25b6bbdc25..e0a921b84d 100644 --- a/server.h +++ b/server.h @@ -15,6 +15,9 @@ struct sk_out { unsigned int refs; /* frees sk_out when it drops to zero. 
* protected by below ->lock */ +#ifdef WIN32 + HANDLE hProcess; /* process handle of handle_connection_process*/ +#endif int sk; /* socket fd to talk to client */ struct fio_sem lock; /* protects ref and below list */ struct flist_head list; /* list of pending transmit work */ @@ -48,7 +51,7 @@ struct fio_net_cmd_reply { }; enum { - FIO_SERVER_VER = 95, + FIO_SERVER_VER = 118, FIO_SERVER_MAX_FRAGMENT_PDU = 1024, FIO_SERVER_MAX_CMD_MB = 2048, @@ -194,7 +197,9 @@ struct cmd_iolog_pdu { uint32_t compressed; uint32_t log_offset; uint32_t log_prio; + uint32_t log_issue_time; uint32_t log_hist_coarseness; + uint32_t per_job_logs; uint8_t name[FIO_NET_NAME_MAX]; struct io_sample samples[0]; }; @@ -212,6 +217,7 @@ extern int fio_server_text_output(int, const char *, size_t); extern int fio_net_send_cmd(int, uint16_t, const void *, off_t, uint64_t *, struct flist_head *); extern int fio_net_send_simple_cmd(int, uint16_t, uint64_t, struct flist_head *); extern void fio_server_set_arg(const char *); +extern void fio_server_internal_set(const char *); extern int fio_server_parse_string(const char *, char **, bool *, int *, struct in_addr *, struct in6_addr *, int *); extern int fio_server_parse_host(const char *, int, struct in_addr *, struct in6_addr *); extern const char *fio_server_op(unsigned int); @@ -222,6 +228,7 @@ extern void fio_server_send_gs(struct group_run_stats *); extern void fio_server_send_du(void); extern void fio_server_send_job_options(struct flist_head *, unsigned int); extern int fio_server_get_verify_state(const char *, int, void **); +extern bool fio_server_poll_fd(int fd, short events, int timeout); extern struct fio_net_cmd *fio_net_recv_cmd(int sk, bool wait); diff --git a/smalloc.c b/smalloc.c index fa00f0ee33..ac7ef70168 100644 --- a/smalloc.c +++ b/smalloc.c @@ -283,13 +283,13 @@ static void sfree_check_redzone(struct block_hdr *hdr) if (hdr->prered != SMALLOC_PRE_RED) { log_err("smalloc pre redzone destroyed!\n" " ptr=%p, prered=%x, expected 
%x\n", - hdr, hdr->prered, SMALLOC_PRE_RED); + hdr+1, hdr->prered, SMALLOC_PRE_RED); assert(0); } if (*postred != SMALLOC_POST_RED) { log_err("smalloc post redzone destroyed!\n" " ptr=%p, postred=%x, expected %x\n", - hdr, *postred, SMALLOC_POST_RED); + hdr+1, *postred, SMALLOC_POST_RED); assert(0); } } @@ -566,6 +566,10 @@ void *smalloc(size_t size) void *scalloc(size_t nmemb, size_t size) { + /* + * smalloc_pool (called by smalloc) will zero the memory, so we don't + * need to do it here. + */ return smalloc(nmemb * size); } diff --git a/sprandom.c b/sprandom.c new file mode 100644 index 0000000000..429a775400 --- /dev/null +++ b/sprandom.c @@ -0,0 +1,887 @@ +/** + * SPDX-License-Identifier: GPL-2.0 only + * + * Copyright (c) 2025 Sandisk Corporation or its affiliates. + */ +#include +#include +#include +#include "lib/pow2.h" +#include "fio.h" +#include "file.h" +#include "sprandom.h" + +/* + * Model for Estimating Steady-State Data Distribution in SSDs + * + * This model estimates the distribution of valid data across a flash drive + * in a steady state. It is based on the key insight from Desnoyers' research, + * which establishes a relationship between data validity and the physical + * space it occupies. + * + * P. Desnoyers, "Analytic Models of SSD Write Performance," + * ACM Transactions on Storage, + * vol. 8, no. 2, pp. 1–18, Jun. 2012, doi: 10.1145/2133360.2133364. + * + * The Core Principle + * ================== + * + * The fundamental concept is that for a drive in a steady state, the product + * of a block's validity and the fraction of drive space occupied by such + * blocks is constant. + * + * Key Equation (1): i * f(i) = k + * + * Where: + * - i: The number of valid pages in a block. + * - f(i): The fraction of the drive composed of blocks with 'i' valid pages. + * - k: A constant for the drive. + * + * This implies that for any two validity levels i and j: i * f(i) = j * f(j). 
+ * In other words, regions with lower validity (more invalid data) must + * occupy proportionally more physical space than regions with high validity. + * + * + * Modeling Steps + * ============== + * The model is built by following these steps: + * + * 1. Normalize Validity & Relate to Write Amplification (WA) + * We normalize 'i' into a validity fraction: + * + * valid_frac(i) = i / num_pages_per_region + * + * A greedy garbage collection (GC) algorithm reclaims the block with the + * lowest validity. The validity of this GC block (`valid_frac_gc`) is + * determined by the drive's WA: + * + * valid_frac_gc = 1 - (1 / WA) + * + * 2. Determine Write Amplification (WA) from Over-Provisioning (OP) + * The WA can be calculated from the drive's OP. A simple approximation + * is often sufficient for most cases: + * + * WA ≈ 0.5 / OP + 0.7 + * + * Note: The precise formula from Desnoyers uses + * alpha = T/U (T: total physical capacity, U: user-visible capacity) + * where + * OP = alpha - 1 + * + * in the equation: + * alpha + * WA = ------------------------------ + * (alpha + W(-alpha*e^(-alpha))) + * + * with W being the Lambert W function. + * + * 3. Define the Distribution Curve + * + * Using the steady-state principle, we can find the relative size f(i) of a + * region given its validity (`valid_frac(i)`) by comparing it to the GC block. + * + * valid_frac(i) * f(i) = valid_frac_gc * f_gc + * + * By defining the base size f_gc = 1, we get a simple relationship: + * + * f(i) = valid_frac_gc / valid_frac(i) + * + * This formula defines a curve where points are spaced equally by validity. + * + * 4. Resample for Equal-Sized Regions + * + * The final step is to make the model practical. We take the curve defined + * above and resample it to get points that are equally spaced by region + * size f(i). This resampling gives the expected validity for each + * equal-sized region of the drive, completing the model. 
+ */ + +#define PCT_PRECISION 10000 + +static inline double *d_alloc(size_t n) +{ + return calloc(n, sizeof(double)); +} + +struct point { + double x; + double y; +}; + +static inline struct point *p_alloc(size_t n) +{ + return calloc(n, sizeof(struct point)); +} + +static void print_d_array(const char *hdr, double *darray, size_t len) +{ + struct buf_output out; + int i; + + buf_output_init(&out); + + __log_buf(&out, "["); + for (i = 0; i < len - 1; i++) + __log_buf(&out, "%.2f, ", darray[i]); + + __log_buf(&out, "%.2f]\n", darray[len - 1]); + if (hdr) + dprint(FD_SPRANDOM, "%s: ", hdr); + + dprint(FD_SPRANDOM, "%s", out.buf); + buf_output_free(&out); +} + +static void print_d_points(struct point *parray, size_t len) +{ + struct buf_output out; + unsigned int i; + + buf_output_init(&out); + + __log_buf(&out, "["); + for (i = 0; i < len - 1; i++) + __log_buf(&out, "(%.2f %.2f), ", parray[i].x, parray[i].y); + + __log_buf(&out, "(%.2f %.2f)]\n", parray[len - 1].x, parray[len - 1].y); + dprint(FD_SPRANDOM, "%s", out.buf); + buf_output_free(&out); +} + +/* Comparison function for qsort to sort points by x-value */ +static int compare_points(const void *a, const void *b) +{ + /* Cast void pointers to struct point pointers */ + const struct point *point_a = (const struct point *)a; + const struct point *point_b = (const struct point *)b; + + if (point_a->x < point_b->x) + return -1; + + if (point_a->x > point_b->x) + return 1; + + return 0; +} + +/** + * reverse - Reverses the elements of a double array in place. + * @arr: pointer to the array of doubles to be reversed. + * @size: number of elements in the array. + */ +static void reverse(double arr[], size_t size) +{ + size_t left = 0; + size_t right = size - 1; + + if (size <= 1) + return; + + while (left < right) { + double temp = arr[left]; + arr[left] = arr[right]; + arr[right] = temp; + left++; + right--; + } +} + +/** + * linspace - Generates a linearly spaced array of doubles. 
+ * @start: The starting value of the sequence. + * @end: The ending value of the sequence. + * @num: The number of elements to generate. + * + * Allocates and returns an array of @num doubles, linearly spaced + * between @start and @end (inclusive). If @num is 0, returns NULL. + * If @num is 1, the array contains only @start. + * + * Return: allocated array, or NULL on allocation failure or if @num is 0. + */ +static double *linspace(double start, double end, unsigned int num) +{ + double *arr; + unsigned int i; + double step; + + if (num == 0) + return NULL; + + dprint(FD_SPRANDOM, "linespace start=%0.2f end=%0.2f num=%d\n", + start, end, num); + + arr = d_alloc(num); + if (arr == NULL) + return NULL; + + if (num == 1) { + arr[0] = start; + return arr; + } + + /* Calculate step size */ + step = (end - start) / ((double)num - 1.0); + + for (i = 0; i < num; i++) + arr[i] = start + (double)i * step; + + return arr; +} + +/** + * linear_interp - Performs linear interpolation or extrapolation. + * @new_x: The x-value at which to interpolate. + * @x_arr: Array of x-values (must be sorted in strictly increasing order). + * @y_arr: Array of y-values corresponding to x_arr. + * @num: Number of points in x_arr and y_arr. + * + * Returns the interpolated y-value at new_x using linear interpolation + * between the points in x_arr and y_arr. If new_x is outside the range + * of x_arr, returns the nearest endpoint's y-value (extrapolation). + * Handles edge cases for zero or one point, and avoids division by zero + * if two x-values are nearly identical. 
+ */ +static double linear_interp(double new_x, const double *x_arr, + const double *y_arr, unsigned int num) +{ + unsigned int i; + double x1, y1, x2, y2; + + if (num == 0) + return 0.0; + + if (num == 1) + return y_arr[0]; /* If only one point, return its y-value */ + + /* Handle extrapolation outside the range */ + if (new_x <= x_arr[0]) + return y_arr[0]; + + if (new_x >= x_arr[num - 1]) + return y_arr[num - 1]; + + /* Find the interval [x_arr[i], x_arr[i + 1]] that contains new_x */ + for (i = 0; i < num - 1; i++) { + if (new_x >= x_arr[i] && new_x <= x_arr[i + 1]) { + x1 = x_arr[i]; + y1 = y_arr[i]; + x2 = x_arr[i + 1]; + y2 = y_arr[i + 1]; + + /* Avoid division by zero if x values are identical + * Using a small epsilon for float comparison + * Return y1 if x1 and x2 are almost identical + */ + if (fabs(x2 - x1) < 1e-9) + return y1; + + return y1 + (y2 - y1) * ((new_x - x1) / (x2 - x1)); + } + } + /* Should not reach here if new_x is within bounds + * and x_arr is strictly increasing + */ + return 0.0; +} + +/** + * sample_curve_equally_on_x - Resamples a curve at equally spaced x-values. + * @points: array of input points (must have strictly increasing x-values). + * @num: Number of input points. + * @num_resampled: number of points to resample to. + * @resampled_points: An output array of resampled points. + * + * Sorts the input points by x-value, checks for strictly increasing x-values, + * and generates a new set of points with x-values equally spaced between the + * minimum and maximum x of the input. Uses linear interpolation to compute + * corresponding y-values. + * Note: The function allocates memory for the output array. + * + * Return: 0 on success, negative error code on failure. 
+ */ +static int sample_curve_equally_on_x(struct point *points, unsigned int num, + unsigned int num_resampled, + struct point **resampled_points) +{ + double *x_orig = (double *)0; + double *y_orig = (double *)0; + double *new_x_arr = (double *)0; + struct point *new_points_arr = (struct point *)0; + unsigned int i; + int ret = 0; + + if (points == NULL || resampled_points == NULL) + return -EINVAL; + + if (num == 0) { + log_err("fio: original points array cannot be empty.\n"); + return -EINVAL; + } + + if (num_resampled == 0) { + *resampled_points = NULL; + return 0; + } + + qsort(points, num, sizeof(struct point), compare_points); + + /* Check if x-values are strictly increasing and sort them */ + for (i = 0; i < num - 1; i++) { + if (points[i+1].x <= points[i].x) { + log_err("fio: x-values must be strictly increasing.\n"); + ret = -EINVAL; + goto cleanup; + } + } + + /* 2. Extract x and y into separate arrays for interpolation */ + x_orig = d_alloc(num); + y_orig = d_alloc(num); + if (x_orig == NULL || y_orig == NULL) { + log_err("fio: Memory allocation failed for x_orig or y_orig.\n"); + ret = -ENOMEM; + goto cleanup; + } + for (i = 0; i < num; i++) { + x_orig[i] = points[i].x; + y_orig[i] = points[i].y; + } + + /* 4. Generate new_x values using linspace */ + new_x_arr = linspace(x_orig[0], x_orig[num - 1], num_resampled); + if (new_x_arr == NULL) { + ret = -ENOMEM; + goto cleanup; + } + + /* 5. Allocate memory for new resampled points */ + new_points_arr = p_alloc(num_resampled); + if (new_points_arr == NULL) { + log_err("fio: Memory allocation failed for new_points_arr.\n"); + ret = -ENOMEM; + goto cleanup; + } + + /* 6. 
Perform linear interpolation for each new_x to get new_y */ + for (i = 0; i < num_resampled; i++) { + new_points_arr[i].x = new_x_arr[i]; + new_points_arr[i].y = linear_interp(new_x_arr[i], x_orig, y_orig, num); + } + + *resampled_points = new_points_arr; + +cleanup: + free(x_orig); + free(y_orig); + free(new_x_arr); + + return ret; +} + +/** + * compute_waf - Compute the write amplification factor (WAF) + * @over_provisioning: The over-provisioning ratio (0 < over_provisioning < 1) + * + * write amplification approximation equation + * + * 0.5 + * WAF = ------------------ + 0.7 + * over_provisioning + * + * Return: The computed write amplification factor as a double. + */ +static inline double compute_waf(double over_provisioning) +{ + return 0.5 / over_provisioning + 0.7; +} + +/** + * compute_gc_validity - validity of the block selected for GC (garbage collector) + * + * @waf: The Write Amplification Factor, must be greater than 1.0. + * + * Return: The computed gavalidity; + */ +static inline double compute_gc_validity(double waf) +{ + assert(waf > 1.0); /* Ensure WAF is greater than 1.0 */ + return 1.0 - (double)1.0 / waf; +} + +/** + * compute_validity_dist - Computes a resampled validity distribution for regions. + * @n_regions: Number of regions to divide the distribution into. + * @over_provisioning: Over-provisioning factor used to calculate WAF and validity. + * + * Calculates the validity distribution across a specified number of regions, + * based on the write amplification factor (WAF) and over-provisioning. + * Steps: + * - Allocates and fills arrays for: + * - validity distribution + * - block ratios + * - accumulated ratios + * - Constructs a set of points representing the curve. + * - Resamples the curve to ensure equal spacing along the x-axis. + * - Reverses the resulting validity distribution before returning. + * + * Note: The function allocates memory for the validity distribution array. 
+ * + * Return: resampled and reversed validity distribution array or NULL on error. + */ +static double *compute_validity_dist(unsigned int n_regions, double over_provisioning) +{ + double waf = compute_waf(over_provisioning); + double validity = compute_gc_validity(waf); + double *validity_distribution = NULL; + double *blocks_ratio = NULL; + double *acc_ratio = NULL; + double acc; + unsigned int i; + struct point *points = NULL; + struct point *points_resampled = NULL; + int ret; + + if (n_regions == 0) { + log_err("fio: requires at least one region"); + goto out; + } + + /* + * Use linspace to get equally distributed validity values, + * along the y-axis of the curve we want to generate. + */ + validity_distribution = linspace(1.0, validity, n_regions); + + blocks_ratio = d_alloc(n_regions); + if (blocks_ratio == NULL) { + log_err("fio: memory allocation failed for linspace.\n"); + goto out; + } + + for (i = 0; i < n_regions; i++) + blocks_ratio[i] = 1.0 / validity_distribution[i]; + + acc_ratio = d_alloc(n_regions); + if (acc_ratio == NULL) { + log_err("fio: memory allocation failed for linspace_c.\n"); + goto out; + } + + acc = 0.0; + for (i = 0; i < n_regions; i++) { + acc_ratio[i] = acc + blocks_ratio[i]; + acc = acc_ratio[i]; + } + + print_d_array("validity_distribution", validity_distribution, n_regions); + print_d_array("blocks ratio", blocks_ratio, n_regions); + print_d_array("accumulated ratio:", acc_ratio, n_regions); + + points = p_alloc(n_regions); + + for (i = 0; i < n_regions; i++) { + points[i].x = acc_ratio[i]; + points[i].y = validity_distribution[i]; + } + print_d_points(points, n_regions); + + /* + * Use linspace again to get uniformly distributed x-values, + * and then interpolate the curve to find the validity at those + * uniformly distributed x-values. 
+ */ + ret = sample_curve_equally_on_x(points, n_regions, n_regions, + &points_resampled); + + if (ret == 0) { + print_d_points(points_resampled, n_regions); + } else { + log_err("fio: failed to resample curve. Error code: %d\n", ret); + free(validity_distribution); + validity_distribution = NULL; + goto out; + } + + for (i = 0; i < n_regions; i++) + validity_distribution[i] = points_resampled[i].y; + + print_d_array("validity resampled", validity_distribution, n_regions); + +out: + free(points); + free(points_resampled); + free(blocks_ratio); + free(acc_ratio); + + reverse(validity_distribution, n_regions); + + return validity_distribution; +} + +/** + * Calculate the physical size based on logical size and over-provisioning + * + * @over_provisioning: over provisioning factor (e.g. 0.2 for 20%) + * @logical_sz: Logical size in bytes + * @align_bs: Block size for alignment in bytes + * + * return: Physical size in bytes, including over-provisioning and aligned to align_bs + */ +static uint64_t sprandom_physical_size(double over_provisioning, uint64_t logical_sz, + uint64_t align_bs) +{ + uint64_t size; + + size = logical_sz + ceil((double)logical_sz * over_provisioning); + return (size + (align_bs - 1)) & ~(align_bs - 1); +} + +/** + * estimate_inv_capacity - Estimates the invalid capacity of a region. + * @region_cnt: number of offsets in the region. + * @validity: invalidation ration in the regions (between 0 and 1). + * + * Calculates the expected number of invalidion in regions, adding a margin + * of 6 standard deviations to account for statistical variation. + * + * Returns: Estimated invalid capacity + */ +static uint64_t estimate_inv_capacity(uint64_t region_cnt, double validity) +{ + double sigma = sqrt((double)region_cnt * validity * (1.0 - validity)); + return (uint64_t)ceil(region_cnt * (1.0 - validity) + 6.0 * sigma); +} + +/** + * sprandom_setup - Initialize and configure sprandom_info structure. 
+ * @spr_info: Pointer to sprandom_info structure to be initialized. + * @logical_size: Logical size of the storage region. + * @align_bs: Alignment block size. + * + * Calculates physical size and region parameters based on logical size, + * alignment, and over-provisioning. Allocates and initializes validity + * distribution and invalid percentage arrays for regions. Precomputes + * invalid buffer capacity and allocates buffer. Sets up region size, + * write counts, and resets region/phase counters. + * + * Returns 0 on success, enagative value on failure. + */ +static int sprandom_setup(struct sprandom_info *spr_info, uint64_t logical_size, + uint64_t align_bs) +{ + double over_provisioning = spr_info->over_provisioning; + int ret = 0; + uint64_t physical_size; + uint64_t region_sz; + uint64_t region_write_count; + double *validity_dist; + size_t invalid_capacity; + size_t total_alloc = 0; + char bytes2str_buf[40]; + int i; + + physical_size = sprandom_physical_size(over_provisioning, + logical_size, align_bs); + + validity_dist = compute_validity_dist(spr_info->num_regions, + spr_info->over_provisioning); + if (!validity_dist) { + ret = -ENOMEM; + goto err; + } + + /* Initialize validity_distribution */ + print_d_array("validity resampled:", validity_dist, spr_info->num_regions); + + /* Precompute invalidity percentage array */ + spr_info->invalid_pct = calloc(spr_info->num_regions, + sizeof(spr_info->invalid_pct[0])); + if (!spr_info->invalid_pct) { + ret = -ENOMEM; + goto err; + } + + total_alloc += spr_info->num_regions * sizeof(spr_info->invalid_pct[0]); + + for (i = 0; i < spr_info->num_regions; i++) { + double inv = (1.0 - validity_dist[i]) * (double)PCT_PRECISION; + spr_info->invalid_pct[i] = (int)round(inv); + } + + region_sz = physical_size / spr_info->num_regions; + region_write_count = region_sz / align_bs; + + if ((spr_info->cache_sz) && (spr_info->cache_sz > region_sz)) { + log_err("fio: sprandom: spr_cs [%"PRIu64"] must be smaller than" + " 
region_sz [%"PRIu64"] which means [%"PRIu64"] regions" + " allowed", spr_info->cache_sz, region_sz, + (physical_size / spr_info->cache_sz)); + ret = -EINVAL; + goto err; + } + + if (spr_info->cache_sz) { + /* Need 2x size to be safe since we wait to invalidate until after next region */ + invalid_capacity = estimate_inv_capacity(region_write_count, + validity_dist[0]) * 2; + } else { + invalid_capacity = estimate_inv_capacity(region_write_count, + validity_dist[0]); + } + + spr_info->invalid_capacity = invalid_capacity; + + spr_info->invalid_buf = pcb_alloc(invalid_capacity); + + total_alloc += invalid_capacity * sizeof(uint64_t); + + spr_info->region_sz = region_sz; + spr_info->invalid_count[0] = 0; + spr_info->invalid_count[1] = 0; + spr_info->curr_phase = 0; + spr_info->current_region = 0; + spr_info->region_write_count = region_write_count; + spr_info->writes_remaining = region_write_count; + + /* Display overall allocation */ + dprint(FD_SPRANDOM, "Summary:\n"); + dprint(FD_SPRANDOM, " logical_size: %"PRIu64": %s\n", + logical_size, + bytes2str_simple(bytes2str_buf, sizeof(bytes2str_buf), logical_size)); + dprint(FD_SPRANDOM, " physical_size: %"PRIu64": %s\n", + physical_size, + bytes2str_simple(bytes2str_buf, sizeof(bytes2str_buf), physical_size)); + dprint(FD_SPRANDOM, " op: %02f\n", spr_info->over_provisioning); + dprint(FD_SPRANDOM, " region_size: %"PRIu64"\n", region_sz); + dprint(FD_SPRANDOM, " num_regions: %u\n", spr_info->num_regions); + dprint(FD_SPRANDOM, " cache_size: %"PRIu64": %s\n", + spr_info->cache_sz, + bytes2str_simple(bytes2str_buf, sizeof(bytes2str_buf), + spr_info->cache_sz)); + dprint(FD_SPRANDOM, " region_write_count: %"PRIu64"\n", region_write_count); + dprint(FD_SPRANDOM, " invalid_capacity: %zu\n", invalid_capacity); + dprint(FD_SPRANDOM, " dynamic memory: %zu: %s\n", + total_alloc, + bytes2str_simple(bytes2str_buf, sizeof(bytes2str_buf), total_alloc)); + + free(validity_dist); + return 0; +err: + free(validity_dist); + 
free(spr_info->invalid_pct); + return ret; +} + +/** + * sprandom_add_with_probability - Adds an offset to the invalid buffer with + * a probability. + * + * @info: sprandom_info structure containing random state and buffers. + * @offset: The offset value to potentially add to the invalid buffer. + * @phase: The current phase index for invalid count tracking. + * + * Generates a random value and, based on the current region's invalid percentage, + * decides whether to add the offset to the invalid buffer. + * If the buffer is full, ogs an error and asserts failure. + */ +static void sprandom_add_with_probability(struct sprandom_info *info, + uint64_t offset, unsigned int phase) +{ + + int v = rand_between(info->rand_state, 0, PCT_PRECISION); + + if (v <= info->invalid_pct[info->current_region]) { + if (pcb_space_available(info->invalid_buf)) { + pcb_push_staged(info->invalid_buf, offset); + info->invalid_count[phase]++; + } else { + dprint(FD_SPRANDOM, "pcb buffer would be overriten\n"); + assert(false); + } + } +} + +static void dprint_invalidation(const struct sprandom_info *info) +{ + uint32_t phase = info->curr_phase; + double inv = 0; + double inv_act; /* actually invalidation percentage */ + + inv_act = (double)info->invalid_count[phase] / (double)info->region_write_count; + if (info->current_region > 0) + inv = (double)info->invalid_pct[info->current_region - 1] / PCT_PRECISION; + + dprint(FD_SPRANDOM, "Invalidation[%d] %"PRIu64" %zu %.04f %.04f\n", + info->current_region, + info->region_write_count, + info->invalid_count[phase], + inv, inv_act); +} + +/** + * sprandom_get_next_offset - Generate the next write offset for a region, + * managing invalidation, and region transitions. + * + * @info: sprandom_info structure containing state and configuration. + * @f: fio file associated with the ssd device. + * @b: block offset to store the next write offset. 
+ * + * Generates offsets to write a region and saves a fraction of the offsets + * in a two phase circular buffer. + * When transitioning to the next region (phase is flipped),it first writes + * all saved offsets to achieve the desired fraction of invalid blocks in the + * previous region. The remainder of the current region is then filled with + * new offsets. + * + * Returns: + * 0 if a valid offset is found and stored in @b, + * 1 if no more offsets are available (end of regions or LFSR exhausted). + */ +int sprandom_get_next_offset(struct sprandom_info *info, struct fio_file *f, uint64_t *b) +{ + uint64_t offset = 0; + uint32_t phase = info->curr_phase; + + if (!info->cache_sz) { + /* replay invalidation at start of next region prior to moving + * to new region. + */ + if (pcb_pop(info->invalid_buf, &offset)) { + sprandom_add_with_probability(info, offset, phase ^ 1); + dprint(FD_SPRANDOM, "Write %"PRIu64" over %d\n", + offset, info->current_region); + goto out; + } + } + + /* Move to next region */ + if (info->writes_remaining == 0) { + if (info->cache_sz) { + /* replay invalidation for previous region at end of this + * region to avoid invalidations hitting the defined cache. 
+ */ + if (pcb_pop(info->invalid_buf, &offset)) { + sprandom_add_with_probability(info, offset, phase ^ 1); + dprint(FD_SPRANDOM, "Cache Defer Write %"PRIu64" " + " over %d\n", offset, info->current_region); + goto out; + } + } + + if (info->current_region >= info->num_regions) { + dprint(FD_SPRANDOM, "End: Last Region %d cur%d\n", + info->current_region, info->num_regions); + return 1; + } + + dprint_invalidation(info); + + info->invalid_count[phase] = 0; + + info->current_region++; + phase ^= 1; + info->writes_remaining = info->region_write_count - + info->invalid_count[phase]; + info->curr_phase = phase; + pcb_commit(info->invalid_buf); + } + + /* Fetch new offset */ + if (lfsr_next(&f->lfsr, &offset)) { + if (info->cache_sz) { + /* Since we defer invalidation to the end of next region we + * need to take into account end of lfsr case + */ + if (pcb_pop(info->invalid_buf, &offset)) { + dprint(FD_SPRANDOM, "lfsr cache exit Write %"PRIu64" " + " over %d\n", offset, info->current_region); + goto out; + } + } + + dprint(FD_SPRANDOM, "End: LFSR exhausted %d [%zu] [%zu]\n", + info->current_region, + info->invalid_count[phase], + info->invalid_count[phase ^ 1]); + + dprint_invalidation(info); + + return 1; + } + + if (info->writes_remaining > 0) + info->writes_remaining--; + + sprandom_add_with_probability(info, offset, phase ^ 1); + dprint(FD_SPRANDOM, "Write %"PRIu64" lfsr %d\n", offset, info->current_region); +out: + *b = offset; + return 0; +} + +/** + * sprandom_init - initialize sprandom info + * @td: fio thread data + * @f: fio file associated with the ssd device. + * + * Sets up the sprandom_info structure for the given file according: + * region count, over-provisioning, and file/device size. + * + * Return: 0 on success, negative error code on failure. 
+ */ +int sprandom_init(struct thread_data *td, struct fio_file *f) +{ + struct sprandom_info *info = NULL; + double over_provisioning; + uint64_t logical_size; + uint64_t align_bs = td->o.bs[DDIR_WRITE]; + int ret; + + /* sprandom option not set for this job: nothing to set up */ + if (!td->o.sprandom) + return 0; + + /* sprandom_physical_size() rounds with a bit mask, hence power of 2 */ + if (!is_power_of_2(align_bs)) { + log_err("fio: sprandom: bs [%"PRIu64"] should be power of 2", + align_bs); + return -EINVAL; + } + + info = calloc(1, sizeof(*info)); + if (!info) + return -ENOMEM; + + logical_size = min(f->real_file_size, f->io_size); + over_provisioning = td->o.spr_over_provisioning.u.f; + info->num_regions = td->o.spr_num_regions; + info->over_provisioning = over_provisioning; + info->cache_sz = td->o.spr_cache_size; + /* the job's io_size is overwritten with the over-provisioned physical size */ + td->o.io_size = sprandom_physical_size(over_provisioning, + logical_size, align_bs); + info->rand_state = &td->sprandom_state; + ret = sprandom_setup(info, logical_size, align_bs); + if (ret) + goto err; + + /* info is attached to the file only after setup succeeded */ + f->spr_info = info; + return 0; +err: + free(info); + return ret; +} + +/** + * sprandom_free - Frees resources associated with a sprandom_info structure. + * @info: Pointer to the sprandom_info structure to be freed. + * + * Releases memory allocated for invalid_pct, invalid_buf, and the info + * structure itself. Does nothing if @info is NULL. + */ +void sprandom_free(struct sprandom_info *info) +{ + if (!info) + return; + + free(info->invalid_pct); + /* NOTE(review): invalid_buf comes from pcb_alloc(); confirm plain free() is its matching release */ + free(info->invalid_buf); + free(info); +} diff --git a/sprandom.h b/sprandom.h new file mode 100644 index 0000000000..d50c5afb4e --- /dev/null +++ b/sprandom.h @@ -0,0 +1,78 @@ +/** + * SPDX-License-Identifier: GPL-2.0 only + * + * Copyright (c) 2025 Sandisk Corporation or its affiliates. + */ + +#ifndef FIO_SPRANDOM_H +#define FIO_SPRANDOM_H + +#include +#include "lib/rand.h" +#include "pcbuf.h" + +/** + * struct sprandom_info - information for sprandom operations. + * + * @over_provisioning: Over-provisioning ratio for the flash device. + * @region_sz: Size of each region in bytes. 
+ * @num_regions: Number of SPRandom regions. + * @cache_sz: cache size in bytes; when non-zero, replay of a region's invalidations is deferred to the end of the following region. + * @invalid_pct: per-region invalidation ratio, stored as an integer scaled by PCT_PRECISION (see sprandom.c). + * @invalid_buf: invalidation offsets two phase buffer. + * @invalid_capacity: maximal size of invalidation buffer for a region. + * @invalid_count: number of invalid offsets in each phase. + * @current_region: index of the current region being processed. + * @curr_phase: current phase of the invalidation process (0 or 1). + * @region_write_count: number of writes per region (region size divided by the write block size). + * @writes_remaining: number of writes left to perform in the current region. + * @rand_state: state for the random number generator. + */ +struct sprandom_info { + double over_provisioning; + uint64_t region_sz; + uint64_t cache_sz; + uint32_t num_regions; + + uint32_t *invalid_pct; + + /* Invalidation list */ + struct pc_buf *invalid_buf; + uint64_t invalid_capacity; + size_t invalid_count[2]; + uint32_t current_region; + uint32_t curr_phase; + + /* Region and write tracking */ + uint64_t region_write_count; + uint64_t writes_remaining; + + struct frand_state *rand_state; +}; + +/** + * sprandom_init - Initialize the sprandom for a given file and thread. + * @td: FIO thread data + * @f: FIO file + * + * Returns 0 on success, or a negative error code on failure. + */ +int sprandom_init(struct thread_data *td, struct fio_file *f); + +/** + * sprandom_free - Frees resources associated with a sprandom_info structure. + * @info: sprandom_info structure to be freed. + */ +void sprandom_free(struct sprandom_info *info); + +/** + * sprandom_get_next_offset - Get the next random offset for a file. + * @info: sprandom_info structure containing the state + * @f: FIO file + * @b: Output pointer to store the next offset. + * + * Returns 0 when an offset was stored in @b, or 1 when no more offsets are + * available (all regions written or the LFSR is exhausted). 
+ */ +int sprandom_get_next_offset(struct sprandom_info *info, struct fio_file *f, uint64_t *b); + +#endif /* FIO_SPRANDOM_H */ diff --git a/stat.c b/stat.c index 7e84058d9b..620b46262e 100644 --- a/stat.c +++ b/stat.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -23,6 +24,15 @@ #define LOG_MSEC_SLACK 1 #endif +struct log_sample { + union io_sample_data data; + uint32_t ddir; + uint64_t bs; + uint64_t offset; + uint16_t priority; + uint64_t issue_time; +}; + struct fio_sem *stat_sem; void clear_rusage_stat(struct thread_data *td) @@ -138,7 +148,7 @@ static int double_cmp(const void *a, const void *b) return cmp; } -unsigned int calc_clat_percentiles(uint64_t *io_u_plat, unsigned long long nr, +unsigned int calc_clat_percentiles(const uint64_t *io_u_plat, unsigned long long nr, fio_fp64_t *plist, unsigned long long **output, unsigned long long *maxv, unsigned long long *minv) { @@ -202,7 +212,7 @@ unsigned int calc_clat_percentiles(uint64_t *io_u_plat, unsigned long long nr, /* * Find and display the p-th percentile of clat */ -static void show_clat_percentiles(uint64_t *io_u_plat, unsigned long long nr, +static void show_clat_percentiles(const uint64_t *io_u_plat, unsigned long long nr, fio_fp64_t *plist, unsigned int precision, const char *pre, struct buf_output *out) { @@ -265,7 +275,19 @@ static void show_clat_percentiles(uint64_t *io_u_plat, unsigned long long nr, free(ovals); } -bool calc_lat(struct io_stat *is, unsigned long long *min, +static int get_nr_prios_with_samples(struct thread_stat *ts, enum fio_ddir ddir) +{ + int i, nr_prios_with_samples = 0; + + for (i = 0; i < ts->nr_clat_prio[ddir]; i++) { + if (ts->clat_prio[ddir][i].clat_stat.samples) + nr_prios_with_samples++; + } + + return nr_prios_with_samples; +} + +bool calc_lat(const struct io_stat *is, unsigned long long *min, unsigned long long *max, double *mean, double *dev) { double n = (double) is->samples; @@ -285,13 +307,14 @@ bool calc_lat(struct io_stat *is, 
unsigned long long *min, return true; } -void show_mixed_group_stats(struct group_run_stats *rs, struct buf_output *out) +static void show_mixed_group_stats(const struct group_run_stats *rs, struct buf_output *out) { char *io, *agg, *min, *max; char *ioalt, *aggalt, *minalt, *maxalt; - uint64_t io_mix = 0, agg_mix = 0, min_mix = -1, max_mix = 0, min_run = -1, max_run = 0; - int i; + uint64_t io_mix = 0, agg_mix = 0, min_mix = -1, max_mix = 0; + uint64_t min_run = -1, max_run = 0; const int i2p = is_power_of_2(rs->kb_base); + int i; for (i = 0; i < DDIR_RWDIR_CNT; i++) { if (!rs->max_run[i]) @@ -325,7 +348,7 @@ void show_mixed_group_stats(struct group_run_stats *rs, struct buf_output *out) free(maxalt); } -void show_group_stats(struct group_run_stats *rs, struct buf_output *out) +void show_group_stats(const struct group_run_stats *rs, struct buf_output *out) { char *io, *agg, *min, *max; char *ioalt, *aggalt, *minalt, *maxalt; @@ -363,13 +386,13 @@ void show_group_stats(struct group_run_stats *rs, struct buf_output *out) free(minalt); free(maxalt); } - - /* Need to aggregate statisitics to show mixed values */ - if (rs->unified_rw_rep == UNIFIED_BOTH) + + /* Need to aggregate statistics to show mixed values */ + if (rs->unified_rw_rep == UNIFIED_BOTH) show_mixed_group_stats(rs, out); } -void stat_calc_dist(uint64_t *map, unsigned long total, double *io_u_dist) +void stat_calc_dist(const uint64_t *map, unsigned long total, double *io_u_dist) { int i; @@ -387,8 +410,8 @@ void stat_calc_dist(uint64_t *map, unsigned long total, double *io_u_dist) } } -static void stat_calc_lat(struct thread_stat *ts, double *dst, - uint64_t *src, int nr) +static void stat_calc_lat(const struct thread_stat *ts, double *dst, + const uint64_t *src, int nr) { unsigned long total = ddir_rw_sum(ts->total_io_u); int i; @@ -411,7 +434,7 @@ static void stat_calc_lat(struct thread_stat *ts, double *dst, * To keep the terse format unaltered, add all of the ns latency * buckets to the first us 
latency bucket */ -static void stat_calc_lat_nu(struct thread_stat *ts, double *io_u_lat_u) +static void stat_calc_lat_nu(const struct thread_stat *ts, double *io_u_lat_u) { unsigned long ntotal = 0, total = ddir_rw_sum(ts->total_io_u); int i; @@ -424,17 +447,17 @@ static void stat_calc_lat_nu(struct thread_stat *ts, double *io_u_lat_u) io_u_lat_u[0] += 100.0 * (double) ntotal / (double) total; } -void stat_calc_lat_n(struct thread_stat *ts, double *io_u_lat) +void stat_calc_lat_n(const struct thread_stat *ts, double *io_u_lat) { stat_calc_lat(ts, io_u_lat, ts->io_u_lat_n, FIO_IO_U_LAT_N_NR); } -void stat_calc_lat_u(struct thread_stat *ts, double *io_u_lat) +void stat_calc_lat_u(const struct thread_stat *ts, double *io_u_lat) { stat_calc_lat(ts, io_u_lat, ts->io_u_lat_u, FIO_IO_U_LAT_U_NR); } -void stat_calc_lat_m(struct thread_stat *ts, double *io_u_lat) +void stat_calc_lat_m(const struct thread_stat *ts, double *io_u_lat) { stat_calc_lat(ts, io_u_lat, ts->io_u_lat_m, FIO_IO_U_LAT_M_NR); } @@ -461,179 +484,58 @@ static void display_lat(const char *name, unsigned long long min, free(maxp); } -static double convert_agg_kbytes_percent(struct group_run_stats *rs, int ddir, int mean) +static struct thread_stat *gen_mixed_ddir_stats_from_ts(const struct thread_stat *ts) { - double p_of_agg = 100.0; - if (rs && rs->agg[ddir] > 1024) { - p_of_agg = mean * 100.0 / (double) (rs->agg[ddir] / 1024.0); - - if (p_of_agg > 100.0) - p_of_agg = 100.0; - } - return p_of_agg; -} - -static void show_mixed_ddir_status(struct group_run_stats *rs, struct thread_stat *ts, - struct buf_output *out) -{ - unsigned long runt; - unsigned long long min, max, bw, iops; - double mean, dev; - char *io_p, *bw_p, *bw_p_alt, *iops_p, *post_st = NULL; struct thread_stat *ts_lcl; - int i2p; - int ddir = 0; - - /* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */ + /* + * Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and + * Trims (ddir = 2) + */ ts_lcl = 
malloc(sizeof(struct thread_stat)); - memset((void *)ts_lcl, 0, sizeof(struct thread_stat)); - ts_lcl->unified_rw_rep = UNIFIED_MIXED; /* calculate mixed stats */ - init_thread_stat_min_vals(ts_lcl); - - sum_thread_stats(ts_lcl, ts, 1); - - assert(ddir_rw(ddir)); - - if (!ts_lcl->runtime[ddir]) - return; - - i2p = is_power_of_2(rs->kb_base); - runt = ts_lcl->runtime[ddir]; - - bw = (1000 * ts_lcl->io_bytes[ddir]) / runt; - io_p = num2str(ts_lcl->io_bytes[ddir], ts->sig_figs, 1, i2p, N2S_BYTE); - bw_p = num2str(bw, ts->sig_figs, 1, i2p, ts->unit_base); - bw_p_alt = num2str(bw, ts->sig_figs, 1, !i2p, ts->unit_base); - - iops = (1000 * ts_lcl->total_io_u[ddir]) / runt; - iops_p = num2str(iops, ts->sig_figs, 1, 0, N2S_NONE); - - log_buf(out, " mixed: IOPS=%s, BW=%s (%s)(%s/%llumsec)%s\n", - iops_p, bw_p, bw_p_alt, io_p, - (unsigned long long) ts_lcl->runtime[ddir], - post_st ? : ""); - - free(post_st); - free(io_p); - free(bw_p); - free(bw_p_alt); - free(iops_p); - - if (calc_lat(&ts_lcl->slat_stat[ddir], &min, &max, &mean, &dev)) - display_lat("slat", min, max, mean, dev, out); - if (calc_lat(&ts_lcl->clat_stat[ddir], &min, &max, &mean, &dev)) - display_lat("clat", min, max, mean, dev, out); - if (calc_lat(&ts_lcl->lat_stat[ddir], &min, &max, &mean, &dev)) - display_lat(" lat", min, max, mean, dev, out); - if (calc_lat(&ts_lcl->clat_high_prio_stat[ddir], &min, &max, &mean, &dev)) { - display_lat(ts_lcl->lat_percentiles ? "high prio_lat" : "high prio_clat", - min, max, mean, dev, out); - if (calc_lat(&ts_lcl->clat_low_prio_stat[ddir], &min, &max, &mean, &dev)) - display_lat(ts_lcl->lat_percentiles ? 
"low prio_lat" : "low prio_clat", - min, max, mean, dev, out); - } - - if (ts->slat_percentiles && ts_lcl->slat_stat[ddir].samples > 0) - show_clat_percentiles(ts_lcl->io_u_plat[FIO_SLAT][ddir], - ts_lcl->slat_stat[ddir].samples, - ts->percentile_list, - ts->percentile_precision, "slat", out); - if (ts->clat_percentiles && ts_lcl->clat_stat[ddir].samples > 0) - show_clat_percentiles(ts_lcl->io_u_plat[FIO_CLAT][ddir], - ts_lcl->clat_stat[ddir].samples, - ts->percentile_list, - ts->percentile_precision, "clat", out); - if (ts->lat_percentiles && ts_lcl->lat_stat[ddir].samples > 0) - show_clat_percentiles(ts_lcl->io_u_plat[FIO_LAT][ddir], - ts_lcl->lat_stat[ddir].samples, - ts->percentile_list, - ts->percentile_precision, "lat", out); - - if (ts->clat_percentiles || ts->lat_percentiles) { - const char *name = ts->lat_percentiles ? "lat" : "clat"; - char prio_name[32]; - uint64_t samples; - - if (ts->lat_percentiles) - samples = ts_lcl->lat_stat[ddir].samples; - else - samples = ts_lcl->clat_stat[ddir].samples; - - /* Only print this if some high and low priority stats were collected */ - if (ts_lcl->clat_high_prio_stat[ddir].samples > 0 && - ts_lcl->clat_low_prio_stat[ddir].samples > 0) - { - sprintf(prio_name, "high prio (%.2f%%) %s", - 100. * (double) ts_lcl->clat_high_prio_stat[ddir].samples / (double) samples, - name); - show_clat_percentiles(ts_lcl->io_u_plat_high_prio[ddir], - ts_lcl->clat_high_prio_stat[ddir].samples, - ts->percentile_list, - ts->percentile_precision, prio_name, out); - - sprintf(prio_name, "low prio (%.2f%%) %s", - 100. 
* (double) ts_lcl->clat_low_prio_stat[ddir].samples / (double) samples, - name); - show_clat_percentiles(ts_lcl->io_u_plat_low_prio[ddir], - ts_lcl->clat_low_prio_stat[ddir].samples, - ts->percentile_list, - ts->percentile_precision, prio_name, out); - } + if (!ts_lcl) { + log_err("fio: failed to allocate local thread stat\n"); + return NULL; } - if (calc_lat(&ts_lcl->bw_stat[ddir], &min, &max, &mean, &dev)) { - double p_of_agg = 100.0, fkb_base = (double)rs->kb_base; - const char *bw_str; + init_thread_stat(ts_lcl); - if ((rs->unit_base == 1) && i2p) - bw_str = "Kibit"; - else if (rs->unit_base == 1) - bw_str = "kbit"; - else if (i2p) - bw_str = "KiB"; - else - bw_str = "kB"; + /* calculate mixed stats */ + ts_lcl->unified_rw_rep = UNIFIED_MIXED; + ts_lcl->lat_percentiles = ts->lat_percentiles; + ts_lcl->clat_percentiles = ts->clat_percentiles; + ts_lcl->slat_percentiles = ts->slat_percentiles; + ts_lcl->percentile_precision = ts->percentile_precision; + memcpy(ts_lcl->percentile_list, ts->percentile_list, sizeof(ts->percentile_list)); + ts_lcl->sig_figs = ts->sig_figs; - p_of_agg = convert_agg_kbytes_percent(rs, ddir, mean); + sum_thread_stats(ts_lcl, ts); - if (rs->unit_base == 1) { - min *= 8.0; - max *= 8.0; - mean *= 8.0; - dev *= 8.0; - } + return ts_lcl; +} - if (mean > fkb_base * fkb_base) { - min /= fkb_base; - max /= fkb_base; - mean /= fkb_base; - dev /= fkb_base; - bw_str = (rs->unit_base == 1 ? 
"Mibit" : "MiB"); - } +static double convert_agg_kbytes_percent(const struct group_run_stats *rs, + enum fio_ddir ddir, int mean) +{ + double p_of_agg = 100.0; + if (rs && rs->agg[ddir] > 1024) { + p_of_agg = mean * 100.0 / (double) (rs->agg[ddir] / 1024.0); - log_buf(out, " bw (%5s/s): min=%5llu, max=%5llu, per=%3.2f%%, " - "avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n", - bw_str, min, max, p_of_agg, mean, dev, - (&ts_lcl->bw_stat[ddir])->samples); - } - if (calc_lat(&ts_lcl->iops_stat[ddir], &min, &max, &mean, &dev)) { - log_buf(out, " iops : min=%5llu, max=%5llu, " - "avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n", - min, max, mean, dev, (&ts_lcl->iops_stat[ddir])->samples); + if (p_of_agg > 100.0) + p_of_agg = 100.0; } - - free(ts_lcl); + return p_of_agg; } -static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts, - int ddir, struct buf_output *out) +static void show_ddir_status(const struct group_run_stats *rs, struct thread_stat *ts, + enum fio_ddir ddir, struct buf_output *out) { unsigned long runt; unsigned long long min, max, bw, iops; double mean, dev; char *io_p, *bw_p, *bw_p_alt, *iops_p, *post_st = NULL; - int i2p; + int i2p, i; + const char *clat_type = ts->lat_percentiles ? 
"lat" : "clat"; if (ddir_sync(ddir)) { if (calc_lat(&ts->sync_stat, &min, &max, &mean, &dev)) { @@ -663,7 +565,7 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts, iops = (1000 * (uint64_t)ts->total_io_u[ddir]) / runt; iops_p = num2str(iops, ts->sig_figs, 1, 0, N2S_NONE); - if (ddir == DDIR_WRITE) + if (ts->count_zone_resets) post_st = zbd_write_status(ts); else if (ddir == DDIR_READ && ts->cachehit && ts->cachemiss) { uint64_t total; @@ -694,12 +596,24 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts, display_lat("clat", min, max, mean, dev, out); if (calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev)) display_lat(" lat", min, max, mean, dev, out); - if (calc_lat(&ts->clat_high_prio_stat[ddir], &min, &max, &mean, &dev)) { - display_lat(ts->lat_percentiles ? "high prio_lat" : "high prio_clat", - min, max, mean, dev, out); - if (calc_lat(&ts->clat_low_prio_stat[ddir], &min, &max, &mean, &dev)) - display_lat(ts->lat_percentiles ? "low prio_lat" : "low prio_clat", - min, max, mean, dev, out); + + /* Only print per prio stats if there are >= 2 prios with samples */ + if (get_nr_prios_with_samples(ts, ddir) >= 2) { + for (i = 0; i < ts->nr_clat_prio[ddir]; i++) { + char buf[64]; + + if (!calc_lat(&ts->clat_prio[ddir][i].clat_stat, &min, + &max, &mean, &dev)) + continue; + + snprintf(buf, sizeof(buf), + "%s prio %u/%u/%u", + clat_type, + ioprio_class(ts->clat_prio[ddir][i].ioprio), + ioprio(ts->clat_prio[ddir][i].ioprio), + ioprio_hint(ts->clat_prio[ddir][i].ioprio)); + display_lat(buf, min, max, mean, dev, out); + } } if (ts->slat_percentiles && ts->slat_stat[ddir].samples > 0) @@ -719,8 +633,7 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts, ts->percentile_precision, "lat", out); if (ts->clat_percentiles || ts->lat_percentiles) { - const char *name = ts->lat_percentiles ? 
"lat" : "clat"; - char prio_name[32]; + char prio_name[64]; uint64_t samples; if (ts->lat_percentiles) @@ -728,25 +641,27 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts, else samples = ts->clat_stat[ddir].samples; - /* Only print this if some high and low priority stats were collected */ - if (ts->clat_high_prio_stat[ddir].samples > 0 && - ts->clat_low_prio_stat[ddir].samples > 0) - { - sprintf(prio_name, "high prio (%.2f%%) %s", - 100. * (double) ts->clat_high_prio_stat[ddir].samples / (double) samples, - name); - show_clat_percentiles(ts->io_u_plat_high_prio[ddir], - ts->clat_high_prio_stat[ddir].samples, - ts->percentile_list, - ts->percentile_precision, prio_name, out); - - sprintf(prio_name, "low prio (%.2f%%) %s", - 100. * (double) ts->clat_low_prio_stat[ddir].samples / (double) samples, - name); - show_clat_percentiles(ts->io_u_plat_low_prio[ddir], - ts->clat_low_prio_stat[ddir].samples, - ts->percentile_list, - ts->percentile_precision, prio_name, out); + /* Only print per prio stats if there are >= 2 prios with samples */ + if (get_nr_prios_with_samples(ts, ddir) >= 2) { + for (i = 0; i < ts->nr_clat_prio[ddir]; i++) { + uint64_t prio_samples = + ts->clat_prio[ddir][i].clat_stat.samples; + + if (!prio_samples) + continue; + + snprintf(prio_name, sizeof(prio_name), + "%s prio %u/%u/%u (%.2f%% of IOs)", + clat_type, + ioprio_class(ts->clat_prio[ddir][i].ioprio), + ioprio(ts->clat_prio[ddir][i].ioprio), + ioprio_hint(ts->clat_prio[ddir][i].ioprio), + 100. 
* (double) prio_samples / (double) samples); + show_clat_percentiles(ts->clat_prio[ddir][i].io_u_plat, + prio_samples, ts->percentile_list, + ts->percentile_precision, + prio_name, out); + } } } @@ -792,7 +707,20 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts, } } -static bool show_lat(double *io_u_lat, int nr, const char **ranges, +static void show_mixed_ddir_status(const struct group_run_stats *rs, + const struct thread_stat *ts, + struct buf_output *out) +{ + struct thread_stat *ts_lcl = gen_mixed_ddir_stats_from_ts(ts); + + if (ts_lcl) + show_ddir_status(rs, ts_lcl, DDIR_READ, out); + + free_clat_prio_stats(ts_lcl); + free(ts_lcl); +} + +static bool show_lat(const double *io_u_lat, int nr, const char **ranges, const char *msg, struct buf_output *out) { bool new_line = true, shown = false; @@ -823,7 +751,7 @@ static bool show_lat(double *io_u_lat, int nr, const char **ranges, return true; } -static void show_lat_n(double *io_u_lat_n, struct buf_output *out) +static void show_lat_n(const double *io_u_lat_n, struct buf_output *out) { const char *ranges[] = { "2=", "4=", "10=", "20=", "50=", "100=", "250=", "500=", "750=", "1000=", }; @@ -831,7 +759,7 @@ static void show_lat_n(double *io_u_lat_n, struct buf_output *out) show_lat(io_u_lat_n, FIO_IO_U_LAT_N_NR, ranges, "nsec", out); } -static void show_lat_u(double *io_u_lat_u, struct buf_output *out) +static void show_lat_u(const double *io_u_lat_u, struct buf_output *out) { const char *ranges[] = { "2=", "4=", "10=", "20=", "50=", "100=", "250=", "500=", "750=", "1000=", }; @@ -839,7 +767,7 @@ static void show_lat_u(double *io_u_lat_u, struct buf_output *out) show_lat(io_u_lat_u, FIO_IO_U_LAT_U_NR, ranges, "usec", out); } -static void show_lat_m(double *io_u_lat_m, struct buf_output *out) +static void show_lat_m(const double *io_u_lat_m, struct buf_output *out) { const char *ranges[] = { "2=", "4=", "10=", "20=", "50=", "100=", "250=", "500=", "750=", "1000=", "2000=", @@ -848,7 
+776,7 @@ static void show_lat_m(double *io_u_lat_m, struct buf_output *out) show_lat(io_u_lat_m, FIO_IO_U_LAT_M_NR, ranges, "msec", out); } -static void show_latencies(struct thread_stat *ts, struct buf_output *out) +static void show_latencies(const struct thread_stat *ts, struct buf_output *out) { double io_u_lat_n[FIO_IO_U_LAT_N_NR]; double io_u_lat_u[FIO_IO_U_LAT_U_NR]; @@ -1005,10 +933,10 @@ static void show_block_infos(int nr_block_infos, uint32_t *block_infos, i == BLOCK_STATE_COUNT - 1 ? '\n' : ','); } -static void show_ss_normal(struct thread_stat *ts, struct buf_output *out) +static void show_ss_normal(const struct thread_stat *ts, struct buf_output *out) { - char *p1, *p1alt, *p2; - unsigned long long bw_mean, iops_mean; + char *p1, *p1alt, *p2, *p3 = NULL; + unsigned long long bw_mean, iops_mean, lat_mean; const int i2p = is_power_of_2(ts->kb_base); if (!ts->ss_dur) @@ -1016,15 +944,34 @@ static void show_ss_normal(struct thread_stat *ts, struct buf_output *out) bw_mean = steadystate_bw_mean(ts); iops_mean = steadystate_iops_mean(ts); + lat_mean = steadystate_lat_mean(ts); p1 = num2str(bw_mean / ts->kb_base, ts->sig_figs, ts->kb_base, i2p, ts->unit_base); p1alt = num2str(bw_mean / ts->kb_base, ts->sig_figs, ts->kb_base, !i2p, ts->unit_base); p2 = num2str(iops_mean, ts->sig_figs, 1, 0, N2S_NONE); + if (ts->ss_state & FIO_SS_LAT) { + const char *lat_unit = "nsec"; + unsigned long long lat_val = lat_mean; + double lat_mean_d = lat_mean, lat_dev_d = 0.0; + char *lat_num; + + if (nsec_to_msec(&lat_val, &lat_val, &lat_mean_d, &lat_dev_d)) + lat_unit = "msec"; + else if (nsec_to_usec(&lat_val, &lat_val, &lat_mean_d, &lat_dev_d)) + lat_unit = "usec"; - log_buf(out, " steadystate : attained=%s, bw=%s (%s), iops=%s, %s%s=%.3f%s\n", + lat_num = num2str((unsigned long long)lat_mean_d, ts->sig_figs, 1, 0, N2S_NONE); + if (asprintf(&p3, "%s%s", lat_num, lat_unit) < 0) + p3 = NULL; + free(lat_num); + } + + log_buf(out, " steadystate : attained=%s, bw=%s (%s), 
iops=%s%s%s, %s%s=%.3f%s\n", ts->ss_state & FIO_SS_ATTAINED ? "yes" : "no", p1, p1alt, p2, - ts->ss_state & FIO_SS_IOPS ? "iops" : "bw", + p3 ? ", lat=" : "", + p3 ? p3 : "", + ts->ss_state & FIO_SS_IOPS ? "iops" : (ts->ss_state & FIO_SS_LAT ? "lat" : "bw"), ts->ss_state & FIO_SS_SLOPE ? " slope": " mean dev", ts->ss_criterion.u.f, ts->ss_state & FIO_SS_PCT ? "%" : ""); @@ -1032,20 +979,23 @@ static void show_ss_normal(struct thread_stat *ts, struct buf_output *out) free(p1); free(p1alt); free(p2); + free(p3); } -static void show_agg_stats(struct disk_util_agg *agg, int terse, +static void show_agg_stats(const struct disk_util_agg *agg, int terse, struct buf_output *out) { if (!agg->slavecount) return; if (!terse) { - log_buf(out, ", aggrios=%llu/%llu, aggrmerge=%llu/%llu, " - "aggrticks=%llu/%llu, aggrin_queue=%llu, " - "aggrutil=%3.2f%%", + log_buf(out, ", aggrios=%llu/%llu, aggsectors=%llu/%llu, " + "aggrmerge=%llu/%llu, aggrticks=%llu/%llu, " + "aggrin_queue=%llu, aggrutil=%3.2f%%", (unsigned long long) agg->ios[0] / agg->slavecount, (unsigned long long) agg->ios[1] / agg->slavecount, + (unsigned long long) agg->sectors[0] / agg->slavecount, + (unsigned long long) agg->sectors[1] / agg->slavecount, (unsigned long long) agg->merges[0] / agg->slavecount, (unsigned long long) agg->merges[1] / agg->slavecount, (unsigned long long) agg->ticks[0] / agg->slavecount, @@ -1100,7 +1050,7 @@ static void aggregate_slaves_stats(struct disk_util *masterdu) agg->max_util.u.f = 100.0; } -void print_disk_util(struct disk_util_stat *dus, struct disk_util_agg *agg, +void print_disk_util(const struct disk_util_stat *dus, const struct disk_util_agg *agg, int terse, struct buf_output *out) { double util = 0; @@ -1114,11 +1064,14 @@ void print_disk_util(struct disk_util_stat *dus, struct disk_util_agg *agg, if (agg->slavecount) log_buf(out, " "); - log_buf(out, " %s: ios=%llu/%llu, merge=%llu/%llu, " - "ticks=%llu/%llu, in_queue=%llu, util=%3.2f%%", + log_buf(out, " %s: 
ios=%llu/%llu, sectors=%llu/%llu, " + "merge=%llu/%llu, ticks=%llu/%llu, in_queue=%llu, " + "util=%3.2f%%", dus->name, (unsigned long long) dus->s.ios[0], (unsigned long long) dus->s.ios[1], + (unsigned long long) dus->s.sectors[0], + (unsigned long long) dus->s.sectors[1], (unsigned long long) dus->s.merges[0], (unsigned long long) dus->s.merges[1], (unsigned long long) dus->s.ticks[0], @@ -1148,8 +1101,8 @@ void print_disk_util(struct disk_util_stat *dus, struct disk_util_agg *agg, log_buf(out, "\n"); } -void json_array_add_disk_util(struct disk_util_stat *dus, - struct disk_util_agg *agg, struct json_array *array) +void json_array_add_disk_util(const struct disk_util_stat *dus, + const struct disk_util_agg *agg, struct json_array *array) { struct json_object *obj; double util = 0; @@ -1165,6 +1118,8 @@ void json_array_add_disk_util(struct disk_util_stat *dus, json_object_add_value_string(obj, "name", (const char *)dus->name); json_object_add_value_int(obj, "read_ios", dus->s.ios[0]); json_object_add_value_int(obj, "write_ios", dus->s.ios[1]); + json_object_add_value_int(obj, "read_sectors", dus->s.sectors[0]); + json_object_add_value_int(obj, "write_sectors", dus->s.sectors[1]); json_object_add_value_int(obj, "read_merges", dus->s.merges[0]); json_object_add_value_int(obj, "write_merges", dus->s.merges[1]); json_object_add_value_int(obj, "read_ticks", dus->s.ticks[0]); @@ -1182,6 +1137,10 @@ void json_array_add_disk_util(struct disk_util_stat *dus, agg->ios[0] / agg->slavecount); json_object_add_value_int(obj, "aggr_write_ios", agg->ios[1] / agg->slavecount); + json_object_add_value_int(obj, "aggr_read_sectors", + agg->sectors[0] / agg->slavecount); + json_object_add_value_int(obj, "aggr_write_sectors", + agg->sectors[1] / agg->slavecount); json_object_add_value_int(obj, "aggr_read_merges", agg->merges[0] / agg->slavecount); json_object_add_value_int(obj, "aggr_write_merge", @@ -1212,8 +1171,8 @@ static void json_object_add_disk_utils(struct json_object *obj, } 
} -void show_disk_util(int terse, struct json_object *parent, - struct buf_output *out) +static void show_disk_util(int terse, struct json_object *parent, + struct buf_output *out) { struct flist_head *entry; struct disk_util *du; @@ -1222,9 +1181,8 @@ void show_disk_util(int terse, struct json_object *parent, if (!is_running_backend()) return; - if (flist_empty(&disk_list)) { + if (flist_empty(&disk_list)) return; - } if ((output_format & FIO_OUTPUT_JSON) && parent) do_json = true; @@ -1234,9 +1192,9 @@ void show_disk_util(int terse, struct json_object *parent, if (!terse && !do_json) log_buf(out, "\nDisk stats (read/write):\n"); - if (do_json) + if (do_json) { json_object_add_disk_utils(parent, &disk_list); - else if (output_format & ~(FIO_OUTPUT_JSON | FIO_OUTPUT_JSON_PLUS)) { + } else if (output_format & ~(FIO_OUTPUT_JSON | FIO_OUTPUT_JSON_PLUS)) { flist_for_each(entry, &disk_list) { du = flist_entry(entry, struct disk_util, list); @@ -1247,7 +1205,7 @@ void show_disk_util(int terse, struct json_object *parent, } static void show_thread_status_normal(struct thread_stat *ts, - struct group_run_stats *rs, + const struct group_run_stats *rs, struct buf_output *out) { double usr_cpu, sys_cpu; @@ -1363,8 +1321,9 @@ static void show_thread_status_normal(struct thread_stat *ts, } static void show_ddir_status_terse(struct thread_stat *ts, - struct group_run_stats *rs, int ddir, - int ver, struct buf_output *out) + const struct group_run_stats *rs, + enum fio_ddir ddir, int ver, + struct buf_output *out) { unsigned long long min, max, minv, maxv, bw, iops; unsigned long long *ovals = NULL; @@ -1396,19 +1355,20 @@ static void show_ddir_status_terse(struct thread_stat *ts, else log_buf(out, ";%llu;%llu;%f;%f", 0ULL, 0ULL, 0.0, 0.0); - if (ts->lat_percentiles) + if (ts->lat_percentiles) { len = calc_clat_percentiles(ts->io_u_plat[FIO_LAT][ddir], ts->lat_stat[ddir].samples, ts->percentile_list, &ovals, &maxv, &minv); - else if (ts->clat_percentiles) + } else if 
(ts->clat_percentiles) { len = calc_clat_percentiles(ts->io_u_plat[FIO_CLAT][ddir], ts->clat_stat[ddir].samples, ts->percentile_list, &ovals, &maxv, &minv); - else + } else { len = 0; - + } + for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) { if (i >= len) { log_buf(out, ";0%%=0"); @@ -1435,8 +1395,9 @@ static void show_ddir_status_terse(struct thread_stat *ts, } log_buf(out, ";%llu;%llu;%f%%;%f;%f", min, max, p_of_agg, mean, dev); - } else + } else { log_buf(out, ";%llu;%llu;%f%%;%f;%f", 0ULL, 0ULL, 0.0, 0.0, 0.0); + } if (ver == 5) { if (bw_stat) @@ -1452,32 +1413,23 @@ static void show_ddir_status_terse(struct thread_stat *ts, } } -static void show_mixed_ddir_status_terse(struct thread_stat *ts, - struct group_run_stats *rs, - int ver, struct buf_output *out) +static void show_mixed_ddir_status_terse(const struct thread_stat *ts, + const struct group_run_stats *rs, + int ver, struct buf_output *out) { - struct thread_stat *ts_lcl; + struct thread_stat *ts_lcl = gen_mixed_ddir_stats_from_ts(ts); - /* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */ - ts_lcl = malloc(sizeof(struct thread_stat)); - memset((void *)ts_lcl, 0, sizeof(struct thread_stat)); - ts_lcl->unified_rw_rep = UNIFIED_MIXED; /* calculate mixed stats */ - init_thread_stat_min_vals(ts_lcl); - ts_lcl->lat_percentiles = ts->lat_percentiles; - ts_lcl->clat_percentiles = ts->clat_percentiles; - ts_lcl->slat_percentiles = ts->slat_percentiles; - ts_lcl->percentile_precision = ts->percentile_precision; - memcpy(ts_lcl->percentile_list, ts->percentile_list, sizeof(ts->percentile_list)); - - sum_thread_stats(ts_lcl, ts, 1); + if (ts_lcl) + show_ddir_status_terse(ts_lcl, rs, DDIR_READ, ver, out); - /* add the aggregated stats to json parent */ - show_ddir_status_terse(ts_lcl, rs, DDIR_READ, ver, out); + free_clat_prio_stats(ts_lcl); free(ts_lcl); } -static struct json_object *add_ddir_lat_json(struct thread_stat *ts, uint32_t percentiles, - struct io_stat *lat_stat, uint64_t 
*io_u_plat) +static struct json_object *add_ddir_lat_json(struct thread_stat *ts, + uint32_t percentiles, + const struct io_stat *lat_stat, + const uint64_t *io_u_plat) { char buf[120]; double mean, dev; @@ -1527,7 +1479,8 @@ static struct json_object *add_ddir_lat_json(struct thread_stat *ts, uint32_t pe } static void add_ddir_status_json(struct thread_stat *ts, - struct group_run_stats *rs, int ddir, struct json_object *parent) + const struct group_run_stats *rs, enum fio_ddir ddir, + struct json_object *parent) { unsigned long long min, max; unsigned long long bw_bytes, bw; @@ -1587,25 +1540,41 @@ static void add_ddir_status_json(struct thread_stat *ts, if (!ddir_rw(ddir)) return; - /* Only print PRIO latencies if some high priority samples were gathered */ - if (ts->clat_high_prio_stat[ddir].samples > 0) { - const char *high, *low; + /* Only include per prio stats if there are >= 2 prios with samples */ + if (get_nr_prios_with_samples(ts, ddir) >= 2) { + struct json_array *array = json_create_array(); + const char *obj_name; + int i; - if (ts->lat_percentiles) { - high = "lat_high_prio"; - low = "lat_low_prio"; - } else { - high = "clat_high_prio"; - low = "clat_low_prio"; - } + if (ts->lat_percentiles) + obj_name = "lat_ns"; + else + obj_name = "clat_ns"; + + json_object_add_value_array(dir_object, "prios", array); + + for (i = 0; i < ts->nr_clat_prio[ddir]; i++) { + struct json_object *obj; - tmp_object = add_ddir_lat_json(ts, ts->clat_percentiles | ts->lat_percentiles, - &ts->clat_high_prio_stat[ddir], ts->io_u_plat_high_prio[ddir]); - json_object_add_value_object(dir_object, high, tmp_object); + if (!ts->clat_prio[ddir][i].clat_stat.samples) + continue; - tmp_object = add_ddir_lat_json(ts, ts->clat_percentiles | ts->lat_percentiles, - &ts->clat_low_prio_stat[ddir], ts->io_u_plat_low_prio[ddir]); - json_object_add_value_object(dir_object, low, tmp_object); + obj = json_create_object(); + + json_object_add_value_int(obj, "prioclass", + 
ioprio_class(ts->clat_prio[ddir][i].ioprio)); + json_object_add_value_int(obj, "prio", + ioprio(ts->clat_prio[ddir][i].ioprio)); + json_object_add_value_int(obj, "priohint", + ioprio_hint(ts->clat_prio[ddir][i].ioprio)); + + tmp_object = add_ddir_lat_json(ts, + ts->clat_percentiles | ts->lat_percentiles, + &ts->clat_prio[ddir][i].clat_stat, + ts->clat_prio[ddir][i].io_u_plat); + json_object_add_value_object(obj, obj_name, tmp_object); + json_array_add_value_object(array, obj); + } } if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) { @@ -1648,23 +1617,13 @@ static void add_ddir_status_json(struct thread_stat *ts, static void add_mixed_ddir_status_json(struct thread_stat *ts, struct group_run_stats *rs, struct json_object *parent) { - struct thread_stat *ts_lcl; - - /* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */ - ts_lcl = malloc(sizeof(struct thread_stat)); - memset((void *)ts_lcl, 0, sizeof(struct thread_stat)); - ts_lcl->unified_rw_rep = UNIFIED_MIXED; /* calculate mixed stats */ - init_thread_stat_min_vals(ts_lcl); - ts_lcl->lat_percentiles = ts->lat_percentiles; - ts_lcl->clat_percentiles = ts->clat_percentiles; - ts_lcl->slat_percentiles = ts->slat_percentiles; - ts_lcl->percentile_precision = ts->percentile_precision; - memcpy(ts_lcl->percentile_list, ts->percentile_list, sizeof(ts->percentile_list)); - - sum_thread_stats(ts_lcl, ts, 1); + struct thread_stat *ts_lcl = gen_mixed_ddir_stats_from_ts(ts); /* add the aggregated stats to json parent */ - add_ddir_status_json(ts_lcl, rs, DDIR_READ, parent); + if (ts_lcl) + add_ddir_status_json(ts_lcl, rs, DDIR_READ, parent); + + free_clat_prio_stats(ts_lcl); free(ts_lcl); } @@ -1783,6 +1742,7 @@ static struct json_object *show_thread_status_json(struct thread_stat *ts, root = json_create_object(); json_object_add_value_string(root, "jobname", ts->name); json_object_add_value_int(root, "groupid", ts->groupid); + json_object_add_value_int(root, "job_start", 
ts->job_start); json_object_add_value_int(root, "error", ts->error); /* ETA Info */ @@ -1790,6 +1750,7 @@ static struct json_object *show_thread_status_json(struct thread_stat *ts, if (je) { json_object_add_value_int(root, "eta", je->eta_sec); json_object_add_value_int(root, "elapsed", je->elapsed_sec); + free(je); } if (opt_list) @@ -1959,9 +1920,10 @@ static struct json_object *show_thread_status_json(struct thread_stat *ts, struct json_array *iops, *bw; int j, k, l; char ss_buf[64]; + int intervals = ts->ss_dur / (ss_check_interval / 1000L); snprintf(ss_buf, sizeof(ss_buf), "%s%s:%f%s", - ts->ss_state & FIO_SS_IOPS ? "iops" : "bw", + ts->ss_state & FIO_SS_IOPS ? "iops" : (ts->ss_state & FIO_SS_LAT ? "lat" : "bw"), ts->ss_state & FIO_SS_SLOPE ? "_slope" : "", (float) ts->ss_limit.u.f, ts->ss_state & FIO_SS_PCT ? "%" : ""); @@ -1992,18 +1954,32 @@ static struct json_object *show_thread_status_json(struct thread_stat *ts, if ((ts->ss_state & FIO_SS_ATTAINED) || !(ts->ss_state & FIO_SS_BUFFER_FULL)) j = ts->ss_head; else - j = ts->ss_head == 0 ? ts->ss_dur - 1 : ts->ss_head - 1; - for (l = 0; l < ts->ss_dur; l++) { - k = (j + l) % ts->ss_dur; + j = ts->ss_head == 0 ? 
intervals - 1 : ts->ss_head - 1; + for (l = 0; l < intervals; l++) { + k = (j + l) % intervals; json_array_add_value_int(bw, ts->ss_bw_data[k]); json_array_add_value_int(iops, ts->ss_iops_data[k]); } json_object_add_value_int(data, "bw_mean", steadystate_bw_mean(ts)); json_object_add_value_int(data, "iops_mean", steadystate_iops_mean(ts)); + if (ts->ss_state & FIO_SS_LAT) { + struct json_array *lat; + lat = json_create_array(); + for (l = 0; l < intervals; l++) { + k = (j + l) % intervals; + json_array_add_value_int(lat, ts->ss_lat_data[k]); + } + json_object_add_value_int(data, "lat_mean", steadystate_lat_mean(ts)); + json_object_add_value_array(data, "lat_ns", lat); + } json_object_add_value_array(data, "iops", iops); json_object_add_value_array(data, "bw", bw); } + if (ts->count_zone_resets) + json_object_add_value_int(root, "zone_resets", + ts->nr_zone_resets); + return root; } @@ -2034,7 +2010,7 @@ struct json_object *show_thread_status(struct thread_stat *ts, return ret; } -static void __sum_stat(struct io_stat *dst, struct io_stat *src, bool first) +static void __sum_stat(struct io_stat *dst, const struct io_stat *src, bool first) { double mean, S; @@ -2073,9 +2049,10 @@ static void __sum_stat(struct io_stat *dst, struct io_stat *src, bool first) * numbers. For group_reporting, we should just add those up, not make * them the mean of everything. 
*/ -static void sum_stat(struct io_stat *dst, struct io_stat *src, bool first, - bool pure_sum) +static void sum_stat(struct io_stat *dst, const struct io_stat *src, bool pure_sum) { + bool first = dst->samples == 0; + if (src->samples == 0) return; @@ -2099,9 +2076,9 @@ static void sum_stat(struct io_stat *dst, struct io_stat *src, bool first, } } -void sum_group_stats(struct group_run_stats *dst, struct group_run_stats *src) +void sum_group_stats(struct group_run_stats *dst, const struct group_run_stats *src) { - int i; + unsigned int i; for (i = 0; i < DDIR_RWDIR_CNT; i++) { if (dst->max_run[i] < src->max_run[i]) @@ -2125,48 +2102,251 @@ void sum_group_stats(struct group_run_stats *dst, struct group_run_stats *src) dst->sig_figs = src->sig_figs; } -void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, - bool first) +/* + * Free the clat_prio_stat arrays allocated by alloc_clat_prio_stat_ddir(). + */ +void free_clat_prio_stats(struct thread_stat *ts) +{ + enum fio_ddir ddir; + + if (!ts) + return; + + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { + sfree(ts->clat_prio[ddir]); + ts->clat_prio[ddir] = NULL; + ts->nr_clat_prio[ddir] = 0; + } +} + +/* + * Allocate a clat_prio_stat array. The array has to be allocated/freed using + * smalloc/sfree, so that it is accessible by the process/thread summing the + * thread_stats. 
+ */ +int alloc_clat_prio_stat_ddir(struct thread_stat *ts, enum fio_ddir ddir, + int nr_prios) +{ + struct clat_prio_stat *clat_prio; + int i; + + clat_prio = scalloc(nr_prios, sizeof(*ts->clat_prio[ddir])); + if (!clat_prio) { + log_err("fio: failed to allocate ts clat data\n"); + return 1; + } + + for (i = 0; i < nr_prios; i++) + clat_prio[i].clat_stat.min_val = ULONG_MAX; + + ts->clat_prio[ddir] = clat_prio; + ts->nr_clat_prio[ddir] = nr_prios; + + return 0; +} + +static int grow_clat_prio_stat(struct thread_stat *dst, enum fio_ddir ddir) +{ + int curr_len = dst->nr_clat_prio[ddir]; + void *new_arr; + + new_arr = scalloc(curr_len + 1, sizeof(*dst->clat_prio[ddir])); + if (!new_arr) { + log_err("fio: failed to grow clat prio array\n"); + return 1; + } + + memcpy(new_arr, dst->clat_prio[ddir], + curr_len * sizeof(*dst->clat_prio[ddir])); + sfree(dst->clat_prio[ddir]); + + dst->clat_prio[ddir] = new_arr; + dst->clat_prio[ddir][curr_len].clat_stat.min_val = ULONG_MAX; + dst->nr_clat_prio[ddir]++; + + return 0; +} + +static int find_clat_prio_index(struct thread_stat *dst, enum fio_ddir ddir, + uint32_t ioprio) +{ + int i, nr_prios = dst->nr_clat_prio[ddir]; + + for (i = 0; i < nr_prios; i++) { + if (dst->clat_prio[ddir][i].ioprio == ioprio) + return i; + } + + return -1; +} + +static int alloc_or_get_clat_prio_index(struct thread_stat *dst, + enum fio_ddir ddir, uint32_t ioprio, + int *idx) +{ + int index = find_clat_prio_index(dst, ddir, ioprio); + + if (index == -1) { + index = dst->nr_clat_prio[ddir]; + + if (grow_clat_prio_stat(dst, ddir)) + return 1; + + dst->clat_prio[ddir][index].ioprio = ioprio; + } + + *idx = index; + + return 0; +} + +static int clat_prio_stats_copy(struct thread_stat *dst, const struct thread_stat *src, + enum fio_ddir dst_ddir, enum fio_ddir src_ddir) +{ + size_t sz = sizeof(*src->clat_prio[src_ddir]) * + src->nr_clat_prio[src_ddir]; + + dst->clat_prio[dst_ddir] = smalloc(sz); + if (!dst->clat_prio[dst_ddir]) { + log_err("fio: failed to 
alloc clat prio array\n"); + return 1; + } + + memcpy(dst->clat_prio[dst_ddir], src->clat_prio[src_ddir], sz); + dst->nr_clat_prio[dst_ddir] = src->nr_clat_prio[src_ddir]; + + return 0; +} + +static int clat_prio_stat_add_samples(struct thread_stat *dst, + enum fio_ddir dst_ddir, uint32_t ioprio, + const struct io_stat *io_stat, + const uint64_t *io_u_plat) +{ + int i, dst_index; + + if (!io_stat->samples) + return 0; + + if (alloc_or_get_clat_prio_index(dst, dst_ddir, ioprio, &dst_index)) + return 1; + + sum_stat(&dst->clat_prio[dst_ddir][dst_index].clat_stat, io_stat, + false); + + for (i = 0; i < FIO_IO_U_PLAT_NR; i++) + dst->clat_prio[dst_ddir][dst_index].io_u_plat[i] += io_u_plat[i]; + + return 0; +} + +static int sum_clat_prio_stats_src_single_prio(struct thread_stat *dst, + const struct thread_stat *src, + enum fio_ddir dst_ddir, + enum fio_ddir src_ddir) +{ + const struct io_stat *io_stat; + const uint64_t *io_u_plat; + + /* + * If src ts has no clat_prio_stat array, then all I/Os were submitted + * using src->ioprio. Thus, the global samples in src->clat_stat (or + * src->lat_stat) can be used as the 'per prio' samples for src->ioprio. + */ + assert(!src->clat_prio[src_ddir]); + assert(src->nr_clat_prio[src_ddir] == 0); + + if (src->lat_percentiles) { + io_u_plat = src->io_u_plat[FIO_LAT][src_ddir]; + io_stat = &src->lat_stat[src_ddir]; + } else { + io_u_plat = src->io_u_plat[FIO_CLAT][src_ddir]; + io_stat = &src->clat_stat[src_ddir]; + } + + return clat_prio_stat_add_samples(dst, dst_ddir, src->ioprio, io_stat, + io_u_plat); +} + +static int sum_clat_prio_stats_src_multi_prio(struct thread_stat *dst, + const struct thread_stat *src, + enum fio_ddir dst_ddir, + enum fio_ddir src_ddir) +{ + int i; + + /* + * If src ts has a clat_prio_stat array, then there are multiple prios + * in use (i.e. src ts had cmdprio_percentage or cmdprio_bssplit set). 
+ * The samples for the default prio will exist in the src->clat_prio + * array, just like the samples for any other prio. + */ + assert(src->clat_prio[src_ddir]); + assert(src->nr_clat_prio[src_ddir]); + + /* If the dst ts doesn't yet have a clat_prio array, simply memcpy. */ + if (!dst->clat_prio[dst_ddir]) + return clat_prio_stats_copy(dst, src, dst_ddir, src_ddir); + + /* The dst ts already has a clat_prio_array, add src stats into it. */ + for (i = 0; i < src->nr_clat_prio[src_ddir]; i++) { + struct io_stat *io_stat = &src->clat_prio[src_ddir][i].clat_stat; + uint64_t *io_u_plat = src->clat_prio[src_ddir][i].io_u_plat; + uint32_t ioprio = src->clat_prio[src_ddir][i].ioprio; + + if (clat_prio_stat_add_samples(dst, dst_ddir, ioprio, io_stat, io_u_plat)) + return 1; + } + + return 0; +} + +static int sum_clat_prio_stats(struct thread_stat *dst, const struct thread_stat *src, + enum fio_ddir dst_ddir, enum fio_ddir src_ddir) +{ + if (dst->disable_prio_stat) + return 0; + + if (!src->clat_prio[src_ddir]) + return sum_clat_prio_stats_src_single_prio(dst, src, dst_ddir, + src_ddir); + + return sum_clat_prio_stats_src_multi_prio(dst, src, dst_ddir, src_ddir); +} + +void sum_thread_stats(struct thread_stat *dst, const struct thread_stat *src) { int k, l, m; for (l = 0; l < DDIR_RWDIR_CNT; l++) { - if (!(dst->unified_rw_rep == UNIFIED_MIXED)) { - sum_stat(&dst->clat_stat[l], &src->clat_stat[l], first, false); - sum_stat(&dst->clat_high_prio_stat[l], &src->clat_high_prio_stat[l], first, false); - sum_stat(&dst->clat_low_prio_stat[l], &src->clat_low_prio_stat[l], first, false); - sum_stat(&dst->slat_stat[l], &src->slat_stat[l], first, false); - sum_stat(&dst->lat_stat[l], &src->lat_stat[l], first, false); - sum_stat(&dst->bw_stat[l], &src->bw_stat[l], first, true); - sum_stat(&dst->iops_stat[l], &src->iops_stat[l], first, true); + if (dst->unified_rw_rep != UNIFIED_MIXED) { + sum_stat(&dst->clat_stat[l], &src->clat_stat[l], false); + sum_stat(&dst->slat_stat[l], 
&src->slat_stat[l], false); + sum_stat(&dst->lat_stat[l], &src->lat_stat[l], false); + sum_stat(&dst->bw_stat[l], &src->bw_stat[l], true); + sum_stat(&dst->iops_stat[l], &src->iops_stat[l], true); + sum_clat_prio_stats(dst, src, l, l); dst->io_bytes[l] += src->io_bytes[l]; if (dst->runtime[l] < src->runtime[l]) dst->runtime[l] = src->runtime[l]; } else { - sum_stat(&dst->clat_stat[0], &src->clat_stat[l], first, false); - sum_stat(&dst->clat_high_prio_stat[0], &src->clat_high_prio_stat[l], first, false); - sum_stat(&dst->clat_low_prio_stat[0], &src->clat_low_prio_stat[l], first, false); - sum_stat(&dst->slat_stat[0], &src->slat_stat[l], first, false); - sum_stat(&dst->lat_stat[0], &src->lat_stat[l], first, false); - sum_stat(&dst->bw_stat[0], &src->bw_stat[l], first, true); - sum_stat(&dst->iops_stat[0], &src->iops_stat[l], first, true); + sum_stat(&dst->clat_stat[0], &src->clat_stat[l], false); + sum_stat(&dst->slat_stat[0], &src->slat_stat[l], false); + sum_stat(&dst->lat_stat[0], &src->lat_stat[l], false); + sum_stat(&dst->bw_stat[0], &src->bw_stat[l], true); + sum_stat(&dst->iops_stat[0], &src->iops_stat[l], true); + sum_clat_prio_stats(dst, src, 0, l); dst->io_bytes[0] += src->io_bytes[l]; if (dst->runtime[0] < src->runtime[l]) dst->runtime[0] = src->runtime[l]; - - /* - * We're summing to the same destination, so override - * 'first' after the first iteration of the loop - */ - first = false; } } - sum_stat(&dst->sync_stat, &src->sync_stat, first, false); + sum_stat(&dst->sync_stat, &src->sync_stat, false); dst->usr_time += src->usr_time; dst->sys_time += src->sys_time; dst->ctx += src->ctx; @@ -2187,7 +2367,7 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, dst->io_u_lat_m[k] += src->io_u_lat_m[k]; for (k = 0; k < DDIR_RWDIR_CNT; k++) { - if (!(dst->unified_rw_rep == UNIFIED_MIXED)) { + if (dst->unified_rw_rep != UNIFIED_MIXED) { dst->total_io_u[k] += src->total_io_u[k]; dst->short_io_u[k] += src->short_io_u[k]; dst->drop_io_u[k] += 
src->drop_io_u[k]; @@ -2203,7 +2383,7 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, for (k = 0; k < FIO_LAT_CNT; k++) for (l = 0; l < DDIR_RWDIR_CNT; l++) for (m = 0; m < FIO_IO_U_PLAT_NR; m++) - if (!(dst->unified_rw_rep == UNIFIED_MIXED)) + if (dst->unified_rw_rep != UNIFIED_MIXED) dst->io_u_plat[k][l][m] += src->io_u_plat[k][l][m]; else dst->io_u_plat[k][0][m] += src->io_u_plat[k][l][m]; @@ -2211,23 +2391,13 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, for (k = 0; k < FIO_IO_U_PLAT_NR; k++) dst->io_u_sync_plat[k] += src->io_u_sync_plat[k]; - for (k = 0; k < DDIR_RWDIR_CNT; k++) { - for (m = 0; m < FIO_IO_U_PLAT_NR; m++) { - if (!(dst->unified_rw_rep == UNIFIED_MIXED)) { - dst->io_u_plat_high_prio[k][m] += src->io_u_plat_high_prio[k][m]; - dst->io_u_plat_low_prio[k][m] += src->io_u_plat_low_prio[k][m]; - } else { - dst->io_u_plat_high_prio[0][m] += src->io_u_plat_high_prio[k][m]; - dst->io_u_plat_low_prio[0][m] += src->io_u_plat_low_prio[k][m]; - } - - } - } - dst->total_run_time += src->total_run_time; dst->total_submit += src->total_submit; dst->total_complete += src->total_complete; - dst->nr_zone_resets += src->nr_zone_resets; + if (src->count_zone_resets) { + dst->count_zone_resets = 1; + dst->nr_zone_resets += src->nr_zone_resets; + } dst->cachehit += src->cachehit; dst->cachemiss += src->cachemiss; } @@ -2251,8 +2421,6 @@ void init_thread_stat_min_vals(struct thread_stat *ts) ts->lat_stat[i].min_val = ULONG_MAX; ts->bw_stat[i].min_val = ULONG_MAX; ts->iops_stat[i].min_val = ULONG_MAX; - ts->clat_high_prio_stat[i].min_val = ULONG_MAX; - ts->clat_low_prio_stat[i].min_val = ULONG_MAX; } ts->sync_stat.min_val = ULONG_MAX; } @@ -2265,10 +2433,60 @@ void init_thread_stat(struct thread_stat *ts) ts->groupid = -1; } +static void init_per_prio_stats(struct thread_stat *threadstats, int nr_ts) +{ + struct thread_stat *ts; + int i, j, last_ts, idx; + enum fio_ddir ddir; + + j = 0; + last_ts = -1; + idx = 0; + 
+ /* + * Loop through all tds, if a td requires per prio stats, temporarily + * store a 1 in ts->disable_prio_stat, and then do an additional + * loop at the end where we invert the ts->disable_prio_stat values. + */ + for_each_td(td) { + if (!td->o.stats) + continue; + if (idx && + (!td->o.group_reporting || + (td->o.group_reporting && last_ts != td->groupid))) { + idx = 0; + j++; + } + + last_ts = td->groupid; + ts = &threadstats[j]; + + /* idx == 0 means first td in group, or td is not in a group. */ + if (idx == 0) + ts->ioprio = td->ioprio; + else if (td->ioprio != ts->ioprio) + ts->disable_prio_stat = 1; + + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { + if (td->ts.clat_prio[ddir]) { + ts->disable_prio_stat = 1; + break; + } + } + + idx++; + } end_for_each(); + + /* Loop through all dst threadstats and fixup the values. */ + for (i = 0; i < nr_ts; i++) { + ts = &threadstats[i]; + ts->disable_prio_stat = !ts->disable_prio_stat; + } +} + void __show_run_stats(void) { struct group_run_stats *runstats, *rs; - struct thread_data *td; struct thread_stat *threadstats, *ts; int i, j, k, nr_ts, last_ts, idx; bool kb_base_warned = false; @@ -2289,7 +2507,7 @@ void __show_run_stats(void) */ nr_ts = 0; last_ts = -1; - for_each_td(td, i) { + for_each_td(td) { if (!td->o.group_reporting) { nr_ts++; continue; @@ -2301,7 +2519,7 @@ void __show_run_stats(void) last_ts = td->groupid; nr_ts++; - } + } end_for_each(); threadstats = malloc(nr_ts * sizeof(struct thread_stat)); opt_lists = malloc(nr_ts * sizeof(struct flist_head *)); @@ -2311,10 +2529,12 @@ void __show_run_stats(void) opt_lists[i] = NULL; } + init_per_prio_stats(threadstats, nr_ts); + j = 0; last_ts = -1; idx = 0; - for_each_td(td, i) { + for_each_td(td) { if (!td->o.stats) continue; if (idx && (!td->o.group_reporting || @@ -2335,7 +2555,6 @@ void __show_run_stats(void) opt_lists[j] = &td->opt_list; idx++; - ts->members++; if (ts->groupid == -1) { /* @@ -2355,6 +2574,7 @@ void __show_run_stats(void) */ 
ts->thread_number = td->thread_number; ts->groupid = td->groupid; + ts->job_start = td->job_start; /* * first pid in group, not very useful... @@ -2384,7 +2604,7 @@ void __show_run_stats(void) ts->error = td->first_error; snprintf(ts->verror, sizeof(ts->verror), "%s", td->verror); - } else if (td->error) { + } else if (td->error) { ts->error = td->error; snprintf(ts->verror, sizeof(ts->verror), "%s", td->verror); @@ -2400,7 +2620,9 @@ void __show_run_stats(void) for (k = 0; k < ts->nr_block_infos; k++) ts->block_infos[k] = td->ts.block_infos[k]; - sum_thread_stats(ts, &td->ts, idx == 1); + sum_thread_stats(ts, &td->ts); + + ts->members++; if (td->o.ss_dur) { ts->ss_state = td->ss.state; @@ -2408,6 +2630,7 @@ void __show_run_stats(void) ts->ss_head = td->ss.head; ts->ss_bw_data = td->ss.bw_data; ts->ss_iops_data = td->ss.iops_data; + ts->ss_lat_data = td->ss.lat_data; ts->ss_limit.u.f = td->ss.limit; ts->ss_slope.u.f = td->ss.slope; ts->ss_deviation.u.f = td->ss.deviation; @@ -2415,7 +2638,7 @@ void __show_run_stats(void) } else ts->ss_dur = ts->ss_state = 0; - } + } end_for_each(); for (i = 0; i < nr_ts; i++) { unsigned long long bw; @@ -2450,7 +2673,7 @@ void __show_run_stats(void) } for (i = 0; i < groupid + 1; i++) { - int ddir; + enum fio_ddir ddir; rs = &runstats[i]; @@ -2556,39 +2779,46 @@ void __show_run_stats(void) log_info_flush(); free(runstats); + + /* free arrays allocated by sum_thread_stats(), if any */ + for (i = 0; i < nr_ts; i++) { + ts = &threadstats[i]; + free_clat_prio_stats(ts); + } free(threadstats); free(opt_lists); } int __show_running_run_stats(void) { - struct thread_data *td; unsigned long long *rt; struct timespec ts; - int i; fio_sem_down(stat_sem); rt = malloc(thread_number * sizeof(unsigned long long)); fio_gettime(&ts, NULL); - for_each_td(td, i) { + for_each_td(td) { + if (td->runstate >= TD_EXITED) + continue; + td->update_rusage = 1; for_each_rw_ddir(ddir) { td->ts.io_bytes[ddir] = td->io_bytes[ddir]; } td->ts.total_run_time = 
mtime_since(&td->epoch, &ts); - rt[i] = mtime_since(&td->start, &ts); + rt[__td_index] = mtime_since(&td->start, &ts); if (td_read(td) && td->ts.io_bytes[DDIR_READ]) - td->ts.runtime[DDIR_READ] += rt[i]; + td->ts.runtime[DDIR_READ] += rt[__td_index]; if (td_write(td) && td->ts.io_bytes[DDIR_WRITE]) - td->ts.runtime[DDIR_WRITE] += rt[i]; + td->ts.runtime[DDIR_WRITE] += rt[__td_index]; if (td_trim(td) && td->ts.io_bytes[DDIR_TRIM]) - td->ts.runtime[DDIR_TRIM] += rt[i]; - } + td->ts.runtime[DDIR_TRIM] += rt[__td_index]; + } end_for_each(); - for_each_td(td, i) { + for_each_td(td) { if (td->runstate >= TD_EXITED) continue; if (td->rusage_sem) { @@ -2596,18 +2826,21 @@ int __show_running_run_stats(void) fio_sem_down(td->rusage_sem); } td->update_rusage = 0; - } + } end_for_each(); __show_run_stats(); - for_each_td(td, i) { + for_each_td(td) { + if (td->runstate >= TD_EXITED) + continue; + if (td_read(td) && td->ts.io_bytes[DDIR_READ]) - td->ts.runtime[DDIR_READ] -= rt[i]; + td->ts.runtime[DDIR_READ] -= rt[__td_index]; if (td_write(td) && td->ts.io_bytes[DDIR_WRITE]) - td->ts.runtime[DDIR_WRITE] -= rt[i]; + td->ts.runtime[DDIR_WRITE] -= rt[__td_index]; if (td_trim(td) && td->ts.io_bytes[DDIR_TRIM]) - td->ts.runtime[DDIR_TRIM] -= rt[i]; - } + td->ts.runtime[DDIR_TRIM] -= rt[__td_index]; + } end_for_each(); free(rt); fio_sem_up(stat_sem); @@ -2682,6 +2915,14 @@ static inline void add_stat_sample(struct io_stat *is, unsigned long long data) is->samples++; } +static inline void add_stat_prio_sample(struct clat_prio_stat *clat_prio, + unsigned short clat_prio_index, + unsigned long long nsec) +{ + if (clat_prio) + add_stat_sample(&clat_prio[clat_prio_index].clat_stat, nsec); +} + /* * Return a struct io_logs, which is added to the tail of the log * list for 'iolog'. 
@@ -2696,7 +2937,10 @@ static struct io_logs *get_new_log(struct io_log *iolog) * forever */ if (!iolog->cur_log_max) { - new_samples = iolog->td->o.log_entries; + if (iolog->td) + new_samples = iolog->td->o.log_entries; + else + new_samples = DEF_LOG_ENTRIES; } else { new_samples = iolog->cur_log_max * 2; if (new_samples > MAX_LOG_ENTRIES) @@ -2835,17 +3079,15 @@ static struct io_logs *get_cur_log(struct io_log *iolog) return iolog->pending; } -static void __add_log_sample(struct io_log *iolog, union io_sample_data data, - enum fio_ddir ddir, unsigned long long bs, - unsigned long t, uint64_t offset, - unsigned int priority) +static void __add_log_sample(struct io_log *iolog, unsigned long t, + struct log_sample *sample) { struct io_logs *cur_log; if (iolog->disabled) return; if (flist_empty(&iolog->io_logs)) - iolog->avg_last[ddir] = t; + iolog->avg_last[sample->ddir] = t; cur_log = get_cur_log(iolog); if (cur_log) { @@ -2853,17 +3095,19 @@ static void __add_log_sample(struct io_log *iolog, union io_sample_data data, s = get_sample(iolog, cur_log, cur_log->nr_samples); - s->data = data; - s->time = t + (iolog->td ? 
iolog->td->unix_epoch : 0); - io_sample_set_ddir(iolog, s, ddir); - s->bs = bs; - s->priority = priority; + s->data = sample->data; + s->time = t; + if (iolog->td && iolog->td->o.log_alternate_epoch) + s->time += iolog->td->alternate_epoch; + io_sample_set_ddir(iolog, s, sample->ddir); + s->bs = sample->bs; + s->priority = sample->priority; - if (iolog->log_offset) { - struct io_sample_offset *so = (void *) s; + if (iolog->log_offset) + s->aux[IOS_AUX_OFFSET_INDEX] = sample->offset; - so->offset = offset; - } + if (iolog->log_issue_time) + s->aux[IOS_AUX_ISSUE_TIME_INDEX] = sample->issue_time; cur_log->nr_samples++; return; @@ -2879,14 +3123,36 @@ static inline void reset_io_stat(struct io_stat *ios) ios->mean.u.f = ios->S.u.f = 0; } +static inline void reset_io_u_plat(uint64_t *io_u_plat) +{ + int i; + + for (i = 0; i < FIO_IO_U_PLAT_NR; i++) + io_u_plat[i] = 0; +} + +static inline void reset_clat_prio_stats(struct thread_stat *ts) +{ + enum fio_ddir ddir; + int i; + + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { + if (!ts->clat_prio[ddir]) + continue; + + for (i = 0; i < ts->nr_clat_prio[ddir]; i++) { + reset_io_stat(&ts->clat_prio[ddir][i].clat_stat); + reset_io_u_plat(ts->clat_prio[ddir][i].io_u_plat); + } + } +} + void reset_io_stats(struct thread_data *td) { struct thread_stat *ts = &td->ts; - int i, j, k; + int i, j; for (i = 0; i < DDIR_RWDIR_CNT; i++) { - reset_io_stat(&ts->clat_high_prio_stat[i]); - reset_io_stat(&ts->clat_low_prio_stat[i]); reset_io_stat(&ts->clat_stat[i]); reset_io_stat(&ts->slat_stat[i]); reset_io_stat(&ts->lat_stat[i]); @@ -2898,21 +3164,16 @@ void reset_io_stats(struct thread_data *td) ts->total_io_u[i] = 0; ts->short_io_u[i] = 0; ts->drop_io_u[i] = 0; - - for (j = 0; j < FIO_IO_U_PLAT_NR; j++) { - ts->io_u_plat_high_prio[i][j] = 0; - ts->io_u_plat_low_prio[i][j] = 0; - if (!i) - ts->io_u_sync_plat[j] = 0; - } } for (i = 0; i < FIO_LAT_CNT; i++) for (j = 0; j < DDIR_RWDIR_CNT; j++) - for (k = 0; k < FIO_IO_U_PLAT_NR; k++) - 
ts->io_u_plat[i][j][k] = 0; + reset_io_u_plat(ts->io_u_plat[i][j]); + + reset_clat_prio_stats(ts); ts->total_io_u[DDIR_SYNC] = 0; + reset_io_u_plat(ts->io_u_sync_plat); for (i = 0; i < FIO_IO_U_MAP_NR; i++) { ts->io_u_map[i] = 0; @@ -2934,7 +3195,7 @@ void reset_io_stats(struct thread_data *td) } static void __add_stat_to_log(struct io_log *iolog, enum fio_ddir ddir, - unsigned long elapsed, bool log_max) + unsigned long elapsed, int log_max) { /* * Note an entry in the log. Use the mean from the logged samples, @@ -2942,23 +3203,30 @@ static void __add_stat_to_log(struct io_log *iolog, enum fio_ddir ddir, * had actual samples done. */ if (iolog->avg_window[ddir].samples) { - union io_sample_data data; - - if (log_max) - data.val = iolog->avg_window[ddir].max_val; - else - data.val = iolog->avg_window[ddir].mean.u.f + 0.50; + struct log_sample sample = { {{ 0, 0 }}, ddir, 0, 0, 0, 0 }; + union io_sample_data *d = &sample.data; + + if (log_max == IO_LOG_SAMPLE_AVG) { + d->val.val0 = iolog->avg_window[ddir].mean.u.f + 0.50; + d->val.val1 = 0; + } else if (log_max == IO_LOG_SAMPLE_MAX) { + d->val.val0 = iolog->avg_window[ddir].max_val; + d->val.val1 = 0; + } else { + d->val.val0 = iolog->avg_window[ddir].mean.u.f + 0.50; + d->val.val1 = iolog->avg_window[ddir].max_val; + } - __add_log_sample(iolog, data, ddir, 0, elapsed, 0, 0); + __add_log_sample(iolog, elapsed, &sample); } reset_io_stat(&iolog->avg_window[ddir]); } static void _add_stat_to_log(struct io_log *iolog, unsigned long elapsed, - bool log_max) + int log_max) { - int ddir; + enum fio_ddir ddir; for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) __add_stat_to_log(iolog, ddir, elapsed, log_max); @@ -2966,11 +3234,10 @@ static void _add_stat_to_log(struct io_log *iolog, unsigned long elapsed, static unsigned long add_log_sample(struct thread_data *td, struct io_log *iolog, - union io_sample_data data, - enum fio_ddir ddir, unsigned long long bs, - uint64_t offset, unsigned int ioprio) + struct log_sample *sample) { 
unsigned long elapsed, this_window; + enum fio_ddir ddir = sample->ddir; if (!ddir_rw(ddir)) return 0; @@ -2981,8 +3248,7 @@ static unsigned long add_log_sample(struct thread_data *td, * If no time averaging, just add the log sample. */ if (!iolog->avg_msec) { - __add_log_sample(iolog, data, ddir, bs, elapsed, offset, - ioprio); + __add_log_sample(iolog, elapsed, sample); return 0; } @@ -2990,7 +3256,7 @@ static unsigned long add_log_sample(struct thread_data *td, * Add the sample. If the time period has passed, then * add that entry to the log and clear. */ - add_stat_sample(&iolog->avg_window[ddir], data.val); + add_stat_sample(&iolog->avg_window[ddir], sample->data.val.val0); /* * If period hasn't passed, adding the above sample is all we @@ -3006,7 +3272,7 @@ static unsigned long add_log_sample(struct thread_data *td, return diff; } - __add_stat_to_log(iolog, ddir, elapsed, td->o.log_max != 0); + __add_stat_to_log(iolog, ddir, elapsed, td->o.log_max); iolog->avg_last[ddir] = elapsed - (elapsed % iolog->avg_msec); @@ -3020,27 +3286,28 @@ void finalize_logs(struct thread_data *td, bool unit_logs) elapsed = mtime_since_now(&td->epoch); if (td->clat_log && unit_logs) - _add_stat_to_log(td->clat_log, elapsed, td->o.log_max != 0); + _add_stat_to_log(td->clat_log, elapsed, td->o.log_max); if (td->slat_log && unit_logs) - _add_stat_to_log(td->slat_log, elapsed, td->o.log_max != 0); + _add_stat_to_log(td->slat_log, elapsed, td->o.log_max); if (td->lat_log && unit_logs) - _add_stat_to_log(td->lat_log, elapsed, td->o.log_max != 0); + _add_stat_to_log(td->lat_log, elapsed, td->o.log_max); if (td->bw_log && (unit_logs == per_unit_log(td->bw_log))) - _add_stat_to_log(td->bw_log, elapsed, td->o.log_max != 0); + _add_stat_to_log(td->bw_log, elapsed, td->o.log_max); if (td->iops_log && (unit_logs == per_unit_log(td->iops_log))) - _add_stat_to_log(td->iops_log, elapsed, td->o.log_max != 0); + _add_stat_to_log(td->iops_log, elapsed, td->o.log_max); } void add_agg_sample(union 
io_sample_data data, enum fio_ddir ddir, unsigned long long bs) { struct io_log *iolog; + struct log_sample sample = { data, ddir, bs, 0, 0, 0 }; if (!ddir_rw(ddir)) return; iolog = agg_io_log[ddir]; - __add_log_sample(iolog, data, ddir, bs, mtime_since_genesis(), 0, 0); + __add_log_sample(iolog, mtime_since_genesis(), &sample); } void add_sync_clat_sample(struct thread_stat *ts, unsigned long long nsec) @@ -3063,35 +3330,42 @@ static inline void add_lat_percentile_sample(struct thread_stat *ts, ts->io_u_plat[lat][ddir][idx]++; } -static inline void add_lat_percentile_prio_sample(struct thread_stat *ts, - unsigned long long nsec, - enum fio_ddir ddir, - bool high_prio) +static inline void +add_lat_percentile_prio_sample(struct thread_stat *ts, unsigned long long nsec, + enum fio_ddir ddir, + unsigned short clat_prio_index) { unsigned int idx = plat_val_to_idx(nsec); - if (!high_prio) - ts->io_u_plat_low_prio[ddir][idx]++; - else - ts->io_u_plat_high_prio[ddir][idx]++; + if (ts->clat_prio[ddir]) + ts->clat_prio[ddir][clat_prio_index].io_u_plat[idx]++; } void add_clat_sample(struct thread_data *td, enum fio_ddir ddir, unsigned long long nsec, unsigned long long bs, - uint64_t offset, unsigned int ioprio, bool high_prio) + struct io_u *io_u) { const bool needs_lock = td_async_processing(td); unsigned long elapsed, this_window; struct thread_stat *ts = &td->ts; struct io_log *iolog = td->clat_hist_log; + uint64_t offset = 0; + unsigned int ioprio = 0; + unsigned short clat_prio_index = 0; if (needs_lock) __td_io_u_lock(td); + if (io_u) { + offset = io_u->offset; + ioprio = io_u->ioprio; + clat_prio_index = io_u->clat_prio_index; + } + add_stat_sample(&ts->clat_stat[ddir], nsec); /* - * When lat_percentiles=1 (default 0), the reported high/low priority + * When lat_percentiles=1 (default 0), the reported per priority * percentiles and stats are used for describing total latency values, * even though the variable names themselves start with clat_. 
* @@ -3099,16 +3373,20 @@ void add_clat_sample(struct thread_data *td, enum fio_ddir ddir, * lat_percentiles=0. add_lat_sample() will add the prio stat sample * when lat_percentiles=1. */ - if (!ts->lat_percentiles) { - if (high_prio) - add_stat_sample(&ts->clat_high_prio_stat[ddir], nsec); - else - add_stat_sample(&ts->clat_low_prio_stat[ddir], nsec); - } + if (!ts->lat_percentiles) + add_stat_prio_sample(ts->clat_prio[ddir], clat_prio_index, + nsec); + + if (td->clat_log) { + struct log_sample sample = { sample_val(nsec), ddir, bs, + offset, ioprio, 0 }; + + if (io_u) + sample.issue_time = + ntime_since(&td->epoch, &io_u->issue_time); - if (td->clat_log) - add_log_sample(td, td->clat_log, sample_val(nsec), ddir, bs, - offset, ioprio); + add_log_sample(td, td->clat_log, &sample); + } if (ts->clat_percentiles) { /* @@ -3119,7 +3397,7 @@ void add_clat_sample(struct thread_data *td, enum fio_ddir ddir, add_lat_percentile_sample(ts, nsec, ddir, FIO_CLAT); if (!ts->lat_percentiles) add_lat_percentile_prio_sample(ts, nsec, ddir, - high_prio); + clat_prio_index); } if (iolog && iolog->hist_msec) { @@ -3134,6 +3412,8 @@ void add_clat_sample(struct thread_data *td, enum fio_ddir ddir, if (this_window >= iolog->hist_msec) { uint64_t *io_u_plat; struct io_u_plat_entry *dst; + struct log_sample sample = { {{ 0, 0 }}, ddir, bs, + offset, ioprio, 0 }; /* * Make a byte-for-byte copy of the latency histogram @@ -3147,8 +3427,9 @@ void add_clat_sample(struct thread_data *td, enum fio_ddir ddir, memcpy(&(dst->io_u_plat), io_u_plat, FIO_IO_U_PLAT_NR * sizeof(uint64_t)); flist_add(&dst->list, &hw->list); - __add_log_sample(iolog, sample_plat(dst), ddir, bs, - elapsed, offset, ioprio); + + sample.data = sample_plat(dst); + __add_log_sample(iolog, elapsed, &sample); /* * Update the last time we recorded as being now, minus @@ -3164,24 +3445,31 @@ void add_clat_sample(struct thread_data *td, enum fio_ddir ddir, __td_io_u_unlock(td); } -void add_slat_sample(struct thread_data *td, enum 
fio_ddir ddir, - unsigned long long nsec, unsigned long long bs, - uint64_t offset, unsigned int ioprio) +void add_slat_sample(struct thread_data *td, struct io_u *io_u) { const bool needs_lock = td_async_processing(td); struct thread_stat *ts = &td->ts; + enum fio_ddir ddir; + unsigned long long nsec; + ddir = io_u->ddir; if (!ddir_rw(ddir)) return; if (needs_lock) __td_io_u_lock(td); + nsec = ntime_since(&io_u->start_time, &io_u->issue_time); + add_stat_sample(&ts->slat_stat[ddir], nsec); - if (td->slat_log) - add_log_sample(td, td->slat_log, sample_val(nsec), ddir, bs, - offset, ioprio); + if (td->slat_log) { + struct log_sample sample = { sample_val(nsec), ddir, + io_u->xfer_buflen, io_u->offset, io_u->ioprio, + ntime_since(&td->epoch, &io_u->issue_time) }; + + add_log_sample(td, td->slat_log, &sample); + } if (ts->slat_percentiles) add_lat_percentile_sample(ts, nsec, ddir, FIO_SLAT); @@ -3192,7 +3480,7 @@ void add_slat_sample(struct thread_data *td, enum fio_ddir ddir, void add_lat_sample(struct thread_data *td, enum fio_ddir ddir, unsigned long long nsec, unsigned long long bs, - uint64_t offset, unsigned int ioprio, bool high_prio) + struct io_u * io_u) { const bool needs_lock = td_async_processing(td); struct thread_stat *ts = &td->ts; @@ -3205,12 +3493,15 @@ void add_lat_sample(struct thread_data *td, enum fio_ddir ddir, add_stat_sample(&ts->lat_stat[ddir], nsec); - if (td->lat_log) - add_log_sample(td, td->lat_log, sample_val(nsec), ddir, bs, - offset, ioprio); + if (td->lat_log) { + struct log_sample sample = { sample_val(nsec), ddir, bs, + io_u->offset, io_u->ioprio, 0 }; + + add_log_sample(td, td->lat_log, &sample); + } /* - * When lat_percentiles=1 (default 0), the reported high/low priority + * When lat_percentiles=1 (default 0), the reported per priority * percentiles and stats are used for describing total latency values, * even though the variable names themselves start with clat_. 
* @@ -3221,12 +3512,10 @@ void add_lat_sample(struct thread_data *td, enum fio_ddir ddir, */ if (ts->lat_percentiles) { add_lat_percentile_sample(ts, nsec, ddir, FIO_LAT); - add_lat_percentile_prio_sample(ts, nsec, ddir, high_prio); - if (high_prio) - add_stat_sample(&ts->clat_high_prio_stat[ddir], nsec); - else - add_stat_sample(&ts->clat_low_prio_stat[ddir], nsec); - + add_lat_percentile_prio_sample(ts, nsec, ddir, + io_u->clat_prio_index); + add_stat_prio_sample(ts->clat_prio[ddir], io_u->clat_prio_index, + nsec); } if (needs_lock) __td_io_u_unlock(td); @@ -3249,9 +3538,12 @@ void add_bw_sample(struct thread_data *td, struct io_u *io_u, add_stat_sample(&ts->bw_stat[io_u->ddir], rate); - if (td->bw_log) - add_log_sample(td, td->bw_log, sample_val(rate), io_u->ddir, - bytes, io_u->offset, io_u->ioprio); + if (td->bw_log) { + struct log_sample sample = { sample_val(rate), io_u->ddir, + bytes, io_u->offset, io_u->ioprio, 0 }; + + add_log_sample(td, td->bw_log, &sample); + } td->stat_io_bytes[io_u->ddir] = td->this_io_bytes[io_u->ddir]; @@ -3300,13 +3592,12 @@ static int __add_samples(struct thread_data *td, struct timespec *parent_tv, add_stat_sample(&stat[ddir], rate); if (log) { - unsigned long long bs = 0; + struct log_sample sample = { + sample_val(rate), ddir, 0, 0, 0, 0 }; if (td->o.min_bs[ddir] == td->o.max_bs[ddir]) - bs = td->o.min_bs[ddir]; - - next = add_log_sample(td, log, sample_val(rate), ddir, - bs, 0, 0); + sample.bs = td->o.min_bs[ddir]; + next = add_log_sample(td, log, &sample); next_log = min(next_log, next); } @@ -3344,9 +3635,12 @@ void add_iops_sample(struct thread_data *td, struct io_u *io_u, add_stat_sample(&ts->iops_stat[io_u->ddir], 1); - if (td->iops_log) - add_log_sample(td, td->iops_log, sample_val(1), io_u->ddir, - bytes, io_u->offset, io_u->ioprio); + if (td->iops_log) { + struct log_sample sample = { sample_val(1), io_u->ddir, bytes, + io_u->offset, io_u->ioprio, 0 }; + + add_log_sample(td, td->iops_log, &sample); + } 
td->stat_io_blocks[io_u->ddir] = td->this_io_blocks[io_u->ddir]; @@ -3361,26 +3655,38 @@ static int add_iops_samples(struct thread_data *td, struct timespec *t) td->ts.iops_stat, td->iops_log, false); } +static bool td_in_logging_state(struct thread_data *td) +{ + if (in_ramp_period(td)) + return false; + + switch(td->runstate) { + case TD_RUNNING: + case TD_VERIFYING: + case TD_FINISHING: + case TD_EXITED: + return true; + default: + return false; + } +} + /* * Returns msecs to next event */ int calc_log_samples(void) { - struct thread_data *td; unsigned int next = ~0U, tmp = 0, next_mod = 0, log_avg_msec_min = -1U; struct timespec now; - int i; long elapsed_time = 0; - fio_gettime(&now, NULL); - - for_each_td(td, i) { - elapsed_time = mtime_since_now(&td->epoch); + for_each_td(td) { + fio_gettime(&now, NULL); + elapsed_time = mtime_since(&td->epoch, &now); if (!td->o.stats) continue; - if (in_ramp_time(td) || - !(td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING)) { + if (!td_in_logging_state(td)) { next = min(td->o.iops_avg_time, td->o.bw_avg_time); continue; } @@ -3401,7 +3707,7 @@ int calc_log_samples(void) if (tmp < next) next = tmp; - } + } end_for_each(); /* if log_avg_msec_min has not been changed, set it to 0 */ if (log_avg_msec_min == -1U) @@ -3420,7 +3726,7 @@ int calc_log_samples(void) void stat_init(void) { - stat_sem = fio_sem_init(FIO_SEM_UNLOCKED); + stat_sem = fio_shared_sem_init(FIO_SEM_UNLOCKED); } void stat_exit(void) @@ -3430,7 +3736,7 @@ void stat_exit(void) * have ended. */ fio_sem_down(stat_sem); - fio_sem_remove(stat_sem); + fio_shared_sem_remove(stat_sem); } /* diff --git a/stat.h b/stat.h index 9ef8caa438..84ea844586 100644 --- a/stat.h +++ b/stat.h @@ -51,7 +51,7 @@ struct group_run_stats { * * FIO_IO_U_PLAT_GROUP_NR and FIO_IO_U_PLAT_BITS determine the memory * requirement of storing those aggregate counts. 
The memory used will - * be (FIO_IO_U_PLAT_GROUP_NR * 2^FIO_IO_U_PLAT_BITS) * sizeof(int) + * be (FIO_IO_U_PLAT_GROUP_NR * 2^FIO_IO_U_PLAT_BITS) * sizeof(uint64_t) * bytes. * * FIO_IO_U_PLAT_NR is the total number of buckets. @@ -68,7 +68,7 @@ struct group_run_stats { * than one. This method has low accuracy when the value is small. For * example, let the buckets be {[0,99],[100,199],...,[900,999]}, and * the represented value of each bucket be the mean of the range. Then - * a value 0 has an round-off error of 49.5. To improve on this, we + * a value 0 has a round-off error of 49.5. To improve on this, we * use buckets with non-uniform ranges, while bounding the error of * each bucket within a ratio of the sample value. A simple example * would be when error_bound = 0.005, buckets are { @@ -142,7 +142,6 @@ enum block_info_state { BLOCK_STATE_COUNT, }; -#define MAX_PATTERN_SIZE 512 #define FIO_JOBNAME_SIZE 128 #define FIO_JOBDESC_SIZE 256 #define FIO_VERROR_SIZE 128 @@ -158,16 +157,24 @@ enum fio_lat { FIO_LAT_CNT = 3, }; +struct clat_prio_stat { + uint64_t io_u_plat[FIO_IO_U_PLAT_NR]; + struct io_stat clat_stat; + uint32_t ioprio; +}; + struct thread_stat { char name[FIO_JOBNAME_SIZE]; char verror[FIO_VERROR_SIZE]; uint32_t error; uint32_t thread_number; uint32_t groupid; + uint64_t job_start; /* Time job was started, as clock_gettime(job_start_clock_id) */ uint32_t pid; char description[FIO_JOBDESC_SIZE]; uint32_t members; uint32_t unified_rw_rep; + uint32_t disable_prio_stat; /* * bandwidth and latency stats @@ -226,17 +233,18 @@ struct thread_stat { uint32_t first_error; uint64_t total_err_count; - /* ZBD stats */ - uint64_t nr_zone_resets; - uint64_t nr_block_infos; uint32_t block_infos[MAX_NR_BLOCK_INFOS]; uint32_t kb_base; uint32_t unit_base; + /* ZBD stats */ + uint64_t nr_zone_resets; + uint16_t count_zone_resets; /* Flag to enable nr_zone_resets */ + uint16_t pad3; + uint32_t latency_depth; - uint32_t pad3; uint64_t latency_target; fio_fp64_t 
latency_percentile; uint64_t latency_window; @@ -252,21 +260,50 @@ struct thread_stat { fio_fp64_t ss_deviation; fio_fp64_t ss_criterion; - uint64_t io_u_plat_high_prio[DDIR_RWDIR_CNT][FIO_IO_U_PLAT_NR] __attribute__((aligned(8)));; - uint64_t io_u_plat_low_prio[DDIR_RWDIR_CNT][FIO_IO_U_PLAT_NR]; - struct io_stat clat_high_prio_stat[DDIR_RWDIR_CNT] __attribute__((aligned(8))); - struct io_stat clat_low_prio_stat[DDIR_RWDIR_CNT]; + /* A mirror of td->ioprio. */ + uint32_t ioprio; union { uint64_t *ss_iops_data; + /* + * For FIO_NET_CMD_TS, the pointed to data will temporarily + * be stored at this offset from the start of the payload. + */ + uint64_t ss_iops_data_offset; uint64_t pad4; }; union { uint64_t *ss_bw_data; + /* + * For FIO_NET_CMD_TS, the pointed to data will temporarily + * be stored at this offset from the start of the payload. + */ + uint64_t ss_bw_data_offset; uint64_t pad5; }; + union { + uint64_t *ss_lat_data; + /* + * For FIO_NET_CMD_TS, the pointed to data will temporarily + * be stored at this offset from the start of the payload. + */ + uint64_t ss_lat_data_offset; + uint64_t pad5b; + }; + + union { + struct clat_prio_stat *clat_prio[DDIR_RWDIR_CNT]; + /* + * For FIO_NET_CMD_TS, the pointed to data will temporarily + * be stored at this offset from the start of the payload. 
+ */ + uint64_t clat_prio_offset[DDIR_RWDIR_CNT]; + uint64_t pad6; + }; + uint32_t nr_clat_prio[DDIR_RWDIR_CNT]; + uint64_t cachehit; uint64_t cachemiss; } __attribute__((packed)); @@ -318,35 +355,35 @@ extern void stat_init(void); extern void stat_exit(void); extern struct json_object * show_thread_status(struct thread_stat *ts, struct group_run_stats *rs, struct flist_head *, struct buf_output *); -extern void show_group_stats(struct group_run_stats *rs, struct buf_output *); -extern bool calc_thread_status(struct jobs_eta *je, int force); +extern void show_group_stats(const struct group_run_stats *rs, struct buf_output *); extern void display_thread_status(struct jobs_eta *je); extern void __show_run_stats(void); extern int __show_running_run_stats(void); extern void show_running_run_stats(void); extern void check_for_running_stats(void); -extern void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, bool first); -extern void sum_group_stats(struct group_run_stats *dst, struct group_run_stats *src); +extern void sum_thread_stats(struct thread_stat *dst, const struct thread_stat *src); +extern void sum_group_stats(struct group_run_stats *dst, const struct group_run_stats *src); extern void init_thread_stat_min_vals(struct thread_stat *ts); extern void init_thread_stat(struct thread_stat *ts); extern void init_group_run_stat(struct group_run_stats *gs); extern void eta_to_str(char *str, unsigned long eta_sec); -extern bool calc_lat(struct io_stat *is, unsigned long long *min, unsigned long long *max, double *mean, double *dev); -extern unsigned int calc_clat_percentiles(uint64_t *io_u_plat, unsigned long long nr, fio_fp64_t *plist, unsigned long long **output, unsigned long long *maxv, unsigned long long *minv); -extern void stat_calc_lat_n(struct thread_stat *ts, double *io_u_lat); -extern void stat_calc_lat_m(struct thread_stat *ts, double *io_u_lat); -extern void stat_calc_lat_u(struct thread_stat *ts, double *io_u_lat); -extern void 
stat_calc_dist(uint64_t *map, unsigned long total, double *io_u_dist); +extern bool calc_lat(const struct io_stat *is, unsigned long long *min, unsigned long long *max, double *mean, double *dev); +extern unsigned int calc_clat_percentiles(const uint64_t *io_u_plat, unsigned long long nr, fio_fp64_t *plist, unsigned long long **output, unsigned long long *maxv, unsigned long long *minv); +extern void stat_calc_lat_n(const struct thread_stat *ts, double *io_u_lat); +extern void stat_calc_lat_m(const struct thread_stat *ts, double *io_u_lat); +extern void stat_calc_lat_u(const struct thread_stat *ts, double *io_u_lat); +extern void stat_calc_dist(const uint64_t *map, unsigned long total, double *io_u_dist); extern void reset_io_stats(struct thread_data *); extern void update_rusage_stat(struct thread_data *); extern void clear_rusage_stat(struct thread_data *); -extern void add_lat_sample(struct thread_data *, enum fio_ddir, unsigned long long, - unsigned long long, uint64_t, unsigned int, bool); -extern void add_clat_sample(struct thread_data *, enum fio_ddir, unsigned long long, - unsigned long long, uint64_t, unsigned int, bool); -extern void add_slat_sample(struct thread_data *, enum fio_ddir, unsigned long long, - unsigned long long, uint64_t, unsigned int); +extern void add_lat_sample(struct thread_data *, enum fio_ddir, + unsigned long long, unsigned long long, + struct io_u *); +extern void add_clat_sample(struct thread_data *, enum fio_ddir, + unsigned long long, unsigned long long, + struct io_u *); +extern void add_slat_sample(struct thread_data *, struct io_u *); extern void add_agg_sample(union io_sample_data, enum fio_ddir, unsigned long long); extern void add_iops_sample(struct thread_data *, struct io_u *, unsigned int); @@ -355,10 +392,12 @@ extern void add_bw_sample(struct thread_data *, struct io_u *, extern void add_sync_clat_sample(struct thread_stat *ts, unsigned long long nsec); extern int calc_log_samples(void); +extern void 
free_clat_prio_stats(struct thread_stat *); +extern int alloc_clat_prio_stat_ddir(struct thread_stat *, enum fio_ddir, int); -extern void print_disk_util(struct disk_util_stat *, struct disk_util_agg *, int terse, struct buf_output *); -extern void json_array_add_disk_util(struct disk_util_stat *dus, - struct disk_util_agg *agg, struct json_array *parent); +extern void print_disk_util(const struct disk_util_stat *, const struct disk_util_agg *, int terse, struct buf_output *); +extern void json_array_add_disk_util(const struct disk_util_stat *dus, + const struct disk_util_agg *agg, struct json_array *parent); extern struct io_log *agg_io_log[DDIR_RWDIR_CNT]; extern bool write_bw_log; diff --git a/steadystate.c b/steadystate.c index 2e3da1db0c..9e26012deb 100644 --- a/steadystate.c +++ b/steadystate.c @@ -4,27 +4,33 @@ #include "steadystate.h" bool steadystate_enabled = false; +unsigned int ss_check_interval = 1000; void steadystate_free(struct thread_data *td) { free(td->ss.iops_data); free(td->ss.bw_data); + free(td->ss.lat_data); td->ss.iops_data = NULL; td->ss.bw_data = NULL; + td->ss.lat_data = NULL; } static void steadystate_alloc(struct thread_data *td) { - td->ss.bw_data = calloc(td->ss.dur, sizeof(uint64_t)); - td->ss.iops_data = calloc(td->ss.dur, sizeof(uint64_t)); + int intervals = td->ss.dur / (ss_check_interval / 1000L); + + td->ss.bw_data = calloc(intervals, sizeof(uint64_t)); + td->ss.iops_data = calloc(intervals, sizeof(uint64_t)); + td->ss.lat_data = calloc(intervals, sizeof(uint64_t)); td->ss.state |= FIO_SS_DATA; } void steadystate_setup(void) { - struct thread_data *td, *prev_td; - int i, prev_groupid; + struct thread_data *prev_td; + int prev_groupid; if (!steadystate_enabled) return; @@ -36,7 +42,7 @@ void steadystate_setup(void) */ prev_groupid = -1; prev_td = NULL; - for_each_td(td, i) { + for_each_td(td) { if (!td->ss.dur) continue; @@ -51,53 +57,63 @@ void steadystate_setup(void) prev_groupid = td->groupid; } prev_td = td; - } + } 
end_for_each(); if (prev_td && prev_td->o.group_reporting) steadystate_alloc(prev_td); } -static bool steadystate_slope(uint64_t iops, uint64_t bw, +static bool steadystate_slope(uint64_t iops, uint64_t bw, double lat, struct thread_data *td) { int i, j; double result; struct steadystate_data *ss = &td->ss; uint64_t new_val; + int intervals = ss->dur / (ss_check_interval / 1000L); ss->bw_data[ss->tail] = bw; ss->iops_data[ss->tail] = iops; + ss->lat_data[ss->tail] = (uint64_t)lat; if (ss->state & FIO_SS_IOPS) new_val = iops; - else + else if (ss->state & FIO_SS_BW) new_val = bw; + else + new_val = (uint64_t)lat; - if (ss->state & FIO_SS_BUFFER_FULL || ss->tail - ss->head == ss->dur - 1) { + if (ss->state & FIO_SS_BUFFER_FULL || ss->tail - ss->head == intervals - 1) { if (!(ss->state & FIO_SS_BUFFER_FULL)) { /* first time through */ - for(i = 0, ss->sum_y = 0; i < ss->dur; i++) { + for (i = 0, ss->sum_y = 0; i < intervals; i++) { if (ss->state & FIO_SS_IOPS) ss->sum_y += ss->iops_data[i]; - else + else if (ss->state & FIO_SS_BW) ss->sum_y += ss->bw_data[i]; - j = (ss->head + i) % ss->dur; + else + ss->sum_y += ss->lat_data[i]; + j = (ss->head + i) % intervals; if (ss->state & FIO_SS_IOPS) ss->sum_xy += i * ss->iops_data[j]; - else + else if (ss->state & FIO_SS_BW) ss->sum_xy += i * ss->bw_data[j]; + else + ss->sum_xy += i * ss->lat_data[j]; } ss->state |= FIO_SS_BUFFER_FULL; } else { /* easy to update the sums */ ss->sum_y -= ss->oldest_y; ss->sum_y += new_val; - ss->sum_xy = ss->sum_xy - ss->sum_y + ss->dur * new_val; + ss->sum_xy = ss->sum_xy - ss->sum_y + intervals * new_val; } if (ss->state & FIO_SS_IOPS) ss->oldest_y = ss->iops_data[ss->head]; - else + else if (ss->state & FIO_SS_BW) ss->oldest_y = ss->bw_data[ss->head]; + else + ss->oldest_y = ss->lat_data[ss->head]; /* * calculate slope as (sum_xy - sum_x * sum_y / n) / (sum_(x^2) @@ -105,10 +121,10 @@ static bool steadystate_slope(uint64_t iops, uint64_t bw, * equally spaced when they are often off by a few 
milliseconds. * This assumption greatly simplifies the calculations. */ - ss->slope = (ss->sum_xy - (double) ss->sum_x * ss->sum_y / ss->dur) / - (ss->sum_x_sq - (double) ss->sum_x * ss->sum_x / ss->dur); + ss->slope = (ss->sum_xy - (double) ss->sum_x * ss->sum_y / intervals) / + (ss->sum_x_sq - (double) ss->sum_x * ss->sum_x / intervals); if (ss->state & FIO_SS_PCT) - ss->criterion = 100.0 * ss->slope / (ss->sum_y / ss->dur); + ss->criterion = 100.0 * ss->slope / (ss->sum_y / intervals); else ss->criterion = ss->slope; @@ -123,14 +139,14 @@ static bool steadystate_slope(uint64_t iops, uint64_t bw, return true; } - ss->tail = (ss->tail + 1) % ss->dur; + ss->tail = (ss->tail + 1) % intervals; if (ss->tail <= ss->head) - ss->head = (ss->head + 1) % ss->dur; + ss->head = (ss->head + 1) % intervals; return false; } -static bool steadystate_deviation(uint64_t iops, uint64_t bw, +static bool steadystate_deviation(uint64_t iops, uint64_t bw, double lat, struct thread_data *td) { int i; @@ -138,40 +154,51 @@ static bool steadystate_deviation(uint64_t iops, uint64_t bw, double mean; struct steadystate_data *ss = &td->ss; + int intervals = ss->dur / (ss_check_interval / 1000L); ss->bw_data[ss->tail] = bw; ss->iops_data[ss->tail] = iops; + ss->lat_data[ss->tail] = (uint64_t)lat; - if (ss->state & FIO_SS_BUFFER_FULL || ss->tail - ss->head == ss->dur - 1) { + if (ss->state & FIO_SS_BUFFER_FULL || ss->tail - ss->head == intervals - 1) { if (!(ss->state & FIO_SS_BUFFER_FULL)) { /* first time through */ - for(i = 0, ss->sum_y = 0; i < ss->dur; i++) + for (i = 0, ss->sum_y = 0; i < intervals; i++) { if (ss->state & FIO_SS_IOPS) ss->sum_y += ss->iops_data[i]; - else + else if (ss->state & FIO_SS_BW) ss->sum_y += ss->bw_data[i]; + else + ss->sum_y += ss->lat_data[i]; + } ss->state |= FIO_SS_BUFFER_FULL; } else { /* easy to update the sum */ ss->sum_y -= ss->oldest_y; if (ss->state & FIO_SS_IOPS) ss->sum_y += ss->iops_data[ss->tail]; - else + else if (ss->state & FIO_SS_BW) ss->sum_y 
+= ss->bw_data[ss->tail]; + else + ss->sum_y += ss->lat_data[ss->tail]; } if (ss->state & FIO_SS_IOPS) ss->oldest_y = ss->iops_data[ss->head]; - else + else if (ss->state & FIO_SS_BW) ss->oldest_y = ss->bw_data[ss->head]; + else + ss->oldest_y = ss->lat_data[ss->head]; - mean = (double) ss->sum_y / ss->dur; + mean = (double) ss->sum_y / intervals; ss->deviation = 0.0; - for (i = 0; i < ss->dur; i++) { + for (i = 0; i < intervals; i++) { if (ss->state & FIO_SS_IOPS) diff = ss->iops_data[i] - mean; - else + else if (ss->state & FIO_SS_BW) diff = ss->bw_data[i] - mean; + else + diff = ss->lat_data[i] - mean; ss->deviation = max(ss->deviation, diff * (diff < 0.0 ? -1.0 : 1.0)); } @@ -180,8 +207,9 @@ static bool steadystate_deviation(uint64_t iops, uint64_t bw, else ss->criterion = ss->deviation; - dprint(FD_STEADYSTATE, "sum_y: %llu, mean: %f, max diff: %f, " + dprint(FD_STEADYSTATE, "intervals: %d, sum_y: %llu, mean: %f, max diff: %f, " "objective: %f, limit: %f\n", + intervals, (unsigned long long) ss->sum_y, mean, ss->deviation, ss->criterion, ss->limit); @@ -189,27 +217,31 @@ static bool steadystate_deviation(uint64_t iops, uint64_t bw, return true; } - ss->tail = (ss->tail + 1) % ss->dur; - if (ss->tail <= ss->head) - ss->head = (ss->head + 1) % ss->dur; + ss->tail = (ss->tail + 1) % intervals; + if (ss->tail == ss->head) + ss->head = (ss->head + 1) % intervals; return false; } int steadystate_check(void) { - int i, j, ddir, prev_groupid, group_ramp_time_over = 0; + int ddir, prev_groupid, group_ramp_time_over = 0; unsigned long rate_time; - struct thread_data *td, *td2; struct timespec now; uint64_t group_bw = 0, group_iops = 0; + double group_lat_sum = 0.0; + uint64_t group_lat_samples = 0; uint64_t td_iops, td_bytes; + double group_lat; bool ret; prev_groupid = -1; - for_each_td(td, i) { + for_each_td(td) { const bool needs_lock = td_async_processing(td); struct steadystate_data *ss = &td->ss; + double td_lat_sum = 0.0; + uint64_t td_lat_samples = 0; if 
(!ss->dur || td->runstate <= TD_SETTING_UP || td->runstate >= TD_EXITED || !ss->state || @@ -222,6 +254,8 @@ int steadystate_check(void) (td->o.group_reporting && td->groupid != prev_groupid)) { group_bw = 0; group_iops = 0; + group_lat_sum = 0.0; + group_lat_samples = 0; group_ramp_time_over = 0; } prev_groupid = td->groupid; @@ -229,10 +263,10 @@ int steadystate_check(void) fio_gettime(&now, NULL); if (ss->ramp_time && !(ss->state & FIO_SS_RAMP_OVER)) { /* - * Begin recording data one second after ss->ramp_time + * Begin recording data one check interval after ss->ramp_time * has elapsed */ - if (utime_since(&td->epoch, &now) >= (ss->ramp_time + 1000000L)) + if (utime_since(&td->epoch, &now) >= (ss->ramp_time + ss_check_interval * 1000L)) ss->state |= FIO_SS_RAMP_OVER; } @@ -242,6 +276,9 @@ int steadystate_check(void) for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) { td_iops += td->io_blocks[ddir]; td_bytes += td->io_bytes[ddir]; + td_lat_sum += td->ts.clat_stat[ddir].mean.u.f * + td->ts.clat_stat[ddir].samples; + td_lat_samples += td->ts.clat_stat[ddir].samples; } if (needs_lock) @@ -250,20 +287,19 @@ int steadystate_check(void) rate_time = mtime_since(&ss->prev_time, &now); memcpy(&ss->prev_time, &now, sizeof(now)); - /* - * Begin monitoring when job starts but don't actually use - * data in checking stopping criterion until ss->ramp_time is - * over. This ensures that we will have a sane value in - * prev_iops/bw the first time through after ss->ramp_time - * is done. 
- */ if (ss->state & FIO_SS_RAMP_OVER) { - group_bw += 1000 * (td_bytes - ss->prev_bytes) / rate_time; - group_iops += 1000 * (td_iops - ss->prev_iops) / rate_time; + group_bw += rate_time * (td_bytes - ss->prev_bytes) / + (ss_check_interval * ss_check_interval / 1000L); + group_iops += rate_time * (td_iops - ss->prev_iops) / + (ss_check_interval * ss_check_interval / 1000L); + group_lat_sum += td_lat_sum - ss->prev_lat_sum; + group_lat_samples += td_lat_samples - ss->prev_lat_samples; ++group_ramp_time_over; } ss->prev_iops = td_iops; ss->prev_bytes = td_bytes; + ss->prev_lat_sum = td_lat_sum; + ss->prev_lat_samples = td_lat_samples; if (td->o.group_reporting && !(ss->state & FIO_SS_DATA)) continue; @@ -278,30 +314,34 @@ int steadystate_check(void) dprint(FD_STEADYSTATE, "steadystate_check() thread: %d, " "groupid: %u, rate_msec: %ld, " "iops: %llu, bw: %llu, head: %d, tail: %d\n", - i, td->groupid, rate_time, + __td_index, td->groupid, rate_time, (unsigned long long) group_iops, (unsigned long long) group_bw, ss->head, ss->tail); + group_lat = 0.0; + if (group_lat_samples) + group_lat = group_lat_sum / group_lat_samples; + if (ss->state & FIO_SS_SLOPE) - ret = steadystate_slope(group_iops, group_bw, td); + ret = steadystate_slope(group_iops, group_bw, group_lat, td); else - ret = steadystate_deviation(group_iops, group_bw, td); + ret = steadystate_deviation(group_iops, group_bw, group_lat, td); if (ret) { if (td->o.group_reporting) { - for_each_td(td2, j) { + for_each_td(td2) { if (td2->groupid == td->groupid) { td2->ss.state |= FIO_SS_ATTAINED; fio_mark_td_terminate(td2); } - } + } end_for_each(); } else { ss->state |= FIO_SS_ATTAINED; fio_mark_td_terminate(td); } } - } + } end_for_each(); return 0; } @@ -309,8 +349,7 @@ int td_steadystate_init(struct thread_data *td) { struct steadystate_data *ss = &td->ss; struct thread_options *o = &td->o; - struct thread_data *td2; - int j; + int intervals; memset(ss, 0, sizeof(*ss)); @@ -322,17 +361,19 @@ int 
td_steadystate_init(struct thread_data *td) ss->dur = o->ss_dur; ss->limit = o->ss_limit.u.f; ss->ramp_time = o->ss_ramp_time; + ss_check_interval = o->ss_check_interval / 1000L; ss->state = o->ss_state; if (!td->ss.ramp_time) ss->state |= FIO_SS_RAMP_OVER; - ss->sum_x = o->ss_dur * (o->ss_dur - 1) / 2; - ss->sum_x_sq = (o->ss_dur - 1) * (o->ss_dur) * (2*o->ss_dur - 1) / 6; + intervals = ss->dur / (ss_check_interval / 1000L); + ss->sum_x = intervals * (intervals - 1) / 2; + ss->sum_x_sq = (intervals - 1) * (intervals) * (2*intervals - 1) / 6; } /* make sure that ss options are consistent within reporting group */ - for_each_td(td2, j) { + for_each_td(td2) { if (td2->groupid == td->groupid) { struct steadystate_data *ss2 = &td2->ss; @@ -346,35 +387,37 @@ int td_steadystate_init(struct thread_data *td) return 1; } } - } + } end_for_each(); return 0; } -uint64_t steadystate_bw_mean(struct thread_stat *ts) +static uint64_t steadystate_data_mean(uint64_t *data, int ss_dur) { int i; uint64_t sum; + int intervals = ss_dur / (ss_check_interval / 1000L); - if (!ts->ss_dur) + if (!ss_dur) return 0; - for (i = 0, sum = 0; i < ts->ss_dur; i++) - sum += ts->ss_bw_data[i]; + for (i = 0, sum = 0; i < intervals; i++) + sum += data[i]; - return sum / ts->ss_dur; + return sum / intervals; } -uint64_t steadystate_iops_mean(struct thread_stat *ts) +uint64_t steadystate_bw_mean(const struct thread_stat *ts) { - int i; - uint64_t sum; - - if (!ts->ss_dur) - return 0; + return steadystate_data_mean(ts->ss_bw_data, ts->ss_dur); +} - for (i = 0, sum = 0; i < ts->ss_dur; i++) - sum += ts->ss_iops_data[i]; +uint64_t steadystate_iops_mean(const struct thread_stat *ts) +{ + return steadystate_data_mean(ts->ss_iops_data, ts->ss_dur); +} - return sum / ts->ss_dur; +uint64_t steadystate_lat_mean(const struct thread_stat *ts) +{ + return steadystate_data_mean(ts->ss_lat_data, ts->ss_dur); } diff --git a/steadystate.h b/steadystate.h index bbb86fbb30..aff152115f 100644 --- a/steadystate.h +++ 
b/steadystate.h @@ -7,10 +7,12 @@ extern void steadystate_free(struct thread_data *); extern int steadystate_check(void); extern void steadystate_setup(void); extern int td_steadystate_init(struct thread_data *); -extern uint64_t steadystate_bw_mean(struct thread_stat *); -extern uint64_t steadystate_iops_mean(struct thread_stat *); +extern uint64_t steadystate_bw_mean(const struct thread_stat *); +extern uint64_t steadystate_iops_mean(const struct thread_stat *); +extern uint64_t steadystate_lat_mean(const struct thread_stat *); extern bool steadystate_enabled; +extern unsigned int ss_check_interval; struct steadystate_data { double limit; @@ -23,6 +25,7 @@ struct steadystate_data { unsigned int tail; uint64_t *iops_data; uint64_t *bw_data; + uint64_t *lat_data; double slope; double deviation; @@ -37,6 +40,8 @@ struct steadystate_data { struct timespec prev_time; uint64_t prev_iops; uint64_t prev_bytes; + double prev_lat_sum; + uint64_t prev_lat_samples; }; enum { @@ -48,6 +53,7 @@ enum { __FIO_SS_DATA, __FIO_SS_PCT, __FIO_SS_BUFFER_FULL, + __FIO_SS_LAT, }; enum { @@ -59,11 +65,11 @@ enum { FIO_SS_DATA = 1 << __FIO_SS_DATA, FIO_SS_PCT = 1 << __FIO_SS_PCT, FIO_SS_BUFFER_FULL = 1 << __FIO_SS_BUFFER_FULL, + FIO_SS_LAT = 1 << __FIO_SS_LAT, FIO_SS_IOPS_SLOPE = FIO_SS_IOPS | FIO_SS_SLOPE, FIO_SS_BW_SLOPE = FIO_SS_BW | FIO_SS_SLOPE, + FIO_SS_LAT_SLOPE = FIO_SS_LAT | FIO_SS_SLOPE, }; -#define STEADYSTATE_MSEC 1000 - #endif diff --git a/t/client_server.py b/t/client_server.py new file mode 100755 index 0000000000..88f5297f93 --- /dev/null +++ b/t/client_server.py @@ -0,0 +1,505 @@ +#!/usr/bin/env python3 +""" +# client_server.py +# +# Test fio's client/server mode. +# +# USAGE +# see python3 client_server.py --help +# +# EXAMPLES +# python3 t/client_server.py +# python3 t/client_server.py -f ./fio +# +# REQUIREMENTS +# Python 3.6 +# +# This will start fio server instances listening on the interfaces below and +# will break if any ports are already occupied. 
+# +# +""" +import os +import sys +import time +import locale +import logging +import argparse +import tempfile +import subprocess +import configparser +from pathlib import Path +from fiotestlib import FioJobCmdTest, run_fio_tests + + +SERVER_LIST = [ + ",8765", + ",8766", + ",8767", + ",8768", + ] + +PIDFILE_LIST = [] + +class ClientServerTest(FioJobCmdTest): + """ + Client/sever test class. + """ + + def setup(self, parameters): + """Setup a test.""" + + fio_args = [ + f"--output={self.filenames['output']}", + f"--output-format={self.fio_opts['output-format']}", + ] + for server in self.fio_opts['servers']: + option = f"--client={server['client']}" + fio_args.append(option) + fio_args.append(server['jobfile']) + + super().setup(fio_args) + + + +class ClientServerTestGlobalSingle(ClientServerTest): + """ + Client/sever test class. + One server connection only. + The job file may or may not have a global section. + """ + + def check_result(self): + super().check_result() + + config = configparser.ConfigParser(allow_no_value=True) + config.read(self.fio_opts['servers'][0]['jobfile']) + + if not config.has_section('global'): + if len(self.json_data['global options']) > 0: + self.failure_reason = f"{self.failure_reason} non-empty 'global options' dictionary found with no global section in job file." + self.passed = False + return + + if len(self.json_data['global options']) == 0: + self.failure_reason = f"{self.failure_reason} empty 'global options' dictionary found with no global section in job file." + self.passed = False + + # Now make sure job file global section matches 'global options' + # in JSON output + job_file_global = dict(config['global']) + for key, value in job_file_global.items(): + if value is None: + job_file_global[key] = "" + if job_file_global != self.json_data['global options']: + self.failure_reason = f"{self.failure_reason} 'global options' dictionary does not match global section in job file." 
+ self.passed = False + + +class ClientServerTestGlobalMultiple(ClientServerTest): + """ + Client/sever test class. + Multiple server connections. + Job files may or may not have a global section. + """ + + def check_result(self): + super().check_result() + + # + # For each job file, check if it has a global section + # If so, make sure the 'global options' array has + # as element for it. + # At the end, make sure the total number of elements matches the number + # of job files with global sections. + # + + global_sections = 0 + for server in self.fio_opts['servers']: + config = configparser.ConfigParser(allow_no_value=True) + config.read(server['jobfile']) + + if not config.has_section('global'): + continue + + global_sections += 1 + + # this can only parse one server spec format + [hostname, port] = server['client'].split(',') + + match = None + for global_opts in self.json_data['global options']: + if 'hostname' not in global_opts: + continue + if 'port' not in global_opts: + continue + if global_opts['hostname'] == hostname and int(global_opts['port']) == int(port): + match = global_opts + break + + if not match: + self.failure_reason = f"{self.failure_reason} matching 'global options' element not found for {hostname}, {port}." + self.passed = False + continue + + del match['hostname'] + del match['port'] + + # Now make sure job file global section matches 'global options' + # in JSON output + job_file_global = dict(config['global']) + for key, value in job_file_global.items(): + if value is None: + job_file_global[key] = "" + if job_file_global != match: + self.failure_reason += " 'global options' dictionary does not match global section in job file." + self.passed = False + else: + logging.debug("Job file global section matches 'global options' array element %s", server['client']) + + if global_sections != len(self.json_data['global options']): + self.failure_reason = f"{self.failure_reason} mismatched number of elements in 'global options' array." 
+ self.passed = False + else: + logging.debug("%d elements in global options array as expected", global_sections) + + +class ClientServerTestAllClientsLat(ClientServerTest): + """ + Client/sever test class. + Make sure the "All clients" job has latency percentile data. + Assumes that a job named 'test' is run with no global section. + Only check read data. + """ + + def check_result(self): + super().check_result() + + config = configparser.ConfigParser(allow_no_value=True) + config.read(self.fio_opts['servers'][0]['jobfile']) + + lats = { 'clat': True, 'lat': False, 'slat': False } + for key in lats: + opt = f"{key}_percentiles" + if opt in config.options('test'): + lats[key] = config.getboolean('test', opt) + logging.debug("%s set to %s", opt, lats[key]) + + all_clients = None + client_stats = self.json_data['client_stats'] + for client in client_stats: + if client['jobname'] == "All clients": + all_clients = client + break + + if not all_clients: + self.failure_reason = f"{self.failure_reason} Could not find 'All clients' output" + self.passed = False + + for key, value in lats.items(): + if value: + if 'percentile' not in all_clients['read'][f"{key}_ns"]: + self.failure_reason += f" {key} percentiles not found" + self.passed = False + break + + logging.debug("%s percentiles found as expected", key) + else: + if 'percentile' in all_clients['read'][f"{key}_ns"]: + self.failure_reason += f" {key} percentiles found unexpectedly" + self.passed = False + break + + logging.debug("%s percentiles appropriately not found", key) + + + +TEST_LIST = [ + { # Smoke test + "test_id": 1, + "fio_opts": { + "output-format": "json", + "servers": [ + { + "client" : 0, # index into the SERVER_LIST array + "jobfile": "test01.fio", + }, + ] + }, + "test_class": ClientServerTest, + }, + { # try another client + "test_id": 2, + "fio_opts": { + "output-format": "json", + "servers": [ + { + "client" : 1, + "jobfile": "test01.fio", + }, + ] + }, + "test_class": ClientServerTest, + }, + { # 
single client global section + "test_id": 3, + "fio_opts": { + "output-format": "json", + "servers": [ + { + "client" : 2, + "jobfile": "test01.fio", + }, + ] + }, + "test_class": ClientServerTestGlobalSingle, + }, + { # single client no global section + "test_id": 4, + "fio_opts": { + "output-format": "json", + "servers": [ + { + "client" : 3, + "jobfile": "test04-noglobal.fio", + }, + ] + }, + "test_class": ClientServerTestGlobalSingle, + }, + { # multiple clients, some with global, some without + "test_id": 5, + "fio_opts": { + "output-format": "json", + "servers": [ + { + "client" : 0, + "jobfile": "test04-noglobal.fio", + }, + { + "client" : 1, + "jobfile": "test01.fio", + }, + { + "client" : 2, + "jobfile": "test04-noglobal.fio", + }, + { + "client" : 3, + "jobfile": "test01.fio", + }, + ] + }, + "test_class": ClientServerTestGlobalMultiple, + }, + { # multiple clients, all with global sections + "test_id": 6, + "fio_opts": { + "output-format": "json", + "servers": [ + { + "client" : 0, + "jobfile": "test01.fio", + }, + { + "client" : 1, + "jobfile": "test01.fio", + }, + { + "client" : 2, + "jobfile": "test01.fio", + }, + { + "client" : 3, + "jobfile": "test01.fio", + }, + ] + }, + "test_class": ClientServerTestGlobalMultiple, + }, + { # Enable submission latency + "test_id": 7, + "fio_opts": { + "output-format": "json", + "servers": [ + { + "client" : 0, + "jobfile": "test07-slat.fio", + }, + { + "client" : 1, + "jobfile": "test07-slat.fio", + }, + ] + }, + "test_class": ClientServerTestAllClientsLat, + }, + { # Enable completion latency + "test_id": 8, + "fio_opts": { + "output-format": "json", + "servers": [ + { + "client" : 0, + "jobfile": "test08-clat.fio", + }, + { + "client" : 1, + "jobfile": "test08-clat.fio", + }, + ] + }, + "test_class": ClientServerTestAllClientsLat, + }, + { # Enable total latency + "test_id": 9, + "fio_opts": { + "output-format": "json", + "servers": [ + { + "client" : 0, + "jobfile": "test09-lat.fio", + }, + { + "client" : 1, + 
"jobfile": "test09-lat.fio", + }, + ] + }, + "test_class": ClientServerTestAllClientsLat, + }, + { # Disable completion latency + "test_id": 10, + "fio_opts": { + "output-format": "json", + "servers": [ + { + "client" : 0, + "jobfile": "test10-noclat.fio", + }, + { + "client" : 1, + "jobfile": "test10-noclat.fio", + }, + ] + }, + "test_class": ClientServerTestAllClientsLat, + }, + { # Enable submission, completion, total latency + "test_id": 11, + "fio_opts": { + "output-format": "json", + "servers": [ + { + "client" : 0, + "jobfile": "test11-alllat.fio", + }, + { + "client" : 1, + "jobfile": "test11-alllat.fio", + }, + ] + }, + "test_class": ClientServerTestAllClientsLat, + }, +] + + +def parse_args(): + """Parse command-line arguments.""" + + parser = argparse.ArgumentParser() + parser.add_argument('-d', '--debug', help='Enable debug messages', action='store_true') + parser.add_argument('-f', '--fio', help='path to file executable (e.g., ./fio)') + parser.add_argument('-a', '--artifact-root', help='artifact root directory') + parser.add_argument('-s', '--skip', nargs='+', type=int, + help='list of test(s) to skip') + parser.add_argument('-o', '--run-only', nargs='+', type=int, + help='list of test(s) to run, skipping all others') + args = parser.parse_args() + + return args + + +def start_servers(fio_path, servers=SERVER_LIST): + """Start servers for our tests.""" + + for server in servers: + tmpfile = tempfile.mktemp() + cmd = [fio_path, f"--server={server}", f"--daemonize={tmpfile}"] + cmd_result = subprocess.run(cmd, capture_output=True, check=False, + encoding=locale.getpreferredencoding()) + if cmd_result.returncode != 0: + logging.error("Unable to start server on %s: %s", server, cmd_result.stderr) + return False + + logging.debug("Started server %s", server) + PIDFILE_LIST.append(tmpfile) + + return True + + +def stop_servers(pidfiles=PIDFILE_LIST): + """Stop running fio server invocations.""" + + for pidfile in pidfiles: + with open(pidfile, "r", 
encoding=locale.getpreferredencoding()) as file: + pid = file.read().strip() + + cmd = ["kill", f"{pid}"] + cmd_result = subprocess.run(cmd, capture_output=True, check=False, + encoding=locale.getpreferredencoding()) + if cmd_result.returncode != 0: + logging.error("Unable to kill server with PID %s: %s", pid, cmd_result.stderr) + return False + logging.debug("Sent stop signal to PID %s", pid) + + return True + + +def main(): + """Run tests for fio's client/server mode.""" + + args = parse_args() + + if args.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + artifact_root = args.artifact_root if args.artifact_root else \ + f"client_server-test-{time.strftime('%Y%m%d-%H%M%S')}" + os.mkdir(artifact_root) + print(f"Artifact directory is {artifact_root}") + + if args.fio: + fio_path = str(Path(args.fio).absolute()) + else: + fio_path = os.path.join(os.path.dirname(__file__), '../fio') + print(f"fio path is {fio_path}") + + if not start_servers(fio_path): + sys.exit(1) + print("Servers started") + + job_path = os.path.join(os.path.dirname(__file__), "client_server") + for test in TEST_LIST: + opts = test['fio_opts'] + for server in opts['servers']: + server['client'] = SERVER_LIST[server['client']] + server['jobfile'] = os.path.join(job_path, server['jobfile']) + + test_env = { + 'fio_path': fio_path, + 'fio_root': str(Path(__file__).absolute().parent.parent), + 'artifact_root': artifact_root, + 'basename': 'client_server', + } + + _, failed, _ = run_fio_tests(TEST_LIST, test_env, args) + + stop_servers() + sys.exit(failed) + +if __name__ == '__main__': + main() diff --git a/t/client_server/test01.fio b/t/client_server/test01.fio new file mode 100644 index 0000000000..98927f56d4 --- /dev/null +++ b/t/client_server/test01.fio @@ -0,0 +1,14 @@ +[global] +ioengine=null +time_based +runtime=3s +filesize=1T + +[test1] +description=test1 + +[test2] +description=test2 + +[test3] +description=test3 diff --git 
a/t/client_server/test04-noglobal.fio b/t/client_server/test04-noglobal.fio new file mode 100644 index 0000000000..1353f889fc --- /dev/null +++ b/t/client_server/test04-noglobal.fio @@ -0,0 +1,20 @@ +[test1] +description=test1 +ioengine=null +time_based +runtime=3s +filesize=1T + +[test2] +description=test2 +ioengine=null +time_based +runtime=3s +filesize=1T + +[test3] +description=test3 +ioengine=null +time_based +runtime=3s +filesize=1T diff --git a/t/client_server/test07-slat.fio b/t/client_server/test07-slat.fio new file mode 100644 index 0000000000..595c66b359 --- /dev/null +++ b/t/client_server/test07-slat.fio @@ -0,0 +1,7 @@ +[test] +ioengine=null +iodepth=2 +filesize=1T +time_based +runtime=3s +slat_percentiles=1 diff --git a/t/client_server/test08-clat.fio b/t/client_server/test08-clat.fio new file mode 100644 index 0000000000..ef6ea512b1 --- /dev/null +++ b/t/client_server/test08-clat.fio @@ -0,0 +1,7 @@ +[test] +ioengine=null +iodepth=2 +filesize=1T +time_based +runtime=3s +clat_percentiles=1 diff --git a/t/client_server/test09-lat.fio b/t/client_server/test09-lat.fio new file mode 100644 index 0000000000..87ef9093cd --- /dev/null +++ b/t/client_server/test09-lat.fio @@ -0,0 +1,7 @@ +[test] +ioengine=null +iodepth=2 +filesize=1T +time_based +runtime=3s +lat_percentiles=1 diff --git a/t/client_server/test10-noclat.fio b/t/client_server/test10-noclat.fio new file mode 100644 index 0000000000..a27213e612 --- /dev/null +++ b/t/client_server/test10-noclat.fio @@ -0,0 +1,7 @@ +[test] +ioengine=null +iodepth=2 +filesize=1T +time_based +runtime=3s +clat_percentiles=0 diff --git a/t/client_server/test11-alllat.fio b/t/client_server/test11-alllat.fio new file mode 100644 index 0000000000..3404c2dbe4 --- /dev/null +++ b/t/client_server/test11-alllat.fio @@ -0,0 +1,9 @@ +[test] +ioengine=null +iodepth=2 +filesize=1T +time_based +runtime=3s +slat_percentiles=1 +clat_percentiles=1 +lat_percentiles=1 diff --git a/t/dedupe.c b/t/dedupe.c index 109ea1af49..02e52b742e 
100644 --- a/t/dedupe.c +++ b/t/dedupe.c @@ -143,15 +143,15 @@ static int read_block(int fd, void *buf, off_t offset) return __read_block(fd, buf, offset, blocksize); } -static void account_unique_capacity(uint64_t offset, uint64_t *unique_capacity, - struct zlib_ctrl *zc) +static int account_unique_capacity(uint64_t offset, uint64_t *unique_capacity, + struct zlib_ctrl *zc) { z_stream *stream = &zc->stream; unsigned int compressed_len; int ret; if (read_block(file.fd, zc->buf_in, offset)) - return; + return 1; stream->next_in = zc->buf_in; stream->avail_in = blocksize; @@ -159,7 +159,8 @@ static void account_unique_capacity(uint64_t offset, uint64_t *unique_capacity, stream->next_out = zc->buf_out; ret = deflate(stream, Z_FINISH); - assert(ret != Z_STREAM_ERROR); + if (ret == Z_STREAM_ERROR) + return 1; compressed_len = blocksize - stream->avail_out; if (dump_output) @@ -169,6 +170,7 @@ static void account_unique_capacity(uint64_t offset, uint64_t *unique_capacity, *unique_capacity += compressed_len; deflateReset(stream); + return 0; } static void add_item(struct chunk *c, struct item *i) @@ -225,12 +227,12 @@ static struct chunk *alloc_chunk(void) return c; } -static void insert_chunk(struct item *i, uint64_t *unique_capacity, - struct zlib_ctrl *zc) +static int insert_chunk(struct item *i, uint64_t *unique_capacity, + struct zlib_ctrl *zc) { struct fio_rb_node **p, *parent; struct chunk *c; - int diff; + int ret, diff; p = &rb_root.rb_node; parent = NULL; @@ -244,8 +246,6 @@ static void insert_chunk(struct item *i, uint64_t *unique_capacity, } else if (diff > 0) { p = &(*p)->rb_right; } else { - int ret; - if (!collision_check) goto add; @@ -266,17 +266,21 @@ static void insert_chunk(struct item *i, uint64_t *unique_capacity, memcpy(c->hash, i->hash, sizeof(i->hash)); rb_link_node(&c->rb_node, parent, p); rb_insert_color(&c->rb_node, &rb_root); - if (compression) - account_unique_capacity(i->offset, unique_capacity, zc); + if (compression) { + ret = 
account_unique_capacity(i->offset, unique_capacity, zc); + if (ret) + return ret; + } add: add_item(c, i); + return 0; } -static void insert_chunks(struct item *items, unsigned int nitems, - uint64_t *ndupes, uint64_t *unique_capacity, - struct zlib_ctrl *zc) +static int insert_chunks(struct item *items, unsigned int nitems, + uint64_t *ndupes, uint64_t *unique_capacity, + struct zlib_ctrl *zc) { - int i; + int i, ret = 0; fio_sem_down(rb_lock); @@ -288,11 +292,15 @@ static void insert_chunks(struct item *items, unsigned int nitems, s = sizeof(items[i].hash) / sizeof(uint32_t); r = bloom_set(bloom, items[i].hash, s); *ndupes += r; - } else - insert_chunk(&items[i], unique_capacity, zc); + } else { + ret = insert_chunk(&items[i], unique_capacity, zc); + if (ret) + break; + } } fio_sem_up(rb_lock); + return ret; } static void crc_buf(void *buf, uint32_t *hash) @@ -320,6 +328,7 @@ static int do_work(struct worker_thread *thread, void *buf) uint64_t ndupes = 0; uint64_t unique_capacity = 0; struct item *items; + int ret; offset = thread->cur_offset; @@ -339,13 +348,17 @@ static int do_work(struct worker_thread *thread, void *buf) nitems++; } - insert_chunks(items, nitems, &ndupes, &unique_capacity, &thread->zc); + ret = insert_chunks(items, nitems, &ndupes, &unique_capacity, &thread->zc); free(items); - thread->items += nitems; - thread->dupes += ndupes; - thread->unique_capacity += unique_capacity; - return 0; + if (!ret) { + thread->items += nitems; + thread->dupes += ndupes; + thread->unique_capacity += unique_capacity; + return 0; + } + + return ret; } static void thread_init_zlib_control(struct worker_thread *thread) @@ -675,7 +688,7 @@ int main(int argc, char *argv[]) use_bloom = 0; if (!num_threads) - num_threads = cpus_online(); + num_threads = cpus_configured(); if (argc == optind) return usage(argv); diff --git a/t/fiotestcommon.py b/t/fiotestcommon.py new file mode 100644 index 0000000000..04dfb91fcc --- /dev/null +++ b/t/fiotestcommon.py @@ -0,0 +1,181 @@ 
+#!/usr/bin/env python3 +""" +fiotestcommon.py + +This contains constant definitions, helpers, and a Requirements class that can +be used to help with running fio tests. +""" + +import os +import locale +import logging +import platform +import subprocess +import multiprocessing + + +SUCCESS_DEFAULT = { + 'zero_return': True, + 'stderr_empty': True, + 'timeout': 600, + } +SUCCESS_LONG = { + 'zero_return': True, + 'stderr_empty': True, + 'timeout': 3600, + } +SUCCESS_NONZERO = { + 'zero_return': False, + 'stderr_empty': False, + 'timeout': 600, + } +SUCCESS_STDERR = { + 'zero_return': True, + 'stderr_empty': False, + 'timeout': 600, + } + + +def get_file(filename): + """Safely read a file.""" + file_data = '' + success = True + + try: + with open(filename, "r", encoding=locale.getpreferredencoding()) as output_file: + file_data = output_file.read() + except OSError: + success = False + + return file_data, success + + +class Requirements(): + """Requirements consists of multiple run environment characteristics. 
+ These are to determine if a particular test can be run""" + + _linux = False + _libaio = False + _io_uring = False + _zbd = False + _root = False + _zoned_nullb = False + _not_macos = False + _not_windows = False + _unittests = False + _cpucount4 = False + _nvmecdev = False + + def __init__(self, fio_root, args): + Requirements._not_macos = platform.system() != "Darwin" + Requirements._not_windows = platform.system() != "Windows" + Requirements._linux = platform.system() == "Linux" + + if Requirements._linux: + config_file = os.path.join(fio_root, "config-host.h") + contents, success = get_file(config_file) + if not success: + print(f"Unable to open {config_file} to check requirements") + Requirements._zbd = True + else: + Requirements._zbd = "CONFIG_HAS_BLKZONED" in contents + Requirements._libaio = "CONFIG_LIBAIO" in contents + + contents, success = get_file("/proc/kallsyms") + if not success: + print("Unable to open '/proc/kallsyms' to probe for io_uring support") + else: + Requirements._io_uring = "io_uring_setup" in contents + + Requirements._root = os.geteuid() == 0 + if Requirements._zbd and Requirements._root: + try: + subprocess.run(["modprobe", "null_blk"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if os.path.exists("/sys/module/null_blk/parameters/zoned"): + Requirements._zoned_nullb = True + except Exception: + pass + + if platform.system() == "Windows": + utest_exe = "unittest.exe" + else: + utest_exe = "unittest" + unittest_path = os.path.join(fio_root, "unittests", utest_exe) + Requirements._unittests = os.path.exists(unittest_path) + + Requirements._cpucount4 = multiprocessing.cpu_count() >= 4 + Requirements._nvmecdev = args.nvmecdev if hasattr(args, 'nvmecdev') else False + + req_list = [ + Requirements.linux, + Requirements.libaio, + Requirements.io_uring, + Requirements.zbd, + Requirements.root, + Requirements.zoned_nullb, + Requirements.not_macos, + Requirements.not_windows, + Requirements.unittests, + Requirements.cpucount4, + 
Requirements.nvmecdev, + ] + for req in req_list: + value, desc = req() + logging.debug("Requirements: Requirement '%s' met? %s", desc, value) + + @classmethod + def linux(cls): + """Are we running on Linux?""" + return Requirements._linux, "Linux required" + + @classmethod + def libaio(cls): + """Is libaio available?""" + return Requirements._libaio, "libaio required" + + @classmethod + def io_uring(cls): + """Is io_uring available?""" + return Requirements._io_uring, "io_uring required" + + @classmethod + def zbd(cls): + """Is ZBD support available?""" + return Requirements._zbd, "Zoned block device support required" + + @classmethod + def root(cls): + """Are we running as root?""" + return Requirements._root, "root required" + + @classmethod + def zoned_nullb(cls): + """Are zoned null block devices available?""" + return Requirements._zoned_nullb, "Zoned null block device support required" + + @classmethod + def not_macos(cls): + """Are we running on a platform other than macOS?""" + return Requirements._not_macos, "platform other than macOS required" + + @classmethod + def not_windows(cls): + """Are we running on a platform other than Windws?""" + return Requirements._not_windows, "platform other than Windows required" + + @classmethod + def unittests(cls): + """Were unittests built?""" + return Requirements._unittests, "Unittests support required" + + @classmethod + def cpucount4(cls): + """Do we have at least 4 CPUs?""" + return Requirements._cpucount4, "4+ CPUs required" + + @classmethod + def nvmecdev(cls): + """Do we have an NVMe character device to test?""" + return Requirements._nvmecdev, "NVMe character device test target required" diff --git a/t/fiotestlib.py b/t/fiotestlib.py new file mode 100755 index 0000000000..2049e41ec4 --- /dev/null +++ b/t/fiotestlib.py @@ -0,0 +1,490 @@ +#!/usr/bin/env python3 +""" +fiotestlib.py + +This library contains FioTest objects that provide convenient means to run +different sorts of fio tests. 
+ +It also contains a test runner that runs an array of dictionary objects +describing fio tests. +""" + +import os +import sys +import json +import locale +import shutil +import logging +import platform +import traceback +import subprocess +from pathlib import Path +from fiotestcommon import get_file, SUCCESS_DEFAULT + + +class FioTest(): + """Base for all fio tests.""" + + def __init__(self, exe_path, success, testnum, artifact_root): + self.success = success + self.testnum = testnum + self.output = {} + self.passed = True + self.failure_reason = '' + self.parameters = None + self.paths = { + 'exe': exe_path, + 'artifacts': artifact_root, + 'test_dir': os.path.join(artifact_root, \ + f"{testnum:04d}"), + } + self.filenames = { + 'cmd': os.path.join(self.paths['test_dir'], \ + f"{os.path.basename(self.paths['exe'])}.command"), + 'stdout': os.path.join(self.paths['test_dir'], \ + f"{os.path.basename(self.paths['exe'])}.stdout"), + 'stderr': os.path.join(self.paths['test_dir'], \ + f"{os.path.basename(self.paths['exe'])}.stderr"), + 'exitcode': os.path.join(self.paths['test_dir'], \ + f"{os.path.basename(self.paths['exe'])}.exitcode"), + } + + def setup(self, parameters): + """Setup instance variables for test.""" + + self.parameters = parameters + if not os.path.exists(self.paths['test_dir']): + os.mkdir(self.paths['test_dir']) + + def run(self): + """Run the test.""" + + raise NotImplementedError() + + def check_result(self): + """Check test results.""" + + raise NotImplementedError() + + +class FioExeTest(FioTest): + """Test consists of an executable binary or script""" + + def run(self): + """Execute the binary or script described by this instance.""" + + command = [self.paths['exe']] + self.parameters + with open(self.filenames['cmd'], "w+", + encoding=locale.getpreferredencoding()) as command_file: + command_file.write(" \\\n ".join(command)) + + try: + with open(self.filenames['stdout'], "w+", + encoding=locale.getpreferredencoding()) as stdout_file, \ + 
open(self.filenames['stderr'], "w+", + encoding=locale.getpreferredencoding()) as stderr_file, \ + open(self.filenames['exitcode'], "w+", + encoding=locale.getpreferredencoding()) as exitcode_file: + proc = None + # Avoid using subprocess.run() here because when a timeout occurs, + # fio will be stopped with SIGKILL. This does not give fio a + # chance to clean up and means that child processes may continue + # running and submitting IO. + proc = subprocess.Popen(command, + stdout=stdout_file, + stderr=stderr_file, + cwd=self.paths['test_dir'], + universal_newlines=True) + proc.communicate(timeout=self.success['timeout']) + exitcode_file.write(f'{proc.returncode}\n') + logging.debug("Test %d: return code: %d", self.testnum, proc.returncode) + self.output['proc'] = proc + except subprocess.TimeoutExpired: + proc.terminate() + proc.communicate() + assert proc.poll() + self.output['failure'] = 'timeout' + except Exception: + if proc: + if not proc.poll(): + proc.terminate() + proc.communicate() + self.output['failure'] = 'exception' + self.output['exc_info'] = sys.exc_info() + + def check_result(self): + """Check results of test run.""" + + if 'proc' not in self.output: + if self.output['failure'] == 'timeout': + self.failure_reason = f"{self.failure_reason} timeout," + else: + assert self.output['failure'] == 'exception' + self.failure_reason = f'{self.failure_reason} exception: ' + \ + f'{self.output["exc_info"][0]}, {self.output["exc_info"][1]}' + + self.passed = False + return + + if 'zero_return' in self.success: + if self.success['zero_return']: + if self.output['proc'].returncode != 0: + self.passed = False + self.failure_reason = f"{self.failure_reason} non-zero return code," + else: + if self.output['proc'].returncode == 0: + self.failure_reason = f"{self.failure_reason} zero return code," + self.passed = False + + stderr_size = os.path.getsize(self.filenames['stderr']) + if 'stderr_empty' in self.success: + if self.success['stderr_empty']: + if stderr_size 
!= 0: + self.failure_reason = f"{self.failure_reason} stderr not empty size {stderr_size}," + self.passed = False + else: + if stderr_size == 0: + self.failure_reason = f"{self.failure_reason} stderr empty," + self.passed = False + + +class FioJobFileTest(FioExeTest): + """Test consists of a fio job with options in a job file.""" + + def __init__(self, fio_path, fio_job, success, testnum, artifact_root, + fio_pre_job=None, fio_pre_success=None, + output_format="normal"): + """Construct a FioJobFileTest which is a FioExeTest consisting of a + single fio job file with an optional setup step. + + fio_path: location of fio executable + fio_job: location of fio job file + success: Definition of test success + testnum: test ID + artifact_root: root directory for artifacts + fio_pre_job: fio job for preconditioning + fio_pre_success: Definition of test success for fio precon job + output_format: normal (default), json, jsonplus, or terse + """ + + self.fio_job = fio_job + self.fio_pre_job = fio_pre_job + self.fio_pre_success = fio_pre_success if fio_pre_success else success + self.output_format = output_format + self.precon_failed = False + self.json_data = None + + super().__init__(fio_path, success, testnum, artifact_root) + + def setup(self, parameters): + """Setup instance variables for fio job test.""" + + self.filenames['fio_output'] = f"{os.path.basename(self.fio_job)}.output" + fio_args = [ + "--max-jobs=16", + f"--output-format={self.output_format}", + f"--output={self.filenames['fio_output']}", + self.fio_job, + ] + if parameters: + fio_args += parameters + + super().setup(fio_args) + + # Update the filenames from the default + self.filenames['cmd'] = os.path.join(self.paths['test_dir'], + f"{os.path.basename(self.fio_job)}.command") + self.filenames['stdout'] = os.path.join(self.paths['test_dir'], + f"{os.path.basename(self.fio_job)}.stdout") + self.filenames['stderr'] = os.path.join(self.paths['test_dir'], + f"{os.path.basename(self.fio_job)}.stderr") + 
self.filenames['exitcode'] = os.path.join(self.paths['test_dir'], + f"{os.path.basename(self.fio_job)}.exitcode") + + def run_pre_job(self): + """Run fio job precondition step.""" + + precon = FioJobFileTest(self.paths['exe'], self.fio_pre_job, + self.fio_pre_success, + self.testnum, + self.paths['artifacts'], + output_format=self.output_format) + precon.setup(None) + precon.run() + precon.check_result() + self.precon_failed = not precon.passed + self.failure_reason = precon.failure_reason + + def run(self): + """Run fio job test.""" + + if self.fio_pre_job: + self.run_pre_job() + + if not self.precon_failed: + super().run() + else: + logging.debug("Test %d: precondition step failed", self.testnum) + + def get_file_fail(self, filename): + """Safely read a file and fail the test upon error.""" + file_data = None + + try: + with open(filename, "r", encoding=locale.getpreferredencoding()) as output_file: + file_data = output_file.read() + except OSError: + self.failure_reason += f" unable to read file {filename}" + self.passed = False + + return file_data + + def check_result(self): + """Check fio job results.""" + + if self.precon_failed: + self.passed = False + self.failure_reason = f"{self.failure_reason} precondition step failed," + return + + super().check_result() + + if not self.passed: + return + + if 'json' not in self.output_format: + return + + file_data = self.get_file_fail(os.path.join(self.paths['test_dir'], + self.filenames['fio_output'])) + if not file_data: + return + + # + # Sometimes fio informational messages are included outside the JSON + # output, especially under Windows. 
Try to decode output as JSON data, + # skipping outside the first { and last } + # + lines = file_data.splitlines() + last = len(lines) - lines[::-1].index("}") + file_data = '\n'.join(lines[lines.index("{"):last]) + try: + self.json_data = json.loads(file_data) + except json.JSONDecodeError: + self.failure_reason = f"{self.failure_reason} unable to decode JSON data," + self.passed = False + + +class FioJobCmdTest(FioExeTest): + """This runs a fio job with options specified on the command line.""" + + def __init__(self, fio_path, success, testnum, artifact_root, fio_opts, basename=None): + + self.basename = basename if basename else os.path.basename(fio_path) + self.fio_opts = fio_opts + self.json_data = None + self.iops_log_lines = None + + super().__init__(fio_path, success, testnum, artifact_root) + + filename_stub = os.path.join(self.paths['test_dir'], f"{self.basename}{self.testnum:03d}") + self.filenames['cmd'] = f"{filename_stub}.command" + self.filenames['stdout'] = f"{filename_stub}.stdout" + self.filenames['stderr'] = f"{filename_stub}.stderr" + self.filenames['output'] = os.path.abspath(f"{filename_stub}.output") + self.filenames['exitcode'] = f"{filename_stub}.exitcode" + self.filenames['iopslog'] = os.path.abspath(f"{filename_stub}") + + def run(self): + super().run() + + if 'output-format' in self.fio_opts and 'json' in \ + self.fio_opts['output-format']: + if not self.get_json(): + print('Unable to decode JSON data') + self.passed = False + + if any('--write_iops_log=' in param for param in self.parameters): + self.get_iops_log() + + def get_iops_log(self): + """Read IOPS log from the first job.""" + + log_filename = self.filenames['iopslog'] + "_iops.1.log" + with open(log_filename, 'r', encoding=locale.getpreferredencoding()) as iops_file: + self.iops_log_lines = iops_file.read() + + def get_json(self): + """Convert fio JSON output into a python JSON object""" + + filename = self.filenames['output'] + with open(filename, 'r', 
encoding=locale.getpreferredencoding()) as file: + file_data = file.read() + + # + # Sometimes fio informational messages are included outside the JSON + # output, especially under Windows. Try to decode output as JSON data, + # skipping outside the first { and last } + # + lines = file_data.splitlines() + last = len(lines) - lines[::-1].index("}") + file_data = '\n'.join(lines[lines.index("{"):last]) + try: + self.json_data = json.loads(file_data) + except json.JSONDecodeError: + return False + + return True + + @staticmethod + def check_empty(job): + """ + Make sure JSON data is empty. + + Some data structures should be empty. This function makes sure that they are. + + job JSON object that we need to check for emptiness + """ + + return job['total_ios'] == 0 and \ + job['slat_ns']['N'] == 0 and \ + job['clat_ns']['N'] == 0 and \ + job['lat_ns']['N'] == 0 + + def check_all_ddirs(self, ddir_nonzero, job): + """ + Iterate over the data directions and check whether each is + appropriately empty or not. + """ + + retval = True + ddirlist = ['read', 'write', 'trim'] + + for ddir in ddirlist: + if ddir in ddir_nonzero: + if self.check_empty(job[ddir]): + print(f"Unexpected zero {ddir} data found in output") + retval = False + else: + if not self.check_empty(job[ddir]): + print(f"Unexpected {ddir} data found in output") + retval = False + + return retval + + +def run_fio_tests(test_list, test_env, args): + """ + Run tests as specified in test_list. 
+ """ + + passed = 0 + failed = 0 + skipped = 0 + + for config in test_list: + if (args.skip and config['test_id'] in args.skip) or \ + (args.run_only and config['test_id'] not in args.run_only) or \ + ('force_skip' in config and config['force_skip']): + skipped = skipped + 1 + print(f"Test {config['test_id']} SKIPPED (User request or override)") + continue + + if issubclass(config['test_class'], FioJobFileTest): + if config['pre_job']: + fio_pre_job = os.path.join(test_env['fio_root'], 't', 'jobs', + config['pre_job']) + else: + fio_pre_job = None + if config['pre_success']: + fio_pre_success = config['pre_success'] + else: + fio_pre_success = None + if 'output_format' in config: + output_format = config['output_format'] + else: + output_format = 'normal' + test = config['test_class']( + test_env['fio_path'], + os.path.join(test_env['fio_root'], 't', 'jobs', config['job']), + config['success'], + config['test_id'], + test_env['artifact_root'], + fio_pre_job=fio_pre_job, + fio_pre_success=fio_pre_success, + output_format=output_format) + desc = config['job'] + parameters = config['parameters'] if 'parameters' in config else None + elif issubclass(config['test_class'], FioJobCmdTest): + if not 'success' in config: + config['success'] = SUCCESS_DEFAULT + test = config['test_class'](test_env['fio_path'], + config['success'], + config['test_id'], + test_env['artifact_root'], + config['fio_opts'], + test_env['basename']) + desc = config['test_id'] + parameters = config + elif issubclass(config['test_class'], FioExeTest): + exe_path = os.path.join(test_env['fio_root'], config['exe']) + parameters = [] + if config['parameters']: + parameters = [p.format(fio_path=test_env['fio_path'], nvmecdev=args.nvmecdev) + for p in config['parameters']] + if Path(exe_path).suffix == '.py' and platform.system() == "Windows": + parameters.insert(0, exe_path) + exe_path = "python.exe" + if config['test_id'] in test_env['pass_through']: + parameters += 
test_env['pass_through'][config['test_id']].split() + test = config['test_class']( + exe_path, + config['success'], + config['test_id'], + test_env['artifact_root']) + desc = config['exe'] + else: + print(f"Test {config['test_id']} FAILED: unable to process test config") + failed = failed + 1 + continue + + if 'requirements' in config and not args.skip_req: + reqs_met = True + for req in config['requirements']: + reqs_met, reason = req() + logging.debug("Test %d: Requirement '%s' met? %s", config['test_id'], reason, + reqs_met) + if not reqs_met: + break + if not reqs_met: + print(f"Test {config['test_id']} SKIPPED ({reason}) {desc}") + skipped = skipped + 1 + continue + + try: + test.setup(parameters) + test.run() + test.check_result() + except KeyboardInterrupt: + break + except Exception as e: + test.passed = False + test.failure_reason += str(e) + logging.debug("Test %d exception:\n%s\n", config['test_id'], traceback.format_exc()) + if test.passed: + result = "PASSED" + passed = passed + 1 + if hasattr(args, 'cleanup') and args.cleanup: + shutil.rmtree(test_env['artifact_root'] + f"/{config['test_id']:04d}", ignore_errors=True) + else: + result = f"FAILED: {test.failure_reason}" + failed = failed + 1 + contents, _ = get_file(test.filenames['stderr']) + logging.debug("Test %d: stderr:\n%s", config['test_id'], contents) + contents, _ = get_file(test.filenames['stdout']) + logging.debug("Test %d: stdout:\n%s", config['test_id'], contents) + print(f"Test {config['test_id']} {result} {desc}") + + print(f"{passed} test(s) passed, {failed} failed, {skipped} skipped") + + return passed, failed, skipped diff --git a/t/io_uring.c b/t/io_uring.c index a98f78fd4a..0a04af4ed0 100644 --- a/t/io_uring.c +++ b/t/io_uring.c @@ -11,6 +11,10 @@ #include #endif +#ifdef CONFIG_LIBNUMA +#include +#endif + #include #include #include @@ -24,13 +28,16 @@ #include #include #include +#include #include "../arch/arch.h" +#include "../os/os.h" #include "../lib/types.h" #include 
"../lib/roundup.h" #include "../lib/rand.h" #include "../minmax.h" #include "../os/linux/io_uring.h" +#include "../engines/nvme.h" struct io_sq_ring { unsigned *head; @@ -63,6 +70,8 @@ struct file { unsigned long max_size; unsigned long cur_off; unsigned pending_ios; + unsigned int nsid; /* nsid field required for nvme-passthrough */ + unsigned int lba_shift; /* lba_shift field required for nvme-passthrough */ int real_fd; int fixed_fd; int fileno; @@ -76,6 +85,7 @@ struct file { struct submitter { pthread_t thread; int ring_fd; + int enter_ring_fd; int index; struct io_sq_ring sq_ring; struct io_uring_sqe *sqes; @@ -85,6 +95,7 @@ struct submitter { unsigned long reaps; unsigned long done; unsigned long calls; + unsigned long io_errors; volatile int finish; __s32 *fds; @@ -99,6 +110,10 @@ struct submitter { io_context_t aio_ctx; #endif + int numa_node; + int per_file_depth; + const char *filename; + struct file files[MAX_FDS]; unsigned nr_files; unsigned cur_file; @@ -109,6 +124,7 @@ static struct submitter *submitter; static volatile int finish; static int stats_running; static unsigned long max_iops; +static long t_io_uring_page_size; static int depth = DEPTH; static int batch_submit = BATCH_SUBMIT; @@ -116,38 +132,79 @@ static int batch_complete = BATCH_COMPLETE; static int bs = BS; static int polled = 1; /* use IO polling */ static int fixedbufs = 1; /* use fixed user buffers */ -static int dma_map; /* pre-map DMA buffers */ static int register_files = 1; /* use fixed files */ static int buffered = 0; /* use buffered IO, not O_DIRECT */ static int sq_thread_poll = 0; /* use kernel submission/poller thread */ static int sq_thread_cpu = -1; /* pin above thread to this CPU */ static int do_nop = 0; /* no-op SQ ring commands */ +static int use_files = 1; static int nthreads = 1; static int stats = 0; /* generate IO stats */ static int aio = 0; /* use libaio */ static int runtime = 0; /* runtime */ static int random_io = 1; /* random or sequential IO */ +static int 
register_ring = 1; /* register ring */ +static int use_sync = 0; /* use preadv2 */ +static int numa_placement = 0; /* set to node of device */ +static int vectored = 0; /* use vectored IO */ +static int pt = 0; /* passthrough I/O or not */ +static int restriction = 0; /* for testing restriction filter */ static unsigned long tsc_rate; #define TSC_RATE_FILE "tsc-rate" -static int vectored = 1; - static float plist[] = { 1.0, 5.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 95.0, 99.0, 99.5, 99.9, 99.95, 99.99 }; static int plist_len = 17; -#ifndef IORING_REGISTER_MAP_BUFFERS -#define IORING_REGISTER_MAP_BUFFERS 20 -struct io_uring_map_buffers { - __s32 fd; - __u32 buf_start; - __u32 buf_end; - __u32 flags; - __u64 rsvd[2]; -}; -#endif +static int nvme_identify(int fd, __u32 nsid, enum nvme_identify_cns cns, + enum nvme_csi csi, void *data) +{ + struct nvme_passthru_cmd cmd = { + .opcode = nvme_admin_identify, + .nsid = nsid, + .addr = (__u64)(uintptr_t)data, + .data_len = NVME_IDENTIFY_DATA_SIZE, + .cdw10 = cns, + .cdw11 = csi << NVME_IDENTIFY_CSI_SHIFT, + .timeout_ms = NVME_DEFAULT_IOCTL_TIMEOUT, + }; + + return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd); +} + +static int nvme_get_info(int fd, __u32 *nsid, __u32 *lba_sz, __u64 *nlba) +{ + struct nvme_id_ns ns; + int namespace_id; + int err; + + namespace_id = ioctl(fd, NVME_IOCTL_ID); + if (namespace_id < 0) { + fprintf(stderr, "error failed to fetch namespace-id\n"); + close(fd); + return -errno; + } + + /* + * Identify namespace to get namespace-id, namespace size in LBA's + * and LBA data size. 
+ */ + err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_NS, + NVME_CSI_NVM, &ns); + if (err) { + fprintf(stderr, "error failed to fetch identify namespace\n"); + close(fd); + return err; + } + + *nsid = namespace_id; + *lba_sz = 1 << ns.lbaf[(ns.flbas & 0x0f)].ds; + *nlba = ns.nsze; + + return 0; +} static unsigned long cycles_to_nsec(unsigned long cycles) { @@ -183,9 +240,9 @@ static unsigned long plat_idx_to_val(unsigned int idx) return cycles_to_nsec(base + ((k + 0.5) * (1 << error_bits))); } -unsigned int calc_clat_percentiles(unsigned long *io_u_plat, unsigned long nr, - unsigned long **output, - unsigned long *maxv, unsigned long *minv) +unsigned int calculate_clat_percentiles(unsigned long *io_u_plat, + unsigned long nr, unsigned long **output, + unsigned long *maxv, unsigned long *minv) { unsigned long sum = 0; unsigned int len = plist_len, i, j = 0; @@ -239,7 +296,7 @@ static void show_clat_percentiles(unsigned long *io_u_plat, unsigned long nr, bool is_last; char fmt[32]; - len = calc_clat_percentiles(io_u_plat, nr, &ovals, &maxv, &minv); + len = calculate_clat_percentiles(io_u_plat, nr, &ovals, &maxv, &minv); if (!len || !ovals) goto out; @@ -287,6 +344,7 @@ static void show_clat_percentiles(unsigned long *io_u_plat, unsigned long nr, free(ovals); } +#ifdef ARCH_HAVE_CPU_CLOCK static unsigned int plat_val_to_idx(unsigned long val) { unsigned int msb, error_bits, base, offset, idx; @@ -322,6 +380,7 @@ static unsigned int plat_val_to_idx(unsigned long val) return idx; } +#endif static void add_stat(struct submitter *s, int clock_index, int nr) { @@ -338,40 +397,25 @@ static void add_stat(struct submitter *s, int clock_index, int nr) #endif } -static int io_uring_map_buffers(struct submitter *s) -{ - struct io_uring_map_buffers map = { - .fd = s->files[0].real_fd, - .buf_end = depth, - }; - - if (do_nop) - return 0; - if (s->nr_files > 1) { - fprintf(stderr, "Can't map buffers with multiple files\n"); - return -1; - } - - return 
syscall(__NR_io_uring_register, s->ring_fd, - IORING_REGISTER_MAP_BUFFERS, &map, 1); -} - static int io_uring_register_buffers(struct submitter *s) { - if (do_nop) - return 0; + int ret; - return syscall(__NR_io_uring_register, s->ring_fd, - IORING_REGISTER_BUFFERS, s->iovecs, depth); + /* + * All iovecs are filled in case of readv, but it's all contig + * from vec0. Just register a single buffer for all buffers. + */ + s->iovecs[0].iov_len = bs * roundup_pow2(depth); + ret = syscall(__NR_io_uring_register, s->ring_fd, + IORING_REGISTER_BUFFERS, s->iovecs, 1); + s->iovecs[0].iov_len = bs; + return ret; } static int io_uring_register_files(struct submitter *s) { int i; - if (do_nop) - return 0; - s->fds = calloc(s->nr_files, sizeof(__s32)); for (i = 0; i < s->nr_files; i++) { s->fds[i] = s->files[i].real_fd; @@ -384,6 +428,8 @@ static int io_uring_register_files(struct submitter *s) static int io_uring_setup(unsigned entries, struct io_uring_params *p) { + int ret; + /* * Clamp CQ ring size at our SQ ring size, we don't need more entries * than that. 
@@ -391,84 +437,111 @@ static int io_uring_setup(unsigned entries, struct io_uring_params *p) p->flags |= IORING_SETUP_CQSIZE; p->cq_entries = entries; - return syscall(__NR_io_uring_setup, entries, p); -} - -static void io_uring_probe(int fd) -{ - struct io_uring_probe *p; - int ret; - - p = malloc(sizeof(*p) + 256 * sizeof(struct io_uring_probe_op)); - if (!p) - return; - - memset(p, 0, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op)); - ret = syscall(__NR_io_uring_register, fd, IORING_REGISTER_PROBE, p, 256); - if (ret < 0) - goto out; + p->flags |= IORING_SETUP_COOP_TASKRUN; + p->flags |= IORING_SETUP_SINGLE_ISSUER; + p->flags |= IORING_SETUP_DEFER_TASKRUN; + p->flags |= IORING_SETUP_NO_SQARRAY; +retry: + ret = syscall(__NR_io_uring_setup, entries, p); + if (!ret) + return 0; - if (IORING_OP_READ > p->ops_len) - goto out; + if (errno == EINVAL && p->flags & IORING_SETUP_COOP_TASKRUN) { + p->flags &= ~IORING_SETUP_COOP_TASKRUN; + goto retry; + } + if (errno == EINVAL && p->flags & IORING_SETUP_SINGLE_ISSUER) { + p->flags &= ~IORING_SETUP_SINGLE_ISSUER; + goto retry; + } + if (errno == EINVAL && p->flags & IORING_SETUP_DEFER_TASKRUN) { + p->flags &= ~IORING_SETUP_DEFER_TASKRUN; + goto retry; + } + if (errno == EINVAL && p->flags & IORING_SETUP_NO_SQARRAY) { + p->flags &= ~IORING_SETUP_NO_SQARRAY; + goto retry; + } - if ((p->ops[IORING_OP_READ].flags & IO_URING_OP_SUPPORTED)) - vectored = 0; -out: - free(p); + return ret; } static int io_uring_enter(struct submitter *s, unsigned int to_submit, unsigned int min_complete, unsigned int flags) { - return syscall(__NR_io_uring_enter, s->ring_fd, to_submit, min_complete, - flags, NULL, 0); + if (register_ring) + flags |= IORING_ENTER_REGISTERED_RING; +#ifdef FIO_ARCH_HAS_SYSCALL + return __do_syscall6(__NR_io_uring_enter, s->enter_ring_fd, to_submit, + min_complete, flags, NULL, 0); +#else + return syscall(__NR_io_uring_enter, s->enter_ring_fd, to_submit, + min_complete, flags, NULL, 0); +#endif } -#ifndef 
CONFIG_HAVE_GETTID -static int gettid(void) +static unsigned long long get_offset(struct submitter *s, struct file *f) { - return syscall(__NR_gettid); -} -#endif + unsigned long long offset; + long r; -static unsigned file_depth(struct submitter *s) -{ - return (depth + s->nr_files - 1) / s->nr_files; + if (random_io) { + unsigned long long block; + + r = __rand64(&s->rand_state); + block = r % f->max_blocks; + offset = block * (unsigned long long) bs; + } else { + offset = f->cur_off; + f->cur_off += bs; + if (f->cur_off + bs > f->max_size) + f->cur_off = 0; + } + + return offset; } -static void init_io(struct submitter *s, unsigned index) +static struct file *get_next_file(struct submitter *s) { - struct io_uring_sqe *sqe = &s->sqes[index]; - unsigned long offset; struct file *f; - long r; - - if (do_nop) { - sqe->opcode = IORING_OP_NOP; - return; - } if (s->nr_files == 1) { f = &s->files[0]; } else { f = &s->files[s->cur_file]; - if (f->pending_ios >= file_depth(s)) { + if (f->pending_ios >= s->per_file_depth) { s->cur_file++; if (s->cur_file == s->nr_files) s->cur_file = 0; f = &s->files[s->cur_file]; } } + f->pending_ios++; + return f; +} - if (random_io) { - r = __rand64(&s->rand_state); - offset = (r % (f->max_blocks - 1)) * bs; - } else { - offset = f->cur_off; - f->cur_off += bs; - if (f->cur_off + bs > f->max_size) - f->cur_off = 0; +static void init_io(struct submitter *s, unsigned index) +{ + struct io_uring_sqe *sqe = &s->sqes[index]; + struct file *f; + + f = get_next_file(s); + + if (do_nop) { + sqe->rw_flags = IORING_NOP_FILE; + if (register_files) { + sqe->fd = f->fixed_fd; + sqe->rw_flags |= IORING_NOP_FIXED_FILE; + } else { + sqe->fd = f->real_fd; + } + if (fixedbufs) + sqe->rw_flags |= IORING_NOP_FIXED_BUFFER; + sqe->rw_flags |= IORING_NOP_INJECT_RESULT; + sqe->len = bs; + sqe->opcode = IORING_OP_NOP; + return; } if (register_files) { @@ -482,7 +555,7 @@ static void init_io(struct submitter *s, unsigned index) sqe->opcode = 
IORING_OP_READ_FIXED; sqe->addr = (unsigned long) s->iovecs[index].iov_base; sqe->len = bs; - sqe->buf_index = index; + sqe->buf_index = 0; } else if (!vectored) { sqe->opcode = IORING_OP_READ; sqe->addr = (unsigned long) s->iovecs[index].iov_base; @@ -495,26 +568,82 @@ static void init_io(struct submitter *s, unsigned index) sqe->buf_index = 0; } sqe->ioprio = 0; - sqe->off = offset; + sqe->off = get_offset(s, f); sqe->user_data = (unsigned long) f->fileno; if (stats && stats_running) sqe->user_data |= ((uint64_t)s->clock_index << 32); } +static void init_io_pt(struct submitter *s, unsigned index) +{ + struct io_uring_sqe *sqe = &s->sqes[index << 1]; + unsigned long offset; + struct file *f; + struct nvme_uring_cmd *cmd; + unsigned long long slba; + unsigned long long nlb; + + f = get_next_file(s); + + offset = get_offset(s, f); + + if (register_files) { + sqe->fd = f->fixed_fd; + sqe->flags = IOSQE_FIXED_FILE; + } else { + sqe->fd = f->real_fd; + sqe->flags = 0; + } + sqe->opcode = IORING_OP_URING_CMD; + sqe->user_data = (unsigned long) f->fileno; + if (stats) + sqe->user_data |= ((__u64) s->clock_index << 32ULL); + sqe->cmd_op = NVME_URING_CMD_IO; + slba = offset >> f->lba_shift; + nlb = (bs >> f->lba_shift) - 1; + cmd = (struct nvme_uring_cmd *)&sqe->cmd; + /* cdw10 and cdw11 represent starting slba*/ + cmd->cdw10 = slba & 0xffffffff; + cmd->cdw11 = slba >> 32; + /* cdw12 represent number of lba to be read*/ + cmd->cdw12 = nlb; + cmd->addr = (unsigned long) s->iovecs[index].iov_base; + cmd->data_len = bs; + if (fixedbufs) { + sqe->uring_cmd_flags = IORING_URING_CMD_FIXED; + sqe->buf_index = 0; + } + if (vectored) { + sqe->cmd_op = NVME_URING_CMD_IO_VEC; + cmd->addr = (unsigned long) &s->iovecs[index]; + cmd->data_len = 1; + sqe->buf_index = 0; + } + cmd->nsid = f->nsid; + cmd->opcode = 2; +} + static int prep_more_ios_uring(struct submitter *s, int max_ios) { struct io_sq_ring *ring = &s->sq_ring; - unsigned index, tail, next_tail, prepped = 0; + unsigned head, 
index, tail, next_tail, prepped = 0; + + if (sq_thread_poll) + head = atomic_load_acquire(ring->head); + else + head = *ring->head; next_tail = tail = *ring->tail; do { next_tail++; - if (next_tail == atomic_load_acquire(ring->head)) + if (next_tail == head) break; index = tail & sq_ring_mask; - init_io(s, index); - ring->array[index] = index; + if (pt) + init_io_pt(s, index); + else + init_io(s, index); prepped++; tail = next_tail; } while (prepped < max_ios); @@ -530,7 +659,29 @@ static int get_file_size(struct file *f) if (fstat(f->real_fd, &st) < 0) return -1; - if (S_ISBLK(st.st_mode)) { + if (pt) { + __u64 nlba; + __u32 lbs; + int ret; + + if (!S_ISCHR(st.st_mode)) { + fprintf(stderr, "passthrough works with only nvme-ns " + "generic devices (/dev/ngXnY)\n"); + return -1; + } + ret = nvme_get_info(f->real_fd, &f->nsid, &lbs, &nlba); + if (ret) + return -1; + if ((bs % lbs) != 0) { + printf("error: bs:%d should be a multiple logical_block_size:%d\n", + bs, lbs); + return -1; + } + f->max_blocks = nlba; + f->max_size = nlba; + f->lba_shift = ilog2(lbs); + return 0; + } else if (S_ISBLK(st.st_mode)) { unsigned long long bytes; if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0) @@ -552,27 +703,31 @@ static int reap_events_uring(struct submitter *s) { struct io_cq_ring *ring = &s->cq_ring; struct io_uring_cqe *cqe; - unsigned head, reaped = 0; + unsigned tail, head, reaped = 0; int last_idx = -1, stat_nr = 0; head = *ring->head; + tail = atomic_load_acquire(ring->tail); do { struct file *f; - read_barrier(); - if (head == atomic_load_acquire(ring->tail)) + if (head == tail) break; cqe = &ring->cqes[head & cq_ring_mask]; - if (!do_nop) { + if (use_files) { int fileno = cqe->user_data & 0xffffffff; f = &s->files[fileno]; f->pending_ios--; if (cqe->res != bs) { - printf("io: unexpected ret=%d\n", cqe->res); - if (polled && cqe->res == -EOPNOTSUPP) - printf("Your filesystem/driver/kernel doesn't support polled IO\n"); - return -1; + if (cqe->res == -ENODATA || cqe->res 
== -EIO) { + s->io_errors++; + } else { + printf("io: unexpected ret=%d\n", cqe->res); + if (polled && cqe->res == -EOPNOTSUPP) + printf("Your filesystem/driver/kernel doesn't support polled IO\n"); + return -1; + } } } if (stats) { @@ -591,29 +746,355 @@ static int reap_events_uring(struct submitter *s) head++; } while (1); - if (stat_nr) - add_stat(s, last_idx, stat_nr); + if (stat_nr) + add_stat(s, last_idx, stat_nr); + + if (reaped) { + s->inflight -= reaped; + atomic_store_release(ring->head, head); + } + return reaped; +} + +static int reap_events_uring_pt(struct submitter *s) +{ + struct io_cq_ring *ring = &s->cq_ring; + struct io_uring_cqe *cqe; + unsigned head, tail, reaped = 0; + int last_idx = -1, stat_nr = 0; + unsigned index; + int fileno; + + head = *ring->head; + tail = atomic_load_acquire(ring->tail); + do { + struct file *f; + + if (head == tail) + break; + index = head & cq_ring_mask; + cqe = &ring->cqes[index << 1]; + fileno = cqe->user_data & 0xffffffff; + f = &s->files[fileno]; + f->pending_ios--; + + if (cqe->res != 0) { + printf("io: unexpected ret=%d\n", cqe->res); + if (polled && cqe->res == -EINVAL) + printf("passthrough doesn't support polled IO\n"); + return -1; + } + if (stats) { + int clock_index = cqe->user_data >> 32; + + if (last_idx != clock_index) { + if (last_idx != -1) { + add_stat(s, last_idx, stat_nr); + stat_nr = 0; + } + last_idx = clock_index; + } + stat_nr++; + } + reaped++; + head++; + } while (1); + + if (stat_nr) + add_stat(s, last_idx, stat_nr); + + if (reaped) { + s->inflight -= reaped; + atomic_store_release(ring->head, head); + } + return reaped; +} + +static void set_affinity(struct submitter *s) +{ +#ifdef CONFIG_LIBNUMA + struct bitmask *mask; + + if (s->numa_node == -1) + return; + + numa_set_preferred(s->numa_node); + + mask = numa_allocate_cpumask(); + numa_node_to_cpus(s->numa_node, mask); + numa_sched_setaffinity(s->tid, mask); +#endif +} + +static int detect_node(struct submitter *s, char *name) +{ +#ifdef 
CONFIG_LIBNUMA + const char *base = basename(name); + char str[128]; + int ret, fd, node; + + if (pt) + sprintf(str, "/sys/class/nvme-generic/%s/device/numa_node", base); + else + sprintf(str, "/sys/block/%s/device/numa_node", base); + fd = open(str, O_RDONLY); + if (fd < 0) + return -1; + + ret = read(fd, str, sizeof(str)); + if (ret < 0) { + close(fd); + return -1; + } + node = atoi(str); + s->numa_node = node; + close(fd); +#else + s->numa_node = -1; +#endif + return 0; +} + +static int setup_aio(struct submitter *s) +{ +#ifdef CONFIG_LIBAIO + if (polled) { + fprintf(stderr, "aio does not support polled IO\n"); + polled = 0; + } + if (sq_thread_poll) { + fprintf(stderr, "aio does not support SQPOLL IO\n"); + sq_thread_poll = 0; + } + if (do_nop) { + fprintf(stderr, "aio does not support polled IO\n"); + do_nop = 0; + } + if (fixedbufs || register_files) { + fprintf(stderr, "aio does not support registered files or buffers\n"); + fixedbufs = register_files = 0; + } + + s->per_file_depth = (depth + s->nr_files - 1) / s->nr_files; + return io_queue_init(roundup_pow2(depth), &s->aio_ctx); +#else + fprintf(stderr, "Legacy AIO not available on this system/build\n"); + errno = EINVAL; + return -1; +#endif +} + +static int io_uring_register_restrictions(struct submitter *s) +{ + struct io_uring_restriction res[8] = { }; + int ret; + + res[0].opcode = IORING_RESTRICTION_SQE_OP; + res[0].sqe_op = IORING_OP_NOP; + res[1].opcode = IORING_RESTRICTION_SQE_OP; + res[1].sqe_op = IORING_OP_READ; + res[2].opcode = IORING_RESTRICTION_SQE_OP; + res[2].sqe_op = IORING_OP_READV; + res[3].opcode = IORING_RESTRICTION_SQE_OP; + res[3].sqe_op = IORING_OP_READ_FIXED; + + res[4].opcode = IORING_RESTRICTION_REGISTER_OP; + res[4].sqe_op = IORING_REGISTER_BUFFERS; + res[5].opcode = IORING_RESTRICTION_REGISTER_OP; + res[5].sqe_op = IORING_REGISTER_ENABLE_RINGS; + res[6].opcode = IORING_RESTRICTION_REGISTER_OP; + res[6].sqe_op = IORING_REGISTER_RING_FDS; + res[7].opcode = 
IORING_RESTRICTION_REGISTER_OP; + res[7].sqe_op = IORING_REGISTER_FILES; + + ret = syscall(__NR_io_uring_register, s->ring_fd, + IORING_REGISTER_RESTRICTIONS, res, 8); + if (ret) { + fprintf(stderr, "IORING_REGISTER_RESTRICTIONS: %d\n", ret); + return ret; + } + + return syscall(__NR_io_uring_register, s->ring_fd, IORING_REGISTER_ENABLE_RINGS, NULL, 0); +} + +static int setup_ring(struct submitter *s) +{ + struct io_sq_ring *sring = &s->sq_ring; + struct io_cq_ring *cring = &s->cq_ring; + struct io_uring_params p; + int ret, fd, i; + void *ptr; + size_t len; + + memset(&p, 0, sizeof(p)); + + if (polled && !do_nop) + p.flags |= IORING_SETUP_IOPOLL; + if (sq_thread_poll) { + p.flags |= IORING_SETUP_SQPOLL; + if (sq_thread_cpu != -1) { + p.flags |= IORING_SETUP_SQ_AFF; + p.sq_thread_cpu = sq_thread_cpu; + } + } + if (pt) { + p.flags |= IORING_SETUP_SQE128; + p.flags |= IORING_SETUP_CQE32; + } + if (restriction) + p.flags |= IORING_SETUP_R_DISABLED; + + fd = io_uring_setup(depth, &p); + if (fd < 0) { + perror("io_uring_setup"); + return 1; + } + s->ring_fd = s->enter_ring_fd = fd; + + if (restriction) { + /* enables rings too */ + ret = io_uring_register_restrictions(s); + if (ret) { + fprintf(stderr, "Failed to set restrictions\n"); + return ret; + } + } + + if (fixedbufs) { + struct rlimit rlim; + + rlim.rlim_cur = RLIM_INFINITY; + rlim.rlim_max = RLIM_INFINITY; + /* ignore potential error, not needed on newer kernels */ + setrlimit(RLIMIT_MEMLOCK, &rlim); + + ret = io_uring_register_buffers(s); + if (ret < 0) { + perror("io_uring_register_buffers"); + return 1; + } + } + + if (register_files) { + ret = io_uring_register_files(s); + if (ret < 0) { + perror("io_uring_register_files"); + return 1; + } + } + + ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, + IORING_OFF_SQ_RING); + sring->head = ptr + p.sq_off.head; + sring->tail = ptr + p.sq_off.tail; + sring->ring_mask = ptr + p.sq_off.ring_mask; + 
sring->ring_entries = ptr + p.sq_off.ring_entries; + sring->flags = ptr + p.sq_off.flags; + sq_ring_mask = *sring->ring_mask; + + if (!(p.flags & IORING_SETUP_NO_SQARRAY)) { + sring->array = ptr + p.sq_off.array; + for (i = 0; i < p.sq_entries; i++) + sring->array[i] = i; + } + + if (p.flags & IORING_SETUP_SQE128) + len = 2 * p.sq_entries * sizeof(struct io_uring_sqe); + else + len = p.sq_entries * sizeof(struct io_uring_sqe); + s->sqes = mmap(0, len, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, + IORING_OFF_SQES); + + if (p.flags & IORING_SETUP_CQE32) { + len = p.cq_off.cqes + + 2 * p.cq_entries * sizeof(struct io_uring_cqe); + } else { + len = p.cq_off.cqes + + p.cq_entries * sizeof(struct io_uring_cqe); + } + ptr = mmap(0, len, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, + IORING_OFF_CQ_RING); + cring->head = ptr + p.cq_off.head; + cring->tail = ptr + p.cq_off.tail; + cring->ring_mask = ptr + p.cq_off.ring_mask; + cring->ring_entries = ptr + p.cq_off.ring_entries; + cring->cqes = ptr + p.cq_off.cqes; + cq_ring_mask = *cring->ring_mask; + + s->per_file_depth = INT_MAX; + if (s->nr_files) + s->per_file_depth = (depth + s->nr_files - 1) / s->nr_files; + return 0; +} - if (reaped) { - s->inflight -= reaped; - atomic_store_release(ring->head, head); +static void *allocate_mem(struct submitter *s, int size) +{ + void *buf; + +#ifdef CONFIG_LIBNUMA + if (s->numa_node != -1) + return numa_alloc_onnode(size, s->numa_node); +#endif + + if (posix_memalign(&buf, t_io_uring_page_size, size)) { + printf("failed alloc\n"); + return NULL; } - return reaped; + + return buf; } static int submitter_init(struct submitter *s) { - int i, nr_batch; + int i, nr_batch, err; + static int init_printed; + void *mem, *ptr; + char buf[80]; s->tid = gettid(); - printf("submitter=%d, tid=%d\n", s->index, s->tid); + printf("submitter=%d, tid=%d, file=%s, nfiles=%d, node=%d\n", s->index, + s->tid, s->filename, s->nr_files, s->numa_node); - __init_rand64(&s->rand_state, 
pthread_self()); - srand48(pthread_self()); + set_affinity(s); + + __init_rand64(&s->rand_state, s->tid); + srand48(s->tid); for (i = 0; i < MAX_FDS; i++) s->files[i].fileno = i; + mem = allocate_mem(s, bs * roundup_pow2(depth)); + for (i = 0, ptr = mem; i < roundup_pow2(depth); i++) { + s->iovecs[i].iov_base = ptr; + s->iovecs[i].iov_len = bs; + ptr += bs; + } + + if (use_sync) { + sprintf(buf, "Engine=preadv2\n"); + err = 0; + } else if (!aio) { + err = setup_ring(s); + if (!err) + sprintf(buf, "Engine=io_uring, sq_ring=%d, cq_ring=%d\n", *s->sq_ring.ring_entries, *s->cq_ring.ring_entries); + } else { + sprintf(buf, "Engine=aio\n"); + err = setup_aio(s); + } + if (err) { + printf("queue setup failed: %s, %d\n", strerror(errno), err); + return -1; + } + + if (!init_printed) { + printf("polled=%d, fixedbufs=%d, register_files=%d, buffered=%d, QD=%d\n", polled, fixedbufs, register_files, buffered, depth); + printf("%s", buf); + init_printed = 1; + } + if (stats) { nr_batch = roundup_pow2(depth / batch_submit); if (nr_batch < 2) @@ -627,43 +1108,38 @@ static int submitter_init(struct submitter *s) s->plat = NULL; nr_batch = 0; } + /* perform the expensive command initialization part for passthrough here + * rather than in the fast path + */ + if (pt) { + for (i = 0; i < roundup_pow2(depth); i++) { + struct io_uring_sqe *sqe = &s->sqes[i << 1]; + memset(&sqe->cmd, 0, sizeof(struct nvme_uring_cmd)); + } + } return nr_batch; } #ifdef CONFIG_LIBAIO static int prep_more_ios_aio(struct submitter *s, int max_ios, struct iocb *iocbs) { - unsigned long offset, data; + uint64_t data; struct file *f; unsigned index; - long r; index = 0; while (index < max_ios) { struct iocb *iocb = &iocbs[index]; - if (s->nr_files == 1) { - f = &s->files[0]; - } else { - f = &s->files[s->cur_file]; - if (f->pending_ios >= file_depth(s)) { - s->cur_file++; - if (s->cur_file == s->nr_files) - s->cur_file = 0; - f = &s->files[s->cur_file]; - } - } - f->pending_ios++; + f = get_next_file(s); - r = 
lrand48(); - offset = (r % (f->max_blocks - 1)) * bs; io_prep_pread(iocb, f->real_fd, s->iovecs[index].iov_base, - s->iovecs[index].iov_len, offset); + s->iovecs[index].iov_len, get_offset(s, f)); data = f->fileno; if (stats && stats_running) - data |= ((unsigned long) s->clock_index << 32); + data |= (((uint64_t) s->clock_index) << 32); iocb->data = (void *) (uintptr_t) data; index++; } @@ -676,15 +1152,19 @@ static int reap_events_aio(struct submitter *s, struct io_event *events, int evs int reaped = 0; while (evs) { - unsigned long data = (uintptr_t) events[reaped].data; + uint64_t data = (uintptr_t) events[reaped].data; struct file *f = &s->files[data & 0xffffffff]; f->pending_ios--; if (events[reaped].res != bs) { - printf("io: unexpected ret=%ld\n", events[reaped].res); - return -1; - } - if (stats) { + if (events[reaped].res == -ENODATA || + events[reaped].res == -EIO) { + s->io_errors++; + } else { + printf("io: unexpected ret=%ld\n", events[reaped].res); + return -1; + } + } else if (stats) { int clock_index = data >> 32; if (last_idx != clock_index) { @@ -711,12 +1191,21 @@ static int reap_events_aio(struct submitter *s, struct io_event *events, int evs static void *submitter_aio_fn(void *data) { struct submitter *s = data; - int i, ret, prepped, nr_batch; + int i, ret, prepped; struct iocb **iocbsptr; struct iocb *iocbs; struct io_event *events; +#ifdef ARCH_HAVE_CPU_CLOCK + int nr_batch; +#endif + + ret = submitter_init(s); + if (ret < 0) + goto done; - nr_batch = submitter_init(s); +#ifdef ARCH_HAVE_CPU_CLOCK + nr_batch = ret; +#endif iocbsptr = calloc(depth, sizeof(struct iocb *)); iocbs = calloc(depth, sizeof(struct iocb)); @@ -779,18 +1268,59 @@ static void *submitter_aio_fn(void *data) free(iocbsptr); free(iocbs); free(events); +done: finish = 1; return NULL; } #endif +static void io_uring_unregister_ring(struct submitter *s) +{ + struct io_uring_rsrc_update up = { + .offset = s->enter_ring_fd, + }; + + syscall(__NR_io_uring_register, s->ring_fd, 
IORING_UNREGISTER_RING_FDS, + &up, 1); +} + +static int io_uring_register_ring(struct submitter *s) +{ + struct io_uring_rsrc_update up = { + .data = s->ring_fd, + .offset = -1U, + }; + int ret; + + ret = syscall(__NR_io_uring_register, s->ring_fd, + IORING_REGISTER_RING_FDS, &up, 1); + if (ret == 1) { + s->enter_ring_fd = up.offset; + return 0; + } + register_ring = 0; + return -1; +} + static void *submitter_uring_fn(void *data) { struct submitter *s = data; struct io_sq_ring *ring = &s->sq_ring; - int ret, prepped, nr_batch; + int ret, prepped; +#ifdef ARCH_HAVE_CPU_CLOCK + int nr_batch; +#endif + + ret = submitter_init(s); + if (ret < 0) + goto done; + +#ifdef ARCH_HAVE_CPU_CLOCK + nr_batch = ret; +#endif - nr_batch = submitter_init(s); + if (register_ring) + io_uring_register_ring(s); prepped = 0; do { @@ -845,7 +1375,10 @@ static void *submitter_uring_fn(void *data) do { int r; - r = reap_events_uring(s); + if (pt) + r = reap_events_uring_pt(s); + else + r = reap_events_uring(s); if (r == -1) { s->finish = 1; break; @@ -884,9 +1417,68 @@ static void *submitter_uring_fn(void *data) } } while (!s->finish); + if (register_ring) + io_uring_unregister_ring(s); + +done: + finish = 1; + return NULL; +} + +#ifdef CONFIG_PWRITEV2 +static void *submitter_sync_fn(void *data) +{ + struct submitter *s = data; + int ret; + + if (submitter_init(s) < 0) + goto done; + + do { + uint64_t offset; + struct file *f; + + f = get_next_file(s); + +#ifdef ARCH_HAVE_CPU_CLOCK + if (stats) + s->clock_batch[s->clock_index] = get_cpu_clock(); +#endif + + s->inflight++; + s->calls++; + + offset = get_offset(s, f); + if (polled) + ret = preadv2(f->real_fd, &s->iovecs[0], 1, offset, RWF_HIPRI); + else + ret = preadv2(f->real_fd, &s->iovecs[0], 1, offset, 0); + + if (ret < 0) { + perror("preadv2"); + break; + } else if (ret != bs) { + break; + } + + s->done++; + s->inflight--; + f->pending_ios--; + if (stats) + add_stat(s, s->clock_index, 1); + } while (!s->finish); + +done: + finish = 1; + 
return NULL; +} +#else +static void *submitter_sync_fn(void *data) +{ finish = 1; return NULL; } +#endif static struct submitter *get_submitter(int offset) { @@ -901,15 +1493,21 @@ static struct submitter *get_submitter(int offset) static void do_finish(const char *reason) { int j; + printf("Exiting on %s\n", reason); for (j = 0; j < nthreads; j++) { struct submitter *s = get_submitter(j); s->finish = 1; } - if (max_iops > 100000) - printf("Maximum IOPS=%luK\n", max_iops / 1000); - else if (max_iops) + if (max_iops > 1000000) { + double miops = (double) max_iops / 1000000.0; + printf("Maximum IOPS=%.2fM\n", miops); + } else if (max_iops > 100000) { + double kiops = (double) max_iops / 1000.0; + printf("Maximum IOPS=%.2fK\n", kiops); + } else { printf("Maximum IOPS=%lu\n", max_iops); + } finish = 1; } @@ -933,144 +1531,6 @@ static void arm_sig_int(void) #endif } -static int setup_aio(struct submitter *s) -{ -#ifdef CONFIG_LIBAIO - if (polled) { - fprintf(stderr, "aio does not support polled IO\n"); - polled = 0; - } - if (sq_thread_poll) { - fprintf(stderr, "aio does not support SQPOLL IO\n"); - sq_thread_poll = 0; - } - if (do_nop) { - fprintf(stderr, "aio does not support polled IO\n"); - do_nop = 0; - } - if (fixedbufs || register_files) { - fprintf(stderr, "aio does not support registered files or buffers\n"); - fixedbufs = register_files = 0; - } - - return io_queue_init(depth, &s->aio_ctx); -#else - fprintf(stderr, "Legacy AIO not available on this system/build\n"); - errno = EINVAL; - return -1; -#endif -} - -static int setup_ring(struct submitter *s) -{ - struct io_sq_ring *sring = &s->sq_ring; - struct io_cq_ring *cring = &s->cq_ring; - struct io_uring_params p; - int ret, fd; - void *ptr; - - memset(&p, 0, sizeof(p)); - - if (polled && !do_nop) - p.flags |= IORING_SETUP_IOPOLL; - if (sq_thread_poll) { - p.flags |= IORING_SETUP_SQPOLL; - if (sq_thread_cpu != -1) { - p.flags |= IORING_SETUP_SQ_AFF; - p.sq_thread_cpu = sq_thread_cpu; - } - } - - fd = 
io_uring_setup(depth, &p); - if (fd < 0) { - perror("io_uring_setup"); - return 1; - } - s->ring_fd = fd; - - io_uring_probe(fd); - - if (fixedbufs) { - struct rlimit rlim; - - rlim.rlim_cur = RLIM_INFINITY; - rlim.rlim_max = RLIM_INFINITY; - /* ignore potential error, not needed on newer kernels */ - setrlimit(RLIMIT_MEMLOCK, &rlim); - - ret = io_uring_register_buffers(s); - if (ret < 0) { - perror("io_uring_register_buffers"); - return 1; - } - - if (dma_map) { - ret = io_uring_map_buffers(s); - if (ret < 0) { - perror("io_uring_map_buffers"); - return 1; - } - } - } - - if (register_files) { - ret = io_uring_register_files(s); - if (ret < 0) { - perror("io_uring_register_files"); - return 1; - } - } - - ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32), - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, - IORING_OFF_SQ_RING); - sring->head = ptr + p.sq_off.head; - sring->tail = ptr + p.sq_off.tail; - sring->ring_mask = ptr + p.sq_off.ring_mask; - sring->ring_entries = ptr + p.sq_off.ring_entries; - sring->flags = ptr + p.sq_off.flags; - sring->array = ptr + p.sq_off.array; - sq_ring_mask = *sring->ring_mask; - - s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe), - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, - IORING_OFF_SQES); - - ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe), - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, - IORING_OFF_CQ_RING); - cring->head = ptr + p.cq_off.head; - cring->tail = ptr + p.cq_off.tail; - cring->ring_mask = ptr + p.cq_off.ring_mask; - cring->ring_entries = ptr + p.cq_off.ring_entries; - cring->cqes = ptr + p.cq_off.cqes; - cq_ring_mask = *cring->ring_mask; - return 0; -} - -static void file_depths(char *buf) -{ - bool prev = false; - char *p; - int i, j; - - buf[0] = '\0'; - p = buf; - for (j = 0; j < nthreads; j++) { - struct submitter *s = get_submitter(j); - - for (i = 0; i < s->nr_files; i++) { - struct file *f = &s->files[i]; - - if (prev) - p += 
sprintf(p, " %d", f->pending_ios); - else - p += sprintf(p, "%d", f->pending_ios); - prev = true; - } - } -} - static void usage(char *argv, int status) { char runtime_str[16]; @@ -1082,7 +1542,6 @@ static void usage(char *argv, int status) " -b : Block size, default %d\n" " -p : Polled IO, default %d\n" " -B : Fixed buffers, default %d\n" - " -D : DMA map fixed buffers, default %d\n" " -F : Register files, default %d\n" " -n : Number of threads, default %d\n" " -O : Use O_DIRECT, default %d\n" @@ -1091,10 +1550,18 @@ static void usage(char *argv, int status) " -T : TSC rate in HZ\n" " -r : Runtime in seconds, default %s\n" " -R : Use random IO, default %d\n" - " -a : Use legacy aio, default %d\n", + " -a : Use legacy aio, default %d\n" + " -S : Use sync IO (preadv2), default %d\n" + " -X : Use registered ring %d\n" + " -P : Automatically place on device home node %d\n" + " -V : Vectored IO, default %d\n" + " -e : Set restriction filter on opcodes %d\n" + " -u : Use nvme-passthrough I/O, default %d\n", argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, polled, - fixedbufs, dma_map, register_files, nthreads, !buffered, do_nop, - stats, runtime == 0 ? "unlimited" : runtime_str, aio, random_io); + fixedbufs, register_files, nthreads, !buffered, do_nop, + stats, runtime == 0 ? 
"unlimited" : runtime_str, random_io, aio, + use_sync, register_ring, numa_placement, vectored, restriction, + pt); exit(status); } @@ -1145,16 +1612,15 @@ static void write_tsc_rate(void) int main(int argc, char *argv[]) { struct submitter *s; - unsigned long done, calls, reap; - int err, i, j, flags, fd, opt, threads_per_f, threads_rem = 0, nfiles; + unsigned long done, calls, reap, io_errors; + int i, j, flags, fd, opt, threads_per_f, threads_rem = 0, nfiles; struct file f; - char *fdepths; void *ret; if (!do_nop && argc < 2) usage(argv[0], 1); - while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:t:T:a:r:D:R:h?")) != -1) { + while ((opt = getopt(argc, argv, "e:d:s:c:b:p:B:F:n:N:O:t:T:a:r:D:R:X:S:P:V:u:h?")) != -1) { switch (opt) { case 'a': aio = !!atoi(optarg); @@ -1215,12 +1681,32 @@ int main(int argc, char *argv[]) case 'r': runtime = atoi(optarg); break; - case 'D': - dma_map = !!atoi(optarg); - break; case 'R': random_io = !!atoi(optarg); break; + case 'X': + register_ring = !!atoi(optarg); + break; + case 'S': +#ifdef CONFIG_PWRITEV2 + use_sync = !!atoi(optarg); +#else + fprintf(stderr, "preadv2 not supported\n"); + exit(1); +#endif + break; + case 'P': + numa_placement = !!atoi(optarg); + break; + case 'V': + vectored = !!atoi(optarg); + break; + case 'u': + pt = !!atoi(optarg); + break; + case 'e': + restriction = !!atoi(optarg); + break; case 'h': case '?': default: @@ -1236,15 +1722,14 @@ int main(int argc, char *argv[]) batch_complete = depth; if (batch_submit > depth) batch_submit = depth; - if (!fixedbufs && dma_map) - dma_map = 0; submitter = calloc(nthreads, sizeof(*submitter) + - depth * sizeof(struct iovec)); + roundup_pow2(depth) * sizeof(struct iovec)); for (j = 0; j < nthreads; j++) { s = get_submitter(j); + s->numa_node = -1; s->index = j; - s->done = s->calls = s->reaps = 0; + s->done = s->calls = s->reaps = s->io_errors = 0; } flags = O_RDONLY | O_NOATIME; @@ -1254,7 +1739,7 @@ int main(int argc, char *argv[]) j = 0; i = optind; nfiles 
= argc - i; - if (!do_nop) { + if (use_files) { if (!nfiles) { printf("No files specified\n"); usage(argv[0], 1); @@ -1267,7 +1752,7 @@ int main(int argc, char *argv[]) threads_rem = nthreads - threads_per_f * nfiles; } } - while (!do_nop && i < argc) { + while (use_files && i < argc) { int k, limit; memset(&f, 0, sizeof(f)); @@ -1300,7 +1785,10 @@ int main(int argc, char *argv[]) memcpy(&s->files[s->nr_files], &f, sizeof(f)); - printf("Added file %s (submitter %d)\n", argv[i], s->index); + if (numa_placement) + detect_node(s, argv[i]); + + s->filename = argv[i]; s->nr_files++; } threads_rem--; @@ -1310,42 +1798,15 @@ int main(int argc, char *argv[]) arm_sig_int(); - for (j = 0; j < nthreads; j++) { - s = get_submitter(j); - for (i = 0; i < depth; i++) { - void *buf; - - if (posix_memalign(&buf, bs, bs)) { - printf("failed alloc\n"); - return 1; - } - s->iovecs[i].iov_base = buf; - s->iovecs[i].iov_len = bs; - } - } - - for (j = 0; j < nthreads; j++) { - s = get_submitter(j); - - if (!aio) - err = setup_ring(s); - else - err = setup_aio(s); - if (err) { - printf("ring setup failed: %s, %d\n", strerror(errno), err); - return 1; - } - } - s = get_submitter(0); - printf("polled=%d, fixedbufs=%d/%d, register_files=%d, buffered=%d, QD=%d\n", polled, fixedbufs, dma_map, register_files, buffered, depth); - if (!aio) - printf("Engine=io_uring, sq_ring=%d, cq_ring=%d\n", *s->sq_ring.ring_entries, *s->cq_ring.ring_entries); - else - printf("Engine=aio\n"); + t_io_uring_page_size = sysconf(_SC_PAGESIZE); + if (t_io_uring_page_size < 0) + t_io_uring_page_size = 4096; for (j = 0; j < nthreads; j++) { s = get_submitter(j); - if (!aio) + if (use_sync) + pthread_create(&s->thread, NULL, submitter_sync_fn, s); + else if (!aio) pthread_create(&s->thread, NULL, submitter_uring_fn, s); #ifdef CONFIG_LIBAIO else @@ -1353,12 +1814,12 @@ int main(int argc, char *argv[]) #endif } - fdepths = malloc(8 * s->nr_files * nthreads); - reap = calls = done = 0; + reap = calls = done = io_errors = 
0; do { unsigned long this_done = 0; unsigned long this_reap = 0; unsigned long this_call = 0; + unsigned long this_io_errors = 0; unsigned long rpc = 0, ipc = 0; unsigned long iops, bw; @@ -1379,29 +1840,43 @@ int main(int argc, char *argv[]) this_done += s->done; this_call += s->calls; this_reap += s->reaps; + this_io_errors += s->io_errors; } if (this_call - calls) { rpc = (this_done - done) / (this_call - calls); ipc = (this_reap - reap) / (this_call - calls); } else rpc = ipc = -1; - file_depths(fdepths); iops = this_done - done; + iops -= this_io_errors - io_errors; if (bs > 1048576) bw = iops * (bs / 1048576); else bw = iops / (1048576 / bs); - if (iops > 100000) - printf("IOPS=%luK, ", iops / 1000); - else + if (iops > 1000000) { + double miops = (double) iops / 1000000.0; + printf("IOPS=%.2fM, ", miops); + } else if (iops > 100000) { + double kiops = (double) iops / 1000.0; + printf("IOPS=%.2fK, ", kiops); + } else { printf("IOPS=%lu, ", iops); + } max_iops = max(max_iops, iops); - if (!do_nop) - printf("BW=%luMiB/s, ", bw); - printf("IOS/call=%ld/%ld, inflight=(%s)\n", rpc, ipc, fdepths); + if (!do_nop) { + if (bw > 2000) { + double bw_g = (double) bw / 1000.0; + + printf("BW=%.2fGiB/s, ", bw_g); + } else { + printf("BW=%luMiB/s, ", bw); + } + } + printf("IOS/call=%ld/%ld\n", rpc, ipc); done = this_done; calls = this_call; reap = this_reap; + io_errors = this_io_errors; } while (!finish); for (j = 0; j < nthreads; j++) { @@ -1409,6 +1884,9 @@ int main(int argc, char *argv[]) pthread_join(s->thread, &ret); close(s->ring_fd); + if (s->io_errors) + printf("%d: %lu IO errors\n", s->tid, s->io_errors); + if (stats) { unsigned long nr; @@ -1421,7 +1899,6 @@ int main(int argc, char *argv[]) } } - free(fdepths); free(submitter); return 0; } diff --git a/t/io_uring_pi.py b/t/io_uring_pi.py new file mode 100644 index 0000000000..bd92edfd0e --- /dev/null +++ b/t/io_uring_pi.py @@ -0,0 +1,408 @@ +#!/usr/bin/env python3 + +""" +# io_uring_pi.py +# +# Test metadata 
support using the io_uring ioengine. +# +# USAGE +# See python3 io_uring_pi.py --help +# +# EXAMPLES (THIS IS A DESTRUCTIVE TEST!!) +# python3 t/io_uring_pi.py --dut /dev/nvme1n1 -f ./fio +# +# REQUIREMENTS +# Python 3.6 +# +""" + +import os +import sys +import json +import time +import locale +import logging +import argparse +import itertools +import subprocess +from pathlib import Path +from fiotestlib import FioJobCmdTest, run_fio_tests +from fiotestcommon import SUCCESS_NONZERO + + +NUMBER_IOS = 8192 +BS_LOW = 1 +BS_HIGH = 16 + +class DifDixTest(FioJobCmdTest): + """ + NVMe DIF/DIX test class. + """ + + def setup(self, parameters): + """Setup a test.""" + + fio_args = [ + "--name=io_uring_pi", + "--ioengine=io_uring", + "--direct=1", + f"--filename={self.fio_opts['filename']}", + f"--rw={self.fio_opts['rw']}", + f"--bsrange={self.fio_opts['bsrange']}", + f"--output={os.path.basename(self.filenames['output'])}", + f"--md_per_io_size={self.fio_opts['md_per_io_size']}", + "--pi_act=0", + f"--pi_chk={self.fio_opts['pi_chk']}", + f"--apptag={self.fio_opts['apptag']}", + f"--apptag_mask={self.fio_opts['apptag_mask']}", + ] + for opt in ['fixedbufs', 'nonvectored', 'force_async', 'registerfiles', + 'sqthread_poll', 'sqthread_poll_cpu', 'hipri', 'nowait', + 'time_based', 'runtime', 'verify', 'io_size', 'offset', 'number_ios', + 'output-format']: + if opt in self.fio_opts: + option = f"--{opt}={self.fio_opts[opt]}" + fio_args.append(option) + + super().setup(fio_args) + + +TEST_LIST = [ +# +# Write data with pi_act=0 and then read the data back +# + { + # Write workload with variable IO sizes + # pi_act=0 + "test_id": 101, + "fio_opts": { + "rw": 'write', + "number_ios": NUMBER_IOS, + "output-format": "json", + "apptag": "0x8888", + "apptag_mask": "0xFFFF", + "pi_act": 0, + }, + "pi_chk": "GUARD,REFTAG,APPTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "test_class": DifDixTest, + }, + { + # Read workload with fixed small IO size + # pi_act=0 + "test_id": 102, + 
"fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "apptag": "0x8888", + "apptag_mask": "0xFFFF", + "pi_act": 0, + }, + "pi_chk": "GUARD,REFTAG,APPTAG", + "bs_low": BS_LOW, + "bs_high": BS_LOW, + "test_class": DifDixTest, + }, + { + # Read workload with variable IO size + # pi_act=0 + "test_id": 103, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "apptag": "0x8888", + "apptag_mask": "0xFFFF", + "pi_act": 0, + }, + "pi_chk": "GUARD,REFTAG,APPTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "test_class": DifDixTest, + }, + { + # Read workload with variable IO size + # trigger apptag mismatch error + # pi_act=0 + "test_id": 104, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "apptag": "0xA888", + "apptag_mask": "0xFFFF", + "pi_act": 0, + }, + "pi_chk": "GUARD,REFTAG,APPTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "success": SUCCESS_NONZERO, + "test_class": DifDixTest, + }, + { + # Read workload with variable IO size + # fails because apptag mask must be 0xFFFF + # pi_act=0 + "test_id": 105, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "apptag": "0xF888", + "apptag_mask": "0x0FFF", + "pi_act": 0, + }, + "pi_chk": "GUARD,REFTAG,APPTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "success": SUCCESS_NONZERO, + "test_class": DifDixTest, + }, +] + + +def get_lbafs(args): + """ + Determine which LBA formats to use. Use either the ones specified on the + command line or if none are specified query the device and use all lba + formats with metadata. 
+ """ + lbaf_list = [] + id_ns_cmd = f"sudo nvme id-ns --output-format=json {args.dut}".split(' ') + id_ns_output = subprocess.check_output(id_ns_cmd) + lbafs = json.loads(id_ns_output)['lbafs'] + if args.lbaf: + for lbaf in args.lbaf: + lbaf_list.append({'lbaf': lbaf, 'ds': 2 ** lbafs[lbaf]['ds'], + 'ms': lbafs[lbaf]['ms'], }) + if lbafs[lbaf]['ms'] == 0: + print(f'Error: lbaf {lbaf} has metadata size zero') + sys.exit(1) + else: + for lbaf_num, lbaf in enumerate(lbafs): + if lbaf['ms'] != 0: + lbaf_list.append({'lbaf': lbaf_num, 'ds': 2 ** lbaf['ds'], + 'ms': lbaf['ms'], }) + + return lbaf_list + + +def get_guard_pi(lbaf_list, args): + """ + Find out how many bits of guard protection information are associated with + each lbaf to be used. If this is not available assume 16-bit guard pi. + Also record the bytes of protection information associated with the number + of guard PI bits. + """ + nvm_id_ns_cmd = f"sudo nvme nvm-id-ns --output-format=json {args.dut}".split(' ') + try: + nvm_id_ns_output = subprocess.check_output(nvm_id_ns_cmd) + except subprocess.CalledProcessError: + print(f"Non-zero return code from {' '.join(nvm_id_ns_cmd)}; " \ + "assuming all lbafs use 16b Guard Protection Information") + for lbaf in lbaf_list: + lbaf['guard_pi_bits'] = 16 + else: + elbafs = json.loads(nvm_id_ns_output)['elbafs'] + for elbaf_num, elbaf in enumerate(elbafs): + for lbaf in lbaf_list: + if lbaf['lbaf'] == elbaf_num: + lbaf['guard_pi_bits'] = 16 << elbaf['pif'] + + # For 16b Guard Protection Information, the PI requires 8 bytes + # For 32b and 64b Guard PI, the PI requires 16 bytes + for lbaf in lbaf_list: + if lbaf['guard_pi_bits'] == 16: + lbaf['pi_bytes'] = 8 + else: + lbaf['pi_bytes'] = 16 + + +def get_capabilities(args): + """ + Determine what end-to-end data protection features the device supports. 
+ """ + caps = { 'pil': [], 'pitype': [], 'elba': [] } + id_ns_cmd = f"sudo nvme id-ns --output-format=json {args.dut}".split(' ') + id_ns_output = subprocess.check_output(id_ns_cmd) + id_ns_json = json.loads(id_ns_output) + + mc = id_ns_json['mc'] + if mc & 1: + caps['elba'].append(1) + if mc & 2: + caps['elba'].append(0) + + dpc = id_ns_json['dpc'] + if dpc & 1: + caps['pitype'].append(1) + if dpc & 2: + caps['pitype'].append(2) + if dpc & 4: + caps['pitype'].append(3) + if dpc & 8: + caps['pil'].append(1) + if dpc & 16: + caps['pil'].append(0) + + for _, value in caps.items(): + if len(value) == 0: + logging.error("One or more end-to-end data protection features unsupported: %s", caps) + sys.exit(-1) + + return caps + + +def format_device(args, lbaf, pitype, pil, elba): + """ + Format device using specified lba format with specified pitype, pil, and + elba values. + """ + + format_cmd = f"sudo nvme format {args.dut} --lbaf={lbaf['lbaf']} " \ + f"--pi={pitype} --pil={pil} --ms={elba} --force" + logging.debug("Format command: %s", format_cmd) + format_cmd = format_cmd.split(' ') + format_cmd_result = subprocess.run(format_cmd, capture_output=True, check=False, + encoding=locale.getpreferredencoding()) + + # Sometimes nvme-cli may format the device successfully but fail to + # rescan the namespaces after the format. Continue if this happens but + # abort if some other error occurs. 
+ if format_cmd_result.returncode != 0: + if 'failed to rescan namespaces' not in format_cmd_result.stderr \ + or 'Success formatting namespace' not in format_cmd_result.stdout: + logging.error(format_cmd_result.stdout) + logging.error(format_cmd_result.stderr) + print("Unable to format device; skipping this configuration") + return False + + logging.debug(format_cmd_result.stdout) + return True + + +def parse_args(): + """Parse command-line arguments.""" + + parser = argparse.ArgumentParser() + parser.add_argument('-d', '--debug', help='Enable debug messages', action='store_true') + parser.add_argument('-f', '--fio', help='path to file executable (e.g., ./fio)') + parser.add_argument('-a', '--artifact-root', help='artifact root directory') + parser.add_argument('-s', '--skip', nargs='+', type=int, + help='list of test(s) to skip') + parser.add_argument('-o', '--run-only', nargs='+', type=int, + help='list of test(s) to run, skipping all others') + parser.add_argument('--dut', help='target device to test ' + '(e.g., /dev/nvme1n1). WARNING: THIS IS A DESTRUCTIVE TEST', required=True) + parser.add_argument('-l', '--lbaf', nargs='+', type=int, + help='list of lba formats to test') + args = parser.parse_args() + + return args + + +def difdix_test(test_env, args, lbaf, pitype): + """ + Adjust test arguments based on values of lbaf, and pitype. Then run + the tests. + """ + for test in TEST_LIST: + test['force_skip'] = False + + blocksize = lbaf['ds'] + # Set fio blocksize parameter at runtime + test['fio_opts']['md_per_io_size'] = lbaf['ms'] * test['bs_high'] + + test['fio_opts']['bsrange'] = f"{blocksize * test['bs_low']}-{blocksize * test['bs_high']}" + + # Set fio pi_chk parameter at runtime. If the device is formatted + # with Type 3 protection information, this means that the reference + # tag is not checked and I/O commands may throw an error if they + # are submitted with the REFTAG bit set in pi_chk. 
Make sure fio + # does not set pi_chk's REFTAG bit if the device is formatted with + # Type 3 PI. + if 'pi_chk' in test: + if pitype == 3 and 'REFTAG' in test['pi_chk']: + test['fio_opts']['pi_chk'] = test['pi_chk'].replace('REFTAG','') + logging.debug("Type 3 PI: dropping REFTAG bit") + else: + test['fio_opts']['pi_chk'] = test['pi_chk'] + + logging.debug("Test %d: pi_act=%d, bsrange=%s, md_per_io_size=%d", test['test_id'], + test['fio_opts']['pi_act'], test['fio_opts']['bsrange'], + test['fio_opts']['md_per_io_size']) + + return run_fio_tests(TEST_LIST, test_env, args) + + +def main(): + args = parse_args() + + if args.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + artifact_root = args.artifact_root if args.artifact_root else \ + f"io_uring_pi-test-{time.strftime('%Y%m%d-%H%M%S')}" + os.mkdir(artifact_root) + print(f"Artifact directory is {artifact_root}") + + if args.fio: + fio_path = str(Path(args.fio).absolute()) + else: + fio_path = os.path.join(os.path.dirname(__file__), '../fio') + print(f"fio path is {fio_path}") + + lbaf_list = get_lbafs(args) + get_guard_pi(lbaf_list, args) + caps = get_capabilities(args) + print("Device capabilities:", caps) + + for test in TEST_LIST: + test['fio_opts']['filename'] = args.dut + test_env = { + 'fio_path': fio_path, + 'fio_root': str(Path(__file__).absolute().parent.parent), + 'artifact_root': artifact_root, + 'basename': 'io_uring_pi', + } + + total = { 'passed': 0, 'failed': 0, 'skipped': 0 } + + try: + for lbaf, pil, pitype in itertools.product(lbaf_list, caps['pil'], caps['pitype']): + if lbaf['ms'] == 0: + continue + + print("\n") + print("-" * 120) + print(f"lbaf: {lbaf}, pil: {pil}, pitype: {pitype}") + print("-" * 120) + + if not format_device(args, lbaf, pitype, pil, 0): + print("Formatting failed") + continue + + test_env['artifact_root'] = \ + os.path.join(artifact_root, f"lbaf{lbaf['lbaf']}pil{pil}pitype{pitype}") + os.mkdir(test_env['artifact_root']) + 
+ passed, failed, skipped = difdix_test(test_env, args, lbaf, pitype) + + total['passed'] += passed + total['failed'] += failed + total['skipped'] += skipped + + except KeyboardInterrupt: + pass + + print(f"\n\n{total['passed']} test(s) passed, {total['failed']} failed, " \ + f"{total['skipped']} skipped") + sys.exit(total['failed']) + + +if __name__ == '__main__': + main() diff --git a/t/jobs/t0007-37cf9e3c.fio b/t/jobs/t0007-37cf9e3c.fio index d3c987517d..b2592694e6 100644 --- a/t/jobs/t0007-37cf9e3c.fio +++ b/t/jobs/t0007-37cf9e3c.fio @@ -1,4 +1,7 @@ -# Expected result: fio reads 87040KB of data +# Expected result: fio reads 87040KB of data: +# first read is at offset 0, then 2nd read is at offset 1.5m, then the 3rd +# read is at offset 3m, and after the last read at offset 127m - we have only +# read 87,040K data. # Buggy result: fio reads the full 128MB of data [foo] size=128mb diff --git a/t/jobs/t0012.fio b/t/jobs/t0012.fio index d712396691..e01d2b01b6 100644 --- a/t/jobs/t0012.fio +++ b/t/jobs/t0012.fio @@ -14,6 +14,7 @@ flow_sleep=100 thread log_avg_msec=1000 write_iops_log=t0012.fio +time_based [flow1] flow=1 diff --git a/t/jobs/t0014.fio b/t/jobs/t0014.fio index d9b456516e..eb13478ba5 100644 --- a/t/jobs/t0014.fio +++ b/t/jobs/t0014.fio @@ -17,6 +17,7 @@ flow_id=1 thread log_avg_msec=1000 write_iops_log=t0014.fio +time_based [flow1] flow=1 diff --git a/t/jobs/t0015-4e7e7898.fio b/t/jobs/t0015-4e7e7898.fio new file mode 100644 index 0000000000..c650c0b2e1 --- /dev/null +++ b/t/jobs/t0015-4e7e7898.fio @@ -0,0 +1,7 @@ +# Expected result: mean(slat) + mean(clat) = mean(lat) +# Buggy result: equality does not hold + +[test] +ioengine=libaio +size=1M +iodepth=16 diff --git a/t/jobs/t0016-d54ae22.fio b/t/jobs/t0016-d54ae22.fio new file mode 100644 index 0000000000..1b418e7c81 --- /dev/null +++ b/t/jobs/t0016-d54ae22.fio @@ -0,0 +1,7 @@ +# Expected result: mean(slat) + mean(clat) = mean(lat) +# Buggy result: equality does not hold + +[test] +ioengine=null 
+size=1M +iodepth=16 diff --git a/t/jobs/t0017.fio b/t/jobs/t0017.fio new file mode 100644 index 0000000000..14486d98cd --- /dev/null +++ b/t/jobs/t0017.fio @@ -0,0 +1,9 @@ +# Expected result: mean(slat) + mean(clat) = mean(lat) +# Buggy result: equality does not hold +# This is similar to t0015 and t0016 except that is uses posixaio which is +# available on more platforms and does not have a commit hook + +[test] +ioengine=posixaio +size=1M +iodepth=16 diff --git a/t/jobs/t0018.fio b/t/jobs/t0018.fio new file mode 100644 index 0000000000..e2298b1f97 --- /dev/null +++ b/t/jobs/t0018.fio @@ -0,0 +1,9 @@ +# Expected result: job completes without error +# Buggy result: job fails + +[test] +ioengine=io_uring +filesize=256K +time_based +runtime=3s +rw=randrw diff --git a/t/jobs/t0019.fio b/t/jobs/t0019.fio new file mode 100644 index 0000000000..b60d27d2ea --- /dev/null +++ b/t/jobs/t0019.fio @@ -0,0 +1,10 @@ +# Expected result: offsets are accessed sequentially and all offsets are read +# Buggy result: offsets are not accessed sequentially and one or more offsets are missed +# run with --debug=io or logging to see which offsets are accessed + +[test] +ioengine=null +filesize=1M +write_bw_log=test +per_job_logs=0 +log_offset=1 diff --git a/t/jobs/t0020.fio b/t/jobs/t0020.fio new file mode 100644 index 0000000000..1c1c5166fc --- /dev/null +++ b/t/jobs/t0020.fio @@ -0,0 +1,11 @@ +# Expected result: offsets are not accessed sequentially and all offsets are touched +# Buggy result: offsets are accessed sequentially and one or more offsets are missed +# run with --debug=io or logging to see which offsets are read + +[test] +ioengine=null +filesize=1M +rw=randread +write_bw_log=test +per_job_logs=0 +log_offset=1 diff --git a/t/jobs/t0021.fio b/t/jobs/t0021.fio new file mode 100644 index 0000000000..47fbae71eb --- /dev/null +++ b/t/jobs/t0021.fio @@ -0,0 +1,15 @@ +# make sure the lfsr random generator actually does touch all the offsets +# +# Expected result: offsets are not 
accessed sequentially and all offsets are touched +# Buggy result: offsets are accessed sequentially and one or more offsets are missed +# run with --debug=io or logging to see which offsets are read + +[test] +ioengine=null +filesize=1M +rw=randread +write_bw_log=test +per_job_logs=0 +log_offset=1 +norandommap=1 +random_generator=lfsr diff --git a/t/jobs/t0022.fio b/t/jobs/t0022.fio new file mode 100644 index 0000000000..2324571e9a --- /dev/null +++ b/t/jobs/t0022.fio @@ -0,0 +1,13 @@ +# make sure that when we enable norandommap we touch some offsets more than once +# +# Expected result: at least one offset is touched more than once +# Buggy result: each offset is touched only once + +[test] +ioengine=null +filesize=1M +rw=randread +write_bw_log=test +per_job_logs=0 +log_offset=1 +norandommap=1 diff --git a/t/jobs/t0023.fio b/t/jobs/t0023.fio new file mode 100644 index 0000000000..8e14a110bc --- /dev/null +++ b/t/jobs/t0023.fio @@ -0,0 +1,63 @@ +# randtrimwrite data direction tests +[global] +filesize=1M +ioengine=null +rw=randtrimwrite +log_offset=1 +per_job_logs=0 +randrepeat=0 +write_bw_log + +# Expected result: trim issued to random offset followed by write to same offset +# all offsets touched +# block sizes match +# Buggy result: something else +[basic] + +# Expected result: trim issued to random offset followed by write to same offset +# all offsets trimmed +# block sizes 8k for both write and trim +# Buggy result: something else +[bs] +bs=8k,8k,8k + +# Expected result: trim issued to random offset followed by write to same offset +# all offsets trimmed +# block sizes match +# Buggy result: something else +[bsrange] +bsrange=512-4k + +# Expected result: trim issued to random offset followed by write to same offset +# all offsets trimmed +# block sizes match +# Buggy result: something else +[bssplit] +bssplit=512/25:1k/:2k/:4k/ + +# Expected result: trim issued to random offset followed by write to same offset +# block sizes match +# Buggy result: something 
else +[basic_no_rm] +norandommap=1 + +# Expected result: trim issued to random offset followed by write to same offset +# block sizes 8k for both write and trim +# Buggy result: something else +[bs_no_rm] +bs=4k,4k,8k +norandommap=1 + +# Expected result: trim issued to random offset followed by write to same offset +# block sizes match +# Buggy result: something else +[bsrange_no_rm] +bsrange=512-4k +norandommap=1 + +# Expected result: trim issued to random offset followed by write to same offset +# block sizes match +# Buggy result: something else +[bssplit_no_rm] +bssplit=512/25:1k/:2k/:4k/ +norandommap=1 diff --git a/t/jobs/t0024.fio b/t/jobs/t0024.fio new file mode 100644 index 0000000000..2b3dc94c96 --- /dev/null +++ b/t/jobs/t0024.fio @@ -0,0 +1,36 @@ +# trimwrite data direction tests +[global] +filesize=1M +ioengine=null +rw=trimwrite +log_offset=1 +per_job_logs=0 +randrepeat=0 +write_bw_log + +# Expected result: trim issued to sequential offsets followed by write to same offset +# all offsets touched +# block sizes match +# Buggy result: something else +[basic] + +# Expected result: trim issued to sequential offsets followed by write to same offset +# all offsets trimmed +# block sizes 8k for both write and trim +# Buggy result: something else +[bs] +bs=8k,8k,8k + +# Expected result: trim issued to sequential offsets followed by write to same offset +# all offsets trimmed +# block sizes match +# Buggy result: something else +[bsrange] +bsrange=512-4k + +# Expected result: trim issued to sequential offsets followed by write to same offset +# all offsets trimmed +# block sizes match +# Buggy result: something else +[bssplit] +bssplit=512/25:1k/:2k/:4k/ diff --git a/t/jobs/t0025.fio b/t/jobs/t0025.fio new file mode 100644 index 0000000000..29b5fe80d8 --- /dev/null +++ b/t/jobs/t0025.fio @@ -0,0 +1,7 @@ +[job] +filename=t0025file +size=128k +readwrite=write +do_verify=1 +verify=md5 +experimental_verify=1 diff --git a/t/jobs/t0026.fio b/t/jobs/t0026.fio new file 
mode 100644 index 0000000000..ee89b14057 --- /dev/null +++ b/t/jobs/t0026.fio @@ -0,0 +1,19 @@ +[job1] +filename=t0026file +size=1M +readwrite=randwrite +loops=8 +do_verify=1 +verify=md5 +experimental_verify=1 + +[job2] +stonewall=1 +filename=t0026file +size=1M +readwrite=randrw +time_based +runtime=5 +do_verify=1 +verify=md5 +experimental_verify=1 diff --git a/t/jobs/t0027.fio b/t/jobs/t0027.fio new file mode 100644 index 0000000000..b5b97a30d3 --- /dev/null +++ b/t/jobs/t0027.fio @@ -0,0 +1,14 @@ +[global] +filename=t0027file +size=16k +bs=16k + +[write_job] +readwrite=write +buffer_pattern='t0027.pattern' + +[read_job] +stonewall=1 +readwrite=read +verify=pattern +verify_pattern='t0027.pattern' diff --git a/t/jobs/t0028-c6cade16.fio b/t/jobs/t0028-c6cade16.fio new file mode 100644 index 0000000000..a0096d8026 --- /dev/null +++ b/t/jobs/t0028-c6cade16.fio @@ -0,0 +1,5 @@ +[test] +size=16k +readwrite=write +buffer_pattern="abcd"-120xdeadface +ioengine=null diff --git a/t/jobs/t0029.fio b/t/jobs/t0029.fio new file mode 100644 index 0000000000..481de6f316 --- /dev/null +++ b/t/jobs/t0029.fio @@ -0,0 +1,14 @@ +[global] +filename=t0029file +size=4k +verify=md5 + +[write] +rw=write +do_verify=0 + +[read] +stonewall=1 +rw=read +loops=2 +do_verify=1 diff --git a/t/jobs/t0030.fio b/t/jobs/t0030.fio new file mode 100644 index 0000000000..8bbc810e73 --- /dev/null +++ b/t/jobs/t0030.fio @@ -0,0 +1,10 @@ +# run with --bandwidth-log +# broken behavior: seg fault +# successful behavior: test runs to completion with 0 as the exit code + +[test] +ioengine=null +filesize=1T +rw=read +time_based +runtime=2s diff --git a/t/jobs/t0031-pre.fio b/t/jobs/t0031-pre.fio new file mode 100644 index 0000000000..ce4ee3b691 --- /dev/null +++ b/t/jobs/t0031-pre.fio @@ -0,0 +1,8 @@ +[job] +rw=write +ioengine=libaio +size=1mb +time_based=1 +runtime=1 +filename=t0030file +write_iolog=iolog diff --git a/t/jobs/t0031.fio b/t/jobs/t0031.fio new file mode 100644 index 0000000000..ae8f74428d --- 
/dev/null +++ b/t/jobs/t0031.fio @@ -0,0 +1,7 @@ +[job] +rw=read +ioengine=libaio +iodepth=128 +filename=t0030file +read_iolog=iolog +write_lat_log=lat_log diff --git a/t/jobs/t0032-43063a1c.fio b/t/jobs/t0032-43063a1c.fio new file mode 100644 index 0000000000..db998e5bec --- /dev/null +++ b/t/jobs/t0032-43063a1c.fio @@ -0,0 +1,12 @@ +# Expected results: max offset is ~1280K +# Buggy result: max offset is ~640K +# + +[global] +ioengine=null +size=1280K +io_size=2560k +bs=128K + +[test1] +rw=rw diff --git a/t/jobs/t0033.fio b/t/jobs/t0033.fio new file mode 100644 index 0000000000..156bdadc6b --- /dev/null +++ b/t/jobs/t0033.fio @@ -0,0 +1,28 @@ +[global] +rw=read +filename=t0033file +size=8k +time_based +runtime=2s +ioengine=libaio +iodepth=1 + +[job1] +write_bw_log=log +log_prio=1 + +[job2] +write_lat_log=log +log_avg_msec=100 +log_window_value=both + +[job3] +write_iops_log=log +log_offset=1 +log_prio=1 + +[job4] +write_iops_log=log +log_avg_msec=100 +log_window_value=both +log_offset=1 diff --git a/t/jobs/t0034.fio b/t/jobs/t0034.fio new file mode 100644 index 0000000000..2b6c4b2d21 --- /dev/null +++ b/t/jobs/t0034.fio @@ -0,0 +1,27 @@ +[global] +rw=read +filename=t0034file +size=8k +time_based +runtime=2s +ioengine=libaio +iodepth=1 + +[job1] +write_lat_log=log +log_offset=1 +log_issue_time=1 + +[job2] +write_lat_log=log +log_offset=1 +log_issue_time=1 +log_avg_msec=100 +log_window_value=both + +[job3] +write_lat_log=log +write_bw_log=log +write_iops_log=log +log_offset=1 +log_issue_time=1 diff --git a/t/jobs/t0035.fio b/t/jobs/t0035.fio new file mode 100644 index 0000000000..fd3488f9fc --- /dev/null +++ b/t/jobs/t0035.fio @@ -0,0 +1,27 @@ +[global] +size=4k +write_lat_log=log + +[job1] +ioengine=filecreate +filename=t0035file1 + +[job2] +ioengine=filestat +filename=t0035file2 + +[job3] +ioengine=filedelete +filename=t0035file3 + +[job4] +ioengine=dircreate +filename=t0035dir1 + +[job5] +ioengine=dirstat +filename=t0035dir2 + +[job6] +ioengine=dirdelete 
+filename=t0035dir3 diff --git a/t/jobs/t0036-post.fio b/t/jobs/t0036-post.fio new file mode 100644 index 0000000000..13653c6502 --- /dev/null +++ b/t/jobs/t0036-post.fio @@ -0,0 +1,8 @@ +[global] +filename=t0036file +verify=md5 +size=512k + +[job] +readwrite=read +verify_state_load=1 diff --git a/t/jobs/t0036-pre.fio b/t/jobs/t0036-pre.fio new file mode 100644 index 0000000000..ae94c88f32 --- /dev/null +++ b/t/jobs/t0036-pre.fio @@ -0,0 +1,8 @@ +[global] +filename=t0036file +verify=md5 +size=512k + +[job] +readwrite=write +verify_state_save=1 diff --git a/t/jobs/t0037-post.fio b/t/jobs/t0037-post.fio new file mode 100644 index 0000000000..90ccc0c83c --- /dev/null +++ b/t/jobs/t0037-post.fio @@ -0,0 +1,12 @@ +[global] +filename=t0037file.1:t0037file.2:t0037file.3:t0037file.4 +verify=md5 +ioengine=libaio +iodepth=32 +size=512k +loops=4 + +[job] +readwrite=read +verify_state_load=1 + diff --git a/t/jobs/t0037-pre.fio b/t/jobs/t0037-pre.fio new file mode 100644 index 0000000000..4a5e30dca5 --- /dev/null +++ b/t/jobs/t0037-pre.fio @@ -0,0 +1,12 @@ +[global] +filename=t0037file.1:t0037file.2:t0037file.3:t0037file.4 +verify=md5 +ioengine=libaio +iodepth=32 +size=512k +loops=4 + +[job] +readwrite=write +verify_state_save=1 + diff --git a/t/latency_percentiles.py b/t/latency_percentiles.py index cc4374262e..81704700d4 100755 --- a/t/latency_percentiles.py +++ b/t/latency_percentiles.py @@ -80,6 +80,7 @@ import argparse import platform import subprocess +from collections import Counter from pathlib import Path @@ -125,7 +126,8 @@ def run_fio(self, fio_path): "--output-format={output-format}".format(**self.test_options), ] for opt in ['slat_percentiles', 'clat_percentiles', 'lat_percentiles', - 'unified_rw_reporting', 'fsync', 'fdatasync', 'numjobs', 'cmdprio_percentage']: + 'unified_rw_reporting', 'fsync', 'fdatasync', 'numjobs', + 'cmdprio_percentage', 'bssplit', 'cmdprio_bssplit']: if opt in self.test_options: option = '--{0}={{{0}}}'.format(opt) 
fio_args.append(option.format(**self.test_options)) @@ -268,7 +270,7 @@ def check_latencies(self, jsondata, ddir, slat=True, clat=True, tlat=True, plus= # # Check only for the presence/absence of json+ # latency bins. Future work can check the - # accurracy of the bin values and counts. + # accuracy of the bin values and counts. # # Because the latency percentiles are based on # the bins, we can be confident that the bin @@ -363,20 +365,19 @@ def check_empty(job): def check_nocmdprio_lat(self, job): """ - Make sure no high/low priority latencies appear. + Make sure no per priority latencies appear. job JSON object to check """ for ddir in ['read', 'write', 'trim']: if ddir in job: - if 'lat_high_prio' in job[ddir] or 'lat_low_prio' in job[ddir] or \ - 'clat_high_prio' in job[ddir] or 'clat_low_prio' in job[ddir]: - print("Unexpected high/low priority latencies found in %s output" % ddir) + if 'prios' in job[ddir]: + print("Unexpected per priority latencies found in %s output" % ddir) return False if self.debug: - print("No high/low priority latencies found") + print("No per priority latencies found") return True @@ -497,7 +498,7 @@ def check_terse(self, terse, jsondata): return retval def check_prio_latencies(self, jsondata, clat=True, plus=False): - """Check consistency of high/low priority latencies. + """Check consistency of per priority latencies. 
clat True if we should check clat data; other check lat data plus True if we have json+ format data where additional checks can @@ -506,78 +507,78 @@ def check_prio_latencies(self, jsondata, clat=True, plus=False): """ if clat: - high = 'clat_high_prio' - low = 'clat_low_prio' - combined = 'clat_ns' + obj = combined = 'clat_ns' else: - high = 'lat_high_prio' - low = 'lat_low_prio' - combined = 'lat_ns' + obj = combined = 'lat_ns' - if not high in jsondata or not low in jsondata or not combined in jsondata: - print("Error identifying high/low priority latencies") + if not 'prios' in jsondata or not combined in jsondata: + print("Error identifying per priority latencies") return False - if jsondata[high]['N'] + jsondata[low]['N'] != jsondata[combined]['N']: - print("High %d + low %d != combined sample size %d" % \ - (jsondata[high]['N'], jsondata[low]['N'], jsondata[combined]['N'])) + sum_sample_size = sum([x[obj]['N'] for x in jsondata['prios']]) + if sum_sample_size != jsondata[combined]['N']: + print("Per prio sample size sum %d != combined sample size %d" % + (sum_sample_size, jsondata[combined]['N'])) return False elif self.debug: - print("High %d + low %d == combined sample size %d" % \ - (jsondata[high]['N'], jsondata[low]['N'], jsondata[combined]['N'])) + print("Per prio sample size sum %d == combined sample size %d" % + (sum_sample_size, jsondata[combined]['N'])) - if min(jsondata[high]['min'], jsondata[low]['min']) != jsondata[combined]['min']: - print("Min of high %d, low %d min latencies does not match min %d from combined data" % \ - (jsondata[high]['min'], jsondata[low]['min'], jsondata[combined]['min'])) + min_val = min([x[obj]['min'] for x in jsondata['prios']]) + if min_val != jsondata[combined]['min']: + print("Min per prio min latency %d does not match min %d from combined data" % + (min_val, jsondata[combined]['min'])) return False elif self.debug: - print("Min of high %d, low %d min latencies matches min %d from combined data" % \ - 
(jsondata[high]['min'], jsondata[low]['min'], jsondata[combined]['min'])) + print("Min per prio min latency %d matches min %d from combined data" % + (min_val, jsondata[combined]['min'])) - if max(jsondata[high]['max'], jsondata[low]['max']) != jsondata[combined]['max']: - print("Max of high %d, low %d max latencies does not match max %d from combined data" % \ - (jsondata[high]['max'], jsondata[low]['max'], jsondata[combined]['max'])) + max_val = max([x[obj]['max'] for x in jsondata['prios']]) + if max_val != jsondata[combined]['max']: + print("Max per prio max latency %d does not match max %d from combined data" % + (max_val, jsondata[combined]['max'])) return False elif self.debug: - print("Max of high %d, low %d max latencies matches max %d from combined data" % \ - (jsondata[high]['max'], jsondata[low]['max'], jsondata[combined]['max'])) + print("Max per prio max latency %d matches max %d from combined data" % + (max_val, jsondata[combined]['max'])) - weighted_avg = (jsondata[high]['mean'] * jsondata[high]['N'] + \ - jsondata[low]['mean'] * jsondata[low]['N']) / jsondata[combined]['N'] + weighted_vals = [x[obj]['mean'] * x[obj]['N'] for x in jsondata['prios']] + weighted_avg = sum(weighted_vals) / jsondata[combined]['N'] delta = abs(weighted_avg - jsondata[combined]['mean']) if (delta / jsondata[combined]['mean']) > 0.0001: - print("Difference between weighted average %f of high, low means " + print("Difference between merged per prio weighted average %f mean " "and actual mean %f exceeds 0.01%%" % (weighted_avg, jsondata[combined]['mean'])) return False elif self.debug: - print("Weighted average %f of high, low means matches actual mean %f" % \ - (weighted_avg, jsondata[combined]['mean'])) + print("Merged per prio weighted average %f mean matches actual mean %f" % + (weighted_avg, jsondata[combined]['mean'])) if plus: - if not self.check_jsonplus(jsondata[high]): - return False - if not self.check_jsonplus(jsondata[low]): - return False + for prio in 
jsondata['prios']: + if not self.check_jsonplus(prio[obj]): + return False - bins = {**jsondata[high]['bins'], **jsondata[low]['bins']} - for duration in bins.keys(): - if duration in jsondata[high]['bins'] and duration in jsondata[low]['bins']: - bins[duration] = jsondata[high]['bins'][duration] + \ - jsondata[low]['bins'][duration] + counter = Counter() + for prio in jsondata['prios']: + counter.update(prio[obj]['bins']) + + bins = dict(counter) if len(bins) != len(jsondata[combined]['bins']): - print("Number of combined high/low bins does not match number of overall bins") + print("Number of merged bins %d does not match number of overall bins %d" % + (len(bins), len(jsondata[combined]['bins']))) return False elif self.debug: - print("Number of bins from merged high/low data matches number of overall bins") + print("Number of merged bins %d matches number of overall bins %d" % + (len(bins), len(jsondata[combined]['bins']))) for duration in bins.keys(): if bins[duration] != jsondata[combined]['bins'][duration]: - print("Merged high/low count does not match overall count for duration %d" \ - % duration) + print("Merged per prio count does not match overall count for duration %d" % + duration) return False - print("Merged high/low priority latency data match combined latency data") + print("Merged per priority latency data match combined latency data") return True def check(self): @@ -602,7 +603,7 @@ def check(self): print("Unexpected trim data found in output") retval = False if not self.check_nocmdprio_lat(job): - print("Unexpected high/low priority latencies found") + print("Unexpected per priority latencies found") retval = False retval &= self.check_latencies(job['read'], 0, slat=False) @@ -626,7 +627,7 @@ def check(self): print("Unexpected trim data found in output") retval = False if not self.check_nocmdprio_lat(job): - print("Unexpected high/low priority latencies found") + print("Unexpected per priority latencies found") retval = False retval &= 
self.check_latencies(job['write'], 1, slat=False, clat=False) @@ -650,7 +651,7 @@ def check(self): print("Unexpected write data found in output") retval = False if not self.check_nocmdprio_lat(job): - print("Unexpected high/low priority latencies found") + print("Unexpected per priority latencies found") retval = False retval &= self.check_latencies(job['trim'], 2, slat=False, tlat=False) @@ -674,7 +675,7 @@ def check(self): print("Unexpected trim data found in output") retval = False if not self.check_nocmdprio_lat(job): - print("Unexpected high/low priority latencies found") + print("Unexpected per priority latencies found") retval = False retval &= self.check_latencies(job['read'], 0, plus=True) @@ -698,7 +699,7 @@ def check(self): print("Unexpected trim data found in output") retval = False if not self.check_nocmdprio_lat(job): - print("Unexpected high/low priority latencies found") + print("Unexpected per priority latencies found") retval = False retval &= self.check_latencies(job['write'], 1, slat=False, plus=True) @@ -722,7 +723,7 @@ def check(self): print("Unexpected trim data found in output") retval = False if not self.check_nocmdprio_lat(job): - print("Unexpected high/low priority latencies found") + print("Unexpected per priority latencies found") retval = False retval &= self.check_latencies(job['read'], 0, slat=False, tlat=False, plus=True) @@ -743,7 +744,7 @@ def check(self): print("Unexpected trim data found in output") retval = False if not self.check_nocmdprio_lat(job): - print("Unexpected high/low priority latencies found") + print("Unexpected per priority latencies found") retval = False retval &= self.check_latencies(job['read'], 0, clat=False, tlat=False, plus=True) @@ -761,11 +762,11 @@ def check(self): job = self.json_data['jobs'][0] retval = True - if 'read' in job or 'write'in job or 'trim' in job: + if 'read' in job or 'write' in job or 'trim' in job: print("Unexpected data direction found in fio output") retval = False if not 
self.check_nocmdprio_lat(job): - print("Unexpected high/low priority latencies found") + print("Unexpected per priority latencies found") retval = False retval &= self.check_latencies(job['mixed'], 0, plus=True, unified=True) @@ -792,7 +793,7 @@ def check(self): print("Error checking fsync latency data") retval = False if not self.check_nocmdprio_lat(job): - print("Unexpected high/low priority latencies found") + print("Unexpected per priority latencies found") retval = False retval &= self.check_latencies(job['write'], 1, slat=False, plus=True) @@ -813,7 +814,7 @@ def check(self): print("Unexpected trim data found in output") retval = False if not self.check_nocmdprio_lat(job): - print("Unexpected high/low priority latencies found") + print("Unexpected per priority latencies found") retval = False retval &= self.check_latencies(job['read'], 0, plus=True) @@ -839,7 +840,7 @@ def check(self): print("Unexpected trim data found in output") retval = False if not self.check_nocmdprio_lat(job): - print("Unexpected high/low priority latencies found") + print("Unexpected per priority latencies found") retval = False retval &= self.check_latencies(job['read'], 0, slat=False, clat=False, plus=True) @@ -953,7 +954,7 @@ def check(self): job = self.json_data['jobs'][0] retval = True - if 'read' in job or 'write'in job or 'trim' in job: + if 'read' in job or 'write' in job or 'trim' in job: print("Unexpected data direction found in fio output") retval = False @@ -963,6 +964,27 @@ def check(self): return retval +class Test021(FioLatTest): + """Test object for Test 21.""" + + def check(self): + """Check Test 21 output.""" + + job = self.json_data['jobs'][0] + + retval = True + if not self.check_empty(job['trim']): + print("Unexpected trim data found in output") + retval = False + + retval &= self.check_latencies(job['read'], 0, slat=False, tlat=False, plus=True) + retval &= self.check_latencies(job['write'], 1, slat=False, tlat=False, plus=True) + retval &= 
self.check_prio_latencies(job['read'], clat=True, plus=True) + retval &= self.check_prio_latencies(job['write'], clat=True, plus=True) + + return retval + + def parse_args(): """Parse command-line arguments.""" @@ -1007,7 +1029,7 @@ def main(): # randread, null # enable slat, clat, lat # only clat and lat will appear because - # because the null ioengine is syncrhonous + # because the null ioengine is synchronous "test_id": 1, "runtime": 2, "output-format": "json", @@ -1047,7 +1069,7 @@ def main(): { # randread, aio # enable slat, clat, lat - # all will appear because liaio is asynchronous + # all will appear because libaio is asynchronous "test_id": 4, "runtime": 5, "output-format": "json+", @@ -1153,9 +1175,9 @@ def main(): # randread, null # enable slat, clat, lat # only clat and lat will appear because - # because the null ioengine is syncrhonous - # same as Test 1 except - # numjobs = 4 to test sum_thread_stats() changes + # because the null ioengine is synchronous + # same as Test 1 except add numjobs = 4 to test + # sum_thread_stats() changes "test_id": 12, "runtime": 2, "output-format": "json", @@ -1170,9 +1192,9 @@ def main(): { # randread, aio # enable slat, clat, lat - # all will appear because liaio is asynchronous - # same as Test 4 except - # numjobs = 4 to test sum_thread_stats() changes + # all will appear because libaio is asynchronous + # same as Test 4 except add numjobs = 4 to test + # sum_thread_stats() changes "test_id": 13, "runtime": 5, "output-format": "json+", @@ -1187,8 +1209,8 @@ def main(): { # 50/50 r/w, aio, unified_rw_reporting # enable slat, clat, lata - # same as Test 8 except - # numjobs = 4 to test sum_thread_stats() changes + # same as Test 8 except add numjobs = 4 to test + # sum_thread_stats() changes "test_id": 14, "runtime": 5, "output-format": "json+", @@ -1204,7 +1226,7 @@ def main(): { # randread, aio # enable slat, clat, lat - # all will appear because liaio is asynchronous + # all will appear because libaio is 
asynchronous # same as Test 4 except add cmdprio_percentage "test_id": 15, "runtime": 5, @@ -1278,8 +1300,8 @@ def main(): { # 50/50 r/w, aio, unified_rw_reporting # enable slat, clat, lat - # same as Test 19 except - # add numjobs = 4 to test sum_thread_stats() changes + # same as Test 19 except add numjobs = 4 to test + # sum_thread_stats() changes "test_id": 20, "runtime": 5, "output-format": "json+", @@ -1293,6 +1315,40 @@ def main(): 'numjobs': 4, "test_obj": Test019, }, + { + # r/w, aio + # enable only clat + # test bssplit and cmdprio_bssplit + "test_id": 21, + "runtime": 5, + "output-format": "json+", + "slat_percentiles": 0, + "clat_percentiles": 1, + "lat_percentiles": 0, + "ioengine": aio, + 'rw': 'randrw', + 'bssplit': '64k/40:1024k/60', + 'cmdprio_bssplit': '64k/25/1/1:64k/75/3/2:1024k/0', + "test_obj": Test021, + }, + { + # r/w, aio + # enable only clat + # same as Test 21 except add numjobs = 4 to test + # sum_thread_stats() changes + "test_id": 22, + "runtime": 5, + "output-format": "json+", + "slat_percentiles": 0, + "clat_percentiles": 1, + "lat_percentiles": 0, + "ioengine": aio, + 'rw': 'randrw', + 'bssplit': '64k/40:1024k/60', + 'cmdprio_bssplit': '64k/25/1/1:64k/75/3/2:1024k/0', + 'numjobs': 4, + "test_obj": Test021, + }, ] passed = 0 @@ -1304,9 +1360,10 @@ def main(): (args.run_only and test['test_id'] not in args.run_only): skipped = skipped + 1 outcome = 'SKIPPED (User request)' - elif (platform.system() != 'Linux' or os.geteuid() != 0) and 'cmdprio_percentage' in test: + elif (platform.system() != 'Linux' or os.geteuid() != 0) and \ + ('cmdprio_percentage' in test or 'cmdprio_bssplit' in test): skipped = skipped + 1 - outcome = 'SKIPPED (Linux root required for cmdprio_percentage tests)' + outcome = 'SKIPPED (Linux root required for cmdprio tests)' else: test_obj = test['test_obj'](artifact_root, test, args.debug) status = test_obj.run_fio(fio) diff --git a/t/lfsr-test.c b/t/lfsr-test.c index 279e07f0ec..632de38313 100644 --- 
a/t/lfsr-test.c +++ b/t/lfsr-test.c @@ -41,11 +41,11 @@ int main(int argc, char *argv[]) switch (argc) { case 5: if (strncmp(argv[4], "verify", 7) == 0) verify = 1; - fallthrough; + fio_fallthrough; case 4: spin = atoi(argv[3]); - fallthrough; + fio_fallthrough; case 3: seed = atol(argv[2]); - fallthrough; + fio_fallthrough; case 2: numbers = strtol(argv[1], NULL, 16); break; default: usage(); @@ -78,8 +78,7 @@ int main(int argc, char *argv[]) /* Create verification table */ if (verify) { v_size = numbers * sizeof(uint8_t); - v = malloc(v_size); - memset(v, 0, v_size); + v = calloc(1, v_size); printf("\nVerification table is %lf KiB\n", (double)(v_size) / 1024); } v_start = v; diff --git a/t/log_compression.py b/t/log_compression.py new file mode 100755 index 0000000000..94c92db797 --- /dev/null +++ b/t/log_compression.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +# +# log_compression.py +# +# Test log_compression and log_store_compressed. Uses null ioengine. +# Previous bugs have caused output in per I/O log files to be missing +# and/or out of order +# +# Expected result: 8000 log entries, offset starting at 0 and increasing by bs +# Buggy result: Log entries out of order (usually without log_store_compressed) +# and/or missing log entries (usually with log_store_compressed) +# +# USAGE +# python log_compression.py [-f fio-executable] +# +# EXAMPLES +# python t/log_compression.py +# python t/log_compression.py -f ./fio +# +# REQUIREMENTS +# Python 3.5+ +# +# ===TEST MATRIX=== +# +# With log_compression=10K +# With log_store_compressed=1 and log_compression=10K + +import os +import sys +import platform +import argparse +import subprocess + + +def parse_args(): + """Parse command-line arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument('-f', '--fio', + help='path to fio executable (e.g., ./fio)') + return parser.parse_args() + + +def run_fio(fio,log_store_compressed): + fio_args = [ + '--name=job', + '--ioengine=null', + '--filesize=1000M', + 
'--bs=128K', + '--rw=write', + '--iodepth=1', + '--write_bw_log=test', + '--per_job_logs=0', + '--log_offset=1', + '--log_compression=10K', + ] + if log_store_compressed: + fio_args.append('--log_store_compressed=1') + + subprocess.check_output([fio] + fio_args) + + if log_store_compressed: + fio_inflate_args = [ + '--inflate-log=test_bw.log.fz' + ] + with open('test_bw.from_fz.log','wt') as f: + subprocess.check_call([fio]+fio_inflate_args,stdout=f) + +def check_log_file(log_store_compressed): + filename = 'test_bw.from_fz.log' if log_store_compressed else 'test_bw.log' + with open(filename,'rt') as f: + file_data = f.read() + log_lines = [x for x in file_data.split('\n') if len(x.strip())!=0] + log_ios = len(log_lines) + + filesize = 1000*1024*1024 + bs = 128*1024 + ios = filesize//bs + if log_ios!=ios: + print('wrong number of ios ({}) in log; should be {}'.format(log_ios,ios)) + return False + + expected_offset = 0 + for line_number,line in enumerate(log_lines): + log_offset = int(line.split(',')[4]) + if log_offset != expected_offset: + print('wrong offset ({}) for io number {} in log; should be {}'.format( + log_offset, line_number, expected_offset)) + return False + expected_offset += bs + return True + +def main(): + """Entry point for this script.""" + args = parse_args() + if args.fio: + fio_path = args.fio + else: + fio_path = os.path.join(os.path.dirname(__file__), '../fio') + if not os.path.exists(fio_path): + fio_path = 'fio' + print("fio path is", fio_path) + + passed_count = 0 + failed_count = 0 + for log_store_compressed in [False, True]: + run_fio(fio_path, log_store_compressed) + passed = check_log_file(log_store_compressed) + print('Test with log_store_compressed={} {}'.format(log_store_compressed, + 'PASSED' if passed else 'FAILED')) + if passed: + passed_count+=1 + else: + failed_count+=1 + + print('{} tests passed, {} failed'.format(passed_count, failed_count)) + + sys.exit(failed_count) + +if __name__ == '__main__': + main() + diff --git 
a/t/nvmept.py b/t/nvmept.py new file mode 100755 index 0000000000..3d90f4bfcd --- /dev/null +++ b/t/nvmept.py @@ -0,0 +1,399 @@ +#!/usr/bin/env python3 +""" +# nvmept.py +# +# Test fio's io_uring_cmd ioengine with NVMe pass-through commands. +# +# USAGE +# see python3 nvmept.py --help +# +# EXAMPLES +# python3 t/nvmept.py --dut /dev/ng0n1 +# python3 t/nvmept.py --dut /dev/ng1n1 -f ./fio +# +# REQUIREMENTS +# Python 3.6 +# +""" +import os +import sys +import time +import argparse +from pathlib import Path +from fiotestlib import FioJobCmdTest, run_fio_tests + + +class PassThruTest(FioJobCmdTest): + """ + NVMe pass-through test class. Check to make sure output for selected data + direction(s) is non-zero and that zero data appears for other directions. + """ + + def setup(self, parameters): + """Setup a test.""" + + fio_args = [ + "--name=nvmept", + "--ioengine=io_uring_cmd", + "--cmd_type=nvme", + "--iodepth=8", + "--iodepth_batch=4", + "--iodepth_batch_complete=4", + f"--filename={self.fio_opts['filename']}", + f"--rw={self.fio_opts['rw']}", + f"--output={self.filenames['output']}", + f"--output-format={self.fio_opts['output-format']}", + ] + for opt in ['fixedbufs', 'nonvectored', 'force_async', 'registerfiles', + 'sqthread_poll', 'sqthread_poll_cpu', 'hipri', 'nowait', + 'time_based', 'runtime', 'verify', 'io_size']: + if opt in self.fio_opts: + option = f"--{opt}={self.fio_opts[opt]}" + fio_args.append(option) + + super().setup(fio_args) + + + def check_result(self): + super().check_result() + + if 'rw' not in self.fio_opts: + return + + if not self.passed: + return + + job = self.json_data['jobs'][0] + + if self.fio_opts['rw'] in ['read', 'randread']: + self.passed = self.check_all_ddirs(['read'], job) + elif self.fio_opts['rw'] in ['write', 'randwrite']: + if 'verify' not in self.fio_opts: + self.passed = self.check_all_ddirs(['write'], job) + else: + self.passed = self.check_all_ddirs(['read', 'write'], job) + elif self.fio_opts['rw'] in ['trim', 'randtrim']: 
+ self.passed = self.check_all_ddirs(['trim'], job) + elif self.fio_opts['rw'] in ['readwrite', 'randrw']: + self.passed = self.check_all_ddirs(['read', 'write'], job) + elif self.fio_opts['rw'] in ['trimwrite', 'randtrimwrite']: + self.passed = self.check_all_ddirs(['trim', 'write'], job) + else: + print(f"Unhandled rw value {self.fio_opts['rw']}") + self.passed = False + + if job['iodepth_level']['8'] < 95: + print("Did not achieve requested iodepth") + self.passed = False + + +class FlushTest(FioJobCmdTest): + def setup(self, parameters): + fio_args = [ + "--name=nvmept-flush", + "--ioengine=io_uring_cmd", + "--cmd_type=nvme", + "--randrepeat=0", + f"--filename={self.fio_opts['filename']}", + f"--rw={self.fio_opts['rw']}", + f"--output={self.filenames['output']}", + f"--output-format={self.fio_opts['output-format']}", + ] + + for opt in ['fixedbufs', 'nonvectored', 'force_async', 'registerfiles', + 'sqthread_poll', 'sqthread_poll_cpu', 'hipri', 'nowait', + 'time_based', 'runtime', 'verify', 'io_size', 'num_range', + 'iodepth', 'iodepth_batch', 'iodepth_batch_complete', + 'size', 'rate', 'bs', 'bssplit', 'bsrange', 'randrepeat', + 'buffer_pattern', 'verify_pattern', 'offset', 'fdp', + 'fdp_pli', 'fdp_pli_select', 'dataplacement', 'plid_select', + 'plids', 'dp_scheme', 'number_ios', 'read_iolog', 'fsync']: + if opt in self.fio_opts: + option = f"--{opt}={self.fio_opts[opt]}" + fio_args.append(option) + + super().setup(fio_args) + + def check_result(self): + super().check_result() + + job = self.json_data['jobs'][0] + + rw = self.fio_opts['rw'] + fsync = self.fio_opts['fsync'] + + nr_write = job['write']['total_ios'] + nr_sync = job['sync']['total_ios'] + + nr_sync_exp = nr_write // fsync + + # The actual number of DDIR_SYNC issued might miss one DDIR_SYNC command + # when the last command issued was DDIR_WRITE command. 
+ if not ((nr_sync == nr_sync_exp) or (nr_sync + 1 == nr_sync_exp)): + print(f"nr_write={nr_write}, nr_sync={nr_sync}, fsync={fsync}") # logging is not imported in this file + self.passed = False + + +TEST_LIST = [ + { + "test_id": 1, + "fio_opts": { + "rw": 'read', + "time_based": 1, + "runtime": 3, + "output-format": "json", + }, + "test_class": PassThruTest, + }, + { + "test_id": 2, + "fio_opts": { + "rw": 'randread', + "time_based": 1, + "runtime": 3, + "output-format": "json", + }, + "test_class": PassThruTest, + }, + { + "test_id": 3, + "fio_opts": { + "rw": 'write', + "time_based": 1, + "runtime": 3, + "output-format": "json", + }, + "test_class": PassThruTest, + }, + { + "test_id": 4, + "fio_opts": { + "rw": 'randwrite', + "time_based": 1, + "runtime": 3, + "output-format": "json", + }, + "test_class": PassThruTest, + }, + { + "test_id": 5, + "fio_opts": { + "rw": 'trim', + "time_based": 1, + "runtime": 3, + "output-format": "json", + }, + "test_class": PassThruTest, + }, + { + "test_id": 6, + "fio_opts": { + "rw": 'randtrim', + "time_based": 1, + "runtime": 3, + "output-format": "json", + }, + "test_class": PassThruTest, + }, + { + "test_id": 7, + "fio_opts": { + "rw": 'write', + "io_size": 1024*1024, + "verify": "crc32c", + "output-format": "json", + }, + "test_class": PassThruTest, + }, + { + "test_id": 8, + "fio_opts": { + "rw": 'randwrite', + "io_size": 1024*1024, + "verify": "crc32c", + "output-format": "json", + }, + "test_class": PassThruTest, + }, + { + "test_id": 9, + "fio_opts": { + "rw": 'readwrite', + "time_based": 1, + "runtime": 3, + "output-format": "json", + }, + "test_class": PassThruTest, + }, + { + "test_id": 10, + "fio_opts": { + "rw": 'randrw', + "time_based": 1, + "runtime": 3, + "output-format": "json", + }, + "test_class": PassThruTest, + }, + { + "test_id": 11, + "fio_opts": { + "rw": 'trimwrite', + "time_based": 1, + "runtime": 3, + "output-format": "json", + }, + "test_class": PassThruTest, + }, + { + "test_id": 12, + "fio_opts": { + "rw": 'randtrimwrite',
+ "time_based": 1, + "runtime": 3, + "output-format": "json", + }, + "test_class": PassThruTest, + }, + { + "test_id": 13, + "fio_opts": { + "rw": 'randread', + "time_based": 1, + "runtime": 3, + "fixedbufs": 1, + "nonvectored": 1, + "force_async": 1, + "registerfiles": 1, + "sqthread_poll": 1, + "output-format": "json", + }, + "test_class": PassThruTest, + }, + { + "test_id": 14, + "fio_opts": { + "rw": 'randwrite', + "time_based": 1, + "runtime": 3, + "fixedbufs": 1, + "nonvectored": 1, + "force_async": 1, + "registerfiles": 1, + "sqthread_poll": 1, + "output-format": "json", + }, + "test_class": PassThruTest, + }, + { + # We can't enable fixedbufs because for trim-only + # workloads fio actually does not allocate any buffers + "test_id": 15, + "fio_opts": { + "rw": 'randtrim', + "time_based": 1, + "runtime": 3, + "fixedbufs": 0, + "nonvectored": 1, + "force_async": 1, + "registerfiles": 1, + "sqthread_poll": 1, + "output-format": "json", + }, + "test_class": PassThruTest, + }, + { + "test_id": 16, + "fio_opts": { + "rw": 'read', + "bs": 4096, + "number_ios": 10, + "fsync": 1, + "output-format": "json", + }, + "test_class": FlushTest, + }, + { + "test_id": 17, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "number_ios": 10, + "fsync": 1, + "output-format": "json", + }, + "test_class": FlushTest, + }, + { + "test_id": 18, + "fio_opts": { + "rw": 'readwrite', + "bs": 4096, + "number_ios": 10, + "fsync": 1, + "output-format": "json", + }, + "test_class": FlushTest, + }, + { + "test_id": 19, + "fio_opts": { + "rw": 'trimwrite', + "bs": 4096, + "number_ios": 10, + "fsync": 1, + "output-format": "json", + }, + "test_class": FlushTest, + }, +] + +def parse_args(): + """Parse command-line arguments.""" + + parser = argparse.ArgumentParser() + parser.add_argument('-f', '--fio', help='path to fio executable (e.g., ./fio)') + parser.add_argument('-a', '--artifact-root', help='artifact root directory') + parser.add_argument('-s', '--skip', nargs='+', type=int, + help='list of
test(s) to skip') + parser.add_argument('-o', '--run-only', nargs='+', type=int, + help='list of test(s) to run, skipping all others') + parser.add_argument('--dut', help='target NVMe character device to test ' + '(e.g., /dev/ng0n1). WARNING: THIS IS A DESTRUCTIVE TEST', required=True) + args = parser.parse_args() + + return args + + +def main(): + """Run tests using fio's io_uring_cmd ioengine to send NVMe pass through commands.""" + + args = parse_args() + + artifact_root = args.artifact_root if args.artifact_root else \ + f"nvmept-test-{time.strftime('%Y%m%d-%H%M%S')}" + os.mkdir(artifact_root) + print(f"Artifact directory is {artifact_root}") + + if args.fio: + fio_path = str(Path(args.fio).absolute()) + else: + fio_path = 'fio' + print(f"fio path is {fio_path}") + + for test in TEST_LIST: + test['fio_opts']['filename'] = args.dut + + test_env = { + 'fio_path': fio_path, + 'fio_root': str(Path(__file__).absolute().parent.parent), + 'artifact_root': artifact_root, + 'basename': 'nvmept', + } + + _, failed, _ = run_fio_tests(TEST_LIST, test_env, args) + sys.exit(failed) + + +if __name__ == '__main__': + main() diff --git a/t/nvmept_fdp.py b/t/nvmept_fdp.py new file mode 100755 index 0000000000..31a54a1efa --- /dev/null +++ b/t/nvmept_fdp.py @@ -0,0 +1,1082 @@ +#!/usr/bin/env python3 +# +# Copyright 2024 Samsung Electronics Co., Ltd All Rights Reserved +# +# For conditions of distribution and use, see the accompanying COPYING file. +# +""" +# nvmept_fdp.py +# +# Test fio's io_uring_cmd ioengine with NVMe pass-through FDP write commands. 
+# +# USAGE +# see python3 nvmept_fdp.py --help +# +# EXAMPLES +# python3 t/nvmept_fdp.py --dut /dev/ng0n1 +# python3 t/nvmept_fdp.py --dut /dev/ng1n1 -f ./fio +# +# REQUIREMENTS +# Python 3.6 +# Device formatted with LBA data size 4096 bytes +# Device with at least five placement IDs +# +# WARNING +# This is a destructive test +""" +import os +import sys +import json +import time +import locale +import logging +import argparse +import subprocess +from pathlib import Path +from fiotestlib import FioJobCmdTest, run_fio_tests +from fiotestcommon import SUCCESS_NONZERO + +# This needs to match FIO_MAX_DP_IDS and DP_MAX_SCHEME_ENTRIES in +# dataplacement.h +FIO_MAX_DP_IDS = 128 +DP_MAX_SCHEME_ENTRIES = 32 + +class FDPTest(FioJobCmdTest): + """ + NVMe pass-through test class. Check to make sure output for selected data + direction(s) is non-zero and that zero data appears for other directions. + """ + + def setup(self, parameters): + """Setup a test.""" + + fio_args = [ + "--name=nvmept-fdp", + "--ioengine=io_uring_cmd", + "--cmd_type=nvme", + "--randrepeat=0", + f"--filename={self.fio_opts['filename']}", + f"--rw={self.fio_opts['rw']}", + f"--output={self.filenames['output']}", + f"--output-format={self.fio_opts['output-format']}", + ] + + for opt in ['fixedbufs', 'nonvectored', 'force_async', 'registerfiles', + 'sqthread_poll', 'sqthread_poll_cpu', 'hipri', 'nowait', + 'time_based', 'runtime', 'verify', 'io_size', 'num_range', + 'iodepth', 'iodepth_batch', 'iodepth_batch_complete', + 'size', 'rate', 'bs', 'bssplit', 'bsrange', 'randrepeat', + 'buffer_pattern', 'verify_pattern', 'offset', 'fdp', + 'fdp_pli', 'fdp_pli_select', 'dataplacement', 'plid_select', + 'plids', 'dp_scheme', 'number_ios', 'read_iolog']: + if opt in self.fio_opts: + option = f"--{opt}={self.fio_opts[opt]}" + fio_args.append(option) + + super().setup(fio_args) + + + def check_result(self): + try: + self._check_result() + finally: + if not update_all_ruhs(self.fio_opts['filename']): + 
logging.error("Could not reset device") + if not check_all_ruhs(self.fio_opts['filename']): + logging.error("Reclaim units have inconsistent RUAMW values") + + + def _check_result(self): + + super().check_result() + + if 'rw' not in self.fio_opts or \ + not self.passed or \ + 'json' not in self.fio_opts['output-format']: + return + + job = self.json_data['jobs'][0] + rw_fio_opts = self.fio_opts['rw'].split(':')[0] + + if rw_fio_opts in ['read', 'randread']: + self.passed = self.check_all_ddirs(['read'], job) + elif rw_fio_opts in ['write', 'randwrite']: + if 'verify' not in self.fio_opts: + self.passed = self.check_all_ddirs(['write'], job) + else: + self.passed = self.check_all_ddirs(['read', 'write'], job) + elif rw_fio_opts in ['trim', 'randtrim']: + self.passed = self.check_all_ddirs(['trim'], job) + elif rw_fio_opts in ['readwrite', 'randrw']: + self.passed = self.check_all_ddirs(['read', 'write'], job) + elif rw_fio_opts in ['trimwrite', 'randtrimwrite']: + self.passed = self.check_all_ddirs(['trim', 'write'], job) + else: + logging.error("Unhandled rw value %s", self.fio_opts['rw']) + self.passed = False + + if 'iodepth' in self.fio_opts: + # We will need to figure something out if any test uses an iodepth + # different from 8 + if job['iodepth_level']['8'] < 95: + logging.error("Did not achieve requested iodepth") + self.passed = False + else: + logging.debug("iodepth 8 target met %s", job['iodepth_level']['8']) + + +class FDPMultiplePLIDTest(FDPTest): + """ + Write to multiple placement IDs. 
+ """ + + def setup(self, parameters): + mapping = { + 'nruhsd': FIO_FDP_NUMBER_PLIDS, + 'max_ruamw': FIO_FDP_MAX_RUAMW, + 'maxplid': FIO_FDP_NUMBER_PLIDS-1, + # parameters for 400, 401 tests + 'hole_size': 64*1024, + 'nios_for_scheme': min(FIO_FDP_NUMBER_PLIDS//2, DP_MAX_SCHEME_ENTRIES), + } + if 'number_ios' in self.fio_opts and isinstance(self.fio_opts['number_ios'], str): + self.fio_opts['number_ios'] = eval(self.fio_opts['number_ios'].format(**mapping)) + if 'bs' in self.fio_opts and isinstance(self.fio_opts['bs'], str): + self.fio_opts['bs'] = eval(self.fio_opts['bs'].format(**mapping)) + if 'rw' in self.fio_opts and isinstance(self.fio_opts['rw'], str): + self.fio_opts['rw'] = self.fio_opts['rw'].format(**mapping) + if 'plids' in self.fio_opts and isinstance(self.fio_opts['plids'], str): + self.fio_opts['plids'] = self.fio_opts['plids'].format(**mapping) + if 'fdp_pli' in self.fio_opts and isinstance(self.fio_opts['fdp_pli'], str): + self.fio_opts['fdp_pli'] = self.fio_opts['fdp_pli'].format(**mapping) + + super().setup(parameters) + + if 'dp_scheme' in self.fio_opts: + scheme_path = os.path.join(self.paths['test_dir'], self.fio_opts['dp_scheme']) + with open(scheme_path, mode='w') as f: + for i in range(mapping['nios_for_scheme']): + f.write(f'{mapping["hole_size"] * 2 * i}, {mapping["hole_size"] * 2 * (i+1)}, {i}\n') + + if 'read_iolog' in self.fio_opts: + read_iolog_path = os.path.join(self.paths['test_dir'], self.fio_opts['read_iolog']) + with open(read_iolog_path, mode='w') as f: + f.write('fio version 2 iolog\n') + f.write(f'{self.fio_opts["filename"]} add\n') + f.write(f'{self.fio_opts["filename"]} open\n') + + for i in range(mapping['nios_for_scheme']): + f.write(f'{self.fio_opts["filename"]} write {mapping["hole_size"] * 2 * i} {mapping["hole_size"]}\n') + + f.write(f'{self.fio_opts["filename"]} close') + + def _check_result(self): + if 'fdp_pli' in self.fio_opts: + plid_list = self.fio_opts['fdp_pli'].split(',') + elif 'plids' in self.fio_opts: + 
plid_list = self.fio_opts['plids'].split(',') + else: + plid_list = [str(i) for i in range(FIO_FDP_NUMBER_PLIDS)] + + range_ids = [] + for plid in plid_list: + if '-' in plid: + [start, end] = plid.split('-') + range_ids.extend(list(range(int(start), int(end)+1))) + else: + range_ids.append(int(plid)) + + plid_list = sorted(range_ids) + logging.debug("plid_list: %s", str(plid_list)) + + fdp_status = get_fdp_status(self.fio_opts['filename']) + + select = "roundrobin" + if 'fdp_pli_select' in self.fio_opts: + select = self.fio_opts['fdp_pli_select'] + elif 'plid_select' in self.fio_opts: + select = self.fio_opts['plid_select'] + + if select == "roundrobin": + self._check_robin(plid_list, fdp_status) + elif select == "random": + self._check_random(plid_list, fdp_status) + elif select == "scheme": + self._check_scheme(plid_list, fdp_status) + else: + logging.error("Unknown plid selection strategy %s", select) + self.passed = False + + super()._check_result() + + def _check_robin(self, plid_list, fdp_status): + """ + With round robin we can know exactly how many writes each PLID will + receive. 
+ """ + ruamw = [FIO_FDP_MAX_RUAMW] * FIO_FDP_NUMBER_PLIDS + + number_ios = self.fio_opts['number_ios'] % (len(plid_list)*FIO_FDP_MAX_RUAMW) + remainder = int(number_ios % len(plid_list)) + whole = int((number_ios - remainder) / len(plid_list)) + logging.debug("PLIDs in the list should show they have received %d writes; %d PLIDs will receive one extra", + whole, remainder) + + for plid in plid_list: + ruamw[plid] -= whole + if remainder: + ruamw[plid] -= 1 + remainder -= 1 + logging.debug("Expected ruamw values: %s", str(ruamw)) + + for idx, ruhs in enumerate(fdp_status['ruhss']): + if idx >= FIO_FDP_NUMBER_PLIDS: + break + + if ruhs['ruamw'] != ruamw[idx]: + logging.error("RUAMW mismatch with idx %d, pid %d, expected %d, observed %d", idx, + ruhs['pid'], ruamw[idx], ruhs['ruamw']) + self.passed = False + break + + logging.debug("RUAMW match with idx %d, pid %d: ruamw=%d", idx, ruhs['pid'], ruamw[idx]) + + def _check_random(self, plid_list, fdp_status): + """ + With random selection, a set of PLIDs will receive all the write + operations and the remainder will be untouched. 
+ """ + + total_ruamw = 0 + for plid in plid_list: + total_ruamw += fdp_status['ruhss'][plid]['ruamw'] + + expected = len(plid_list) * FIO_FDP_MAX_RUAMW - self.fio_opts['number_ios'] + if total_ruamw != expected: + logging.error("Expected total ruamw %d for plids %s, observed %d", expected, + str(plid_list), total_ruamw) + self.passed = False + else: + logging.debug("Observed expected total ruamw %d for plids %s", expected, str(plid_list)) + + for idx, ruhs in enumerate(fdp_status['ruhss']): + if idx in plid_list: + continue + if ruhs['ruamw'] != FIO_FDP_MAX_RUAMW: + logging.error("Unexpected ruamw %d for idx %d, pid %d, expected %d", ruhs['ruamw'], + idx, ruhs['pid'], FIO_FDP_MAX_RUAMW) + self.passed = False + else: + logging.debug("Observed expected ruamw %d for idx %d, pid %d", ruhs['ruamw'], idx, + ruhs['pid']) + + def _check_scheme(self, plid_list, fdp_status): + """ + With scheme selection, a set of PLIDs touched by the scheme + """ + + PLID_IDX_POS = 2 + plid_list_from_scheme = set() + + scheme_path = os.path.join(self.paths['test_dir'], self.fio_opts['dp_scheme']) + + with open(scheme_path) as f: + lines = f.readlines() + for line in lines: + line_elem = line.strip().replace(' ', '').split(',') + plid_list_from_scheme.add(int(line_elem[PLID_IDX_POS])) + + logging.debug(f'plid_list_from_scheme: {plid_list_from_scheme}') + + for idx, ruhs in enumerate(fdp_status['ruhss']): + if ruhs['pid'] in plid_list_from_scheme: + if ruhs['ruamw'] == FIO_FDP_MAX_RUAMW: + logging.error("pid %d should be touched by the scheme. But ruamw of it(%d) equals to %d", + ruhs['pid'], ruhs['ruamw'], FIO_FDP_MAX_RUAMW) + self.passed = False + else: + logging.debug("pid %d should be touched by the scheme. ruamw of it(%d) is under %d", + ruhs['pid'], ruhs['ruamw'], FIO_FDP_MAX_RUAMW) + else: + if ruhs['ruamw'] == FIO_FDP_MAX_RUAMW: + logging.debug("pid %d should not be touched by the scheme. 
ruamw of it(%d) equals to %d", + ruhs['pid'], ruhs['ruamw'], FIO_FDP_MAX_RUAMW) + else: + logging.error("pid %d should not be touched by the scheme. But ruamw of it(%d) is under %d", + ruhs['pid'], ruhs['ruamw'], FIO_FDP_MAX_RUAMW) + self.passed = False + + +class FDPSinglePLIDTest(FDPTest): + """ + Write to a single placement ID only. + """ + + def _check_result(self): + if 'plids' in self.fio_opts: + plid = self.fio_opts['plids'] + elif 'fdp_pli' in self.fio_opts: + plid = self.fio_opts['fdp_pli'] + else: + plid = 0 + + fdp_status = get_fdp_status(self.fio_opts['filename']) + ruamw = fdp_status['ruhss'][plid]['ruamw'] + lba_count = self.fio_opts['number_ios'] + + if FIO_FDP_MAX_RUAMW - lba_count != ruamw: + logging.error("FDP accounting mismatch for plid %d; expected ruamw %d, observed %d", + plid, FIO_FDP_MAX_RUAMW - lba_count, ruamw) + self.passed = False + else: + logging.debug("FDP accounting as expected for plid %d; ruamw = %d", plid, ruamw) + + super()._check_result() + + +class FDPReadTest(FDPTest): + """ + Read workload test. + """ + + def _check_result(self): + ruamw = check_all_ruhs(self.fio_opts['filename']) + + if ruamw != FIO_FDP_MAX_RUAMW: + logging.error("Read workload affected FDP ruamw") + self.passed = False + else: + logging.debug("Read workload did not disturb FDP ruamw") + super()._check_result() + + +def get_fdp_status(dut): + """ + Run the nvme-cli command to obtain FDP status and return result as a JSON + object. + """ + + cmd = f"sudo nvme fdp status --output-format=json {dut}" + cmd = cmd.split(' ') + cmd_result = subprocess.run(cmd, capture_output=True, check=False, + encoding=locale.getpreferredencoding()) + + if cmd_result.returncode != 0: + logging.error("Error obtaining device %s FDP status: %s", dut, cmd_result.stderr) + return False + + return json.loads(cmd_result.stdout) + + +def update_ruh(dut, plid): + """ + Update reclaim unit handles with specified ID(s). 
def update_ruh(dut, plid):
    """
    Update reclaim unit handles with specified ID(s). This tells the device to
    point the RUH to a new (empty) reclaim unit.

    dut  -- device path passed to `nvme fdp update`
    plid -- a single placement ID, or a list/tuple of IDs (ints or strings)

    Returns True when the command exits with status 0, False otherwise.
    """

    if isinstance(plid, (list, tuple)):
        # str() each element: joining raw ints raises TypeError.
        ids = ','.join(str(one_id) for one_id in plid)
    else:
        ids = plid
    cmd = ["nvme", "fdp", "update", f"--pids={ids}", dut]
    cmd_result = subprocess.run(cmd, capture_output=True, check=False,
                                encoding=locale.getpreferredencoding())

    if cmd_result.returncode != 0:
        logging.error("Error updating RUH %s ID(s) %s", dut, ids)
        return False

    return True


def update_all_ruhs(dut):
    """
    Update all reclaim unit handles on the device.

    Returns True when every handle was updated. Returns False as soon as one
    update fails or when the FDP status cannot be obtained.
    """

    fdp_status = get_fdp_status(dut)
    if not fdp_status:
        # get_fdp_status() returns False on error; do not subscript it.
        return False

    # all() stops at the first failed update, like an early return would.
    return all(update_ruh(dut, ruhs['pid']) for ruhs in fdp_status['ruhss'])


def check_all_ruhs(dut):
    """
    Check that all RUHs have the same value for reclaim unit available media
    writes (RUAMW). Return the RUAMW value.

    Returns False when the values differ, when the device reports no RUHs, or
    when the FDP status cannot be obtained.
    """

    fdp_status = get_fdp_status(dut)
    if not fdp_status:
        return False

    ruh_status = fdp_status['ruhss']
    if not ruh_status:
        logging.error("Device %s reported no reclaim unit handles", dut)
        return False

    ruamw = ruh_status[0]['ruamw']
    for ruhs in ruh_status:
        if ruhs['ruamw'] != ruamw:
            logging.error("RUAMW mismatch: found %d, expected %d", ruhs['ruamw'], ruamw)
            return False

    return ruamw
"rw": 'randwrite', + "bs": 4096, + "number_ios": 1, + "verify": "crc32c", + "dataplacement": "fdp", + "plids": 3, + "plid_select": "roundrobin", + "output-format": "json", + }, + "test_class": FDPSinglePLIDTest, + }, + ## fdp_pli_select/plid_select=random + { + "test_id": 5, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "number_ios": 1, + "verify": "crc32c", + "fdp": 1, + "fdp_pli": 3, + "fdp_pli_select": "random", + "output-format": "json", + }, + "test_class": FDPSinglePLIDTest, + }, + { + "test_id": 6, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": 1, + "verify": "crc32c", + "dataplacement": "fdp", + "plids": 3, + "plid_select": "random", + "output-format": "json", + }, + "test_class": FDPSinglePLIDTest, + }, + # Write four LBAs to one PLID using both the old and new sets of options + ## omit fdp_pli_select/plid_select + { + "test_id": 7, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "number_ios": 4, + "verify": "crc32c", + "fdp": 1, + "fdp_pli": 1, + "output-format": "json", + }, + "test_class": FDPSinglePLIDTest, + }, + { + "test_id": 8, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": 4, + "verify": "crc32c", + "dataplacement": "fdp", + "plids": 1, + "output-format": "json", + }, + "test_class": FDPSinglePLIDTest, + }, + ## fdp_pli_select/plid_select=roundrobin + { + "test_id": 9, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "number_ios": 4, + "verify": "crc32c", + "fdp": 1, + "fdp_pli": 1, + "fdp_pli_select": "roundrobin", + "output-format": "json", + }, + "test_class": FDPSinglePLIDTest, + }, + { + "test_id": 10, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": 4, + "verify": "crc32c", + "dataplacement": "fdp", + "plids": 1, + "plid_select": "roundrobin", + "output-format": "json", + }, + "test_class": FDPSinglePLIDTest, + }, + ## fdp_pli_select/plid_select=random + { + "test_id": 11, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "number_ios": 4, + "verify": "crc32c", + "fdp": 1, + 
"fdp_pli": 1, + "fdp_pli_select": "random", + "output-format": "json", + }, + "test_class": FDPSinglePLIDTest, + }, + { + "test_id": 12, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": 4, + "verify": "crc32c", + "dataplacement": "fdp", + "plids": 1, + "plid_select": "random", + "output-format": "json", + }, + "test_class": FDPSinglePLIDTest, + }, + # Just a regular write without FDP directive--should land on plid 0 + { + "test_id": 13, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": 19, + "verify": "crc32c", + "output-format": "json", + }, + "test_class": FDPSinglePLIDTest, + }, + # Read workload + { + "test_id": 14, + "fio_opts": { + "rw": 'randread', + "bs": 4096, + "number_ios": 19, + "output-format": "json", + }, + "test_class": FDPReadTest, + }, + # write to multiple PLIDs using round robin to select PLIDs + ## write to all PLIDs using old and new sets of options + { + "test_id": 100, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": "2*{nruhsd}+3", + "verify": "crc32c", + "fdp": 1, + "fdp_pli_select": "roundrobin", + "output-format": "json", + }, + "test_class": FDPMultiplePLIDTest, + }, + { + "test_id": 101, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": "2*{nruhsd}+3", + "verify": "crc32c", + "dataplacement": "fdp", + "plid_select": "roundrobin", + "output-format": "json", + }, + "test_class": FDPMultiplePLIDTest, + }, + ## write to a subset of PLIDs using old and new sets of options + { + "test_id": 102, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": "{nruhsd}+1", + "verify": "crc32c", + "fdp": 1, + "fdp_pli": "1,3", + "fdp_pli_select": "roundrobin", + "output-format": "json", + }, + "test_class": FDPMultiplePLIDTest, + }, + { + "test_id": 103, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": "{nruhsd}+1", + "verify": "crc32c", + "dataplacement": "fdp", + "plids": "1,3", + "plid_select": "roundrobin", + "output-format": "json", + }, + 
"test_class": FDPMultiplePLIDTest, + }, + # write to multiple PLIDs using random selection of PLIDs + ## write to all PLIDs using old and new sets of options + { + "test_id": 200, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": "{max_ruamw}-1", + "verify": "crc32c", + "fdp": 1, + "fdp_pli_select": "random", + "output-format": "json", + }, + "test_class": FDPMultiplePLIDTest, + }, + { + "test_id": 201, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": "{max_ruamw}-1", + "verify": "crc32c", + "dataplacement": "fdp", + "plid_select": "random", + "output-format": "json", + }, + "test_class": FDPMultiplePLIDTest, + }, + ## write to a subset of PLIDs using old and new sets of options + { + "test_id": 202, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": "{max_ruamw}-1", + "verify": "crc32c", + "fdp": 1, + "fdp_pli": "1,3,4", + "fdp_pli_select": "random", + "output-format": "json", + }, + "test_class": FDPMultiplePLIDTest, + }, + { + "test_id": 203, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": "{max_ruamw}-1", + "verify": "crc32c", + "dataplacement": "fdp", + "plids": "1,3,4", + "plid_select": "random", + "output-format": "json", + }, + "test_class": FDPMultiplePLIDTest, + }, + ### use 3-4 to specify plids + { + "test_id": 204, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": "{max_ruamw}-1", + "verify": "crc32c", + "fdp": 1, + "fdp_pli": "1,3-4", + "fdp_pli_select": "random", + "output-format": "json", + }, + "test_class": FDPMultiplePLIDTest, + }, + { + "test_id": 205, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": "{max_ruamw}-1", + "verify": "crc32c", + "dataplacement": "fdp", + "plids": "1,3-4", + "plid_select": "random", + "output-format": "json", + }, + "test_class": FDPMultiplePLIDTest, + }, + ### use 1-3 to specify plids + { + "test_id": 206, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": "{max_ruamw}-1", + "verify": "crc32c", + 
"fdp": 1, + "fdp_pli": "1-3", + "fdp_pli_select": "random", + "output-format": "json", + }, + "test_class": FDPMultiplePLIDTest, + }, + { + "test_id": 207, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": "{max_ruamw}-1", + "verify": "crc32c", + "dataplacement": "fdp", + "plids": "1-3", + "plid_select": "random", + "output-format": "json", + }, + "test_class": FDPMultiplePLIDTest, + }, + ### use multiple ranges to specify plids + { + "test_id": 208, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": "{max_ruamw}-1", + "verify": "crc32c", + "fdp": 1, + "fdp_pli": "1-2,3-3", + "fdp_pli_select": "random", + "output-format": "json", + }, + "test_class": FDPMultiplePLIDTest, + }, + { + "test_id": 209, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": "{max_ruamw}-1", + "verify": "crc32c", + "dataplacement": "fdp", + "plids": "1-2,3-3", + "plid_select": "random", + "output-format": "json", + }, + "test_class": FDPMultiplePLIDTest, + }, + { + "test_id": 210, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": "{max_ruamw}-1", + "verify": "crc32c", + "fdp": 1, + "fdp_pli": "0-{maxplid}", + "fdp_pli_select": "random", + "output-format": "json", + }, + "test_class": FDPMultiplePLIDTest, + }, + { + "test_id": 211, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "number_ios": "{max_ruamw}-1", + "verify": "crc32c", + "dataplacement": "fdp", + "fdp_pli": "0-{maxplid}", + "plid_select": "random", + "output-format": "json", + }, + "test_class": FDPMultiplePLIDTest, + }, + # Specify invalid options fdp=1 and dataplacement=none + { + "test_id": 300, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "io_size": 4096, + "verify": "crc32c", + "fdp": 1, + "fdp_pli": 3, + "output-format": "normal", + "dataplacement": "none", + }, + "test_class": FDPTest, + "success": SUCCESS_NONZERO, + }, + # Specify invalid options fdp=1 and dataplacement=streams + { + "test_id": 301, + "fio_opts": { + "rw": 'write', + "bs": 4096, + 
"io_size": 4096, + "verify": "crc32c", + "fdp": 1, + "fdp_pli": 3, + "output-format": "normal", + "dataplacement": "streams", + }, + "test_class": FDPTest, + "success": SUCCESS_NONZERO, + }, + # Specify invalid options related to dataplacement scheme + ## using old and new sets of options + { + "test_id": 302, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "io_size": 4096, + "verify": "crc32c", + "fdp": 1, + "fdp_pli": 3, + "fdp_pli_select": "scheme", + "output-format": "normal", + }, + "test_class": FDPTest, + "success": SUCCESS_NONZERO, + }, + { + "test_id": 303, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "io_size": 4096, + "verify": "crc32c", + "dataplacement": "fdp", + "plids": 3, + "plid_select": "scheme", + "output-format": "normal", + }, + "test_class": FDPTest, + "success": SUCCESS_NONZERO, + }, + ## Specify invalid ranges with start > end + { + "test_id": 304, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "io_size": 4096, + "verify": "crc32c", + "fdp": 1, + "plids": "3-1", + "output-format": "normal", + }, + "test_class": FDPTest, + "success": SUCCESS_NONZERO, + }, + { + "test_id": 305, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "io_size": 4096, + "verify": "crc32c", + "fdp": 1, + "fdp_pli": "3-1", + "output-format": "normal", + }, + "test_class": FDPTest, + "success": SUCCESS_NONZERO, + }, + ## Specify too many plids + { + "test_id": 306, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "io_size": 4096, + "verify": "crc32c", + "fdp": 1, + "plids": "0-65535", + "output-format": "normal", + }, + "test_class": FDPTest, + "success": SUCCESS_NONZERO, + }, + { + "test_id": 307, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "io_size": 4096, + "verify": "crc32c", + "fdp": 1, + "fdp_pli": "0-65535", + "output-format": "normal", + }, + "test_class": FDPTest, + "success": SUCCESS_NONZERO, + }, + # write to multiple PLIDs using scheme selection of PLIDs + ## using old and new sets of options + { + "test_id": 400, + "fio_opts": { + "rw": 
"write:{hole_size}", + "bs": "{hole_size}", + "number_ios": "{nios_for_scheme}", + "verify": "crc32c", + "fdp": 1, + "fdp_pli_select": "scheme", + "dp_scheme": "lba.scheme", + "output-format": "json", + }, + "test_class": FDPMultiplePLIDTest, + }, + { + "test_id": 401, + "fio_opts": { + "rw": "write:{hole_size}", + "bs": "{hole_size}", + "number_ios": "{nios_for_scheme}", + "verify": "crc32c", + "dataplacement": "fdp", + "plid_select": "scheme", + "dp_scheme": "lba.scheme", + "output-format": "json", + }, + "test_class": FDPMultiplePLIDTest, + }, + # check whether dataplacement works while replaying iologs + { + "test_id": 402, + "fio_opts": { + "rw": "write:{hole_size}", + "bs": "{hole_size}", + "number_ios": "{nios_for_scheme}", + "verify": "crc32c", + "read_iolog": "iolog", + "dataplacement": "fdp", + "plid_select": "scheme", + "dp_scheme": "lba.scheme", + "output-format": "json", + }, + "test_class": FDPMultiplePLIDTest, + }, +] + +def parse_args(): + """Parse command-line arguments.""" + + parser = argparse.ArgumentParser() + parser.add_argument('-d', '--debug', help='Enable debug messages', action='store_true') + parser.add_argument('-f', '--fio', help='path to file executable (e.g., ./fio)') + parser.add_argument('-a', '--artifact-root', help='artifact root directory') + parser.add_argument('-s', '--skip', nargs='+', type=int, + help='list of test(s) to skip') + parser.add_argument('-o', '--run-only', nargs='+', type=int, + help='list of test(s) to run, skipping all others') + parser.add_argument('--dut', help='target NVMe character device to test ' + '(e.g., /dev/ng0n1). 
def parse_args():
    """
    Parse command-line arguments.

    --dut is required; every other option is optional. --skip and --run-only
    each take one or more integer test IDs.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--debug', help='Enable debug messages', action='store_true')
    parser.add_argument('-f', '--fio', help='path to fio executable (e.g., ./fio)')
    parser.add_argument('-a', '--artifact-root', help='artifact root directory')
    parser.add_argument('-s', '--skip', nargs='+', type=int,
                        help='list of test(s) to skip')
    parser.add_argument('-o', '--run-only', nargs='+', type=int,
                        help='list of test(s) to run, skipping all others')
    parser.add_argument('--dut', help='target NVMe character device to test '
                        '(e.g., /dev/ng0n1). WARNING: THIS IS A DESTRUCTIVE TEST', required=True)
    return parser.parse_args()


# Filled in by main() from the device's FDP status before any test runs.
FIO_FDP_MAX_RUAMW = 0
FIO_FDP_NUMBER_PLIDS = 0

def main():
    """
    Run tests using fio's io_uring_cmd ioengine to send NVMe pass through commands.

    Creates the artifact directory, points every TEST_LIST entry at the
    device under test, records the device's RUH count (capped at 128) and
    baseline RUAMW in the module globals, then runs the tests. Exits with
    -1 when the FDP status or a uniform baseline RUAMW cannot be obtained,
    otherwise with the number of failed tests.
    """
    global FIO_FDP_MAX_RUAMW
    global FIO_FDP_NUMBER_PLIDS

    args = parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    artifact_root = args.artifact_root if args.artifact_root else \
        f"nvmept-fdp-test-{time.strftime('%Y%m%d-%H%M%S')}"
    os.mkdir(artifact_root)
    print(f"Artifact directory is {artifact_root}")

    fio_path = str(Path(args.fio).absolute()) if args.fio else 'fio'
    print(f"fio path is {fio_path}")

    for test in TEST_LIST:
        test['fio_opts']['filename'] = args.dut

    fdp_status = get_fdp_status(args.dut)
    if not fdp_status:
        # get_fdp_status() returns False (already logged) on failure;
        # subscripting it would die with an unrelated TypeError.
        sys.exit(-1)
    FIO_FDP_NUMBER_PLIDS = min(fdp_status['nruhsd'], 128)

    if not update_all_ruhs(args.dut):
        # Keep going as before; the RUAMW check below decides whether the
        # device is in a usable state.
        logging.warning("Not all reclaim unit handles could be updated on %s", args.dut)
    FIO_FDP_MAX_RUAMW = check_all_ruhs(args.dut)
    if not FIO_FDP_MAX_RUAMW:
        sys.exit(-1)

    test_env = {
        'fio_path': fio_path,
        'fio_root': str(Path(__file__).absolute().parent.parent),
        'artifact_root': artifact_root,
        'basename': 'nvmept-fdp',
    }

    _, failed, _ = run_fio_tests(TEST_LIST, test_env, args)
    sys.exit(failed)


if __name__ == '__main__':
    main()
class DifDixTest(FioJobCmdTest):
    """
    NVMe DIF/DIX test class.
    """

    def setup(self, parameters):
        """
        Build the fio command line from self.fio_opts and pass it to the
        parent class's setup().

        The job name, engine/target options, output file and protection
        information options are always emitted. The remaining options are
        emitted only when present in fio_opts. Engine-specific flags are
        appended last. The `parameters` argument is not consulted.
        """

        opts = self.fio_opts

        def as_arg(key):
            return f"--{key}={opts[key]}"

        fio_args = ["--name=nvmept_pi"]
        fio_args += [as_arg(key) for key in ('ioengine', 'filename', 'rw', 'bsrange')]
        fio_args.append(f"--output={self.filenames['output']}")
        fio_args += [as_arg(key) for key in ('md_per_io_size', 'pi_act', 'pi_chk',
                                             'apptag', 'apptag_mask')]

        optional = ('fixedbufs', 'nonvectored', 'force_async', 'registerfiles',
                    'sqthread_poll', 'sqthread_poll_cpu', 'hipri', 'nowait',
                    'time_based', 'runtime', 'verify', 'io_size', 'offset', 'number_ios',
                    'output-format')
        fio_args.extend(as_arg(key) for key in optional if key in opts)

        # Extra flags required by each supported ioengine.
        engine_flags = {
            'io_uring_cmd': ['--cmd_type=nvme'],
            'xnvme': ['--thread=1', '--xnvme_async=io_uring_cmd'],
        }
        fio_args.extend(engine_flags.get(opts['ioengine'], []))

        super().setup(fio_args)
+# + { + # Write workload with variable IO sizes + # pi_act=1 + "test_id": 101, + "fio_opts": { + "rw": 'write', + "number_ios": NUMBER_IOS, + "output-format": "json", + "apptag": "0x8888", + "apptag_mask": "0xFFFF", + "pi_act": 1, + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "test_class": DifDixTest, + }, + { + # Read workload with fixed small IO size + # pi_act=0 + "test_id": 102, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 0, + "apptag": "0x8888", + "apptag_mask": "0xFFFF", + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_LOW, + "test_class": DifDixTest, + }, + { + # Read workload with fixed small IO size + # pi_act=1 + "test_id": 103, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 1, + "apptag": "0x8888", + "apptag_mask": "0xFFFF", + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_LOW, + "test_class": DifDixTest, + }, + { + # Write workload with fixed large IO size + # Precondition for read workloads to follow + # pi_act=1 + "test_id": 104, + "fio_opts": { + "rw": 'write', + "number_ios": NUMBER_IOS, + "output-format": "json", + "apptag": "0x8888", + "apptag_mask": "0xFFFF", + "pi_act": 1, + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_HIGH, + "bs_high": BS_HIGH, + "test_class": DifDixTest, + }, + { + # Read workload with variable IO sizes + # pi_act=0 + "test_id": 105, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 0, + "apptag": "0x8888", + "apptag_mask": "0xFFFF", + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "test_class": DifDixTest, + }, + { + # Read workload with variable IO sizes + # pi_act=1 + "test_id": 106, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 1, + "apptag": "0x8888", + "apptag_mask": "0xFFFF", + }, 
+ "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "test_class": DifDixTest, + }, +# +# Write data with pi_act=0 and then read the data back (with both +# pi_act=[0,1]). +# + { + # Write workload with variable IO sizes + # pi_act=0 + "test_id": 201, + "fio_opts": { + "rw": 'write', + "number_ios": NUMBER_IOS, + "output-format": "json", + "apptag": "0x8888", + "apptag_mask": "0xFFFF", + "pi_act": 0, + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "test_class": DifDixTest, + }, + { + # Read workload with fixed small IO size + # pi_act=0 + "test_id": 202, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 0, + "apptag": "0x8888", + "apptag_mask": "0xFFFF", + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_LOW, + "test_class": DifDixTest, + }, + { + # Read workload with fixed small IO size + # pi_act=1 + "test_id": 203, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 1, + "apptag": "0x8888", + "apptag_mask": "0xFFFF", + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_LOW, + "test_class": DifDixTest, + }, + { + # Write workload with fixed large IO sizes + # pi_act=0 + "test_id": 204, + "fio_opts": { + "rw": 'write', + "number_ios": NUMBER_IOS, + "output-format": "json", + "apptag": "0x8888", + "apptag_mask": "0xFFFF", + "pi_act": 0, + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_HIGH, + "bs_high": BS_HIGH, + "test_class": DifDixTest, + }, + { + # Read workload with variable IO sizes + # pi_act=0 + "test_id": 205, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 0, + "apptag": "0x8888", + "apptag_mask": "0xFFFF", + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "test_class": DifDixTest, + }, + { + # Read workload with variable IO sizes + # pi_act=1 + "test_id": 206, + 
"fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 1, + "apptag": "0x8888", + "apptag_mask": "0xFFFF", + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "test_class": DifDixTest, + }, +# +# Test apptag errors. +# + { + # Read workload with variable IO sizes + # pi_act=0 + # trigger an apptag error + "test_id": 301, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 0, + "apptag": "0x0888", + "apptag_mask": "0xFFFF", + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "success": SUCCESS_NONZERO, + "test_class": DifDixTest, + }, + { + # Read workload with variable IO sizes + # pi_act=1 + # trigger an apptag error + "test_id": 302, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 1, + "apptag": "0x0888", + "apptag_mask": "0xFFFF", + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "success": SUCCESS_NONZERO, + "test_class": DifDixTest, + }, + { + # Read workload with variable IO sizes + # pi_act=0 + # trigger an apptag error + # same as above but with pi_chk=APPTAG only + "test_id": 303, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 0, + "apptag": "0x0888", + "apptag_mask": "0xFFFF", + }, + "pi_chk": "APPTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "success": SUCCESS_NONZERO, + "test_class": DifDixTest, + }, + { + # Read workload with variable IO sizes + # pi_act=1 + # trigger an apptag error + # same as above but with pi_chk=APPTAG only + "test_id": 304, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 1, + "apptag": "0x0888", + "apptag_mask": "0xFFFF", + }, + "pi_chk": "APPTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "success": SUCCESS_NONZERO, + "test_class": DifDixTest, + }, + { + # Read workload with variable IO 
sizes + # pi_act=0 + # this case would trigger an apptag error, but pi_chk says to check + # only the Guard PI and reftag, so there should be no error + "test_id": 305, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 0, + "apptag": "0x0888", + "apptag_mask": "0xFFFF", + }, + "pi_chk": "GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "test_class": DifDixTest, + }, + { + # Read workload with variable IO sizes + # pi_act=1 + # this case would trigger an apptag error, but pi_chk says to check + # only the Guard PI and reftag, so there should be no error + "test_id": 306, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 1, + "apptag": "0x0888", + "apptag_mask": "0xFFFF", + }, + "pi_chk": "GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "test_class": DifDixTest, + }, + { + # Read workload with variable IO sizes + # pi_act=0 + # this case would trigger an apptag error, but pi_chk says to check + # only the Guard PI, so there should be no error + "test_id": 307, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 0, + "apptag": "0x0888", + "apptag_mask": "0xFFFF", + }, + "pi_chk": "GUARD", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "test_class": DifDixTest, + }, + { + # Read workload with variable IO sizes + # pi_act=1 + # this case would trigger an apptag error, but pi_chk says to check + # only the Guard PI, so there should be no error + "test_id": 308, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 1, + "apptag": "0x0888", + "apptag_mask": "0xFFFF", + }, + "pi_chk": "GUARD", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "test_class": DifDixTest, + }, + { + # Read workload with variable IO sizes + # pi_act=0 + # this case would trigger an apptag error, but pi_chk says to check + # only the reftag, so there should be no error + # This case will be skipped 
when the device is formatted with Type 3 PI + # since Type 3 PI ignores the reftag + "test_id": 309, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 0, + "apptag": "0x0888", + "apptag_mask": "0xFFFF", + }, + "pi_chk": "REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "skip": "type3", + "test_class": DifDixTest, + }, + { + # Read workload with variable IO sizes + # pi_act=1 + # this case would trigger an apptag error, but pi_chk says to check + # only the reftag, so there should be no error + # This case will be skipped when the device is formatted with Type 3 PI + # since Type 3 PI ignores the reftag + "test_id": 310, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 1, + "apptag": "0x0888", + "apptag_mask": "0xFFFF", + }, + "pi_chk": "REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "skip": "type3", + "test_class": DifDixTest, + }, + { + # Read workload with variable IO sizes + # pi_act=0 + # use apptag mask to ignore apptag mismatch + "test_id": 311, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 0, + "apptag": "0x0888", + "apptag_mask": "0x0FFF", + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "test_class": DifDixTest, + }, + { + # Read workload with variable IO sizes + # pi_act=1 + # use apptag mask to ignore apptag mismatch + "test_id": 312, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 1, + "apptag": "0x0888", + "apptag_mask": "0x0FFF", + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "test_class": DifDixTest, + }, + { + # Read workload with variable IO sizes + # pi_act=0 + # use apptag mask to ignore apptag mismatch + "test_id": 313, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 0, + "apptag": "0xF888", + "apptag_mask": 
"0x0FFF", + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "test_class": DifDixTest, + }, + { + # Read workload with variable IO sizes + # pi_act=1 + # use apptag mask to ignore apptag mismatch + "test_id": 314, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 1, + "apptag": "0xF888", + "apptag_mask": "0x0FFF", + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "test_class": DifDixTest, + }, + { + # Write workload with fixed large IO sizes + # Set apptag=0xFFFF to disable all checking for Type 1 and 2 + # pi_act=1 + "test_id": 315, + "fio_opts": { + "rw": 'write', + "number_ios": NUMBER_IOS, + "output-format": "json", + "apptag": "0xFFFF", + "apptag_mask": "0xFFFF", + "pi_act": 1, + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_HIGH, + "bs_high": BS_HIGH, + "skip": "type3", + "test_class": DifDixTest, + }, + { + # Read workload with variable IO sizes + # pi_act=0 + # Data was written with apptag=0xFFFF + # Reading the data back should disable all checking for Type 1 and 2 + "test_id": 316, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 0, + "apptag": "0x0101", + "apptag_mask": "0xFFFF", + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "skip": "type3", + "test_class": DifDixTest, + }, + { + # Read workload with variable IO sizes + # pi_act=1 + # Data was written with apptag=0xFFFF + # Reading the data back should disable all checking for Type 1 and 2 + "test_id": 317, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "output-format": "json", + "pi_act": 1, + "apptag": "0x0000", + "apptag_mask": "0xFFFF", + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "skip": "type3", + "test_class": DifDixTest, + }, +# +# Error cases related to block size and metadata size +# + { + # Use a min block size that is not a multiple 
of lba/elba size to + # trigger an error. + "test_id": 401, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "pi_act": 0, + "apptag": "0x8888", + "apptag_mask": "0x0FFF", + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW+0.5, + "bs_high": BS_HIGH, + "success": SUCCESS_NONZERO, + "test_class": DifDixTest, + }, + { + # Use metadata size that is too small + "test_id": 402, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "pi_act": 0, + "apptag": "0x8888", + "apptag_mask": "0x0FFF", + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "mdsize_adjustment": -1, + "success": SUCCESS_NONZERO, + "skip": "elba", + "test_class": DifDixTest, + }, + { + # Read workload with variable IO sizes + # pi_act=0 + # Should still work even if metadata size is too large + "test_id": 403, + "fio_opts": { + "rw": 'read', + "number_ios": NUMBER_IOS, + "pi_act": 0, + "apptag": "0x8888", + "apptag_mask": "0x0FFF", + }, + "pi_chk": "APPTAG,GUARD,REFTAG", + "bs_low": BS_LOW, + "bs_high": BS_HIGH, + "mdsize_adjustment": 1, + "test_class": DifDixTest, + }, +] + + +def parse_args(): + """Parse command-line arguments.""" + + parser = argparse.ArgumentParser() + parser.add_argument('-d', '--debug', help='Enable debug messages', action='store_true') + parser.add_argument('-f', '--fio', help='path to file executable (e.g., ./fio)') + parser.add_argument('-a', '--artifact-root', help='artifact root directory') + parser.add_argument('-s', '--skip', nargs='+', type=int, + help='list of test(s) to skip') + parser.add_argument('-o', '--run-only', nargs='+', type=int, + help='list of test(s) to run, skipping all others') + parser.add_argument('--dut', help='target NVMe character device to test ' + '(e.g., /dev/ng0n1). 
def parse_args():
    """
    Parse command-line arguments.

    --dut is required. --skip, --run-only and --lbaf each take one or more
    integers. --ioengine defaults to io_uring_cmd.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--debug', help='Enable debug messages', action='store_true')
    parser.add_argument('-f', '--fio', help='path to fio executable (e.g., ./fio)')
    parser.add_argument('-a', '--artifact-root', help='artifact root directory')
    parser.add_argument('-s', '--skip', nargs='+', type=int,
                        help='list of test(s) to skip')
    parser.add_argument('-o', '--run-only', nargs='+', type=int,
                        help='list of test(s) to run, skipping all others')
    parser.add_argument('--dut', help='target NVMe character device to test '
                        '(e.g., /dev/ng0n1). WARNING: THIS IS A DESTRUCTIVE TEST', required=True)
    parser.add_argument('-l', '--lbaf', nargs='+', type=int,
                        help='list of lba formats to test')
    parser.add_argument('-i', '--ioengine', default='io_uring_cmd')
    return parser.parse_args()


def get_lbafs(args):
    """
    Determine which LBA formats to use. Use either the ones specified on the
    command line or if none are specified query the device and use all lba
    formats with metadata.

    Returns a list of dicts with keys 'lbaf' (format index), 'ds' (data size
    in bytes, 2 ** the reported ds) and 'ms' (metadata size). Exits with
    status 1 if a requested format does not exist on the device or has a
    metadata size of zero.
    """
    id_ns_cmd = ["sudo", "nvme", "id-ns", "--output-format=json", args.dut]
    id_ns_output = subprocess.check_output(id_ns_cmd)
    lbafs = json.loads(id_ns_output)['lbafs']

    def describe(index):
        return {'lbaf': index, 'ds': 2 ** lbafs[index]['ds'], 'ms': lbafs[index]['ms'], }

    if args.lbaf:
        lbaf_list = []
        for lbaf in args.lbaf:
            # Reject out-of-range indices up front. A negative value would
            # otherwise silently select a format counted from the end.
            if not 0 <= lbaf < len(lbafs):
                print(f'Error: lbaf {lbaf} is not reported by the device '
                      f'(valid range 0-{len(lbafs) - 1})')
                sys.exit(1)
            if lbafs[lbaf]['ms'] == 0:
                print(f'Error: lbaf {lbaf} has metadata size zero')
                sys.exit(1)
            lbaf_list.append(describe(lbaf))
        return lbaf_list

    return [describe(index) for index, lbaf in enumerate(lbafs) if lbaf['ms'] != 0]
+ """ + nvm_id_ns_cmd = f"sudo nvme nvm-id-ns --output-format=json {args.dut}".split(' ') + try: + nvm_id_ns_output = subprocess.check_output(nvm_id_ns_cmd) + except subprocess.CalledProcessError: + print(f"Non-zero return code from {' '.join(nvm_id_ns_cmd)}; " \ + "assuming all lbafs use 16b Guard Protection Information") + for lbaf in lbaf_list: + lbaf['guard_pi_bits'] = 16 + else: + elbafs = json.loads(nvm_id_ns_output)['elbafs'] + for elbaf_num, elbaf in enumerate(elbafs): + for lbaf in lbaf_list: + if lbaf['lbaf'] == elbaf_num: + lbaf['guard_pi_bits'] = 16 << elbaf['pif'] + + # For 16b Guard Protection Information, the PI requires 8 bytes + # For 32b and 64b Guard PI, the PI requires 16 bytes + for lbaf in lbaf_list: + if lbaf['guard_pi_bits'] == 16: + lbaf['pi_bytes'] = 8 + else: + lbaf['pi_bytes'] = 16 + + +def get_capabilities(args): + """ + Determine what end-to-end data protection features the device supports. + """ + caps = { 'pil': [], 'pitype': [], 'elba': [] } + id_ns_cmd = f"sudo nvme id-ns --output-format=json {args.dut}".split(' ') + id_ns_output = subprocess.check_output(id_ns_cmd) + id_ns_json = json.loads(id_ns_output) + + mc = id_ns_json['mc'] + if mc & 1: + caps['elba'].append(1) + if mc & 2: + caps['elba'].append(0) + + dpc = id_ns_json['dpc'] + if dpc & 1: + caps['pitype'].append(1) + if dpc & 2: + caps['pitype'].append(2) + if dpc & 4: + caps['pitype'].append(3) + if dpc & 8: + caps['pil'].append(1) + if dpc & 16: + caps['pil'].append(0) + + for _, value in caps.items(): + if len(value) == 0: + logging.error("One or more end-to-end data protection features unsupported: %s", caps) + sys.exit(-1) + + return caps + + +def format_device(args, lbaf, pitype, pil, elba): + """ + Format device using specified lba format with specified pitype, pil, and + elba values. 
+ """ + + format_cmd = f"sudo nvme format {args.dut} --lbaf={lbaf['lbaf']} " \ + f"--pi={pitype} --pil={pil} --ms={elba} --force" + logging.debug("Format command: %s", format_cmd) + format_cmd = format_cmd.split(' ') + format_cmd_result = subprocess.run(format_cmd, capture_output=True, check=False, + encoding=locale.getpreferredencoding()) + + # Sometimes nvme-cli may format the device successfully but fail to + # rescan the namespaces after the format. Continue if this happens but + # abort if some other error occurs. + if format_cmd_result.returncode != 0: + if 'failed to rescan namespaces' not in format_cmd_result.stderr \ + or 'Success formatting namespace' not in format_cmd_result.stdout: + logging.error(format_cmd_result.stdout) + logging.error(format_cmd_result.stderr) + print("Unable to format device; skipping this configuration") + return False + + logging.debug(format_cmd_result.stdout) + return True + + +def difdix_test(test_env, args, lbaf, pitype, elba): + """ + Adjust test arguments based on values of lbaf, pitype, and elba. Then run + the tests. + """ + for test in TEST_LIST: + test['force_skip'] = False + + blocksize = lbaf['ds'] + # Set fio blocksize parameter at runtime + # If we formatted the device in extended LBA mode (e.g., 520-byte + # sectors), we usually need to add the lba data size and metadata size + # together for fio's bs parameter. However, if pi_act == 1 and the + # device is formatted so that the metadata is the same size as the PI, + # then the device will take care of everything and the application + # should just use regular power of 2 lba data size even when the device + # is in extended lba mode. 
def difdix_test(test_env, args, lbaf, pitype, elba):
    """
    Tailor every TEST_LIST entry to the current device format, then run them.

    For each test this fills in fio's bsrange, md_per_io_size and pi_chk
    options from lbaf, pitype and elba, and sets 'force_skip' for tests whose
    'skip' entry names the current configuration. Returns the result of
    run_fio_tests(TEST_LIST, test_env, args).
    """
    type3 = pitype == 3

    for test in TEST_LIST:
        opts = test['fio_opts']
        test['force_skip'] = False

        # When pi_act is set and the metadata is exactly the size of the PI,
        # the device takes care of everything: fio uses the plain power-of-2
        # data size and needs no metadata buffer of its own.
        device_handles_pi = opts['pi_act'] and lbaf['ms'] == lbaf['pi_bytes']

        blocksize = lbaf['ds']
        if elba:
            # Extended LBA mode (e.g., 520-byte sectors): metadata travels
            # with the data, so it is normally added to fio's block size.
            if not device_handles_pi:
                blocksize += lbaf['ms']
            opts['md_per_io_size'] = 0
        elif device_handles_pi:
            opts['md_per_io_size'] = 0
        else:
            # Separate metadata buffer sized for the largest I/O.
            opts['md_per_io_size'] = lbaf['ms'] * test['bs_high']

        low = blocksize * test['bs_low']
        high = blocksize * test['bs_high']
        opts['bsrange'] = f"{low}-{high}"
        if 'mdsize_adjustment' in test:
            opts['md_per_io_size'] += test['mdsize_adjustment']

        # With Type 3 protection information the reference tag is not checked
        # and I/O commands may throw an error if they are submitted with the
        # REFTAG bit set in pi_chk, so drop it.
        if 'pi_chk' in test:
            pi_chk = test['pi_chk']
            if type3 and 'REFTAG' in pi_chk:
                pi_chk = pi_chk.replace('REFTAG','')
                logging.debug("Type 3 PI: dropping REFTAG bit")
            opts['pi_chk'] = pi_chk

        if 'skip' in test:
            skip = test['skip']
            if type3 and 'type3' in skip:
                test['force_skip'] = True
                logging.debug("Type 3 PI: skipping test case")
            if elba and 'elba' in skip:
                test['force_skip'] = True
                logging.debug("extended lba format: skipping test case")

        logging.debug("Test %d: pi_act=%d, bsrange=%s, md_per_io_size=%d", test['test_id'],
                      opts['pi_act'], opts['bsrange'], opts['md_per_io_size'])

    return run_fio_tests(TEST_LIST, test_env, args)
def main():
    """
    Run tests using fio's io_uring_cmd ioengine to exercise end-to-end data
    protection capabilities.

    Formats the device once per combination of lba format, pil, pitype and
    elba, runs the test list against each, and exits with the total number of
    failed tests as the status.
    """

    args = parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    artifact_root = args.artifact_root or \
        f"nvmept_pi-test-{time.strftime('%Y%m%d-%H%M%S')}"
    os.mkdir(artifact_root)
    print(f"Artifact directory is {artifact_root}")

    fio_path = str(Path(args.fio).absolute()) if args.fio else 'fio'
    print(f"fio path is {fio_path}")

    lbaf_list = get_lbafs(args)
    get_guard_pi(lbaf_list, args)
    caps = get_capabilities(args)
    print("Device capabilities:", caps)

    for test in TEST_LIST:
        test['fio_opts']['filename'] = args.dut
        test['fio_opts']['ioengine'] = args.ioengine

    test_env = {
        'fio_path': fio_path,
        'fio_root': str(Path(__file__).absolute().parent.parent),
        'artifact_root': artifact_root,
        'basename': 'nvmept_pi',
    }

    total = { 'passed': 0, 'failed': 0, 'skipped': 0 }

    configurations = itertools.product(lbaf_list, caps['pil'], caps['pitype'], caps['elba'])
    try:
        for lbaf, pil, pitype, elba in configurations:
            print(f"\nlbaf: {lbaf}, pil: {pil}, pitype: {pitype}, elba: {elba}")

            if not format_device(args, lbaf, pitype, pil, elba):
                continue

            # One artifact sub-directory per device configuration.
            subdir = f"lbaf{lbaf['lbaf']}pil{pil}pitype{pitype}elba{elba}"
            test_env['artifact_root'] = os.path.join(artifact_root, subdir)
            os.mkdir(test_env['artifact_root'])

            results = difdix_test(test_env, args, lbaf, pitype, elba)
            for key, count in zip(('passed', 'failed', 'skipped'), results):
                total[key] += count
    except KeyboardInterrupt:
        # Ctrl-C stops the sweep but still reports what ran so far.
        pass

    print(f"\n\n{total['passed']} test(s) passed, {total['failed']} failed, "
          f"{total['skipped']} skipped")
    sys.exit(total['failed'])
+ """ + + def setup(self, parameters): + """Setup a test.""" + + fio_args = [ + "--name=nvmept-streams", + "--ioengine=io_uring_cmd", + "--cmd_type=nvme", + "--randrepeat=0", + f"--filename={self.fio_opts['filename']}", + f"--rw={self.fio_opts['rw']}", + f"--output={self.filenames['output']}", + f"--output-format={self.fio_opts['output-format']}", + ] + for opt in ['fixedbufs', 'nonvectored', 'force_async', 'registerfiles', + 'sqthread_poll', 'sqthread_poll_cpu', 'hipri', 'nowait', + 'time_based', 'runtime', 'verify', 'io_size', 'num_range', + 'iodepth', 'iodepth_batch', 'iodepth_batch_complete', + 'size', 'rate', 'bs', 'bssplit', 'bsrange', 'randrepeat', + 'buffer_pattern', 'verify_pattern', 'offset', 'dataplacement', + 'plids', 'plid_select' ]: + if opt in self.fio_opts: + option = f"--{opt}={self.fio_opts[opt]}" + fio_args.append(option) + + super().setup(fio_args) + + + def check_result(self): + try: + self._check_result() + finally: + release_all_streams(self.fio_opts['filename']) + + + def _check_result(self): + + super().check_result() + + if 'rw' not in self.fio_opts or \ + not self.passed or \ + 'json' not in self.fio_opts['output-format']: + return + + job = self.json_data['jobs'][0] + + if self.fio_opts['rw'] in ['read', 'randread']: + self.passed = self.check_all_ddirs(['read'], job) + elif self.fio_opts['rw'] in ['write', 'randwrite']: + if 'verify' not in self.fio_opts: + self.passed = self.check_all_ddirs(['write'], job) + else: + self.passed = self.check_all_ddirs(['read', 'write'], job) + elif self.fio_opts['rw'] in ['trim', 'randtrim']: + self.passed = self.check_all_ddirs(['trim'], job) + elif self.fio_opts['rw'] in ['readwrite', 'randrw']: + self.passed = self.check_all_ddirs(['read', 'write'], job) + elif self.fio_opts['rw'] in ['trimwrite', 'randtrimwrite']: + self.passed = self.check_all_ddirs(['trim', 'write'], job) + else: + logging.error("Unhandled rw value %s", self.fio_opts['rw']) + self.passed = False + + if 'iodepth' in self.fio_opts: 
+ # We will need to figure something out if any test uses an iodepth + # different from 8 + if job['iodepth_level']['8'] < 95: + logging.error("Did not achieve requested iodepth") + self.passed = False + else: + logging.debug("iodepth 8 target met %s", job['iodepth_level']['8']) + + stream_ids = [int(stream) for stream in self.fio_opts['plids'].split(',')] + if not self.check_streams(self.fio_opts['filename'], stream_ids): + self.passed = False + logging.error("Streams not as expected") + else: + logging.debug("Streams created as expected") + + + def check_streams(self, dut, stream_ids): + """ + Confirm that the specified stream IDs exist on the specified device. + """ + + id_list = get_device_stream_ids(dut) + if not id_list: + return False + + for stream in stream_ids: + if stream in id_list: + logging.debug("Stream ID %d found active on device", stream) + id_list.remove(stream) + else: + if self.__class__.__name__ != "StreamsTestRand": + logging.error("Stream ID %d not found on device", stream) + else: + logging.debug("Stream ID %d not found on device", stream) + return False + + if len(id_list) != 0: + logging.error("Extra stream IDs %s found on device", str(id_list)) + return False + + return True + + +class StreamsTestRR(StreamsTest): + """ + NVMe pass-through test class for streams. Check to make sure output for + selected data direction(s) is non-zero and that zero data appears for other + directions. Check that Stream IDs are accessed in round robin order. + """ + + def check_streams(self, dut, stream_ids): + """ + The number of IOs is less than the number of stream IDs provided. Let N + be the number of IOs. Make sure that the device only has the first N of + the stream IDs provided. + + This will miss some cases where some other selection algorithm happens + to select the first N stream IDs. The solution would be to repeat this + test multiple times. Multiple trials passing would be evidence that + round robin is working correctly. 
+ """ + + id_list = get_device_stream_ids(dut) + if not id_list: + return False + + num_streams = int(self.fio_opts['io_size'] / self.fio_opts['bs']) + stream_ids = sorted(stream_ids)[0:num_streams] + + return super().check_streams(dut, stream_ids) + + +class StreamsTestRand(StreamsTest): + """ + NVMe pass-through test class for streams. Check to make sure output for + selected data direction(s) is non-zero and that zero data appears for other + directions. Check that Stream IDs are accessed in random order. + """ + + def check_streams(self, dut, stream_ids): + """ + The number of IOs is less than the number of stream IDs provided. Let N + be the number of IOs. Confirm that the stream IDs on the device are not + the first N stream IDs. + + This will produce false positives because it is possible for the first + N stream IDs to be randomly selected. We can reduce the probability of + false positives by increasing N and increasing the number of streams + IDs to choose from, although fio has a max of 16 placement IDs. 
+ """ + + id_list = get_device_stream_ids(dut) + if not id_list: + return False + + num_streams = int(self.fio_opts['io_size'] / self.fio_opts['bs']) + stream_ids = sorted(stream_ids)[0:num_streams] + + return not super().check_streams(dut, stream_ids) + + +def get_device_stream_ids(dut): + cmd = f"sudo nvme dir-receive -D 1 -O 2 -H {dut}" + logging.debug("check streams command: %s", cmd) + cmd = cmd.split(' ') + cmd_result = subprocess.run(cmd, capture_output=True, check=False, + encoding=locale.getpreferredencoding()) + + logging.debug(cmd_result.stdout) + + if cmd_result.returncode != 0: + logging.error("Error obtaining device %s stream IDs: %s", dut, cmd_result.stderr) + return False + + id_list = [] + for line in cmd_result.stdout.split('\n'): + if not 'Stream Identifier' in line: + continue + tokens = line.split(':') + id_list.append(int(tokens[1])) + + return id_list + + +def release_stream(dut, stream_id): + """ + Release stream on given device with selected ID. + """ + cmd = f"nvme dir-send -D 1 -O 1 -S {stream_id} {dut}" + logging.debug("release stream command: %s", cmd) + cmd = cmd.split(' ') + cmd_result = subprocess.run(cmd, capture_output=True, check=False, + encoding=locale.getpreferredencoding()) + + if cmd_result.returncode != 0: + logging.error("Error releasing %s stream %d", dut, stream_id) + return False + + return True + + +def release_all_streams(dut): + """ + Release all streams on specified device. 
+ """ + + id_list = get_device_stream_ids(dut) + if not id_list: + return False + + for stream in id_list: + if not release_stream(dut, stream): + return False + + return True + + +TEST_LIST = [ + # 4k block size + # {seq write, rand write} x {single stream, four streams} + { + "test_id": 1, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "io_size": 256*1024*1024, + "verify": "crc32c", + "plids": "8", + "dataplacement": "streams", + "output-format": "json", + }, + "test_class": StreamsTest, + }, + { + "test_id": 2, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "io_size": 256*1024*1024, + "verify": "crc32c", + "plids": "3", + "dataplacement": "streams", + "output-format": "json", + }, + "test_class": StreamsTest, + }, + { + "test_id": 3, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "io_size": 256*1024*1024, + "verify": "crc32c", + "plids": "1,2,3,4", + "dataplacement": "streams", + "output-format": "json", + }, + "test_class": StreamsTest, + }, + { + "test_id": 4, + "fio_opts": { + "rw": 'randwrite', + "bs": 4096, + "io_size": 256*1024*1024, + "verify": "crc32c", + "plids": "5,6,7,8", + "dataplacement": "streams", + "output-format": "json", + }, + "test_class": StreamsTest, + }, + # 256KiB block size + # {seq write, rand write} x {single stream, four streams} + { + "test_id": 10, + "fio_opts": { + "rw": 'write', + "bs": 256*1024, + "io_size": 256*1024*1024, + "verify": "crc32c", + "plids": "88", + "dataplacement": "streams", + "output-format": "json", + }, + "test_class": StreamsTest, + }, + { + "test_id": 11, + "fio_opts": { + "rw": 'randwrite', + "bs": 256*1024, + "io_size": 256*1024*1024, + "verify": "crc32c", + "plids": "20", + "dataplacement": "streams", + "output-format": "json", + }, + "test_class": StreamsTest, + }, + { + "test_id": 12, + "fio_opts": { + "rw": 'write', + "bs": 256*1024, + "io_size": 256*1024*1024, + "verify": "crc32c", + "plids": "16,32,64,128", + "dataplacement": "streams", + "output-format": "json", + }, + "test_class": 
StreamsTest, + }, + { + "test_id": 13, + "fio_opts": { + "rw": 'randwrite', + "bs": 256*1024, + "io_size": 256*1024*1024, + "verify": "crc32c", + "plids": "10,20,40,82", + "dataplacement": "streams", + "output-format": "json", + }, + "test_class": StreamsTest, + }, + # Test placement ID selection patterns + # default is round robin + { + "test_id": 20, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "io_size": 8192, + "plids": '88,99,100,123,124,125,126,127,128,129,130,131,132,133,134,135', + "dataplacement": "streams", + "output-format": "json", + }, + "test_class": StreamsTestRR, + }, + { + "test_id": 21, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "io_size": 8192, + "plids": '12,88,99,100,123,124,125,126,127,128,129,130,131,132,133,11', + "dataplacement": "streams", + "output-format": "json", + }, + "test_class": StreamsTestRR, + }, + # explicitly select round robin + { + "test_id": 22, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "io_size": 8192, + "plids": '22,88,99,100,123,124,125,126,127,128,129,130,131,132,133,134', + "dataplacement": "streams", + "output-format": "json", + "plid_select": "roundrobin", + }, + "test_class": StreamsTestRR, + }, + # explicitly select random + { + "test_id": 23, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "io_size": 8192, + "plids": '1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16', + "dataplacement": "streams", + "output-format": "json", + "plid_select": "random", + }, + "test_class": StreamsTestRand, + }, + # Error case with placement ID > 0xFFFF + { + "test_id": 30, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "io_size": 8192, + "plids": "1,2,3,0x10000", + "dataplacement": "streams", + "output-format": "normal", + "plid_select": "random", + }, + "test_class": StreamsTestRand, + "success": SUCCESS_NONZERO, + }, + # Error case with no stream IDs provided + { + "test_id": 31, + "fio_opts": { + "rw": 'write', + "bs": 4096, + "io_size": 8192, + "dataplacement": "streams", + "output-format": "normal", + }, + 
"test_class": StreamsTestRand, + "success": SUCCESS_NONZERO, + }, + +] + +def parse_args(): + """Parse command-line arguments.""" + + parser = argparse.ArgumentParser() + parser.add_argument('-d', '--debug', help='Enable debug messages', action='store_true') + parser.add_argument('-f', '--fio', help='path to file executable (e.g., ./fio)') + parser.add_argument('-a', '--artifact-root', help='artifact root directory') + parser.add_argument('-s', '--skip', nargs='+', type=int, + help='list of test(s) to skip') + parser.add_argument('-o', '--run-only', nargs='+', type=int, + help='list of test(s) to run, skipping all others') + parser.add_argument('--dut', help='target NVMe character device to test ' + '(e.g., /dev/ng0n1). WARNING: THIS IS A DESTRUCTIVE TEST', required=True) + args = parser.parse_args() + + return args + + +def main(): + """Run tests using fio's io_uring_cmd ioengine to send NVMe pass through commands.""" + + args = parse_args() + + if args.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + artifact_root = args.artifact_root if args.artifact_root else \ + f"nvmept-streams-test-{time.strftime('%Y%m%d-%H%M%S')}" + os.mkdir(artifact_root) + print(f"Artifact directory is {artifact_root}") + + if args.fio: + fio_path = str(Path(args.fio).absolute()) + else: + fio_path = 'fio' + print(f"fio path is {fio_path}") + + for test in TEST_LIST: + test['fio_opts']['filename'] = args.dut + + release_all_streams(args.dut) + test_env = { + 'fio_path': fio_path, + 'fio_root': str(Path(__file__).absolute().parent.parent), + 'artifact_root': artifact_root, + 'basename': 'nvmept-streams', + } + + _, failed, _ = run_fio_tests(TEST_LIST, test_env, args) + sys.exit(failed) + + +if __name__ == '__main__': + main() diff --git a/t/nvmept_trim.py b/t/nvmept_trim.py new file mode 100755 index 0000000000..c990747dbc --- /dev/null +++ b/t/nvmept_trim.py @@ -0,0 +1,591 @@ +#!/usr/bin/env python3 +# +# Copyright 2024 Samsung 
class TrimTest(FioJobCmdTest):
    """
    NVMe pass-through test class. Check to make sure output for selected data
    direction(s) is non-zero and that zero data appears for other directions.
    """

    def setup(self, parameters):
        """Assemble the fio argument list and hand it to the base class."""

        fio_args = [
            "--name=nvmept-trim",
            "--ioengine=io_uring_cmd",
            "--cmd_type=nvme",
            f"--filename={self.fio_opts['filename']}",
            f"--rw={self.fio_opts['rw']}",
            f"--output={self.filenames['output']}",
            f"--output-format={self.fio_opts['output-format']}",
        ]
        # Options forwarded verbatim when the test sets them. Each name is
        # listed once ('verify' used to appear twice, which put --verify on
        # the fio command line twice).
        for opt in ['fixedbufs', 'nonvectored', 'force_async', 'registerfiles',
                    'sqthread_poll', 'sqthread_poll_cpu', 'hipri', 'nowait',
                    'time_based', 'runtime', 'verify', 'io_size', 'num_range',
                    'iodepth', 'iodepth_batch', 'iodepth_batch_complete',
                    'size', 'rate', 'bs', 'bssplit', 'bsrange', 'randrepeat',
                    'buffer_pattern', 'verify_pattern', 'offset']:
            if opt in self.fio_opts:
                fio_args.append(f"--{opt}={self.fio_opts[opt]}")

        super().setup(fio_args)


    def check_result(self):
        """
        After the base class checks, and only for JSON output, verify the
        data directions implied by 'rw' and the achieved iodepth.
        """

        super().check_result()

        if 'rw' not in self.fio_opts or \
                not self.passed or \
                'json' not in self.fio_opts['output-format']:
            return

        job = self.json_data['jobs'][0]
        rw = self.fio_opts['rw']

        if rw in ['read', 'randread']:
            self.passed = self.check_all_ddirs(['read'], job)
        elif rw in ['write', 'randwrite']:
            # A verify phase reads the data back, so reads are expected too.
            if 'verify' not in self.fio_opts:
                self.passed = self.check_all_ddirs(['write'], job)
            else:
                self.passed = self.check_all_ddirs(['read', 'write'], job)
        elif rw in ['trim', 'randtrim']:
            self.passed = self.check_all_ddirs(['trim'], job)
        elif rw in ['readwrite', 'randrw']:
            self.passed = self.check_all_ddirs(['read', 'write'], job)
        elif rw in ['trimwrite', 'randtrimwrite']:
            self.passed = self.check_all_ddirs(['trim', 'write'], job)
        else:
            logging.error("Unhandled rw value %s", rw)
            self.passed = False

        if 'iodepth' in self.fio_opts:
            # We will need to figure something out if any test uses an iodepth
            # different from 8
            if job['iodepth_level']['8'] < 95:
                logging.error("Did not achieve requested iodepth")
                self.passed = False
            else:
                logging.debug("iodepth 8 target met %s", job['iodepth_level']['8'])
+ """ + + def get_bs(self): + """Calculate block size and determine whether bs will be an average or exact.""" + + if 'bs' in self.fio_opts: + exact_size = True + bs = self.fio_opts['bs'] + elif 'bssplit' in self.fio_opts: + exact_size = False + bs = 0 + total = 0 + for split in self.fio_opts['bssplit'].split(':'): + [blocksize, share] = split.split('/') + total += int(share) + bs += int(blocksize) * int(share) / 100 + if total != 100: + logging.error("bssplit '%s' total percentage is not 100", self.fio_opts['bssplit']) + self.passed = False + else: + logging.debug("bssplit: average block size is %d", int(bs)) + # The only check we do here for bssplit is to calculate an average + # blocksize and see if the IOPS and bw are consistent + elif 'bsrange' in self.fio_opts: + exact_size = False + [minbs, maxbs] = self.fio_opts['bsrange'].split('-') + minbs = int(minbs) + maxbs = int(maxbs) + bs = int((minbs + maxbs) / 2) + logging.debug("bsrange: average block size is %d", int(bs)) + # The only check we do here for bsrange is to calculate an average + # blocksize and see if the IOPS and bw are consistent + else: + exact_size = True + bs = 4096 + + return bs, exact_size + + + def check_result(self): + """ + Make sure that the number of IO requests is consistent with the + blocksize and num_range values. In other words, if the blocksize is + 4KiB and num_range is 2, we should have 128 IO requests to trim 1MiB. 
+ """ + # TODO Enable debug output to check the actual offsets + + super().check_result() + + if not self.passed or 'json' not in self.fio_opts['output-format']: + return + + job = self.json_data['jobs'][0]['trim'] + bs, exact_size = self.get_bs() + + # make sure bw and IOPS are consistent + bw = job['bw_bytes'] + iops = job['iops'] + runtime = job['runtime'] + + calculated = int(bw*runtime/1000) + expected = job['io_bytes'] + if abs(calculated - expected) / expected > 0.05: + logging.error("Total bytes %d from bw does not match reported total bytes %d", + calculated, expected) + self.passed = False + else: + logging.debug("Total bytes %d from bw matches reported total bytes %d", calculated, + expected) + + calculated = int(iops*runtime/1000*bs*self.fio_opts['num_range']) + if abs(calculated - expected) / expected > 0.05: + logging.error("Total bytes %d from IOPS does not match reported total bytes %d", + calculated, expected) + self.passed = False + else: + logging.debug("Total bytes %d from IOPS matches reported total bytes %d", calculated, + expected) + + if 'size' in self.fio_opts: + io_count = self.fio_opts['size'] / self.fio_opts['num_range'] / bs + if exact_size: + delta = 0.1 + else: + delta = 0.05*job['total_ios'] + + if abs(job['total_ios'] - io_count) > delta: + logging.error("Expected numbers of IOs %d does not match actual value %d", + io_count, job['total_ios']) + self.passed = False + else: + logging.debug("Expected numbers of IOs %d matches actual value %d", io_count, + job['total_ios']) + + if 'rate' in self.fio_opts: + if abs(bw - self.fio_opts['rate']) / self.fio_opts['rate'] > 0.05: + logging.error("Actual rate %f does not match expected rate %f", bw, + self.fio_opts['rate']) + self.passed = False + else: + logging.debug("Actual rate %f matches expeected rate %f", bw, self.fio_opts['rate']) + + + +TEST_LIST = [ + # The group of tests below checks existing use cases to make sure there are + # no regressions. 
+ { + "test_id": 1, + "fio_opts": { + "rw": 'trim', + "time_based": 1, + "runtime": 3, + "output-format": "json", + }, + "test_class": TrimTest, + }, + { + "test_id": 2, + "fio_opts": { + "rw": 'randtrim', + "time_based": 1, + "runtime": 3, + "output-format": "json", + }, + "test_class": TrimTest, + }, + { + "test_id": 3, + "fio_opts": { + "rw": 'trim', + "time_based": 1, + "runtime": 3, + "iodepth": 8, + "iodepth_batch": 4, + "iodepth_batch_complete": 4, + "output-format": "json", + }, + "test_class": TrimTest, + }, + { + "test_id": 4, + "fio_opts": { + "rw": 'randtrim', + "time_based": 1, + "runtime": 3, + "iodepth": 8, + "iodepth_batch": 4, + "iodepth_batch_complete": 4, + "output-format": "json", + }, + "test_class": TrimTest, + }, + { + "test_id": 5, + "fio_opts": { + "rw": 'trimwrite', + "time_based": 1, + "runtime": 3, + "output-format": "json", + }, + "test_class": TrimTest, + }, + { + "test_id": 6, + "fio_opts": { + "rw": 'randtrimwrite', + "time_based": 1, + "runtime": 3, + "output-format": "json", + }, + "test_class": TrimTest, + }, + { + "test_id": 7, + "fio_opts": { + "rw": 'randtrim', + "time_based": 1, + "runtime": 3, + "fixedbufs": 0, + "nonvectored": 1, + "force_async": 1, + "registerfiles": 1, + "sqthread_poll": 1, + "fixedbuffs": 1, + "output-format": "json", + }, + "test_class": TrimTest, + }, + # The group of tests below try out the new functionality + { + "test_id": 100, + "fio_opts": { + "rw": 'trim', + "num_range": 2, + "size": 16*1024*1024, + "output-format": "json", + }, + "test_class": RangeTrimTest, + }, + { + "test_id": 101, + "fio_opts": { + "rw": 'randtrim', + "num_range": 2, + "size": 16*1024*1024, + "output-format": "json", + }, + "test_class": RangeTrimTest, + }, + { + "test_id": 102, + "fio_opts": { + "rw": 'randtrim', + "num_range": 256, + "size": 64*1024*1024, + "output-format": "json", + }, + "test_class": RangeTrimTest, + }, + { + "test_id": 103, + "fio_opts": { + "rw": 'trim', + "num_range": 2, + "bs": 16*1024, + "size": 
32*1024*1024, + "output-format": "json", + }, + "test_class": RangeTrimTest, + }, + { + "test_id": 104, + "fio_opts": { + "rw": 'randtrim', + "num_range": 2, + "bs": 16*1024, + "size": 32*1024*1024, + "output-format": "json", + }, + "test_class": RangeTrimTest, + }, + { + "test_id": 105, + "fio_opts": { + "rw": 'randtrim', + "num_range": 2, + "bssplit": "4096/50:16384/50", + "size": 80*1024*1024, + "output-format": "json", + "randrepeat": 0, + }, + "test_class": RangeTrimTest, + }, + { + "test_id": 106, + "fio_opts": { + "rw": 'randtrim', + "num_range": 4, + "bssplit": "4096/25:8192/25:12288/25:16384/25", + "size": 80*1024*1024, + "output-format": "json", + "randrepeat": 0, + }, + "test_class": RangeTrimTest, + }, + { + "test_id": 107, + "fio_opts": { + "rw": 'randtrim', + "num_range": 4, + "bssplit": "4096/20:8192/20:12288/20:16384/20:20480/20", + "size": 72*1024*1024, + "output-format": "json", + "randrepeat": 0, + }, + "test_class": RangeTrimTest, + }, + { + "test_id": 108, + "fio_opts": { + "rw": 'randtrim', + "num_range": 2, + "bsrange": "4096-16384", + "size": 80*1024*1024, + "output-format": "json", + "randrepeat": 0, + }, + "test_class": RangeTrimTest, + }, + { + "test_id": 109, + "fio_opts": { + "rw": 'randtrim', + "num_range": 4, + "bsrange": "4096-20480", + "size": 72*1024*1024, + "output-format": "json", + "randrepeat": 0, + }, + "test_class": RangeTrimTest, + }, + { + "test_id": 110, + "fio_opts": { + "rw": 'randtrim', + "time_based": 1, + "runtime": 10, + "rate": 1024*1024, + "num_range": 2, + "output-format": "json", + }, + "test_class": RangeTrimTest, + }, + # All of the tests below should fail + # TODO check the error messages resulting from the jobs below + { + "test_id": 200, + "fio_opts": { + "rw": 'randtrimwrite', + "time_based": 1, + "runtime": 10, + "rate": 1024*1024, + "num_range": 2, + "output-format": "normal", + }, + "test_class": RangeTrimTest, + "success": SUCCESS_NONZERO, + }, + { + "test_id": 201, + "fio_opts": { + "rw": 'trimwrite', 
+ "time_based": 1, + "runtime": 10, + "rate": 1024*1024, + "num_range": 2, + "output-format": "normal", + }, + "test_class": RangeTrimTest, + "success": SUCCESS_NONZERO, + }, + { + "test_id": 202, + "fio_opts": { + "rw": 'trim', + "time_based": 1, + "runtime": 10, + "num_range": 257, + "output-format": "normal", + }, + "test_class": RangeTrimTest, + "success": SUCCESS_NONZERO, + }, + # The sequence of jobs below constitute a single test with multiple steps + # - write a data pattern + # - verify the data pattern + # - trim the first half of the LBA space + # - verify that the trim'd LBA space no longer returns the original data pattern + # - verify that the remaining LBA space has the expected pattern + { + "test_id": 300, + "fio_opts": { + "rw": 'write', + "output-format": 'json', + "buffer_pattern": 0x0f, + "size": 256*1024*1024, + "bs": 256*1024, + }, + "test_class": TrimTest, + }, + { + "test_id": 301, + "fio_opts": { + "rw": 'read', + "output-format": 'json', + "verify_pattern": 0x0f, + "verify": "pattern", + "size": 256*1024*1024, + "bs": 256*1024, + }, + "test_class": TrimTest, + }, + { + "test_id": 302, + "fio_opts": { + "rw": 'randtrim', + "num_range": 8, + "output-format": 'json', + "size": 128*1024*1024, + "bs": 256*1024, + }, + "test_class": TrimTest, + }, + # The identify namespace data structure has a DLFEAT field which specifies + # what happens when reading data from deallocated blocks. There are three + # options: + # - read behavior not reported + # - deallocated logical block returns all bytes 0x0 + # - deallocated logical block returns all bytes 0xff + # The test below merely checks that the original data pattern is not returned. 
+ # Source: Figure 97 from + # https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-1.0c-2022.10.03-Ratified.pdf + { + "test_id": 303, + "fio_opts": { + "rw": 'read', + "output-format": 'json', + "verify_pattern": 0x0f, + "verify": "pattern", + "size": 128*1024*1024, + "bs": 256*1024, + }, + "test_class": TrimTest, + "success": SUCCESS_NONZERO, + }, + { + "test_id": 304, + "fio_opts": { + "rw": 'read', + "output-format": 'json', + "verify_pattern": 0x0f, + "verify": "pattern", + "offset": 128*1024*1024, + "size": 128*1024*1024, + "bs": 256*1024, + }, + "test_class": TrimTest, + }, +] + +def parse_args(): + """Parse command-line arguments.""" + + parser = argparse.ArgumentParser() + parser.add_argument('-d', '--debug', help='Enable debug messages', action='store_true') + parser.add_argument('-f', '--fio', help='path to file executable (e.g., ./fio)') + parser.add_argument('-a', '--artifact-root', help='artifact root directory') + parser.add_argument('-s', '--skip', nargs='+', type=int, + help='list of test(s) to skip') + parser.add_argument('-o', '--run-only', nargs='+', type=int, + help='list of test(s) to run, skipping all others') + parser.add_argument('--dut', help='target NVMe character device to test ' + '(e.g., /dev/ng0n1). 
WARNING: THIS IS A DESTRUCTIVE TEST', required=True) + args = parser.parse_args() + + return args + + +def main(): + """Run tests using fio's io_uring_cmd ioengine to send NVMe pass through commands.""" + + args = parse_args() + + if args.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + artifact_root = args.artifact_root if args.artifact_root else \ + f"nvmept-trim-test-{time.strftime('%Y%m%d-%H%M%S')}" + os.mkdir(artifact_root) + print(f"Artifact directory is {artifact_root}") + + if args.fio: + fio_path = str(Path(args.fio).absolute()) + else: + fio_path = 'fio' + print(f"fio path is {fio_path}") + + for test in TEST_LIST: + test['fio_opts']['filename'] = args.dut + + test_env = { + 'fio_path': fio_path, + 'fio_root': str(Path(__file__).absolute().parent.parent), + 'artifact_root': artifact_root, + 'basename': 'nvmept-trim', + } + + _, failed, _ = run_fio_tests(TEST_LIST, test_env, args) + sys.exit(failed) + + +if __name__ == '__main__': + main() diff --git a/t/one-core-peak.sh b/t/one-core-peak.sh index 9da8304e7d..3ac119f6e0 100755 --- a/t/one-core-peak.sh +++ b/t/one-core-peak.sh @@ -33,8 +33,8 @@ check_binary() { # Ensure the binaries are present and executable for bin in "$@"; do if [ ! -x ${bin} ]; then - which ${bin} >/dev/null - [ $? -eq 0 ] || fatal "${bin} doesn't exists or is not executable" + command -v ${bin} >/dev/null + [ $? -eq 0 ] || fatal "${bin} doesn't exist or is not executable" fi done } @@ -197,7 +197,7 @@ show_nvme() { fw=$(cat ${device_dir}/firmware_rev | xargs) #xargs for trimming spaces serial=$(cat ${device_dir}/serial | xargs) #xargs for trimming spaces info ${device_name} "MODEL=${model} FW=${fw} serial=${serial} PCI=${pci_addr}@${link_speed} IRQ=${irq} NUMA=${numa} CPUS=${cpus} " - which nvme &> /dev/null + command -v nvme > /dev/null if [ $? 
-eq 0 ]; then status="" NCQA=$(nvme get-feature -H -f 0x7 ${device} 2>&1 |grep NCQA |cut -d ':' -f 2 | xargs) diff --git a/t/random_seed.py b/t/random_seed.py new file mode 100755 index 0000000000..82beca65b5 --- /dev/null +++ b/t/random_seed.py @@ -0,0 +1,348 @@ +#!/usr/bin/env python3 +""" +# random_seed.py +# +# Test fio's random seed options. +# +# - make sure that randseed overrides randrepeat and allrandrepeat +# - make sure that seeds differ across invocations when [all]randrepeat=0 and randseed is not set +# - make sure that seeds are always the same when [all]randrepeat=1 and randseed is not set +# +# USAGE +# see python3 random_seed.py --help +# +# EXAMPLES +# python3 t/random_seed.py +# python3 t/random_seed.py -f ./fio +# +# REQUIREMENTS +# Python 3.6 +# +""" +import os +import sys +import time +import locale +import logging +import argparse +from pathlib import Path +from fiotestlib import FioJobCmdTest, run_fio_tests + +class FioRandTest(FioJobCmdTest): + """fio random seed test.""" + + def setup(self, parameters): + """Setup the test.""" + + fio_args = [ + "--debug=random", + "--name=random_seed", + "--ioengine=null", + "--filesize=32k", + "--rw=randread", + f"--output={self.filenames['output']}", + ] + for opt in ['randseed', 'randrepeat', 'allrandrepeat']: + if opt in self.fio_opts: + option = f"--{opt}={self.fio_opts[opt]}" + fio_args.append(option) + + super().setup(fio_args) + + def get_rand_seeds(self): + """Collect random seeds from --debug=random output.""" + with open(self.filenames['output'], "r", + encoding=locale.getpreferredencoding()) as out_file: + file_data = out_file.read() + + offsets = 0 + for line in file_data.split('\n'): + if 'random' in line and 'FIO_RAND_NR_OFFS=' in line: + tokens = line.split('=') + offsets = int(tokens[len(tokens)-1]) + break + + if offsets == 0: + pass + # find an exception to throw + + seed_list = [] + for line in file_data.split('\n'): + if 'random' not in line: + continue + if 'rand_seeds[' in line: + 
tokens = line.split('=') + seed = int(tokens[-1]) + seed_list.append(seed) + # assume that seeds are in order + + return seed_list + + +class TestRR(FioRandTest): + """ + Test object for [all]randrepeat. If run for the first time just collect the + seeds. For later runs make sure the seeds match or do not match those + previously collected. + """ + # one set of seeds is for randrepeat=0 and the other is for randrepeat=1 + seeds = { 0: None, 1: None } + + def check_result(self): + """Check output for allrandrepeat=1.""" + + super().check_result() + if not self.passed: + return + + opt = 'randrepeat' if 'randrepeat' in self.fio_opts else 'allrandrepeat' + rr = self.fio_opts[opt] + rand_seeds = self.get_rand_seeds() + + if not TestRR.seeds[rr]: + TestRR.seeds[rr] = rand_seeds + logging.debug("TestRR: saving rand_seeds for [a]rr=%d", rr) + else: + if rr: + if TestRR.seeds[1] != rand_seeds: + self.passed = False + print(f"TestRR: unexpected seed mismatch for [a]rr={rr}") + else: + logging.debug("TestRR: seeds correctly match for [a]rr=%d", rr) + if TestRR.seeds[0] == rand_seeds: + self.passed = False + print("TestRR: seeds unexpectedly match those from system RNG") + else: + if TestRR.seeds[0] == rand_seeds: + self.passed = False + print(f"TestRR: unexpected seed match for [a]rr={rr}") + else: + logging.debug("TestRR: seeds correctly don't match for [a]rr=%d", rr) + if TestRR.seeds[1] == rand_seeds: + self.passed = False + print("TestRR: random seeds unexpectedly match those from [a]rr=1") + + +class TestRS(FioRandTest): + """ + Test object when randseed=something controls the generated seeds. If run + for the first time for a given randseed just collect the seeds. For later + runs with the same seed make sure the seeds are the same as those + previously collected. 
+ """ + seeds = {} + + def check_result(self): + """Check output for randseed=something.""" + + super().check_result() + if not self.passed: + return + + rand_seeds = self.get_rand_seeds() + randseed = self.fio_opts['randseed'] + + logging.debug("randseed = %s", randseed) + + if randseed not in TestRS.seeds: + TestRS.seeds[randseed] = rand_seeds + logging.debug("TestRS: saving rand_seeds") + else: + if TestRS.seeds[randseed] != rand_seeds: + self.passed = False + print("TestRS: seeds don't match when they should") + else: + logging.debug("TestRS: seeds correctly match") + + # Now try to find seeds generated using a different randseed and make + # sure they *don't* match + for key, value in TestRS.seeds.items(): + if key != randseed: + if value == rand_seeds: + self.passed = False + print("TestRS: randseeds differ but generated seeds match.") + else: + logging.debug("TestRS: randseeds differ and generated seeds also differ.") + + +def parse_args(): + """Parse command-line arguments.""" + + parser = argparse.ArgumentParser() + parser.add_argument('-f', '--fio', help='path to file executable (e.g., ./fio)') + parser.add_argument('-a', '--artifact-root', help='artifact root directory') + parser.add_argument('-d', '--debug', help='enable debug output', action='store_true') + parser.add_argument('-s', '--skip', nargs='+', type=int, + help='list of test(s) to skip') + parser.add_argument('-o', '--run-only', nargs='+', type=int, + help='list of test(s) to run, skipping all others') + args = parser.parse_args() + + return args + + +def main(): + """Run tests of fio random seed options""" + + args = parse_args() + + if args.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + artifact_root = args.artifact_root if args.artifact_root else \ + f"random-seed-test-{time.strftime('%Y%m%d-%H%M%S')}" + os.mkdir(artifact_root) + print(f"Artifact directory is {artifact_root}") + + if args.fio: + fio_path = 
str(Path(args.fio).absolute()) + else: + fio_path = 'fio' + print(f"fio path is {fio_path}") + + test_list = [ + { + "test_id": 1, + "fio_opts": { + "randrepeat": 0, + }, + "test_class": TestRR, + }, + { + "test_id": 2, + "fio_opts": { + "randrepeat": 0, + }, + "test_class": TestRR, + }, + { + "test_id": 3, + "fio_opts": { + "randrepeat": 1, + }, + "test_class": TestRR, + }, + { + "test_id": 4, + "fio_opts": { + "randrepeat": 1, + }, + "test_class": TestRR, + }, + { + "test_id": 5, + "fio_opts": { + "allrandrepeat": 0, + }, + "test_class": TestRR, + }, + { + "test_id": 6, + "fio_opts": { + "allrandrepeat": 0, + }, + "test_class": TestRR, + }, + { + "test_id": 7, + "fio_opts": { + "allrandrepeat": 1, + }, + "test_class": TestRR, + }, + { + "test_id": 8, + "fio_opts": { + "allrandrepeat": 1, + }, + "test_class": TestRR, + }, + { + "test_id": 9, + "fio_opts": { + "randrepeat": 0, + "randseed": "12345", + }, + "test_class": TestRS, + }, + { + "test_id": 10, + "fio_opts": { + "randrepeat": 0, + "randseed": "12345", + }, + "test_class": TestRS, + }, + { + "test_id": 11, + "fio_opts": { + "randrepeat": 1, + "randseed": "12345", + }, + "test_class": TestRS, + }, + { + "test_id": 12, + "fio_opts": { + "allrandrepeat": 0, + "randseed": "12345", + }, + "test_class": TestRS, + }, + { + "test_id": 13, + "fio_opts": { + "allrandrepeat": 1, + "randseed": "12345", + }, + "test_class": TestRS, + }, + { + "test_id": 14, + "fio_opts": { + "randrepeat": 0, + "randseed": "67890", + }, + "test_class": TestRS, + }, + { + "test_id": 15, + "fio_opts": { + "randrepeat": 1, + "randseed": "67890", + }, + "test_class": TestRS, + }, + { + "test_id": 16, + "fio_opts": { + "allrandrepeat": 0, + "randseed": "67890", + }, + "test_class": TestRS, + }, + { + "test_id": 17, + "fio_opts": { + "allrandrepeat": 1, + "randseed": "67890", + }, + "test_class": TestRS, + }, + ] + + test_env = { + 'fio_path': fio_path, + 'fio_root': str(Path(__file__).absolute().parent.parent), + 'artifact_root': 
artifact_root, + 'basename': 'random', + } + + _, failed, _ = run_fio_tests(test_list, test_env, args) + sys.exit(failed) + + +if __name__ == '__main__': + main() diff --git a/t/read-to-pipe-async.c b/t/read-to-pipe-async.c index 586e3c95bf..2abe25d3d8 100644 --- a/t/read-to-pipe-async.c +++ b/t/read-to-pipe-async.c @@ -35,6 +35,9 @@ #include #include "../flist.h" +#include "../log.h" + +#include "compiler/compiler.h" static int bs = 4096; static int max_us = 10000; @@ -47,6 +50,18 @@ static int separate_writer = 1; #define PLAT_NR (PLAT_GROUP_NR * PLAT_VAL) #define PLAT_LIST_MAX 20 +#ifndef NDEBUG +#define CHECK_ZERO_OR_ABORT(code) assert(code) +#else +#define CHECK_ZERO_OR_ABORT(code) \ + do { \ + if (fio_unlikely((code) != 0)) { \ + log_err("failed checking code %i != 0", (code)); \ + abort(); \ + } \ + } while (0) +#endif + struct stats { unsigned int plat[PLAT_NR]; unsigned int nr_samples; @@ -121,7 +136,7 @@ uint64_t utime_since(const struct timespec *s, const struct timespec *e) return ret; } -static struct work_item *find_seq(struct writer_thread *w, unsigned int seq) +static struct work_item *find_seq(struct writer_thread *w, int seq) { struct work_item *work; struct flist_head *entry; @@ -224,6 +239,8 @@ static int write_work(struct work_item *work) clock_gettime(CLOCK_MONOTONIC, &s); ret = write(STDOUT_FILENO, work->buf, work->buf_size); + if (ret < 0) + return (int)ret; clock_gettime(CLOCK_MONOTONIC, &e); assert(ret == work->buf_size); @@ -241,16 +258,14 @@ static void *writer_fn(void *data) { struct writer_thread *wt = data; struct work_item *work; - unsigned int seq = 1; + int seq = 1; work = NULL; - while (!wt->thread.exit || !flist_empty(&wt->list)) { + while (!(seq < 0) && (!wt->thread.exit || !flist_empty(&wt->list))) { pthread_mutex_lock(&wt->thread.lock); - if (work) { + if (work) flist_add_tail(&work->list, &wt->done_list); - work = NULL; - } work = find_seq(wt, seq); if (work) @@ -469,10 +484,10 @@ static void init_thread(struct thread_data 
*thread) int ret; ret = pthread_condattr_init(&cattr); - assert(ret == 0); + CHECK_ZERO_OR_ABORT(ret); #ifdef CONFIG_PTHREAD_CONDATTR_SETCLOCK ret = pthread_condattr_setclock(&cattr, CLOCK_MONOTONIC); - assert(ret == 0); + CHECK_ZERO_OR_ABORT(ret); #endif pthread_cond_init(&thread->cond, &cattr); pthread_cond_init(&thread->done_cond, &cattr); @@ -626,10 +641,10 @@ int main(int argc, char *argv[]) bytes = 0; ret = pthread_condattr_init(&cattr); - assert(ret == 0); + CHECK_ZERO_OR_ABORT(ret); #ifdef CONFIG_PTHREAD_CONDATTR_SETCLOCK ret = pthread_condattr_setclock(&cattr, CLOCK_MONOTONIC); - assert(ret == 0); + CHECK_ZERO_OR_ABORT(ret); #endif clock_gettime(CLOCK_MONOTONIC, &s); diff --git a/t/readonly.py b/t/readonly.py index 464847c603..d36faafa7c 100755 --- a/t/readonly.py +++ b/t/readonly.py @@ -2,11 +2,11 @@ # SPDX-License-Identifier: GPL-2.0-only # # Copyright (c) 2019 Western Digital Corporation or its affiliates. -# -# + +""" # readonly.py # -# Do some basic tests of the --readonly paramter +# Do some basic tests of the --readonly parameter # # USAGE # python readonly.py [-f fio-executable] @@ -18,122 +18,144 @@ # REQUIREMENTS # Python 3.5+ # -# +""" +import os import sys +import time import argparse -import subprocess +from pathlib import Path +from fiotestlib import FioJobCmdTest, run_fio_tests +from fiotestcommon import SUCCESS_DEFAULT, SUCCESS_NONZERO + + +class FioReadOnlyTest(FioJobCmdTest): + """fio read only test.""" + + def setup(self, parameters): + """Setup the test.""" + + fio_args = [ + "--name=readonly", + "--ioengine=null", + "--time_based", + "--runtime=1s", + "--size=1M", + f"--rw={self.fio_opts['rw']}", + ] + if 'readonly-pre' in parameters: + fio_args.insert(0, "--readonly") + if 'readonly-post' in parameters: + fio_args.append("--readonly") + + super().setup(fio_args) + + +TEST_LIST = [ + { + "test_id": 1, + "fio_opts": { "rw": "randread", }, + "readonly-pre": 1, + "success": SUCCESS_DEFAULT, + "test_class": FioReadOnlyTest, + }, + { + 
"test_id": 2, + "fio_opts": { "rw": "randwrite", }, + "readonly-pre": 1, + "success": SUCCESS_NONZERO, + "test_class": FioReadOnlyTest, + }, + { + "test_id": 3, + "fio_opts": { "rw": "randtrim", }, + "readonly-pre": 1, + "success": SUCCESS_NONZERO, + "test_class": FioReadOnlyTest, + }, + { + "test_id": 4, + "fio_opts": { "rw": "randread", }, + "readonly-post": 1, + "success": SUCCESS_DEFAULT, + "test_class": FioReadOnlyTest, + }, + { + "test_id": 5, + "fio_opts": { "rw": "randwrite", }, + "readonly-post": 1, + "success": SUCCESS_NONZERO, + "test_class": FioReadOnlyTest, + }, + { + "test_id": 6, + "fio_opts": { "rw": "randtrim", }, + "readonly-post": 1, + "success": SUCCESS_NONZERO, + "test_class": FioReadOnlyTest, + }, + { + "test_id": 7, + "fio_opts": { "rw": "randread", }, + "success": SUCCESS_DEFAULT, + "test_class": FioReadOnlyTest, + }, + { + "test_id": 8, + "fio_opts": { "rw": "randwrite", }, + "success": SUCCESS_DEFAULT, + "test_class": FioReadOnlyTest, + }, + { + "test_id": 9, + "fio_opts": { "rw": "randtrim", }, + "success": SUCCESS_DEFAULT, + "test_class": FioReadOnlyTest, + }, + ] def parse_args(): + """Parse command-line arguments.""" + parser = argparse.ArgumentParser() - parser.add_argument('-f', '--fio', - help='path to fio executable (e.g., ./fio)') + parser.add_argument('-f', '--fio', help='path to fio executable (e.g., ./fio)') + parser.add_argument('-a', '--artifact-root', help='artifact root directory') + parser.add_argument('-s', '--skip', nargs='+', type=int, + help='list of test(s) to skip') + parser.add_argument('-o', '--run-only', nargs='+', type=int, + help='list of test(s) to run, skipping all others') args = parser.parse_args() return args -def run_fio(fio, test, index): - fio_args = [ - "--max-jobs=16", - "--name=readonly", - "--ioengine=null", - "--time_based", - "--runtime=1s", - "--size=1M", - "--rw={rw}".format(**test), - ] - if 'readonly-pre' in test: - fio_args.insert(0, "--readonly") - if 'readonly-post' in test: - 
fio_args.append("--readonly") - - output = subprocess.run([fio] + fio_args, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - - return output - - -def check_output(output, test): - expect_error = False - if 'readonly-pre' in test or 'readonly-post' in test: - if 'write' in test['rw'] or 'trim' in test['rw']: - expect_error = True - -# print(output.stdout) -# print(output.stderr) - - if output.returncode == 0: - if expect_error: - return False - else: - return True - else: - if expect_error: - return True - else: - return False - +def main(): + """Run readonly tests.""" -if __name__ == '__main__': args = parse_args() - tests = [ - { - "rw": "randread", - "readonly-pre": 1, - }, - { - "rw": "randwrite", - "readonly-pre": 1, - }, - { - "rw": "randtrim", - "readonly-pre": 1, - }, - { - "rw": "randread", - "readonly-post": 1, - }, - { - "rw": "randwrite", - "readonly-post": 1, - }, - { - "rw": "randtrim", - "readonly-post": 1, - }, - { - "rw": "randread", - }, - { - "rw": "randwrite", - }, - { - "rw": "randtrim", - }, - ] - - index = 1 - passed = 0 - failed = 0 - if args.fio: - fio_path = args.fio + fio_path = str(Path(args.fio).absolute()) else: fio_path = 'fio' + print(f"fio path is {fio_path}") - for test in tests: - output = run_fio(fio_path, test, index) - status = check_output(output, test) - print("Test {0} {1}".format(index, ("PASSED" if status else "FAILED"))) - if status: - passed = passed + 1 - else: - failed = failed + 1 - index = index + 1 + artifact_root = args.artifact_root if args.artifact_root else \ + f"readonly-test-{time.strftime('%Y%m%d-%H%M%S')}" + os.mkdir(artifact_root) + print(f"Artifact directory is {artifact_root}") - print("{0} tests passed, {1} failed".format(passed, failed)) + test_env = { + 'fio_path': fio_path, + 'fio_root': str(Path(__file__).absolute().parent.parent), + 'artifact_root': artifact_root, + 'basename': 'readonly', + } + _, failed, _ = run_fio_tests(TEST_LIST, test_env, args) sys.exit(failed) + + +if __name__ == 
'__main__': + main() diff --git a/t/run-fio-tests.py b/t/run-fio-tests.py index 612e50ca6a..b33213128a 100755 --- a/t/run-fio-tests.py +++ b/t/run-fio-tests.py @@ -14,7 +14,7 @@ # # # EXAMPLE -# # git clone git://git.kernel.dk/fio.git +# # git clone https://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio # # cd fio # # make -j # # python3 t/run-fio-tests.py @@ -43,315 +43,41 @@ import os import sys -import json import time import shutil import logging import argparse -import platform -import traceback -import subprocess -import multiprocessing +import re from pathlib import Path +from statsmodels.sandbox.stats.runs import runstest_1samp +from fiotestlib import FioExeTest, FioJobFileTest, run_fio_tests +from fiotestcommon import * -class FioTest(object): - """Base for all fio tests.""" - - def __init__(self, exe_path, parameters, success): - self.exe_path = exe_path - self.parameters = parameters - self.success = success - self.output = {} - self.artifact_root = None - self.testnum = None - self.test_dir = None - self.passed = True - self.failure_reason = '' - self.command_file = None - self.stdout_file = None - self.stderr_file = None - self.exitcode_file = None - - def setup(self, artifact_root, testnum): - """Setup instance variables for test.""" - - self.artifact_root = artifact_root - self.testnum = testnum - self.test_dir = os.path.join(artifact_root, "{:04d}".format(testnum)) - if not os.path.exists(self.test_dir): - os.mkdir(self.test_dir) - - self.command_file = os.path.join( - self.test_dir, - "{0}.command".format(os.path.basename(self.exe_path))) - self.stdout_file = os.path.join( - self.test_dir, - "{0}.stdout".format(os.path.basename(self.exe_path))) - self.stderr_file = os.path.join( - self.test_dir, - "{0}.stderr".format(os.path.basename(self.exe_path))) - self.exitcode_file = os.path.join( - self.test_dir, - "{0}.exitcode".format(os.path.basename(self.exe_path))) - - def run(self): - """Run the test.""" - - raise NotImplementedError() - - def 
check_result(self): - """Check test results.""" - - raise NotImplementedError() - - -class FioExeTest(FioTest): - """Test consists of an executable binary or script""" - - def __init__(self, exe_path, parameters, success): - """Construct a FioExeTest which is a FioTest consisting of an - executable binary or script. - - exe_path: location of executable binary or script - parameters: list of parameters for executable - success: Definition of test success - """ - - FioTest.__init__(self, exe_path, parameters, success) - - def run(self): - """Execute the binary or script described by this instance.""" - - command = [self.exe_path] + self.parameters - command_file = open(self.command_file, "w+") - command_file.write("%s\n" % command) - command_file.close() - - stdout_file = open(self.stdout_file, "w+") - stderr_file = open(self.stderr_file, "w+") - exitcode_file = open(self.exitcode_file, "w+") - try: - proc = None - # Avoid using subprocess.run() here because when a timeout occurs, - # fio will be stopped with SIGKILL. This does not give fio a - # chance to clean up and means that child processes may continue - # running and submitting IO. 
- proc = subprocess.Popen(command, - stdout=stdout_file, - stderr=stderr_file, - cwd=self.test_dir, - universal_newlines=True) - proc.communicate(timeout=self.success['timeout']) - exitcode_file.write('{0}\n'.format(proc.returncode)) - logging.debug("Test %d: return code: %d", self.testnum, proc.returncode) - self.output['proc'] = proc - except subprocess.TimeoutExpired: - proc.terminate() - proc.communicate() - assert proc.poll() - self.output['failure'] = 'timeout' - except Exception: - if proc: - if not proc.poll(): - proc.terminate() - proc.communicate() - self.output['failure'] = 'exception' - self.output['exc_info'] = sys.exc_info() - finally: - stdout_file.close() - stderr_file.close() - exitcode_file.close() - - def check_result(self): - """Check results of test run.""" - - if 'proc' not in self.output: - if self.output['failure'] == 'timeout': - self.failure_reason = "{0} timeout,".format(self.failure_reason) - else: - assert self.output['failure'] == 'exception' - self.failure_reason = '{0} exception: {1}, {2}'.format( - self.failure_reason, self.output['exc_info'][0], - self.output['exc_info'][1]) - - self.passed = False - return - - if 'zero_return' in self.success: - if self.success['zero_return']: - if self.output['proc'].returncode != 0: - self.passed = False - self.failure_reason = "{0} non-zero return code,".format(self.failure_reason) - else: - if self.output['proc'].returncode == 0: - self.failure_reason = "{0} zero return code,".format(self.failure_reason) - self.passed = False - - stderr_size = os.path.getsize(self.stderr_file) - if 'stderr_empty' in self.success: - if self.success['stderr_empty']: - if stderr_size != 0: - self.failure_reason = "{0} stderr not empty,".format(self.failure_reason) - self.passed = False - else: - if stderr_size == 0: - self.failure_reason = "{0} stderr empty,".format(self.failure_reason) - self.passed = False - - -class FioJobTest(FioExeTest): - """Test consists of a fio job""" - - def __init__(self, fio_path, 
fio_job, success, fio_pre_job=None, - fio_pre_success=None, output_format="normal"): - """Construct a FioJobTest which is a FioExeTest consisting of a - single fio job file with an optional setup step. - - fio_path: location of fio executable - fio_job: location of fio job file - success: Definition of test success - fio_pre_job: fio job for preconditioning - fio_pre_success: Definition of test success for fio precon job - output_format: normal (default), json, jsonplus, or terse - """ - - self.fio_job = fio_job - self.fio_pre_job = fio_pre_job - self.fio_pre_success = fio_pre_success if fio_pre_success else success - self.output_format = output_format - self.precon_failed = False - self.json_data = None - self.fio_output = "{0}.output".format(os.path.basename(self.fio_job)) - self.fio_args = [ - "--max-jobs=16", - "--output-format={0}".format(self.output_format), - "--output={0}".format(self.fio_output), - self.fio_job, - ] - FioExeTest.__init__(self, fio_path, self.fio_args, success) - - def setup(self, artifact_root, testnum): - """Setup instance variables for fio job test.""" - - super(FioJobTest, self).setup(artifact_root, testnum) - - self.command_file = os.path.join( - self.test_dir, - "{0}.command".format(os.path.basename(self.fio_job))) - self.stdout_file = os.path.join( - self.test_dir, - "{0}.stdout".format(os.path.basename(self.fio_job))) - self.stderr_file = os.path.join( - self.test_dir, - "{0}.stderr".format(os.path.basename(self.fio_job))) - self.exitcode_file = os.path.join( - self.test_dir, - "{0}.exitcode".format(os.path.basename(self.fio_job))) - - def run_pre_job(self): - """Run fio job precondition step.""" - - precon = FioJobTest(self.exe_path, self.fio_pre_job, - self.fio_pre_success, - output_format=self.output_format) - precon.setup(self.artifact_root, self.testnum) - precon.run() - precon.check_result() - self.precon_failed = not precon.passed - self.failure_reason = precon.failure_reason - - def run(self): - """Run fio job test.""" - - 
if self.fio_pre_job: - self.run_pre_job() - - if not self.precon_failed: - super(FioJobTest, self).run() - else: - logging.debug("Test %d: precondition step failed", self.testnum) - - @classmethod - def get_file(cls, filename): - """Safely read a file.""" - file_data = '' - success = True - - try: - with open(filename, "r") as output_file: - file_data = output_file.read() - except OSError: - success = False - - return file_data, success - - def check_result(self): - """Check fio job results.""" - - if self.precon_failed: - self.passed = False - self.failure_reason = "{0} precondition step failed,".format(self.failure_reason) - return - - super(FioJobTest, self).check_result() - - if not self.passed: - return - - if 'json' not in self.output_format: - return - - file_data, success = self.get_file(os.path.join(self.test_dir, self.fio_output)) - if not success: - self.failure_reason = "{0} unable to open output file,".format(self.failure_reason) - self.passed = False - return - - # - # Sometimes fio informational messages are included at the top of the - # JSON output, especially under Windows. 
Try to decode output as JSON - # data, lopping off up to the first four lines - # - lines = file_data.splitlines() - for i in range(5): - file_data = '\n'.join(lines[i:]) - try: - self.json_data = json.loads(file_data) - except json.JSONDecodeError: - continue - else: - logging.debug("Test %d: skipped %d lines decoding JSON data", self.testnum, i) - return - - self.failure_reason = "{0} unable to decode JSON data,".format(self.failure_reason) - self.passed = False - - -class FioJobTest_t0005(FioJobTest): +class FioJobFileTest_t0005(FioJobFileTest): """Test consists of fio test job t0005 Confirm that read['io_kbytes'] == write['io_kbytes'] == 102400""" def check_result(self): - super(FioJobTest_t0005, self).check_result() + super().check_result() if not self.passed: return if self.json_data['jobs'][0]['read']['io_kbytes'] != 102400: - self.failure_reason = "{0} bytes read mismatch,".format(self.failure_reason) + self.failure_reason = f"{self.failure_reason} bytes read mismatch," self.passed = False if self.json_data['jobs'][0]['write']['io_kbytes'] != 102400: - self.failure_reason = "{0} bytes written mismatch,".format(self.failure_reason) + self.failure_reason = f"{self.failure_reason} bytes written mismatch," self.passed = False -class FioJobTest_t0006(FioJobTest): +class FioJobFileTest_t0006(FioJobFileTest): """Test consists of fio test job t0006 Confirm that read['io_kbytes'] ~ 2*write['io_kbytes']""" def check_result(self): - super(FioJobTest_t0006, self).check_result() + super().check_result() if not self.passed: return @@ -360,56 +86,59 @@ def check_result(self): / self.json_data['jobs'][0]['write']['io_kbytes'] logging.debug("Test %d: ratio: %f", self.testnum, ratio) if ratio < 1.99 or ratio > 2.01: - self.failure_reason = "{0} read/write ratio mismatch,".format(self.failure_reason) + self.failure_reason = f"{self.failure_reason} read/write ratio mismatch," self.passed = False -class FioJobTest_t0007(FioJobTest): +class FioJobFileTest_t0007(FioJobFileTest): 
"""Test consists of fio test job t0007 Confirm that read['io_kbytes'] = 87040""" def check_result(self): - super(FioJobTest_t0007, self).check_result() + super().check_result() if not self.passed: return if self.json_data['jobs'][0]['read']['io_kbytes'] != 87040: - self.failure_reason = "{0} bytes read mismatch,".format(self.failure_reason) + self.failure_reason = f"{self.failure_reason} bytes read mismatch," self.passed = False -class FioJobTest_t0008(FioJobTest): +class FioJobFileTest_t0008(FioJobFileTest): """Test consists of fio test job t0008 Confirm that read['io_kbytes'] = 32768 and that - write['io_kbytes'] ~ 16568 + write['io_kbytes'] ~ 16384 - I did runs with fio-ae2fafc8 and saw write['io_kbytes'] values of - 16585, 16588. With two runs of fio-3.16 I obtained 16568""" + This is a 50/50 seq read/write workload. Since fio flips a coin to + determine whether to issue a read or a write, total bytes written will not + be exactly 16384K. But total bytes read will be exactly 32768K because + reads will include the initial phase as well as the verify phase where all + the blocks originally written will be read.""" def check_result(self): - super(FioJobTest_t0008, self).check_result() + super().check_result() if not self.passed: return - ratio = self.json_data['jobs'][0]['write']['io_kbytes'] / 16568 + ratio = self.json_data['jobs'][0]['write']['io_kbytes'] / 16384 logging.debug("Test %d: ratio: %f", self.testnum, ratio) - if ratio < 0.99 or ratio > 1.01: - self.failure_reason = "{0} bytes written mismatch,".format(self.failure_reason) + if ratio < 0.97 or ratio > 1.03: + self.failure_reason = f"{self.failure_reason} bytes written mismatch," self.passed = False if self.json_data['jobs'][0]['read']['io_kbytes'] != 32768: - self.failure_reason = "{0} bytes read mismatch,".format(self.failure_reason) + self.failure_reason = f"{self.failure_reason} bytes read mismatch," self.passed = False -class FioJobTest_t0009(FioJobTest): +class 
FioJobFileTest_t0009(FioJobFileTest): """Test consists of fio test job t0009 Confirm that runtime >= 60s""" def check_result(self): - super(FioJobTest_t0009, self).check_result() + super().check_result() if not self.passed: return @@ -417,28 +146,27 @@ def check_result(self): logging.debug('Test %d: elapsed: %d', self.testnum, self.json_data['jobs'][0]['elapsed']) if self.json_data['jobs'][0]['elapsed'] < 60: - self.failure_reason = "{0} elapsed time mismatch,".format(self.failure_reason) + self.failure_reason = f"{self.failure_reason} elapsed time mismatch," self.passed = False -class FioJobTest_t0012(FioJobTest): +class FioJobFileTest_t0012(FioJobFileTest): """Test consists of fio test job t0012 Confirm ratios of job iops are 1:5:10 job1,job2,job3 respectively""" def check_result(self): - super(FioJobTest_t0012, self).check_result() + super().check_result() if not self.passed: return iops_files = [] - for i in range(1,4): - file_data, success = self.get_file(os.path.join(self.test_dir, "{0}_iops.{1}.log".format(os.path.basename(self.fio_job), i))) - - if not success: - self.failure_reason = "{0} unable to open output file,".format(self.failure_reason) - self.passed = False + for i in range(1, 4): + filename = os.path.join(self.paths['test_dir'], "{0}_iops.{1}.log".format(os.path.basename( + self.fio_job), i)) + file_data = self.get_file_fail(filename) + if not file_data: return iops_files.append(file_data.splitlines()) @@ -454,22 +182,20 @@ def check_result(self): ratio1 = iops3/iops2 ratio2 = iops3/iops1 - logging.debug( - "sample {0}: job1 iops={1} job2 iops={2} job3 iops={3} job3/job2={4:.3f} job3/job1={5:.3f}".format( - i, iops1, iops2, iops3, ratio1, ratio2 - ) - ) + logging.debug("sample {0}: job1 iops={1} job2 iops={2} job3 iops={3} " \ + "job3/job2={4:.3f} job3/job1={5:.3f}".format(i, iops1, iops2, iops3, ratio1, + ratio2)) # test job1 and job2 succeeded to recalibrate if ratio1 < 1 or ratio1 > 3 or ratio2 < 7 or ratio2 > 13: - self.failure_reason = "{0} 
iops ratio mismatch iops1={1} iops2={2} iops3={3} expected r1~2 r2~10 got r1={4:.3f} r2={5:.3f},".format( - self.failure_reason, iops1, iops2, iops3, ratio1, ratio2 - ) + self.failure_reason += " iops ratio mismatch iops1={0} iops2={1} iops3={2} " \ + "expected r1~2 r2~10 got r1={3:.3f} r2={4:.3f},".format(iops1, iops2, iops3, + ratio1, ratio2) self.passed = False return -class FioJobTest_t0014(FioJobTest): +class FioJobFileTest_t0014(FioJobFileTest): """Test consists of fio test job t0014 Confirm that job1_iops / job2_iops ~ 1:2 for entire duration and that job1_iops / job3_iops ~ 1:3 for first half of duration. @@ -478,18 +204,17 @@ class FioJobTest_t0014(FioJobTest): re-calibrate the activity dynamically""" def check_result(self): - super(FioJobTest_t0014, self).check_result() + super().check_result() if not self.passed: return iops_files = [] - for i in range(1,4): - file_data, success = self.get_file(os.path.join(self.test_dir, "{0}_iops.{1}.log".format(os.path.basename(self.fio_job), i))) - - if not success: - self.failure_reason = "{0} unable to open output file,".format(self.failure_reason) - self.passed = False + for i in range(1, 4): + filename = os.path.join(self.paths['test_dir'], "{0}_iops.{1}.log".format(os.path.basename( + self.fio_job), i)) + file_data = self.get_file_fail(filename) + if not file_data: return iops_files.append(file_data.splitlines()) @@ -507,10 +232,9 @@ def check_result(self): if ratio1 < 0.43 or ratio1 > 0.57 or ratio2 < 0.21 or ratio2 > 0.45: - self.failure_reason = "{0} iops ratio mismatch iops1={1} iops2={2} iops3={3}\ - expected r1~0.5 r2~0.33 got r1={4:.3f} r2={5:.3f},".format( - self.failure_reason, iops1, iops2, iops3, ratio1, ratio2 - ) + self.failure_reason += " iops ratio mismatch iops1={0} iops2={1} iops3={2} " \ + "expected r1~0.5 r2~0.33 got r1={3:.3f} r2={4:.3f},".format( + iops1, iops2, iops3, ratio1, ratio2) self.passed = False iops1 = iops1 + float(iops_files[0][i].split(',')[1]) @@ -518,175 +242,395 @@ def 
check_result(self): ratio1 = iops1/iops2 ratio2 = iops1/iops3 - logging.debug( - "sample {0}: job1 iops={1} job2 iops={2} job3 iops={3} job1/job2={4:.3f} job1/job3={5:.3f}".format( - i, iops1, iops2, iops3, ratio1, ratio2 - ) - ) + logging.debug("sample {0}: job1 iops={1} job2 iops={2} job3 iops={3} " \ + "job1/job2={4:.3f} job1/job3={5:.3f}".format(i, iops1, iops2, iops3, + ratio1, ratio2)) # test job1 and job2 succeeded to recalibrate if ratio1 < 0.43 or ratio1 > 0.57: - self.failure_reason = "{0} iops ratio mismatch iops1={1} iops2={2} expected ratio~0.5 got ratio={3:.3f},".format( - self.failure_reason, iops1, iops2, ratio1 - ) + self.failure_reason += " iops ratio mismatch iops1={0} iops2={1} expected ratio~0.5 " \ + "got ratio={2:.3f},".format(iops1, iops2, ratio1) self.passed = False return -class FioJobTest_iops_rate(FioJobTest): - """Test consists of fio test job t0009 +class FioJobFileTest_t0015(FioJobFileTest): + """Test consists of fio test jobs t0015 and t0016 + Confirm that mean(slat) + mean(clat) = mean(tlat)""" + + def check_result(self): + super().check_result() + + if not self.passed: + return + + slat = self.json_data['jobs'][0]['read']['slat_ns']['mean'] + clat = self.json_data['jobs'][0]['read']['clat_ns']['mean'] + tlat = self.json_data['jobs'][0]['read']['lat_ns']['mean'] + logging.debug('Test %d: slat %f, clat %f, tlat %f', self.testnum, slat, clat, tlat) + + if abs(slat + clat - tlat) > 1: + self.failure_reason = "{0} slat {1} + clat {2} = {3} != tlat {4},".format( + self.failure_reason, slat, clat, slat+clat, tlat) + self.passed = False + + +class FioJobFileTest_t0019(FioJobFileTest): + """Test consists of fio test job t0019 + Confirm that all offsets were touched sequentially""" + + def check_result(self): + super().check_result() + + bw_log_filename = os.path.join(self.paths['test_dir'], "test_bw.log") + file_data = self.get_file_fail(bw_log_filename) + if not file_data: + return + + log_lines = file_data.split('\n') + + prev = -4096 + 
for line in log_lines: + if len(line.strip()) == 0: + continue + cur = int(line.split(',')[4]) + if cur - prev != 4096: + self.passed = False + self.failure_reason = f"offsets {prev}, {cur} not sequential" + return + prev = cur + + if cur/4096 != 255: + self.passed = False + self.failure_reason = f"unexpected last offset {cur}" + + +class FioJobFileTest_t0020(FioJobFileTest): + """Test consists of fio test jobs t0020 and t0021 + Confirm that almost all offsets were touched non-sequentially""" + + def check_result(self): + super().check_result() + + bw_log_filename = os.path.join(self.paths['test_dir'], "test_bw.log") + file_data = self.get_file_fail(bw_log_filename) + if not file_data: + return + + log_lines = file_data.split('\n') + + offsets = [] + + prev = int(log_lines[0].split(',')[4]) + for line in log_lines[1:]: + offsets.append(prev/4096) + if len(line.strip()) == 0: + continue + cur = int(line.split(',')[4]) + prev = cur + + if len(offsets) != 256: + self.passed = False + self.failure_reason += f" number of offsets is {len(offsets)} instead of 256" + + for i in range(256): + if not i in offsets: + self.passed = False + self.failure_reason += f" missing offset {i * 4096}" + + (_, p) = runstest_1samp(list(offsets)) + if p < 0.05: + self.passed = False + self.failure_reason += f" runs test failed with p = {p}" + + +class FioJobFileTest_t0022(FioJobFileTest): + """Test consists of fio test job t0022""" + + def check_result(self): + super().check_result() + + bw_log_filename = os.path.join(self.paths['test_dir'], "test_bw.log") + file_data = self.get_file_fail(bw_log_filename) + if not file_data: + return + + log_lines = file_data.split('\n') + + filesize = 1024*1024 + bs = 4096 + seq_count = 0 + offsets = set() + + prev = int(log_lines[0].split(',')[4]) + for line in log_lines[1:]: + offsets.add(prev/bs) + if len(line.strip()) == 0: + continue + cur = int(line.split(',')[4]) + if cur - prev == bs: + seq_count += 1 + prev = cur + + # 10 is an arbitrary 
threshold + if seq_count > 10: + self.passed = False + self.failure_reason = f"too many ({seq_count}) consecutive offsets" + + if len(offsets) == filesize/bs: + self.passed = False + self.failure_reason += " no duplicate offsets found with norandommap=1" + + +class FioJobFileTest_t0023(FioJobFileTest): + """Test consists of fio test job t0023 randtrimwrite test.""" + + def check_trimwrite(self, filename): + """Make sure that trims are followed by writes of the same size at the same offset.""" + + bw_log_filename = os.path.join(self.paths['test_dir'], filename) + file_data = self.get_file_fail(bw_log_filename) + if not file_data: + return + + log_lines = file_data.split('\n') + + prev_ddir = 1 + for line in log_lines: + if len(line.strip()) == 0: + continue + vals = line.split(',') + ddir = int(vals[2]) + bs = int(vals[3]) + offset = int(vals[4]) + if prev_ddir == 1: + if ddir != 2: + self.passed = False + self.failure_reason += " {0}: write not preceeded by trim: {1}".format( + bw_log_filename, line) + break + else: + if ddir != 1: # pylint: disable=no-else-break + self.passed = False + self.failure_reason += " {0}: trim not preceeded by write: {1}".format( + bw_log_filename, line) + break + else: + if prev_bs != bs: + self.passed = False + self.failure_reason += " {0}: block size does not match: {1}".format( + bw_log_filename, line) + break + + if prev_offset != offset: + self.passed = False + self.failure_reason += " {0}: offset does not match: {1}".format( + bw_log_filename, line) + break + + prev_ddir = ddir + prev_bs = bs + prev_offset = offset + + + def check_all_offsets(self, filename, sectorsize, filesize): + """Make sure all offsets were touched.""" + + file_data = self.get_file_fail(os.path.join(self.paths['test_dir'], filename)) + if not file_data: + return + + log_lines = file_data.split('\n') + + offsets = set() + + for line in log_lines: + if len(line.strip()) == 0: + continue + vals = line.split(',') + bs = int(vals[3]) + offset = int(vals[4]) + if 
offset % sectorsize != 0: + self.passed = False + self.failure_reason += " {0}: offset {1} not a multiple of sector size {2}".format( + filename, offset, sectorsize) + break + if bs % sectorsize != 0: + self.passed = False + self.failure_reason += " {0}: block size {1} not a multiple of sector size " \ + "{2}".format(filename, bs, sectorsize) + break + for i in range(int(bs/sectorsize)): + offsets.add(offset/sectorsize + i) + + if len(offsets) != filesize/sectorsize: + self.passed = False + self.failure_reason += " {0}: only {1} offsets touched; expected {2}".format( + filename, len(offsets), filesize/sectorsize) + else: + logging.debug("%s: %d sectors touched", filename, len(offsets)) + + + def check_result(self): + super().check_result() + + filesize = 1024*1024 + + self.check_trimwrite("basic_bw.log") + self.check_trimwrite("bs_bw.log") + self.check_trimwrite("bsrange_bw.log") + self.check_trimwrite("bssplit_bw.log") + self.check_trimwrite("basic_no_rm_bw.log") + self.check_trimwrite("bs_no_rm_bw.log") + self.check_trimwrite("bsrange_no_rm_bw.log") + self.check_trimwrite("bssplit_no_rm_bw.log") + + self.check_all_offsets("basic_bw.log", 4096, filesize) + self.check_all_offsets("bs_bw.log", 8192, filesize) + self.check_all_offsets("bsrange_bw.log", 512, filesize) + self.check_all_offsets("bssplit_bw.log", 512, filesize) + + +class FioJobFileTest_t0024(FioJobFileTest_t0023): + """Test consists of fio test job t0024 trimwrite test.""" + + def check_result(self): + # call FioJobFileTest_t0023's parent to skip checks done by t0023 + super(FioJobFileTest_t0023, self).check_result() + + filesize = 1024*1024 + + self.check_trimwrite("basic_bw.log") + self.check_trimwrite("bs_bw.log") + self.check_trimwrite("bsrange_bw.log") + self.check_trimwrite("bssplit_bw.log") + + self.check_all_offsets("basic_bw.log", 4096, filesize) + self.check_all_offsets("bs_bw.log", 8192, filesize) + self.check_all_offsets("bsrange_bw.log", 512, filesize) + 
self.check_all_offsets("bssplit_bw.log", 512, filesize) + + +class FioJobFileTest_t0025(FioJobFileTest): + """Test experimental verify read backs written data pattern.""" + def check_result(self): + super().check_result() + + if not self.passed: + return + + if self.json_data['jobs'][0]['read']['io_kbytes'] != 128: + self.passed = False + +class FioJobFileTest_t0027(FioJobFileTest): + def setup(self, *args, **kws): + super().setup(*args, **kws) + self.pattern_file = os.path.join(self.paths['test_dir'], "t0027.pattern") + self.output_file = os.path.join(self.paths['test_dir'], "t0027file") + self.pattern = os.urandom(16 << 10) + with open(self.pattern_file, "wb") as f: + f.write(self.pattern) + + def check_result(self): + super().check_result() + + if not self.passed: + return + + with open(self.output_file, "rb") as f: + data = f.read() + + if data != self.pattern: + self.passed = False + +class FioJobFileTest_t0029(FioJobFileTest): + """Test loops option works with read-verify workload.""" + def check_result(self): + super().check_result() + + if not self.passed: + return + + if self.json_data['jobs'][1]['read']['io_kbytes'] != 8: + self.passed = False + +class FioJobFileTest_LogFileFormat(FioJobFileTest): + """Test log file format""" + def setup(self, *args, **kws): + super().setup(*args, **kws) + self.patterns = {} + + def check_result(self): + super().check_result() + + if not self.passed: + return + + for logfile in self.patterns.keys(): + file_path = os.path.join(self.paths['test_dir'], logfile) + with open(file_path, "r") as f: + line = f.readline() + if not re.match(self.patterns[logfile], line): + self.passed = False + self.failure_reason = "wrong log file format: " + logfile + return + +class FioJobFileTest_t0033(FioJobFileTest_LogFileFormat): + """Test log file format""" + def setup(self, *args, **kws): + super().setup(*args, **kws) + self.patterns = { + 'log_bw.1.log': '\\d+, \\d+, \\d+, \\d+, 0x[\\da-f]+\\n', + 'log_clat.2.log': '\\d+, \\d+, \\d+, 
\\d+, 0, \\d+\\n', + 'log_iops.3.log': '\\d+, \\d+, \\d+, \\d+, \\d+, 0x[\\da-f]+\\n', + 'log_iops.4.log': '\\d+, \\d+, \\d+, \\d+, 0, 0, \\d+\\n', + } + +class FioJobFileTest_t0034(FioJobFileTest_LogFileFormat): + """Test log file format""" + def setup(self, *args, **kws): + super().setup(*args, **kws) + self.patterns = { + 'log_clat.1.log': '\\d+, \\d+, \\d+, \\d+, \\d+, \\d+, \\d+\\n', + 'log_slat.1.log': '\\d+, \\d+, \\d+, \\d+, \\d+, \\d+, \\d+\\n', + 'log_lat.1.log': '\\d+, \\d+, \\d+, \\d+, \\d+, \\d+, 0\\n', + 'log_clat.2.log': '\\d+, \\d+, \\d+, \\d+, 0, 0, \\d+, 0\\n', + 'log_bw.3.log': '\\d+, \\d+, \\d+, \\d+, \\d+, \\d+, 0\\n', + 'log_iops.3.log': '\\d+, \\d+, \\d+, \\d+, \\d+, \\d+, 0\\n', + } + +class FioJobFileTest_iops_rate(FioJobFileTest): + """Test consists of fio test job t0011 Confirm that job0 iops == 1000 and that job1_iops / job0_iops ~ 8 With two runs of fio-3.16 I observed a ratio of 8.3""" def check_result(self): - super(FioJobTest_iops_rate, self).check_result() + super().check_result() if not self.passed: return iops1 = self.json_data['jobs'][0]['read']['iops'] + logging.debug("Test %d: iops1: %f", self.testnum, iops1) iops2 = self.json_data['jobs'][1]['read']['iops'] + logging.debug("Test %d: iops2: %f", self.testnum, iops2) ratio = iops2 / iops1 - logging.debug("Test %d: iops1: %f", self.testnum, iops1) logging.debug("Test %d: ratio: %f", self.testnum, ratio) if iops1 < 950 or iops1 > 1050: - self.failure_reason = "{0} iops value mismatch,".format(self.failure_reason) + self.failure_reason = f"{self.failure_reason} iops value mismatch," self.passed = False if ratio < 6 or ratio > 10: - self.failure_reason = "{0} iops ratio mismatch,".format(self.failure_reason) + self.failure_reason = f"{self.failure_reason} iops ratio mismatch," self.passed = False -class Requirements(object): - """Requirements consists of multiple run environment characteristics. 
- These are to determine if a particular test can be run""" - - _linux = False - _libaio = False - _zbd = False - _root = False - _zoned_nullb = False - _not_macos = False - _not_windows = False - _unittests = False - _cpucount4 = False - - def __init__(self, fio_root): - Requirements._not_macos = platform.system() != "Darwin" - Requirements._not_windows = platform.system() != "Windows" - Requirements._linux = platform.system() == "Linux" - - if Requirements._linux: - config_file = os.path.join(fio_root, "config-host.h") - contents, success = FioJobTest.get_file(config_file) - if not success: - print("Unable to open {0} to check requirements".format(config_file)) - Requirements._zbd = True - else: - Requirements._zbd = "CONFIG_HAS_BLKZONED" in contents - Requirements._libaio = "CONFIG_LIBAIO" in contents - - Requirements._root = (os.geteuid() == 0) - if Requirements._zbd and Requirements._root: - try: - subprocess.run(["modprobe", "null_blk"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - if os.path.exists("/sys/module/null_blk/parameters/zoned"): - Requirements._zoned_nullb = True - except Exception: - pass - - if platform.system() == "Windows": - utest_exe = "unittest.exe" - else: - utest_exe = "unittest" - unittest_path = os.path.join(fio_root, "unittests", utest_exe) - Requirements._unittests = os.path.exists(unittest_path) - - Requirements._cpucount4 = multiprocessing.cpu_count() >= 4 - - req_list = [Requirements.linux, - Requirements.libaio, - Requirements.zbd, - Requirements.root, - Requirements.zoned_nullb, - Requirements.not_macos, - Requirements.not_windows, - Requirements.unittests, - Requirements.cpucount4] - for req in req_list: - value, desc = req() - logging.debug("Requirements: Requirement '%s' met? 
%s", desc, value) - - @classmethod - def linux(cls): - """Are we running on Linux?""" - return Requirements._linux, "Linux required" - - @classmethod - def libaio(cls): - """Is libaio available?""" - return Requirements._libaio, "libaio required" - - @classmethod - def zbd(cls): - """Is ZBD support available?""" - return Requirements._zbd, "Zoned block device support required" - - @classmethod - def root(cls): - """Are we running as root?""" - return Requirements._root, "root required" - - @classmethod - def zoned_nullb(cls): - """Are zoned null block devices available?""" - return Requirements._zoned_nullb, "Zoned null block device support required" - - @classmethod - def not_macos(cls): - """Are we running on a platform other than macOS?""" - return Requirements._not_macos, "platform other than macOS required" - - @classmethod - def not_windows(cls): - """Are we running on a platform other than Windws?""" - return Requirements._not_windows, "platform other than Windows required" - - @classmethod - def unittests(cls): - """Were unittests built?""" - return Requirements._unittests, "Unittests support required" - - @classmethod - def cpucount4(cls): - """Do we have at least 4 CPUs?""" - return Requirements._cpucount4, "4+ CPUs required" - - -SUCCESS_DEFAULT = { - 'zero_return': True, - 'stderr_empty': True, - 'timeout': 600, - } -SUCCESS_NONZERO = { - 'zero_return': False, - 'stderr_empty': False, - 'timeout': 600, - } -SUCCESS_STDERR = { - 'zero_return': True, - 'stderr_empty': False, - 'timeout': 600, - } TEST_LIST = [ { 'test_id': 1, - 'test_class': FioJobTest, + 'test_class': FioJobFileTest, 'job': 't0001-52c58027.fio', 'success': SUCCESS_DEFAULT, 'pre_job': None, @@ -695,7 +639,7 @@ def cpucount4(cls): }, { 'test_id': 2, - 'test_class': FioJobTest, + 'test_class': FioJobFileTest, 'job': 't0002-13af05ae-post.fio', 'success': SUCCESS_DEFAULT, 'pre_job': 't0002-13af05ae-pre.fio', @@ -704,7 +648,7 @@ def cpucount4(cls): }, { 'test_id': 3, - 'test_class': 
FioJobTest, + 'test_class': FioJobFileTest, 'job': 't0003-0ae2c6e1-post.fio', 'success': SUCCESS_NONZERO, 'pre_job': 't0003-0ae2c6e1-pre.fio', @@ -713,7 +657,7 @@ def cpucount4(cls): }, { 'test_id': 4, - 'test_class': FioJobTest, + 'test_class': FioJobFileTest, 'job': 't0004-8a99fdf6.fio', 'success': SUCCESS_DEFAULT, 'pre_job': None, @@ -722,7 +666,7 @@ def cpucount4(cls): }, { 'test_id': 5, - 'test_class': FioJobTest_t0005, + 'test_class': FioJobFileTest_t0005, 'job': 't0005-f7078f7b.fio', 'success': SUCCESS_DEFAULT, 'pre_job': None, @@ -732,7 +676,7 @@ def cpucount4(cls): }, { 'test_id': 6, - 'test_class': FioJobTest_t0006, + 'test_class': FioJobFileTest_t0006, 'job': 't0006-82af2a7c.fio', 'success': SUCCESS_DEFAULT, 'pre_job': None, @@ -742,7 +686,7 @@ def cpucount4(cls): }, { 'test_id': 7, - 'test_class': FioJobTest_t0007, + 'test_class': FioJobFileTest_t0007, 'job': 't0007-37cf9e3c.fio', 'success': SUCCESS_DEFAULT, 'pre_job': None, @@ -752,7 +696,7 @@ def cpucount4(cls): }, { 'test_id': 8, - 'test_class': FioJobTest_t0008, + 'test_class': FioJobFileTest_t0008, 'job': 't0008-ae2fafc8.fio', 'success': SUCCESS_DEFAULT, 'pre_job': None, @@ -762,7 +706,7 @@ def cpucount4(cls): }, { 'test_id': 9, - 'test_class': FioJobTest_t0009, + 'test_class': FioJobFileTest_t0009, 'job': 't0009-f8b0bd10.fio', 'success': SUCCESS_DEFAULT, 'pre_job': None, @@ -774,7 +718,7 @@ def cpucount4(cls): }, { 'test_id': 10, - 'test_class': FioJobTest, + 'test_class': FioJobFileTest, 'job': 't0010-b7aae4ba.fio', 'success': SUCCESS_DEFAULT, 'pre_job': None, @@ -783,7 +727,7 @@ def cpucount4(cls): }, { 'test_id': 11, - 'test_class': FioJobTest_iops_rate, + 'test_class': FioJobFileTest_iops_rate, 'job': 't0011-5d2788d5.fio', 'success': SUCCESS_DEFAULT, 'pre_job': None, @@ -793,7 +737,7 @@ def cpucount4(cls): }, { 'test_id': 12, - 'test_class': FioJobTest_t0012, + 'test_class': FioJobFileTest_t0012, 'job': 't0012.fio', 'success': SUCCESS_DEFAULT, 'pre_job': None, @@ -803,7 +747,7 @@ def 
cpucount4(cls): }, { 'test_id': 13, - 'test_class': FioJobTest, + 'test_class': FioJobFileTest, 'job': 't0013.fio', 'success': SUCCESS_DEFAULT, 'pre_job': None, @@ -813,7 +757,7 @@ def cpucount4(cls): }, { 'test_id': 14, - 'test_class': FioJobTest_t0014, + 'test_class': FioJobFileTest_t0014, 'job': 't0014.fio', 'success': SUCCESS_DEFAULT, 'pre_job': None, @@ -821,6 +765,210 @@ def cpucount4(cls): 'output_format': 'json', 'requirements': [], }, + { + 'test_id': 15, + 'test_class': FioJobFileTest_t0015, + 'job': 't0015-4e7e7898.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'output_format': 'json', + 'requirements': [Requirements.linux, Requirements.libaio], + }, + { + 'test_id': 16, + 'test_class': FioJobFileTest_t0015, + 'job': 't0016-d54ae22.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'output_format': 'json', + 'requirements': [], + }, + { + 'test_id': 17, + 'test_class': FioJobFileTest_t0015, + 'job': 't0017.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'output_format': 'json', + 'requirements': [Requirements.not_windows], + }, + { + 'test_id': 18, + 'test_class': FioJobFileTest, + 'job': 't0018.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'requirements': [Requirements.linux, Requirements.io_uring], + }, + { + 'test_id': 19, + 'test_class': FioJobFileTest_t0019, + 'job': 't0019.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'requirements': [], + }, + { + 'test_id': 20, + 'test_class': FioJobFileTest_t0020, + 'job': 't0020.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'requirements': [], + }, + { + 'test_id': 21, + 'test_class': FioJobFileTest_t0020, + 'job': 't0021.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'requirements': [], + }, + { + 'test_id': 22, + 'test_class': FioJobFileTest_t0022, + 'job': 't0022.fio', + 'success': 
SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'requirements': [], + }, + { + 'test_id': 23, + 'test_class': FioJobFileTest_t0023, + 'job': 't0023.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'requirements': [], + }, + { + 'test_id': 24, + 'test_class': FioJobFileTest_t0024, + 'job': 't0024.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'requirements': [], + }, + { + 'test_id': 25, + 'test_class': FioJobFileTest_t0025, + 'job': 't0025.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'output_format': 'json', + 'requirements': [], + }, + { + 'test_id': 26, + 'test_class': FioJobFileTest, + 'job': 't0026.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'requirements': [Requirements.not_windows], + }, + { + 'test_id': 27, + 'test_class': FioJobFileTest_t0027, + 'job': 't0027.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'requirements': [], + }, + { + 'test_id': 28, + 'test_class': FioJobFileTest, + 'job': 't0028-c6cade16.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'requirements': [], + }, + { + 'test_id': 29, + 'test_class': FioJobFileTest_t0029, + 'job': 't0029.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'output_format': 'json', + 'requirements': [], + }, + { + 'test_id': 30, + 'test_class': FioJobFileTest, + 'job': 't0030.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'parameters': ['--bandwidth-log'], + 'requirements': [], + }, + { + 'test_id': 31, + 'test_class': FioJobFileTest, + 'job': 't0031.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': 't0031-pre.fio', + 'pre_success': SUCCESS_DEFAULT, + 'requirements': [Requirements.linux, Requirements.libaio], + }, + { + 'test_id': 33, + 'test_class': FioJobFileTest_t0033, + 'job': 't0033.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 
'pre_success': None, + 'requirements': [Requirements.linux, Requirements.libaio], + }, + { + 'test_id': 34, + 'test_class': FioJobFileTest_t0034, + 'job': 't0034.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'requirements': [Requirements.linux, Requirements.libaio], + }, + { + 'test_id': 35, + 'test_class': FioJobFileTest, + 'job': 't0035.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': None, + 'pre_success': None, + 'requirements': [], + }, + { + 'test_id': 36, + 'test_class': FioJobFileTest, + 'job': 't0036-post.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': 't0036-pre.fio', + 'pre_success': SUCCESS_DEFAULT, + 'requirements': [], + }, + { + 'test_id': 37, + 'test_class': FioJobFileTest, + 'job': 't0037-post.fio', + 'success': SUCCESS_DEFAULT, + 'pre_job': 't0037-pre.fio', + 'pre_success': SUCCESS_DEFAULT, + 'requirements': [Requirements.linux, Requirements.libaio], + }, { 'test_id': 1000, 'test_class': FioExeTest, @@ -873,7 +1021,7 @@ def cpucount4(cls): 'test_id': 1006, 'test_class': FioExeTest, 'exe': 't/strided.py', - 'parameters': ['{fio_path}'], + 'parameters': ['--fio', '{fio_path}'], 'success': SUCCESS_DEFAULT, 'requirements': [], }, @@ -919,6 +1067,78 @@ def cpucount4(cls): 'success': SUCCESS_DEFAULT, 'requirements': [], }, + { + 'test_id': 1012, + 'test_class': FioExeTest, + 'exe': 't/log_compression.py', + 'parameters': ['-f', '{fio_path}'], + 'success': SUCCESS_DEFAULT, + 'requirements': [], + }, + { + 'test_id': 1013, + 'test_class': FioExeTest, + 'exe': 't/random_seed.py', + 'parameters': ['-f', '{fio_path}'], + 'success': SUCCESS_DEFAULT, + 'requirements': [], + }, + { + 'test_id': 1014, + 'test_class': FioExeTest, + 'exe': 't/nvmept.py', + 'parameters': ['-f', '{fio_path}', '--dut', '{nvmecdev}'], + 'success': SUCCESS_DEFAULT, + 'requirements': [Requirements.linux, Requirements.nvmecdev], + }, + { + 'test_id': 1015, + 'test_class': FioExeTest, + 'exe': 't/nvmept_trim.py', + 'parameters': ['-f', '{fio_path}', 
'--dut', '{nvmecdev}'], + 'success': SUCCESS_DEFAULT, + 'requirements': [Requirements.linux, Requirements.nvmecdev], + }, + { + 'test_id': 1016, + 'test_class': FioExeTest, + 'exe': 't/client_server.py', + 'parameters': ['-f', '{fio_path}'], + 'success': SUCCESS_DEFAULT, + 'requirements': [Requirements.linux], + }, + { + 'test_id': 1017, + 'test_class': FioExeTest, + 'exe': 't/verify.py', + 'parameters': ['-f', '{fio_path}'], + 'success': SUCCESS_LONG, + 'requirements': [], + }, + { + 'test_id': 1018, + 'test_class': FioExeTest, + 'exe': 't/verify-trim.py', + 'parameters': ['-f', '{fio_path}'], + 'success': SUCCESS_DEFAULT, + 'requirements': [Requirements.linux], + }, + { + 'test_id': 1019, + 'test_class': FioExeTest, + 'exe': 't/sprandom.py', + 'parameters': ['-f', '{fio_path}'], + 'success': SUCCESS_DEFAULT, + 'requirements': [Requirements.linux, Requirements.libaio], + }, + { + 'test_id': 1020, + 'test_class': FioExeTest, + 'exe': 't/t64-switch.sh', + 'parameters': ['{fio_path}'], + 'success': SUCCESS_DEFAULT, + 'requirements': [Requirements.not_windows], + }, ] @@ -942,6 +1162,10 @@ def parse_args(): help='skip requirements checking') parser.add_argument('-p', '--pass-through', action='append', help='pass-through an argument to an executable test') + parser.add_argument('--nvmecdev', action='store', default=None, + help='NVMe character device for **DESTRUCTIVE** testing (e.g., /dev/ng0n1)') + parser.add_argument('-c', '--cleanup', action='store_true', default=False, + help='Delete artifacts for passing tests') args = parser.parse_args() return args @@ -960,7 +1184,7 @@ def main(): if args.pass_through: for arg in args.pass_through: if not ':' in arg: - print("Invalid --pass-through argument '%s'" % arg) + print(f"Invalid --pass-through argument '{arg}'") print("Syntax for --pass-through is TESTNUMBER:ARGUMENT") return split = arg.split(":", 1) @@ -971,7 +1195,7 @@ def main(): fio_root = args.fio_root else: fio_root = str(Path(__file__).absolute().parent.parent) 
- print("fio root is %s" % fio_root) + print(f"fio root is {fio_root}") if args.fio: fio_path = args.fio @@ -981,107 +1205,25 @@ def main(): else: fio_exe = "fio" fio_path = os.path.join(fio_root, fio_exe) - print("fio path is %s" % fio_path) + print(f"fio path is {fio_path}") if not shutil.which(fio_path): print("Warning: fio executable not found") artifact_root = args.artifact_root if args.artifact_root else \ - "fio-test-{0}".format(time.strftime("%Y%m%d-%H%M%S")) + f"fio-test-{time.strftime('%Y%m%d-%H%M%S')}" os.mkdir(artifact_root) - print("Artifact directory is %s" % artifact_root) + print(f"Artifact directory is {artifact_root}") if not args.skip_req: - req = Requirements(fio_root) - - passed = 0 - failed = 0 - skipped = 0 - - for config in TEST_LIST: - if (args.skip and config['test_id'] in args.skip) or \ - (args.run_only and config['test_id'] not in args.run_only): - skipped = skipped + 1 - print("Test {0} SKIPPED (User request)".format(config['test_id'])) - continue - - if issubclass(config['test_class'], FioJobTest): - if config['pre_job']: - fio_pre_job = os.path.join(fio_root, 't', 'jobs', - config['pre_job']) - else: - fio_pre_job = None - if config['pre_success']: - fio_pre_success = config['pre_success'] - else: - fio_pre_success = None - if 'output_format' in config: - output_format = config['output_format'] - else: - output_format = 'normal' - test = config['test_class']( - fio_path, - os.path.join(fio_root, 't', 'jobs', config['job']), - config['success'], - fio_pre_job=fio_pre_job, - fio_pre_success=fio_pre_success, - output_format=output_format) - desc = config['job'] - elif issubclass(config['test_class'], FioExeTest): - exe_path = os.path.join(fio_root, config['exe']) - if config['parameters']: - parameters = [p.format(fio_path=fio_path) for p in config['parameters']] - else: - parameters = [] - if Path(exe_path).suffix == '.py' and platform.system() == "Windows": - parameters.insert(0, exe_path) - exe_path = "python.exe" - if 
config['test_id'] in pass_through: - parameters += pass_through[config['test_id']].split() - test = config['test_class'](exe_path, parameters, - config['success']) - desc = config['exe'] - else: - print("Test {0} FAILED: unable to process test config".format(config['test_id'])) - failed = failed + 1 - continue - - if not args.skip_req: - reqs_met = True - for req in config['requirements']: - reqs_met, reason = req() - logging.debug("Test %d: Requirement '%s' met? %s", config['test_id'], reason, - reqs_met) - if not reqs_met: - break - if not reqs_met: - print("Test {0} SKIPPED ({1}) {2}".format(config['test_id'], reason, desc)) - skipped = skipped + 1 - continue - - try: - test.setup(artifact_root, config['test_id']) - test.run() - test.check_result() - except KeyboardInterrupt: - break - except Exception as e: - test.passed = False - test.failure_reason += str(e) - logging.debug("Test %d exception:\n%s\n", config['test_id'], traceback.format_exc()) - if test.passed: - result = "PASSED" - passed = passed + 1 - else: - result = "FAILED: {0}".format(test.failure_reason) - failed = failed + 1 - contents, _ = FioJobTest.get_file(test.stderr_file) - logging.debug("Test %d: stderr:\n%s", config['test_id'], contents) - contents, _ = FioJobTest.get_file(test.stdout_file) - logging.debug("Test %d: stdout:\n%s", config['test_id'], contents) - print("Test {0} {1} {2}".format(config['test_id'], result, desc)) - - print("{0} test(s) passed, {1} failed, {2} skipped".format(passed, failed, skipped)) - + Requirements(fio_root, args) + + test_env = { + 'fio_path': fio_path, + 'fio_root': fio_root, + 'artifact_root': artifact_root, + 'pass_through': pass_through, + } + _, failed, _ = run_fio_tests(TEST_LIST, test_env, args) sys.exit(failed) diff --git a/t/sgunmap-test.py b/t/sgunmap-test.py index 4960a040ea..6687494f30 100755 --- a/t/sgunmap-test.py +++ b/t/sgunmap-test.py @@ -3,7 +3,7 @@ # # sgunmap-test.py # -# Limited functonality test for trim workloads using fio's sg ioengine 
+# Limited functionality test for trim workloads using fio's sg ioengine # This checks only the three sets of reported iodepths # # !!!WARNING!!! diff --git a/t/sprandom.py b/t/sprandom.py new file mode 100755 index 0000000000..a4b2fcbc42 --- /dev/null +++ b/t/sprandom.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only +# +# Copyright (c) 2025 Sandisk Corporation or its affiliates + +""" +sprandom.py +----------- +Tests for fio's sprandom feature. + +USAGE: + python t/sprandom.py [-f fio-executable] + +This script is also invoked by t/run-fio-tests.py. +""" + +import sys +import argparse +import time +from pathlib import Path + +from fiotestlib import FioJobCmdTest, run_fio_tests +from fiotestcommon import SUCCESS_DEFAULT, SUCCESS_NONZERO + +SPRANDOM_OPT_LIST = [ + 'spr_op', + 'spr_num_regions', + 'spr_cs', + 'size', + 'norandommap', + 'random_generator', + 'rw', +] + +class FioSPrandomTest(FioJobCmdTest): + """fio sprandom test wrapper.""" + + def setup(self, parameters): + """Setup fio arguments for the test.""" + bs = parameters.get("bs", "4k") + fio_args = [ + "--name=sprandom", + "--ioengine=libaio", + "--filename=sprandom_testfile", + f"--bs={bs}", + f"--blockalign={bs}", + "--direct=1", + "--iodepth=16", + "--sprandom=1", + ] + + # Add variable parameters if provided + + for opt in SPRANDOM_OPT_LIST: + if opt in self.fio_opts: + option = f"--{opt}={self.fio_opts[opt]}" + fio_args.append(option) + if "rw" not in self.fio_opts: + fio_args.append("--rw=randwrite") + + super().setup(fio_args) + + +TEST_LIST = [ + { + "test_id": 1, + "fio_opts": { + "spr_op": "0.10", + "spr_num_regions": "50", + "spr_cs": "0", + "size": "32M", + }, + "success": SUCCESS_DEFAULT, + "test_class": FioSPrandomTest, + }, + { + "test_id": 2, + "fio_opts": { + "spr_op": "0.25", + "spr_num_regions": "100", + "spr_cs": "0", + "size": "64M", + }, + "success": SUCCESS_DEFAULT, + "test_class": FioSPrandomTest, + }, + { + "test_id": 3, + "fio_opts": { + 
"spr_op": "0.50", + "spr_num_regions": "200", + "spr_cs": "0", + "size": "128M", + "random_generator": "tausworthe", + }, + "success": SUCCESS_NONZERO, + "test_class": FioSPrandomTest, + }, + { + "test_id": 4, + "fio_opts": { + "spr_op": "0.75", + "spr_num_regions": "400", + "spr_cs": "0", + "size": "256M", + "norandommap": "0" + }, + "bs": "16K", + "success": SUCCESS_NONZERO, + "test_class": FioSPrandomTest, + }, + { + "test_id": 5, + "fio_opts": { + "spr_op": "0.75", + "spr_num_regions": "400", + "spr_cs": "0", + "size": "256M", + "rw": "randread", + }, + "bs": "16K", + "success": SUCCESS_NONZERO, + "test_class": FioSPrandomTest, + }, + { + "test_id": 6, + "fio_opts": { + "spr_op": "0.10", + "spr_num_regions": "100", + "spr_cs": "32K", + "size": "32M", + }, + "success": SUCCESS_DEFAULT, + "test_class": FioSPrandomTest, + }, + { + "test_id": 7, + "fio_opts": { + "spr_op": "0.10", + "spr_num_regions": "2000", + "spr_cs": "32K", + "size": "32M", + }, + "success": SUCCESS_NONZERO, + "test_class": FioSPrandomTest, + }, + { + "test_id": 8, + "fio_opts": { + "spr_op": "0.10", + "spr_num_regions": "50", + "spr_cs": "32M", + "size": "32M", + }, + "success": SUCCESS_NONZERO, + "test_class": FioSPrandomTest, + }, +] + + +def parse_args(): + """Parse command-line arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument("-f", "--fio", + help="path to fio executable (default: fio in PATH)") + parser.add_argument('-s', '--skip', nargs='+', type=int, + help='list of test(s) to skip') + parser.add_argument('-a', '--artifact-root', help='artifact root directory') + parser.add_argument('-o', '--run-only', nargs='+', type=int, + help='list of test(s) to run, skipping all others') + + return parser.parse_args() + + +def main(): + """Run sprandom tests.""" + args = parse_args() + + fio_path = str(Path(args.fio).absolute()) if args.fio else "fio" + artifact_root = args.artifact_root if args.artifact_root else \ + f"sprandom-test-{time.strftime('%Y%m%d-%H%M%S')}" + 
Path(artifact_root).mkdir(parents=True, exist_ok=True) + print(f"Artifact directory is {str(Path(artifact_root).absolute())}") + + test_env = { + "fio_path": fio_path, + "fio_root": str(Path(__file__).absolute().parent.parent), + "artifact_root": artifact_root, + "basename": "sprandom" + } + + _, failed, _ = run_fio_tests(TEST_LIST, test_env, args) + sys.exit(failed) + + +if __name__ == "__main__": + main() diff --git a/t/steadystate_tests.py b/t/steadystate_tests.py index e8bd768c51..d0fa73b28d 100755 --- a/t/steadystate_tests.py +++ b/t/steadystate_tests.py @@ -2,7 +2,7 @@ # # steadystate_tests.py # -# Test option parsing and functonality for fio's steady state detection feature. +# Test option parsing and functionality for fio's steady state detection feature. # # steadystate_tests.py --read file-for-read-testing --write file-for-write-testing ./fio # @@ -115,6 +115,7 @@ def check(data, iops, slope, pct, limit, dur, criterion): {'s': False, 'timeout': 20, 'numjobs': 2}, {'s': True, 'timeout': 100, 'numjobs': 3, 'ss_dur': 10, 'ss_ramp': 5, 'iops': False, 'slope': True, 'ss_limit': 0.1, 'pct': True}, {'s': True, 'timeout': 10, 'numjobs': 3, 'ss_dur': 10, 'ss_ramp': 500, 'iops': False, 'slope': True, 'ss_limit': 0.1, 'pct': True}, + {'s': True, 'timeout': 10, 'numjobs': 3, 'ss_dur': 10, 'ss_ramp': 500, 'iops': False, 'slope': True, 'ss_limit': 0.1, 'pct': True, 'ss_interval': 5}, ] jobnum = 0 diff --git a/t/stest.c b/t/stest.c index c6bf2d1efa..16ce692323 100644 --- a/t/stest.c +++ b/t/stest.c @@ -25,10 +25,11 @@ static FLIST_HEAD(list); static int do_rand_allocs(void) { - unsigned int size, nr, rounds = 0, ret = 0; + unsigned int i, size, nr, rounds = 0, ret = 0; unsigned long total; struct elem *e; bool error; + char *c; while (rounds++ < LOOPS) { #ifdef STEST_SEED @@ -38,12 +39,26 @@ static int do_rand_allocs(void) nr = total = 0; while (total < MAXSMALLOC) { size = 8 * sizeof(struct elem) + (int) (999.0 * (rand() / (RAND_MAX + 1.0))); - e = smalloc(size); + e = 
scalloc(1, size); if (!e) { printf("fail at %lu, size %u\n", total, size); ret++; break; } + + c = (char *)e; + for (i = 0; i < size; i++) { + if (*(c+i) != 0) { + printf("buffer not cleared at %lu, size %u\n", total, size); + ret++; + break; + } + } + + /* stop the while loop if buffer was not cleared */ + if (i < size) + break; + e->magic1 = MAGIC1; e->magic2 = MAGIC2; e->size = size; @@ -63,15 +78,25 @@ static int do_rand_allocs(void) sfree(e); if (!error) { - e = smalloc(LARGESMALLOC); + e = scalloc(1, LARGESMALLOC); if (!e) { - error = true; ret++; printf("failure allocating %u bytes at %lu allocated during sfree phase\n", LARGESMALLOC, total); + break; } - else - sfree(e); + + c = (char *)e; + for (i = 0; i < LARGESMALLOC; i++) { + if (*(c+i) != 0) { + error = true; + ret++; + printf("large buffer not cleared at %lu, size %u\n", total, size); + break; + } + } + + sfree(e); } } } diff --git a/t/strided.py b/t/strided.py index 45e6f148e1..75c429e454 100755 --- a/t/strided.py +++ b/t/strided.py @@ -1,11 +1,12 @@ #!/usr/bin/env python3 -# + +""" # strided.py # # Test zonemode=strided. This uses the null ioengine when no file is # specified. If a file is specified, use it for randdom read testing. # Some of the zoneranges in the tests are 16MiB. So when using a file -# a minimum size of 32MiB is recommended. +# a minimum size of 64MiB is recommended. 
# # USAGE # python strided.py fio-executable [-f file/device] @@ -13,12 +14,9 @@ # EXAMPLES # python t/strided.py ./fio # python t/strided.py ./fio -f /dev/sda -# dd if=/dev/zero of=temp bs=1M count=32 +# dd if=/dev/zero of=temp bs=1M count=64 # python t/strided.py ./fio -f temp # -# REQUIREMENTS -# Python 2.6+ -# # ===TEST MATRIX=== # # --zonemode=strided, zoneskip unset @@ -28,322 +26,419 @@ # zonesize= test['filesize']: - zonestart = 0 if 'offset' not in test else test['offset'] - - iosperzone = iosperzone + 1 - tokens = line.split(',') - offset = int(tokens[4]) - if offset < zonestart or offset >= zonestart + test['zonerange']: - print("Offset {0} outside of zone starting at {1}".format( - offset, zonestart)) - return False - - # skip next section if norandommap is enabled with no - # random_generator or with a random_generator != lfsr - if 'norandommap' in test: - if 'random_generator' in test: - if test['random_generator'] != 'lfsr': - continue - else: + if 'filename' in self.fio_opts: + for opt in ['filename', 'filesize']: + option = f"--{opt}={self.fio_opts[opt]}" + fio_args.append(option) + else: + fio_args.append('--ioengine=null') + for opt in ['size', 'io_size', 'filesize']: + option = f"--{opt}={self.fio_opts[opt]}" + fio_args.append(option) + + super().setup(fio_args) + + def check_result(self): + super().check_result() + if not self.passed: + return + + zonestart = 0 if 'offset' not in self.fio_opts else self.fio_opts['offset'] + iospersize = self.fio_opts['zonesize'] / self.fio_opts['bs'] + iosperrange = self.fio_opts['zonerange'] / self.fio_opts['bs'] + iosperzone = 0 + lines = self.iops_log_lines.split('\n') + zoneset = set() + + for line in lines: + if len(line) == 0: continue - # we either have a random map enabled or we - # are using an LFSR - # so all blocks should be unique and we should have - # covered the entire zone when iosperzone % iosperrange == 0 - block = (offset - zonestart) / test['bs'] - if block in zoneset: - print("Offset {0} in 
zone already touched".format(offset)) - return False + if iosperzone == iospersize: + # time to move to a new zone + iosperzone = 0 + zoneset = set() + zonestart += self.fio_opts['zonerange'] + if zonestart >= self.fio_opts['filesize']: + zonestart = 0 if 'offset' not in self.fio_opts else self.fio_opts['offset'] - zoneset.add(block) - if iosperzone % iosperrange == 0: - if len(zoneset) != iosperrange: - print("Expected {0} blocks in zone but only saw {1}".format( - iosperrange, len(zoneset))) - return False - zoneset = set() + iosperzone = iosperzone + 1 + tokens = line.split(',') + offset = int(tokens[4]) + if offset < zonestart or offset >= zonestart + self.fio_opts['zonerange']: + print(f"Offset {offset} outside of zone starting at {zonestart}") + return - return True + # skip next section if norandommap is enabled with no + # random_generator or with a random_generator != lfsr + if 'norandommap' in self.fio_opts: + if 'random_generator' in self.fio_opts: + if self.fio_opts['random_generator'] != 'lfsr': + continue + else: + continue + # we either have a random map enabled or we + # are using an LFSR + # so all blocks should be unique and we should have + # covered the entire zone when iosperzone % iosperrange == 0 + block = (offset - zonestart) / self.fio_opts['bs'] + if block in zoneset: + print(f"Offset {offset} in zone already touched") + return + + zoneset.add(block) + if iosperzone % iosperrange == 0: + if len(zoneset) != iosperrange: + print(f"Expected {iosperrange} blocks in zone but only saw {len(zoneset)}") + return + zoneset = set() + + +TEST_LIST = [ # randommap enabled + { + "test_id": 1, + "fio_opts": { + "zonerange": 4096, + "zonesize": 4096, + "bs": 4096, + "offset": 8*4096, + "size": 16*4096, + "io_size": 16*4096, + }, + "test_class": StridedTest, + }, + { + "test_id": 2, + "fio_opts": { + "zonerange": 4096, + "zonesize": 4096, + "bs": 4096, + "size": 16*4096, + "io_size": 16*4096, + }, + "test_class": StridedTest, + }, + { + "test_id": 3, + 
"fio_opts": { + "zonerange": 16*1024*1024, + "zonesize": 16*1024*1024, + "bs": 4096, + "size": 256*1024*1024, + "io_size": 256*1024*204, + }, + "test_class": StridedTest, + }, + { + "test_id": 4, + "fio_opts": { + "zonerange": 4096, + "zonesize": 4*4096, + "bs": 4096, + "size": 16*4096, + "io_size": 16*4096, + }, + "test_class": StridedTest, + }, + { + "test_id": 5, + "fio_opts": { + "zonerange": 16*1024*1024, + "zonesize": 32*1024*1024, + "bs": 4096, + "size": 256*1024*1024, + "io_size": 256*1024*204, + }, + "test_class": StridedTest, + }, + { + "test_id": 6, + "fio_opts": { + "zonerange": 8192, + "zonesize": 4096, + "bs": 4096, + "size": 16*4096, + "io_size": 16*4096, + }, + "test_class": StridedTest, + }, + { + "test_id": 7, + "fio_opts": { + "zonerange": 16*1024*1024, + "zonesize": 8*1024*1024, + "bs": 4096, + "size": 256*1024*1024, + "io_size": 256*1024*204, + }, + "test_class": StridedTest, + }, + # lfsr + { + "test_id": 8, + "fio_opts": { + "random_generator": "lfsr", + "zonerange": 4096*1024, + "zonesize": 4096*1024, + "bs": 4096, + "offset": 8*4096*1024, + "size": 16*4096*1024, + "io_size": 16*4096*1024, + }, + "test_class": StridedTest, + }, + { + "test_id": 9, + "fio_opts": { + "random_generator": "lfsr", + "zonerange": 4096*1024, + "zonesize": 4096*1024, + "bs": 4096, + "size": 16*4096*1024, + "io_size": 16*4096*1024, + }, + "test_class": StridedTest, + }, + { + "test_id": 10, + "fio_opts": { + "random_generator": "lfsr", + "zonerange": 16*1024*1024, + "zonesize": 16*1024*1024, + "bs": 4096, + "size": 256*1024*1024, + "io_size": 256*1024*204, + }, + "test_class": StridedTest, + }, + { + "test_id": 11, + "fio_opts": { + "random_generator": "lfsr", + "zonerange": 4096*1024, + "zonesize": 4*4096*1024, + "bs": 4096, + "size": 16*4096*1024, + "io_size": 16*4096*1024, + }, + "test_class": StridedTest, + }, + { + "test_id": 12, + "fio_opts": { + "random_generator": "lfsr", + "zonerange": 16*1024*1024, + "zonesize": 32*1024*1024, + "bs": 4096, + "size": 
256*1024*1024, + "io_size": 256*1024*204, + }, + "test_class": StridedTest, + }, + { + "test_id": 13, + "fio_opts": { + "random_generator": "lfsr", + "zonerange": 8192*1024, + "zonesize": 4096*1024, + "bs": 4096, + "size": 16*4096*1024, + "io_size": 16*4096*1024, + }, + "test_class": StridedTest, + }, + { + "test_id": 14, + "fio_opts": { + "random_generator": "lfsr", + "zonerange": 16*1024*1024, + "zonesize": 8*1024*1024, + "bs": 4096, + "size": 256*1024*1024, + "io_size": 256*1024*204, + }, + "test_class": StridedTest, + }, + # norandommap + { + "test_id": 15, + "fio_opts": { + "norandommap": 1, + "zonerange": 4096, + "zonesize": 4096, + "bs": 4096, + "offset": 8*4096, + "size": 16*4096, + "io_size": 16*4096, + }, + "test_class": StridedTest, + }, + { + "test_id": 16, + "fio_opts": { + "norandommap": 1, + "zonerange": 4096, + "zonesize": 4096, + "bs": 4096, + "size": 16*4096, + "io_size": 16*4096, + }, + "test_class": StridedTest, + }, + { + "test_id": 17, + "fio_opts": { + "norandommap": 1, + "zonerange": 16*1024*1024, + "zonesize": 16*1024*1024, + "bs": 4096, + "size": 256*1024*1024, + "io_size": 256*1024*204, + }, + "test_class": StridedTest, + }, + { + "test_id": 18, + "fio_opts": { + "norandommap": 1, + "zonerange": 4096, + "zonesize": 8192, + "bs": 4096, + "size": 16*4096, + "io_size": 16*4096, + }, + "test_class": StridedTest, + }, + { + "test_id": 19, + "fio_opts": { + "norandommap": 1, + "zonerange": 16*1024*1024, + "zonesize": 32*1024*1024, + "bs": 4096, + "size": 256*1024*1024, + "io_size": 256*1024*204, + }, + "test_class": StridedTest, + }, + { + "test_id": 20, + "fio_opts": { + "norandommap": 1, + "zonerange": 8192, + "zonesize": 4096, + "bs": 4096, + "size": 16*4096, + "io_size": 16*4096, + }, + "test_class": StridedTest, + }, + { + "test_id": 21, + "fio_opts": { + "norandommap": 1, + "zonerange": 16*1024*1024, + "zonesize": 8*1024*1024, + "bs": 4096, + "size": 256*1024*1024, + "io_size": 256*1024*1024, + }, + "test_class": StridedTest, + }, +] + + 
+def parse_args(): + """Parse command-line arguments.""" + + parser = argparse.ArgumentParser() + parser.add_argument('-f', '--fio', help='path to file executable (e.g., ./fio)') + parser.add_argument('-a', '--artifact-root', help='artifact root directory') + parser.add_argument('-s', '--skip', nargs='+', type=int, + help='list of test(s) to skip') + parser.add_argument('-o', '--run-only', nargs='+', type=int, + help='list of test(s) to run, skipping all others') + parser.add_argument('--dut', + help='target file/device to test.') + args = parser.parse_args() + + return args + + +def main(): + """Run zonemode=strided tests.""" -if __name__ == '__main__': args = parse_args() - tests = [ # randommap enabled - { - "zonerange": 4096, - "zonesize": 4096, - "bs": 4096, - "offset": 8*4096, - "size": 16*4096, - "io_size": 16*4096, - }, - { - "zonerange": 4096, - "zonesize": 4096, - "bs": 4096, - "size": 16*4096, - "io_size": 16*4096, - }, - { - "zonerange": 16*1024*1024, - "zonesize": 16*1024*1024, - "bs": 4096, - "size": 256*1024*1024, - "io_size": 256*1024*204, - }, - { - "zonerange": 4096, - "zonesize": 4*4096, - "bs": 4096, - "size": 16*4096, - "io_size": 16*4096, - }, - { - "zonerange": 16*1024*1024, - "zonesize": 32*1024*1024, - "bs": 4096, - "size": 256*1024*1024, - "io_size": 256*1024*204, - }, - { - "zonerange": 8192, - "zonesize": 4096, - "bs": 4096, - "size": 16*4096, - "io_size": 16*4096, - }, - { - "zonerange": 16*1024*1024, - "zonesize": 8*1024*1024, - "bs": 4096, - "size": 256*1024*1024, - "io_size": 256*1024*204, - }, - # lfsr - { - "random_generator": "lfsr", - "zonerange": 4096*1024, - "zonesize": 4096*1024, - "bs": 4096, - "offset": 8*4096*1024, - "size": 16*4096*1024, - "io_size": 16*4096*1024, - }, - { - "random_generator": "lfsr", - "zonerange": 4096*1024, - "zonesize": 4096*1024, - "bs": 4096, - "size": 16*4096*1024, - "io_size": 16*4096*1024, - }, - { - "random_generator": "lfsr", - "zonerange": 16*1024*1024, - "zonesize": 16*1024*1024, - "bs": 
4096, - "size": 256*1024*1024, - "io_size": 256*1024*204, - }, - { - "random_generator": "lfsr", - "zonerange": 4096*1024, - "zonesize": 4*4096*1024, - "bs": 4096, - "size": 16*4096*1024, - "io_size": 16*4096*1024, - }, - { - "random_generator": "lfsr", - "zonerange": 16*1024*1024, - "zonesize": 32*1024*1024, - "bs": 4096, - "size": 256*1024*1024, - "io_size": 256*1024*204, - }, - { - "random_generator": "lfsr", - "zonerange": 8192*1024, - "zonesize": 4096*1024, - "bs": 4096, - "size": 16*4096*1024, - "io_size": 16*4096*1024, - }, - { - "random_generator": "lfsr", - "zonerange": 16*1024*1024, - "zonesize": 8*1024*1024, - "bs": 4096, - "size": 256*1024*1024, - "io_size": 256*1024*204, - }, - # norandommap - { - "norandommap": 1, - "zonerange": 4096, - "zonesize": 4096, - "bs": 4096, - "offset": 8*4096, - "size": 16*4096, - "io_size": 16*4096, - }, - { - "norandommap": 1, - "zonerange": 4096, - "zonesize": 4096, - "bs": 4096, - "size": 16*4096, - "io_size": 16*4096, - }, - { - "norandommap": 1, - "zonerange": 16*1024*1024, - "zonesize": 16*1024*1024, - "bs": 4096, - "size": 256*1024*1024, - "io_size": 256*1024*204, - }, - { - "norandommap": 1, - "zonerange": 4096, - "zonesize": 8192, - "bs": 4096, - "size": 16*4096, - "io_size": 16*4096, - }, - { - "norandommap": 1, - "zonerange": 16*1024*1024, - "zonesize": 32*1024*1024, - "bs": 4096, - "size": 256*1024*1024, - "io_size": 256*1024*204, - }, - { - "norandommap": 1, - "zonerange": 8192, - "zonesize": 4096, - "bs": 4096, - "size": 16*4096, - "io_size": 16*4096, - }, - { - "norandommap": 1, - "zonerange": 16*1024*1024, - "zonesize": 8*1024*1024, - "bs": 4096, - "size": 256*1024*1024, - "io_size": 256*1024*1024, - }, - - ] - - index = 1 - passed = 0 - failed = 0 - - if args.filename: - statinfo = os.stat(args.filename) + artifact_root = args.artifact_root if args.artifact_root else \ + f"strided-test-{time.strftime('%Y%m%d-%H%M%S')}" + os.mkdir(artifact_root) + print(f"Artifact directory is {artifact_root}") + + if 
args.fio: + fio_path = str(Path(args.fio).absolute()) + else: + fio_path = 'fio' + print(f"fio path is {fio_path}") + + if args.dut: + statinfo = os.stat(args.dut) filesize = statinfo.st_size if filesize == 0: - f = os.open(args.filename, os.O_RDONLY) + f = os.open(args.dut, os.O_RDONLY) filesize = os.lseek(f, 0, os.SEEK_END) os.close(f) - for test in tests: - if args.filename: - test['filename'] = args.filename - test['filesize'] = filesize + for test in TEST_LIST: + if args.dut: + test['fio_opts']['filename'] = os.path.abspath(args.dut) + test['fio_opts']['filesize'] = filesize else: - test['filesize'] = test['size'] - iops_log = run_fio(args.fio, test, index) - status = check_output(iops_log, test) - print("Test {0} {1}".format(index, ("PASSED" if status else "FAILED"))) - if status: - passed = passed + 1 - else: - failed = failed + 1 - index = index + 1 + test['fio_opts']['filesize'] = test['fio_opts']['size'] - print("{0} tests passed, {1} failed".format(passed, failed)) + test_env = { + 'fio_path': fio_path, + 'fio_root': str(Path(__file__).absolute().parent.parent), + 'artifact_root': artifact_root, + 'basename': 'strided', + } + _, failed, _ = run_fio_tests(TEST_LIST, test_env, args) sys.exit(failed) + + +if __name__ == '__main__': + main() diff --git a/t/t64-switch.sh b/t/t64-switch.sh new file mode 100755 index 0000000000..94b1275498 --- /dev/null +++ b/t/t64-switch.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash +# +# Make sure that Fio actually does switch to the tausworthe64 random generator +# when it detects that the combination of block size and file size exceeds the +# limits of the default 32-bit random generator. +# +# Do this by counting the number of times offsets occurring more than once are +# touched. The default random generator should produce more duplicate offsets +# than the tausworthe64 random generator. +# +# Count offsets by parsing Fio's debug output. 
Use grep and cut to obtain a +# list of offsets, sort them, and count how many times offsets occurring more +# than once are touched. +# +# Calculate the ratio of tausworthe32 duplicates to tausworthe64 duplicates. I +# am arbitrarily using a minimum ratio of 10 as the criteria for a passing +# test. +# +# Usage: +# t64-switch [FIO [COUNT]] +# + +FIO=${1:-fio} +COUNT=${2:-1000000} + +t32=$(${FIO} --name=test --ioengine=null --filesize=1T --bs=1 --rw=randread --debug=io --number_ios=${COUNT} --norandommap --randrepeat=0 --random_generator=tausworthe | grep complete: | cut -d '=' -f 2 | cut -d ',' -f 1 | sort -g | uniq -D | wc -l | tr -d ' ') +t64=$(${FIO} --name=test --ioengine=null --filesize=1T --bs=1 --rw=randread --debug=io --number_ios=${COUNT} --norandommap --randrepeat=0 | grep complete: | cut -d '=' -f 2 | cut -d ',' -f 1 | sort -g | uniq -D | wc -l | tr -d ' ') +if [ $t64 -gt 0 ]; then + let ratio=$t32/$t64 +else + let ratio=$t32 +fi + +echo tausworthe32: $t32 +echo tausworthe62: $t64 +echo ratio: $ratio + +if [ $ratio -ge 10 ]; then + echo result: pass +else + echo result: fail + exit 1 +fi diff --git a/t/time-test.c b/t/time-test.c index a74d9206f2..3c87d4d4c3 100644 --- a/t/time-test.c +++ b/t/time-test.c @@ -67,7 +67,7 @@ * accuracy because the (ticks * clock_mult) product used for final * fractional chunk * - * iv) 64-bit arithmetic with the clock ticks to nsec conversion occuring in + * iv) 64-bit arithmetic with the clock ticks to nsec conversion occurring in * two stages. This is carried out using locks to update the number of * large time chunks (MAX_CLOCK_SEC_2STAGE) that have elapsed. 
* diff --git a/t/verify-state.c b/t/verify-state.c index 734c1e4c77..9d76e501a1 100644 --- a/t/verify-state.c +++ b/t/verify-state.c @@ -21,18 +21,21 @@ static void show_s(struct thread_io_list *s, unsigned int no_s) printf("Thread:\t\t%u\n", no_s); printf("Name:\t\t%s\n", s->name); - printf("Completions:\t%llu\n", (unsigned long long) s->no_comps); printf("Depth:\t\t%llu\n", (unsigned long long) s->depth); printf("Number IOs:\t%llu\n", (unsigned long long) s->numberio); printf("Index:\t\t%llu\n", (unsigned long long) s->index); - printf("Completions:\n"); - if (!s->no_comps) + printf("Inflight writes:\n"); + if (!s->depth) return; - for (i = s->no_comps - 1; i >= 0; i--) { - printf("\t(file=%2llu) %llu\n", - (unsigned long long) s->comps[i].fileno, - (unsigned long long) s->comps[i].offset); + for (i = s->depth - 1; i >= 0; i--) { + uint64_t numberio; + numberio = s->inflight[i].numberio; + if (numberio == INVALID_NUMBERIO) + printf("\tNot inflight\n"); + else + printf("\t%llu\n", + (unsigned long long) s->inflight[i].numberio); } } @@ -44,22 +47,18 @@ static void show(struct thread_io_list *s, size_t size) do { int i; - s->no_comps = le64_to_cpu(s->no_comps); s->depth = le32_to_cpu(s->depth); - s->nofiles = le32_to_cpu(s->nofiles); s->numberio = le64_to_cpu(s->numberio); s->index = le64_to_cpu(s->index); - for (i = 0; i < s->no_comps; i++) { - s->comps[i].fileno = le64_to_cpu(s->comps[i].fileno); - s->comps[i].offset = le64_to_cpu(s->comps[i].offset); - } + for (i = 0; i < s->depth; i++) + s->inflight[i].numberio = le64_to_cpu(s->inflight[i].numberio); show_s(s, no_s); no_s++; - size -= __thread_io_list_sz(s->depth, s->nofiles); + size -= __thread_io_list_sz(s->depth); s = (struct thread_io_list *)((char *) s + - __thread_io_list_sz(s->depth, s->nofiles)); + __thread_io_list_sz(s->depth)); } while (size != 0); } @@ -90,7 +89,7 @@ static void show_verify_state(void *buf, size_t size) return; } - if (hdr->version == 0x03) + if (hdr->version == VSTATE_HDR_VERSION) 
show(s, size); else log_err("Unsupported version %d\n", (int) hdr->version); diff --git a/t/verify-trim.py b/t/verify-trim.py new file mode 100755 index 0000000000..cd98722a33 --- /dev/null +++ b/t/verify-trim.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +""" +# verify-trim.py +# +# Test fio's verify trim feature. +# +# USAGE +# see python3 verify-trim.py --help +# +# EXAMPLES +# python3 t/verify-trim.py +# python3 t/verify-trim.py --fio ./fio +# +# REQUIREMENTS +# Python 3.6 +# Linux +# +""" +import os +import sys +import time +import logging +import argparse +import subprocess +from pathlib import Path +from fiotestlib import FioJobCmdTest, run_fio_tests +from fiotestcommon import SUCCESS_NONZERO, Requirements + + +VERIFY_OPT_LIST = [ + 'direct', + 'iodepth', + 'filesize', + 'bs', + 'time_based', + 'runtime', + 'io_size', + 'offset', + 'number_ios', + 'output-format', + 'directory', + 'norandommap', + 'numjobs', + 'nrfiles', + 'openfiles', + 'ioengine', + 'trim_backlog_batch', + 'trim_verify_zero', + 'number_ios', +] + +class VerifyTrimTest(FioJobCmdTest): + """ + VerifyTrim test class. 
+ """ + + def setup(self, parameters): + """Setup a test.""" + + fio_args = [ + "--name=verifytrim", + "--verify=md5", + f"--filename={self.fio_opts['filename']}", + f"--rw={self.fio_opts['rw']}", + f"--trim_percentage={self.fio_opts['trim_percentage']}", + f"--trim_backlog={self.fio_opts['trim_backlog']}", + f"--output={self.filenames['output']}", + ] + for opt in VERIFY_OPT_LIST: + if opt in self.fio_opts: + option = f"--{opt}={self.fio_opts[opt]}" + fio_args.append(option) + + super().setup(fio_args) + + def check_result(self): + super().check_result() + + if self.fio_opts.get('output-format') == 'json': + actual = self.json_data['jobs'][0]['trim']['total_ios'] + expected = self.json_data['jobs'][0]['write']['total_ios'] * self.fio_opts['trim_percentage'] / 100 + if abs(expected - actual) > 0.1*expected: + self.passed = False + self.failure_reason += f" large discrepancy between expected {expected} and {actual} actual trims," + else: + logging.debug("expected %d trims ~match actual %d trims", expected, actual) + + if not self.passed: + with open(self.filenames['stderr'], "r") as se: + contents = se.read() + logging.info("stderr: %s", contents) + + with open(self.filenames['stdout'], "r") as so: + contents = so.read() + logging.info("stdout: %s", contents) + + with open(self.filenames['output'], "r") as out: + contents = out.read() + logging.info("output: %s", contents) + + +TEST_LIST = [ + # These tests are superficial. + # + # TODO: add a test case for trim_verify_zero by inducing a failure; the way + # to do this would be to write non-zero data to a block after it was + # trimmed but before it was read back (how to do this?) 
+ { + # make sure readonly option triggers error message when + # trim_{percentage,backlog} options make trim operations a possibility + "test_id": 1, + "fio_opts": { + "rw": "read", + "trim_percentage": 100, + "trim_backlog": 1, + "readonly": 1, + }, + "test_class": VerifyTrimTest, + "success": SUCCESS_NONZERO, + }, + { + # basic test seq write + # trim_backlog=1 + # trim_percentage=100 + "test_id": 100, + "fio_opts": { + "rw": "write", + "trim_percentage": 100, + "trim_backlog": 1, + "trim_verify_zero": 1, + "number_ios": 64, + "output-format": "json", + }, + "test_class": VerifyTrimTest, + }, + { + # basic test rand write + # trim_backlog=1 + # trim_percentage=100 + "test_id": 101, + "fio_opts": { + "rw": "randwrite", + "trim_percentage": 100, + "trim_backlog": 1, + "trim_verify_zero": 1, + "number_ios": 64, + "output-format": "json", + }, + "test_class": VerifyTrimTest, + }, + { + # basic test seq write + # trim_backlog=1 + # trim_percentage=50 + "test_id": 102, + "fio_opts": { + "rw": "write", + "trim_percentage": 50, + "trim_backlog": 1, + "trim_verify_zero": 1, + "number_ios": 64, + "output-format": "json", + }, + "test_class": VerifyTrimTest, + }, + { + # basic test rand write + # trim_backlog=1 + # trim_percentage=50 + "test_id": 103, + "fio_opts": { + "rw": "randwrite", + "trim_percentage": 50, + "trim_backlog": 1, + "trim_verify_zero": 1, + "number_ios": 64, + "output-format": "json", + }, + "test_class": VerifyTrimTest, + }, + { + # basic test seq write + # trim_backlog=16 + # trim_percentage=50 + "test_id": 104, + "fio_opts": { + "rw": "write", + "trim_percentage": 50, + "trim_backlog": 16, + "trim_verify_zero": 1, + "number_ios": 64, + "output-format": "json", + }, + "test_class": VerifyTrimTest, + }, + { + # basic test rand write + # trim_backlog=16 + # trim_percentage=50 + "test_id": 105, + "fio_opts": { + "rw": "randwrite", + "trim_percentage": 50, + "trim_backlog": 16, + "trim_verify_zero": 1, + "number_ios": 64, + "output-format": "json", + }, + 
"test_class": VerifyTrimTest, + }, + +] + + +def parse_args(): + """Parse command-line arguments.""" + + parser = argparse.ArgumentParser() + parser.add_argument('-r', '--fio-root', help='fio root path') + parser.add_argument('-d', '--debug', help='Enable debug messages', action='store_true') + parser.add_argument('-f', '--fio', help='path to file executable (e.g., ./fio)') + parser.add_argument('-a', '--artifact-root', help='artifact root directory') + parser.add_argument('-s', '--skip', nargs='+', type=int, + help='list of test(s) to skip') + parser.add_argument('-o', '--run-only', nargs='+', type=int, + help='list of test(s) to run, skipping all others') + parser.add_argument('-k', '--skip-req', action='store_true', + help='skip requirements checking') + parser.add_argument('--dut', + help='Block device to test against (use null_blk if not provided') + args = parser.parse_args() + + return args + + +def main(): + """ + Run tests for fio's verify trim feature. + """ + + args = parse_args() + + if args.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + artifact_root = args.artifact_root if args.artifact_root else \ + f"verify-trim-test-{time.strftime('%Y%m%d-%H%M%S')}" + os.mkdir(artifact_root) + print(f"Artifact directory is {artifact_root}") + + if args.fio: + fio_path = str(Path(args.fio).absolute()) + else: + fio_path = os.path.join(os.path.dirname(__file__), '../fio') + print(f"fio path is {fio_path}") + + if args.fio_root: + fio_root = args.fio_root + else: + fio_root = str(Path(__file__).absolute().parent.parent) + print(f"fio root is {fio_root}") + + if not args.skip_req: + Requirements(fio_root, args) + + cleanup_nullb = False + if not args.dut: + subprocess.run(["sudo", "modprobe", "-r", "null_blk"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + subprocess.run(["sudo", "modprobe", "null_blk", "memory_backed=1", "discard=1"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if 
os.path.exists('/dev/nullb0'): + args.dut = '/dev/nullb0' + cleanup_nullb = True + else: + print("No block device provided and could not create null_blk device for testing") + sys.exit(1) + + for test in TEST_LIST: + test['fio_opts']['filename'] = args.dut + + test_env = { + 'fio_path': fio_path, + 'fio_root': str(Path(__file__).absolute().parent.parent), + 'artifact_root': artifact_root, + 'basename': 'verifytrim', + } + + total = { 'passed': 0, 'failed': 0, 'skipped': 0 } + + try: + total['passed'], total['failed'], total['skipped'] = run_fio_tests(TEST_LIST, test_env, args) + except KeyboardInterrupt: + pass + + if cleanup_nullb: + subprocess.run(["sudo", "modprobe", "-r", "null_blk"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + sys.exit(total['failed']) + + +if __name__ == '__main__': + main() diff --git a/t/verify.py b/t/verify.py new file mode 100755 index 0000000000..4c3d0a3c61 --- /dev/null +++ b/t/verify.py @@ -0,0 +1,924 @@ +#!/usr/bin/env python3 +""" +# verify.py +# +# Test fio's verify options. 
+# +# USAGE +# see python3 verify.py --help +# +# EXAMPLES +# python3 t/verify.py +# python3 t/verify.py --fio ./fio +# +# REQUIREMENTS +# Python 3.6 +# - 4 CPUs +# +""" +import os +import sys +import time +import errno +import logging +import argparse +import platform +import itertools +from pathlib import Path +from fiotestlib import FioJobCmdTest, run_fio_tests +from fiotestcommon import SUCCESS_DEFAULT, SUCCESS_NONZERO, Requirements + + +VERIFY_OPT_LIST = [ + 'direct', + 'iodepth', + 'filesize', + 'bs', + 'time_based', + 'runtime', + 'io_size', + 'offset', + 'number_ios', + 'output-format', + 'directory', + 'norandommap', + 'numjobs', + 'nrfiles', + 'openfiles', + 'cpus_allowed', + 'fallocate', + 'experimental_verify', + 'verify_backlog', + 'verify_backlog_batch', + 'verify_interval', + 'verify_offset', + 'verify_async', + 'verify_async_cpus', + 'verify_pattern', + 'verify_pattern_interval', + 'verify_only', + 'verify_fatal', +] + +class VerifyTest(FioJobCmdTest): + """ + Verify test class. + """ + + def setup(self, parameters): + """Setup a test.""" + + fio_args = [ + "--name=verify", + "--fallocate=truncate", + f"--ioengine={self.fio_opts['ioengine']}", + f"--rw={self.fio_opts['rw']}", + f"--verify={self.fio_opts['verify']}", + f"--output={os.path.basename(self.filenames['output'])}", + ] + for opt in VERIFY_OPT_LIST: + if opt in self.fio_opts: + option = f"--{opt}={self.fio_opts[opt]}" + fio_args.append(option) + + super().setup(fio_args) + + def check_result(self): + super().check_result() + + if not self.passed: + with open(self.filenames['stderr'], "r") as se: + contents = se.read() + logging.info("stderr: %s", contents) + + with open(self.filenames['stdout'], "r") as so: + contents = so.read() + logging.info("stdout: %s", contents) + + with open(self.filenames['output'], "r") as out: + contents = out.read() + logging.info("output: %s", contents) + +class VerifyCSUMTest(FioJobCmdTest): + """ + Verify test class. 
Run standard verify jobs, modify the data, and then run + more verify jobs. Hopefully fio will detect that the data has changed. + """ + + @staticmethod + def add_verify_opts(opt_list, adds): + """Add optional options.""" + + fio_opts = [] + + for opt in adds: + if opt in opt_list: + option = f"--{opt}={opt_list[opt]}" + fio_opts.append(option) + + return fio_opts + + def setup(self, parameters): + """Setup a test.""" + + logging.debug("ioengine is %s", self.fio_opts['ioengine']) + fio_args_base = [ + "--fallocate=truncate", + "--filename=verify", + "--stonewall", + f"--ioengine={self.fio_opts['ioengine']}", + ] + + extra_options = self.add_verify_opts(self.fio_opts, VERIFY_OPT_LIST) + + verify_only = [ + "--verify_only", + f"--rw={self.fio_opts['rw']}", + f"--verify={self.fio_opts['verify']}", + ] + fio_args_base + extra_options + + verify_read = [ + "--rw=randread" if 'rand' in self.fio_opts['rw'] else "--rw=read", + f"--verify={self.fio_opts['verify']}", + ] + fio_args_base + extra_options + + layout = [ + "--name=layout", + f"--rw={self.fio_opts['rw']}", + f"--verify={self.fio_opts['verify']}", + ] + fio_args_base + extra_options + + success_only = ["--name=success_only"] + verify_only + success_read = ["--name=success_read"] + verify_read + + mangle = [ + "--name=mangle", + "--rw=randwrite", + "--randrepeat=0", + f"--bs={self.fio_opts['mangle_bs']}", + "--number_ios=1", + ] + fio_args_base + self.add_verify_opts(self.fio_opts, ['filesize']) + + failure_only = ["--name=failure_only"] + verify_only + failure_read = ["--name=failure_read"] + verify_read + + fio_args = layout + success_only + success_read + mangle + failure_only + failure_read + [f"--output={self.filenames['output']}"] + logging.debug("fio_args: %s", fio_args) + + super().setup(fio_args) + + def check_result(self): + super().check_result() + + checked = {} + + for job in self.json_data['jobs']: + if job['jobname'] == 'layout': + checked[job['jobname']] = True + if job['error']: + self.passed = 
False + self.failure_reason += " layout job failed" + elif 'success' in job['jobname']: + checked[job['jobname']] = True + if job['error']: + self.passed = False + self.failure_reason += f" verify pass {job['jobname']} that should have succeeded actually failed" + elif job['jobname'] == 'mangle': + checked[job['jobname']] = True + if job['error']: + self.passed = False + self.failure_reason += " mangle job failed" + elif 'failure' in job['jobname']: + checked[job['jobname']] = True + if self.fio_opts['verify'] == 'null' and not job['error']: + continue + if job['error'] != errno.EILSEQ: + self.passed = False + self.failure_reason += f" verify job {job['jobname']} produced {job['error']} instead of errno {errno.EILSEQ} Illegal byte sequence" + logging.debug(self.json_data) + else: + self.passed = False + self.failure_reason += " unknown job name" + + if len(checked) != 6: + self.passed = False + self.failure_reason += " six phases not completed" + + with open(self.filenames['stderr'], "r") as se: + contents = se.read() + logging.debug("stderr: %s", contents) + + +# +# These tests exercise fio's verify_pattern_interval option. +# +TEST_LIST_VPI = [ + { + # Basic test verify=pattern + "test_id": 3000, + "fio_opts": { + "ioengine": "psync", + "rw": "write", + "verify": "pattern", + "filesize": "1M", + "bs": 4096, + "output-format": "json", + }, + "test_class": VerifyTest, + "success": SUCCESS_DEFAULT, + }, + { + # Basic test verify=pattern_hdr + "test_id": 3001, + "fio_opts": { + "ioengine": "psync", + "rw": "write", + "verify": "pattern_hdr", + "filesize": "1M", + "bs": 4096, + "output-format": "json", + }, + "test_class": VerifyTest, + "success": SUCCESS_DEFAULT, + }, +] + + +# +# These tests exercise fio's decisions about verifying the sequence number and +# random seed in the verify header. 
+# +TEST_LIST_HEADER = [ + { + # Basic test with options at default values + "test_id": 2000, + "fio_opts": { + "ioengine": "libaio", + "filesize": "1M", + "bs": 4096, + "output-format": "json", + }, + "test_class": VerifyTest, + "success": SUCCESS_DEFAULT, + }, + { + # Basic test with iodepth 16 + "test_id": 2001, + "fio_opts": { + "ioengine": "libaio", + "filesize": "1M", + "bs": 4096, + "iodepth": 16, + "output-format": "json", + }, + "test_class": VerifyTest, + "success": SUCCESS_DEFAULT, + }, + { + # Basic test with 3 files + "test_id": 2002, + "fio_opts": { + "ioengine": "libaio", + "filesize": "1M", + "bs": 4096, + "nrfiles": 3, + "output-format": "json", + }, + "test_class": VerifyTest, + "success": SUCCESS_DEFAULT, + }, + { + # Basic test with iodepth 16 and 3 files + "test_id": 2003, + "fio_opts": { + "ioengine": "libaio", + "filesize": "1M", + "bs": 4096, + "iodepth": 16, + "nrfiles": 3, + "output-format": "json", + }, + "test_class": VerifyTest, + "success": SUCCESS_DEFAULT, + }, +] + +# +# These tests are mainly intended to assess the checksum functions. They write +# out data, run some verify jobs, then modify the data, and try to verify the +# data again, expecting to see failures. 
+# +TEST_LIST_CSUM = [ + { + # basic seq write verify job + "test_id": 1000, + "fio_opts": { + "ioengine": "psync", + "filesize": "1M", + "bs": 4096, + "rw": "write", + "output-format": "json", + "verify_fatal": 1, + }, + "test_class": VerifyCSUMTest, + "success": SUCCESS_NONZERO, + }, + { + # basic rand write verify job + "test_id": 1001, + "fio_opts": { + "ioengine": "psync", + "filesize": "1M", + "bs": 4096, + "rw": "randwrite", + "output-format": "json", + "verify_fatal": 1, + }, + "test_class": VerifyCSUMTest, + "success": SUCCESS_NONZERO, + }, + { + # basic libaio seq write test + "test_id": 1002, + "fio_opts": { + "direct": 1, + "ioengine": "libaio", + "iodepth": 16, + "filesize": "1M", + "bs": 4096, + "rw": "write", + "output-format": "json", + "verify_fatal": 1, + }, + "test_class": VerifyCSUMTest, + "success": SUCCESS_NONZERO, + }, + { + # basic libaio rand write test + "test_id": 1003, + "fio_opts": { + "direct": 1, + "ioengine": "libaio", + "iodepth": 16, + "filesize": "1M", + "bs": 4096, + "rw": "randwrite", + "output-format": "json", + "verify_fatal": 1, + }, + "test_class": VerifyCSUMTest, + "success": SUCCESS_NONZERO, + }, +] + +# +# These tests are run for all combinations of data direction and checksum +# methods. 
+# +TEST_LIST = [ + { + # norandommap with verify backlog + "test_id": 1, + "fio_opts": { + "direct": 1, + "ioengine": "libaio", + "iodepth": 32, + "filesize": "2M", + "norandommap": 1, + "bs": 512, + "time_based": 1, + "runtime": 3, + "verify_backlog": 128, + "verify_backlog_batch": 64, + }, + "test_class": VerifyTest, + }, + { + # norandommap with verify offset and interval + "test_id": 2, + "fio_opts": { + "direct": 1, + "ioengine": "libaio", + "iodepth": 32, + "filesize": "2M", + "io_size": "4M", + "norandommap": 1, + "bs": 4096, + "verify_interval": 2048, + "verify_offset": 1024, + }, + "test_class": VerifyTest, + }, + { + # norandommap with verify offload to async threads + "test_id": 3, + "fio_opts": { + "direct": 1, + "ioengine": "libaio", + "iodepth": 32, + "filesize": "2M", + "norandommap": 1, + "bs": 4096, + "cpus_allowed": "0-3", + "verify_async": 2, + "verify_async_cpus": "0-1", + }, + "test_class": VerifyTest, + "requirements": [Requirements.not_macos, + Requirements.cpucount4], + # mac os does not support CPU affinity + }, + { + # tausworthe combine all verify options + "test_id": 4, + "fio_opts": { + "direct": 1, + "ioengine": "libaio", + "iodepth": 32, + "filesize": "4M", + "bs": 4096, + "cpus_allowed": "0-3", + "time_based": 1, + "random_generator": "tausworthe", + "runtime": 3, + "verify_interval": 2048, + "verify_offset": 1024, + "verify_backlog": 128, + "verify_backlog_batch": 128, + "verify_async": 2, + "verify_async_cpus": "0-1", + }, + "test_class": VerifyTest, + "requirements": [Requirements.not_macos, + Requirements.cpucount4], + # mac os does not support CPU affinity + }, + { + # norandommap combine all verify options + "test_id": 5, + "fio_opts": { + "direct": 1, + "ioengine": "libaio", + "iodepth": 32, + "filesize": "4M", + "norandommap": 1, + "bs": 4096, + "cpus_allowed": "0-3", + "time_based": 1, + "runtime": 3, + "verify_interval": 2048, + "verify_offset": 1024, + "verify_backlog": 128, + "verify_backlog_batch": 128, + 
"verify_async": 2, + "verify_async_cpus": "0-1", + }, + "test_class": VerifyTest, + "requirements": [Requirements.not_macos, + Requirements.cpucount4], + # mac os does not support CPU affinity + }, + { + # multiple jobs and files with verify + "test_id": 6, + "fio_opts": { + "direct": 1, + "ioengine": "libaio", + "iodepth": 32, + "filesize": "512K", + "nrfiles": 3, + "openfiles": 2, + "numjobs": 2, + "norandommap": 1, + "bs": 4096, + "verify_interval": 2048, + "verify_offset": 1024, + "verify_backlog": 16, + "verify_backlog_batch": 16, + }, + "test_class": VerifyTest, + "requirements": [Requirements.not_macos,], + # Skip this test on macOS because it is flaky. With rw=write it can + # fail to complete even after 10min which prevents the rw=read instance + # from passing because the read instance depends on the file created by + # the write instance. See failure here: + # https://github.com/vincentkfu/fio/actions/runs/13683127191/job/38260091800#step:14:258 + }, +] + + +def parse_args(): + """Parse command-line arguments.""" + + parser = argparse.ArgumentParser() + parser.add_argument('-r', '--fio-root', help='fio root path') + parser.add_argument('-d', '--debug', help='Enable debug messages', action='store_true') + parser.add_argument('-f', '--fio', help='path to file executable (e.g., ./fio)') + parser.add_argument('-a', '--artifact-root', help='artifact root directory') + parser.add_argument('-c', '--complete', help='Enable all checksums', action='store_true') + parser.add_argument('-s', '--skip', nargs='+', type=int, + help='list of test(s) to skip') + parser.add_argument('-o', '--run-only', nargs='+', type=int, + help='list of test(s) to run, skipping all others') + parser.add_argument('-k', '--skip-req', action='store_true', + help='skip requirements checking') + parser.add_argument('--csum', nargs='+', type=str, + help='list of checksum methods to use, skipping all others') + args = parser.parse_args() + + return args + + +def verify_test_header(test_env, 
args, csum, mode, sequence): + """ + Adjust test arguments based on values of mode and sequence. Then run the + tests. This function is intended to run a set of tests that test + conditions under which the header random seed and sequence number are + checked. + + The result should be a matrix with these combinations: + {write, write w/verify_only, read/write, read/write w/verify_only, read} x + {sequential, random w/randommap, random w/norandommap, sequence modifiers} + """ + for test in TEST_LIST_HEADER: + # experimental_verify does not work in verify_only=1 mode + if "_vo" in mode and 'experimental_verify' in test['fio_opts'] and \ + test['fio_opts']['experimental_verify']: + test['force_skip'] = True + else: + test['force_skip'] = False + + test['fio_opts']['verify'] = csum + if csum in ('pattern', 'pattern_hdr'): + test['fio_opts']['verify_pattern'] = '"abcd"-120xdeadface' + test['fio_opts'].pop('verify_pattern_interval', None) + elif csum == 'pattern_interval': + test['fio_opts']['verify'] = "pattern_hdr" + test['fio_opts']['verify_pattern'] = '%o' + test['fio_opts']['verify_pattern_interval'] = 512 + elif csum == 'pattern_interval_nohdr': + test['fio_opts']['verify'] = "pattern" + test['fio_opts']['verify_pattern'] = '%o' + test['fio_opts']['verify_pattern_interval'] = 512 + else: + test['fio_opts'].pop('verify_pattern', None) + test['fio_opts'].pop('verify_pattern_interval', None) + + if 'norandommap' in sequence: + test['fio_opts']['norandommap'] = 1 + else: + test['fio_opts']['norandommap'] = 0 + + if 'randommap' in sequence: + prefix = "rand" + else: + prefix = "" + + if 'sequence_modifier' in sequence: + suffix = ":4096" + else: + suffix = "" + + if 'readwrite' in mode: + fio_ddir = 'rw' + elif 'write' in mode: + fio_ddir = 'write' + elif 'read' in mode: + fio_ddir = 'read' + else: + fio_ddir = "" + # TODO throw an exception here + test['fio_opts']['rw'] = prefix + fio_ddir + suffix + logging.debug("ddir is %s", test['fio_opts']['rw']) + + if '_vo' in 
mode: + vo = 1 + else: + vo = 0 + test['fio_opts']['verify_only'] = vo + + # For 100% read workloads we need to read a file that was written with + # verify enabled. Use a previous test case for this by pointing fio to + # write to a file in a specific directory. + # + # For verify_only tests we also need to point fio to a file that was + # written with verify enabled + if mode == 'read': + directory = os.path.join(test_env['artifact_root'].replace(f'mode_{mode}','mode_write'), + f"{test['test_id']:04d}") + test['fio_opts']['directory'] = str(Path(directory).absolute()) if \ + platform.system() != "Windows" else str(Path(directory).absolute()).replace(':', '\\:') + elif vo: + directory = os.path.join(test_env['artifact_root'].replace('write_vo','write'), + f"{test['test_id']:04d}") + test['fio_opts']['directory'] = str(Path(directory).absolute()) if \ + platform.system() != "Windows" else str(Path(directory).absolute()).replace(':', '\\:') + else: + test['fio_opts'].pop('directory', None) + + return run_fio_tests(TEST_LIST_HEADER, test_env, args) + + +MANGLE_JOB_BS = 0 +def verify_test_csum(test_env, args, mbs, csum): + """ + Adjust test arguments based on values of csum. Then run the tests. + This function is designed for a series of tests that check that checksum + methods can reliably detect data integrity issues. + """ + for test in TEST_LIST_CSUM: + # The crc7 checksum will produce too many false positives since when we + # modify the data there is a 1/128 chance that the checksum will not + # change. So skip this set of tests. 
+ if csum == 'crc7': + test['force_skip'] = True + else: + test['force_skip'] = False + test['fio_opts']['verify'] = csum + + if csum in ('pattern', 'pattern_hdr'): + test['fio_opts']['verify_pattern'] = '"abcd"-120xdeadface' + test['fio_opts'].pop('verify_pattern_interval', None) + elif csum == 'pattern_interval': + test['fio_opts']['verify'] = "pattern_hdr" + test['fio_opts']['verify_pattern'] = '%o' + test['fio_opts']['verify_pattern_interval'] = 512 + elif csum == 'pattern_interval_nohdr': + test['fio_opts']['verify'] = "pattern" + test['fio_opts']['verify_pattern'] = '%o' + test['fio_opts']['verify_pattern_interval'] = 512 + else: + test['fio_opts'].pop('verify_pattern', None) + test['fio_opts'].pop('verify_pattern_interval', None) + + if mbs == MANGLE_JOB_BS: + test['fio_opts']['mangle_bs'] = test['fio_opts']['bs'] + else: + test['fio_opts']['mangle_bs'] = mbs + + # These tests produce verification failures but not when verify=null, + # so adjust the success criterion. + if csum == 'null': + test['success'] = SUCCESS_DEFAULT + else: + test['success'] = SUCCESS_NONZERO + + return run_fio_tests(TEST_LIST_CSUM, test_env, args) + + +def verify_test_vpi(test_env, args, pattern, vpi, vi): + """ + Adjust test arguments based on values of pattern, vpi, and vi. Then run + the tests. + """ + for test in TEST_LIST_VPI: + test['force_skip'] = False + + test['fio_opts']['verify_pattern'] = pattern + test['fio_opts']['verify_interval'] = vi + test['fio_opts']['verify_pattern_interval'] = vpi + + for key in ['verify_interval', 'verify_pattern_interval']: + if not test['fio_opts'][key]: + test['fio_opts'].pop(key, None) + + return run_fio_tests(TEST_LIST_VPI, test_env, args) + + +def verify_test(test_env, args, ddir, csum): + """ + Adjust test arguments based on values of ddir and csum. Then run + the tests. 
+ """ + for test in TEST_LIST: + test['force_skip'] = False + + test['fio_opts']['rw'] = ddir + test['fio_opts']['verify'] = csum + + if csum in ('pattern', 'pattern_hdr'): + test['fio_opts']['verify_pattern'] = '"abcd"-120xdeadface' + test['fio_opts'].pop('verify_pattern_interval', None) + elif csum == 'pattern_interval': + test['fio_opts']['verify'] = "pattern_hdr" + test['fio_opts']['verify_pattern'] = '%o' + test['fio_opts']['verify_pattern_interval'] = 512 + elif csum == 'pattern_interval_nohdr': + test['fio_opts']['verify'] = "pattern" + test['fio_opts']['verify_pattern'] = '%o' + test['fio_opts']['verify_pattern_interval'] = 512 + else: + test['fio_opts'].pop('verify_pattern', None) + test['fio_opts'].pop('verify_pattern_interval', None) + + # For 100% read data directions we need the write file that was written with + # verify enabled. Use a previous test case for this by telling fio to + # write to a file in a specific directory. + if ddir == 'read': + directory = os.path.join(test_env['artifact_root'].replace(f'ddir_{ddir}','ddir_write'), + f"{test['test_id']:04d}") + test['fio_opts']['directory'] = str(Path(directory).absolute()) if \ + platform.system() != "Windows" else str(Path(directory).absolute()).replace(':', '\\:') + elif ddir == 'randread': + directory = os.path.join(test_env['artifact_root'].replace(f'ddir_{ddir}','ddir_randwrite'), + f"{test['test_id']:04d}") + test['fio_opts']['directory'] = str(Path(directory).absolute()) if \ + platform.system() != "Windows" else str(Path(directory).absolute()).replace(':', '\\:') + else: + test['fio_opts'].pop('directory', None) + + return run_fio_tests(TEST_LIST, test_env, args) + + +# 100% read workloads below must follow write workloads so that the 100% read +# workloads will be reading data written with verification enabled. 
+DDIR_LIST = [ + 'write', + 'readwrite', + 'read', + 'randwrite', + 'randrw', + 'randread', + ] +CSUM_LIST1 = [ + 'md5', + 'crc64', + 'pattern', + ] +CSUM_LIST2 = [ + 'md5', + 'crc64', + 'crc32c', + 'crc32c-intel', + 'crc16', + 'crc7', + 'xxhash', + 'sha512', + 'sha256', + 'sha1', + 'sha3-224', + 'sha3-384', + 'sha3-512', + 'pattern', + 'pattern_hdr', + 'pattern_interval', + 'pattern_interval_nohdr', + 'null', + ] + +def main(): + """ + Run tests for fio's verify feature. + """ + + args = parse_args() + + if args.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + artifact_root = args.artifact_root if args.artifact_root else \ + f"verify-test-{time.strftime('%Y%m%d-%H%M%S')}" + os.mkdir(artifact_root) + print(f"Artifact directory is {artifact_root}") + + if args.fio: + fio_path = str(Path(args.fio).absolute()) + else: + fio_path = os.path.join(os.path.dirname(__file__), '../fio') + print(f"fio path is {fio_path}") + + if args.fio_root: + fio_root = args.fio_root + else: + fio_root = str(Path(__file__).absolute().parent.parent) + print(f"fio root is {fio_root}") + + if not args.skip_req: + Requirements(fio_root, args) + + test_env = { + 'fio_path': fio_path, + 'fio_root': str(Path(__file__).absolute().parent.parent), + 'artifact_root': artifact_root, + 'basename': 'verify', + } + + if platform.system() == 'Linux': + aio = 'libaio' + sync = 'psync' + elif platform.system() == 'Windows': + aio = 'windowsaio' + sync = 'sync' + else: + aio = 'posixaio' + sync = 'psync' + for test in TEST_LIST: + if 'aio' in test['fio_opts']['ioengine']: + test['fio_opts']['ioengine'] = aio + if 'sync' in test['fio_opts']['ioengine']: + test['fio_opts']['ioengine'] = sync + for test in TEST_LIST_CSUM: + if 'aio' in test['fio_opts']['ioengine']: + test['fio_opts']['ioengine'] = aio + if 'sync' in test['fio_opts']['ioengine']: + test['fio_opts']['ioengine'] = sync + for test in TEST_LIST_HEADER: + if 'aio' in test['fio_opts']['ioengine']: 
+ test['fio_opts']['ioengine'] = aio + if 'sync' in test['fio_opts']['ioengine']: + test['fio_opts']['ioengine'] = sync + for test in TEST_LIST_VPI: + if 'aio' in test['fio_opts']['ioengine']: + test['fio_opts']['ioengine'] = aio + if 'sync' in test['fio_opts']['ioengine']: + test['fio_opts']['ioengine'] = sync + + total = { 'passed': 0, 'failed': 0, 'skipped': 0 } + + if args.complete: + csum_list = CSUM_LIST2 + else: + csum_list = CSUM_LIST1 + + if args.csum: + csum_list = args.csum + + try: + for ddir, csum in itertools.product(DDIR_LIST, csum_list): + print(f"\nddir: {ddir}, checksum: {csum}") + + test_env['artifact_root'] = os.path.join(artifact_root, + f"ddir_{ddir}_csum_{csum}") + os.mkdir(test_env['artifact_root']) + + passed, failed, skipped = verify_test(test_env, args, ddir, csum) + + total['passed'] += passed + total['failed'] += failed + total['skipped'] += skipped + + # MANGLE_JOB_BS means to mangle an entire block which should result in + # a header magic number error + # 4 means to mangle 4 bytes which should result in a checksum error + # unless the 4 bytes occur in the verification header + mangle_bs = [MANGLE_JOB_BS, 4] + for mbs, csum in itertools.product(mangle_bs, csum_list): + print(f"\nmangle block size: {mbs}, checksum: {csum}") + + test_env['artifact_root'] = os.path.join(artifact_root, + f"mbs_{mbs}_csum_{csum}") + os.mkdir(test_env['artifact_root']) + + passed, failed, skipped = verify_test_csum(test_env, args, mbs, csum) + + total['passed'] += passed + total['failed'] += failed + total['skipped'] += skipped + + # The loop below tests combinations of options that exercise fio's + # decisions about disabling checks for the sequence number and random + # seed in the verify header. 
+ mode_list = [ "write", "write_vo", "readwrite", "readwrite_vo", "read" ] + sequence_list = [ "sequential", "randommap", "norandommap", "sequence_modifier" ] + for mode, sequence in itertools.product(mode_list, sequence_list): + print(f"\nmode: {mode}, sequence: {sequence}") + + test_env['artifact_root'] = os.path.join(artifact_root, + f"mode_{mode}_seq_{sequence}") + os.mkdir(test_env['artifact_root']) + + passed, failed, skipped = verify_test_header(test_env, args, 'md5', mode, sequence) + + total['passed'] += passed + total['failed'] += failed + total['skipped'] += skipped + + # The loop below is for verify_pattern_interval tests + pattern_list = ['%o', '"abcde"', '1%o',] + vpi_list = [10, 129, 512, 4089, None] + verify_interval_list = [512, 1024, 2222, 3791, None] + for pattern, vpi, vi in itertools.product(pattern_list, vpi_list, verify_interval_list): + print(f"\npattern: {pattern}, verify_pattern_interval: {vpi}, verify_interval: {vi}") + + test_env['artifact_root'] = os.path.join(artifact_root, + f"pattern_{pattern}_vpi_{vpi}_vi_{vi}").replace('"', '').replace("%", 'pct') + os.mkdir(test_env['artifact_root']) + + passed, failed, skipped = verify_test_vpi(test_env, args, pattern, vpi, vi) + + total['passed'] += passed + total['failed'] += failed + total['skipped'] += skipped + + except KeyboardInterrupt: + pass + + print(f"\n\n{total['passed']} test(s) passed, {total['failed']} failed, " \ + f"{total['skipped']} skipped") + sys.exit(total['failed']) + + +if __name__ == '__main__': + main() diff --git a/t/zbd/functions b/t/zbd/functions index e4e248b9ff..7734371e5e 100644 --- a/t/zbd/functions +++ b/t/zbd/functions @@ -4,6 +4,7 @@ blkzone=$(type -p blkzone 2>/dev/null) sg_inq=$(type -p sg_inq 2>/dev/null) zbc_report_zones=$(type -p zbc_report_zones 2>/dev/null) zbc_reset_zone=$(type -p zbc_reset_zone 2>/dev/null) +zbc_close_zone=$(type -p zbc_close_zone 2>/dev/null) zbc_info=$(type -p zbc_info 2>/dev/null) if [ -z "${blkzone}" ] && { [ -z 
"${zbc_report_zones}" ] || [ -z "${zbc_reset_zone}" ]; }; then @@ -26,6 +27,17 @@ blkzone_reports_capacity() { "${blkzone}" report -c 1 -o 0 "${dev}" | grep -q 'cap ' } +has_command() { + local cmd="${1}" + + cmd_path=$(type -p "${cmd}" 2>/dev/null) + if [ -z "${cmd_path}" ]; then + echo "${cmd} is not available" + return 1 + fi + return 0 +} + # Whether or not $1 (/dev/...) is a NVME ZNS device. is_nvme_zns() { local s @@ -72,9 +84,11 @@ zone_cap_bs() { local sed_str='s/.*len \([0-9A-Za-z]*\), cap \([0-9A-Za-z]*\).*/\1 \2/p' local cap bs="$zone_size" - # When blkzone is not available or blkzone does not report capacity, + # When blkzone command is neither available nor relevant to the + # test device, or when blkzone command does not report capacity, # assume that zone capacity is same as zone size for all zones. - if [ -z "${blkzone}" ] || ! blkzone_reports_capacity "${dev}"; then + if [ -z "${blkzone}" ] || [ -z "$is_zbd" ] || [ -c "$dev" ] || + ! blkzone_reports_capacity "${dev}"; then echo "$zone_size" return fi @@ -209,8 +223,14 @@ last_online_zone() { # max_open_zones in sysfs, or which lacks zoned block device support completely. max_open_zones() { local dev=$1 + local realdev syspath - if [ -n "${sg_inq}" ] && [ ! -n "${use_libzbc}" ]; then + realdev=$(readlink -f "$dev") + syspath=/sys/block/${realdev##*/}/queue/max_open_zones + + if [ -b "${realdev}" ] && [ -r "${syspath}" ]; then + cat ${syspath} + elif [ -n "${sg_inq}" ] && [ ! -n "${use_libzbc}" ]; then if ! ${sg_inq} -e --page=0xB6 --len=20 --hex "$dev" \ > /dev/null 2>&1; then # When sg_inq can not get max open zones, specify 0 which indicates @@ -228,9 +248,67 @@ max_open_zones() { echo ${max_nr_open_zones} } fi - else + elif [ -n "${use_libzbc}" ]; then ${zbc_report_zones} "$dev" | sed -n 's/^[[:blank:]]*Maximum number of open sequential write required zones:[[:blank:]]*//p' + else + echo 0 + fi +} + +# If sysfs provides, get max_active_zones limit of the zoned block device. 
+max_active_zones() { + local dev=$1 + local sys_queue="/sys/block/${dev##*/}/queue/" + + if [[ -e "$sys_queue/max_active_zones" ]]; then + cat "$sys_queue/max_active_zones" + return + fi + echo 0 +} + +# Get minimum block size to write to seq zones. Refer the sysfs attribute +# zone_write_granularity which shows the valid minimum size regardless of zoned +# block device type. If the sysfs attribute is not available, refer physical +# block size for rotational SMR drives. For non-rotational devices such as ZNS +# devices, refer logical block size. +min_seq_write_size() { + local sys_path="/sys/block/$1/queue" + local -i size=0 + + if [[ -r "$sys_path/zone_write_granularity" ]]; then + size=$(<"$sys_path/zone_write_granularity") + fi + + if ((size)); then + echo "$size" + elif (($(<"$sys_path/rotational"))); then + cat "$sys_path/physical_block_size" + else + cat "$sys_path/logical_block_size" + fi +} + +urswrz() { + local dev=$1 + + if [ -n "${sg_inq}" ] && [ ! -n "${use_libzbc}" ]; then + if ! ${sg_inq} -e --page=0xB6 --len=10 --hex "$dev" \ + > /dev/null 2>&1; then + # Couldn't get URSWRZ bit. Assume the reads are unrestricted + # because this configuration is more common. + echo 1 + else + ${sg_inq} -e --page=0xB6 --len=10 --hex "$dev" | tail -1 | + { + read -r offset b0 b1 b2 b3 b4 trailer && \ + echo $(( $b4 & 0x01 )) || echo 0 + } + fi + else + ${zbc_info} "$dev" | + sed -n 's/^[[:blank:]].*Read commands are \(un\)restricted*/\1/p' | grep -q ^ && echo 1 || echo 0 fi } @@ -240,12 +318,12 @@ is_zbc() { [[ -z "$(${zbc_info} "$dev" | grep "is not a zoned block device")" ]] } -zbc_logical_block_size() { +zbc_physical_block_size() { local dev=$1 ${zbc_info} "$dev" | - grep "logical blocks" | - sed -n 's/^[[:blank:]]*[0-9]* logical blocks of[[:blank:]]*//p' | + grep "physical blocks" | + sed -n 's/^[[:blank:]]*[0-9]* physical blocks of[[:blank:]]*//p' | sed 's/ B//' } @@ -278,6 +356,18 @@ reset_zone() { fi } +# Close the zone on device $1 at offset $2. 
The offset must be specified in +# units of 512 byte sectors. +close_zone() { + local dev=$1 offset=$2 + + if [ -n "${blkzone}" ] && [ -z "${use_libzbc}" ]; then + ${blkzone} close -o "${offset}" -c 1 "$dev" + else + ${zbc_close_zone} -sector "$dev" "${offset}" >/dev/null + fi +} + # Extract the number of bytes that have been transferred from a line like # READ: bw=6847KiB/s (7011kB/s), 6847KiB/s-6847KiB/s (7011kB/s-7011kB/s), io=257MiB (269MB), run=38406-38406msec fio_io() { diff --git a/t/zbd/run-tests-against-nullb b/t/zbd/run-tests-against-nullb index 7d2c7fa8fc..f1cba35564 100755 --- a/t/zbd/run-tests-against-nullb +++ b/t/zbd/run-tests-against-nullb @@ -67,16 +67,33 @@ configure_nullb() fi echo "${zone_capacity}" > zone_capacity fi + if ((conv_pcnt)); then if ((!conv_supported)); then echo "null_blk does not support conventional zones" return 2 fi nr_conv=$((dev_size/zone_size*conv_pcnt/100)) - echo "${nr_conv}" > zone_nr_conv + else + nr_conv=0 + fi + echo "${nr_conv}" > zone_nr_conv + + if ((max_open)); then + echo "${max_open}" > zone_max_open + if ((max_active)); then + if ((!max_act_supported)); then + echo "null_blk does not support active zone counts" + return 2 + fi + echo "${max_active}" > zone_max_active + fi fi fi + [[ -w badblocks_once ]] && echo 1 > badblocks_once + [[ -w badblocks_partial_io ]] && echo 1 > badblocks_partial_io + echo 1 > power || return $? return 0 } @@ -90,6 +107,11 @@ show_nullb_config() echo " $(printf "Zone Capacity: %d MB" ${zone_capacity})" if ((max_open)); then echo " $(printf "Max Open: %d Zones" ${max_open})" + if ((max_active)); then + echo " $(printf "Max Active: %d Zones" ${max_active})" + else + echo " Max Active: Unlimited Zones" + fi else echo " Max Open: Unlimited Zones" fi @@ -124,6 +146,7 @@ section3() zone_size=4 zone_capacity=3 max_open=0 + max_active=0 } # Zoned device with mostly sequential zones, ZCAP == ZSIZE, unlimited MaxOpen. 
@@ -133,6 +156,7 @@ section4() zone_size=1 zone_capacity=1 max_open=0 + max_active=0 } # Zoned device with mostly sequential zones, ZCAP < ZSIZE, unlimited MaxOpen. @@ -142,6 +166,7 @@ section5() zone_size=4 zone_capacity=3 max_open=0 + max_active=0 } # Zoned device with mostly conventional zones, ZCAP == ZSIZE, unlimited MaxOpen. @@ -151,6 +176,7 @@ section6() zone_size=1 zone_capacity=1 max_open=0 + max_active=0 } # Zoned device with mostly conventional zones, ZCAP < ZSIZE, unlimited MaxOpen. @@ -161,9 +187,11 @@ section7() zone_size=4 zone_capacity=3 max_open=0 + max_active=0 } -# Zoned device with no conventional zones, ZCAP == ZSIZE, limited MaxOpen. +# Zoned device with no conventional zones, ZCAP == ZSIZE, limited MaxOpen, +# unlimited MaxActive. section8() { dev_size=1024 @@ -172,9 +200,11 @@ section8() zone_capacity=1 max_open=${set_max_open} zbd_test_opts+=("-o ${max_open}") + max_active=0 } -# Zoned device with no conventional zones, ZCAP < ZSIZE, limited MaxOpen. +# Zoned device with no conventional zones, ZCAP < ZSIZE, limited MaxOpen, +# unlimited MaxActive. section9() { conv_pcnt=0 @@ -182,9 +212,11 @@ section9() zone_capacity=3 max_open=${set_max_open} zbd_test_opts+=("-o ${max_open}") + max_active=0 } -# Zoned device with mostly sequential zones, ZCAP == ZSIZE, limited MaxOpen. +# Zoned device with mostly sequential zones, ZCAP == ZSIZE, limited MaxOpen, +# unlimited MaxActive. section10() { conv_pcnt=10 @@ -192,9 +224,11 @@ section10() zone_capacity=1 max_open=${set_max_open} zbd_test_opts+=("-o ${max_open}") + max_active=0 } -# Zoned device with mostly sequential zones, ZCAP < ZSIZE, limited MaxOpen. +# Zoned device with mostly sequential zones, ZCAP < ZSIZE, limited MaxOpen, +# unlimited MaxActive. section11() { conv_pcnt=10 @@ -202,9 +236,11 @@ section11() zone_capacity=3 max_open=${set_max_open} zbd_test_opts+=("-o ${max_open}") + max_active=0 } -# Zoned device with mostly conventional zones, ZCAP == ZSIZE, limited MaxOpen. 
+# Zoned device with mostly conventional zones, ZCAP == ZSIZE, limited MaxOpen, +# unlimited MaxActive. section12() { conv_pcnt=66 @@ -212,9 +248,11 @@ section12() zone_capacity=1 max_open=${set_max_open} zbd_test_opts+=("-o ${max_open}") + max_active=0 } -# Zoned device with mostly conventional zones, ZCAP < ZSIZE, limited MaxOpen. +# Zoned device with mostly conventional zones, ZCAP < ZSIZE, limited MaxOpen, +# unlimited MaxActive. section13() { dev_size=2048 @@ -223,6 +261,155 @@ section13() zone_capacity=3 max_open=${set_max_open} zbd_test_opts+=("-o ${max_open}") + max_active=0 +} + +# Zoned device with no conventional zones, ZCAP == ZSIZE, limited MaxOpen, +# MaxActive == MaxOpen. +section14() +{ + dev_size=1024 + conv_pcnt=0 + zone_size=1 + zone_capacity=1 + max_open=${set_max_open} + zbd_test_opts+=("-o ${max_open}") + max_active=${set_max_open} +} + +# Zoned device with no conventional zones, ZCAP < ZSIZE, limited MaxOpen, +# MaxActive == MaxOpen. +section15() +{ + conv_pcnt=0 + zone_size=4 + zone_capacity=3 + max_open=${set_max_open} + zbd_test_opts+=("-o ${max_open}") + max_active=${set_max_open} +} + +# Zoned device with mostly sequential zones, ZCAP == ZSIZE, limited MaxOpen, +# MaxActive == MaxOpen. +section16() +{ + conv_pcnt=10 + zone_size=1 + zone_capacity=1 + max_open=${set_max_open} + zbd_test_opts+=("-o ${max_open}") + max_active=${set_max_open} +} + +# Zoned device with mostly sequential zones, ZCAP < ZSIZE, limited MaxOpen, +# MaxActive == MaxOpen. +section17() +{ + conv_pcnt=10 + zone_size=4 + zone_capacity=3 + max_open=${set_max_open} + zbd_test_opts+=("-o ${max_open}") + max_active=${set_max_open} +} + +# Zoned device with mostly conventional zones, ZCAP == ZSIZE, limited MaxOpen, +# MaxActive == MaxOpen. 
+section18() +{ + conv_pcnt=66 + zone_size=1 + zone_capacity=1 + max_open=${set_max_open} + zbd_test_opts+=("-o ${max_open}") + max_active=${set_max_open} +} + +# Zoned device with mostly conventional zones, ZCAP < ZSIZE, limited MaxOpen, +# MaxActive == MaxOpen. +section19() +{ + dev_size=2048 + conv_pcnt=66 + zone_size=4 + zone_capacity=3 + max_open=${set_max_open} + zbd_test_opts+=("-o ${max_open}") + max_active=${set_max_open} +} + +# Zoned device with no conventional zones, ZCAP == ZSIZE, limited MaxOpen, +# MaxActive > MaxOpen. +section20() +{ + dev_size=1024 + conv_pcnt=0 + zone_size=1 + zone_capacity=1 + max_open=${set_max_open} + zbd_test_opts+=("-o ${max_open}") + max_active=$((set_max_open+set_extra_max_active)) +} + +# Zoned device with no conventional zones, ZCAP < ZSIZE, limited MaxOpen, +# MaxActive > MaxOpen. +section21() +{ + conv_pcnt=0 + zone_size=4 + zone_capacity=3 + max_open=${set_max_open} + zbd_test_opts+=("-o ${max_open}") + max_active=$((set_max_open+set_extra_max_active)) +} + +# Zoned device with mostly sequential zones, ZCAP == ZSIZE, limited MaxOpen, +# MaxActive > MaxOpen. +section22() +{ + conv_pcnt=10 + zone_size=1 + zone_capacity=1 + max_open=${set_max_open} + zbd_test_opts+=("-o ${max_open}") + max_active=$((set_max_open+set_extra_max_active)) +} + +# Zoned device with mostly sequential zones, ZCAP < ZSIZE, limited MaxOpen, +# MaxActive > MaxOpen. +section23() +{ + conv_pcnt=10 + zone_size=4 + zone_capacity=3 + max_open=${set_max_open} + zbd_test_opts+=("-o ${max_open}") + max_active=$((set_max_open+set_extra_max_active)) +} + +# Zoned device with mostly conventional zones, ZCAP == ZSIZE, limited MaxOpen, +# MaxActive > MaxOpen. +section24() +{ + conv_pcnt=66 + zone_size=1 + zone_capacity=1 + max_open=${set_max_open} + zbd_test_opts+=("-o ${max_open}") + max_active=$((set_max_open+set_extra_max_active)) +} + +# Zoned device with mostly conventional zones, ZCAP < ZSIZE, limited MaxOpen, +# MaxActive > MaxOpen. 
+section25() +{ + dev_size=2048 + conv_pcnt=66 + zone_size=4 + zone_capacity=3 + max_open=${set_max_open} + zbd_test_opts+=("-o ${max_open}") + max_active=$((set_max_open+set_extra_max_active)) } # @@ -233,10 +420,12 @@ scriptdir="$(cd "$(dirname "$0")" && pwd)" sections=() zcap_supported=1 conv_supported=1 +max_act_supported=1 list_only=0 dev_size=1024 dev_blocksize=4096 set_max_open=8 +set_extra_max_active=2 zbd_test_opts=() num_of_runs=1 test_case=0 @@ -276,6 +465,9 @@ fi if ! cat /sys/kernel/config/nullb/features | grep -q zone_nr_conv; then conv_supported=0 fi +if ! cat /sys/kernel/config/nullb/features | grep -q zone_max_active; then + max_act_supported=0 +fi rc=0 test_rc=0 diff --git a/t/zbd/run-tests-against-scsi_debug b/t/zbd/run-tests-against-scsi_debug new file mode 100755 index 0000000000..b50d7a2419 --- /dev/null +++ b/t/zbd/run-tests-against-scsi_debug @@ -0,0 +1,33 @@ +#!/bin/bash +# +# Copyright (C) 2020 Western Digital Corporation or its affiliates. +# +# SPDX-License-Identifier: GPL-2.0 +# +# A couple of test cases in t/zbd/test-zbd-support script depend on the error +# injection feature of scsi_debug. Prepare a zoned scsi_debug device and run +# only for the test cases. + +declare dev sg scriptdir + +scriptdir="$(cd "$(dirname "$0")" && pwd)" + +modprobe -qr scsi_debug +modprobe scsi_debug add_host=1 zbc=host-managed zone_nr_conv=0 + +dev=$(dmesg | tail -5 | grep "Attached SCSI disk" | grep -Po ".* \[\Ksd[a-z]*") + +if ! 
grep -qe scsi_debug /sys/block/"${dev}"/device/vpd_pg83; then + echo "Failed to create scsi_debug device" + exit 1 +fi + +sg=$(echo /sys/block/"${dev}"/device/scsi_generic/*) +sg=${sg##*/} + +echo standard engine: +"${scriptdir}"/test-zbd-support -t 72 -t 73 /dev/"${dev}" +echo libzbc engine with block device: +"${scriptdir}"/test-zbd-support -t 72 -t 73 -l /dev/"${dev}" +echo libzbc engine with sg node: +"${scriptdir}"/test-zbd-support -t 72 -t 73 -l /dev/"${sg}" diff --git a/t/zbd/test-zbd-support b/t/zbd/test-zbd-support index 7e2fff00da..40f1de90e8 100755 --- a/t/zbd/test-zbd-support +++ b/t/zbd/test-zbd-support @@ -15,8 +15,10 @@ usage() { echo -e "\t-w Reset all zones before executing each write test case" echo -e "\t-o Run fio with max_open_zones limit" echo -e "\t-t Run only a single test case with specified number" + echo -e "\t-s Start testing from the case with the specified number" echo -e "\t-q Quit the test run after any failed test" echo -e "\t-z Run fio with debug=zbd option" + echo -e "\t-u Use io_uring ioengine in place of libaio" } max() { @@ -38,11 +40,73 @@ min() { ioengine() { if [ -n "$use_libzbc" ]; then echo -n "--ioengine=libzbc" + elif [ "$1" = "libaio" -a -n "$force_io_uring" ]; then + echo -n "--ioengine=io_uring" else echo -n "--ioengine=$1" fi } +get_dev_path_by_id() { + for d in /sys/block/* /sys/block/*/*; do + if [[ ! -r "${d}/dev" ]]; then + continue + fi + if [[ "${1}" == "$(<"${d}/dev")" ]]; then + echo "/dev/${d##*/}" + return 0 + fi + done + return 1 +} + +get_scsi_device_path() { + local dev="${1}" + local syspath + + syspath=/sys/block/"${dev##*/}"/device + if [[ -r /sys/class/scsi_generic/"${dev##*/}"/device ]]; then + syspath=/sys/class/scsi_generic/"${dev##*/}"/device + fi + realpath "$syspath" +} + +dm_destination_dev_set_io_scheduler() { + local dev=$1 sched=$2 + local dest_dev_id dest_dev path + + has_command dmsetup || return 1 + + while read -r dest_dev_id; do + if ! 
dest_dev=$(get_dev_path_by_id "${dest_dev_id}"); then + continue + fi + path=${dest_dev/dev/sys\/block}/queue/scheduler + if [[ ! -w ${path} ]]; then + echo "Can not set scheduler of device mapper destination: ${dest_dev}" + continue + fi + echo "${2}" > "${path}" + done < <(dmsetup table "$(<"/sys/block/$dev/dm/name")" | + sed -n 's/.* \([0-9]*:[0-9]*\).*/\1/p') +} + +dev_has_dm_map() { + local dev=${1} target_type=${2} + local dm_name + + has_command dmsetup || return 1 + + dm_name=$(<"/sys/block/$dev/dm/name") + if ! dmsetup status "${dm_name}" | grep -qe "${target_type}"; then + return 1 + fi + if dmsetup status "${dm_name}" | grep -v "${target_type}"; then + return 1 + fi + return 0 +} + set_io_scheduler() { local dev=$1 sched=$2 @@ -59,7 +123,17 @@ set_io_scheduler() { esac fi - echo "$sched" >"/sys/block/$dev/queue/scheduler" + if [ -w "/sys/block/$dev/queue/scheduler" ]; then + echo "$sched" >"/sys/block/$dev/queue/scheduler" + elif [ -r "/sys/block/$dev/dm/name" ] && + ( dev_has_dm_map "$dev" linear || + dev_has_dm_map "$dev" flakey || + dev_has_dm_map "$dev" crypt ); then + dm_destination_dev_set_io_scheduler "$dev" "$sched" + else + echo "can not set io scheduler" + exit 1 + fi } check_read() { @@ -163,7 +237,7 @@ write_and_run_one_fio_job() { shift 2 r=$(((RANDOM << 16) | RANDOM)) write_opts=(--name="write_job" --rw=write "$(ioengine "psync")" \ - --bs="${logical_block_size}" --zonemode=zbd \ + --bs="${min_seq_write_size}" --zonemode=zbd \ --zonesize="${zone_size}" --thread=1 --direct=1 \ --offset="${write_offset}" --size="${write_size}") write_opts+=("${job_var_opts[@]}") @@ -229,6 +303,14 @@ require_regular_block_dev() { return 0 } +require_block_dev() { + if [[ -b "$realdev" ]]; then + return 0 + fi + SKIP_REASON="$dev is not a block device" + return 1 +} + require_seq_zones() { local req_seq_zones=${1} local seq_bytes=$((disk_size - first_sequential_zone_sector * 512)) @@ -251,8 +333,84 @@ require_conv_zones() { return 0 } -# Check whether buffered 
writes are refused. +require_max_open_zones() { + local min=${1} + + if ((max_open_zones !=0 && max_open_zones < min)); then + SKIP_REASON="max_open_zones of $dev is smaller than $min" + return 1 + fi + return 0 +} + +require_max_active_zones() { + local min=${1} + + if ((max_active_zones == 0)); then + SKIP_REASON="$dev does not have max_active_zones limit" + return 1 + fi + if ((max_active_zones < min)); then + SKIP_REASON="max_active_zones of $dev is smaller than $min" + return 1 + fi + return 0 +} + +require_no_max_active_zones() { + if ((max_active_zones > 0)); then + SKIP_REASON="$dev has max_active_zones limit" + return 1 + fi + return 0 +} + +require_badblock() { + local syspath sdebug_path + + syspath=/sys/kernel/config/nullb/"${dev##*/}" + if [[ -d "${syspath}" ]]; then + if [[ ! -w "${syspath}/badblocks" ]]; then + SKIP_REASON="$dev does not have badblocks attribute" + return 1 + fi + if [[ ! -w "${syspath}/badblocks_once" ]]; then + SKIP_REASON="$dev does not have badblocks_once attribute" + return 1 + fi + if ((! $(<"${syspath}/badblocks_once"))); then + SKIP_REASON="badblocks_once attribute is not set for $dev" + return 1 + fi + return 0 + fi + + syspath=$(get_scsi_device_path "$dev") + if [[ -r ${syspath}/model && + $(<"${syspath}"/model) =~ scsi_debug ]]; then + sdebug_path=/sys/kernel/debug/scsi_debug/${syspath##*/} + if [[ ! -w "$sdebug_path"/error ]]; then + SKIP_REASON="$dev does not have write error injection" + return 1 + fi + return 0 + fi + + SKIP_REASON="$dev does not support either badblocks or error injection" + return 1 +} + +require_nullb() { + if [[ ! -d /sys/kernel/config/nullb/"${dev##*/}" ]]; then + SKIP_REASON="$dev is not null_blk" + return 1 + fi + return 0 +} + +# Check whether buffered writes are refused for block devices. 
test1() { + require_block_dev || return $SKIP_TESTCASE run_fio --name=job1 --filename="$dev" --rw=write --direct=0 --bs=4K \ "$(ioengine "psync")" --size="${zone_size}" --thread=1 \ --zonemode=zbd --zonesize="${zone_size}" 2>&1 | @@ -313,12 +471,20 @@ test4() { size=$((zone_size)) [ -n "$is_zbd" ] && reset_zone "$dev" $((off / 512)) opts+=("--name=$dev" "--filename=$dev" "--offset=$off") - opts+=(--bs="$(min $((logical_block_size * 256)) $size)") + opts+=(--bs="$(min $((min_seq_write_size * 256)) $size)") opts+=("--size=$size" "--thread=1" "--read_beyond_wp=1") opts+=("$(ioengine "psync")" "--rw=read" "--direct=1" "--disable_lat=1") opts+=("--zonemode=zbd" "--zonesize=${zone_size}") - run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 || return $? - check_read $size || return $? + run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 + fio_rc=$? + if [[ $unrestricted_reads != 0 ]]; then + if [[ $fio_rc != 0 ]]; then + return "$fio_rc" + fi + check_read $size || return $? + else + [ $fio_rc == 0 ] && return 1 || return 0 + fi } # Sequential write to sequential zones. @@ -329,7 +495,7 @@ test5() { off=$((first_sequential_zone_sector * 512)) capacity=$(total_zone_capacity 4 $off $dev) size=$((4 * zone_size)) - bs=$(min "$(max $((zone_size / 64)) "$logical_block_size")" "$zone_cap_bs") + bs=$(min "$(max $((zone_size / 64)) "$min_seq_write_size")" "$zone_cap_bs") run_fio_on_seq "$(ioengine "psync")" --iodepth=1 --rw=write \ --bs="$bs" --do_verify=1 --verify=md5 \ >>"${logfile}.${test_number}" 2>&1 || return $? 
@@ -345,7 +511,7 @@ test6() { off=$((first_sequential_zone_sector * 512)) capacity=$(total_zone_capacity 4 $off $dev) size=$((4 * zone_size)) - bs=$(min "$(max $((zone_size / 64)) "$logical_block_size")" "$zone_cap_bs") + bs=$(min "$(max $((zone_size / 64)) "$min_seq_write_size")" "$zone_cap_bs") write_and_run_one_fio_job \ $((first_sequential_zone_sector * 512)) "${size}" \ --offset="${off}" \ @@ -438,7 +604,8 @@ test11() { test12() { local size off capacity - prep_write + [ -n "$is_zbd" ] && reset_zone "$dev" -1 + size=$((8 * zone_size)) off=$((first_sequential_zone_sector * 512)) capacity=$(total_zone_capacity 8 $off $dev) @@ -453,7 +620,10 @@ test12() { test13() { local size off capacity - prep_write + require_max_open_zones 4 || return $SKIP_TESTCASE + + [ -n "$is_zbd" ] && reset_zone "$dev" -1 + size=$((8 * zone_size)) off=$((first_sequential_zone_sector * 512)) capacity=$(total_zone_capacity 8 $off $dev) @@ -702,7 +872,9 @@ test29() { require_seq_zones 80 || return $SKIP_TESTCASE off=$((first_sequential_zone_sector * 512 + 64 * zone_size)) size=$((16*zone_size)) - prep_write + + [ -n "$is_zbd" ] && reset_zone "$dev" -1 + opts=("--debug=zbd") for ((i=0;i>"${logfile}.${test_number}" 2>&1 @@ -742,19 +914,23 @@ test31() { # To distribute the write target zones evenly, skip certain zones for every # write. Utilize zonemode strided for such write patterns. 
bs=$((128 * 1024)) + off=$((first_sequential_zone_sector * 512)) + size=$((disk_size - off)) nz=$((max_open_zones)) if [[ $nz -eq 0 ]]; then nz=128 fi - off=$((first_sequential_zone_sector * 512)) - size=$((disk_size - off)) + if ((size / zone_size < nz)); then + nz=$((size / zone_size)) + fi inc=$(((size / nz / zone_size) * zone_size)) opts=("--name=$dev" "--filename=$dev" "--rw=write" "--bs=${bs}") opts+=("--offset=$off" "--size=$((inc * nz))" "--io_size=$((bs * nz))") opts+=("--zonemode=strided" "--zonesize=${bs}" "--zonerange=${inc}") - opts+=("--direct=1") + opts+=("--direct=1" "$(ioengine "psync")") echo "fio ${opts[@]}" >> "${logfile}.${test_number}" - "$(dirname "$0")/../../fio" "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 + "$(dirname "$0")/../../fio" "${opts[@]}" >> "${logfile}.${test_number}" \ + 2>&1 || return $? # Next, run the test. opts=("--name=$dev" "--filename=$dev" "--offset=$off" "--size=$size") @@ -772,7 +948,8 @@ test32() { require_zbd || return $SKIP_TESTCASE - prep_write + [ -n "$is_zbd" ] && reset_zone "$dev" -1 + off=$((first_sequential_zone_sector * 512)) size=$((disk_size - off)) opts+=("--name=$dev" "--filename=$dev" "--offset=$off" "--size=$size") @@ -789,7 +966,8 @@ test33() { local bs io_size size local off capacity=0; - prep_write + [ -n "$is_zbd" ] && reset_zone "$dev" -1 + off=$((first_sequential_zone_sector * 512)) capacity=$(total_zone_capacity 1 $off $dev) size=$((2 * zone_size)) @@ -798,20 +976,30 @@ test33() { run_fio_on_seq "$(ioengine "psync")" --iodepth=1 --rw=write \ --size=$size --io_size=$io_size --bs=$bs \ >> "${logfile}.${test_number}" 2>&1 || return $? - check_written $(((io_size + bs - 1) / bs * bs)) || return $? + check_written $((io_size / bs * bs)) || return $? } -# Write to sequential zones with a block size that is not a divisor of the -# zone size and with data verification enabled. +# Test repeated async write job with verify using two unaligned block sizes. 
test34() { - local size + local bs off zone_capacity + local -a block_sizes - prep_write - size=$((2 * zone_size)) - run_fio_on_seq "$(ioengine "psync")" --iodepth=1 --rw=write --size=$size \ - --do_verify=1 --verify=md5 --bs=$((3 * zone_size / 4)) \ - >> "${logfile}.${test_number}" 2>&1 && return 1 - grep -q 'not a divisor of' "${logfile}.${test_number}" + require_zbd || return $SKIP_TESTCASE + prep_write + + off=$((first_sequential_zone_sector * 512)) + zone_capacity=$(total_zone_capacity 1 $off $dev) + block_sizes=($((4096 * 7)) $(($(min ${zone_capacity} 4194304) - 4096))) + + for bs in ${block_sizes[@]}; do + run_fio --name=job --filename="${dev}" --rw=randwrite \ + --bs="${bs}" --offset="${off}" \ + --size=$((4 * zone_size)) --iodepth=256 \ + "$(ioengine "libaio")" --time_based=1 --runtime=15s \ + --zonemode=zbd --direct=1 --zonesize="${zone_size}" \ + --verify=crc32c --do_verify=1 ${job_var_opts[@]} \ + >> "${logfile}.${test_number}" 2>&1 || return $? + done } # Test 1/4 for the I/O boundary rounding code: $size < $zone_size. @@ -869,9 +1057,9 @@ test38() { local bs off size prep_write - size=$((logical_block_size)) - off=$((disk_size - logical_block_size)) - bs=$((logical_block_size)) + size=$((min_seq_write_size)) + off=$((disk_size - min_seq_write_size)) + bs=$((min_seq_write_size)) run_one_fio_job --offset=$off --size=$size "$(ioengine "psync")" \ --iodepth=1 --rw=write --do_verify=1 --verify=md5 \ --bs=$bs --zonemode=zbd --zonesize="${zone_size}" \ @@ -889,7 +1077,7 @@ read_one_block() { exit 1 fi off=${result[0]} - bs=$((logical_block_size)) + bs=$((min_seq_write_size)) run_one_fio_job --rw=read "$(ioengine "psync")" --offset=$off --bs=$bs \ --size=$bs "$@" 2>&1 | tee -a "${logfile}.${test_number}" @@ -899,14 +1087,14 @@ read_one_block() { test39() { require_zbd || return $SKIP_TESTCASE read_one_block --zonemode=none >/dev/null || return $? - check_read $((logical_block_size)) || return $? + check_read $((min_seq_write_size)) || return $? 
} # Check whether fio accepts --zonemode=strided for zoned block devices. test40() { local bs - bs=$((logical_block_size)) + bs=$((min_seq_write_size)) require_zbd || return $SKIP_TESTCASE read_one_block --zonemode=strided | grep -q 'fio: --zonesize must be specified when using --zonemode=strided' || @@ -944,15 +1132,20 @@ test44() { test45() { local bs i + local grep_str="fio: first I/O failed. If .* is a zoned block device, consider --zonemode=zbd" require_zbd || return $SKIP_TESTCASE prep_write - bs=$((logical_block_size)) - run_one_fio_job "$(ioengine "psync")" --iodepth=1 --rw=randwrite --bs=$bs\ - --offset=$((first_sequential_zone_sector * 512)) \ - --size="$zone_size" --do_verify=1 --verify=md5 2>&1 | - tee -a "${logfile}.${test_number}" | - grep -q "fio: first I/O failed. If .* is a zoned block device, consider --zonemode=zbd" + bs=$((min_seq_write_size)) + for ((i = 0; i < 10; i++)); do + run_one_fio_job "$(ioengine "psync")" --iodepth=1 --rw=randwrite \ + --offset=$((first_sequential_zone_sector * 512)) \ + --bs="$bs" --time_based --runtime=1s \ + --do_verify=1 --verify=md5 \ + >> "${logfile}.${test_number}" 2>&1 + grep -qe "$grep_str" "${logfile}.${test_number}" && return 0 + done + return 1 } # Random write to sequential zones, libaio, 8 jobs, queue depth 64 per job @@ -972,7 +1165,7 @@ test47() { local bs prep_write - bs=$((logical_block_size)) + bs=$((min_seq_write_size)) run_fio_on_seq "$(ioengine "psync")" --rw=write --bs=$bs --zoneskip=1 \ >> "${logfile}.${test_number}" 2>&1 && return 1 grep -q 'zoneskip 1 is not a multiple of the device zone size' "${logfile}.${test_number}" @@ -989,7 +1182,9 @@ test48() { off=$((first_sequential_zone_sector * 512 + 64 * zone_size)) size=$((16*zone_size)) - prep_write + + [ -n "$is_zbd" ] && reset_zone "$dev" -1 + opts=("--aux-path=/tmp" "--allow_file_create=0" "--significant_figures=10") opts+=("--debug=zbd") opts+=("$(ioengine "libaio")" "--rw=randwrite" "--direct=1") @@ -1059,7 +1254,7 @@ test51() { 
require_conv_zones 8 || return $SKIP_TESTCASE require_seq_zones 8 || return $SKIP_TESTCASE - prep_write + reset_zone "$dev" -1 off=$((first_sequential_zone_sector * 512 - 8 * zone_size)) opts+=("--size=$((16 * zone_size))" "$(ioengine "libaio")") @@ -1075,8 +1270,8 @@ test51() { run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 || return $? } -# Verify that zone_reset_threshold only takes logical blocks from seq -# zones into account, and logical blocks of conv zones are not counted. +# Verify that zone_reset_threshold only accounts written bytes in seq +# zones, and written data bytes of conv zones are not counted. test52() { local off io_size @@ -1140,6 +1335,7 @@ test54() { require_zbd || return $SKIP_TESTCASE require_seq_zones 8 || return $SKIP_TESTCASE + prep_write run_fio --name=job --filename=${dev} "$(ioengine "libaio")" \ --time_based=1 --runtime=30s --continue_on_error=0 \ --offset=$((first_sequential_zone_sector * 512)) \ @@ -1147,7 +1343,6 @@ test54() { --rw=randrw:2 --rwmixwrite=25 --bsrange=4k-${zone_size} \ --zonemode=zbd --zonesize=${zone_size} \ --verify=crc32c --do_verify=1 --verify_backlog=2 \ - --experimental_verify=1 \ --alloc-size=65536 --random_generator=tausworthe64 \ ${job_var_opts[@]} --debug=zbd \ >> "${logfile}.${test_number}" 2>&1 || return $? 
@@ -1156,12 +1351,13 @@ test54() { # test 'z' suffix parsing only test55() { local bs - bs=$((logical_block_size)) + bs=$((min_seq_write_size)) require_zbd || return $SKIP_TESTCASE # offset=1z + offset_increment=10z + size=2z require_seq_zones 13 || return $SKIP_TESTCASE + prep_write run_fio --name=j \ --filename=${dev} \ --direct=1 \ @@ -1182,11 +1378,12 @@ test55() { # test 'z' suffix parsing only test56() { local bs - bs=$((logical_block_size)) + bs=$((min_seq_write_size)) require_regular_block_dev || return $SKIP_TESTCASE require_seq_zones 10 || return $SKIP_TESTCASE + prep_write run_fio --name=j \ --filename=${dev} \ --direct=1 \ @@ -1208,6 +1405,7 @@ test57() { require_zbd || return $SKIP_TESTCASE + prep_write bs=$((4096 * 7)) off=$((first_sequential_zone_sector * 512)) @@ -1226,7 +1424,7 @@ test58() { require_seq_zones 128 || return $SKIP_TESTCASE size=$((zone_size * 128)) - bs="$(max $((zone_size / 128)) "$logical_block_size")" + bs="$(max $((zone_size / 128)) "$min_seq_write_size")" prep_write off=$((first_sequential_zone_sector * 512)) run_fio --zonemode=zbd --direct=1 --zonesize="${zone_size}" --thread=1 \ @@ -1245,6 +1443,433 @@ test58() { >>"${logfile}.${test_number}" 2>&1 } +# Test zone_reset_threshold with verify. +test59() { + local off bs loops=2 size=$((zone_size)) w + local -a workloads=(write randwrite rw randrw) + + prep_write + off=$((first_sequential_zone_sector * 512)) + + bs=$(min $((256*1024)) "$zone_size") + for w in "${workloads[@]}"; do + run_fio_on_seq "$(ioengine "psync")" --rw=${w} --bs="$bs" \ + --size=$size --loops=$loops --do_verify=1 \ + --verify=md5 --zone_reset_frequency=.9 \ + --zone_reset_threshold=.1 \ + >> "${logfile}.${test_number}" 2>&1 || return $? + done +} + +# Test fio errors out experimental_verify option with zonemode=zbd. 
+test60() { + run_fio_on_seq "$(ioengine "psync")" --rw=write --size=$zone_size \ + --do_verify=1 --verify=md5 --experimental_verify=1 \ + >> "${logfile}.${test_number}" 2>&1 && return 1 + grep -q 'not support experimental verify' "${logfile}.${test_number}" +} + +# Test fio errors out zone_reset_threshold option for multiple jobs with +# different write ranges. +test61() { + run_fio_on_seq "$(ioengine "psync")" --rw=write --size="$zone_size" \ + --numjobs=2 --offset_increment="$zone_size" \ + --zone_reset_threshold=0.1 --zone_reset_frequency=1 \ + --exitall_on_error=1 \ + >> "${logfile}.${test_number}" 2>&1 && return 1 + grep -q 'different write ranges' "${logfile}.${test_number}" +} + +# Test zone_reset_threshold option works for multiple jobs with same write +# range. +test62() { + local bs loops=2 size=$((zone_size)) + + [ -n "$is_zbd" ] && reset_zone "$dev" -1 + + # Two jobs write to single zone twice. Reset zone happens at next write + # after half of the zone gets filled. So 2 * 2 * 2 - 1 = 7 times zone + # resets are expected. + bs=$(min $((256*1024)) $((zone_size / 4))) + run_fio_on_seq "$(ioengine "psync")" --rw=write --bs="$bs" \ + --size=$size --loops=$loops --numjobs=2 \ + --zone_reset_frequency=1 --zone_reset_threshold=.5 \ + --group_reporting=1 \ + >> "${logfile}.${test_number}" 2>&1 || return $? + check_written $((size * loops * 2)) || return $? + check_reset_count -eq 7 || return $? +} + +# Test zone_reset_threshold option works for a read job and a write job with +# different IO range. +test63() { + local bs loops=2 size=$((zone_size)) off1 off2 + + [ -n "$is_zbd" ] && reset_zone "$dev" -1 + + off1=$((first_sequential_zone_sector * 512)) + off2=$((off1 + zone_size)) + bs=$(min $((256*1024)) $((zone_size / 4))) + + # One job writes to single zone twice. Reset zone happens at next write + # after half of the zone gets filled. So 2 * 2 - 1 = 3 times zone resets + # are expected. 
+ run_fio "$(ioengine "psync")" --bs="$bs" --size=$size --loops=$loops \ + --filename="$dev" --group_reporting=1 \ + --zonemode=zbd --zonesize="$zone_size" --direct=1 \ + --zone_reset_frequency=1 --zone_reset_threshold=.5 \ + --name=r --rw=read --offset=$off1 "${job_var_opts[@]}" \ + --name=w --rw=write --offset=$off2 "${job_var_opts[@]}" \ + >> "${logfile}.${test_number}" 2>&1 || return $? + check_written $((size * loops)) || return $? + check_reset_count -eq 3 || return $? +} + +# Test write zone accounting handles almost full zones correctly. Prepare an +# almost full, but not full zone. Write to the zone with verify using larger +# block size. Then confirm fio does not report write zone accounting failure. +test64() { + local bs cap + + [ -n "$is_zbd" ] && reset_zone "$dev" -1 + + bs=$((zone_size / 8)) + cap=$(total_zone_capacity 1 $((first_sequential_zone_sector*512)) $dev) + run_fio_on_seq "$(ioengine "psync")" --rw=write --bs="$bs" \ + --size=$((zone_size)) \ + --io_size=$((cap - bs)) \ + >> "${logfile}.${test_number}" 2>&1 || return $? + + bs=$((zone_size / 2)) + run_fio_on_seq "$(ioengine "psync")" --rw=write --bs="$bs" \ + --size=$((zone_size)) --do_verify=1 --verify=md5 \ + >> "${logfile}.${test_number}" 2>&1 || return $? +} + +# Test open zone accounting handles trim workload correctly. Prepare open zones +# as many as max_open_zones=4. Trim one of the 4 zones. Then write to another +# zone and check the write amount is expected size. 
+test65() { + local off capacity + + [ -n "$is_zbd" ] && reset_zone "$dev" -1 + + off=$((first_sequential_zone_sector * 512)) + capacity=$(total_zone_capacity 1 $off "$dev") + run_fio --zonemode=zbd --direct=1 --zonesize="$zone_size" --thread=1 \ + --filename="$dev" --group_reporting=1 --max_open_zones=4 \ + "$(ioengine "psync")" \ + --name="prep_open_zones" --rw=randwrite --offset="$off" \ + --size="$((zone_size * 4))" --bs=4096 --io_size="$zone_size" \ + --name=trimjob --wait_for="prep_open_zones" --rw=trim \ + --bs="$zone_size" --offset="$off" --size="$zone_size" \ + --name=write --wait_for="trimjob" --rw=write --bs=4096 \ + --offset="$((off + zone_size * 4))" --size="$zone_size" \ + >> "${logfile}.${test_number}" 2>&1 + + check_written $((zone_size + capacity)) +} + +# Test closed zones are handled as open zones. This test case requires a zoned +# block device that has the same max_open_zones and max_active_zones. +test66() { + local i off + + require_zbd || return $SKIP_TESTCASE + require_max_active_zones 2 || return $SKIP_TESTCASE + require_max_open_zones "${max_active_zones}" || return $SKIP_TESTCASE + require_seq_zones $((max_active_zones * 16)) || return $SKIP_TESTCASE + + reset_zone "$dev" -1 + + # Prepare max_active_zones in closed condition. + off=$((first_sequential_zone_sector * 512)) + run_fio --name=w --filename="$dev" --zonemode=zbd --direct=1 \ + --offset=$((off)) --zonesize="${zone_size}" --rw=randwrite \ + --bs=4096 --size="$((zone_size * max_active_zones))" \ + --io_size="${zone_size}" "$(ioengine "psync")" \ + >> "${logfile}.${test_number}" 2>&1 || return $? + for ((i = 0; i < max_active_zones; i++)); do + close_zone "$dev" $((off / 512)) || return $? + off=$((off + zone_size)) + done + + # Run random write to the closed zones and empty zones. This confirms + # that fio handles closed zones as write target open zones. Otherwise, + # fio writes to the empty zones and hits the max_active_zones limit.
+ off=$((first_sequential_zone_sector * 512)) + run_one_fio_job --zonemod=zbd --direct=1 \ + "$(ioengine "psync")" --rw=randwrite --bs=4096 \ + --max_open_zones="$max_active_zones" --offset=$((off)) \ + --size=$((max_active_zones * 16 * zone_size)) \ + --io_size=$((zone_size)) --zonesize="${zone_size}" \ + --time_based --runtime=5s \ + >> "${logfile}.${test_number}" 2>&1 +} + +# Test max_active_zones limit failure is reported with good error message. +test67() { + local i off + + require_zbd || return $SKIP_TESTCASE + require_max_active_zones 2 || return $SKIP_TESTCASE + require_max_open_zones "${max_active_zones}" || return $SKIP_TESTCASE + require_seq_zones $((max_active_zones + 1)) || return $SKIP_TESTCASE + + reset_zone "$dev" -1 + + # Prepare max_active_zones in open condition. + off=$((first_sequential_zone_sector * 512)) + run_fio --name=w --filename="$dev" --zonemod=zbd --direct=1 \ + --offset=$((off)) --zonesize="${zone_size}" --rw=randwrite \ + --bs=4096 --size="$((zone_size * max_active_zones))" \ + --io_size="${zone_size}" "$(ioengine "psync")" \ + >> "${logfile}.${test_number}" 2>&1 || return $? + + # Write to antoher zone and trigger max_active_zones limit error. + off=$((off + zone_size * max_active_zones)) + run_one_fio_job --zonemod=zbd --direct=1 "$(ioengine "psync")" \ + --rw=write --bs=$min_seq_write_size --offset=$((off)) \ + --size=$((zone_size)) --zonesize="${zone_size}" \ + >> "${logfile}.${test_number}" 2>&1 && return $? 
+ grep -q 'Exceeded max_active_zones limit' "${logfile}.${test_number}" +} + +# Test rw=randrw and rwmixwrite=0 options do not issue write I/O unit +test68() { + local off size + + require_zbd || return "$SKIP_TESTCASE" + + reset_zone "${dev}" -1 + + # Write some data as preparation + off=$((first_sequential_zone_sector * 512)) + size=$min_seq_write_size + run_one_fio_job "$(ioengine "psync")" --rw=write --offset="$off" \ + --io_size="$size" --zonemode=strided \ + --zonesize="$zone_size" --zonerange="$zone_size" \ + >> "${logfile}.${test_number}" 2>&1 || return $? + # Run random mixed read and write specifying zero write ratio + run_fio_on_seq "$(ioengine "psync")" --rw=randrw --rwmixwrite=0 \ + --time_based --runtime=1s \ + >> "${logfile}.${test_number}" 2>&1 || return $? + # "WRITE:" shall be recorded only once for the preparation + [[ $(grep -c "WRITE:" "${logfile}.${test_number}") == 1 ]] +} + +# Test rw=rw and verify_backlog=1 options do not cause verify failure +test69() { + require_zbd || return "$SKIP_TESTCASE" + + prep_write + run_fio --name=job --filename="$dev" --time_based --runtime=15s \ + --rw=rw --offset=$((first_sequential_zone_sector * 512)) \ + "$(ioengine "libaio")" --iodepth=32 --randrepeat=0 \ + --verify=crc32 --verify_backlog=1 --zonemode=zbd --direct=1 \ + >> "${logfile}.${test_number}" 2>&1 || return $? +} + +# Test max_open_zones and job_max_open_zones do not error out for non-write jobs +test70() { + require_zbd || return "$SKIP_TESTCASE" + + reset_zone "${dev}" -1 + + # Write data to two zones and make them open + run_fio_on_seq "$(ioengine "psync")" --io_size="$min_seq_write_size" \ + --rw=write --offset_increment=1z --numjobs=2 \ + --group_reporting=1 >> "${logfile}.${test_number}" 2>&1 + + # Confirm max_open_zones=1 for read workload does not fail + run_fio_on_seq "$(ioengine "psync")" --io_size="$min_seq_write_size" \ + --rw=read --max_open_zones=1 \ + >> "${logfile}.${test_number}" 2>&1 || return $?
+ + # Confirm job_max_open_zones=1 for read workload does not fail + run_fio_on_seq "$(ioengine "psync")" --io_size="$min_seq_write_size" \ + --rw=read --job_max_open_zones=1 \ + >> "${logfile}.${test_number}" 2>&1 + grep -q 'valid only for write jobs' \ + "${logfile}.${test_number}" || return $? + + # Confirm max_open_zones=1 for trim workload does not fail + run_fio_on_seq "$(ioengine "psync")" --rw=trim --io_size=1z \ + --bs="$zone_size" --max_open_zones=1 \ + >> "${logfile}.${test_number}" 2>&1 +} + +# Test random write does not end early when the zones as many as max_open_zones +# have remainder smaller than block size. +test71() { + local off size capacity zone_fill_size i + + require_zbd || return "$SKIP_TESTCASE" + require_seq_zones 8 || return "$SKIP_TESTCASE" + require_no_max_active_zones || return "$SKIP_TESTCASE" + + reset_zone "${dev}" -1 + + # Fill data to every other zone in the test target 8 zones. This leaves + # 4 zones in the implicit open condition. Leave 12kb remainder in the + # 4 zones. + off=$((first_sequential_zone_sector * 512)) + size=$min_seq_write_size + capacity=$(total_zone_capacity 1 "$off" "$dev") + zone_fill_size=$((capacity - 3 * 4096)) + run_one_fio_job "$(ioengine "psync")" --rw=write --offset="$off" \ + --bs=4k --zonemode=strided \ + --zonesize="$zone_fill_size" \ + --zonerange=$((zone_size * 2)) \ + --io_size=$((zone_fill_size * 4)) \ + >> "${logfile}.${test_number}" 2>&1 || return $? + # Close the 4 zones to not fail the next fio command with the + # --max_open_zones=1 option + for ((i = 0; i < 4; i++)); do + close_zone "$dev" $(((off + zone_size * 2 * i) / 512)) || return $? + done + + # Run random write with 8kb block size + run_one_fio_job "$(ioengine "psync")" --rw=randwrite --offset="$off" \ + --bs=$((4096 * 2)) --zonemode=zbd \ + --zonesize="$zone_size" --size=$((zone_size * 8)) \ + --max_open_zones=1 --debug=zbd \ + >> "${logfile}.${test_number}" 2>&1 || return $? + + check_written $((zone_size * 8)) || return $? 
+} + +set_nullb_badblocks() { + local syspath + + syspath=/sys/kernel/config/nullb/"${dev##*/}" + if [[ -w $syspath/badblocks ]]; then + echo "$1" > "$syspath"/badblocks + fi + + return 0 +} + +# The helper function to set up badblocks or error command and echo back +# number of expected failures. If the device is null_blk, set the errors +# at the sectors based on the 1st argument (offset) and 2nd argument (gap). +# If the device is scsi_debug, set the first write commands to fail. +set_badblocks() { + local off=$(($1 / 512)) + local gap=$(($2 / 512)) + local syspath block scsi_dev + + # null_blk + syspath=/sys/kernel/config/nullb/"${dev##*/}" + if [[ -d ${syspath} ]]; then + block=$((off + 2)) + set_nullb_badblocks "+${block}-${block}" + block=$((off + gap + 11)) + set_nullb_badblocks "+${block}-${block}" + block=$((off + gap*2 + 8)) + set_nullb_badblocks "+${block}-${block}" + + echo 3 + return + fi + + # scsi_debug + scsi_dev=$(get_scsi_device_path "$dev") + syspath=/sys/kernel/debug/scsi_debug/"${scsi_dev##*/}"/ + echo 2 -1 0x8a 0x00 0x00 0x02 0x03 0x11 0x02 > "$syspath"/error + + echo 1 +} + +# Single job sequential sync write to sequential zones, with continue_on_error +test72() { + local size off capacity bs expected_errors + + require_zbd || return "$SKIP_TESTCASE" + require_badblock || return "$SKIP_TESTCASE" + + prep_write + off=$((first_sequential_zone_sector * 512)) + bs=$(min "$(max $((zone_size / 64)) "$min_seq_write_size")" "$zone_cap_bs") + expected_errors=$(set_badblocks "$off" "$zone_size") + size=$((4 * zone_size)) + capacity=$((size - bs * expected_errors)) + run_fio_on_seq "$(ioengine "psync")" --rw=write --offset="$off" \ + --size="$size" --bs="$bs" --do_verify=1 --verify=md5 \ + --continue_on_error=1 --recover_zbd_write_error=1 \ + --ignore_error=,EIO:61 --debug=zbd \ + >>"${logfile}.${test_number}" 2>&1 || return $? + check_written "$capacity" || return $?
+ grep -qe "Write pointer move succeeded" "${logfile}.${test_number}" +} + +# Multi job sequential async write to sequential zones, with continue_on_error +test73() { + local size off capacity bs + + require_zbd || return "$SKIP_TESTCASE" + require_badblock || return "$SKIP_TESTCASE" + + prep_write + off=$((first_sequential_zone_sector * 512)) + bs=$(min "$(max $((zone_size / 64)) "$min_seq_write_size")" "$zone_cap_bs") + set_badblocks "$off" "$zone_size" > /dev/null + capacity=$(total_zone_capacity 4 "$off" "$dev") + size=$((zone_size * 4)) + run_fio --name=w --filename="${dev}" --rw=write "$(ioengine "libaio")" \ + --iodepth=32 --numjob=8 --group_reporting=1 --offset="$off" \ + --size="$size" --bs="$bs" --zonemode=zbd --direct=1 \ + --zonesize="$zone_size" --continue_on_error=1 \ + --recover_zbd_write_error=1 --debug=zbd \ + >>"${logfile}.${test_number}" 2>&1 || return $? + grep -qe "Write pointer move succeeded" \ + "${logfile}.${test_number}" +} + +# Single job sequential sync write to sequential zones, with continue_on_error, +# with failures in the recovery writes. +test74() { + local size off bs + + require_zbd || return "$SKIP_TESTCASE" + require_nullb || return "$SKIP_TESTCASE" + require_badblock || return "$SKIP_TESTCASE" + + prep_write + off=$((first_sequential_zone_sector * 512)) + bs=$(min "$(max $((zone_size / 64)) "$min_seq_write_size")" "$zone_cap_bs") + set_badblocks "$off" "$((bs / 2))" > /dev/null + size=$((4 * zone_size)) + run_fio_on_seq "$(ioengine "psync")" --rw=write --offset="$off" \ + --size="$size" --bs="$bs" --continue_on_error=1 \ + --recover_zbd_write_error=1 --ignore_error=,EIO:61 \ + >>"${logfile}.${test_number}" 2>&1 || return $? + grep -qe "Failed to recover write pointer" "${logfile}.${test_number}" +} + +# Multi job sequential async write to sequential zones, with continue_on_error +# with failures in the recovery writes. 
+test75() { + local size off bs + + require_zbd || return "$SKIP_TESTCASE" + require_nullb || return "$SKIP_TESTCASE" + require_badblock || return "$SKIP_TESTCASE" + + prep_write + off=$((first_sequential_zone_sector * 512)) + bs=$(min "$(max $((zone_size / 64)) "$min_seq_write_size")" "$zone_cap_bs") + set_badblocks "$off" $((bs / 2)) > /dev/null + size=$((zone_size * 4)) + run_fio --name=w --filename="${dev}" --rw=write "$(ioengine "libaio")" \ + --iodepth=32 --numjob=8 --group_reporting=1 --offset="$off" \ + --size="$size" --bs="$bs" --zonemode=zbd --direct=1 \ + --zonesize="$zone_size" --continue_on_error=1 \ + --recover_zbd_write_error=1 --debug=zbd \ + >>"${logfile}.${test_number}" 2>&1 || return $? + grep -qe "Failed to recover write pointer" "${logfile}.${test_number}" +} + SECONDS=0 tests=() dynamic_analyzer=() @@ -1254,6 +1879,8 @@ use_libzbc= zbd_debug= max_open_zones_opt= quit_on_err= +force_io_uring= +start_test=1 while [ "${1#-}" != "$1" ]; do case "$1" in @@ -1267,10 +1894,12 @@ while [ "${1#-}" != "$1" ]; do -w) reset_before_write=1; shift;; -t) tests+=("$2"); shift; shift;; -o) max_open_zones_opt="${2}"; shift; shift;; + -s) start_test=$2; shift; shift;; -v) dynamic_analyzer=(valgrind "--read-var-info=yes"); shift;; -q) quit_on_err=1; shift;; -z) zbd_debug=1; shift;; + -u) force_io_uring=1; shift;; --) shift; break;; *) usage; exit 1;; esac @@ -1281,6 +1910,11 @@ if [ $# != 1 ]; then exit 1 fi +if [ -n "$use_libzbc" -a -n "$force_io_uring" ]; then + echo "Please specify only one of -l and -u options" + exit 1 +fi + # shellcheck source=functions source "$(dirname "$0")/functions" || exit $? 
@@ -1304,7 +1938,7 @@ if [[ -b "$realdev" ]]; then realsysfs=$(readlink "/sys/dev/block/$major:$minor") basename=$(basename "${realsysfs%/*}") fi - logical_block_size=$(<"/sys/block/$basename/queue/logical_block_size") + min_seq_write_size=$(min_seq_write_size "$basename") case "$(<"/sys/class/block/$basename/queue/zoned")" in host-managed|host-aware) is_zbd=true @@ -1318,10 +1952,12 @@ if [[ -b "$realdev" ]]; then first_sequential_zone_sector=${result[0]} sectors_per_zone=${result[1]} zone_size=$((sectors_per_zone * 512)) + unrestricted_reads=$(urswrz "$dev") if ! max_open_zones=$(max_open_zones "$dev"); then echo "Failed to determine maximum number of open zones" exit 1 fi + max_active_zones=$(max_active_zones "$dev") set_io_scheduler "$basename" deadline || exit $? if [ -n "$reset_all_zones" ]; then reset_zone "$dev" -1 @@ -1329,13 +1965,16 @@ if [[ -b "$realdev" ]]; then ;; *) first_sequential_zone_sector=$(((disk_size / 2) & - (logical_block_size - 1))) - zone_size=$(max 65536 "$logical_block_size") + (min_seq_write_size - 1))) + zone_size=$(max 65536 "$min_seq_write_size") sectors_per_zone=$((zone_size / 512)) max_open_zones=128 + max_active_zones=0 + unrestricted_reads=1 set_io_scheduler "$basename" none || exit $? ;; esac + elif [[ -c "$realdev" ]]; then # For an SG node, we must have libzbc option specified if [[ ! -n "$use_libzbc" ]]; then @@ -1353,8 +1992,8 @@ elif [[ -c "$realdev" ]]; then echo "Failed to determine disk size" exit 1 fi - if ! logical_block_size=($(zbc_logical_block_size "$dev")); then - echo "Failed to determine logical block size" + if ! min_seq_write_size=($(zbc_physical_block_size "$dev")); then + echo "Failed to determine physical block size" exit 1 fi if ! result=($(first_sequential_zone "$dev")); then @@ -1364,10 +2003,12 @@ elif [[ -c "$realdev" ]]; then first_sequential_zone_sector=${result[0]} sectors_per_zone=${result[1]} zone_size=$((sectors_per_zone * 512)) + unrestricted_reads=$(urswrz "$dev") if ! 
max_open_zones=$(max_open_zones "$dev"); then echo "Failed to determine maximum number of open zones" exit 1 fi + max_active_zones=0 if [ -n "$reset_all_zones" ]; then reset_zone "$dev" -1 fi @@ -1412,6 +2053,7 @@ trap 'intr=1' SIGINT ret=0 for test_number in "${tests[@]}"; do + [ "${test_number}" -lt "${start_test}" ] && continue rm -f "${logfile}.${test_number}" unset SKIP_REASON echo -n "Running test $(printf "%02d" $test_number) ... " diff --git a/thread_options.h b/thread_options.h index 8f4c8a5996..3e66d47709 100644 --- a/thread_options.h +++ b/thread_options.h @@ -50,6 +50,12 @@ struct split { unsigned long long val2[ZONESPLIT_MAX]; }; +struct split_prio { + uint64_t bs; + int32_t prio; + uint32_t perc; +}; + struct bssplit { uint64_t bs; uint32_t perc; @@ -121,6 +127,7 @@ struct thread_options { unsigned int nr_files; unsigned int open_files; + unsigned int filetype; enum file_lock_mode file_lock_mode; unsigned int odirect; @@ -138,8 +145,9 @@ struct thread_options { unsigned int do_verify; unsigned int verify_interval; unsigned int verify_offset; - char verify_pattern[MAX_PATTERN_SIZE]; + char *verify_pattern; unsigned int verify_pattern_bytes; + unsigned int verify_pattern_interval; struct pattern_fmt verify_fmt[8]; unsigned int verify_fmt_sz; unsigned int verify_fatal; @@ -150,13 +158,14 @@ struct thread_options { unsigned int experimental_verify; unsigned int verify_state; unsigned int verify_state_save; + unsigned int verify_write_sequence; + unsigned int verify_header_seed; unsigned int use_thread; unsigned int unlink; unsigned int unlink_each_loop; unsigned int do_disk_util; unsigned int override_sync; unsigned int rand_repeatable; - unsigned int allrand_repeatable; unsigned long long rand_seed; unsigned int log_avg_msec; unsigned int log_hist_msec; @@ -165,9 +174,14 @@ struct thread_options { unsigned int log_offset; unsigned int log_gz; unsigned int log_gz_store; - unsigned int log_unix_epoch; + unsigned int log_alternate_epoch; + unsigned int 
log_alternate_epoch_clock_id; unsigned int norandommap; unsigned int softrandommap; + unsigned int sprandom; + unsigned int spr_num_regions; + unsigned long long spr_cache_size; + fio_fp64_t spr_over_provisioning; unsigned int bs_unaligned; unsigned int fsync_on_close; unsigned int bs_is_seq_rand; @@ -199,10 +213,12 @@ struct thread_options { unsigned long long start_delay_high; unsigned long long timeout; unsigned long long ramp_time; + unsigned long long ramp_size; unsigned int ss_state; fio_fp64_t ss_limit; unsigned long long ss_dur; unsigned long long ss_ramp_time; + unsigned long long ss_check_interval; unsigned int overwrite; unsigned int bw_avg_time; unsigned int iops_avg_time; @@ -237,9 +253,11 @@ struct thread_options { unsigned int iolog; unsigned int rwmixcycle; unsigned int rwmix[DDIR_RWDIR_CNT]; + char *comm; unsigned int nice; unsigned int ioprio; unsigned int ioprio_class; + unsigned int ioprio_hint; unsigned int file_service_type; unsigned int group_reporting; unsigned int stats; @@ -248,13 +266,14 @@ struct thread_options { unsigned int zero_buffers; unsigned int refill_buffers; unsigned int scramble_buffers; - char buffer_pattern[MAX_PATTERN_SIZE]; + char *buffer_pattern; unsigned int buffer_pattern_bytes; unsigned int compress_percentage; unsigned int compress_chunk; unsigned int dedupe_percentage; unsigned int dedupe_mode; unsigned int dedupe_working_set_percentage; + unsigned int dedupe_global; unsigned int time_based; unsigned int disable_lat; unsigned int disable_clat; @@ -263,6 +282,7 @@ struct thread_options { unsigned int unified_rw_rep; unsigned int gtod_reduce; unsigned int gtod_cpu; + unsigned int job_start_clock_id; enum fio_cs clocksource; unsigned int no_stall; unsigned int trim_percentage; @@ -299,6 +319,8 @@ struct thread_options { char *exec_prerun; char *exec_postrun; + unsigned int thinkcycles; + unsigned int thinktime; unsigned int thinktime_spin; unsigned int thinktime_blocks; @@ -341,12 +363,14 @@ struct thread_options { 
unsigned long long offset_increment; unsigned long long number_ios; + unsigned int num_range; + unsigned int sync_file_range; unsigned long long latency_target; unsigned long long latency_window; - fio_fp64_t latency_percentile; uint32_t latency_run; + fio_fp64_t latency_percentile; /* * flow support @@ -374,11 +398,20 @@ struct thread_options { int max_open_zones; unsigned int job_max_open_zones; unsigned int ignore_zone_limits; + unsigned int recover_zbd_write_error; fio_fp64_t zrt; fio_fp64_t zrf; + unsigned int fdp; + unsigned int dp_type; + unsigned int dp_id_select; + uint16_t dp_ids[FIO_MAX_DP_IDS]; + unsigned int dp_nr_ids; + char *dp_scheme_file; + unsigned int log_entries; unsigned int log_prio; + unsigned int log_issue_time; }; #define FIO_TOP_STR_MAX 256 @@ -406,7 +439,6 @@ struct thread_options_pack { uint32_t iodepth_batch_complete_min; uint32_t iodepth_batch_complete_max; uint32_t serialize_overlap; - uint32_t pad; uint64_t size; uint64_t io_size; @@ -417,13 +449,11 @@ struct thread_options_pack { uint32_t fill_device; uint32_t file_append; uint32_t unique_filename; - uint32_t pad3; uint64_t file_size_low; uint64_t file_size_high; uint64_t start_offset; uint64_t start_offset_align; uint32_t start_offset_nz; - uint32_t pad4; uint64_t bs[DDIR_RWDIR_CNT]; uint64_t ba[DDIR_RWDIR_CNT]; @@ -438,6 +468,7 @@ struct thread_options_pack { uint32_t nr_files; uint32_t open_files; + uint32_t filetype; uint32_t file_lock_mode; uint32_t odirect; @@ -455,8 +486,8 @@ struct thread_options_pack { uint32_t do_verify; uint32_t verify_interval; uint32_t verify_offset; - uint8_t verify_pattern[MAX_PATTERN_SIZE]; uint32_t verify_pattern_bytes; + uint32_t verify_pattern_interval; uint32_t verify_fatal; uint32_t verify_dump; uint32_t verify_async; @@ -465,14 +496,14 @@ struct thread_options_pack { uint32_t experimental_verify; uint32_t verify_state; uint32_t verify_state_save; + uint32_t verify_write_sequence; + uint32_t verify_header_seed; uint32_t use_thread; uint32_t 
unlink; uint32_t unlink_each_loop; uint32_t do_disk_util; uint32_t override_sync; uint32_t rand_repeatable; - uint32_t allrand_repeatable; - uint32_t pad2; uint64_t rand_seed; uint32_t log_avg_msec; uint32_t log_hist_msec; @@ -481,9 +512,14 @@ struct thread_options_pack { uint32_t log_offset; uint32_t log_gz; uint32_t log_gz_store; - uint32_t log_unix_epoch; + uint32_t log_alternate_epoch; + uint32_t log_alternate_epoch_clock_id; uint32_t norandommap; uint32_t softrandommap; + uint32_t sprandom; + uint32_t spr_num_regions; + uint64_t spr_cache_size; + fio_fp64_t spr_over_provisioning; uint32_t bs_unaligned; uint32_t fsync_on_close; uint32_t bs_is_seq_rand; @@ -514,10 +550,12 @@ struct thread_options_pack { uint64_t start_delay_high; uint64_t timeout; uint64_t ramp_time; + uint64_t ramp_size; uint64_t ss_dur; uint64_t ss_ramp_time; uint32_t ss_state; fio_fp64_t ss_limit; + uint64_t ss_check_interval; uint32_t overwrite; uint32_t bw_avg_time; uint32_t iops_avg_time; @@ -550,9 +588,11 @@ struct thread_options_pack { uint32_t iolog; uint32_t rwmixcycle; uint32_t rwmix[DDIR_RWDIR_CNT]; + uint8_t comm[FIO_TOP_STR_MAX]; uint32_t nice; uint32_t ioprio; uint32_t ioprio_class; + uint32_t ioprio_hint; uint32_t file_service_type; uint32_t group_reporting; uint32_t stats; @@ -561,13 +601,13 @@ struct thread_options_pack { uint32_t zero_buffers; uint32_t refill_buffers; uint32_t scramble_buffers; - uint8_t buffer_pattern[MAX_PATTERN_SIZE]; uint32_t buffer_pattern_bytes; uint32_t compress_percentage; uint32_t compress_chunk; uint32_t dedupe_percentage; uint32_t dedupe_mode; uint32_t dedupe_working_set_percentage; + uint32_t dedupe_global; uint32_t time_based; uint32_t disable_lat; uint32_t disable_clat; @@ -576,6 +616,7 @@ struct thread_options_pack { uint32_t unified_rw_rep; uint32_t gtod_reduce; uint32_t gtod_cpu; + uint32_t job_start_clock_id; uint32_t clocksource; uint32_t no_stall; uint32_t trim_percentage; @@ -586,6 +627,7 @@ struct thread_options_pack { uint32_t 
lat_percentiles; uint32_t slat_percentiles; uint32_t percentile_precision; + uint32_t pad; fio_fp64_t percentile_list[FIO_IO_U_LIST_MAX_LEN]; uint8_t read_iolog_file[FIO_TOP_STR_MAX]; @@ -611,6 +653,8 @@ struct thread_options_pack { uint8_t exec_prerun[FIO_TOP_STR_MAX]; uint8_t exec_postrun[FIO_TOP_STR_MAX]; + uint32_t thinkcycles; + uint32_t thinktime; uint32_t thinktime_spin; uint32_t thinktime_blocks; @@ -656,8 +700,8 @@ struct thread_options_pack { uint64_t latency_target; uint64_t latency_window; uint64_t max_latency[DDIR_RWDIR_CNT]; - fio_fp64_t latency_percentile; uint32_t latency_run; + fio_fp64_t latency_percentile; /* * flow support @@ -683,12 +727,29 @@ struct thread_options_pack { uint32_t zone_mode; int32_t max_open_zones; uint32_t ignore_zone_limits; + uint32_t recover_zbd_write_error; uint32_t log_entries; uint32_t log_prio; + uint32_t log_issue_time; + + uint32_t fdp; + uint32_t dp_type; + uint32_t dp_id_select; + uint16_t dp_ids[FIO_MAX_DP_IDS]; + uint32_t dp_nr_ids; + uint8_t dp_scheme_file[FIO_TOP_STR_MAX]; + + uint32_t num_range; + /* + * verify_pattern followed by buffer_pattern from the unpacked struct + */ + uint8_t patterns[]; } __attribute__((packed)); -extern void convert_thread_options_to_cpu(struct thread_options *o, struct thread_options_pack *top); +extern int convert_thread_options_to_cpu(struct thread_options *o, + struct thread_options_pack *top, size_t top_sz); +extern size_t thread_options_pack_size(struct thread_options *o); extern void convert_thread_options_to_net(struct thread_options_pack *top, struct thread_options *); extern int fio_test_cconv(struct thread_options *); extern void options_default_fill(struct thread_options *o); @@ -702,4 +763,8 @@ extern int str_split_parse(struct thread_data *td, char *str, extern int split_parse_ddir(struct thread_options *o, struct split *split, char *str, bool absolute, unsigned int max_splits); +extern int split_parse_prio_ddir(struct thread_options *o, + struct split_prio **entries, 
int *nr_entries, + char *str); + #endif diff --git a/time.c b/time.c index cd0e2a8914..386c76fc31 100644 --- a/time.c +++ b/time.c @@ -6,6 +6,12 @@ static struct timespec genesis; static unsigned long ns_granularity; +enum ramp_period_states { + RAMP_RUNNING, + RAMP_FINISHING, + RAMP_DONE +}; + void timespec_add_msec(struct timespec *ts, unsigned int msec) { uint64_t adj_nsec = 1000000ULL * msec; @@ -38,6 +44,17 @@ uint64_t usec_spin(unsigned int usec) return t; } +/* + * busy loop for a fixed amount of cycles + */ +void cycles_spin(unsigned int n) +{ + unsigned long i; + + for (i=0; i < n; i++) + nop; +} + uint64_t usec_sleep(struct thread_data *td, unsigned long usec) { struct timespec req; @@ -99,47 +116,130 @@ uint64_t utime_since_genesis(void) return utime_since_now(&genesis); } -bool in_ramp_time(struct thread_data *td) +bool in_ramp_period(struct thread_data *td) { - return td->o.ramp_time && !td->ramp_time_over; + return td->ramp_period_state != RAMP_DONE; +} + +bool ramp_period_enabled = false; + +int ramp_period_check(void) +{ + uint64_t group_bytes = 0; + int prev_groupid = -1; + bool group_ramp_period_over = false; + + for_each_td(td) { + if (td->ramp_period_state != RAMP_RUNNING) + continue; + + if (td->o.ramp_time && + utime_since_now(&td->epoch) >= td->o.ramp_time) { + td->ramp_period_state = RAMP_FINISHING; + continue; + } + + if (td->o.ramp_size) { + int ddir; + const bool needs_lock = td_async_processing(td); + + if (!td->o.group_reporting || + (td->o.group_reporting && + td->groupid != prev_groupid)) { + group_bytes = 0; + prev_groupid = td->groupid; + group_ramp_period_over = false; + } + + if (needs_lock) + __td_io_u_lock(td); + + for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) + group_bytes += td->io_bytes[ddir]; + + if (needs_lock) + __td_io_u_unlock(td); + + if (group_bytes >= td->o.ramp_size) { + td->ramp_period_state = RAMP_FINISHING; + /* + * Mark ramp up for all threads in the group as + * done. 
+ */ + if (td->o.group_reporting && + !group_ramp_period_over) { + group_ramp_period_over = true; + for_each_td(td2) { + if (td2->groupid == td->groupid) + td2->ramp_period_state = RAMP_FINISHING; + } end_for_each(); + } + } + } + } end_for_each(); + + return 0; } static bool parent_update_ramp(struct thread_data *td) { struct thread_data *parent = td->parent; - if (!parent || parent->ramp_time_over) + if (!parent || parent->ramp_period_state == RAMP_DONE) return false; reset_all_stats(parent); - parent->ramp_time_over = true; + parent->ramp_period_state = RAMP_DONE; td_set_runstate(parent, TD_RAMP); return true; } -bool ramp_time_over(struct thread_data *td) + +bool ramp_period_over(struct thread_data *td) { - if (!td->o.ramp_time || td->ramp_time_over) + if (td->ramp_period_state == RAMP_DONE) return true; - if (utime_since_now(&td->epoch) >= td->o.ramp_time) { - td->ramp_time_over = true; - reset_all_stats(td); - reset_io_stats(td); - td_set_runstate(td, TD_RAMP); + if (td->ramp_period_state == RAMP_RUNNING) + return false; - /* - * If we have a parent, the parent isn't doing IO. Hence - * the parent never enters do_io(), which will switch us - * from RAMP -> RUNNING. Do this manually here. - */ - if (parent_update_ramp(td)) - td_set_runstate(td, TD_RUNNING); + td->ramp_period_state = RAMP_DONE; + reset_all_stats(td); + reset_io_stats(td); + td_set_runstate(td, TD_RAMP); - return true; - } + /* + * If we have a parent, the parent isn't doing IO. Hence + * the parent never enters do_io(), which will switch us + * from RAMP -> RUNNING. Do this manually here. 
+ */ + if (parent_update_ramp(td)) + td_set_runstate(td, TD_RUNNING); - return false; + return true; +} + +int td_ramp_period_init(struct thread_data *td) +{ + if (td->o.ramp_time || td->o.ramp_size) { + if (td->o.ramp_time && td->o.ramp_size) { + td_verror(td, EINVAL, "job rejected: cannot specify both ramp_time and ramp_size"); + return 1; + } + /* Make sure options are consistent within reporting group */ + for_each_td(td2) { + if (td->groupid == td2->groupid && + td->o.ramp_size != td2->o.ramp_size) { + td_verror(td, EINVAL, "job rejected: inconsistent ramp_size within reporting group"); + return 1; + } + } end_for_each(); + td->ramp_period_state = RAMP_RUNNING; + ramp_period_enabled = true; + } else { + td->ramp_period_state = RAMP_DONE; + } + return 0; } void fio_time_init(void) @@ -172,14 +272,22 @@ void set_genesis_time(void) fio_gettime(&genesis, NULL); } -void set_epoch_time(struct thread_data *td, int log_unix_epoch) +void set_epoch_time(struct thread_data *td, clockid_t log_alternate_epoch_clock_id, clockid_t job_start_clock_id) { + struct timespec ts; fio_gettime(&td->epoch, NULL); - if (log_unix_epoch) { - struct timeval tv; - gettimeofday(&tv, NULL); - td->unix_epoch = (unsigned long long)(tv.tv_sec) * 1000 + - (unsigned long long)(tv.tv_usec) / 1000; + clock_gettime(log_alternate_epoch_clock_id, &ts); + td->alternate_epoch = (unsigned long long)(ts.tv_sec) * 1000 + + (unsigned long long)(ts.tv_nsec) / 1000000; + if (job_start_clock_id == log_alternate_epoch_clock_id) + { + td->job_start = td->alternate_epoch; + } + else + { + clock_gettime(job_start_clock_id, &ts); + td->job_start = (unsigned long long)(ts.tv_sec) * 1000 + + (unsigned long long)(ts.tv_nsec) / 1000000; } } diff --git a/tools/fio_generate_plots b/tools/fio_generate_plots index e455878815..468cf27a6c 100755 --- a/tools/fio_generate_plots +++ b/tools/fio_generate_plots @@ -21,7 +21,7 @@ if [ -z "$1" ]; then exit 1 fi -GNUPLOT=$(which gnuplot) +GNUPLOT=$(command -v gnuplot) if [ ! 
-x "$GNUPLOT" ] then echo You need gnuplot installed to generate graphs diff --git a/tools/fio_jsonplus_clat2csv b/tools/fio_jsonplus_clat2csv index 7f310fcc47..8fdd014d95 100755 --- a/tools/fio_jsonplus_clat2csv +++ b/tools/fio_jsonplus_clat2csv @@ -135,7 +135,7 @@ def more_bins(indices, bins): Returns: True if the indices do not yet point to the end of each bin in bins. - False if the indices point beyond their repsective bins. + False if the indices point beyond their respective bins. """ for key, value in six.iteritems(indices): @@ -160,7 +160,7 @@ def debug_print(debug, *args): def get_csvfile(dest, jobnum): """Generate CSV filename from command-line arguments and job numbers. - Paramaters: + Parameters: dest file specification for CSV filename. jobnum job number. diff --git a/tools/fiograph/fiograph.conf b/tools/fiograph/fiograph.conf index cfd2fd8eb3..74f9752d43 100644 --- a/tools/fiograph/fiograph.conf +++ b/tools/fiograph/fiograph.conf @@ -45,16 +45,22 @@ specific_options=stat_type specific_options=volume brick [ioengine_http] -specific_options=https http_host http_user http_pass http_s3_key http_s3_keyid http_swift_auth_token http_s3_region http_mode http_verbose +specific_options=https http_host http_user http_pass http_s3_key http_s3_keyid http_swift_auth_token http_s3_region http_mode http_verbose http_s3_storage_class http_s3_sse_customer_key http_s3_sse_customer_algorithm [ioengine_ime_aio] specific_options=ime_psync ime_psyncv [ioengine_io_uring] -specific_options=hipri cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit fixedbufs registerfiles sqthread_poll sqthread_poll_cpu nonvectored uncached nowait force_async +specific_options=hipri cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit fixedbufs registerfiles sqthread_poll sqthread_poll_cpu nonvectored nowait force_async atomic uncached + +[ioengine_io_uring_cmd] +specific_options=hipri cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit fixedbufs registerfiles sqthread_poll 
sqthread_poll_cpu nonvectored nowait force_async cmd_type md_per_io_size pi_act pi_chk apptag apptag_mask [ioengine_libaio] -specific_options=userspace_reap cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit nowait +specific_options=userspace_reap cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit nowait atomic + +[ioengine_libblkio] +specific_options=libblkio_driver libblkio_path libblkio_pre_connect_props libblkio_num_entries libblkio_queue_size libblkio_pre_start_props hipri libblkio_vectored libblkio_write_zeroes_on_trim libblkio_wait_mode libblkio_force_enable_completion_eventfd [ioengine_libcufile] specific_options=gpu_dev_ids cuda_io @@ -65,15 +71,6 @@ specific_options=namenode hostname port hdfsdirectory chunk_size single_ins [ioengine_libiscsi] specific_options=initiator -[ioengine_librpma_apm_server] -specific_options=librpma_apm_client - -[ioengine_busy_wait_polling] -specific_options=serverip port direct_write_to_pmem - -[ioengine_librpma_gpspm_server] -specific_options=librpma_gpspm_client - [ioengine_mmap] specific_options=thp @@ -99,7 +96,10 @@ specific_options=clustername rbdname pool clientname busy_poll specific_options=hostname bindname port verb [ioengine_sg] -specific_options=hipri readfua writefua sg_write_mode sg +specific_options=hipri readfua writefua sg_write_mode stream_id [ioengine_pvsync2] -specific_options=hipri hipri_percentage uncached nowait sync psync vsync pvsync +specific_options=hipri hipri_percentage nowait sync psync vsync pvsync atomic uncached + +[ioengine_xnvme] +specific_options=hipri sqthread_poll xnvme_be xnvme_async xnvme_sync xnvme_admin xnvme_dev_nsid xnvme_iovec diff --git a/tools/fiograph/fiograph.py b/tools/fiograph/fiograph.py index b5669a2dab..cfb9b04187 100755 --- a/tools/fiograph/fiograph.py +++ b/tools/fiograph/fiograph.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 +import uuid +import time +import errno from graphviz import Digraph import argparse import configparser @@ -218,7 +221,7 @@ def 
fio_to_graphviz(filename, format): # The first job will be a new execution group new_execution_group = True - # Let's interate on all sections to create links between them + # Let's iterate on all sections to create links between them for section_name in fio_file.sections(): # The current section section = fio_file[section_name] @@ -274,7 +277,7 @@ def setup_commandline(): parser.add_argument('--format', action='store', type=str, default='png', - help='the output format') + help='the output format (see https://graphviz.org/docs/outputs/)') parser.add_argument('--view', action='store_true', default=False, help='view the graph') @@ -283,7 +286,6 @@ def setup_commandline(): help='keep the graphviz script file') parser.add_argument('--config', action='store', type=str, - default='fiograph.conf', help='the configuration filename') args = parser.parse_args() return args @@ -292,16 +294,38 @@ def setup_commandline(): def main(): global config_file args = setup_commandline() - if args.output is None: - output_file = args.file - output_file = output_file.replace('.fio', '') + + if args.config is None: + if os.path.exists('fiograph.conf'): + config_filename = 'fiograph.conf' + else: + config_filename = os.path.join(os.path.dirname(__file__), 'fiograph.conf') + if not os.path.exists(config_filename): + raise FileNotFoundError("Cannot locate configuration file") else: - output_file = args.output + config_filename = args.config config_file = configparser.RawConfigParser(allow_no_value=True) - config_file.read(args.config) - fio_to_graphviz(args.file, args.format).render(output_file, view=args.view) + config_file.read(config_filename) + + temp_filename = uuid.uuid4().hex + image_filename = fio_to_graphviz(args.file, args.format).render(temp_filename, view=args.view) + + output_filename_stub = args.file + if args.output: + output_filename = args.output + else: + if output_filename_stub.endswith('.fio'): + output_filename_stub = output_filename_stub[:-4] + output_filename = 
image_filename.replace(temp_filename, output_filename_stub) + if args.view: + time.sleep(1) + # allow time for the file to be opened before renaming it + os.rename(image_filename, output_filename) + if not args.keep: - os.remove(output_file) + os.remove(temp_filename) + else: + os.rename(temp_filename, output_filename_stub + '.gv') main() diff --git a/tools/fiologparser.py b/tools/fiologparser.py index 054f1f6078..708c5d4920 100755 --- a/tools/fiologparser.py +++ b/tools/fiologparser.py @@ -166,7 +166,7 @@ def read_data(self, fn): f = open(fn, 'r') p_time = 0 for line in f: - (time, value, foo, bar) = line.rstrip('\r\n').rsplit(', ') + (time, value) = line.rstrip('\r\n').rsplit(', ')[:2] self.add_sample(p_time, int(time), int(value)) p_time = int(time) diff --git a/tools/genfio b/tools/genfio index 8518bbccf3..c9bc2f764d 100755 --- a/tools/genfio +++ b/tools/genfio @@ -22,7 +22,8 @@ BLK_SIZE= BLOCK_SIZE=4k SEQ=-1 -TEMPLATE=/tmp/template.fio +TEMPLATE=$(mktemp "${TMPDIR:-${TEMP:-/tmp}}/template.fio.XXXXXX") || exit $? 
+trap 'rm -f "$TEMPLATE"' EXIT OUTFILE= DISKS= PRINTABLE_DISKS= @@ -48,7 +49,7 @@ show_help() { one test after another then one disk after another Disabled by default -p : Run parallel test - one test after anoter but all disks at the same time + one test after another but all disks at the same time Enabled by default -D iodepth : Run with the specified iodepth Default is $IODEPTH diff --git a/tools/hist/fio-histo-log-pctiles.py b/tools/hist/fio-histo-log-pctiles.py index 08e7722d04..b5d167de22 100755 --- a/tools/hist/fio-histo-log-pctiles.py +++ b/tools/hist/fio-histo-log-pctiles.py @@ -748,7 +748,7 @@ def test_e1_get_pctiles_flat_histo(self): def test_e2_get_pctiles_highest_pct(self): fio_v3_bucket_count = 29 * 64 with open(self.fn, 'w') as f: - # make a empty fio v3 histogram + # make an empty fio v3 histogram buckets = [ 0 for j in range(0, fio_v3_bucket_count) ] # add one I/O request to last bucket buckets[-1] = 1 diff --git a/tools/plot/fio2gnuplot b/tools/plot/fio2gnuplot index d2dc81df9b..ce3ca2cc9f 100755 --- a/tools/plot/fio2gnuplot +++ b/tools/plot/fio2gnuplot @@ -492,8 +492,8 @@ def main(argv): #We need to adjust the output filename regarding the pattern required by the user if (pattern_set_by_user == True): gnuplot_output_filename=pattern - # As we do have some glob in the pattern, let's make this simpliest - # We do remove the simpliest parts of the expression to get a clear file name + # As we do have some glob in the pattern, let's make this simplest + # We do remove the simplest parts of the expression to get a clear file name gnuplot_output_filename=gnuplot_output_filename.replace('-*-','-') gnuplot_output_filename=gnuplot_output_filename.replace('*','-') gnuplot_output_filename=gnuplot_output_filename.replace('--','-') diff --git a/tools/plot/fio2gnuplot.1 b/tools/plot/fio2gnuplot.1 index 6fb1283f50..bfa10d26ef 100644 --- a/tools/plot/fio2gnuplot.1 +++ b/tools/plot/fio2gnuplot.1 @@ -35,7 +35,7 @@ The resulting graph helps at understanding trends. 
.TP .B Grouped 2D graph -All files are plotted in a single image to ease the comparaison. The same rendering options as per the individual 2D graph are used : +All files are plotted in a single image to ease the comparison. The same rendering options as per the individual 2D graph are used : .RS .IP \(bu 3 raw diff --git a/tools/plot/fio2gnuplot.manpage b/tools/plot/fio2gnuplot.manpage index 6a12cf8196..be3f13c202 100644 --- a/tools/plot/fio2gnuplot.manpage +++ b/tools/plot/fio2gnuplot.manpage @@ -20,7 +20,7 @@ DESCRIPTION The resulting graph helps at understanding trends. Grouped 2D graph - All files are plotted in a single image to ease the comparaison. The same rendering options as per the individual 2D graph are used : + All files are plotted in a single image to ease the comparison. The same rendering options as per the individual 2D graph are used : - raw - smooth - trend diff --git a/unittests/lib/num2str.c b/unittests/lib/num2str.c index 8f12cf83fe..49e803467a 100644 --- a/unittests/lib/num2str.c +++ b/unittests/lib/num2str.c @@ -37,11 +37,46 @@ static void test_num2str(void) } } +struct bytes2str_testcase { + uint64_t bytes; + const char *expected; +}; + +static const struct bytes2str_testcase bytes2str_testcases[] = { + { 0, "0.00 B" }, + { 512, "512.00 B" }, + { 1024, "1.00 KiB" }, + { 1536, "1.50 KiB" }, + { 1048576, "1.00 MiB" }, + { 1073741824ULL, "1.00 GiB" }, + { 1099511627776ULL, "1.00 TiB" }, + { 1125899906842624ULL, "1.00 PiB" }, + { 1152921504606846976ULL, "1.00 EiB" }, +}; + +static void test_bytes2str_simple(void) +{ + char buf[64]; + int i; + + for (i = 0; i < FIO_ARRAY_SIZE(bytes2str_testcases); ++i) { + const struct bytes2str_testcase *tc = &bytes2str_testcases[i]; + const char *result = bytes2str_simple(buf, sizeof(buf), tc->bytes); + + CU_ASSERT_PTR_EQUAL(result, buf); + CU_ASSERT_STRING_EQUAL(result, tc->expected); + } +} + static struct fio_unittest_entry tests[] = { { .name = "num2str/1", .fn = test_num2str, }, + { + .name = 
"bytes2str_simple/1", + .fn = test_bytes2str_simple, + }, { .name = NULL, }, diff --git a/unittests/lib/pcbuf.c b/unittests/lib/pcbuf.c new file mode 100644 index 0000000000..f6167423cc --- /dev/null +++ b/unittests/lib/pcbuf.c @@ -0,0 +1,116 @@ +/** + * SPDX-License-Identifier: GPL-2.0 only + * + * Copyright (c) 2025 Sandisk Corporation or its affiliates. + */ +#include +#include +#include +#include +#include + +#include "../unittest.h" +#include "pcbuf.h" + +#define TEST_CAPACITY 8 /* Small capacity for wrap-around testing */ + +static void test_pcbuf_basic_ops(void) +{ + struct pc_buf *cb = pcb_alloc(TEST_CAPACITY); + uint64_t i; + + CU_ASSERT_PTR_NOT_NULL(cb); + + CU_ASSERT_TRUE(pcb_is_empty(cb)); + CU_ASSERT_FALSE(pcb_is_full(cb)); + CU_ASSERT_EQUAL(pcb_committed_size(cb), 0); + CU_ASSERT_EQUAL(pcb_staged_size(cb), 0); + CU_ASSERT_TRUE(pcb_space_available(cb)); + + /* Stage data up to capacity-1 (since 1 slot is reserved) */ + for (i = 0; i < TEST_CAPACITY - 1; ++i) { + CU_ASSERT_TRUE(pcb_push_staged(cb, i + 100)); + } + + /* Next push should fail (buffer full) */ + CU_ASSERT_FALSE(pcb_push_staged(cb, 999)); + + CU_ASSERT_EQUAL(pcb_staged_size(cb), TEST_CAPACITY - 1); + CU_ASSERT_EQUAL(pcb_committed_size(cb), 0); + CU_ASSERT_TRUE(pcb_is_empty(cb)); + CU_ASSERT_TRUE(pcb_is_full(cb)); + + /* Commit staged data */ + pcb_commit(cb); + + CU_ASSERT_EQUAL(pcb_committed_size(cb), TEST_CAPACITY - 1); + CU_ASSERT_EQUAL(pcb_staged_size(cb), 0); + CU_ASSERT_FALSE(pcb_is_empty(cb)); + + /* Pop all committed data */ + for (i = 0; i < TEST_CAPACITY - 1; ++i) { + uint64_t val; + CU_ASSERT_TRUE(pcb_pop(cb, &val)); + CU_ASSERT_EQUAL(val, i + 100); + } + + /* Buffer should now be empty again */ + CU_ASSERT_TRUE(pcb_is_empty(cb)); + CU_ASSERT_FALSE(pcb_is_full(cb)); + CU_ASSERT_TRUE(pcb_space_available(cb)); + + free(cb); +} + +static void test_pcbuf_wraparound(void) +{ + struct pc_buf *cb = pcb_alloc(TEST_CAPACITY); + uint64_t expected[] = {201, 202, 203, 204, 205, 999}; + 
size_t num_expected = sizeof(expected)/sizeof(expected[0]); + uint64_t val; + uint64_t i; + + CU_ASSERT_PTR_NOT_NULL(cb); + + /* Stage up to near capacity and commit */ + for (i = 0; i < TEST_CAPACITY - 2; ++i) + CU_ASSERT_TRUE(pcb_push_staged(cb, i + 200)); + + pcb_commit(cb); + + /* Pop one item to move read_tail forward */ + CU_ASSERT_TRUE(pcb_pop(cb, &val)); + CU_ASSERT_EQUAL(val, 200); + + /* Now stage one more item to cause wraparound */ + CU_ASSERT_TRUE(pcb_push_staged(cb, 999)); + pcb_commit(cb); + + /* Pop remaining items, ensure correctness */ + for (i = 0; i < num_expected; ++i) { + CU_ASSERT_TRUE(pcb_pop(cb, &val)); + CU_ASSERT_EQUAL(val, expected[i]); + } + + CU_ASSERT_TRUE(pcb_is_empty(cb)); + free(cb); +} + +static struct fio_unittest_entry tests[] = { + { + .name = "pcbuf/basic_ops", + .fn = test_pcbuf_basic_ops, + }, + { + .name = "pcbuf/wraparound", + .fn = test_pcbuf_wraparound, + }, + { + .name = NULL, + }, +}; + +CU_ErrorCode fio_unittest_lib_pcbuf(void) +{ + return fio_unittest_add_suite("pcbuf.h", NULL, NULL, tests); +} diff --git a/unittests/unittest.c b/unittests/unittest.c index f490b4852b..4a034b40ae 100644 --- a/unittests/unittest.c +++ b/unittests/unittest.c @@ -50,6 +50,7 @@ int main(void) fio_unittest_register(fio_unittest_lib_memalign); fio_unittest_register(fio_unittest_lib_num2str); fio_unittest_register(fio_unittest_lib_strntol); + fio_unittest_register(fio_unittest_lib_pcbuf); fio_unittest_register(fio_unittest_oslib_strlcat); fio_unittest_register(fio_unittest_oslib_strndup); fio_unittest_register(fio_unittest_oslib_strcasestr); diff --git a/unittests/unittest.h b/unittests/unittest.h index ecb7d12415..0f45bfbdd6 100644 --- a/unittests/unittest.h +++ b/unittests/unittest.h @@ -17,6 +17,7 @@ CU_ErrorCode fio_unittest_add_suite(const char*, CU_InitializeFunc, CU_ErrorCode fio_unittest_lib_memalign(void); CU_ErrorCode fio_unittest_lib_num2str(void); CU_ErrorCode fio_unittest_lib_strntol(void); +CU_ErrorCode 
fio_unittest_lib_pcbuf(void); CU_ErrorCode fio_unittest_oslib_strlcat(void); CU_ErrorCode fio_unittest_oslib_strndup(void); CU_ErrorCode fio_unittest_oslib_strcasestr(void); diff --git a/verify-state.h b/verify-state.h index 6da1585b24..27eb9e9a06 100644 --- a/verify-state.h +++ b/verify-state.h @@ -22,23 +22,18 @@ struct thread_rand_state { }; }; -/* - * For dumping current write state - */ -struct file_comp { - uint64_t fileno; - uint64_t offset; +/* a single inflight write */ +struct inflight_write { + uint64_t numberio; }; struct thread_io_list { - uint64_t no_comps; - uint32_t depth; - uint32_t nofiles; - uint64_t numberio; + uint32_t depth; /* I/O depth of the job that saves the verify state */ + uint64_t numberio; /* Number of issued writes */ uint64_t index; struct thread_rand_state rand; uint8_t name[64]; - struct file_comp comps[0]; + struct inflight_write inflight[0]; }; struct all_io_list { @@ -46,7 +41,7 @@ struct all_io_list { struct thread_io_list state[0]; }; -#define VSTATE_HDR_VERSION 0x03 +#define VSTATE_HDR_VERSION 0x05 struct verify_state_hdr { uint64_t version; @@ -62,18 +57,18 @@ extern void __verify_save_state(struct all_io_list *, const char *); extern void verify_save_state(int mask); extern int verify_load_state(struct thread_data *, const char *); extern void verify_free_state(struct thread_data *); -extern int verify_state_should_stop(struct thread_data *, struct io_u *); +extern int verify_state_should_stop(struct thread_data *, uint64_t); extern void verify_assign_state(struct thread_data *, void *); extern int verify_state_hdr(struct verify_state_hdr *, struct thread_io_list *); -static inline size_t __thread_io_list_sz(uint32_t depth, uint32_t nofiles) +static inline size_t __thread_io_list_sz(uint32_t depth) { - return sizeof(struct thread_io_list) + depth * nofiles * sizeof(struct file_comp); + return sizeof(struct thread_io_list) + depth * sizeof(struct inflight_write); } static inline size_t thread_io_list_sz(struct 
thread_io_list *s) { - return __thread_io_list_sz(le32_to_cpu(s->depth), le32_to_cpu(s->nofiles)); + return __thread_io_list_sz(le32_to_cpu(s->depth)); } static inline struct thread_io_list *io_list_next(struct thread_io_list *s) @@ -106,4 +101,6 @@ static inline void verify_state_gen_name(char *out, size_t size, out[size - 1] = '\0'; } +#define INVALID_NUMBERIO UINT64_MAX + #endif diff --git a/verify.c b/verify.c index 0e1e463934..e2011a0f22 100644 --- a/verify.c +++ b/verify.c @@ -51,6 +51,8 @@ void fill_verify_pattern(struct thread_data *td, void *p, unsigned int len, struct io_u *io_u, uint64_t seed, int use_seed) { struct thread_options *o = &td->o; + unsigned int interval = o->verify_pattern_interval; + unsigned long long offset = io_u->offset; if (!o->verify_pattern_bytes) { dprint(FD_VERIFY, "fill random bytes len=%u\n", len); @@ -65,18 +67,33 @@ void fill_verify_pattern(struct thread_data *td, void *p, unsigned int len, return; } - /* Skip if we were here and we do not need to patch pattern - * with format */ - if (!td->o.verify_fmt_sz && io_u->buf_filled_len >= len) { + /* Skip if we were here and we do not need to patch pattern with + * format. 
However, we cannot skip if verify_offset is set because we + * have swapped the header with pattern bytes */ + if (!td->o.verify_fmt_sz && io_u->buf_filled_len >= len && !td->o.verify_offset) { dprint(FD_VERIFY, "using already filled verify pattern b=%d len=%u\n", o->verify_pattern_bytes, len); return; } - (void)paste_format(td->o.verify_pattern, td->o.verify_pattern_bytes, - td->o.verify_fmt, td->o.verify_fmt_sz, - p, len, io_u); + if (!interval) + interval = len; + + io_u->offset += (p - io_u->buf) - (p - io_u->buf) % interval; + for (unsigned int bytes_done = 0, bytes_todo = 0; bytes_done < len; + bytes_done += bytes_todo, p += bytes_todo, io_u->offset += interval) { + bytes_todo = (p - io_u->buf) % interval; + if (!bytes_todo) + bytes_todo = interval; + bytes_todo = min(bytes_todo, len - bytes_done); + + (void)paste_format(td->o.verify_pattern, td->o.verify_pattern_bytes, + td->o.verify_fmt, td->o.verify_fmt_sz, + p, bytes_todo, io_u); + } + io_u->buf_filled_len = len; + io_u->offset = offset; } static unsigned int get_hdr_inc(struct thread_data *td, struct io_u *io_u) @@ -338,12 +355,20 @@ static void dump_verify_buffers(struct verify_header *hdr, struct vcont *vc) static void log_verify_failure(struct verify_header *hdr, struct vcont *vc) { unsigned long long offset; + uint32_t len; + struct thread_data *td = vc->td; offset = vc->io_u->verify_offset; - offset += vc->hdr_num * hdr->len; + if (td->o.verify != VERIFY_PATTERN_NO_HDR) { + len = hdr->len; + offset += (unsigned long long) vc->hdr_num * len; + } else { + len = vc->io_u->buflen; + } + log_err("%.8s: verify failed at file %s offset %llu, length %u" " (requested block: offset=%llu, length=%llu, flags=%x)\n", - vc->name, vc->io_u->file->file_name, offset, hdr->len, + vc->name, vc->io_u->file->file_name, offset, len, vc->io_u->verify_offset, vc->io_u->buflen, vc->io_u->flags); if (vc->good_crc && vc->bad_crc) { @@ -364,53 +389,131 @@ static inline void *io_u_verify_off(struct verify_header *hdr, struct 
vcont *vc) return vc->io_u->buf + vc->hdr_num * hdr->len + hdr_size(vc->td, hdr); } +static int check_pattern(char *buf, unsigned int len, unsigned int mod, + unsigned int pattern_size, char *pattern, unsigned int header_size) +{ + unsigned int i; + int rc; + + rc = cmp_pattern(pattern, pattern_size, mod, buf, len); + if (!rc) + goto done; + + /* Slow path, compare each byte */ + for (i = 0; i < len; i++) { + if (buf[i] != pattern[mod]) { + unsigned int bits; + + bits = hweight8(buf[i] ^ pattern[mod]); + log_err("fio: got pattern '%02x', wanted '%02x'. Bad bits %d\n", + (unsigned char)buf[i], + (unsigned char)pattern[mod], + bits); + log_err("fio: bad pattern block offset %u\n", + i + header_size); + rc = EILSEQ; + goto done; + } + mod++; + if (mod == pattern_size) + mod = 0; + } + +done: + return rc; +} + +/* + * The current thread will need its own buffer if there are multiple threads + * and the pattern contains the offset. Fio currently only has one pattern + * format specifier so we only need to check that one, but this may need to be + * changed if fio ever gains more pattern format specifiers. 
+ */ +static inline bool pattern_need_buffer(struct thread_data *td) +{ + return (td->o.verify_async || td->o.use_thread) && + td->o.verify_fmt_sz && + td->o.verify_fmt[0].desc->paste == paste_blockoff; +} + static int verify_io_u_pattern(struct verify_header *hdr, struct vcont *vc) { struct thread_data *td = vc->td; struct io_u *io_u = vc->io_u; char *buf, *pattern; unsigned int header_size = __hdr_size(td->o.verify); - unsigned int len, mod, i, pattern_size; + unsigned int len, mod, pattern_size, pattern_interval_mod, bytes_done = 0, bytes_todo; int rc; + unsigned long long offset = io_u->offset; pattern = td->o.verify_pattern; pattern_size = td->o.verify_pattern_bytes; assert(pattern_size != 0); - (void)paste_format_inplace(pattern, pattern_size, - td->o.verify_fmt, td->o.verify_fmt_sz, io_u); + /* + * Make this thread safe when verify_async is set and the verify + * pattern includes the offset. + */ + if (pattern_need_buffer(td)) { + pattern = malloc(pattern_size); + assert(pattern); + memcpy(pattern, td->o.verify_pattern, pattern_size); + } + + if (!td->o.verify_pattern_interval) { + (void)paste_format_inplace(pattern, pattern_size, + td->o.verify_fmt, td->o.verify_fmt_sz, io_u); + } + + /* + * We have 3 cases here: + * 1. Compare the entire buffer if (1) verify_interval is not set and + * (2) verify_pattern_interval is not set + * 2. Compare the entire *verify_interval* if (1) verify_interval *is* + * set and (2) verify_pattern_interval is not set + * 3. 
Compare *verify_pattern_interval* segments or subsets thereof if + * (2) verify_pattern_interval is set + */ buf = (char *) hdr + header_size; len = get_hdr_inc(td, io_u) - header_size; - mod = (get_hdr_inc(td, io_u) * vc->hdr_num + header_size) % pattern_size; - - rc = cmp_pattern(pattern, pattern_size, mod, buf, len); - if (!rc) - return 0; + if (td->o.verify_pattern_interval) { + unsigned int extent = get_hdr_inc(td, io_u) * vc->hdr_num + header_size; + pattern_interval_mod = extent % td->o.verify_pattern_interval; + mod = pattern_interval_mod % pattern_size; + bytes_todo = min(len, td->o.verify_pattern_interval - pattern_interval_mod); + io_u->offset += extent / td->o.verify_pattern_interval * td->o.verify_pattern_interval; + } else { + mod = (get_hdr_inc(td, io_u) * vc->hdr_num + header_size) % pattern_size; + bytes_todo = len; + pattern_interval_mod = 0; + } - /* Slow path, compare each byte */ - for (i = 0; i < len; i++) { - if (buf[i] != pattern[mod]) { - unsigned int bits; + while (bytes_done < len) { + if (td->o.verify_pattern_interval) { + (void)paste_format_inplace(pattern, pattern_size, + td->o.verify_fmt, td->o.verify_fmt_sz, + io_u); + } - bits = hweight8(buf[i] ^ pattern[mod]); - log_err("fio: got pattern '%02x', wanted '%02x'. 
Bad bits %d\n", - (unsigned char)buf[i], - (unsigned char)pattern[mod], - bits); - log_err("fio: bad pattern block offset %u\n", i); + rc = check_pattern(buf, bytes_todo, mod, pattern_size, pattern, header_size); + if (rc) { vc->name = "pattern"; log_verify_failure(hdr, vc); - return EILSEQ; + break; } - mod++; - if (mod == td->o.verify_pattern_bytes) - mod = 0; + + mod = 0; + bytes_done += bytes_todo; + buf += bytes_todo; + io_u->offset += td->o.verify_pattern_interval; + bytes_todo = min(len - bytes_done, td->o.verify_pattern_interval); } - /* Unreachable line */ - assert(0); - return EILSEQ; + io_u->offset = offset; + if (pattern_need_buffer(td)) + free(pattern); + return rc; } static int verify_io_u_xxhash(struct verify_header *hdr, struct vcont *vc) @@ -528,6 +631,7 @@ static int verify_io_u_sha512(struct verify_header *hdr, struct vcont *vc) fio_sha512_init(&sha512_ctx); fio_sha512_update(&sha512_ctx, p, hdr->len - hdr_size(vc->td, hdr)); + fio_sha512_final(&sha512_ctx); if (!memcmp(vh->sha512, sha512_ctx.buf, sizeof(sha512))) return 0; @@ -818,12 +922,17 @@ static int verify_header(struct io_u *io_u, struct thread_data *td, hdr->magic, FIO_HDR_MAGIC); goto err; } + if (hdr->version != VERIFY_HEADER_VERSION) { + log_err("verify: unsupported header version %x, wanted %x. Are you trying to verify across versions of fio?", + hdr->version, VERIFY_HEADER_VERSION); + goto err; + } if (hdr->len != hdr_len) { log_err("verify: bad header length %u, wanted %u", hdr->len, hdr_len); goto err; } - if (hdr->rand_seed != io_u->rand_seed) { + if (td->o.verify_header_seed && (hdr->rand_seed != io_u->rand_seed)) { log_err("verify: bad header rand_seed %"PRIu64 ", wanted %"PRIu64, hdr->rand_seed, io_u->rand_seed); @@ -839,15 +948,16 @@ static int verify_header(struct io_u *io_u, struct thread_data *td, /* * For read-only workloads, the program cannot be certain of the * last numberio written to a block. Checking of numberio will be - * done only for workloads that write data. 
For verify_only, - * numberio check is skipped. + * done only for workloads that write data. For verify_only or + * any mode de-selecting verify_write_sequence, numberio check is + * skipped. */ if (td_write(td) && (td_min_bs(td) == td_max_bs(td)) && !td->o.time_based) - if (!td->o.verify_only) + if (td->o.verify_write_sequence) if (hdr->numberio != io_u->numberio) { - log_err("verify: bad header numberio %"PRIu16 - ", wanted %"PRIu16, + log_err("verify: bad header numberio %"PRIu64 + ", wanted %"PRIu64, hdr->numberio, io_u->numberio); goto err; } @@ -891,6 +1001,13 @@ int verify_io_u(struct thread_data *td, struct io_u **io_u_ptr) if (td_ioengine_flagged(td, FIO_FAKEIO)) return 0; + /* + * If data has already been verified from the device, we can skip + * the actual verification phase here. + */ + if (io_u->flags & IO_U_F_VER_IN_DEV) + return 0; + if (io_u->flags & IO_U_F_TRIMMED) { ret = verify_trimmed_io_u(td, io_u); goto done; @@ -916,12 +1033,6 @@ int verify_io_u(struct thread_data *td, struct io_u **io_u_ptr) memswp(p, p + td->o.verify_offset, header_size); hdr = p; - /* - * Make rand_seed check pass when have verify_backlog. 
- */ - if (!td_rw(td) || (td->flags & TD_F_VER_BACKLOG)) - io_u->rand_seed = hdr->rand_seed; - if (td->o.verify != VERIFY_PATTERN_NO_HDR) { ret = verify_header(io_u, td, hdr, hdr_num, hdr_inc); if (ret) @@ -992,7 +1103,7 @@ int verify_io_u(struct thread_data *td, struct io_u **io_u_ptr) ret = EINVAL; } - if (ret && verify_type != hdr->verify_type) + if (ret && verify_type != hdr->verify_type && verify_type != VERIFY_PATTERN_NO_HDR) log_err("fio: verify type mismatch (%u media, %u given)\n", hdr->verify_type, verify_type); } @@ -1073,6 +1184,7 @@ static void fill_sha512(struct verify_header *hdr, void *p, unsigned int len) fio_sha512_init(&sha512_ctx); fio_sha512_update(&sha512_ctx, p, len); + fio_sha512_final(&sha512_ctx); } static void fill_sha256(struct verify_header *hdr, void *p, unsigned int len) @@ -1153,6 +1265,7 @@ static void __fill_hdr(struct thread_data *td, struct io_u *io_u, void *p = hdr; hdr->magic = FIO_HDR_MAGIC; + hdr->version = VERIFY_HEADER_VERSION; hdr->verify_type = td->o.verify; hdr->len = header_len; hdr->rand_seed = rand_seed; @@ -1287,8 +1400,6 @@ void populate_verify_io_u(struct thread_data *td, struct io_u *io_u) if (td->o.verify == VERIFY_NULL) return; - io_u->numberio = td->io_issues[io_u->ddir]; - fill_pattern_headers(td, io_u, 0, 0); } @@ -1524,54 +1635,12 @@ int paste_blockoff(char *buf, unsigned int len, void *priv) return 0; } -static int __fill_file_completions(struct thread_data *td, - struct thread_io_list *s, - struct fio_file *f, unsigned int *index) -{ - unsigned int comps; - int i, j; - - if (!f->last_write_comp) - return 0; - - if (td->io_blocks[DDIR_WRITE] < td->o.iodepth) - comps = td->io_blocks[DDIR_WRITE]; - else - comps = td->o.iodepth; - - j = f->last_write_idx - 1; - for (i = 0; i < comps; i++) { - if (j == -1) - j = td->o.iodepth - 1; - s->comps[*index].fileno = __cpu_to_le64(f->fileno); - s->comps[*index].offset = cpu_to_le64(f->last_write_comp[j]); - (*index)++; - j--; - } - - return comps; -} - -static int 
fill_file_completions(struct thread_data *td, - struct thread_io_list *s, unsigned int *index) -{ - struct fio_file *f; - unsigned int i; - int comps = 0; - - for_each_file(td, f, i) - comps += __fill_file_completions(td, s, f, index); - - return comps; -} - struct all_io_list *get_all_io_list(int save_mask, size_t *sz) { struct all_io_list *rep; - struct thread_data *td; size_t depth; void *next; - int i, nr; + int nr; compiletime_assert(sizeof(struct all_io_list) == 8, "all_io_list"); @@ -1581,59 +1650,56 @@ struct all_io_list *get_all_io_list(int save_mask, size_t *sz) */ depth = 0; nr = 0; - for_each_td(td, i) { - if (save_mask != IO_LIST_ALL && (i + 1) != save_mask) + for_each_td(td) { + if (save_mask != IO_LIST_ALL && (__td_index + 1) != save_mask) continue; td->stop_io = 1; td->flags |= TD_F_VSTATE_SAVED; depth += (td->o.iodepth * td->o.nr_files); nr++; - } + } end_for_each(); if (!nr) return NULL; *sz = sizeof(*rep); *sz += nr * sizeof(struct thread_io_list); - *sz += depth * sizeof(struct file_comp); - rep = malloc(*sz); - memset(rep, 0, *sz); + *sz += depth * sizeof(struct inflight_write); + rep = calloc(1, *sz); rep->threads = cpu_to_le64((uint64_t) nr); next = &rep->state[0]; - for_each_td(td, i) { + for_each_td(td) { struct thread_io_list *s = next; - unsigned int comps, index = 0; - if (save_mask != IO_LIST_ALL && (i + 1) != save_mask) + if (save_mask != IO_LIST_ALL && (__td_index + 1) != save_mask) continue; - comps = fill_file_completions(td, s, &index); - - s->no_comps = cpu_to_le64((uint64_t) comps); - s->depth = cpu_to_le64((uint64_t) td->o.iodepth); - s->nofiles = cpu_to_le64((uint64_t) td->o.nr_files); - s->numberio = cpu_to_le64((uint64_t) td->io_issues[DDIR_WRITE]); - s->index = cpu_to_le64((uint64_t) i); - if (td->random_state.use64) { - s->rand.state64.s[0] = cpu_to_le64(td->random_state.state64.s1); - s->rand.state64.s[1] = cpu_to_le64(td->random_state.state64.s2); - s->rand.state64.s[2] = cpu_to_le64(td->random_state.state64.s3); - 
s->rand.state64.s[3] = cpu_to_le64(td->random_state.state64.s4); - s->rand.state64.s[4] = cpu_to_le64(td->random_state.state64.s5); + for (int i = 0; i < td->o.iodepth; i++) + s->inflight[i].numberio = cpu_to_le64(atomic_load_acquire(&td->inflight_numberio[i])); + + s->depth = cpu_to_le32((uint32_t) td->o.iodepth); + s->numberio = cpu_to_le64((uint64_t) atomic_load_acquire(&td->inflight_issued)); + s->index = cpu_to_le64((uint64_t) __td_index); + if (td->offset_state.use64) { + s->rand.state64.s[0] = cpu_to_le64(td->offset_state.state64.s1); + s->rand.state64.s[1] = cpu_to_le64(td->offset_state.state64.s2); + s->rand.state64.s[2] = cpu_to_le64(td->offset_state.state64.s3); + s->rand.state64.s[3] = cpu_to_le64(td->offset_state.state64.s4); + s->rand.state64.s[4] = cpu_to_le64(td->offset_state.state64.s5); s->rand.state64.s[5] = 0; s->rand.use64 = cpu_to_le64((uint64_t)1); } else { - s->rand.state32.s[0] = cpu_to_le32(td->random_state.state32.s1); - s->rand.state32.s[1] = cpu_to_le32(td->random_state.state32.s2); - s->rand.state32.s[2] = cpu_to_le32(td->random_state.state32.s3); + s->rand.state32.s[0] = cpu_to_le32(td->offset_state.state32.s1); + s->rand.state32.s[1] = cpu_to_le32(td->offset_state.state32.s2); + s->rand.state32.s[2] = cpu_to_le32(td->offset_state.state32.s3); s->rand.state32.s[3] = 0; s->rand.use64 = 0; } snprintf((char *) s->name, sizeof(s->name), "%s", td->o.name); next = io_list_next(s); - } + } end_for_each(); return rep; } @@ -1650,6 +1716,10 @@ static int open_state_file(const char *name, const char *prefix, int num, else flags = O_RDONLY; +#ifdef _WIN32 + flags |= O_BINARY; +#endif + verify_state_gen_name(out, sizeof(out), name, prefix, num); fd = open(out, flags, 0644); @@ -1738,9 +1808,7 @@ void verify_assign_state(struct thread_data *td, void *p) struct thread_io_list *s = p; int i; - s->no_comps = le64_to_cpu(s->no_comps); s->depth = le32_to_cpu(s->depth); - s->nofiles = le32_to_cpu(s->nofiles); s->numberio = le64_to_cpu(s->numberio); 
s->rand.use64 = le64_to_cpu(s->rand.use64); @@ -1752,9 +1820,9 @@ void verify_assign_state(struct thread_data *td, void *p) s->rand.state32.s[i] = le32_to_cpu(s->rand.state32.s[i]); } - for (i = 0; i < s->no_comps; i++) { - s->comps[i].fileno = le64_to_cpu(s->comps[i].fileno); - s->comps[i].offset = le64_to_cpu(s->comps[i].offset); + for (i = 0; i < s->depth; i++) { + s->inflight[i].numberio = le64_to_cpu(s->inflight[i].numberio); + dprint(FD_VERIFY, "verify_assign_state numberio=%"PRIu64", inflight[%d]=%"PRIu64"\n", s->numberio, i, s->inflight[i].numberio); } td->vstate = p; @@ -1840,38 +1908,31 @@ int verify_load_state(struct thread_data *td, const char *prefix) /* * Use the loaded verify state to know when to stop doing verification */ -int verify_state_should_stop(struct thread_data *td, struct io_u *io_u) +int verify_state_should_stop(struct thread_data *td, uint64_t numberio) { struct thread_io_list *s = td->vstate; - struct fio_file *f = io_u->file; int i; - if (!s || !f) - return 0; - - /* - * If we're not into the window of issues - depth yet, continue. If - * issue is shorter than depth, do check. - */ - if ((td->io_blocks[DDIR_READ] < s->depth || - s->numberio - td->io_blocks[DDIR_READ] > s->depth) && - s->numberio > s->depth) + dprint(FD_VERIFY, "verify_state_should_stop numberio=%"PRIu64"\n", numberio); + if (!s) return 0; - /* - * We're in the window of having to check if this io was - * completed or not. If the IO was seen as completed, then - * lets verify it. + /* If the current seq is lower than the max issued seq, check to make sure + * the write was not inflight. 
*/ - for (i = 0; i < s->no_comps; i++) { - if (s->comps[i].fileno != f->fileno) - continue; - if (io_u->verify_offset == s->comps[i].offset) - return 0; + if (numberio < s->numberio) { + for (i = 0; i < s->depth; i++) { + if (s->inflight[i].numberio == numberio) { + log_info("Stop verify because seq %"PRIu64" was an inflight write\n", + numberio); + return 1; + } + } + } else { + log_info("Stop verify because seq %"PRIu64" >= %"PRIu64"\n", + numberio, s->numberio); + return 1; } - /* - * Not found, we have to stop - */ - return 1; + return 0; } diff --git a/verify.h b/verify.h index 539e6f6cf5..e361337c8f 100644 --- a/verify.h +++ b/verify.h @@ -32,6 +32,12 @@ enum { VERIFY_NULL, /* pretend to verify */ }; +/* + * Set the high bit to distinguish versioned headers from older + * non-versioned headers. + */ +#define VERIFY_HEADER_VERSION 0x81 + /* * A header structure associated with each checksummed data block. It is * followed by a checksum specific header that contains the verification @@ -39,14 +45,15 @@ enum { */ struct verify_header { uint16_t magic; - uint16_t verify_type; + uint8_t version; + uint8_t verify_type; uint32_t len; uint64_t rand_seed; uint64_t offset; uint32_t time_sec; uint32_t time_nsec; + uint64_t numberio; uint16_t thread; - uint16_t numberio; uint32_t crc32; }; diff --git a/workqueue.c b/workqueue.c index 9e6c41ff2f..e3ff032393 100644 --- a/workqueue.c +++ b/workqueue.c @@ -136,7 +136,8 @@ static void *worker_thread(void *data) sk_out_assign(sw->sk_out); if (wq->ops.nice) { - if (nice(wq->ops.nice) < 0) { + errno = 0; + if (nice(wq->ops.nice) == -1 && errno != 0) { log_err("workqueue: nice %s\n", strerror(errno)); ret = 1; } @@ -284,6 +285,7 @@ static int start_worker(struct workqueue *wq, unsigned int index, sw->wq = wq; sw->index = index; sw->sk_out = sk_out; + sw->flags = 0; if (wq->ops.alloc_worker_fn) { ret = wq->ops.alloc_worker_fn(sw); @@ -294,7 +296,7 @@ static int start_worker(struct workqueue *wq, unsigned int index, ret = 
pthread_create(&sw->thread, NULL, worker_thread, sw); if (!ret) { pthread_mutex_lock(&sw->lock); - sw->flags = SW_F_IDLE; + sw->flags |= SW_F_IDLE; pthread_mutex_unlock(&sw->lock); return 0; } diff --git a/zbd.c b/zbd.c index c18998c46f..7a66b665cd 100644 --- a/zbd.c +++ b/zbd.c @@ -11,6 +11,7 @@ #include #include +#include "compiler/compiler.h" #include "os/os.h" #include "file.h" #include "fio.h" @@ -22,13 +23,139 @@ #include "pshared.h" #include "zbd.h" +static bool is_valid_offset(const struct fio_file *f, uint64_t offset) +{ + return (uint64_t)(offset - f->file_offset) < f->io_size; +} + +static inline unsigned int zbd_zone_idx(const struct fio_file *f, + struct fio_zone_info *zone) +{ + return zone - f->zbd_info->zone_info; +} + +/** + * zbd_offset_to_zone_idx - convert an offset into a zone number + * @f: file pointer. + * @offset: offset in bytes. If this offset is in the first zone_size bytes + * past the disk size then the index of the sentinel is returned. + */ +static unsigned int zbd_offset_to_zone_idx(const struct fio_file *f, + uint64_t offset) +{ + uint32_t zone_idx; + + if (f->zbd_info->zone_size_log2 > 0) + zone_idx = offset >> f->zbd_info->zone_size_log2; + else + zone_idx = offset / f->zbd_info->zone_size; + + return min(zone_idx, f->zbd_info->nr_zones); +} + +/** + * zbd_zone_end - Return zone end location + * @z: zone info pointer. + */ +static inline uint64_t zbd_zone_end(const struct fio_zone_info *z) +{ + return (z+1)->start; +} + +/** + * zbd_zone_capacity_end - Return zone capacity limit end location + * @z: zone info pointer. + */ +static inline uint64_t zbd_zone_capacity_end(const struct fio_zone_info *z) +{ + return z->start + z->capacity; +} + +/** + * zbd_zone_remainder - Return the number of bytes that are still available for + * writing before the zone gets full + * @z: zone info pointer. 
+ */ +static inline uint64_t zbd_zone_remainder(struct fio_zone_info *z) +{ + if (z->wp >= zbd_zone_capacity_end(z)) + return 0; + + return zbd_zone_capacity_end(z) - z->wp; +} + +/** + * zbd_zone_full - verify whether a minimum number of bytes remain in a zone + * @f: file pointer. + * @z: zone info pointer. + * @required: minimum number of bytes that must remain in a zone. + * + * The caller must hold z->mutex. + */ +static bool zbd_zone_full(const struct fio_file *f, struct fio_zone_info *z, + uint64_t required) +{ + assert((required & 511) == 0); + + return z->has_wp && required > zbd_zone_remainder(z); +} + +static void zone_lock(struct thread_data *td, const struct fio_file *f, + struct fio_zone_info *z) +{ +#ifndef NDEBUG + unsigned int const nz = zbd_zone_idx(f, z); + /* A thread should never lock zones outside its working area. */ + assert(f->min_zone <= nz && nz < f->max_zone); + assert(z->has_wp); +#endif + + /* + * Lock the io_u target zone. The zone will be unlocked if io_u offset + * is changed or when io_u completes and zbd_put_io() executed. + * To avoid multiple jobs doing asynchronous I/Os from deadlocking each + * other waiting for zone locks when building an io_u batch, first + * only trylock the zone. If the zone is already locked by another job, + * process the currently queued I/Os so that I/O progress is made and + * zones unlocked. 
+ */ + if (pthread_mutex_trylock(&z->mutex) != 0) { + if (!td_ioengine_flagged(td, FIO_SYNCIO)) + io_u_quiesce(td); + pthread_mutex_lock(&z->mutex); + } +} + +static inline void zone_unlock(struct fio_zone_info *z) +{ + assert(z->has_wp); + pthread_mutex_unlock(&z->mutex); +} + +static inline struct fio_zone_info *zbd_get_zone(const struct fio_file *f, + unsigned int zone_idx) +{ + return &f->zbd_info->zone_info[zone_idx]; +} + +static inline struct fio_zone_info * +zbd_offset_to_zone(const struct fio_file *f, uint64_t offset) +{ + return zbd_get_zone(f, zbd_offset_to_zone_idx(f, offset)); +} + +static bool accounting_vdb(struct thread_data *td, const struct fio_file *f) +{ + return td->o.zrt.u.f && td_write(td); +} + /** * zbd_get_zoned_model - Get a device zoned model * @td: FIO thread data * @f: FIO file for which to get model information */ -int zbd_get_zoned_model(struct thread_data *td, struct fio_file *f, - enum zbd_zoned_model *model) +static int zbd_get_zoned_model(struct thread_data *td, struct fio_file *f, + enum zbd_zoned_model *model) { int ret; @@ -71,9 +198,9 @@ int zbd_get_zoned_model(struct thread_data *td, struct fio_file *f, * upon failure. If the zone report is empty, always assume an error (device * problem) and return -EIO. 
*/ -int zbd_report_zones(struct thread_data *td, struct fio_file *f, - uint64_t offset, struct zbd_zone *zones, - unsigned int nr_zones) +static int zbd_report_zones(struct thread_data *td, struct fio_file *f, + uint64_t offset, struct zbd_zone *zones, + unsigned int nr_zones) { int ret; @@ -83,8 +210,8 @@ int zbd_report_zones(struct thread_data *td, struct fio_file *f, ret = blkzoned_report_zones(td, f, offset, zones, nr_zones); if (ret < 0) { td_verror(td, errno, "report zones failed"); - log_err("%s: report zones from sector %"PRIu64" failed (%d).\n", - f->file_name, offset >> 9, errno); + log_err("%s: report zones from sector %"PRIu64" failed (nr_zones=%d; errno=%d).\n", + f->file_name, offset >> 9, nr_zones, errno); } else if (ret == 0) { td_verror(td, errno, "Empty zone report"); log_err("%s: report zones from sector %"PRIu64" is empty.\n", @@ -105,8 +232,8 @@ int zbd_report_zones(struct thread_data *td, struct fio_file *f, * Reset the write pointer of all zones in the range @offset...@offset+@length. * Returns 0 upon success and a negative error code upon failure. */ -int zbd_reset_wp(struct thread_data *td, struct fio_file *f, - uint64_t offset, uint64_t length) +static int zbd_reset_wp(struct thread_data *td, struct fio_file *f, + uint64_t offset, uint64_t length) { int ret; @@ -124,256 +251,536 @@ int zbd_reset_wp(struct thread_data *td, struct fio_file *f, } /** - * zbd_get_max_open_zones - Get the maximum number of open zones - * @td: FIO thread data - * @f: FIO file for which to get max open zones - * @max_open_zones: Upon success, result will be stored here. + * __zbd_reset_zone - reset the write pointer of a single zone + * @td: FIO thread data. + * @f: FIO file associated with the disk for which to reset a write pointer. + * @z: Zone to reset. * - * A @max_open_zones value set to zero means no limit. + * Returns 0 upon success and a negative error code upon failure. + * + * The caller must hold z->mutex. 
+ */ +static int __zbd_reset_zone(struct thread_data *td, struct fio_file *f, + struct fio_zone_info *z) +{ + uint64_t offset = z->start; + uint64_t length = (z+1)->start - offset; + uint64_t data_in_zone = z->wp - z->start; + int ret = 0; + + if (!data_in_zone) + return 0; + + assert(is_valid_offset(f, offset + length - 1)); + + dprint(FD_ZBD, "%s: resetting wp of zone %u.\n", + f->file_name, zbd_zone_idx(f, z)); + + switch (f->zbd_info->model) { + case ZBD_HOST_AWARE: + case ZBD_HOST_MANAGED: + ret = zbd_reset_wp(td, f, offset, length); + if (ret < 0) + return ret; + break; + default: + break; + } + + if (accounting_vdb(td, f)) { + pthread_mutex_lock(&f->zbd_info->mutex); + f->zbd_info->wp_valid_data_bytes -= data_in_zone; + pthread_mutex_unlock(&f->zbd_info->mutex); + } + + z->wp = z->start; + + td->ts.nr_zone_resets++; + + return ret; +} + +/** + * zbd_write_zone_put - Remove a zone from the write target zones array. + * @td: FIO thread data. + * @f: FIO file that has the write zones array to remove. + * @zone_idx: Index of the zone to remove. + * + * The caller must hold f->zbd_info->mutex. + */ +static void zbd_write_zone_put(struct thread_data *td, const struct fio_file *f, + struct fio_zone_info *z) +{ + uint32_t zi; + + if (!z->write) + return; + + for (zi = 0; zi < f->zbd_info->num_write_zones; zi++) { + if (zbd_get_zone(f, f->zbd_info->write_zones[zi]) == z) + break; + } + if (zi == f->zbd_info->num_write_zones) + return; + + dprint(FD_ZBD, "%s: removing zone %u from write zone array\n", + f->file_name, zbd_zone_idx(f, z)); + + memmove(f->zbd_info->write_zones + zi, + f->zbd_info->write_zones + zi + 1, + (ZBD_MAX_WRITE_ZONES - (zi + 1)) * + sizeof(f->zbd_info->write_zones[0])); + + f->zbd_info->num_write_zones--; + td->num_write_zones--; + z->write = 0; +} + +/** + * zbd_reset_zone - reset the write pointer of a single zone and remove the zone + * from the array of write zones. + * @td: FIO thread data. 
+ * @f: FIO file associated with the disk for which to reset a write pointer. + * @z: Zone to reset. * * Returns 0 upon success and a negative error code upon failure. + * + * The caller must hold z->mutex. */ -int zbd_get_max_open_zones(struct thread_data *td, struct fio_file *f, - unsigned int *max_open_zones) +static int zbd_reset_zone(struct thread_data *td, struct fio_file *f, + struct fio_zone_info *z) { int ret; - if (td->io_ops && td->io_ops->get_max_open_zones) - ret = td->io_ops->get_max_open_zones(td, f, max_open_zones); - else - ret = blkzoned_get_max_open_zones(td, f, max_open_zones); + ret = __zbd_reset_zone(td, f, z); + if (ret) + return ret; + + pthread_mutex_lock(&f->zbd_info->mutex); + zbd_write_zone_put(td, f, z); + pthread_mutex_unlock(&f->zbd_info->mutex); + return 0; +} + +/** + * zbd_finish_zone - finish the specified zone + * @td: FIO thread data. + * @f: FIO file for which to finish a zone + * @z: Zone to finish. + * + * Finish the zone at @offset with open or close status. + */ +static int zbd_finish_zone(struct thread_data *td, struct fio_file *f, + struct fio_zone_info *z) +{ + uint64_t offset = z->start; + uint64_t length = f->zbd_info->zone_size; + int ret = 0; + + switch (f->zbd_info->model) { + case ZBD_HOST_AWARE: + case ZBD_HOST_MANAGED: + if (td->io_ops && td->io_ops->finish_zone) + ret = td->io_ops->finish_zone(td, f, offset, length); + else + ret = blkzoned_finish_zone(td, f, offset, length); + break; + default: + break; + } + if (ret < 0) { - td_verror(td, errno, "get max open zones failed"); - log_err("%s: get max open zones failed (%d).\n", - f->file_name, errno); + td_verror(td, errno, "finish zone failed"); + log_err("%s: finish zone at sector %"PRIu64" failed (%d).\n", + f->file_name, offset >> 9, errno); + } else { + z->wp = (z+1)->start; } return ret; } /** - * zbd_zone_idx - convert an offset into a zone number - * @f: file pointer. - * @offset: offset in bytes. 
If this offset is in the first zone_size bytes - * past the disk size then the index of the sentinel is returned. + * zbd_reset_zones - Reset a range of zones. + * @td: fio thread data. + * @f: fio file for which to reset zones + * @zb: first zone to reset. + * @ze: first zone not to reset. + * + * Returns 0 upon success and 1 upon failure. */ -static uint32_t zbd_zone_idx(const struct fio_file *f, uint64_t offset) +static int zbd_reset_zones(struct thread_data *td, struct fio_file *f, + struct fio_zone_info *const zb, + struct fio_zone_info *const ze) { - uint32_t zone_idx; + struct fio_zone_info *z; + const uint64_t min_bs = td->o.min_bs[DDIR_WRITE]; + int res = 0; - if (f->zbd_info->zone_size_log2 > 0) - zone_idx = offset >> f->zbd_info->zone_size_log2; - else - zone_idx = offset / f->zbd_info->zone_size; + if (fio_unlikely(0 == min_bs)) + return 1; - return min(zone_idx, f->zbd_info->nr_zones); + dprint(FD_ZBD, "%s: examining zones %u .. %u\n", + f->file_name, zbd_zone_idx(f, zb), zbd_zone_idx(f, ze)); + + for (z = zb; z < ze; z++) { + if (!z->has_wp) + continue; + + zone_lock(td, f, z); + + if (z->wp != z->start) { + dprint(FD_ZBD, "%s: resetting zone %u\n", + f->file_name, zbd_zone_idx(f, z)); + if (zbd_reset_zone(td, f, z) < 0) + res = 1; + } + + zone_unlock(z); + } + + return res; } /** - * zbd_zone_end - Return zone end location - * @z: zone info pointer. + * zbd_move_zone_wp - move the write pointer of a zone by writing the data in + * the specified buffer + * @td: FIO thread data. + * @f: FIO file for which to move write pointer + * @z: Target zone to move the write pointer + * @length: Length of the move + * @buf: Buffer which holds the data to write + * + * Move the write pointer at the specified offset by writing the data + * in the specified buffer. + * Returns 0 upon success and a negative error code upon failure. 
*/ -static inline uint64_t zbd_zone_end(const struct fio_zone_info *z) +static int zbd_move_zone_wp(struct thread_data *td, struct fio_file *f, + struct zbd_zone *z, uint64_t length, + const char *buf) { - return (z+1)->start; + int ret = 0; + + switch (f->zbd_info->model) { + case ZBD_HOST_AWARE: + case ZBD_HOST_MANAGED: + if (td->io_ops && td->io_ops->move_zone_wp) + ret = td->io_ops->move_zone_wp(td, f, z, length, buf); + else + ret = blkzoned_move_zone_wp(td, f, z, length, buf); + break; + default: + break; + } + + if (ret < 0) { + td_verror(td, errno, "move wp failed"); + log_err("%s: moving wp for %"PRIu64" sectors at sector %"PRIu64" failed (%d).\n", + f->file_name, length >> 9, z->wp >> 9, errno); + } + + return ret; } /** - * zbd_zone_capacity_end - Return zone capacity limit end location - * @z: zone info pointer. + * zbd_get_max_open_zones - Get the maximum number of open zones + * @td: FIO thread data + * @f: FIO file for which to get max open zones + * @max_open_zones: Upon success, result will be stored here. + * + * A @max_open_zones value set to zero means no limit. + * + * Returns 0 upon success and a negative error code upon failure. */ -static inline uint64_t zbd_zone_capacity_end(const struct fio_zone_info *z) +static int zbd_get_max_open_zones(struct thread_data *td, struct fio_file *f, + unsigned int *max_open_zones) { - return z->start + z->capacity; + int ret; + + if (td->io_ops && td->io_ops->get_max_open_zones) + ret = td->io_ops->get_max_open_zones(td, f, max_open_zones); + else + ret = blkzoned_get_max_open_zones(td, f, max_open_zones); + if (ret < 0) { + td_verror(td, errno, "get max open zones failed"); + log_err("%s: get max open zones failed (%d).\n", + f->file_name, errno); + } + + return ret; } /** - * zbd_zone_full - verify whether a minimum number of bytes remain in a zone - * @f: file pointer. - * @z: zone info pointer. - * @required: minimum number of bytes that must remain in a zone. 
+ * zbd_get_max_active_zones - Get the maximum number of active zones + * @td: FIO thread data + * @f: FIO file for which to get max active zones * - * The caller must hold z->mutex. + * Returns max_active_zones limit value of the target file if it is available. + * Otherwise return zero, which means no limit. */ -static bool zbd_zone_full(const struct fio_file *f, struct fio_zone_info *z, - uint64_t required) +static unsigned int zbd_get_max_active_zones(struct thread_data *td, + struct fio_file *f) { - assert((required & 511) == 0); + unsigned int max_active_zones; + int ret; + + if (td->io_ops && td->io_ops->get_max_active_zones) + ret = td->io_ops->get_max_active_zones(td, f, + &max_active_zones); + else + ret = blkzoned_get_max_active_zones(td, f, &max_active_zones); + if (ret < 0) { + dprint(FD_ZBD, "%s: max_active_zones is not available\n", + f->file_name); + return 0; + } - return z->has_wp && - z->wp + required > zbd_zone_capacity_end(z); + return max_active_zones; } -static void zone_lock(struct thread_data *td, const struct fio_file *f, - struct fio_zone_info *z) +/** + * __zbd_write_zone_get - Add a zone to the array of write zones. + * @td: fio thread data. + * @f: fio file that has the write zones array to add. + * @zone_idx: Index of the zone to add. + * + * Do same operation as @zbd_write_zone_get, except it adds the zone at + * @zone_idx to write target zones array even when it does not have remainder + * space to write one block. + */ +static bool __zbd_write_zone_get(struct thread_data *td, + const struct fio_file *f, + struct fio_zone_info *z) { - struct zoned_block_device_info *zbd = f->zbd_info; - uint32_t nz = z - zbd->zone_info; + struct zoned_block_device_info *zbdi = f->zbd_info; + uint32_t zone_idx = zbd_zone_idx(f, z); + bool res = true; - /* A thread should never lock zones outside its working area. 
*/ - assert(f->min_zone <= nz && nz < f->max_zone); + if (z->cond == ZBD_ZONE_COND_OFFLINE) + return false; - assert(z->has_wp); + /* + * Skip full zones with data verification enabled because resetting a + * zone causes data loss and hence causes verification to fail. + */ + if (td->o.verify != VERIFY_NONE && zbd_zone_remainder(z) == 0) + return false; /* - * Lock the io_u target zone. The zone will be unlocked if io_u offset - * is changed or when io_u completes and zbd_put_io() executed. - * To avoid multiple jobs doing asynchronous I/Os from deadlocking each - * other waiting for zone locks when building an io_u batch, first - * only trylock the zone. If the zone is already locked by another job, - * process the currently queued I/Os so that I/O progress is made and - * zones unlocked. + * zbdi->max_write_zones == 0 means that there is no limit on the + * maximum number of write target zones. In this case, do no track write + * target zones in zbdi->write_zones array. */ - if (pthread_mutex_trylock(&z->mutex) != 0) { - if (!td_ioengine_flagged(td, FIO_SYNCIO)) - io_u_quiesce(td); - pthread_mutex_lock(&z->mutex); + if (!zbdi->max_write_zones) + return true; + + pthread_mutex_lock(&zbdi->mutex); + + if (z->write) { + /* + * If the zone is going to be completely filled by writes + * already in-flight, handle it as a full zone instead of a + * write target zone. 
+ */ + if (!zbd_zone_remainder(z)) + res = false; + goto out; } -} -static inline void zone_unlock(struct fio_zone_info *z) -{ - int ret; + res = false; + /* Zero means no limit */ + if (td->o.job_max_open_zones > 0 && + td->num_write_zones >= td->o.job_max_open_zones) + goto out; + if (zbdi->num_write_zones >= zbdi->max_write_zones) + goto out; - assert(z->has_wp); - ret = pthread_mutex_unlock(&z->mutex); - assert(!ret); -} + dprint(FD_ZBD, "%s: adding zone %u to write zone array\n", + f->file_name, zone_idx); -static bool is_valid_offset(const struct fio_file *f, uint64_t offset) -{ - return (uint64_t)(offset - f->file_offset) < f->io_size; + zbdi->write_zones[zbdi->num_write_zones++] = zone_idx; + td->num_write_zones++; + z->write = 1; + res = true; + +out: + pthread_mutex_unlock(&zbdi->mutex); + return res; } -static inline struct fio_zone_info *get_zone(const struct fio_file *f, - unsigned int zone_nr) +/** + * zbd_write_zone_get - Add a zone to the array of write zones. + * @td: fio thread data. + * @f: fio file that has the open zones to add. + * @zone_idx: Index of the zone to add. + * + * Add a ZBD zone to write target zones array, if it is not yet added. Returns + * true if either the zone was already added or if the zone was successfully + * added to the array without exceeding the maximum number of write zones. + * Returns false if the zone was not already added and addition of the zone + * would cause the zone limit to be exceeded. + */ +static bool zbd_write_zone_get(struct thread_data *td, const struct fio_file *f, + struct fio_zone_info *z) { - return &f->zbd_info->zone_info[zone_nr]; + const uint64_t min_bs = td->o.min_bs[DDIR_WRITE]; + + /* + * Skip full zones with data verification enabled because resetting a + * zone causes data loss and hence causes verification to fail. 
+ */ + if (td->o.verify != VERIFY_NONE && zbd_zone_full(f, z, min_bs)) + return false; + + return __zbd_write_zone_get(td, f, z); } -/* Verify whether direct I/O is used for all host-managed zoned drives. */ +/* Verify whether direct I/O is used for all host-managed zoned block drives. */ static bool zbd_using_direct_io(void) { - struct thread_data *td; struct fio_file *f; - int i, j; + int j; - for_each_td(td, i) { + for_each_td(td) { if (td->o.odirect || !(td->o.td_ddir & TD_DDIR_WRITE)) continue; for_each_file(td, f, j) { - if (f->zbd_info && + if (f->zbd_info && f->filetype == FIO_TYPE_BLOCK && f->zbd_info->model == ZBD_HOST_MANAGED) return false; } - } + } end_for_each(); return true; } /* Whether or not the I/O range for f includes one or more sequential zones */ -static bool zbd_is_seq_job(struct fio_file *f) +static bool zbd_is_seq_job(const struct fio_file *f) { uint32_t zone_idx, zone_idx_b, zone_idx_e; assert(f->zbd_info); + if (f->io_size == 0) return false; - zone_idx_b = zbd_zone_idx(f, f->file_offset); - zone_idx_e = zbd_zone_idx(f, f->file_offset + f->io_size - 1); + + zone_idx_b = zbd_offset_to_zone_idx(f, f->file_offset); + zone_idx_e = + zbd_offset_to_zone_idx(f, f->file_offset + f->io_size - 1); for (zone_idx = zone_idx_b; zone_idx <= zone_idx_e; zone_idx++) - if (get_zone(f, zone_idx)->has_wp) + if (zbd_get_zone(f, zone_idx)->has_wp) return true; return false; } /* - * Verify whether offset and size parameters are aligned with zone boundaries. + * Verify whether the file offset and size parameters are aligned with zone + * boundaries. If the file offset is not aligned, align it down to the start of + * the zone containing the start offset and align up the file io_size parameter. 
*/ -static bool zbd_verify_sizes(void) +static bool zbd_zone_align_file_sizes(struct thread_data *td, + struct fio_file *f) { const struct fio_zone_info *z; - struct thread_data *td; - struct fio_file *f; uint64_t new_offset, new_end; - uint32_t zone_idx; - int i, j; - for_each_td(td, i) { - for_each_file(td, f, j) { - if (!f->zbd_info) - continue; - if (f->file_offset >= f->real_file_size) - continue; - if (!zbd_is_seq_job(f)) - continue; + if (!f->zbd_info) + return true; + if (f->file_offset >= f->real_file_size) + return true; + if (!zbd_is_seq_job(f)) + return true; - if (!td->o.zone_size) { - td->o.zone_size = f->zbd_info->zone_size; - if (!td->o.zone_size) { - log_err("%s: invalid 0 zone size\n", - f->file_name); - return false; - } - } else if (td->o.zone_size != f->zbd_info->zone_size) { - log_err("%s: job parameter zonesize %llu does not match disk zone size %"PRIu64".\n", - f->file_name, td->o.zone_size, - f->zbd_info->zone_size); - return false; - } + if (!td->o.zone_size) { + td->o.zone_size = f->zbd_info->zone_size; + if (!td->o.zone_size) { + log_err("%s: invalid 0 zone size\n", + f->file_name); + return false; + } + } else if (td->o.zone_size != f->zbd_info->zone_size) { + log_err("%s: zonesize %llu does not match the device zone size %"PRIu64".\n", + f->file_name, td->o.zone_size, + f->zbd_info->zone_size); + return false; + } - if (td->o.zone_skip % td->o.zone_size) { - log_err("%s: zoneskip %llu is not a multiple of the device zone size %llu.\n", - f->file_name, td->o.zone_skip, - td->o.zone_size); - return false; - } + if (td->o.zone_skip % td->o.zone_size) { + log_err("%s: zoneskip %llu is not a multiple of the device zone size %llu.\n", + f->file_name, td->o.zone_skip, + td->o.zone_size); + return false; + } - zone_idx = zbd_zone_idx(f, f->file_offset); - z = get_zone(f, zone_idx); - if ((f->file_offset != z->start) && - (td->o.td_ddir != TD_DDIR_READ)) { - new_offset = zbd_zone_end(z); - if (new_offset >= f->file_offset + f->io_size) { - 
log_info("%s: io_size must be at least one zone\n", - f->file_name); - return false; - } - log_info("%s: rounded up offset from %"PRIu64" to %"PRIu64"\n", - f->file_name, f->file_offset, - new_offset); - f->io_size -= (new_offset - f->file_offset); - f->file_offset = new_offset; - } - zone_idx = zbd_zone_idx(f, f->file_offset + f->io_size); - z = get_zone(f, zone_idx); - new_end = z->start; - if ((td->o.td_ddir != TD_DDIR_READ) && - (f->file_offset + f->io_size != new_end)) { - if (new_end <= f->file_offset) { - log_info("%s: io_size must be at least one zone\n", - f->file_name); - return false; - } - log_info("%s: rounded down io_size from %"PRIu64" to %"PRIu64"\n", - f->file_name, f->io_size, - new_end - f->file_offset); - f->io_size = new_end - f->file_offset; - } + if (td->o.td_ddir == TD_DDIR_READ) { + z = zbd_offset_to_zone(f, f->file_offset + f->io_size); + new_end = z->start; + if (f->file_offset + f->io_size > new_end) { + log_info("%s: rounded io_size from %"PRIu64" to %"PRIu64"\n", + f->file_name, f->io_size, + new_end - f->file_offset); + f->io_size = new_end - f->file_offset; + } + return true; + } + + z = zbd_offset_to_zone(f, f->file_offset); + if (f->file_offset != z->start) { + new_offset = zbd_zone_end(z); + if (new_offset >= f->file_offset + f->io_size) { + log_info("%s: io_size must be at least one zone\n", + f->file_name); + return false; + } + log_info("%s: rounded up offset from %"PRIu64" to %"PRIu64"\n", + f->file_name, f->file_offset, + new_offset); + f->io_size -= (new_offset - f->file_offset); + f->file_offset = new_offset; + } + + z = zbd_offset_to_zone(f, f->file_offset + f->io_size); + new_end = z->start; + if (f->file_offset + f->io_size != new_end) { + if (new_end <= f->file_offset) { + log_info("%s: io_size must be at least one zone\n", + f->file_name); + return false; } + log_info("%s: rounded down io_size from %"PRIu64" to %"PRIu64"\n", + f->file_name, f->io_size, + new_end - f->file_offset); + f->io_size = new_end - 
f->file_offset; } return true; } +/* + * Verify whether offset and size parameters are aligned with zone boundaries. + */ +static bool zbd_verify_sizes(void) +{ + struct fio_file *f; + int j; + + for_each_td(td) { + for_each_file(td, f, j) { + if (!zbd_zone_align_file_sizes(td, f)) + return false; + } + } end_for_each(); + + return true; +} + static bool zbd_verify_bs(void) { - struct thread_data *td; struct fio_file *f; - int i, j, k; + int j; - for_each_td(td, i) { + for_each_td(td) { if (td_trim(td) && (td->o.min_bs[DDIR_TRIM] != td->o.max_bs[DDIR_TRIM] || td->o.bssplit_nr[DDIR_TRIM])) { @@ -385,6 +792,7 @@ static bool zbd_verify_bs(void) if (!f->zbd_info) continue; + zone_size = f->zbd_info->zone_size; if (td_trim(td) && td->o.bs[DDIR_TRIM] != zone_size) { log_info("%s: trim block size %llu is not the zone size %"PRIu64"\n", @@ -392,17 +800,8 @@ static bool zbd_verify_bs(void) zone_size); return false; } - for (k = 0; k < FIO_ARRAY_SIZE(td->o.bs); k++) { - if (td->o.verify != VERIFY_NONE && - zone_size % td->o.bs[k] != 0) { - log_info("%s: block size %llu is not a divisor of the zone size %"PRIu64"\n", - f->file_name, td->o.bs[k], - zone_size); - return false; - } - } } - } + } end_for_each(); return true; } @@ -501,7 +900,8 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f) int nr_zones, nrz; struct zbd_zone *zones, *z; struct fio_zone_info *p; - uint64_t zone_size, offset; + uint64_t zone_size, offset, capacity; + bool same_zone_cap = true; struct zoned_block_device_info *zbd_info = NULL; int i, j, ret = -ENOMEM; @@ -518,6 +918,7 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f) } zone_size = zones[0].len; + capacity = zones[0].capacity; nr_zones = (f->real_file_size + zone_size - 1) / zone_size; if (td->o.zone_size == 0) { @@ -529,8 +930,8 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f) goto out; } - dprint(FD_ZBD, "Device %s has %d zones of size %"PRIu64" KB\n", f->file_name, - 
nr_zones, zone_size / 1024); + dprint(FD_ZBD, "Device %s has %d zones of size %"PRIu64" KB\n", + f->file_name, nr_zones, zone_size / 1024); zbd_info = scalloc(1, sizeof(*zbd_info) + (nr_zones + 1) * sizeof(zbd_info->zone_info[0])); @@ -546,6 +947,9 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f) PTHREAD_MUTEX_RECURSIVE); p->start = z->start; p->capacity = z->capacity; + if (capacity != z->capacity) + same_zone_cap = false; + switch (z->cond) { case ZBD_ZONE_COND_NOT_WP: case ZBD_ZONE_COND_FULL: @@ -569,8 +973,9 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f) p->cond = z->cond; if (j > 0 && p->start != p[-1].start + zone_size) { - log_info("%s: invalid zone data\n", - f->file_name); + log_info("%s: invalid zone data [%d:%d]: %"PRIu64" + %"PRIu64" != %"PRIu64"\n", + f->file_name, j, i, + p[-1].start, zone_size, p->start); ret = -EINVAL; goto out; } @@ -579,6 +984,7 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f) offset = z->start + z->len; if (j >= nr_zones) break; + nrz = zbd_report_zones(td, f, offset, zones, min((uint32_t)(nr_zones - j), ZBD_REPORT_MAX_ZONES)); @@ -598,6 +1004,12 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f) f->zbd_info->zone_size_log2 = is_power_of_2(zone_size) ? 
ilog2(zone_size) : 0; f->zbd_info->nr_zones = nr_zones; + f->zbd_info->max_active_zones = zbd_get_max_active_zones(td, f); + + if (same_zone_cap) + dprint(FD_ZBD, "Zone capacity = %"PRIu64" KB\n", + capacity / 1024); + zbd_info = NULL; ret = 0; @@ -607,7 +1019,7 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f) return ret; } -static int zbd_set_max_open_zones(struct thread_data *td, struct fio_file *f) +static int zbd_set_max_write_zones(struct thread_data *td, struct fio_file *f) { struct zoned_block_device_info *zbd = f->zbd_info; unsigned int max_open_zones; @@ -615,7 +1027,7 @@ static int zbd_set_max_open_zones(struct thread_data *td, struct fio_file *f) if (zbd->model != ZBD_HOST_MANAGED || td->o.ignore_zone_limits) { /* Only host-managed devices have a max open limit */ - zbd->max_open_zones = td->o.max_open_zones; + zbd->max_write_zones = td->o.max_open_zones; goto out; } @@ -626,13 +1038,13 @@ static int zbd_set_max_open_zones(struct thread_data *td, struct fio_file *f) if (!max_open_zones) { /* No device limit */ - zbd->max_open_zones = td->o.max_open_zones; + zbd->max_write_zones = td->o.max_open_zones; } else if (!td->o.max_open_zones) { /* No user limit. Set limit to device limit */ - zbd->max_open_zones = max_open_zones; + zbd->max_write_zones = max_open_zones; } else if (td->o.max_open_zones <= max_open_zones) { /* Both user limit and dev limit. User limit not too large */ - zbd->max_open_zones = td->o.max_open_zones; + zbd->max_write_zones = td->o.max_open_zones; } else { /* Both user limit and dev limit. 
User limit too large */ td_verror(td, EINVAL, @@ -644,14 +1056,15 @@ static int zbd_set_max_open_zones(struct thread_data *td, struct fio_file *f) out: /* Ensure that the limit is not larger than FIO's internal limit */ - if (zbd->max_open_zones > ZBD_MAX_OPEN_ZONES) { + if (zbd->max_write_zones > ZBD_MAX_WRITE_ZONES) { td_verror(td, EINVAL, "'max_open_zones' value is too large"); - log_err("'max_open_zones' value is larger than %u\n", ZBD_MAX_OPEN_ZONES); + log_err("'max_open_zones' value is larger than %u\n", + ZBD_MAX_WRITE_ZONES); return -EINVAL; } - dprint(FD_ZBD, "%s: using max open zones limit: %"PRIu32"\n", - f->file_name, zbd->max_open_zones); + dprint(FD_ZBD, "%s: using max write zones limit: %"PRIu32"\n", + f->file_name, zbd->max_write_zones); return 0; } @@ -693,7 +1106,7 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f) assert(f->zbd_info); f->zbd_info->model = zbd_model; - ret = zbd_set_max_open_zones(td, f); + ret = zbd_set_max_write_zones(td, f); if (ret) { zbd_free_zone_info(f); return ret; @@ -728,11 +1141,10 @@ void zbd_free_zone_info(struct fio_file *f) */ static int zbd_init_zone_info(struct thread_data *td, struct fio_file *file) { - struct thread_data *td2; struct fio_file *f2; - int i, j, ret; + int j, ret; - for_each_td(td2, i) { + for_each_td(td2) { for_each_file(td2, f2, j) { if (td2 == td && f2 == file) continue; @@ -743,19 +1155,15 @@ static int zbd_init_zone_info(struct thread_data *td, struct fio_file *file) file->zbd_info->refcount++; return 0; } - } + } end_for_each(); ret = zbd_create_zone_info(td, file); if (ret < 0) td_verror(td, -ret, "zbd_create_zone_info() failed"); + return ret; } -static bool zbd_open_zone(struct thread_data *td, const struct fio_file *f, - uint32_t zone_idx); -static int zbd_reset_zone(struct thread_data *td, struct fio_file *f, - struct fio_zone_info *z); - int zbd_init_files(struct thread_data *td) { struct fio_file *f; @@ -765,6 +1173,7 @@ int zbd_init_files(struct 
thread_data *td) if (zbd_init_zone_info(td, f)) return 1; } + return 0; } @@ -775,28 +1184,71 @@ void zbd_recalc_options_with_zone_granularity(struct thread_data *td) for_each_file(td, f, i) { struct zoned_block_device_info *zbd = f->zbd_info; - // zonemode=strided doesn't get per-file zone size. - uint64_t zone_size = zbd ? zbd->zone_size : td->o.zone_size; + uint64_t zone_size; + /* zonemode=strided doesn't get per-file zone size. */ + zone_size = zbd ? zbd->zone_size : td->o.zone_size; if (zone_size == 0) continue; - if (td->o.size_nz > 0) { + if (td->o.size_nz > 0) td->o.size = td->o.size_nz * zone_size; - } - if (td->o.io_size_nz > 0) { + if (td->o.io_size_nz > 0) td->o.io_size = td->o.io_size_nz * zone_size; - } - if (td->o.start_offset_nz > 0) { + if (td->o.start_offset_nz > 0) td->o.start_offset = td->o.start_offset_nz * zone_size; - } - if (td->o.offset_increment_nz > 0) { - td->o.offset_increment = td->o.offset_increment_nz * zone_size; - } - if (td->o.zone_skip_nz > 0) { + if (td->o.offset_increment_nz > 0) + td->o.offset_increment = + td->o.offset_increment_nz * zone_size; + if (td->o.zone_skip_nz > 0) td->o.zone_skip = td->o.zone_skip_nz * zone_size; + } +} + +static uint64_t zbd_verify_and_set_vdb(struct thread_data *td, + const struct fio_file *f) +{ + struct fio_zone_info *zb, *ze, *z; + uint64_t wp_vdb = 0; + struct zoned_block_device_info *zbdi = f->zbd_info; + + assert(td->runstate < TD_RUNNING); + assert(zbdi); + + if (!accounting_vdb(td, f)) + return 0; + + /* + * Ensure that the I/O range includes one or more sequential zones so + * that f->min_zone and f->max_zone have different values. 
+ */ + if (!zbd_is_seq_job(f)) + return 0; + + if (zbdi->write_min_zone != zbdi->write_max_zone) { + if (zbdi->write_min_zone != f->min_zone || + zbdi->write_max_zone != f->max_zone) { + td_verror(td, EINVAL, + "multi-jobs with different write ranges are " + "not supported with zone_reset_threshold"); + log_err("multi-jobs with different write ranges are " + "not supported with zone_reset_threshold\n"); } + return 0; } + + zbdi->write_min_zone = f->min_zone; + zbdi->write_max_zone = f->max_zone; + + zb = zbd_get_zone(f, f->min_zone); + ze = zbd_get_zone(f, f->max_zone); + for (z = zb; z < ze; z++) + if (z->has_wp) + wp_vdb += z->wp - z->start; + + zbdi->wp_valid_data_bytes = wp_vdb; + + return wp_vdb; } int zbd_setup_files(struct thread_data *td) @@ -815,15 +1267,43 @@ int zbd_setup_files(struct thread_data *td) if (!zbd_verify_bs()) return 1; + if (td->o.recover_zbd_write_error && td_write(td)) { + if (!td->o.continue_on_error) { + log_err("recover_zbd_write_error works only when continue_on_error is set\n"); + return 1; + } + if (td->o.verify != VERIFY_NONE && + !td_ioengine_flagged(td, FIO_SYNCIO)) { + log_err("recover_zbd_write_error for async IO engines does not support verify\n"); + return 1; + } + } + + if (td->o.experimental_verify) { + log_err("zonemode=zbd does not support experimental verify\n"); + return 1; + } + + /* Enable zone reset stat report for write and trim workloads */ + if (td_write(td) || td_trim(td)) + td->ts.count_zone_resets = 1; + for_each_file(td, f, i) { struct zoned_block_device_info *zbd = f->zbd_info; struct fio_zone_info *z; int zi; + uint64_t vdb; assert(zbd); - f->min_zone = zbd_zone_idx(f, f->file_offset); - f->max_zone = zbd_zone_idx(f, f->file_offset + f->io_size); + f->min_zone = zbd_offset_to_zone_idx(f, f->file_offset); + f->max_zone = + zbd_offset_to_zone_idx(f, f->file_offset + f->io_size); + + vdb = zbd_verify_and_set_vdb(td, f); + + dprint(FD_ZBD, "%s(%s): valid data bytes = %" PRIu64 "\n", + __func__, f->file_name, 
vdb); /* * When all zones in the I/O range are conventional, io_size @@ -835,165 +1315,60 @@ int zbd_setup_files(struct thread_data *td) assert(f->min_zone < f->max_zone); if (td->o.max_open_zones > 0 && - zbd->max_open_zones != td->o.max_open_zones) { + zbd->max_write_zones != td->o.max_open_zones) { log_err("Different 'max_open_zones' values\n"); return 1; } + /* + * If this job does not do write operations, skip open zone + * condition check. + */ + if (!td_write(td)) { + if (td->o.job_max_open_zones) + log_info("'job_max_open_zones' is valid only for write jobs\n"); + continue; + } + /* * The per job max open zones limit cannot be used without a * global max open zones limit. (As the tracking of open zones * is disabled when there is no global max open zones limit.) */ - if (td->o.job_max_open_zones && !zbd->max_open_zones) { + if (td->o.job_max_open_zones && !zbd->max_write_zones) { log_err("'job_max_open_zones' cannot be used without a global open zones limit\n"); return 1; } /* - * zbd->max_open_zones is the global limit shared for all jobs + * zbd->max_write_zones is the global limit shared for all jobs * that target the same zoned block device. Force sync the per * thread global limit with the actual global limit. (The real * per thread/job limit is stored in td->o.job_max_open_zones). */ - td->o.max_open_zones = zbd->max_open_zones; + td->o.max_open_zones = zbd->max_write_zones; for (zi = f->min_zone; zi < f->max_zone; zi++) { - z = &zbd->zone_info[zi]; - if (z->cond != ZBD_ZONE_COND_IMP_OPEN && - z->cond != ZBD_ZONE_COND_EXP_OPEN) - continue; - if (zbd_open_zone(td, f, zi)) - continue; - /* - * If the number of open zones exceeds specified limits, - * reset all extra open zones. 
- */ - if (zbd_reset_zone(td, f, z) < 0) { - log_err("Failed to reest zone %d\n", zi); - return 1; - } - } - } - - return 0; -} - -static inline unsigned int zbd_zone_nr(const struct fio_file *f, - struct fio_zone_info *zone) -{ - return zone - f->zbd_info->zone_info; -} - -/** - * zbd_reset_zone - reset the write pointer of a single zone - * @td: FIO thread data. - * @f: FIO file associated with the disk for which to reset a write pointer. - * @z: Zone to reset. - * - * Returns 0 upon success and a negative error code upon failure. - * - * The caller must hold z->mutex. - */ -static int zbd_reset_zone(struct thread_data *td, struct fio_file *f, - struct fio_zone_info *z) -{ - uint64_t offset = z->start; - uint64_t length = (z+1)->start - offset; - uint64_t data_in_zone = z->wp - z->start; - int ret = 0; - - if (!data_in_zone) - return 0; - - assert(is_valid_offset(f, offset + length - 1)); - - dprint(FD_ZBD, "%s: resetting wp of zone %u.\n", f->file_name, - zbd_zone_nr(f, z)); - switch (f->zbd_info->model) { - case ZBD_HOST_AWARE: - case ZBD_HOST_MANAGED: - ret = zbd_reset_wp(td, f, offset, length); - if (ret < 0) - return ret; - break; - default: - break; - } - - pthread_mutex_lock(&f->zbd_info->mutex); - f->zbd_info->sectors_with_data -= data_in_zone; - f->zbd_info->wp_sectors_with_data -= data_in_zone; - pthread_mutex_unlock(&f->zbd_info->mutex); - z->wp = z->start; - z->verify_block = 0; - - td->ts.nr_zone_resets++; - - return ret; -} - -/* The caller must hold f->zbd_info->mutex */ -static void zbd_close_zone(struct thread_data *td, const struct fio_file *f, - unsigned int zone_idx) -{ - uint32_t open_zone_idx = 0; - - for (; open_zone_idx < f->zbd_info->num_open_zones; open_zone_idx++) { - if (f->zbd_info->open_zones[open_zone_idx] == zone_idx) - break; - } - if (open_zone_idx == f->zbd_info->num_open_zones) - return; - - dprint(FD_ZBD, "%s: closing zone %d\n", f->file_name, zone_idx); - memmove(f->zbd_info->open_zones + open_zone_idx, - 
f->zbd_info->open_zones + open_zone_idx + 1, - (ZBD_MAX_OPEN_ZONES - (open_zone_idx + 1)) * - sizeof(f->zbd_info->open_zones[0])); - f->zbd_info->num_open_zones--; - td->num_open_zones--; - get_zone(f, zone_idx)->open = 0; -} - -/* - * Reset a range of zones. Returns 0 upon success and 1 upon failure. - * @td: fio thread data. - * @f: fio file for which to reset zones - * @zb: first zone to reset. - * @ze: first zone not to reset. - */ -static int zbd_reset_zones(struct thread_data *td, struct fio_file *f, - struct fio_zone_info *const zb, - struct fio_zone_info *const ze) -{ - struct fio_zone_info *z; - const uint64_t min_bs = td->o.min_bs[DDIR_WRITE]; - int res = 0; - - assert(min_bs); - - dprint(FD_ZBD, "%s: examining zones %u .. %u\n", f->file_name, - zbd_zone_nr(f, zb), zbd_zone_nr(f, ze)); - for (z = zb; z < ze; z++) { - uint32_t nz = zbd_zone_nr(f, z); - - if (!z->has_wp) - continue; - zone_lock(td, f, z); - pthread_mutex_lock(&f->zbd_info->mutex); - zbd_close_zone(td, f, nz); - pthread_mutex_unlock(&f->zbd_info->mutex); - if (z->wp != z->start) { - dprint(FD_ZBD, "%s: resetting zone %u\n", - f->file_name, zbd_zone_nr(f, z)); - if (zbd_reset_zone(td, f, z) < 0) - res = 1; + z = &zbd->zone_info[zi]; + if (z->cond != ZBD_ZONE_COND_IMP_OPEN && + z->cond != ZBD_ZONE_COND_EXP_OPEN && + z->cond != ZBD_ZONE_COND_CLOSED) + continue; + if (!zbd->max_active_zones && + z->cond == ZBD_ZONE_COND_CLOSED) + continue; + if (__zbd_write_zone_get(td, f, z)) + continue; + /* + * If the number of open zones exceeds specified limits, + * error out. 
+ */ + log_err("Number of open zones exceeds max_open_zones limit\n"); + return 1; } - zone_unlock(z); } - return res; + return 0; } /* @@ -1033,224 +1408,162 @@ static bool zbd_dec_and_reset_write_cnt(const struct thread_data *td, return write_cnt == 0; } -enum swd_action { - CHECK_SWD, - SET_SWD, -}; - -/* Calculate the number of sectors with data (swd) and perform action 'a' */ -static uint64_t zbd_process_swd(struct thread_data *td, - const struct fio_file *f, enum swd_action a) -{ - struct fio_zone_info *zb, *ze, *z; - uint64_t swd = 0; - uint64_t wp_swd = 0; - - zb = get_zone(f, f->min_zone); - ze = get_zone(f, f->max_zone); - for (z = zb; z < ze; z++) { - if (z->has_wp) { - zone_lock(td, f, z); - wp_swd += z->wp - z->start; - } - swd += z->wp - z->start; - } - pthread_mutex_lock(&f->zbd_info->mutex); - switch (a) { - case CHECK_SWD: - assert(f->zbd_info->sectors_with_data == swd); - assert(f->zbd_info->wp_sectors_with_data == wp_swd); - break; - case SET_SWD: - f->zbd_info->sectors_with_data = swd; - f->zbd_info->wp_sectors_with_data = wp_swd; - break; - } - pthread_mutex_unlock(&f->zbd_info->mutex); - for (z = zb; z < ze; z++) - if (z->has_wp) - zone_unlock(z); - - return swd; -} - -/* - * The swd check is useful for debugging but takes too much time to leave - * it enabled all the time. Hence it is disabled by default. - */ -static const bool enable_check_swd = false; - -/* Check whether the values of zbd_info.*sectors_with_data are correct. 
*/ -static void zbd_check_swd(struct thread_data *td, const struct fio_file *f) -{ - if (!enable_check_swd) - return; - - zbd_process_swd(td, f, CHECK_SWD); -} - void zbd_file_reset(struct thread_data *td, struct fio_file *f) { struct fio_zone_info *zb, *ze; - uint64_t swd; + bool verify_data_left = false; if (!f->zbd_info || !td_write(td)) return; - zb = get_zone(f, f->min_zone); - ze = get_zone(f, f->max_zone); - swd = zbd_process_swd(td, f, SET_SWD); - dprint(FD_ZBD, "%s(%s): swd = %" PRIu64 "\n", __func__, f->file_name, - swd); + zb = zbd_get_zone(f, f->min_zone); + ze = zbd_get_zone(f, f->max_zone); + /* * If data verification is enabled reset the affected zones before * writing any data to avoid that a zone reset has to be issued while * writing data, which causes data loss. */ - if (td->o.verify != VERIFY_NONE && td->runstate != TD_VERIFYING) - zbd_reset_zones(td, f, zb, ze); + if (td->o.verify != VERIFY_NONE) { + verify_data_left = td->runstate == TD_VERIFYING || + td->io_hist_len || td->verify_batch; + if (!verify_data_left) + zbd_reset_zones(td, f, zb, ze); + } + zbd_reset_write_cnt(td, f); } -/* The caller must hold f->zbd_info->mutex. */ -static bool is_zone_open(const struct thread_data *td, const struct fio_file *f, - unsigned int zone_idx) +/* Return random zone index for one of the write target zones. 
*/ +static uint32_t pick_random_zone_idx(const struct fio_file *f, + const struct io_u *io_u) { - struct zoned_block_device_info *zbdi = f->zbd_info; - int i; - - /* This function should never be called when zbdi->max_open_zones == 0 */ - assert(zbdi->max_open_zones); - assert(td->o.job_max_open_zones == 0 || td->num_open_zones <= td->o.job_max_open_zones); - assert(td->o.job_max_open_zones <= zbdi->max_open_zones); - assert(zbdi->num_open_zones <= zbdi->max_open_zones); - - for (i = 0; i < zbdi->num_open_zones; i++) - if (zbdi->open_zones[i] == zone_idx) - return true; - - return false; + return (io_u->offset - f->file_offset) * + f->zbd_info->num_write_zones / f->io_size; } /* - * Open a ZBD zone if it was not yet open. Returns true if either the zone was - * already open or if opening a new zone is allowed. Returns false if the zone - * was not yet open and opening a new zone would cause the zone limit to be - * exceeded. + * Randomly choose a zone in the array of write zones and in the range for the + * file f. If such a zone is found, return its index in f->zbd_info->zone_info[] + * using @zone_idx, and return true. Otherwise, return false. + * + * Caller must hold f->zbd_info->mutex. */ -static bool zbd_open_zone(struct thread_data *td, const struct fio_file *f, - uint32_t zone_idx) +static bool zbd_pick_write_zone(const struct fio_file* f, + const struct io_u *io_u, uint32_t *zone_idx) { - const uint64_t min_bs = td->o.min_bs[DDIR_WRITE]; struct zoned_block_device_info *zbdi = f->zbd_info; - struct fio_zone_info *z = get_zone(f, zone_idx); - bool res = true; - - if (z->cond == ZBD_ZONE_COND_OFFLINE) - return false; - - /* - * Skip full zones with data verification enabled because resetting a - * zone causes data loss and hence causes verification to fail. 
- */ - if (td->o.verify != VERIFY_NONE && zbd_zone_full(f, z, min_bs)) - return false; + uint32_t write_zone_idx; + uint32_t cur_zone_idx; + int i; /* - * zbdi->max_open_zones == 0 means that there is no limit on the maximum - * number of open zones. In this case, do no track open zones in - * zbdi->open_zones array. + * An array of write target zones is per-device, shared across all jobs. + * Start with quasi-random candidate zone. Ignore zones which do not + * belong to offset/size range of the current job. */ - if (!zbdi->max_open_zones) - return true; - - pthread_mutex_lock(&zbdi->mutex); - if (is_zone_open(td, f, zone_idx)) { - /* - * If the zone is already open and going to be full by writes - * in-flight, handle it as a full zone instead of an open zone. - */ - if (z->wp >= zbd_zone_capacity_end(z)) - res = false; - goto out; + write_zone_idx = pick_random_zone_idx(f, io_u); + assert(!write_zone_idx || write_zone_idx < zbdi->num_write_zones); + + for (i = 0; i < zbdi->num_write_zones; i++) { + if (write_zone_idx >= zbdi->num_write_zones) + write_zone_idx = 0; + cur_zone_idx = zbdi->write_zones[write_zone_idx]; + if (f->min_zone <= cur_zone_idx && cur_zone_idx < f->max_zone) { + *zone_idx = cur_zone_idx; + return true; + } + write_zone_idx++; } - res = false; - /* Zero means no limit */ - if (td->o.job_max_open_zones > 0 && - td->num_open_zones >= td->o.job_max_open_zones) - goto out; - if (zbdi->num_open_zones >= zbdi->max_open_zones) - goto out; - dprint(FD_ZBD, "%s: opening zone %d\n", f->file_name, zone_idx); - zbdi->open_zones[zbdi->num_open_zones++] = zone_idx; - td->num_open_zones++; - z->open = 1; - res = true; - -out: - pthread_mutex_unlock(&zbdi->mutex); - return res; -} -/* Return random zone index for one of the open zones. 
*/ -static uint32_t pick_random_zone_idx(const struct fio_file *f, - const struct io_u *io_u) -{ - return (io_u->offset - f->file_offset) * f->zbd_info->num_open_zones / - f->io_size; + return false; } static bool any_io_in_flight(void) { - struct thread_data *td; - int i; - - for_each_td(td, i) { + for_each_td(td) { if (td->io_u_in_flight) return true; - } + } end_for_each(); return false; } -/* - * Modify the offset of an I/O unit that does not refer to an open zone such - * that it refers to an open zone. Close an open zone and open a new zone if - * necessary. The open zone is searched across sequential zones. +/** + * zbd_convert_to_write_zone - Convert the target zone of an io_u to a writable zone + * @td: The fio thread data + * @io_u: The I/O unit that targets the zone to convert + * @zb: The zone selected at the beginning of the function call. The caller must + * hold zb->mutex. + * + * Modify the offset of an I/O unit that does not refer to a zone such that + * in write target zones array. Add a zone to or remove a zone from the array if + * necessary. The write target zone is searched across sequential zones. * This algorithm can only work correctly if all write pointers are - * a multiple of the fio block size. The caller must neither hold z->mutex - * nor f->zbd_info->mutex. Returns with z->mutex held upon success. + * a multiple of the fio block size. The caller must not hold + * f->zbd_info->mutex. Returns with z->mutex held upon success. 
*/ -static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td, - struct io_u *io_u) +static struct fio_zone_info *zbd_convert_to_write_zone(struct thread_data *td, + struct io_u *io_u, + struct fio_zone_info *zb) { const uint64_t min_bs = td->o.min_bs[io_u->ddir]; struct fio_file *f = io_u->file; struct zoned_block_device_info *zbdi = f->zbd_info; struct fio_zone_info *z; - unsigned int open_zone_idx = -1; uint32_t zone_idx, new_zone_idx; int i; - bool wait_zone_close; + bool wait_zone_write; bool in_flight; bool should_retry = true; + bool need_zone_finish; assert(is_valid_offset(f, io_u->offset)); - if (zbdi->max_open_zones || td->o.job_max_open_zones) { + if (zbd_zone_remainder(zb) > 0 && zbd_zone_remainder(zb) < min_bs) { + pthread_mutex_lock(&f->zbd_info->mutex); + zbd_write_zone_put(td, f, zb); + pthread_mutex_unlock(&f->zbd_info->mutex); + dprint(FD_ZBD, "%s: finish zone %d\n", + f->file_name, zbd_zone_idx(f, zb)); + io_u_quiesce(td); + zbd_finish_zone(td, f, zb); + zone_unlock(zb); + + if (zbd_zone_idx(f, zb) + 1 >= f->max_zone && !td_random(td)) + return NULL; + + /* Find the next write pointer zone */ + do { + zb++; + if (zbd_zone_idx(f, zb) >= f->max_zone) + zb = zbd_get_zone(f, f->min_zone); + } while (!zb->has_wp); + + zone_lock(td, f, zb); + } + + if (zbd_write_zone_get(td, f, zb)) + return zb; + + zone_unlock(zb); + + if (zbdi->max_write_zones || td->o.job_max_open_zones) { /* - * This statement accesses zbdi->open_zones[] on purpose + * This statement accesses zbdi->write_zones[] on purpose * without locking. 
*/ - zone_idx = zbdi->open_zones[pick_random_zone_idx(f, io_u)]; + zone_idx = zbdi->write_zones[pick_random_zone_idx(f, io_u)]; } else { - zone_idx = zbd_zone_idx(f, io_u->offset); + zone_idx = zbd_offset_to_zone_idx(f, io_u->offset); } if (zone_idx < f->min_zone) zone_idx = f->min_zone; else if (zone_idx >= f->max_zone) zone_idx = f->max_zone - 1; - dprint(FD_ZBD, "%s(%s): starting from zone %d (offset %lld, buflen %lld)\n", + + dprint(FD_ZBD, + "%s(%s): starting from zone %d (offset %lld, buflen %lld)\n", __func__, f->file_name, zone_idx, io_u->offset, io_u->buflen); /* @@ -1260,59 +1573,39 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td, * has been obtained. Hence the loop. */ for (;;) { - uint32_t tmp_idx; - - z = get_zone(f, zone_idx); + z = zbd_get_zone(f, zone_idx); if (z->has_wp) zone_lock(td, f, z); + pthread_mutex_lock(&zbdi->mutex); + if (z->has_wp) { if (z->cond != ZBD_ZONE_COND_OFFLINE && - zbdi->max_open_zones == 0 && td->o.job_max_open_zones == 0) + zbdi->max_write_zones == 0 && + td->o.job_max_open_zones == 0) goto examine_zone; - if (zbdi->num_open_zones == 0) { - dprint(FD_ZBD, "%s(%s): no zones are open\n", + if (zbdi->num_write_zones == 0) { + dprint(FD_ZBD, "%s(%s): no zone is write target\n", __func__, f->file_name); - goto open_other_zone; + goto choose_other_zone; } } - /* - * List of opened zones is per-device, shared across all threads. - * Start with quasi-random candidate zone. - * Ignore zones which don't belong to thread's offset/size area. 
- */ - open_zone_idx = pick_random_zone_idx(f, io_u); - assert(!open_zone_idx || - open_zone_idx < zbdi->num_open_zones); - tmp_idx = open_zone_idx; - for (i = 0; i < zbdi->num_open_zones; i++) { - uint32_t tmpz; - - if (tmp_idx >= zbdi->num_open_zones) - tmp_idx = 0; - tmpz = zbdi->open_zones[tmp_idx]; - if (f->min_zone <= tmpz && tmpz < f->max_zone) { - open_zone_idx = tmp_idx; - goto found_candidate_zone; - } - - tmp_idx++; + if (!zbd_pick_write_zone(f, io_u, &new_zone_idx)) { + dprint(FD_ZBD, "%s(%s): no candidate zone\n", + __func__, f->file_name); + pthread_mutex_unlock(&zbdi->mutex); + if (z->has_wp) + zone_unlock(z); + return NULL; } - dprint(FD_ZBD, "%s(%s): no candidate zone\n", - __func__, f->file_name); - pthread_mutex_unlock(&zbdi->mutex); - if (z->has_wp) - zone_unlock(z); - return NULL; - -found_candidate_zone: - new_zone_idx = zbdi->open_zones[open_zone_idx]; if (new_zone_idx == zone_idx) break; zone_idx = new_zone_idx; + pthread_mutex_unlock(&zbdi->mutex); + if (z->has_wp) zone_unlock(z); } @@ -1320,36 +1613,37 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td, /* Both z->mutex and zbdi->mutex are held. */ examine_zone: - if (z->wp + min_bs <= zbd_zone_capacity_end(z)) { + if (zbd_zone_remainder(z) >= min_bs) { pthread_mutex_unlock(&zbdi->mutex); goto out; } -open_other_zone: - /* Check if number of open zones reaches one of limits. */ - wait_zone_close = - zbdi->num_open_zones == f->max_zone - f->min_zone || - (zbdi->max_open_zones && - zbdi->num_open_zones == zbdi->max_open_zones) || +choose_other_zone: + /* Check if number of write target zones reaches one of limits. */ + wait_zone_write = + zbdi->num_write_zones == f->max_zone - f->min_zone || + (zbdi->max_write_zones && + zbdi->num_write_zones == zbdi->max_write_zones) || (td->o.job_max_open_zones && - td->num_open_zones == td->o.job_max_open_zones); + td->num_write_zones == td->o.job_max_open_zones); pthread_mutex_unlock(&zbdi->mutex); /* Only z->mutex is held. 
*/ /* - * When number of open zones reaches to one of limits, wait for - * zone close before opening a new zone. + * When number of write target zones reaches to one of limits, wait for + * zone write completion to one of them before trying a new zone. */ - if (wait_zone_close) { - dprint(FD_ZBD, "%s(%s): quiesce to allow open zones to close\n", + if (wait_zone_write) { + dprint(FD_ZBD, + "%s(%s): quiesce to remove a zone from write target zones array\n", __func__, f->file_name); io_u_quiesce(td); } retry: - /* Zone 'z' is full, so try to open a new zone. */ + /* Zone 'z' is full, so try to choose a new zone. */ for (i = f->io_size / zbdi->zone_size; i > 0; i--) { zone_idx++; if (z->has_wp) @@ -1358,99 +1652,95 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td, if (!is_valid_offset(f, z->start)) { /* Wrap-around. */ zone_idx = f->min_zone; - z = get_zone(f, zone_idx); + z = zbd_get_zone(f, zone_idx); } assert(is_valid_offset(f, z->start)); if (!z->has_wp) continue; zone_lock(td, f, z); - if (z->open) + if (z->write) continue; - if (zbd_open_zone(td, f, zone_idx)) + if (zbd_write_zone_get(td, f, z)) goto out; } /* Only z->mutex is held. */ - /* Check whether the write fits in any of the already opened zones. */ + /* Check whether the write fits in any of the write target zones. 
*/ pthread_mutex_lock(&zbdi->mutex); - for (i = 0; i < zbdi->num_open_zones; i++) { - zone_idx = zbdi->open_zones[i]; + need_zone_finish = true; + for (i = 0; i < zbdi->num_write_zones; i++) { + zone_idx = zbdi->write_zones[i]; if (zone_idx < f->min_zone || zone_idx >= f->max_zone) continue; pthread_mutex_unlock(&zbdi->mutex); zone_unlock(z); - z = get_zone(f, zone_idx); + z = zbd_get_zone(f, zone_idx); zone_lock(td, f, z); - if (z->wp + min_bs <= zbd_zone_capacity_end(z)) + if (zbd_zone_remainder(z) >= min_bs) { + need_zone_finish = false; goto out; + } pthread_mutex_lock(&zbdi->mutex); } /* * When any I/O is in-flight or when all I/Os in-flight get completed, - * the I/Os might have closed zones then retry the steps to open a zone. - * Before retry, call io_u_quiesce() to complete in-flight writes. + * the I/Os might have removed zones from the write target array then + * retry the steps to choose a zone. Before retry, call io_u_quiesce() + * to complete in-flight writes. */ in_flight = any_io_in_flight(); if (in_flight || should_retry) { - dprint(FD_ZBD, "%s(%s): wait zone close and retry open zones\n", + dprint(FD_ZBD, + "%s(%s): wait zone write and retry write target zone selection\n", __func__, f->file_name); + should_retry = in_flight; pthread_mutex_unlock(&zbdi->mutex); zone_unlock(z); io_u_quiesce(td); zone_lock(td, f, z); - should_retry = in_flight; goto retry; } + if (td_random(td) && td->o.verify == VERIFY_NONE && need_zone_finish) + /* + * If all open zones have remainder smaller than the block size + * for random write jobs, choose one of the write target zones + * and finish it. When verify is enabled, skip this zone finish + * operation to avoid verify data corruption by overwrite to the + * zone. 
+ */ + if (zbd_pick_write_zone(f, io_u, &zone_idx)) { + pthread_mutex_unlock(&zbdi->mutex); + zone_unlock(z); + z = zbd_get_zone(f, zone_idx); + zone_lock(td, f, z); + io_u_quiesce(td); + dprint(FD_ZBD, "%s(%s): All write target zones have remainder smaller than block size. Choose zone %d and finish.\n", + __func__, f->file_name, zone_idx); + zbd_finish_zone(td, f, z); + goto out; + } + pthread_mutex_unlock(&zbdi->mutex); + zone_unlock(z); - dprint(FD_ZBD, "%s(%s): did not open another zone\n", __func__, - f->file_name); + + dprint(FD_ZBD, "%s(%s): did not choose another write zone\n", + __func__, f->file_name); + return NULL; out: - dprint(FD_ZBD, "%s(%s): returning zone %d\n", __func__, f->file_name, - zone_idx); + dprint(FD_ZBD, "%s(%s): returning zone %d\n", + __func__, f->file_name, zone_idx); + io_u->offset = z->start; assert(z->has_wp); assert(z->cond != ZBD_ZONE_COND_OFFLINE); - return z; -} - -/* The caller must hold z->mutex. */ -static struct fio_zone_info *zbd_replay_write_order(struct thread_data *td, - struct io_u *io_u, - struct fio_zone_info *z) -{ - const struct fio_file *f = io_u->file; - const uint64_t min_bs = td->o.min_bs[DDIR_WRITE]; - - if (!zbd_open_zone(td, f, zbd_zone_nr(f, z))) { - zone_unlock(z); - z = zbd_convert_to_open_zone(td, io_u); - assert(z); - } - - if (z->verify_block * min_bs >= z->capacity) { - log_err("%s: %d * %"PRIu64" >= %"PRIu64"\n", f->file_name, z->verify_block, - min_bs, z->capacity); - /* - * If the assertion below fails during a test run, adding - * "--experimental_verify=1" to the command line may help. 
- */ - assert(false); - } - io_u->offset = z->start + z->verify_block * min_bs; - if (io_u->offset + io_u->buflen >= zbd_zone_capacity_end(z)) { - log_err("%s: %llu + %llu >= %"PRIu64"\n", f->file_name, io_u->offset, - io_u->buflen, zbd_zone_capacity_end(z)); - assert(false); - } - z->verify_block += io_u->buflen / min_bs; return z; } @@ -1468,7 +1758,7 @@ zbd_find_zone(struct thread_data *td, struct io_u *io_u, uint64_t min_bytes, { struct fio_file *f = io_u->file; struct fio_zone_info *z1, *z2; - const struct fio_zone_info *const zf = get_zone(f, f->min_zone); + const struct fio_zone_info *const zf = zbd_get_zone(f, f->min_zone); /* * Skip to the next non-empty zone in case of sequential I/O and to @@ -1485,6 +1775,7 @@ zbd_find_zone(struct thread_data *td, struct io_u *io_u, uint64_t min_bytes, } else if (!td_random(td)) { break; } + if (td_random(td) && z2 >= zf && z2->cond != ZBD_ZONE_COND_OFFLINE) { if (z2->has_wp) @@ -1495,8 +1786,11 @@ zbd_find_zone(struct thread_data *td, struct io_u *io_u, uint64_t min_bytes, zone_unlock(z2); } } - dprint(FD_ZBD, "%s: no zone has %"PRIu64" bytes of readable data\n", + + dprint(FD_ZBD, + "%s: no zone has %"PRIu64" bytes of readable data\n", f->file_name, min_bytes); + return NULL; } @@ -1505,7 +1799,8 @@ zbd_find_zone(struct thread_data *td, struct io_u *io_u, uint64_t min_bytes, * @io_u: I/O unit * @z: zone info pointer * - * If the write command made the zone full, close it. + * If the write command made the zone full, remove it from the write target + * zones array. * * The caller must hold z->mutex. 
*/ @@ -1517,7 +1812,7 @@ static void zbd_end_zone_io(struct thread_data *td, const struct io_u *io_u, if (io_u->ddir == DDIR_WRITE && io_u->offset + io_u->buflen >= zbd_zone_capacity_end(z)) { pthread_mutex_lock(&f->zbd_info->mutex); - zbd_close_zone(td, f, zbd_zone_nr(f, z)); + zbd_write_zone_put(td, f, z); pthread_mutex_unlock(&f->zbd_info->mutex); } } @@ -1531,55 +1826,67 @@ static void zbd_end_zone_io(struct thread_data *td, const struct io_u *io_u, * For write and trim operations, update the write pointer of the I/O unit * target zone. */ -static void zbd_queue_io(struct thread_data *td, struct io_u *io_u, int q, - bool success) +static void zbd_queue_io(struct thread_data *td, struct io_u *io_u, int *q) { const struct fio_file *f = io_u->file; struct zoned_block_device_info *zbd_info = f->zbd_info; + bool success = io_u->error == 0; struct fio_zone_info *z; - uint32_t zone_idx; uint64_t zone_end; assert(zbd_info); - zone_idx = zbd_zone_idx(f, io_u->offset); - assert(zone_idx < zbd_info->nr_zones); - z = get_zone(f, zone_idx); - + z = zbd_offset_to_zone(f, io_u->offset); assert(z->has_wp); + if (!success && td->o.recover_zbd_write_error && + io_u->ddir == DDIR_WRITE && td_ioengine_flagged(td, FIO_SYNCIO) && + *q == FIO_Q_COMPLETED) { + zbd_recover_write_error(td, io_u); + if (!io_u->error) + success = true; + } + if (!success) goto unlock; dprint(FD_ZBD, "%s: queued I/O (%lld, %llu) for zone %u\n", - f->file_name, io_u->offset, io_u->buflen, zone_idx); + f->file_name, io_u->offset, io_u->buflen, zbd_zone_idx(f, z)); switch (io_u->ddir) { case DDIR_WRITE: zone_end = min((uint64_t)(io_u->offset + io_u->buflen), zbd_zone_capacity_end(z)); - pthread_mutex_lock(&zbd_info->mutex); + /* * z->wp > zone_end means that one or more I/O errors * have occurred. 
*/ - if (z->wp <= zone_end) { - zbd_info->sectors_with_data += zone_end - z->wp; - zbd_info->wp_sectors_with_data += zone_end - z->wp; + if (accounting_vdb(td, f) && z->wp <= zone_end) { + pthread_mutex_lock(&zbd_info->mutex); + zbd_info->wp_valid_data_bytes += zone_end - z->wp; + pthread_mutex_unlock(&zbd_info->mutex); } - pthread_mutex_unlock(&zbd_info->mutex); z->wp = zone_end; break; default: break; } - if (q == FIO_Q_COMPLETED && !io_u->error) + if (*q == FIO_Q_COMPLETED && !io_u->error) zbd_end_zone_io(td, io_u, z); unlock: - if (!success || q != FIO_Q_QUEUED) { + if (!success || *q != FIO_Q_QUEUED) { + if (io_u->ddir == DDIR_WRITE) { + z->writes_in_flight--; + if (z->writes_in_flight == 0 && z->fixing_zone_wp) { + dprint(FD_ZBD, "%s: Fixed write pointer of the zone %u\n", + f->file_name, zbd_zone_idx(f, z)); + z->fixing_zone_wp = 0; + } + } /* BUSY or COMPLETED: unlock the zone */ zone_unlock(z); io_u->zbd_put_io = NULL; @@ -1593,26 +1900,29 @@ static void zbd_queue_io(struct thread_data *td, struct io_u *io_u, int q, static void zbd_put_io(struct thread_data *td, const struct io_u *io_u) { const struct fio_file *f = io_u->file; - struct zoned_block_device_info *zbd_info = f->zbd_info; struct fio_zone_info *z; - uint32_t zone_idx; - - assert(zbd_info); - zone_idx = zbd_zone_idx(f, io_u->offset); - assert(zone_idx < zbd_info->nr_zones); - z = get_zone(f, zone_idx); + assert(f->zbd_info); + z = zbd_offset_to_zone(f, io_u->offset); assert(z->has_wp); dprint(FD_ZBD, "%s: terminate I/O (%lld, %llu) for zone %u\n", - f->file_name, io_u->offset, io_u->buflen, zone_idx); + f->file_name, io_u->offset, io_u->buflen, zbd_zone_idx(f, z)); zbd_end_zone_io(td, io_u, z); + if (io_u->ddir == DDIR_WRITE) { + z->writes_in_flight--; + if (z->writes_in_flight == 0 && z->fixing_zone_wp) { + z->fixing_zone_wp = 0; + dprint(FD_ZBD, "%s: Fixed write pointer of the zone %u\n", + f->file_name, zbd_zone_idx(f, z)); + } + } + zone_unlock(z); - zbd_check_swd(td, f); } /* @@ -1649,28 
+1959,26 @@ void setup_zbd_zone_mode(struct thread_data *td, struct io_u *io_u) struct fio_file *f = io_u->file; enum fio_ddir ddir = io_u->ddir; struct fio_zone_info *z; - uint32_t zone_idx; assert(td->o.zone_mode == ZONE_MODE_ZBD); assert(td->o.zone_size); assert(f->zbd_info); - zone_idx = zbd_zone_idx(f, f->last_pos[ddir]); - z = get_zone(f, zone_idx); + z = zbd_offset_to_zone(f, f->last_pos[ddir]); /* * When the zone capacity is smaller than the zone size and the I/O is * sequential write, skip to zone end if the latest position is at the * zone capacity limit. */ - if (z->capacity < f->zbd_info->zone_size && !td_random(td) && - ddir == DDIR_WRITE && + if (z->capacity < f->zbd_info->zone_size && + !td_random(td) && ddir == DDIR_WRITE && f->last_pos[ddir] >= zbd_zone_capacity_end(z)) { dprint(FD_ZBD, "%s: Jump from zone capacity limit to zone end:" " (%"PRIu64" -> %"PRIu64") for zone %u (%"PRIu64")\n", f->file_name, f->last_pos[ddir], - zbd_zone_end(z), zone_idx, z->capacity); + zbd_zone_end(z), zbd_zone_idx(f, z), z->capacity); td->io_skip_bytes += zbd_zone_end(z) - f->last_pos[ddir]; f->last_pos[ddir] = zbd_zone_end(z); } @@ -1731,8 +2039,8 @@ enum fio_ddir zbd_adjust_ddir(struct thread_data *td, struct io_u *io_u, if (ddir != DDIR_READ || !td_rw(td)) return ddir; - if (io_u->file->zbd_info->sectors_with_data || - td->o.read_beyond_wp) + if (io_u->file->last_start[DDIR_WRITE] != -1ULL || + td->o.read_beyond_wp || td->o.rwmix[DDIR_WRITE] == 0) return DDIR_READ; return DDIR_WRITE; @@ -1751,7 +2059,6 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) { struct fio_file *f = io_u->file; struct zoned_block_device_info *zbdi = f->zbd_info; - uint32_t zone_idx_b; struct fio_zone_info *zb, *zl, *orig_zb; uint32_t orig_len = io_u->buflen; uint64_t min_bs = td->o.min_bs[io_u->ddir]; @@ -1762,14 +2069,15 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) assert(min_bs); assert(is_valid_offset(f, io_u->offset)); 
assert(io_u->buflen); - zone_idx_b = zbd_zone_idx(f, io_u->offset); - zb = get_zone(f, zone_idx_b); + + zb = zbd_offset_to_zone(f, io_u->offset); orig_zb = zb; if (!zb->has_wp) { /* Accept non-write I/Os for conventional zones. */ if (io_u->ddir != DDIR_WRITE) return io_u_accept; + /* * Make sure that writes to conventional zones * don't cross over to any sequential zones. @@ -1783,12 +2091,16 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) "%s: off=%llu + min_bs=%"PRIu64" > next zone %"PRIu64"\n", f->file_name, io_u->offset, min_bs, (zb + 1)->start); - io_u->offset = zb->start + (zb + 1)->start - io_u->offset; - new_len = min(io_u->buflen, (zb + 1)->start - io_u->offset); + io_u->offset = + zb->start + (zb + 1)->start - io_u->offset; + new_len = min(io_u->buflen, + (zb + 1)->start - io_u->offset); } else { new_len = (zb + 1)->start - io_u->offset; } + io_u->buflen = new_len / min_bs * min_bs; + return io_u_accept; } @@ -1800,16 +2112,20 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) io_u->ddir == DDIR_READ && td->o.read_beyond_wp) return io_u_accept; - zbd_check_swd(td, f); - +retry_lock: zone_lock(td, f, zb); + if (!td_ioengine_flagged(td, FIO_SYNCIO) && zb->fixing_zone_wp) { + zone_unlock(zb); + io_u_quiesce(td); + goto retry_lock; + } + switch (io_u->ddir) { case DDIR_READ: - if (td->runstate == TD_VERIFYING && td_write(td)) { - zb = zbd_replay_write_order(td, io_u, zb); + if (td->runstate == TD_VERIFYING && td_write(td)) goto accept; - } + /* * Check that there is enough written data in the zone to do an * I/O of at least min_bs B. 
If there isn't, find a new zone for @@ -1820,7 +2136,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) if (range < min_bs || ((!td_random(td)) && (io_u->offset + min_bs > zb->wp))) { zone_unlock(zb); - zl = get_zone(f, f->max_zone); + zl = zbd_get_zone(f, f->max_zone); zb = zbd_find_zone(td, io_u, min_bs, zb, zl); if (!zb) { dprint(FD_ZBD, @@ -1839,6 +2155,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) if (!td_random(td)) io_u->offset = zb->start; } + /* * Make sure the I/O is within the zone valid data range while * maximizing the I/O size and preserving randomness. @@ -1849,12 +2166,14 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) io_u->offset = zb->start + ((io_u->offset - orig_zb->start) % (range - io_u->buflen)) / min_bs * min_bs; + /* * When zbd_find_zone() returns a conventional zone, * we can simply accept the new i/o offset here. */ if (!zb->has_wp) return io_u_accept; + /* * Make sure the I/O does not cross over the zone wp position. 
*/ @@ -1866,9 +2185,12 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) dprint(FD_IO, "Changed length from %u into %llu\n", orig_len, io_u->buflen); } + assert(zb->start <= io_u->offset); assert(io_u->offset + io_u->buflen <= zb->wp); + goto accept; + case DDIR_WRITE: if (io_u->buflen > zbdi->zone_size) { td_verror(td, EINVAL, "I/O buflen exceeds zone size"); @@ -1877,26 +2199,42 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) f->file_name, io_u->buflen, zbdi->zone_size); goto eof; } - if (!zbd_open_zone(td, f, zone_idx_b)) { - zone_unlock(zb); - zb = zbd_convert_to_open_zone(td, io_u); - if (!zb) { - dprint(FD_IO, "%s: can't convert to open zone", - f->file_name); - goto eof; - } + +retry: + zb = zbd_convert_to_write_zone(td, io_u, zb); + if (!zb) { + dprint(FD_IO, "%s: can't convert to write target zone", + f->file_name); + goto eof; } + + if (zbd_zone_remainder(zb) > 0 && + zbd_zone_remainder(zb) < min_bs) + goto retry; + /* Check whether the zone reset threshold has been exceeded */ if (td->o.zrf.u.f) { - if (zbdi->wp_sectors_with_data >= + if (zbdi->wp_valid_data_bytes >= f->io_size * td->o.zrt.u.f && - zbd_dec_and_reset_write_cnt(td, f)) { + zbd_dec_and_reset_write_cnt(td, f)) zb->reset_zone = 1; - } } + /* Reset the zone pointer if necessary */ if (zb->reset_zone || zbd_zone_full(f, zb, min_bs)) { - assert(td->o.verify == VERIFY_NONE); + if (td->o.verify != VERIFY_NONE) { + /* + * Unset io-u->file to tell get_next_verify() + * that this IO is not requeue. 
+ */ + io_u->file = NULL; + if (!get_next_verify(td, io_u)) { + zone_unlock(zb); + return io_u_accept; + } + io_u->file = f; + } + /* * Since previous write requests may have been submitted * asynchronously and since we will submit the zone @@ -1906,7 +2244,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) */ io_u_quiesce(td); zb->reset_zone = 0; - if (zbd_reset_zone(td, f, zb) < 0) + if (__zbd_reset_zone(td, f, zb) < 0) goto eof; if (zb->capacity < min_bs) { @@ -1916,6 +2254,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) goto eof; } } + /* Make writes occur at the write pointer */ assert(!zbd_zone_full(f, zb, min_bs)); io_u->offset = zb->wp; @@ -1925,6 +2264,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) f->file_name, io_u->offset); goto eof; } + /* * Make sure that the buflen is a multiple of the minimal * block size. Give up if shrinking would make the request too @@ -1941,10 +2281,13 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) orig_len, io_u->buflen); goto accept; } + td_verror(td, EIO, "zone remainder too small"); log_err("zone remainder %lld smaller than min block size %"PRIu64"\n", (zbd_zone_capacity_end(zb) - io_u->offset), min_bs); + goto eof; + case DDIR_TRIM: /* Check random trim targets a non-empty zone */ if (!td_random(td) || zb->wp > zb->start) @@ -1952,7 +2295,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) /* Find out a non-empty zone to trim */ zone_unlock(zb); - zl = get_zone(f, f->max_zone); + zl = zbd_get_zone(f, f->max_zone); zb = zbd_find_zone(td, io_u, 1, zb, zl); if (zb) { io_u->offset = zb->start; @@ -1960,7 +2303,9 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) f->file_name, io_u->offset); goto accept; } + goto eof; + case DDIR_SYNC: /* fall-through */ case DDIR_DATASYNC: @@ -1968,6 +2313,7 @@ enum io_u_action zbd_adjust_block(struct 
thread_data *td, struct io_u *io_u) case DDIR_WAIT: case DDIR_LAST: case DDIR_INVAL: + case DDIR_TIMEOUT: goto accept; } @@ -1978,19 +2324,25 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u) assert(zb->cond != ZBD_ZONE_COND_OFFLINE); assert(!io_u->zbd_queue_io); assert(!io_u->zbd_put_io); + io_u->zbd_queue_io = zbd_queue_io; io_u->zbd_put_io = zbd_put_io; + if (io_u->ddir == DDIR_WRITE) + zb->writes_in_flight++; + /* * Since we return with the zone lock still held, * add an annotation to let Coverity know that it * is intentional. */ /* coverity[missing_unlock] */ + return io_u_accept; eof: if (zb && zb->has_wp) zone_unlock(zb); + return io_u_eof; } @@ -2014,21 +2366,19 @@ char *zbd_write_status(const struct thread_stat *ts) * Return io_u_completed when reset zone succeeds. Return 0 when the target zone * does not have write pointer. On error, return negative errno. */ -int zbd_do_io_u_trim(const struct thread_data *td, struct io_u *io_u) +int zbd_do_io_u_trim(struct thread_data *td, struct io_u *io_u) { struct fio_file *f = io_u->file; struct fio_zone_info *z; - uint32_t zone_idx; int ret; - zone_idx = zbd_zone_idx(f, io_u->offset); - z = get_zone(f, zone_idx); - + z = zbd_offset_to_zone(f, io_u->offset); if (!z->has_wp) return 0; if (io_u->offset != z->start) { - log_err("Trim offset not at zone start (%lld)\n", io_u->offset); + log_err("Trim offset not at zone start (%lld)\n", + io_u->offset); return -EINVAL; } @@ -2038,3 +2388,83 @@ int zbd_do_io_u_trim(const struct thread_data *td, struct io_u *io_u) return io_u_completed; } + +void zbd_log_err(const struct thread_data *td, const struct io_u *io_u) +{ + const struct fio_file *f = io_u->file; + + if (td->o.zone_mode != ZONE_MODE_ZBD) + return; + + if (io_u->error == EOVERFLOW) + log_err("%s: Exceeded max_active_zones limit. 
Check conditions of zones out of I/O ranges.\n", + f->file_name); +} + +void zbd_recover_write_error(struct thread_data *td, struct io_u *io_u) +{ + struct fio_file *f = io_u->file; + struct fio_zone_info *z; + struct zbd_zone zrep; + unsigned long long retry_offset; + unsigned long long retry_len; + char *retry_buf; + uint64_t write_end_offset; + int ret; + + z = zbd_offset_to_zone(f, io_u->offset); + if (!z->has_wp) + return; + write_end_offset = io_u->offset + io_u->buflen - z->start; + + assert(z->writes_in_flight); + + if (!z->fixing_zone_wp) { + z->fixing_zone_wp = 1; + dprint(FD_ZBD, "%s: Start fixing %u write pointer\n", + f->file_name, zbd_zone_idx(f, z)); + } + + if (z->max_write_error_offset < write_end_offset) + z->max_write_error_offset = write_end_offset; + + if (z->writes_in_flight > 1) + return; + + /* + * This is the last write to the zone since the write error to recover. + * Get the zone current write pointer and recover the write pointer + * position so that next write can continue. 
+ */ + ret = zbd_report_zones(td, f, z->start, &zrep, 1); + if (ret != 1) { + log_info("fio: Report zone for write recovery failed for %s\n", + f->file_name); + return; + } + + if (zrep.wp < z->start || + z->start + z->max_write_error_offset < zrep.wp ) { + log_info("fio: unexpected write pointer position on error for %s: wp=%"PRIu64"\n", + f->file_name, zrep.wp); + return; + } + + retry_offset = zrep.wp; + retry_len = z->start + z->max_write_error_offset - retry_offset; + retry_buf = NULL; + if (retry_offset >= io_u->offset) + retry_buf = (char *)io_u->buf + (retry_offset - io_u->offset); + + ret = zbd_move_zone_wp(td, io_u->file, &zrep, retry_len, retry_buf); + if (ret) { + log_info("fio: Failed to recover write pointer for %s\n", + f->file_name); + return; + } + + z->wp = retry_offset + retry_len; + + dprint(FD_ZBD, "%s: Write pointer move succeeded for error=%d\n", + f->file_name, io_u->error); +} diff --git a/zbd.h b/zbd.h index 0a73b41dd9..14204316d4 100644 --- a/zbd.h +++ b/zbd.h @@ -25,48 +25,57 @@ enum io_u_action { * @start: zone start location (bytes) * @wp: zone write pointer location (bytes) * @capacity: maximum size usable from the start of a zone (bytes) - * @verify_block: number of blocks that have been verified for this zone + * @writes_in_flight: number of writes in flight for the zone + * @max_write_error_offset: maximum offset from zone start among the failed + * writes to the zone * @mutex: protects the modifiable members in this structure * @type: zone type (BLK_ZONE_TYPE_*) * @cond: zone state (BLK_ZONE_COND_*) * @has_wp: whether or not this zone can have a valid write pointer - * @open: whether or not this zone is currently open. Only relevant if - * max_open_zones > 0. + * @write: whether or not this zone is the write target at this moment. Only + * relevant if zbd->max_write_zones > 0. 
* @reset_zone: whether or not this zone should be reset before writing to it + * @fixing_zone_wp: whether or not the write pointer of this zone is under fix */ struct fio_zone_info { pthread_mutex_t mutex; uint64_t start; uint64_t wp; uint64_t capacity; - uint32_t verify_block; + uint32_t writes_in_flight; + uint32_t max_write_error_offset; enum zbd_zone_type type:2; enum zbd_zone_cond cond:4; unsigned int has_wp:1; - unsigned int open:1; + unsigned int write:1; unsigned int reset_zone:1; + unsigned int fixing_zone_wp:1; }; /** * zoned_block_device_info - zoned block device characteristics * @model: Device model. - * @max_open_zones: global limit on the number of simultaneously opened - * sequential write zones. A zero value means unlimited open zones, - * and that open zones will not be tracked in the open_zones array. + * @max_write_zones: global limit on the number of sequential write zones which + * are simultaneously written. A zero value means unlimited zones of + * simultaneous writes and that write target zones will not be tracked in + * the write_zones array. + * @max_active_zones: device side limit on the number of sequential write zones + * in open or closed conditions. A zero value means unlimited number of + * zones in the conditions. * @mutex: Protects the modifiable members in this structure (refcount and * num_open_zones). * @zone_size: size of a single zone in bytes. - * @sectors_with_data: total size of data in all zones in units of 512 bytes - * @wp_sectors_with_data: total size of data in zones with write pointers in - * units of 512 bytes + * @wp_valid_data_bytes: total size of data in zones with write pointers + * @write_min_zone: Minimum zone index of all job's write ranges. Inclusive. + * @write_max_zone: Maximum zone index of all job's write ranges. Exclusive. * @zone_size_log2: log2 of the zone size in bytes if it is a power of 2 or 0 * if the zone size is not a power of 2. 
* @nr_zones: number of zones * @refcount: number of fio files that share this structure - * @num_open_zones: number of open zones + * @num_write_zones: number of write target zones * @write_cnt: Number of writes since the latest zone reset triggered by * the zone_reset_frequency fio job parameter. - * @open_zones: zone numbers of open zones + * @write_zones: zone numbers of write target zones * @zone_info: description of the individual zones * * Only devices for which all zones have the same size are supported. @@ -75,17 +84,19 @@ struct fio_zone_info { */ struct zoned_block_device_info { enum zbd_zoned_model model; - uint32_t max_open_zones; + uint32_t max_write_zones; + uint32_t max_active_zones; pthread_mutex_t mutex; uint64_t zone_size; - uint64_t sectors_with_data; - uint64_t wp_sectors_with_data; + uint64_t wp_valid_data_bytes; + uint32_t write_min_zone; + uint32_t write_max_zone; uint32_t zone_size_log2; uint32_t nr_zones; uint32_t refcount; - uint32_t num_open_zones; + uint32_t num_write_zones; uint32_t write_cnt; - uint32_t open_zones[ZBD_MAX_OPEN_ZONES]; + uint32_t write_zones[ZBD_MAX_WRITE_ZONES]; struct fio_zone_info zone_info[0]; }; @@ -100,7 +111,9 @@ enum fio_ddir zbd_adjust_ddir(struct thread_data *td, struct io_u *io_u, enum fio_ddir ddir); enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u); char *zbd_write_status(const struct thread_stat *ts); -int zbd_do_io_u_trim(const struct thread_data *td, struct io_u *io_u); +int zbd_do_io_u_trim(struct thread_data *td, struct io_u *io_u); +void zbd_log_err(const struct thread_data *td, const struct io_u *io_u); +void zbd_recover_write_error(struct thread_data *td, struct io_u *io_u); static inline void zbd_close_file(struct fio_file *f) { @@ -109,10 +122,10 @@ static inline void zbd_close_file(struct fio_file *f) } static inline void zbd_queue_io_u(struct thread_data *td, struct io_u *io_u, - enum fio_q_status status) + enum fio_q_status *status) { if (io_u->zbd_queue_io) { - 
io_u->zbd_queue_io(td, io_u, status, io_u->error == 0); + io_u->zbd_queue_io(td, io_u, (int *)status); io_u->zbd_queue_io = NULL; } } diff --git a/zbd_types.h b/zbd_types.h index 0a8630cb71..5f44f308f6 100644 --- a/zbd_types.h +++ b/zbd_types.h @@ -8,7 +8,7 @@ #include -#define ZBD_MAX_OPEN_ZONES 4096 +#define ZBD_MAX_WRITE_ZONES 4096 /* * Zoned block device models.