From 95bf651d59b4b741eaea68e15f5ea01093a77159 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Kutryj?= Date: Fri, 29 May 2026 09:33:05 +0200 Subject: [PATCH] feat(checkout): harden slow-retry tier 2 against hangs and false positives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to #538. Three fixes to the opt-in SEMAPHORE_GIT_CLONE_SLOW_RETRY path: - DoH alt-IP resolution now passes curl --connect-timeout/--max-time (5s/10s, overridable). Without them, tier 2 hangs indefinitely when dns.google is blocked or unreachable — common on locked-down self-hosted runners. - Speed check gains a startup grace window (SEMAPHORE_GIT_CLONE_SLOW_GRACE, default 30s) before slow-detection arms. Server-side object counting/compression and connection setup can run for seconds with little on-disk growth; penalizing it killed healthy clones of large repos. - SSH alt-IP path checks for nc up front and fails cleanly with a clear message instead of producing a cryptic git/ssh error when nc is absent. Adds bats coverage for the grace window, curl timeout flags, and the missing-nc path. Full suite (22 tests) verified on a Semaphore ubuntu2204 f1-standard-2 testbox. Co-Authored-By: Claude Opus 4.8 (1M context) --- libcheckout | 26 +++++++++++- tests/libcheckout_slow_retry.bats | 70 +++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 2 deletions(-) diff --git a/libcheckout b/libcheckout index 7f5a2854..6ba5cea9 100755 --- a/libcheckout +++ b/libcheckout @@ -381,6 +381,7 @@ function checkout::clone_with_speed_check() { # Returns: 0 = success, 1 = killed (slow), 2 = git error local threshold="${SEMAPHORE_GIT_CLONE_SLOW_THRESHOLD:-20000}" local timeout="${SEMAPHORE_GIT_CLONE_SLOW_TIMEOUT:-15}" + local grace="${SEMAPHORE_GIT_CLONE_SLOW_GRACE:-30}" local check_interval=5 local target_dir="${SEMAPHORE_GIT_DIR}" @@ -389,15 +390,26 @@ function checkout::clone_with_speed_check() { local prev_size=0 local slow_seconds=0 + local elapsed=0 while kill -0 "$pid" 2>/dev/null; do sleep "$check_interval" kill -0 "$pid" 2>/dev/null || break + elapsed=$((elapsed + check_interval)) local cur_size cur_size=$(du -sk "${target_dir}" 2>/dev/null | awk '{print $1}') cur_size=${cur_size:-0} + # Don't arm slow-detection during the startup grace window. Server-side + # object counting/compression and connection setup can run for seconds + # with little or no on-disk growth before the receive phase ramps up; + # penalizing it would kill healthy clones of large repos. + if [ "$elapsed" -lt "$grace" ]; then + prev_size=$cur_size + continue + fi + if [ "$cur_size" -gt 0 ]; then local speed=$(( (cur_size - prev_size) * 1024 / check_interval )) prev_size=$cur_size @@ -429,9 +441,15 @@ function checkout::resolve_alt_ips() { local regions IFS=',' read -ra regions <<< "${SEMAPHORE_GIT_CLONE_ALT_REGIONS:-74.0.0.0/8,177.0.0.0/8,110.0.0.0/8}" + local connect_timeout="${SEMAPHORE_GIT_CLONE_DOH_CONNECT_TIMEOUT:-5}" + local max_time="${SEMAPHORE_GIT_CLONE_DOH_MAX_TIME:-10}" + for region in "${regions[@]}"; do local ip - ip=$(curl -sf "https://dns.google/resolve?name=${git_host}&type=A&edns_client_subnet=${region}" | \ + # --connect-timeout / --max-time keep tier 2 from hanging when dns.google + # is blocked or unreachable (common on locked-down self-hosted runners). + ip=$(curl -sf --connect-timeout "${connect_timeout}" --max-time "${max_time}" \ + "https://dns.google/resolve?name=${git_host}&type=A&edns_client_subnet=${region}" | \ grep -o '"data":"[^"]*"' | sed 's/"data":"//;s/"//' | head -1) if [ -n "$ip" ] && [ "$ip" != "$current_ip" ]; then @@ -451,7 +469,11 @@ function checkout::clone_with_alt_ip() { # HTTPS: inject curloptResolve between 'git' and 'clone' checkout::clone_with_speed_check git -c "http.curloptResolve=${git_host}:${git_port}:${alt_ip}" "${@:2}" else - # SSH: route through alternative IP via ProxyCommand + # SSH: route through alternative IP via ProxyCommand (needs nc) + if ! command -v nc >/dev/null 2>&1; then + echo "[checkout] 'nc' not available; cannot route SSH clone through alternative endpoint" + return 1 + fi local orig_ssh_command="${GIT_SSH_COMMAND:-}" export GIT_SSH_COMMAND="${orig_ssh_command:-ssh} -o ProxyCommand='nc ${alt_ip} ${git_port}'" checkout::clone_with_speed_check "$@" diff --git a/tests/libcheckout_slow_retry.bats b/tests/libcheckout_slow_retry.bats index f0996e65..90882841 100644 --- a/tests/libcheckout_slow_retry.bats +++ b/tests/libcheckout_slow_retry.bats @@ -12,9 +12,12 @@ setup() { unset SEMAPHORE_GIT_CLONE_SLOW_RETRY unset SEMAPHORE_GIT_CLONE_SLOW_THRESHOLD unset SEMAPHORE_GIT_CLONE_SLOW_TIMEOUT + unset SEMAPHORE_GIT_CLONE_SLOW_GRACE unset SEMAPHORE_GIT_CLONE_RETRY_COUNT unset SEMAPHORE_GIT_CLONE_ALT_IP_RETRIES unset SEMAPHORE_GIT_CLONE_ALT_REGIONS + unset SEMAPHORE_GIT_CLONE_DOH_CONNECT_TIMEOUT + unset SEMAPHORE_GIT_CLONE_DOH_MAX_TIME export SEMAPHORE_GIT_URL="https://github.com/mojombo/grit.git" export SEMAPHORE_GIT_BRANCH=master @@ -105,6 +108,7 @@ SCRIPT @test "slow retry - speed check detects and kills slow process" { export SEMAPHORE_GIT_CLONE_SLOW_THRESHOLD=999999999 export SEMAPHORE_GIT_CLONE_SLOW_TIMEOUT=5 + export SEMAPHORE_GIT_CLONE_SLOW_GRACE=0 local mock="/tmp/slow_mock_$$" cat > "$mock" <<'SCRIPT' @@ -121,12 +125,60 @@ SCRIPT assert_output --partial "[checkout] Slow clone detected" } +@test "slow retry - speed check grace window protects slow start" { + # Slow throughput, but the grace window outlasts the process, so it must + # not be killed as slow (guards against false positives on big-repo startup). + export SEMAPHORE_GIT_CLONE_SLOW_THRESHOLD=999999999 + export SEMAPHORE_GIT_CLONE_SLOW_TIMEOUT=5 + export SEMAPHORE_GIT_CLONE_SLOW_GRACE=60 + + local mock="/tmp/slow_mock_$$" + cat > "$mock" <<'SCRIPT' +#!/bin/bash +dir="$1" +mkdir -p "$dir/.git/objects" +dd if=/dev/zero of="$dir/.git/objects/pack" bs=1024 count=1 2>/dev/null +sleep 12 +SCRIPT + chmod +x "$mock" + + run checkout::clone_with_speed_check "$mock" "$SEMAPHORE_GIT_DIR" + assert_success + refute_output --partial "[checkout] Slow clone detected" +} + +# === resolve_alt_ips timeout === + +@test "slow retry - resolve_alt_ips passes curl connect/max timeouts" { + export SEMAPHORE_GIT_CLONE_ALT_REGIONS="74.0.0.0/8" + + local mock_dir="/tmp/slow_mock_net_$$" + local args_file="${mock_dir}/curl_args" + mkdir -p "$mock_dir" + cat > "$mock_dir/curl" <