Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 24 additions & 2 deletions libcheckout
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,7 @@ function checkout::clone_with_speed_check() {
# Returns: 0 = success, 1 = killed (slow), 2 = git error
local threshold="${SEMAPHORE_GIT_CLONE_SLOW_THRESHOLD:-20000}"
local timeout="${SEMAPHORE_GIT_CLONE_SLOW_TIMEOUT:-15}"
local grace="${SEMAPHORE_GIT_CLONE_SLOW_GRACE:-30}"
local check_interval=5
local target_dir="${SEMAPHORE_GIT_DIR}"

Expand All @@ -389,15 +390,26 @@ function checkout::clone_with_speed_check() {

local prev_size=0
local slow_seconds=0
local elapsed=0

while kill -0 "$pid" 2>/dev/null; do
sleep "$check_interval"
kill -0 "$pid" 2>/dev/null || break
elapsed=$((elapsed + check_interval))

local cur_size
cur_size=$(du -sk "${target_dir}" 2>/dev/null | awk '{print $1}')
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@skipi Disk-size delta is not a reliable network-throughput signal. Healthy clone phases can have flat/negative growth and be killed as "slow". Grace only delays arming; it does not fix the signal. Please replace this with transport-level checks (e.g. git/curl low-speed controls for HTTPS) plus an absolute timeout.

cur_size=${cur_size:-0}

# Don't arm slow-detection during the startup grace window. Server-side
# object counting/compression and connection setup can run for seconds
# with little or no on-disk growth before the receive phase ramps up;
# penalizing it would kill healthy clones of large repos.
if [ "$elapsed" -lt "$grace" ]; then
prev_size=$cur_size
continue
fi

if [ "$cur_size" -gt 0 ]; then
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@skipi Slow accounting still requires cur_size > 0, so pre-write stalls (DNS/TCP/TLS/auth) are never detected by this watchdog. Please add a no-progress timeout and a hard wall-clock timeout independent of directory size.

local speed=$(( (cur_size - prev_size) * 1024 / check_interval ))
prev_size=$cur_size
Expand Down Expand Up @@ -429,9 +441,15 @@ function checkout::resolve_alt_ips() {
local regions
IFS=',' read -ra regions <<< "${SEMAPHORE_GIT_CLONE_ALT_REGIONS:-74.0.0.0/8,177.0.0.0/8,110.0.0.0/8}"

local connect_timeout="${SEMAPHORE_GIT_CLONE_DOH_CONNECT_TIMEOUT:-5}"
local max_time="${SEMAPHORE_GIT_CLONE_DOH_MAX_TIME:-10}"

for region in "${regions[@]}"; do
local ip
ip=$(curl -sf "https://dns.google/resolve?name=${git_host}&type=A&edns_client_subnet=${region}" | \
# --connect-timeout / --max-time keep tier 2 from hanging when dns.google
# is blocked or unreachable (common on locked-down self-hosted runners).
ip=$(curl -sf --connect-timeout "${connect_timeout}" --max-time "${max_time}" \
"https://dns.google/resolve?name=${git_host}&type=A&edns_client_subnet=${region}" | \
grep -o '"data":"[^"]*"' | sed 's/"data":"//;s/"//' | head -1)

if [ -n "$ip" ] && [ "$ip" != "$current_ip" ]; then
Expand All @@ -451,7 +469,11 @@ function checkout::clone_with_alt_ip() {
# HTTPS: inject curloptResolve between 'git' and 'clone'
checkout::clone_with_speed_check git -c "http.curloptResolve=${git_host}:${git_port}:${alt_ip}" "${@:2}"
else
# SSH: route through alternative IP via ProxyCommand
# SSH: route through alternative IP via ProxyCommand (needs nc)
if ! command -v nc >/dev/null 2>&1; then
echo "[checkout] 'nc' not available; cannot route SSH clone through alternative endpoint"
return 1
fi
local orig_ssh_command="${GIT_SSH_COMMAND:-}"
export GIT_SSH_COMMAND="${orig_ssh_command:-ssh} -o ProxyCommand='nc ${alt_ip} ${git_port}'"
checkout::clone_with_speed_check "$@"
Expand Down
70 changes: 70 additions & 0 deletions tests/libcheckout_slow_retry.bats
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,12 @@ setup() {
unset SEMAPHORE_GIT_CLONE_SLOW_RETRY
unset SEMAPHORE_GIT_CLONE_SLOW_THRESHOLD
unset SEMAPHORE_GIT_CLONE_SLOW_TIMEOUT
unset SEMAPHORE_GIT_CLONE_SLOW_GRACE
unset SEMAPHORE_GIT_CLONE_RETRY_COUNT
unset SEMAPHORE_GIT_CLONE_ALT_IP_RETRIES
unset SEMAPHORE_GIT_CLONE_ALT_REGIONS
unset SEMAPHORE_GIT_CLONE_DOH_CONNECT_TIMEOUT
unset SEMAPHORE_GIT_CLONE_DOH_MAX_TIME

export SEMAPHORE_GIT_URL="https://github.com/mojombo/grit.git"
export SEMAPHORE_GIT_BRANCH=master
Expand Down Expand Up @@ -105,6 +108,7 @@ SCRIPT
@test "slow retry - speed check detects and kills slow process" {
export SEMAPHORE_GIT_CLONE_SLOW_THRESHOLD=999999999
export SEMAPHORE_GIT_CLONE_SLOW_TIMEOUT=5
export SEMAPHORE_GIT_CLONE_SLOW_GRACE=0

local mock="/tmp/slow_mock_$$"
cat > "$mock" <<'SCRIPT'
Expand All @@ -121,12 +125,60 @@ SCRIPT
assert_output --partial "[checkout] Slow clone detected"
}

@test "slow retry - speed check grace window protects slow start" {
# Slow throughput, but the grace window outlasts the process, so it must
# not be killed as slow (guards against false positives on big-repo startup).
#
# Configuration used by this case:
#   - SLOW_THRESHOLD is set very high; per the intent stated above, the
#     mock's single small write is meant to register as "slow".
#   - SLOW_GRACE (60) is longer than the mock's total runtime (sleep 12),
#     so the process exits while still inside the grace window.
export SEMAPHORE_GIT_CLONE_SLOW_THRESHOLD=999999999
export SEMAPHORE_GIT_CLONE_SLOW_TIMEOUT=5
export SEMAPHORE_GIT_CLONE_SLOW_GRACE=60

# Stand-in for the clone command. The path is keyed on the shell PID ($$).
# The quoted heredoc delimiter keeps "$1" / "$dir" literal so they are
# expanded by the mock when it runs, not while it is being written.
# The mock creates <dir>/.git/objects, writes one 1 KiB file, then idles.
local mock="/tmp/slow_mock_$$"
cat > "$mock" <<'SCRIPT'
#!/bin/bash
dir="$1"
mkdir -p "$dir/.git/objects"
dd if=/dev/zero of="$dir/.git/objects/pack" bs=1024 count=1 2>/dev/null
sleep 12
SCRIPT
chmod +x "$mock"

# Run the mock under the speed check, passing the checkout directory as the
# mock's first argument. Expect a zero exit status and no slow-clone message.
# NOTE(review): the mock sleeps 12s, so this case takes at least that long.
run checkout::clone_with_speed_check "$mock" "$SEMAPHORE_GIT_DIR"
assert_success
refute_output --partial "[checkout] Slow clone detected"
}

# === resolve_alt_ips timeout ===

@test "slow retry - resolve_alt_ips passes curl connect/max timeouts" {
# Checks that checkout::resolve_alt_ips invokes curl with --connect-timeout
# and --max-time. The two timeout variables are not set in this case, so the
# expected values below (5 and 10) are whatever the function falls back to.
#
# A single region entry keeps the lookup list to one item.
export SEMAPHORE_GIT_CLONE_ALT_REGIONS="74.0.0.0/8"

local mock_dir="/tmp/slow_mock_net_$$"
local args_file="${mock_dir}/curl_args"
mkdir -p "$mock_dir"
# Fake curl. The heredoc delimiter is unquoted on purpose: ${args_file} is
# expanded now (baking the recording path into the stub), while \$@ stays
# literal so the stub expands its own arguments at run time. The stub appends
# its arguments to the recording file and prints a canned JSON answer.
cat > "$mock_dir/curl" <<SCRIPT
#!/bin/bash
echo "\$@" >> "${args_file}"
echo '{"Answer":[{"data":"185.199.108.133"}]}'
SCRIPT
chmod +x "$mock_dir/curl"
# Prepend the stub directory so the fake curl is found before any real one.
export PATH="$mock_dir:$PATH"

# presumably the arguments are (git host, current IP) — confirm against the
# function signature. The canned answer differs from the second argument.
run checkout::resolve_alt_ips "github.com" "140.82.121.4"
assert_success
assert_output --partial "185.199.108.133"

# Inspect what the stub recorded: both timeout flags must appear with their
# expected values.
run cat "${args_file}"
assert_output --partial "--connect-timeout 5"
assert_output --partial "--max-time 10"
}

# === resilient_clone integration ===

@test "slow retry - resilient clone retries on slow then reports failure" {
export SEMAPHORE_GIT_CLONE_SLOW_RETRY=true
export SEMAPHORE_GIT_CLONE_SLOW_THRESHOLD=999999999
export SEMAPHORE_GIT_CLONE_SLOW_TIMEOUT=5
export SEMAPHORE_GIT_CLONE_SLOW_GRACE=0
export SEMAPHORE_GIT_CLONE_RETRY_COUNT=2
export SEMAPHORE_GIT_CLONE_ALT_IP_RETRIES=0

Expand Down Expand Up @@ -344,6 +396,24 @@ SCRIPT
assert_output --partial "http.curloptResolve=github.com:443:1.2.3.4"
}

@test "slow retry - clone_with_alt_ip fails cleanly for SSH when nc missing" {
# Expects checkout::clone_with_alt_ip to return non-zero with an explanatory
# message, and without invoking git, when 'nc' cannot be found on PATH.
#
# scp-style SSH remote; presumably this is what steers the function into its
# SSH path rather than the HTTPS one — verify against the function body.
export SEMAPHORE_GIT_URL="git@github.com:mojombo/grit.git"

# Restrict PATH to a dir with only a git stub so 'command -v nc' fails.
local mock_dir="/tmp/slow_mock_bin_$$"
mkdir -p "$mock_dir"
# The stub prints a sentinel string; seeing it in the output would mean git
# was executed despite the missing dependency.
cat > "$mock_dir/git" <<'SCRIPT'
#!/bin/bash
echo "git should not be called"
SCRIPT
chmod +x "$mock_dir/git"

# PATH is given as a prefix assignment on this one command rather than
# exported. The leading arguments are presumably (alt IP, git host, git port)
# followed by the git command line — confirm against the function signature.
PATH="$mock_dir" run checkout::clone_with_alt_ip "1.2.3.4" "github.com" "22" git clone "${SEMAPHORE_GIT_URL}" "${SEMAPHORE_GIT_DIR}"
assert_failure
assert_output --partial "'nc' not available"
refute_output --partial "git should not be called"
}

# === Full checkout flow with slow retry ===

@test "slow retry - full checkout with flag on succeeds for push" {
Expand Down
Loading