From d86713ec48102d4ebec7e7198732a156e199a756 Mon Sep 17 00:00:00 2001
From: aleck
Date: Thu, 3 Jul 2025 13:12:20 +0800
Subject: [PATCH] fix: Add automatic ECR token refresh mechanism
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix ECR token expiration issue by implementing automatic token refresh
for Nomad cluster nodes that require ECR access.

Affected nodes and ECR token requirements:
- API节点 (API Node): E2B API服务 (E2B API service) ✅ ECR token required
- Client节点 (Client Node): 沙箱执行环境 (sandbox execution environment) ✅ ECR token required
- Build节点 (Build Node): 模板构建环境 (template build environment) ✅ ECR token required
- Server节点 (Server Node): Nomad/Consul管理 (Nomad/Consul management) ❌ No ECR token needed

Modified startup scripts:
1. infra-iac/terraform/scripts/start-api.sh
2. infra-iac/terraform/scripts/start-client.sh
3. infra-iac/terraform/scripts/start-build-cluster.sh

Implementation details:
- Initial ECR token setup during node startup
- Automatic token refresh script (/usr/local/bin/refresh-ecr-token.sh)
- Cron job for periodic refresh (runs at 00:00, 10:00 and 20:00, i.e. at
  most 10 hours apart)
- Nomad service restart after token refresh
- Comprehensive error handling and logging
- ECR token validity: 12 hours, maximum refresh interval: 10 hours

This ensures continuous Docker image pulling capability from ECR
without manual intervention or service disruption.
---
 infra-iac/terraform/scripts/start-api.sh      | 79 ++++++++++++++++++-
 .../terraform/scripts/start-build-cluster.sh  | 78 +++++++++++++++++-
 infra-iac/terraform/scripts/start-client.sh   | 78 +++++++++++++++++-
 3 files changed, 226 insertions(+), 9 deletions(-)

Review notes (free-form text; ignored by git am / git apply):
 * This patch text was re-flowed from a whitespace-collapsed copy. Line
   structure and hunk counts were re-derived and match the hunk headers, but
   the indentation of context lines is a reconstruction (4 spaces assumed).
   If the context does not match, apply with `git apply --ignore-whitespace`.
   The heredoc openers `<<EOF >` and `<<CRON_JOB >` were restored from their
   terminators. The post-image blob ids on the `index` lines are stale.
 * refresh-ecr-token.sh: under `set -e` a failing `aws` call aborted the
   script before the "ERROR: Failed to get ECR token" branch could log. The
   assignment now falls back to an empty token so that branch is reached.
   The same fallback is added to the initial token setup; it is a no-op if
   the startup script does not run with `set -e`.
 * Logging: log() already tees into the log file, so also appending cron's
   stdout to that file wrote every message twice. Cron now discards stdout
   and appends only stderr; the relaunched run-nomad.sh is redirected to the
   log file explicitly so its output still lands there.
 * NOTE(review): `pkill -f nomad` matches every command line containing
   "nomad", and "Nomad restarted successfully" is logged without checking
   the relaunch - confirm this is acceptable.
 * NOTE(review): `${AWS_ACCOUNT_ID}`, `${AWS_REGION}` and `${CONSUL_TOKEN}`
   sit inside a quoted heredoc, so the shell does not expand them when the
   script is written. Presumably Terraform template rendering fills them in
   - confirm, otherwise `set -u` aborts the refresh script under cron.

diff --git a/infra-iac/terraform/scripts/start-api.sh b/infra-iac/terraform/scripts/start-api.sh
index 5e55134..4a71f35 100755
--- a/infra-iac/terraform/scripts/start-api.sh
+++ b/infra-iac/terraform/scripts/start-api.sh
@@ -38,18 +38,91 @@ aws s3 cp "s3://${SCRIPTS_BUCKET}/run-consul-${RUN_CONSUL_FILE_HASH}.sh" /opt/co
 aws s3 cp "s3://${SCRIPTS_BUCKET}/run-api-nomad-${RUN_NOMAD_FILE_HASH}.sh" /opt/nomad/bin/run-nomad.sh
 chmod +x /opt/consul/bin/run-consul.sh /opt/nomad/bin/run-nomad.sh
 
+# Create initial Docker configuration
 mkdir -p /root/docker
 touch /root/docker/config.json
-# export ECR_AUTH_TOKEN=$(aws ecr get-authorization-token --output text --query 'authorizationData[].authorizationToken')
-cat <<EOF >/root/docker/config.json
+
+# Initial ECR token setup (without restarting Nomad since it's not running yet)
+echo "[$(date)] Setting up initial ECR token..."
+new_token=$(aws ecr get-authorization-token --region "${AWS_REGION}" --output text --query 'authorizationData[].authorizationToken' 2>/dev/null) || new_token=""
+
+if [ -n "$new_token" ]; then
+    cat <<EOF >/root/docker/config.json
 {
     "auths": {
         "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com": {
-            "auth": "$(aws ecr get-authorization-token --output text --query 'authorizationData[].authorizationToken')"
+            "auth": "$new_token"
         }
     }
 }
 EOF
+    echo "[$(date)] Initial ECR token configured successfully"
+else
+    echo "[$(date)] Warning: Failed to get initial ECR token"
+fi
+
+# Create ECR token refresh script
+cat <<'REFRESH_SCRIPT' >/usr/local/bin/refresh-ecr-token.sh
+#!/bin/bash
+# ECR Token Refresh Script for API Node
+set -euo pipefail
+
+AWS_ACCOUNT_ID="${AWS_ACCOUNT_ID}"
+AWS_REGION="${AWS_REGION}"
+CONSUL_TOKEN="${CONSUL_TOKEN}"
+LOG_FILE="/var/log/ecr-token-refresh.log"
+
+# Log function: timestamped message to stdout and appended to LOG_FILE
+log() {
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
+}
+
+log "Starting ECR token refresh..."
+
+# Get new ECR token; the "||" fallback lets the ERROR branch below run under set -e
+new_token=$(aws ecr get-authorization-token --region "$AWS_REGION" --output text --query 'authorizationData[].authorizationToken' 2>/dev/null) || new_token=""
+
+if [ -n "$new_token" ]; then
+    # Update Docker config
+    cat <<EOF >/root/docker/config.json
+{
+    "auths": {
+        "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com": {
+            "auth": "$new_token"
+        }
+    }
+}
+EOF
+    log "ECR token refreshed successfully"
+
+    # Restart Nomad to pick up new token
+    if pgrep -f nomad > /dev/null; then
+        log "Restarting Nomad service..."
+        pkill -f nomad
+        sleep 3
+        /opt/nomad/bin/run-nomad.sh --consul-token "$CONSUL_TOKEN" >>"$LOG_FILE" 2>&1 &
+        log "Nomad restarted successfully"
+    else
+        log "Nomad not running, skipping restart"
+    fi
+else
+    log "ERROR: Failed to get ECR token"
+    exit 1
+fi
+REFRESH_SCRIPT
+
+# Make refresh script executable
+chmod +x /usr/local/bin/refresh-ecr-token.sh
+
+# Add cron job to refresh ECR token (fires at hours 0, 10 and 20, so at most 10 hours apart)
+cat <<CRON_JOB >/etc/cron.d/ecr-token-refresh
+# Refresh ECR token at 00:00, 10:00 and 20:00; stdout is dropped because log() already tees into the log file
+0 */10 * * * root /usr/local/bin/refresh-ecr-token.sh >/dev/null 2>>/var/log/ecr-token-refresh.log
+CRON_JOB
+
+# Ensure cron service is running
+systemctl enable cron
+systemctl start cron
 
 mkdir -p /etc/systemd/resolved.conf.d/
 touch /etc/systemd/resolved.conf.d/consul.conf
diff --git a/infra-iac/terraform/scripts/start-build-cluster.sh b/infra-iac/terraform/scripts/start-build-cluster.sh
index 19d9d2a..2016258 100755
--- a/infra-iac/terraform/scripts/start-build-cluster.sh
+++ b/infra-iac/terraform/scripts/start-build-cluster.sh
@@ -60,16 +60,88 @@ chmod +x /opt/consul/bin/run-consul.sh /opt/nomad/bin/run-nomad.sh
 
 mkdir -p /root/docker
 touch /root/docker/config.json
-# export ECR_AUTH_TOKEN=$(aws ecr get-authorization-token --output text --query 'authorizationData[].authorizationToken')
-cat <<EOF >/root/docker/config.json
+
+# Initial ECR token setup (without restarting Nomad since it's not running yet)
+echo "[$(date)] Setting up initial ECR token..."
+new_token=$(aws ecr get-authorization-token --region "${AWS_REGION}" --output text --query 'authorizationData[].authorizationToken' 2>/dev/null) || new_token=""
+
+if [ -n "$new_token" ]; then
+    cat <<EOF >/root/docker/config.json
 {
     "auths": {
         "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com": {
-            "auth": "$(aws ecr get-authorization-token --output text --query 'authorizationData[].authorizationToken')"
+            "auth": "$new_token"
         }
     }
 }
 EOF
+    echo "[$(date)] Initial ECR token configured successfully"
+else
+    echo "[$(date)] Warning: Failed to get initial ECR token"
+fi
+
+# Create ECR token refresh script
+cat <<'REFRESH_SCRIPT' >/usr/local/bin/refresh-ecr-token.sh
+#!/bin/bash
+# ECR Token Refresh Script for Build Node
+set -euo pipefail
+
+AWS_ACCOUNT_ID="${AWS_ACCOUNT_ID}"
+AWS_REGION="${AWS_REGION}"
+CONSUL_TOKEN="${CONSUL_TOKEN}"
+LOG_FILE="/var/log/ecr-token-refresh.log"
+
+# Log function: timestamped message to stdout and appended to LOG_FILE
+log() {
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
+}
+
+log "Starting ECR token refresh..."
+
+# Get new ECR token; the "||" fallback lets the ERROR branch below run under set -e
+new_token=$(aws ecr get-authorization-token --region "$AWS_REGION" --output text --query 'authorizationData[].authorizationToken' 2>/dev/null) || new_token=""
+
+if [ -n "$new_token" ]; then
+    # Update Docker config
+    cat <<EOF >/root/docker/config.json
+{
+    "auths": {
+        "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com": {
+            "auth": "$new_token"
+        }
+    }
+}
+EOF
+    log "ECR token refreshed successfully"
+
+    # Restart Nomad to pick up new token
+    if pgrep -f nomad > /dev/null; then
+        log "Restarting Nomad service..."
+        pkill -f nomad
+        sleep 3
+        /opt/nomad/bin/run-nomad.sh --consul-token "$CONSUL_TOKEN" >>"$LOG_FILE" 2>&1 &
+        log "Nomad restarted successfully"
+    else
+        log "Nomad not running, skipping restart"
+    fi
+else
+    log "ERROR: Failed to get ECR token"
+    exit 1
+fi
+REFRESH_SCRIPT
+
+# Make refresh script executable
+chmod +x /usr/local/bin/refresh-ecr-token.sh
+
+# Add cron job to refresh ECR token (fires at hours 0, 10 and 20, so at most 10 hours apart)
+cat <<CRON_JOB >/etc/cron.d/ecr-token-refresh
+# Refresh ECR token at 00:00, 10:00 and 20:00; stdout is dropped because log() already tees into the log file
+0 */10 * * * root /usr/local/bin/refresh-ecr-token.sh >/dev/null 2>>/var/log/ecr-token-refresh.log
+CRON_JOB
+
+# Ensure cron service is running
+systemctl enable cron
+systemctl start cron
 
 mkdir -p /etc/systemd/resolved.conf.d/
 touch /etc/systemd/resolved.conf.d/consul.conf
diff --git a/infra-iac/terraform/scripts/start-client.sh b/infra-iac/terraform/scripts/start-client.sh
index cd4ccd9..f5b386d 100755
--- a/infra-iac/terraform/scripts/start-client.sh
+++ b/infra-iac/terraform/scripts/start-client.sh
@@ -121,16 +121,88 @@ chmod +x /opt/consul/bin/run-consul.sh /opt/nomad/bin/run-nomad.sh
 
 mkdir -p /root/docker
 touch /root/docker/config.json
-# export ECR_AUTH_TOKEN=$(aws ecr get-authorization-token --output text --query 'authorizationData[].authorizationToken')
-cat <<EOF >/root/docker/config.json
+
+# Initial ECR token setup (without restarting Nomad since it's not running yet)
+echo "[$(date)] Setting up initial ECR token..."
+new_token=$(aws ecr get-authorization-token --region "${AWS_REGION}" --output text --query 'authorizationData[].authorizationToken' 2>/dev/null) || new_token=""
+
+if [ -n "$new_token" ]; then
+    cat <<EOF >/root/docker/config.json
 {
     "auths": {
         "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com": {
-            "auth": "$(aws ecr get-authorization-token --output text --query 'authorizationData[].authorizationToken')"
+            "auth": "$new_token"
         }
     }
 }
 EOF
+    echo "[$(date)] Initial ECR token configured successfully"
+else
+    echo "[$(date)] Warning: Failed to get initial ECR token"
+fi
+
+# Create ECR token refresh script
+cat <<'REFRESH_SCRIPT' >/usr/local/bin/refresh-ecr-token.sh
+#!/bin/bash
+# ECR Token Refresh Script for Client Node
+set -euo pipefail
+
+AWS_ACCOUNT_ID="${AWS_ACCOUNT_ID}"
+AWS_REGION="${AWS_REGION}"
+CONSUL_TOKEN="${CONSUL_TOKEN}"
+LOG_FILE="/var/log/ecr-token-refresh.log"
+
+# Log function: timestamped message to stdout and appended to LOG_FILE
+log() {
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
+}
+
+log "Starting ECR token refresh..."
+
+# Get new ECR token; the "||" fallback lets the ERROR branch below run under set -e
+new_token=$(aws ecr get-authorization-token --region "$AWS_REGION" --output text --query 'authorizationData[].authorizationToken' 2>/dev/null) || new_token=""
+
+if [ -n "$new_token" ]; then
+    # Update Docker config
+    cat <<EOF >/root/docker/config.json
+{
+    "auths": {
+        "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com": {
+            "auth": "$new_token"
+        }
+    }
+}
+EOF
+    log "ECR token refreshed successfully"
+
+    # Restart Nomad to pick up new token
+    if pgrep -f nomad > /dev/null; then
+        log "Restarting Nomad service..."
+        pkill -f nomad
+        sleep 3
+        /opt/nomad/bin/run-nomad.sh --consul-token "$CONSUL_TOKEN" >>"$LOG_FILE" 2>&1 &
+        log "Nomad restarted successfully"
+    else
+        log "Nomad not running, skipping restart"
+    fi
+else
+    log "ERROR: Failed to get ECR token"
+    exit 1
+fi
+REFRESH_SCRIPT
+
+# Make refresh script executable
+chmod +x /usr/local/bin/refresh-ecr-token.sh
+
+# Add cron job to refresh ECR token (fires at hours 0, 10 and 20, so at most 10 hours apart)
+cat <<CRON_JOB >/etc/cron.d/ecr-token-refresh
+# Refresh ECR token at 00:00, 10:00 and 20:00; stdout is dropped because log() already tees into the log file
+0 */10 * * * root /usr/local/bin/refresh-ecr-token.sh >/dev/null 2>>/var/log/ecr-token-refresh.log
+CRON_JOB
+
+# Ensure cron service is running
+systemctl enable cron
+systemctl start cron
 
 mkdir -p /etc/systemd/resolved.conf.d/
 touch /etc/systemd/resolved.conf.d/consul.conf