From 0814b040e11c4b1e26e22702f7ad8bc2c29eea82 Mon Sep 17 00:00:00 2001 From: Hardik Date: Wed, 24 Jun 2026 16:27:22 +0530 Subject: [PATCH] fix(automation): bound + detach the Claude run so a hang can't wedge or stick First live run on PR #126 left the foreground shell stuck: Claude pushed the commit itself and then did not exit, so the supervisor code after it (push + ack + handled marker) never ran. Under cron that also holds the flock lock, freezing every later run, and the comment never gets marked handled (so it would be re-processed forever). - Wrap the Claude invocation in `setsid timeout -k 30s "$CLAUDE_TIMEOUT"` (default 30m). `setsid` detaches from the controlling terminal so a lingering child can't stick an interactive run; `timeout` returns control to the supervisor, which still pushes any commits (idempotent if Claude pushed) and writes the handled marker. A timed-out run (rc=124) is logged, not fatal. - Give this watcher its own dev port (devPort, default 3101) distinct from the issue watcher's 3100, and reap it after each run -- no cross-watcher kill. - Reinforce the prompt: stop any dev server and END THE TURN; never push. Adds claudeTimeout + devPort to the example config and documents both. Co-Authored-By: Claude Opus 4.8 --- automation/README.md | 7 ++++ automation/claude-pr-review-watcher.sh | 33 +++++++++++++++---- .../pr-review-watcher.config.example.json | 4 ++- 3 files changed, 37 insertions(+), 7 deletions(-) diff --git a/automation/README.md b/automation/README.md index da8ef8d..6e09b9d 100644 --- a/automation/README.md +++ b/automation/README.md @@ -157,6 +157,13 @@ cp automation/pr-review-watcher.config.example.json ~/pr-review-watcher/pr-revie (`pr-review-.log`, per-PR `claude-pr--*.log`). - Same **auth preflight** as the issue watcher — no-ops until Claude Code is signed in on pms1 (or `ANTHROPIC_API_KEY` is set). +- **Bounded + detached run:** each Claude invocation is wrapped in `setsid timeout` + (`claudeTimeout`, default `30m`). `setsid` detaches it from any terminal (so a manual + run can't leave your shell stuck on a lingering child); `timeout` guarantees control + returns to the supervisor — so even a stuck/misbehaving run still gets its commits + pushed and its `handled:` marker written, and can never wedge the flock lock for later + cron runs. Runtime checks use **port `3101`** (`devPort`), distinct from the issue + watcher's `3100`, and the watcher reaps that port after every run. - A Windows `.ps1` port is not provided yet (pms1 is the sole worker); port it from `claude-issue-watcher.ps1` only if you need a failover. diff --git a/automation/claude-pr-review-watcher.sh b/automation/claude-pr-review-watcher.sh index cfe11cf..a76ebb0 100644 --- a/automation/claude-pr-review-watcher.sh +++ b/automation/claude-pr-review-watcher.sh @@ -37,6 +37,15 @@ MAX_PRS=$(cfg '.maxPrsPerRun // 1') MAX_COMMENTS=$(cfg '.maxCommentsPerPr // 20') CLAUDE=$(cfg .claudeExe) TURNS=$(cfg '.claudeMaxTurns // 150') +# Hard wall-clock cap on a single Claude run. --max-turns bounds turns but a single +# stuck turn (or a server Claude spawned) can still block forever -- which under cron +# would hold the flock lock and freeze every later run. `timeout` guarantees control +# returns to the supervisor so it can still push partial work + write the handled marker. +CLAUDE_TIMEOUT=$(cfg '.claudeTimeout // "30m"') +# Ephemeral dev-server port for Claude's runtime checks. DISTINCT from the issue +# watcher's 3100 so the two never collide if their cron runs overlap (3000=prod, +# 3100=autofix/issue-watcher, 3200=staging, 3101=this watcher). +DEV_PORT=$(cfg '.devPort // 3101') API="$FORGEJO_URL/api/v1" # Hidden marker the bot stamps on its acknowledgement comments. The "handled:" @@ -214,9 +223,9 @@ while [ "$p" -lt "$n_prs" ]; do printf '%s\n' " storage is local in this dev mode." printf '%s\n' "- Run integration tests after loading the env:" printf '%s\n' " cd App && set -a && . ./.env && set +a && pnpm test:integration" - printf '%s\n' "- If you need runtime verification you MAY start a dev server ON PORT 3100 ONLY:" - printf '%s\n' " cd App && pnpm dev -p 3100 (production runs on 3000 -- NEVER touch 3000)" - printf '%s\n' " Stop ONLY your own server by port ('fuser -k 3100/tcp'); NEVER a broad 'pkill -f next'." + printf '%s\n' "- If you need runtime verification you MAY start a dev server ON PORT $DEV_PORT ONLY:" + printf '%s\n' " cd App && pnpm dev -p $DEV_PORT (production runs on 3000 -- NEVER touch 3000)" + printf '%s\n' " Stop ONLY your own server by port ('fuser -k $DEV_PORT/tcp'); NEVER a broad 'pkill -f next'." printf '%s\n' "" printf '%s\n' "## Your job (PR policy: every code change ships with tests + docs)" printf '%s\n' "1. Make the focused changes the review comments ask for -- nothing more." @@ -227,15 +236,27 @@ while [ "$p" -lt "$n_prs" ]; do printf '%s\n' "4. Update any docs the change affects (App/README.md, App/CLAUDE.md, Docs/, CHANGELOG.md)." printf '%s\n' "5. Commit ALL changes to the current branch with a conventional message referencing #$num." printf '%s\n' "6. Do NOT push, do NOT switch branches, do NOT open/close PRs. The supervisor pushes." + printf '%s\n' "7. BEFORE you finish: stop any dev server you started ('fuser -k $DEV_PORT/tcp'), leave NO" + printf '%s\n' " background process running, and then END YOUR TURN. Do not wait or keep the session open." printf '%s\n' "If a comment is unclear, out of scope, or too risky to do unattended (migrations, payments," printf '%s\n' "permissions), make NO commit for it and explain why in CLAUDE_RESULT.md in the repo root." } > "$prompt_file" clog="$LOG_DIR/claude-pr-$num-$(date +%Y%m%d-%H%M%S).log" - log " Running Claude on PR #$num (log: $clog)" - ( cd "$WORKDIR" && "$CLAUDE" -p --dangerously-skip-permissions \ + log " Running Claude on PR #$num (log: $clog, timeout: $CLAUDE_TIMEOUT)" + # `timeout` sends TERM at the limit, then KILL 30s later, in its own session + # (-s/setsid via `setsid`) so the whole process group dies -- not just `claude`, + # but any dev server it spawned. Bounded so a stuck run can never wedge the lock. + ( cd "$WORKDIR" && setsid timeout -k 30s "$CLAUDE_TIMEOUT" \ + "$CLAUDE" -p --dangerously-skip-permissions \ --max-turns "$TURNS" --output-format text < "$prompt_file" > "$clog" 2>&1 ); rc=$? - log " Claude exited with code $rc for PR #$num" + if [ "$rc" -eq 124 ]; then + log " Claude TIMED OUT after $CLAUDE_TIMEOUT on PR #$num (rc=124) -- continuing with any committed work" + else + log " Claude exited with code $rc for PR #$num" + fi + # Backstop: reap a dev server the run may have left on this watcher's dev port. + fuser -k "$DEV_PORT/tcp" >/dev/null 2>&1 || true rm -f "$prompt_file" note="" diff --git a/automation/pr-review-watcher.config.example.json b/automation/pr-review-watcher.config.example.json index 8709972..4676c29 100644 --- a/automation/pr-review-watcher.config.example.json +++ b/automation/pr-review-watcher.config.example.json @@ -9,5 +9,7 @@ "maxPrsPerRun": 1, "maxCommentsPerPr": 20, "claudeExe": "/home/shad0w/.nvm/versions/node//bin/claude", - "claudeMaxTurns": 150 + "claudeMaxTurns": 150, + "claudeTimeout": "30m", + "devPort": 3101 }