feat(skill/playwright): unify shared-session CLI flow for web skills

This commit is contained in:
2026-03-04 12:19:57 +08:00
parent aa72b570e1
commit 8fc5c6e128
12 changed files with 411 additions and 236 deletions
+4
View File
@@ -18,6 +18,9 @@ description: "Generate images in Gemini web via browser automation, download res
- Ensure browser session can access Gemini (`https://gemini.google.com/app`).
- If login, captcha, or MFA is required, pause and ask user to complete it manually.
- Use the shared Playwright session policy across all skills:
- `export PLAYWRIGHT_SHARED_SESSION=codex-shared`
- Invoke Playwright CLI through `/Users/xd/java/xhs/tools/pw` (do not pass `--session` manually).
- Decide output directory before generation, for example:
- `/Users/xd/java/xhs/output/gemini`
@@ -126,4 +129,5 @@ Return:
## Scripts
- `/Users/xd/java/xhs/tools/pw`: Shared Playwright CLI entrypoint with fixed session + lock.
- `scripts/collect_downloads.py`: Collect recent downloaded images with fallback sources, dedupe, and manifest.
+1 -1
View File
@@ -1,4 +1,4 @@
interface:
display_name: "Gemini Image Web"
short_description: "Generate Gemini images via web, multi-request, dedupe, and manifest."
default_prompt: "Use $gemini-image-web to generate one image per Gemini request until target count is reached, download full-size outputs, then collect files with fallback source strategy, dedupe, and manifest metadata."
default_prompt: "Use $gemini-image-web with PLAYWRIGHT_SHARED_SESSION=codex-shared; run browser steps only through /Users/xd/java/xhs/tools/pw, generate one image per Gemini request until target count is reached, download full-size outputs, then collect files with fallback source strategy, dedupe, and manifest metadata."
+7 -2
View File
@@ -20,13 +20,17 @@ description: "Generate music in Gemini web via browser automation, download resu
- Ensure browser session can access Gemini (`https://gemini.google.com/app`).
- If login, captcha, or MFA is required, pause and ask user to complete it manually.
- Use the shared Playwright session policy across all skills:
- `export PLAYWRIGHT_SHARED_SESSION=codex-shared`
- Invoke Playwright CLI through `/Users/xd/java/xhs/tools/pw` (do not pass `--session` manually).
- Decide output directory before generation, for example:
- `/Users/xd/java/xhs/output/gemini-music`
Quick run:
```bash
bash scripts/run_music_flow.sh \
export PLAYWRIGHT_SHARED_SESSION=codex-shared
python3 scripts/run_music_flow.py \
--prompt "创作一段 90 BPM 的 lo-fi hiphop,温暖、夜晚、钢琴和刷镲,时长 30 秒。" \
--target /Users/xd/java/xhs/output/gemini-music \
--count 1
@@ -152,5 +156,6 @@ Return:
## Scripts
- `scripts/run_music_flow.sh`: End-to-end runner (login gate, enter music tool, generate, download MP3, collect files).
- `/Users/xd/java/xhs/tools/pw`: Shared Playwright CLI entrypoint with fixed session + lock.
- `scripts/run_music_flow.py`: End-to-end runner (login gate, enter music tool, generate, download MP3, collect files).
- `scripts/collect_downloads.py`: Collect recent downloaded audio files with fallback sources, dedupe, and manifest.
+1 -1
View File
@@ -1,4 +1,4 @@
interface:
display_name: "Gemini Music Web"
short_description: "Generate Gemini music via web with login gate and manifest."
default_prompt: "Use $gemini-music-web to run scripts/run_music_flow.sh: verify Gemini login, enter 创作音乐, generate tracks one-by-one, prefer MP3 download, then collect files with dedupe and manifest metadata."
default_prompt: "Use $gemini-music-web with PLAYWRIGHT_SHARED_SESSION=codex-shared; run scripts/run_music_flow.py (which drives the browser through /Users/xd/java/xhs/tools/pw) to verify login, generate music, prefer MP3 download, and collect deduped outputs with manifest."
@@ -151,6 +151,8 @@ def resolve_sources(raw_sources: list[str] | None) -> list[Path]:
if raw_sources:
return [Path(item).expanduser().resolve() for item in raw_sources]
auto_sources = discover_playwright_sources()
auto_sources.append((Path.cwd() / ".playwright-cli").resolve())
auto_sources.append((Path(__file__).resolve().parents[3] / ".playwright-cli").resolve())
auto_sources.append((Path.home() / "Downloads").resolve())
result: list[Path] = []
seen: set[Path] = set()
+232
View File
@@ -0,0 +1,232 @@
#!/usr/bin/env python3
"""Run Gemini music generation flow end-to-end via Playwright CLI."""
from __future__ import annotations
import argparse
import json
import os
import re
import subprocess
import sys
import time
from pathlib import Path
class FlowError(RuntimeError):
    """Raised when a subprocess command in the flow fails.

    ``run_command`` raises it with the exit code, the command line and any
    captured output in the message; ``main`` catches it, prints the message
    to stderr and returns exit status 1.
    """
def run_command(
    cmd: list[str], *, capture_output: bool = True, check: bool = True
) -> subprocess.CompletedProcess[str]:
    """Run *cmd* as a subprocess in text mode and return the completed process.

    Args:
        cmd: Argument vector; executed directly, without a shell.
        capture_output: When true, stdout is captured and stderr is merged
            into it, so ``proc.stdout`` holds the combined output. When
            false, the child inherits this process's streams and
            ``proc.stdout`` is ``None``.
        check: When true, a non-zero exit status raises ``FlowError``.

    Returns:
        The ``subprocess.CompletedProcess`` for the finished command.

    Raises:
        FlowError: If the command cannot be started at all, or if it exits
            non-zero while *check* is true.
    """
    try:
        proc = subprocess.run(
            cmd,
            text=True,
            stdout=subprocess.PIPE if capture_output else None,
            stderr=subprocess.STDOUT if capture_output else None,
        )
    except OSError as exc:
        # A missing or non-executable binary used to escape as a raw
        # traceback; report it through the same error type as other failures.
        raise FlowError(
            f"Command could not be started: {' '.join(cmd)}\n{exc}"
        ) from exc
    if check and proc.returncode != 0:
        output = (proc.stdout or "") if capture_output else ""
        raise FlowError(
            f"Command failed ({proc.returncode}): {' '.join(cmd)}\n{output}"
        )
    return proc
def run_pw(pw_shared: Path, *args: str) -> str:
    """Invoke the shared Playwright wrapper with *args* and return its output.

    The wrapper is run through ``run_command`` with output capture enabled,
    so a non-zero exit raises ``FlowError``. An empty string is returned when
    nothing was captured.
    """
    argv = [str(pw_shared)]
    argv.extend(args)
    completed = run_command(argv, capture_output=True)
    if completed.stdout:
        return completed.stdout
    return ""
def is_login_required(pw_shared: Path) -> bool:
    """Report whether the current page looks like a signed-out Gemini page.

    A JavaScript probe is evaluated in the page through the wrapper's
    ``eval`` command. The probe yields true when no Google-account button is
    present and either a ``ServiceLogin`` link or an ``a``/``button`` whose
    text matches ``登录`` / ``Sign in`` exists. The function returns ``True``
    when the wrapper output contains a line that is exactly ``true``.
    """
    probe_js = (
        "() => {"
        "const hasAccount = !!document.querySelector("
        "'button[aria-label*=\\\"Google 账号\\\"], "
        "button[aria-label*=\\\"Google Account\\\"]'"
        ");"
        "const hasService = !!document.querySelector('a[href*=\\\"ServiceLogin\\\"]');"
        "const hasLoginCtl = Array.from(document.querySelectorAll('a,button'))"
        ".some(el => /登录|Sign in/i.test((el.textContent || '').trim()));"
        "return !hasAccount && (hasService || hasLoginCtl);"
        "}"
    )
    eval_output = run_pw(pw_shared, "eval", probe_js)
    return re.search(r"(?m)^true$", eval_output) is not None
def enter_music_tool(pw_shared: Path) -> None:
    """Switch the open Gemini page into its music-creation tool.

    Sends a Playwright snippet to the wrapper's ``run-code`` command. The
    snippet first tries to click a button whose accessible name matches one
    of the ``labels`` patterns (2 s click timeout, failures ignored). If that
    does not succeed it clicks the ``工具`` button and looks for a matching
    ``menuitemcheckbox`` or ``menuitem``; the menu strategy is attempted up to
    twice. When nothing matches, the snippet throws
    ``Music tool entry not found``.

    NOTE(review): presumably the wrapper exits non-zero when the snippet
    throws, which would surface here as ``FlowError`` via ``run_pw`` —
    confirm against the Playwright CLI.
    NOTE(review): the menu button is matched by its Chinese name only, while
    the entry labels also accept English — verify this is intended.
    """
    # The snippet is a runtime string passed verbatim to the CLI; do not edit
    # it for formatting.
    js = r"""
async (page) => {
const labels = [/创作音乐/, /制作音乐/, /Create music/i, /Music/i];
const tryCardButtons = async () => {
for (const re of labels) {
const btn = page.getByRole('button', { name: re }).first();
if (await btn.count()) {
try {
await btn.click({ timeout: 2000 });
return true;
} catch (_) {
// Overlay may intercept pointer. Fall through to menu strategy.
}
}
}
return false;
};
const tryToolMenu = async () => {
await page.getByRole('button', { name: '工具', exact: true }).click();
for (const re of labels) {
const itemCheck = page.getByRole('menuitemcheckbox', { name: re }).first();
if (await itemCheck.count()) {
await itemCheck.click();
return true;
}
const itemPlain = page.getByRole('menuitem', { name: re }).first();
if (await itemPlain.count()) {
await itemPlain.click();
return true;
}
}
return false;
};
let ok = await tryCardButtons();
if (!ok) ok = await tryToolMenu();
if (!ok) ok = await tryToolMenu();
if (!ok) throw new Error('Music tool entry not found');
}
"""
    run_pw(pw_shared, "run-code", js)
def submit_and_download_one(pw_shared: Path, prompt: str) -> None:
    """Submit *prompt* to Gemini, wait for generation, and trigger a download.

    The prompt is embedded into the Playwright snippet as a JSON string
    literal (``json.dumps``), so quotes and newlines in it cannot break the
    generated JavaScript. The snippet:

    1. fills the prompt textbox and presses Enter;
    2. waits up to 15 s for the stop-response button to appear (a timeout
       here is ignored), then up to 240 s for it to disappear;
    3. clicks the last download-music button and picks the MP3 menu item,
       or otherwise the first menu item if one exists;
    4. waits 1.2 s before returning.

    NOTE(review): when the download menu has no items the snippet finishes
    without clicking anything, so no error is raised at this stage — confirm
    that the later collection step is the intended place to detect this.
    """
    # Doubled braces are literal braces in the f-string; only the prompt is
    # interpolated.
    js = f"""
async (page) => {{
const prompt = {json.dumps(prompt)};
const input = page.getByRole('textbox', {{ name: /为 Gemini 输入提示|Enter a prompt/i }}).first();
await input.click();
await input.fill(prompt);
await input.press('Enter');
const stopBtn = page.getByRole('button', {{ name: /停止回答|Stop response/i }}).first();
await stopBtn.waitFor({{ state: 'visible', timeout: 15000 }}).catch(() => {{}});
await stopBtn.waitFor({{ state: 'hidden', timeout: 240000 }});
const downloadBtn = page.getByRole('button', {{ name: /下载音乐作品|Download music/i }}).last();
await downloadBtn.click();
const mp3Item = page.getByRole('menuitem', {{ name: /纯音频|MP3/i }}).first();
if (await mp3Item.count()) {{
await mp3Item.click();
}} else {{
const anyItem = page.getByRole('menuitem').first();
if (await anyItem.count()) await anyItem.click();
}}
await page.waitForTimeout(1200);
}}
"""
    run_pw(pw_shared, "run-code", js)
def parse_args() -> argparse.Namespace:
    """Parse the command line for the music flow.

    Options: ``--prompt`` (required), ``--target`` (required), ``--count``
    (int, default 1) and the ``--no-headed`` flag.
    """
    cli = argparse.ArgumentParser(
        description="Generate music on Gemini web and collect downloaded files."
    )
    # Registered in this order so the generated --help output lists the
    # options in the same sequence.
    option_specs = (
        ("--prompt", {"required": True, "help": "Prompt text for music generation."}),
        (
            "--target",
            {"required": True, "help": "Absolute output directory for collected files."},
        ),
        (
            "--count",
            {"type": int, "default": 1, "help": "Number of tracks to generate. Default: 1."},
        ),
        (
            "--no-headed",
            {"action": "store_true", "help": "Run browser without headed mode."},
        ),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args()
def main() -> int:
    """Run the whole flow and return the process exit status.

    Returns 1 for an invalid ``--count``, a missing or non-executable
    wrapper, a missing collector script, or any ``FlowError``; 2 when Gemini
    requires login; otherwise the exit status of ``collect_downloads.py``.
    """
    args = parse_args()
    if args.count < 1:
        print("--count must be a positive integer.", file=sys.stderr)
        return 1

    script_path = Path(__file__).resolve()
    default_wrapper = script_path.parents[3] / "tools/pw"
    # PW_SHARED_WRAPPER overrides the repository's default wrapper location.
    pw_shared = Path(
        os.environ.get("PW_SHARED_WRAPPER", str(default_wrapper))
    ).expanduser()
    collect_script = (script_path.parent / "collect_downloads.py").resolve()

    if not (pw_shared.exists() and pw_shared.is_file()):
        print(f"Shared Playwright wrapper not found: {pw_shared}", file=sys.stderr)
        return 1
    if not os.access(pw_shared, os.X_OK):
        print(f"Shared Playwright wrapper is not executable: {pw_shared}", file=sys.stderr)
        return 1
    if not collect_script.exists():
        print(f"Collector script not found: {collect_script}", file=sys.stderr)
        return 1

    target = Path(args.target).expanduser().resolve()
    target.mkdir(parents=True, exist_ok=True)
    # Recorded before any browser work and later passed to the collector as
    # its --since value.
    start_ts = time.time()

    try:
        init_mode = "headless" if args.no_headed else "headed"
        os.environ["PLAYWRIGHT_SHARED_INIT_MODE"] = init_mode

        run_pw(pw_shared, "snapshot")
        run_pw(pw_shared, "goto", "https://gemini.google.com/app")
        run_pw(pw_shared, "snapshot")

        if is_login_required(pw_shared):
            print(
                "Gemini is not logged in. Please log in at https://gemini.google.com/app and rerun.",
                file=sys.stderr,
            )
            return 2

        enter_music_tool(pw_shared)

        wants_variants = args.count > 1
        for index in range(1, args.count + 1):
            if wants_variants:
                # Ask for a distinct variation on every request.
                track_prompt = (
                    f"{args.prompt}\n"
                    f"变体要求:这是第 {index} / {args.count} 首。保持风格一致,但旋律和节奏细节需要变化。"
                )
            else:
                track_prompt = args.prompt
            submit_and_download_one(pw_shared, track_prompt)

        collect_cmd = [sys.executable, str(collect_script)]
        collect_cmd += ["--target", str(target)]
        collect_cmd += ["--since", str(start_ts)]
        collect_cmd += ["--expected-count", str(args.count)]
        collect_cmd += ["--limit", str(args.count)]
        collect_cmd += ["--prefix", "gemini-music"]
        collect_cmd += ["--prompt", args.prompt]
        # The collector writes straight to this process's streams and its
        # exit status becomes ours.
        collected = run_command(collect_cmd, capture_output=False, check=False)
        return collected.returncode
    except FlowError as exc:
        print(str(exc), file=sys.stderr)
        return 1


if __name__ == "__main__":
    raise SystemExit(main())
@@ -1,230 +0,0 @@
#!/usr/bin/env bash
# End-to-end Gemini music runner (shell implementation).
# -e: exit on the first failing command; -u: unset variables are errors;
# -o pipefail: a pipeline fails when any of its stages fails.
set -euo pipefail
# Print the command-line synopsis and an example invocation to stdout.
usage() {
cat <<'EOF'
Usage:
run_music_flow.sh --prompt "<text>" --target /abs/output/dir [--count N] [--session NAME] [--no-headed]
Example:
run_music_flow.sh \
--prompt "创作一段 90 BPM 的 lo-fi hiphop,温暖、夜晚、钢琴和刷镲,时长 30 秒。" \
--target /Users/xd/java/xhs/output/gemini-music \
--count 2
EOF
}
# Option defaults. SESSION gets a per-run name derived from the current epoch
# second; HEADED=1 means the browser is opened with --headed.
PROMPT=""
TARGET=""
COUNT=1
SESSION="gmw$(date +%s)"
HEADED=1
# Consume arguments left to right; value-taking options shift two positions.
while [[ $# -gt 0 ]]; do
case "$1" in
--prompt)
PROMPT="${2:-}"
shift 2
;;
--target)
TARGET="${2:-}"
shift 2
;;
--count)
COUNT="${2:-1}"
shift 2
;;
--session)
SESSION="${2:-$SESSION}"
shift 2
;;
--no-headed)
HEADED=0
shift
;;
-h|--help)
usage
exit 0
;;
*)
echo "Unknown arg: $1" >&2
usage
exit 1
;;
esac
done
# Both the prompt and the output directory are mandatory.
if [[ -z "$PROMPT" || -z "$TARGET" ]]; then
echo "Both --prompt and --target are required." >&2
usage
exit 1
fi
# COUNT must consist of digits only and be at least 1.
if ! [[ "$COUNT" =~ ^[0-9]+$ ]] || [[ "$COUNT" -lt 1 ]]; then
echo "--count must be a positive integer." >&2
exit 1
fi
# Locate the Playwright CLI wrapper (overridable through PWCLI / CODEX_HOME)
# and the collector script that sits next to this file.
CODEX_HOME="${CODEX_HOME:-$HOME/.codex}"
PWCLI="${PWCLI:-$CODEX_HOME/skills/playwright/scripts/playwright_cli.sh}"
COLLECT_SCRIPT="$(cd "$(dirname "$0")" && pwd)/collect_downloads.py"
# Fail early when a required tool or script is missing.
if ! command -v npx >/dev/null 2>&1; then
echo "npx is required." >&2
exit 1
fi
if [[ ! -x "$PWCLI" ]]; then
echo "Playwright wrapper not found or not executable: $PWCLI" >&2
exit 1
fi
if [[ ! -f "$COLLECT_SCRIPT" ]]; then
echo "Collector script not found: $COLLECT_SCRIPT" >&2
exit 1
fi
# Run the Playwright CLI wrapper with this run's session name, forwarding all
# arguments unchanged.
pw() {
"$PWCLI" --session "$SESSION" "$@"
}
# Print $1 as a JSON string literal (including the surrounding quotes) by
# delegating to Python's json.dumps. The value is passed as an argument and
# the program comes from a quoted heredoc, so the shell expands nothing in it.
json_escape() {
python3 - "$1" <<'PY'
import json
import sys
print(json.dumps(sys.argv[1]))
PY
}
# Succeed (status 0) when the page looks signed out: the in-page probe finds
# no Google-account button and sees either a ServiceLogin link or an a/button
# whose text matches 登录 / Sign in. The decision is read from the CLI output
# by looking for a line that is exactly "true".
# Requires ripgrep (rg) on PATH; its presence is not checked in this script.
is_login_required() {
local out
out="$(
pw eval "() => {
const hasAccount = !!document.querySelector('button[aria-label*=\\\"Google 账号\\\"], button[aria-label*=\\\"Google Account\\\"]');
const hasService = !!document.querySelector('a[href*=\\\"ServiceLogin\\\"]');
const hasLoginCtl = Array.from(document.querySelectorAll('a,button')).some(el => /登录|Sign in/i.test((el.textContent || '').trim()));
return !hasAccount && (hasService || hasLoginCtl);
}"
)"
echo "$out" | rg -q '^true$'
}
# Switch Gemini into its music tool. The Playwright snippet first tries a
# button whose name matches one of the labels (2 s click timeout, failures
# ignored), then falls back to the 工具 menu, which is tried up to twice; it
# throws "Music tool entry not found" when nothing matches.
# The snippet comes from a quoted heredoc, so the shell performs no expansion
# inside it. CLI output is discarded.
enter_music_tool() {
local js
js="$(cat <<'JS'
const labels = [/创作音乐/, /制作音乐/, /Create music/i, /Music/i];
const tryCardButtons = async () => {
for (const re of labels) {
const btn = page.getByRole('button', { name: re }).first();
if (await btn.count()) {
try {
await btn.click({ timeout: 2000 });
return true;
} catch (_) {
// Overlay may intercept pointer. Fall through to menu strategy.
}
}
}
return false;
};
const tryToolMenu = async () => {
await page.getByRole('button', { name: '工具', exact: true }).click();
for (const re of labels) {
const itemCheck = page.getByRole('menuitemcheckbox', { name: re }).first();
if (await itemCheck.count()) {
await itemCheck.click();
return true;
}
const itemPlain = page.getByRole('menuitem', { name: re }).first();
if (await itemPlain.count()) {
await itemPlain.click();
return true;
}
}
return false;
};
let ok = await tryCardButtons();
if (!ok) ok = await tryToolMenu();
if (!ok) {
// Re-open the tool menu once and retry as a last attempt.
ok = await tryToolMenu();
}
if (!ok) {
throw new Error('Music tool entry not found');
}
JS
)"
pw run-code "$js" >/dev/null
}
# Submit one prompt ($1), wait for generation to finish and trigger a
# download. The prompt is JSON-escaped first so it can be embedded as a
# JavaScript string literal. This heredoc is unquoted on purpose: $escaped
# must be expanded by the shell.
# The snippet waits up to 15 s for the stop button to appear (timeout
# ignored) and up to 240 s for it to disappear, then clicks the last download
# button and prefers the MP3 menu item, otherwise the first menu item.
submit_and_download_one() {
local track_prompt="$1"
local escaped
escaped="$(json_escape "$track_prompt")"
local js
js="$(cat <<JS
const prompt = $escaped;
const input = page.getByRole('textbox', { name: /为 Gemini 输入提示|Enter a prompt/i }).first();
await input.click();
await input.fill(prompt);
await input.press('Enter');
const stopBtn = page.getByRole('button', { name: /停止回答|Stop response/i }).first();
await stopBtn.waitFor({ state: 'visible', timeout: 15000 }).catch(() => {});
await stopBtn.waitFor({ state: 'hidden', timeout: 240000 });
const downloadBtn = page.getByRole('button', { name: /下载音乐作品|Download music/i }).last();
await downloadBtn.click();
const mp3Item = page.getByRole('menuitem', { name: /纯音频|MP3/i }).first();
if (await mp3Item.count()) {
await mp3Item.click();
} else {
const anyItem = page.getByRole('menuitem').first();
if (await anyItem.count()) await anyItem.click();
}
await page.waitForTimeout(1200);
JS
)"
pw run-code "$js" >/dev/null
}
# ---- Main flow ----
mkdir -p "$TARGET"
# Start timestamp from Python's time.time(); passed to the collector below as
# its --since value.
start_ts="$(python3 - <<'PY'
import time
print(time.time())
PY
)"
# Open Gemini, headed unless --no-headed was given.
if [[ "$HEADED" -eq 1 ]]; then
pw open "https://gemini.google.com/app" --headed >/dev/null
else
pw open "https://gemini.google.com/app" >/dev/null
fi
pw snapshot >/dev/null
# Exit status 2 tells the caller that a manual login is needed.
if is_login_required; then
echo "Gemini is not logged in. Please log in at https://gemini.google.com/app and rerun." >&2
exit 2
fi
enter_music_tool
# One Gemini request per track; with several tracks a variation instruction
# is appended to the prompt on a second line.
for ((i=1; i<=COUNT; i++)); do
current_prompt="$PROMPT"
if [[ "$COUNT" -gt 1 ]]; then
current_prompt="$PROMPT
变体要求:这是第 $i / $COUNT 首。保持风格一致,但旋律和节奏细节需要变化。"
fi
submit_and_download_one "$current_prompt"
done
# Hand over to the collector; being the last command, its exit status is the
# script's exit status.
python3 "$COLLECT_SCRIPT" \
--target "$TARGET" \
--since "$start_ts" \
--expected-count "$COUNT" \
--limit "$COUNT" \
--prefix "gemini-music" \
--prompt "$PROMPT"
+7
View File
@@ -13,6 +13,13 @@ description: "Browse XiaoHongShu (小红书) with Playwright and execute account
4. Validate action success from UI state.
5. Summarize exactly what was done.
## Shared Session Policy
- Reuse a single Playwright CLI session across all web-automation skills:
- `export PLAYWRIGHT_SHARED_SESSION=codex-shared`
- Use `/Users/xd/java/xhs/tools/pw` as the only Playwright CLI entrypoint.
- Do not pass `--session` directly in skill-specific commands.
## 1) Confirm Login State
- Snapshot current page.
+1 -1
View File
@@ -1,7 +1,7 @@
interface:
display_name: "XHS Engage"
short_description: "Browse feed and engage via likes, favorites, and comments."
default_prompt: "Use $xiaohongshu-engage to browse XiaoHongShu and interact with relevant posts through likes, favorites, and concise comments."
default_prompt: "Use $xiaohongshu-engage with PLAYWRIGHT_SHARED_SESSION=codex-shared and execute browser steps only through /Users/xd/java/xhs/tools/pw while browsing XiaoHongShu and interacting with relevant posts through likes, favorites, and concise comments."
policy:
allow_implicit_invocation: true
+7
View File
@@ -16,6 +16,13 @@ description: "Execute XiaoHongShu (小红书) image-note publishing workflow in
7. Execute publish behavior according to publish mode.
8. Save publish evidence and return summary.
## Shared Session Policy
- Reuse a single Playwright CLI session across all web-automation skills:
- `export PLAYWRIGHT_SHARED_SESSION=codex-shared`
- Use `/Users/xd/java/xhs/tools/pw` as the only Playwright CLI entrypoint.
- Do not pass `--session` directly in skill-specific commands.
## 1) Enter Creator Publish Page
- From web homepage, click left sidebar `发布`.
@@ -1,4 +1,4 @@
interface:
display_name: "XHS Publish Note"
short_description: "Publish XHS image notes with manifest image linkage, hard gates, and publish modes"
default_prompt: "Use $xiaohongshu-publish-note to publish XiaoHongShu image notes: prefer user image paths, otherwise generate via $gemini-image-web and use manifest target paths, enforce hard gates (images>=1, topics>=5, no location), and run in safe_mode by default unless live_mode is explicitly requested."
default_prompt: "Use $xiaohongshu-publish-note with PLAYWRIGHT_SHARED_SESSION=codex-shared and run all browser steps via /Users/xd/java/xhs/tools/pw; publish XiaoHongShu image notes by preferring user image paths, otherwise generate via $gemini-image-web and use manifest target paths, enforce hard gates (images>=1, topics>=5, no location), and run in safe_mode by default unless live_mode is explicitly requested."
Executable
+148
View File
@@ -0,0 +1,148 @@
#!/usr/bin/env bash
set -euo pipefail

# Shared Playwright CLI entrypoint: every invocation uses one fixed session
# name and is serialised through a lock directory.

# A Playwright CLI sub-command is mandatory.
if (( $# < 1 )); then
  echo "Usage: tools/pw <playwright-cli-command> [args...]" >&2
  exit 1
fi

# The session name is chosen by this wrapper; callers may not override it.
for arg in "$@"; do
  case "$arg" in
    --session|--session=*)
      echo "Do not pass --session directly. Use PLAYWRIGHT_SHARED_SESSION." >&2
      exit 2
      ;;
  esac
done

# Configuration, each value overridable through the environment.
: "${CODEX_HOME:=$HOME/.codex}"
: "${PWCLI:=$CODEX_HOME/skills/playwright/scripts/playwright_cli.sh}"
SESSION="${PLAYWRIGHT_SHARED_SESSION:-codex-shared}"
LOCK_TIMEOUT="${PLAYWRIGHT_SHARED_LOCK_TIMEOUT:-120}"
LOCK_DIR="${PLAYWRIGHT_SHARED_LOCK_DIR:-/tmp/pw-shared-session.lock}"
INIT_MODE="${PLAYWRIGHT_SHARED_INIT_MODE:-headed}"

# Fail early when a required tool is missing.
command -v npx >/dev/null 2>&1 || {
  echo "npx is required." >&2
  exit 1
}
[[ -x "$PWCLI" ]] || {
  echo "Playwright wrapper not found or not executable: $PWCLI" >&2
  exit 1
}
# Print the modification time of "$1" in epoch seconds, or 0 when it cannot be
# determined (for example because the path has just been removed).
lock_dir_mtime() {
  # GNU syntax first: BSD stat rejects -c without writing to stdout, whereas
  # GNU `stat -f` prints file-system details that would pollute the value.
  stat -c %Y "$1" 2>/dev/null || stat -f %m "$1" 2>/dev/null || echo 0
}

# Take the shared-session lock, waiting up to LOCK_TIMEOUT seconds.
#
# The lock is the directory LOCK_DIR (mkdir is atomic); the holder records its
# PID in LOCK_DIR/pid. A lock is treated as stale and removed when its PID is
# no longer alive, or when it has had no PID file for at least 5 seconds (the
# holder died between mkdir and writing the PID).
# On success the lock is released automatically when the script exits.
# Exits with status 1 on timeout.
acquire_lock() {
  local start_ts now lock_pid lock_mtime
  start_ts="$(date +%s)"
  while ! mkdir "$LOCK_DIR" 2>/dev/null; do
    lock_pid=""
    if [[ -f "$LOCK_DIR/pid" ]]; then
      lock_pid="$(cat "$LOCK_DIR/pid" 2>/dev/null || true)"
    fi
    now="$(date +%s)"
    if [[ -z "$lock_pid" ]]; then
      lock_mtime="$(lock_dir_mtime "$LOCK_DIR")"
      # An mtime of 0 means the directory vanished while we looked at it;
      # removing it now could delete a lock another process just took, so
      # only reclaim when a real, sufficiently old timestamp was read.
      if (( lock_mtime > 0 && now - lock_mtime >= 5 )); then
        rm -rf "$LOCK_DIR" 2>/dev/null || true
        continue
      fi
    elif ! kill -0 "$lock_pid" 2>/dev/null; then
      # The recorded holder is gone: reclaim the lock.
      rm -rf "$LOCK_DIR" 2>/dev/null || true
      continue
    fi
    if (( now - start_ts >= LOCK_TIMEOUT )); then
      echo "Timeout waiting for Playwright shared-session lock: $LOCK_DIR" >&2
      exit 1
    fi
    sleep 1
  done
  printf "%s\n" "$$" > "$LOCK_DIR/pid"
  # Cleanup happens in the EXIT trap only. The signal traps must terminate
  # the script: a handler that merely returns would let it carry on without
  # holding the lock.
  trap 'rm -rf "$LOCK_DIR" 2>/dev/null || true' EXIT
  trap 'exit 130' INT
  trap 'exit 143' TERM
}
# Run the underlying Playwright CLI wrapper with the shared session name
# prepended, forwarding all arguments unchanged.
run_pw() {
"$PWCLI" --session "$SESSION" "$@"
}
# Succeed (status 0) when CLI output "$1" looks like it was produced because
# the shared browser session does not exist or is no longer open; return 1
# otherwise. Matching is case-insensitive.
is_missing_session_error() {
  local text="$1"
  # grep instead of ripgrep: rg is not among the tools this script verifies,
  # and without it the check silently reported "no match". The here-string
  # also avoids `echo | grep -q`, which under pipefail can turn a match into
  # a failure when grep exits early and echo receives SIGPIPE.
  if grep -Eqi 'session.*not found|no browser|not open|closed' <<<"$text"; then
    return 0
  fi
  return 1
}
# Start the shared session on about:blank. In "headless" mode only a plain
# open is attempted; in every other mode a headed open is tried first and a
# plain open is the fallback. Returns the status of the last attempt.
init_shared_session() {
  if [[ "$INIT_MODE" != "headless" ]]; then
    if run_pw open about:blank --headed >/dev/null 2>&1; then
      return 0
    fi
  fi
  run_pw open about:blank >/dev/null 2>&1
}

acquire_lock

cmd=("$@")
verb="${cmd[0]}"

if [[ "$verb" == "open" ]]; then
  # The first argument after the verb that is not an option is the URL.
  url=""
  for (( i = 1; i < ${#cmd[@]}; i++ )); do
    arg="${cmd[$i]}"
    if [[ "$arg" != -* ]]; then
      url="$arg"
      break
    fi
  done

  if [[ -n "$url" ]]; then
    # A successful snapshot shows the session is alive: navigate in place
    # rather than opening another browser.
    if out="$(run_pw snapshot 2>&1)"; then
      run_pw goto "$url"
      run_pw snapshot
      exit 0
    else
      code=$?
    fi
    # Any failure other than "no session" is reported as-is.
    if ! is_missing_session_error "$out"; then
      echo "$out" >&2
      exit $code
    fi
  fi
  # No URL, or no live session: fall through to the plain `open` below.
else
  if out="$(run_pw "${cmd[@]}" 2>&1)"; then
    echo "$out"
    exit 0
  else
    code=$?
  fi

  if ! is_missing_session_error "$out"; then
    echo "$out" >&2
    exit $code
  fi

  # The session is missing: create it, then retry the command once.
  if ! init_shared_session; then
    echo "Failed to initialize shared Playwright session." >&2
    exit 1
  fi
  run_pw "${cmd[@]}"
  exit 0
fi

run_pw "${cmd[@]}"