From b9153c70c7a7e9427743f08c4f4aaf129da59218 Mon Sep 17 00:00:00 2001 From: kurihada Date: Wed, 4 Mar 2026 13:54:27 +0800 Subject: [PATCH] feat(skill/gemini-image-web): unify image flow with music/video skills --- skills/gemini-image-web/SKILL.md | 71 +++-- skills/gemini-image-web/agents/openai.yaml | 2 +- .../scripts/collect_downloads.py | 33 +- .../scripts/run_image_flow.py | 293 ++++++++++++++++++ 4 files changed, 364 insertions(+), 35 deletions(-) create mode 100755 skills/gemini-image-web/scripts/run_image_flow.py diff --git a/skills/gemini-image-web/SKILL.md b/skills/gemini-image-web/SKILL.md index 940d968..86f3c7d 100644 --- a/skills/gemini-image-web/SKILL.md +++ b/skills/gemini-image-web/SKILL.md @@ -7,12 +7,14 @@ description: "Generate images in Gemini web via browser automation, download res ## Workflow -1. Open Gemini web and confirm user is logged in. -2. Set output directory and target image count. -3. Send one image-generation prompt per request until target count is reached. -4. For each request, wait until generation ends (`停止回答` button disappears), then download. -5. Collect downloaded files into target folder with batch naming, dedupe, and manifest. -6. Return file paths, manifest path, and failure summary. +1. Open Gemini web and check whether the user is logged in. +2. If not logged in, stop and explicitly ask the user to log in. +3. If logged in, open `工具` and click `创作图片`/`制作图片`. +4. Set output directory and target image count. +5. Send one image-generation prompt per request until target count is reached. +6. For each request, wait until generation ends (`停止回答` button disappears), then download the latest image result. +7. Collect downloaded files into target folder with batch naming, dedupe, and manifest. +8. Return file paths, manifest path, and failure summary. 
## 1) Prerequisites @@ -22,16 +24,38 @@ description: "Generate images in Gemini web via browser automation, download res - `export PLAYWRIGHT_SHARED_SESSION=codex-shared` - Invoke Playwright CLI through `/Users/xd/java/xhs/tools/pw` (do not pass `--session` manually). - Decide output directory before generation, for example: - - `/Users/xd/java/xhs/output/gemini` + - `/Users/xd/java/xhs/output/gemini-image` -## 2) Open Gemini +Quick run: + +```bash +export PLAYWRIGHT_SHARED_SESSION=codex-shared +python3 scripts/run_image_flow.py \ + --prompt "生成一张电影感赛博朋克街景海报,夜晚霓虹,雨天反光,纵向构图。" \ + --target /Users/xd/java/xhs/output/gemini-image \ + --count 1 +``` + +## 2) Open Gemini And Enforce Login Gate - Navigate to Gemini app page. -- Confirm login state by checking account/avatar area. -- If not logged in, stop and ask user to complete login manually. +- Check login state via account/avatar area or login controls. +- If login controls are present (`登录`, `Sign in`, or `ServiceLogin` URL), stop immediately and ask user to log in. +- Continue only when login is confirmed. - If model selection is needed, choose a model that supports image output. -## 3) Multi-Image Generation Strategy +## 3) Enter Image Creation Tool + +- Click `工具`. +- Click image tool item by visible text priority: + - `创作图片` + - `制作图片` + - `Create image` + - `Image` +- If quick-intent card click is intercepted by overlay, retry via `工具` menu item. +- If image tool is not present after login is confirmed, stop and report capability unavailable for this account/region/model. + +## 4) Multi-Image Generation Strategy - Gemini web currently returns one image per request. - If user asks for `N` images, run `N` requests in sequence. @@ -44,7 +68,7 @@ Prompt construction rules: - Include visual style, lighting, composition, and aspect ratio. - Include banned elements only if user requests negative constraints. 
-## 4) Wait For Completion (Explicit End Condition) +## 5) Wait For Completion (Explicit End Condition) - After submit, wait for generation state to appear. - Treat generation as complete only when: @@ -52,14 +76,14 @@ Prompt construction rules: - latest assistant response has downloadable image action. - If refs are stale or state is unclear, re-snapshot and retry once. -## 5) Download Images +## 6) Download Images - Download from the latest assistant response block (not old history blocks). - Click `下载完整尺寸的图片`. - Wait for download completion toast/progress to end before next request. - Repeat until target count is reached or retry budget is exhausted. -## 6) Collect Downloaded Files +## 7) Collect Downloaded Files Use bundled script: @@ -67,11 +91,11 @@ Use bundled script: python3 scripts/collect_downloads.py \ --source /var/folders/.../playwright-mcp-output/ \ --source ~/Downloads \ - --target /ABS/PATH/TO/output/gemini \ + --target /ABS/PATH/TO/output/gemini-image \ --since \ --limit \ --expected-count \ - --prefix gemini \ + --prefix gemini-image \ --batch-id \ --prompt "" ``` @@ -80,17 +104,19 @@ Script behavior: - Source strategy: - Prefer Playwright temp download directory first. - - Fallback to `~/Downloads` when primary source has no matches. + - Also scan `.playwright-cli` and fallback to `~/Downloads`. - Filters to image extensions (`png,jpg,jpeg,webp`). - Uses batch naming (`--NN.ext`). - Dedupes by SHA-256 (current run + existing target files). - Captures dimensions (`width`, `height`) and writes JSON manifest. - Prints absolute output paths and manifest path. -## 7) Failure Handling By Step +## 8) Failure Handling By Step - Login step: - If login/captcha/MFA blocks, stop and ask user to complete manually. +- Tool-selection step: + - If `创作图片` is missing after login, stop and report unsupported capability. - Generation step: - If failed once, retry once with minimal prompt rewrite. 
- If still failing, record failure reason and continue remaining quota if requested. @@ -105,7 +131,7 @@ Script behavior: - If dedupe removes all files, return manifest with `no_files_after_dedupe`. - If collected count < required count, return `insufficient_files`. -## 8) Return Output +## 9) Return Output Return: @@ -115,13 +141,13 @@ Return: - manifest absolute path - retries, failures, and skipped duplicates -## 9) Reliability Rules +## 10) Reliability Rules -- Re-snapshot after navigation, model switch, and generation completion. +- Re-snapshot after navigation, tool switch, and generation completion. - If refs are stale or click intercepted, re-snapshot and retry once. - Do not assume static selectors across Gemini updates; rely on visible text and role-first matching. -## 10) Boundaries +## 11) Boundaries - Do not bypass login verification, captcha, paywalls, or security checks. - Do not submit disallowed or unsafe image prompts. @@ -130,4 +156,5 @@ Return: ## Scripts - `/Users/xd/java/xhs/tools/pw`: Shared Playwright CLI entrypoint with fixed session + lock. +- `scripts/run_image_flow.py`: End-to-end runner (login gate, enter image tool, generate, download image, collect files). - `scripts/collect_downloads.py`: Collect recent downloaded images with fallback sources, dedupe, and manifest. diff --git a/skills/gemini-image-web/agents/openai.yaml b/skills/gemini-image-web/agents/openai.yaml index e56e74b..7f48033 100644 --- a/skills/gemini-image-web/agents/openai.yaml +++ b/skills/gemini-image-web/agents/openai.yaml @@ -1,4 +1,4 @@ interface: display_name: "Gemini Image Web" short_description: "Generate Gemini images via web, multi-request, dedupe, and manifest." 
- default_prompt: "Use $gemini-image-web with PLAYWRIGHT_SHARED_SESSION=codex-shared; run browser steps only through /Users/xd/java/xhs/tools/pw, generate one image per Gemini request until target count is reached, download full-size outputs, then collect files with fallback source strategy, dedupe, and manifest metadata." + default_prompt: "Use $gemini-image-web with PLAYWRIGHT_SHARED_SESSION=codex-shared; run scripts/run_image_flow.py via /Users/xd/java/xhs/tools/pw-backed CLI flow to verify login, generate images, prefer full-size download, and collect deduped outputs with manifest." diff --git a/skills/gemini-image-web/scripts/collect_downloads.py b/skills/gemini-image-web/scripts/collect_downloads.py index 40b29ce..509a02d 100755 --- a/skills/gemini-image-web/scripts/collect_downloads.py +++ b/skills/gemini-image-web/scripts/collect_downloads.py @@ -58,8 +58,8 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument( "--prefix", - default="gemini", - help="Filename prefix for collected files. Default: gemini", + default="gemini-image", + help="Filename prefix for collected files. 
Default: gemini-image", ) parser.add_argument( "--batch-id", @@ -107,7 +107,7 @@ def collect_candidates(source: Path, since_ts: float, allowed_ext: set[str]) -> files: list[Path] = [] if not source.exists(): return files - for path in source.iterdir(): + for path in source.rglob("*"): if not path.is_file(): continue ext = path.suffix.lower().lstrip(".") @@ -125,7 +125,10 @@ def collect_candidates(source: Path, since_ts: float, allowed_ext: set[str]) -> def discover_playwright_sources() -> list[Path]: globs = ( + "/var/folders/*/*/T/playwright-mcp-output/*", + "/private/var/folders/*/*/T/playwright-mcp-output/*", "/var/folders/*/*/*/T/playwright-mcp-output/*", + "/private/var/folders/*/*/*/T/playwright-mcp-output/*", "/tmp/playwright-mcp-output/*", ) candidates: list[Path] = [] @@ -147,6 +150,8 @@ def resolve_sources(raw_sources: list[str] | None) -> list[Path]: if raw_sources: return [Path(item).expanduser().resolve() for item in raw_sources] auto_sources = discover_playwright_sources() + auto_sources.append((Path.cwd() / ".playwright-cli").resolve()) + auto_sources.append((Path(__file__).resolve().parents[3] / ".playwright-cli").resolve()) auto_sources.append((Path.home() / "Downloads").resolve()) result: list[Path] = [] seen: set[Path] = set() @@ -214,16 +219,23 @@ def iso_ts(ts: float) -> str: return datetime.fromtimestamp(ts, tz=timezone.utc).isoformat() -def select_source_candidates( +def collect_candidates_all_sources( sources: list[Path], since_ts: float, allowed_ext: set[str] -) -> tuple[Path | None, list[Path], list[dict[str, object]]]: +) -> tuple[list[Path], list[dict[str, object]]]: tried: list[dict[str, object]] = [] + merged: list[Path] = [] + seen: set[Path] = set() for source in sources: files = collect_candidates(source, since_ts, allowed_ext) tried.append({"source": str(source), "matches": len(files)}) - if files: - return source, files, tried - return None, [], tried + for file_path in files: + resolved = file_path.resolve() + if resolved in 
class FlowError(RuntimeError):
    """Raised when a subprocess command in the flow fails or cannot start."""


def run_command(
    cmd: list[str], *, capture_output: bool = True, check: bool = True
) -> subprocess.CompletedProcess[str]:
    """Run *cmd* and return the completed process.

    Args:
        cmd: Argument vector; executed without a shell.
        capture_output: When true, stdout is captured and stderr is merged
            into it. When false, the child inherits this process's streams.
        check: When true, a non-zero exit status raises ``FlowError``.

    Returns:
        The ``subprocess.CompletedProcess`` for the finished command.

    Raises:
        FlowError: The command exited non-zero (with ``check``) or could not
            be started at all (missing binary, permission denied, ...).
    """
    try:
        proc = subprocess.run(
            cmd,
            stdout=subprocess.PIPE if capture_output else None,
            stderr=subprocess.STDOUT if capture_output else None,
            text=True,
            # Decode explicitly as UTF-8: the output contains non-ASCII UI
            # text and must not depend on the caller's locale. Undecodable
            # bytes are replaced instead of aborting the whole flow.
            encoding="utf-8",
            errors="replace",
        )
    except OSError as exc:
        # Callers only handle FlowError; surface start-up failures the same way.
        raise FlowError(
            f"Command could not be started: {' '.join(cmd)}\n{exc}"
        ) from exc
    if check and proc.returncode != 0:
        output = (proc.stdout or "") if capture_output else ""
        raise FlowError(
            f"Command failed ({proc.returncode}): {' '.join(cmd)}\n{output}"
        )
    return proc


def run_pw(pw_shared: Path, *args: str) -> str:
    """Invoke the shared Playwright wrapper with *args* and return its output.

    The returned text is the captured stdout with stderr merged in, or an
    empty string when nothing was captured. Raises ``FlowError`` on failure.
    """
    proc = run_command([str(pw_shared), *args], capture_output=True)
    return proc.stdout or ""


def is_login_required(pw_shared: Path) -> bool:
    """Return True when the current page looks like it needs a Google login.

    A one-line JS predicate is evaluated through the wrapper's ``eval``
    command. Login is considered required when the page URL points at the
    Google sign-in flow, or when no account button is present and a
    ``ServiceLogin`` link or a "登录"/"Sign in" control is visible in the DOM.
    The result is read from the wrapper output as a line that is exactly
    ``true``.
    """
    out = run_pw(
        pw_shared,
        "eval",
        (
            "() => {"
            # A redirect to the sign-in host has no Gemini DOM to inspect,
            # so the URL itself is treated as a login signal.
            "const onLoginUrl = /ServiceLogin|accounts\\.google\\.com/i.test(location.href);"
            "const hasAccount = !!document.querySelector("
            "'button[aria-label*=\\\"Google 账号\\\"], "
            "button[aria-label*=\\\"Google Account\\\"]'"
            ");"
            "const hasService = !!document.querySelector('a[href*=\\\"ServiceLogin\\\"]');"
            "const hasLoginCtl = Array.from(document.querySelectorAll('a,button'))"
            ".some(el => /登录|Sign in/i.test((el.textContent || '').trim()));"
            "return onLoginUrl || (!hasAccount && (hasService || hasLoginCtl));"
            "}"
        ),
    )
    return bool(re.search(r"(?m)^true$", out))


def enter_image_tool(pw_shared: Path) -> None:
    """Switch Gemini into its image-creation tool.

    Tries a directly visible button first, then the tools menu (attempted
    twice). The JS throws ``Image tool entry not found`` when every strategy
    fails, which surfaces here as ``FlowError``.
    """
    # NOTE(review): the last label, /Image/i, matches any control whose name
    # contains "image"; confirm it cannot hit an unrelated button (e.g. an
    # upload control) before the menu strategy is tried.
    js = r"""
async (page) => {
const labels = [/创作图片/, /制作图片/, /Create image/i, /Image/i];

const openToolMenu = async () => {
  const cn = page.getByRole('button', { name: '工具', exact: true }).first();
  if (await cn.count()) {
    await cn.click();
    return true;
  }
  const generic = page.getByRole('button', { name: /工具|Tools/i }).first();
  if (await generic.count()) {
    await generic.click();
    return true;
  }
  return false;
};

const tryCardButtons = async () => {
  for (const re of labels) {
    const btn = page.getByRole('button', { name: re }).first();
    if (await btn.count()) {
      try {
        await btn.click({ timeout: 2000 });
        return true;
      } catch (_) {
        // Overlay may intercept pointer. Fall through to menu strategy.
      }
    }
  }
  return false;
};

const tryToolMenu = async () => {
  const opened = await openToolMenu();
  if (!opened) return false;
  for (const re of labels) {
    const itemCheck = page.getByRole('menuitemcheckbox', { name: re }).first();
    if (await itemCheck.count()) {
      await itemCheck.click();
      return true;
    }
    const itemPlain = page.getByRole('menuitem', { name: re }).first();
    if (await itemPlain.count()) {
      await itemPlain.click();
      return true;
    }
  }
  return false;
};

let ok = await tryCardButtons();
if (!ok) ok = await tryToolMenu();
if (!ok) ok = await tryToolMenu();
if (!ok) throw new Error('Image tool entry not found');
}
"""
    run_pw(pw_shared, "run-code", js)


# JS statements (not a full function) that type the prompt held in the JS
# constant `prompt`, submit it, and wait for the stop button to disappear.
_SUBMIT_AND_WAIT_JS = r"""
const input = page.getByRole('textbox', { name: /为 Gemini 输入提示|Enter a prompt/i }).first();
await input.click();
await input.fill(prompt);
await input.press('Enter');

const stopBtn = page.getByRole('button', { name: /停止回答|Stop response/i }).first();
await stopBtn.waitFor({ state: 'visible', timeout: 15000 }).catch(() => {});
await stopBtn.waitFor({ state: 'hidden', timeout: 240000 });
"""

# JS statements (not a full function) that click the last download button,
# pick a format from the follow-up menu when one is shown, and wait for the
# download to finish. Shared by the first attempt and the retry so both
# paths handle the menu and the failure toast the same way.
_DOWNLOAD_LATEST_JS = r"""
const downloadBtn = page.getByRole('button', { name: /下载完整尺寸的图片|下载图片|Download full size|Download image|Download/i }).last();
if (!(await downloadBtn.count())) {
  throw new Error('Image download button not found');
}

const downloadPromise = page.waitForEvent('download', { timeout: 45000 }).catch(() => null);
await downloadBtn.click();

const preferredItem = page.getByRole('menuitem', { name: /完整尺寸|Full size|PNG|JPG|JPEG|WEBP/i }).first();
if (await preferredItem.isVisible().catch(() => false)) {
  await preferredItem.click();
} else {
  const anyItem = page.getByRole('menuitem').first();
  if (await anyItem.isVisible().catch(() => false)) {
    await anyItem.click();
  }
}

const download = await downloadPromise;
if (!download) {
  const failedToast = page.getByText(/下载失败|Download failed|无法下载|保存失败/i).first();
  if (await failedToast.isVisible().catch(() => false)) {
    throw new Error('Image download failed');
  }
  throw new Error('Image download did not start');
}
await download.path().catch(() => null);
await page.waitForTimeout(800);
"""


def submit_and_download_one(pw_shared: Path, prompt: str) -> None:
    """Submit *prompt*, wait for generation to end, and download the image.

    The prompt is embedded as a JSON string literal, so quotes and newlines
    in it cannot break out of the generated script. Raises ``FlowError``
    when the wrapper reports a failure (missing button, download not
    started, download failed, timeouts).
    """
    js = "".join(
        (
            "async (page) => {\n",
            f"const prompt = {json.dumps(prompt)};\n",
            _SUBMIT_AND_WAIT_JS,
            _DOWNLOAD_LATEST_JS,
            "}\n",
        )
    )
    run_pw(pw_shared, "run-code", js)


def retry_click_latest_download(pw_shared: Path) -> None:
    """Re-trigger the download of the latest image without re-submitting.

    Uses the same download steps as ``submit_and_download_one`` (including
    the format menu and failure-toast checks). Raises ``FlowError`` when the
    download cannot be started.
    """
    js = "async (page) => {\n" + _DOWNLOAD_LATEST_JS + "}\n"
    run_pw(pw_shared, "run-code", js)


def parse_args() -> argparse.Namespace:
    """Parse command-line options: --prompt, --target, --count, --no-headed."""
    parser = argparse.ArgumentParser(
        description="Generate images on Gemini web and collect downloaded files."
    )
    parser.add_argument("--prompt", required=True, help="Prompt text for image generation.")
    parser.add_argument(
        "--target", required=True, help="Absolute output directory for collected files."
    )
    parser.add_argument(
        "--count", type=int, default=1, help="Number of images to generate. Default: 1."
    )
    parser.add_argument(
        "--no-headed",
        action="store_true",
        help="Run browser without headed mode.",
    )
    return parser.parse_args()


def _variant_prompt(prompt: str, index: int, total: int) -> str:
    """Return the prompt for request *index* of *total* (1-based).

    A single-image run uses the prompt unchanged; multi-image runs append a
    variation instruction so each request differs.
    """
    if total <= 1:
        return prompt
    return (
        f"{prompt}\n"
        f"变体要求:这是第 {index} / {total} 张。保持主题一致,但构图和光影细节需要变化。"
    )


def main() -> int:
    """Run the end-to-end flow and return a process exit code.

    Returns:
        0 when collection succeeds; 1 for invalid arguments, missing or
        non-executable tooling, an uncreatable target directory, or a failed
        flow step; 2 when Gemini is not logged in; otherwise the collector
        script's own exit code.
    """
    args = parse_args()
    if args.count < 1:
        print("--count must be a positive integer.", file=sys.stderr)
        return 1

    script_path = Path(__file__).resolve()
    repo_root = script_path.parents[3]
    pw_shared = Path(
        os.environ.get("PW_SHARED_WRAPPER", str(repo_root / "tools/pw"))
    ).expanduser()
    collect_script = (script_path.parent / "collect_downloads.py").resolve()

    if not pw_shared.exists() or not pw_shared.is_file():
        print(f"Shared Playwright wrapper not found: {pw_shared}", file=sys.stderr)
        return 1
    if not os.access(pw_shared, os.X_OK):
        print(f"Shared Playwright wrapper is not executable: {pw_shared}", file=sys.stderr)
        return 1
    if not collect_script.exists():
        print(f"Collector script not found: {collect_script}", file=sys.stderr)
        return 1

    target = Path(args.target).expanduser().resolve()
    try:
        target.mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        print(f"Cannot create target directory {target}: {exc}", file=sys.stderr)
        return 1
    # Only files newer than this timestamp are handed to the collector.
    start_ts = time.time()

    # Child processes inherit the environment, so the wrapper sees this mode.
    os.environ["PLAYWRIGHT_SHARED_INIT_MODE"] = (
        "headless" if args.no_headed else "headed"
    )

    try:
        run_pw(pw_shared, "snapshot")
        run_pw(pw_shared, "goto", "https://gemini.google.com/app")
        run_pw(pw_shared, "snapshot")

        if is_login_required(pw_shared):
            print(
                "Gemini is not logged in. Please log in at https://gemini.google.com/app and rerun.",
                file=sys.stderr,
            )
            return 2

        enter_image_tool(pw_shared)

        for index in range(1, args.count + 1):
            submit_and_download_one(
                pw_shared, _variant_prompt(args.prompt, index, args.count)
            )

        collect_cmd = [
            sys.executable,
            str(collect_script),
            "--target",
            str(target),
            "--since",
            str(start_ts),
            "--expected-count",
            str(args.count),
            "--limit",
            str(args.count),
            "--prefix",
            "gemini-image",
            "--prompt",
            args.prompt,
        ]
        first_try = run_command(collect_cmd, capture_output=False, check=False)
        if first_try.returncode == 0:
            return 0

        # Fallback: click latest image download button once and retry collection.
        try:
            retry_click_latest_download(pw_shared)
        except FlowError:
            # Best effort only; report the collector's original status.
            return first_try.returncode

        second_try = run_command(collect_cmd, capture_output=False, check=False)
        return second_try.returncode
    except FlowError as exc:
        print(str(exc), file=sys.stderr)
        return 1


if __name__ == "__main__":
    raise SystemExit(main())