From 56d5a6de9600a6914b2b48d5b1ba3158ae621e27 Mon Sep 17 00:00:00 2001 From: kurihada Date: Sun, 1 Mar 2026 22:44:45 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=90=9C=E7=B4=A2=E5=92=8C?= =?UTF-8?q?=E7=94=A8=E6=88=B7=E4=B8=BB=E9=A1=B5=E7=9A=84=E7=AC=94=E8=AE=B0?= =?UTF-8?q?=20ID=20=E6=8F=90=E5=8F=96=E5=8F=8A=E9=A3=8E=E6=8E=A7=E7=BB=95?= =?UTF-8?q?=E8=BF=87=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 搜索页笔记 href 格式为 /search_result/,修正正则以兼容 /explore/ 和 /search_result/ - 用户主页笔记 href 格式为 /user/profile//,扩展正则并取正确捕获组 - 用户主页访问前先 warm-up 到 /explore,绕过 XHS headless IP 风控(code 300012) - xsec_source 改为 pc_feed 以匹配用户从 feed 页获取的 token 类型 - 新增 debug-search.ts / debug-qrcode.ts / debug-profile.ts 诊断脚本 --- scripts/debug-profile.ts | 39 ++++++++ scripts/debug-qrcode.ts | 106 +++++++++++++++++++++ scripts/debug-search.ts | 110 ++++++++++++++++++++++ src/platforms/xiaohongshu/search.ts | 2 +- src/platforms/xiaohongshu/user-profile.ts | 12 ++- 5 files changed, 265 insertions(+), 4 deletions(-) create mode 100644 scripts/debug-profile.ts create mode 100644 scripts/debug-qrcode.ts create mode 100644 scripts/debug-search.ts diff --git a/scripts/debug-profile.ts b/scripts/debug-profile.ts new file mode 100644 index 0000000..95863ac --- /dev/null +++ b/scripts/debug-profile.ts @@ -0,0 +1,39 @@ +import { chromium } from 'rebrowser-playwright'; +import { readFileSync } from 'node:fs'; +const COOKIE_FILE = `${process.env.HOME}/.social-mcp/xiaohongshu/cookies.json`; +const userId = '5b29b622e8ac2b5a12ae97fc'; +const xsecToken = 'ABrhIpSL55O66wuekMtlJUxsX4EpaNTlfCYwDo6UfKrrM='; + +async function main() { + const raw = JSON.parse(readFileSync(COOKIE_FILE, 'utf-8')); + const browser = await chromium.launch({ headless: true, args: ['--no-sandbox','--disable-setuid-sandbox','--disable-dev-shm-usage','--disable-gpu'] }); + const ctx = await browser.newContext({ storageState: raw }); + const page = await ctx.newPage(); + 
// Warm up: visit explore first + console.log('Warming up: visiting explore...'); + await page.goto('https://www.xiaohongshu.com/explore', { waitUntil: 'domcontentloaded' }); + await page.waitForTimeout(2000); + console.log('Explore title:', await page.title()); + + // Now try profile with pc_feed source (matching the token's source) + const url = `https://www.xiaohongshu.com/user/profile/${userId}?xsec_token=${encodeURIComponent(xsecToken)}&xsec_source=pc_feed`; + console.log('\nNavigating to profile (xsec_source=pc_feed)...'); + await page.goto(url, { waitUntil: 'domcontentloaded' }); + await page.waitForTimeout(2000); + console.log('title:', await page.title()); + console.log('url:', page.url().slice(0, 80)); + + const nickname = await page.$eval('.user-info .user-name', el => el.textContent?.trim() ?? '').catch(() => 'NOT FOUND'); + console.log('nickname:', nickname); + + const feeds = await page.$$('.feeds-container .note-item'); + console.log('note items:', feeds.length); + if (feeds.length > 0) { + const href = await feeds[0]!.$eval('a.cover', el => el.getAttribute('href') ?? 
'').catch(() => ''); + console.log('first note href:', href); + } + + await browser.close(); +} +main().catch(e => { console.error(e); process.exit(1); }); diff --git a/scripts/debug-qrcode.ts b/scripts/debug-qrcode.ts new file mode 100644 index 0000000..9b42fc7 --- /dev/null +++ b/scripts/debug-qrcode.ts @@ -0,0 +1,106 @@ +/** + * Standalone diagnostic script — run with: + * npx tsx scripts/debug-qrcode.ts + * + * Opens XHS explore page in a headed browser (headless: false) and dumps: + * - page title & URL after navigation + * - all element counts for candidate selectors + * - screenshot saved to /tmp/xhs-debug.png + */ + +import { chromium } from 'rebrowser-playwright'; +import { writeFileSync } from 'node:fs'; + +const EXPLORE_URL = 'https://www.xiaohongshu.com/explore'; +const WAIT_MS = 15_000; // wait 15s for SPA to settle + +const SELECTORS_TO_PROBE = [ + // Current (fixed) + 'img.qrcode-img', + '.qrcode-img', + // Original (broken) + '.login-container .qrcode-img', + // Login button candidates + '.login-btn', + 'button.login-btn', + // Logged-in indicator candidates + '.user .link-wrapper .channel', + // Modal/container + '.container', + 'div.container', + // QR area + '.code-area', + '.qrcode', + // Phone input (also in modal) + 'input[name="xhs-pc-web-phone"]', +]; + +async function main() { + console.log('Launching browser (headless: false)...'); + const browser = await chromium.launch({ + headless: false, + args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'], + }); + + const ctx = await browser.newContext(); + const page = await ctx.newPage(); + + console.log(`Navigating to ${EXPLORE_URL} ...`); + const t0 = Date.now(); + await page.goto(EXPLORE_URL, { waitUntil: 'domcontentloaded' }); + console.log(` domcontentloaded in ${Date.now() - t0}ms`); + console.log(` title: "${await page.title()}"`); + console.log(` url: ${page.url()}`); + + console.log(`\nWaiting ${WAIT_MS / 1000}s for SPA to render...`); + await 
page.waitForTimeout(WAIT_MS); + console.log(` title after wait: "${await page.title()}"`); + console.log(` url after wait: ${page.url()}`); + + console.log('\n--- Selector probe results ---'); + for (const sel of SELECTORS_TO_PROBE) { + const count = await page.locator(sel).count(); + const marker = count > 0 ? '✓' : '✗'; + console.log(` ${marker} [${count}] ${sel}`); + if (count > 0 && sel.includes('qrcode')) { + // Print src attribute if it's an img + try { + const src = await page.locator(sel).first().getAttribute('src'); + const preview = src ? src.slice(0, 60) + '...' : '(null)'; + console.log(` src: ${preview}`); + } catch {/* not an img */} + } + } + + // Dump all img srcs that look like QR codes (data URIs) + console.log('\n--- All with data: src on page ---'); + const imgs = await page.locator('img[src^="data:"]').all(); + for (const img of imgs) { + const cls = await img.getAttribute('class').catch(() => ''); + const src = await img.getAttribute('src').catch(() => ''); + console.log(` class="${cls}" src="${src?.slice(0, 80)}..."`); + } + + // Save screenshot + const screenshotPath = '/tmp/xhs-debug.png'; + await page.screenshot({ path: screenshotPath, fullPage: false }); + console.log(`\nScreenshot saved → ${screenshotPath}`); + + // Also dump page HTML around any element matching qrcode + console.log('\n--- Outer HTML of .qrcode (if found) ---'); + const qrDiv = page.locator('.qrcode').first(); + if (await qrDiv.count() > 0) { + const html = await qrDiv.evaluate((el: Element) => el.outerHTML.slice(0, 500)); + console.log(html); + } else { + console.log(' .qrcode not found'); + } + + await browser.close(); + console.log('\nDone.'); +} + +main().catch((err) => { + console.error('Fatal:', err); + process.exit(1); +}); diff --git a/scripts/debug-search.ts b/scripts/debug-search.ts new file mode 100644 index 0000000..45c25b1 --- /dev/null +++ b/scripts/debug-search.ts @@ -0,0 +1,110 @@ +/** + * Standalone diagnostic script for search: + * npx tsx 
scripts/debug-search.ts [keyword] + */ + +import { chromium } from 'rebrowser-playwright'; +import { readFileSync } from 'node:fs'; + +const keyword = process.argv[2] ?? '美食'; +const SEARCH_URL = `https://www.xiaohongshu.com/search_result?keyword=${encodeURIComponent(keyword)}`; +const COOKIE_FILE = `${process.env.HOME}/.social-mcp/xiaohongshu/cookies.json`; + +async function main() { + // Load cookies if available + let storageState: object | undefined; + try { + const raw = JSON.parse(readFileSync(COOKIE_FILE, 'utf-8')); + storageState = raw; + console.log(`Loaded cookies (${raw.cookies?.length ?? 0} cookies)`); + } catch { + console.log('No cookies found — running without session'); + } + + const browser = await chromium.launch({ + headless: true, + args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'], + }); + + const ctx = await browser.newContext(storageState ? { storageState: storageState as any } : {}); + const page = await ctx.newPage(); + + console.log(`\nNavigating to: ${SEARCH_URL}`); + await page.goto(SEARCH_URL, { waitUntil: 'domcontentloaded' }); + console.log(`title: "${await page.title()}"`); + console.log(`url: ${page.url()}`); + + console.log('\nWaiting 5s for SPA...'); + await page.waitForTimeout(5000); + console.log(`title after wait: "${await page.title()}"`); + console.log(`url after wait: ${page.url()}`); + + // --- __INITIAL_STATE__ --- + console.log('\n--- __INITIAL_STATE__ top-level keys ---'); + const state: any = await page.evaluate('window.__INITIAL_STATE__').catch(() => null); + if (!state) { + console.log(' __INITIAL_STATE__ not found!'); + } else { + const keys = Object.keys(state); + console.log(` keys: ${keys.join(', ')}`); + + // Print structure of each key + for (const k of keys) { + const v = state[k]; + if (v && typeof v === 'object') { + const subKeys = Object.keys(v); + console.log(` [${k}]: { ${subKeys.map((sk: string) => { + const sv = (v as any)[sk]; + return `${sk}: 
${Array.isArray(sv) ? `Array(${sv.length})` : typeof sv}`; + }).join(', ')} }`); + } else { + console.log(` [${k}]: ${typeof v}`); + } + } + + // Try to find feeds/notes arrays + console.log('\n--- Looking for feed/note arrays in __INITIAL_STATE__ ---'); + function findArrays(obj: any, path: string, depth = 0) { + if (depth > 3) return; + if (!obj || typeof obj !== 'object') return; + for (const [k, v] of Object.entries(obj)) { + const fullPath = path ? `${path}.${k}` : k; + if (Array.isArray(v) && v.length > 0) { + const sample = v[0]; + const sampleKeys = typeof sample === 'object' ? Object.keys(sample as object).slice(0, 6).join(', ') : typeof sample; + console.log(` Array found: ${fullPath} (length=${v.length}) sample keys: [${sampleKeys}]`); + } else if (v && typeof v === 'object' && !Array.isArray(v)) { + findArrays(v, fullPath, depth + 1); + } + } + } + findArrays(state, ''); + } + + // --- DOM selectors --- + console.log('\n--- DOM selector probe ---'); + const selectors = [ + '.feeds-container .note-item', + '.note-item', + '#global-search-result-container', + '.search-result', + '.result-container', + '[class*="note"]', + '[class*="feed"]', + '[class*="result"]', + ]; + for (const sel of selectors) { + const count = await page.locator(sel).count(); + if (count > 0) console.log(` ✓ [${count}] ${sel}`); + else console.log(` ✗ [0] ${sel}`); + } + + // Screenshot + const screenshotPath = '/tmp/xhs-search-debug.png'; + await page.screenshot({ path: screenshotPath }); + console.log(`\nScreenshot → ${screenshotPath}`); + + await browser.close(); +} + +main().catch((e) => { console.error(e); process.exit(1); }); diff --git a/src/platforms/xiaohongshu/search.ts b/src/platforms/xiaohongshu/search.ts index 4e1559a..60887b4 100644 --- a/src/platforms/xiaohongshu/search.ts +++ b/src/platforms/xiaohongshu/search.ts @@ -332,7 +332,7 @@ async function scrapeSearchResultsFromDom(page: Page): Promise { .$eval('a.cover', (el) => el.getAttribute('href') ?? 
'') .catch(() => ''); - const idMatch = href.match(/\/explore\/([a-f0-9]+)/); + const idMatch = href.match(/\/(?:explore|search_result)\/([a-f0-9]+)/); const tokenMatch = href.match(/xsec_token=([^&]+)/); const id = idMatch?.[1] ?? ''; const xsecToken = tokenMatch?.[1] ?? ''; diff --git a/src/platforms/xiaohongshu/user-profile.ts b/src/platforms/xiaohongshu/user-profile.ts index 9fd87d8..bb8b72f 100644 --- a/src/platforms/xiaohongshu/user-profile.ts +++ b/src/platforms/xiaohongshu/user-profile.ts @@ -129,9 +129,15 @@ export async function getUserProfile( userId: string, xsecToken: string, ): Promise { - const url = `${USER_PROFILE_BASE_URL}/${userId}?xsec_token=${encodeURIComponent(xsecToken)}&xsec_source=pc_note`; + const url = `${USER_PROFILE_BASE_URL}/${userId}?xsec_token=${encodeURIComponent(xsecToken)}&xsec_source=pc_feed`; log.debug({ userId, url }, 'Navigating to user profile page'); + // XHS applies stricter bot detection on profile pages than on search pages. + // Visiting the explore page first establishes a natural session context that + // allows the subsequent profile navigation to pass the IP-risk check. + await page.goto('https://www.xiaohongshu.com/explore', { waitUntil: 'domcontentloaded' }); + await page.waitForTimeout(1000); + await page.goto(url, { waitUntil: 'domcontentloaded' }); // Wait for the user profile header to appear. @@ -378,9 +384,9 @@ async function scrapeProfileFromDom( .$eval('a.cover', (el) => el.getAttribute('href') ?? '') .catch(() => ''); - const idMatch = href.match(/\/explore\/([a-f0-9]+)/); + const idMatch = href.match(/\/(?:explore|search_result)\/([a-f0-9]+)|\/user\/profile\/[^/]+\/([a-f0-9]+)/); const tokenMatch = href.match(/xsec_token=([^&]+)/); - const id = idMatch?.[1] ?? ''; + const id = idMatch?.[1] ?? idMatch?.[2] ?? ''; const noteXsecToken = tokenMatch?.[1] ?? ''; if (!id) continue;