修复搜索和用户主页的笔记 ID 提取及风控绕过问题

- 搜索页笔记 href 格式为 /search_result/<id>,修正正则以兼容 /explore/ 和 /search_result/
- 用户主页笔记 href 格式为 /user/profile/<userId>/<noteId>,扩展正则并取正确捕获组
- 用户主页访问前先 warm-up 到 /explore,绕过 XHS headless IP 风控(code 300012)
- xsec_source 改为 pc_feed 以匹配用户从 feed 页获取的 token 类型
- 新增 debug-search.ts / debug-qrcode.ts / debug-profile.ts 诊断脚本
This commit is contained in:
2026-03-01 22:44:45 +08:00
parent 8b39520ec7
commit 56d5a6de96
5 changed files with 265 additions and 4 deletions
+39
View File
@@ -0,0 +1,39 @@
import { chromium } from 'rebrowser-playwright';
import { readFileSync } from 'node:fs';
/**
 * Storage-state file (cookies) handed to the browser context in main().
 * HOME is not set on every platform (e.g. Windows), so fall back to
 * USERPROFILE instead of interpolating the literal string "undefined".
 */
const COOKIE_FILE = `${process.env.HOME ?? process.env.USERPROFILE ?? ''}/.social-mcp/xiaohongshu/cookies.json`;
/** Id of the profile to open; the first CLI argument overrides the built-in sample. */
const userId = process.argv[2] ?? '5b29b622e8ac2b5a12ae97fc';
/** xsec_token sent with the profile URL; the second CLI argument overrides the built-in sample. */
const xsecToken = process.argv[3] ?? 'ABrhIpSL55O66wuekMtlJUxsX4EpaNTlfCYwDo6UfKrrM=';
/**
 * Diagnostic run for the user-profile page.
 *
 * Loads the saved storage state from COOKIE_FILE, launches headless Chromium,
 * visits the explore page first as a warm-up, then opens the profile URL built
 * from `userId` / `xsecToken` with `xsec_source=pc_feed` and prints the page
 * title, URL, nickname, note-item count and the first note's href.
 *
 * The browser is always closed, including when a step throws.
 */
async function main(): Promise<void> {
  const raw = JSON.parse(readFileSync(COOKIE_FILE, 'utf-8'));
  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
  });

  try {
    const ctx = await browser.newContext({ storageState: raw });
    const page = await ctx.newPage();

    // Warm up: visit explore first
    console.log('Warming up: visiting explore...');
    await page.goto('https://www.xiaohongshu.com/explore', { waitUntil: 'domcontentloaded' });
    await page.waitForTimeout(2000);
    console.log('Explore title:', await page.title());

    // Now try profile with pc_feed source (matching the token's source)
    const url = `https://www.xiaohongshu.com/user/profile/${userId}?xsec_token=${encodeURIComponent(xsecToken)}&xsec_source=pc_feed`;
    console.log('\nNavigating to profile (xsec_source=pc_feed)...');
    await page.goto(url, { waitUntil: 'domcontentloaded' });
    await page.waitForTimeout(2000);

    console.log('title:', await page.title());
    console.log('url:', page.url().slice(0, 80));

    // A missing header element is reported rather than treated as fatal.
    const nickname = await page
      .$eval('.user-info .user-name', el => el.textContent?.trim() ?? '')
      .catch(() => 'NOT FOUND');
    console.log('nickname:', nickname);

    const feeds = await page.$$('.feeds-container .note-item');
    console.log('note items:', feeds.length);

    // Guarded destructure instead of a non-null assertion on feeds[0].
    const [firstFeed] = feeds;
    if (firstFeed) {
      const href = await firstFeed
        .$eval('a.cover', el => el.getAttribute('href') ?? '')
        .catch(() => '');
      console.log('first note href:', href);
    }
  } finally {
    // Release the Chromium process even if navigation or a probe failed.
    await browser.close();
  }
}

main().catch(e => { console.error(e); process.exit(1); });
+106
View File
@@ -0,0 +1,106 @@
/**
* Standalone diagnostic script — run with:
* npx tsx scripts/debug-qrcode.ts
*
* Opens XHS explore page in headed (non-headless) mode and dumps:
* - page title & URL after navigation
* - all element counts for candidate selectors
* - screenshot saved to /tmp/xhs-debug.png
*/
import { chromium } from 'rebrowser-playwright';
import { writeFileSync } from 'node:fs';
/** Page the probe navigates to. */
const EXPLORE_URL = 'https://www.xiaohongshu.com/explore';

/** Time given to the SPA to settle before probing, in milliseconds (15 s). */
const WAIT_MS = 15 * 1000;

/**
 * CSS selectors whose match counts are reported, grouped by purpose and
 * flattened in declaration order.
 */
const SELECTORS_TO_PROBE: string[] = [
  // Current (fixed) QR image selectors
  ['img.qrcode-img', '.qrcode-img'],
  // Original (broken) QR image selector
  ['.login-container .qrcode-img'],
  // Login button candidates
  ['.login-btn', 'button.login-btn'],
  // Logged-in indicator candidates
  ['.user .link-wrapper .channel'],
  // Modal / container
  ['.container', 'div.container'],
  // QR area
  ['.code-area', '.qrcode'],
  // Phone input (also in modal)
  ['input[name="xhs-pc-web-phone"]'],
].flat();
/**
 * Probes the XHS explore page for login / QR-code selectors.
 *
 * Launches Chromium with `headless: false`, loads EXPLORE_URL, waits WAIT_MS
 * for the SPA to render, then:
 *  - prints the match count for every entry in SELECTORS_TO_PROBE (plus a
 *    truncated `src` for selectors containing "qrcode"),
 *  - lists every `<img>` whose src is a data URI,
 *  - saves a viewport screenshot to /tmp/xhs-debug.png,
 *  - dumps the first 500 characters of the first `.qrcode` element's HTML.
 *
 * The browser is always closed, including when a step throws.
 */
async function main(): Promise<void> {
  console.log('Launching browser (headless: false)...');
  const browser = await chromium.launch({
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
  });

  try {
    const ctx = await browser.newContext();
    const page = await ctx.newPage();

    console.log(`Navigating to ${EXPLORE_URL} ...`);
    const t0 = Date.now();
    await page.goto(EXPLORE_URL, { waitUntil: 'domcontentloaded' });
    console.log(` domcontentloaded in ${Date.now() - t0}ms`);
    console.log(` title: "${await page.title()}"`);
    console.log(` url: ${page.url()}`);

    console.log(`\nWaiting ${WAIT_MS / 1000}s for SPA to render...`);
    await page.waitForTimeout(WAIT_MS);
    console.log(` title after wait: "${await page.title()}"`);
    console.log(` url after wait: ${page.url()}`);

    console.log('\n--- Selector probe results ---');
    for (const sel of SELECTORS_TO_PROBE) {
      const count = await page.locator(sel).count();
      const marker = count > 0 ? '✓' : '✗';
      console.log(` ${marker} [${count}] ${sel}`);
      if (count > 0 && sel.includes('qrcode')) {
        // Print src attribute if it's an img
        try {
          const src = await page.locator(sel).first().getAttribute('src');
          const preview = src ? src.slice(0, 60) + '...' : '(null)';
          console.log(` src: ${preview}`);
        } catch {/* not an img */}
      }
    }

    // Dump all img srcs that look like QR codes (data URIs)
    console.log('\n--- All <img> with data: src on page ---');
    const imgs = await page.locator('img[src^="data:"]').all();
    for (const img of imgs) {
      const cls = await img.getAttribute('class').catch(() => '');
      const src = await img.getAttribute('src').catch(() => '');
      console.log(` class="${cls}" src="${src?.slice(0, 80)}..."`);
    }

    // Save screenshot (viewport only)
    const screenshotPath = '/tmp/xhs-debug.png';
    await page.screenshot({ path: screenshotPath, fullPage: false });
    console.log(`\nScreenshot saved → ${screenshotPath}`);

    // Also dump page HTML around any element matching qrcode
    console.log('\n--- Outer HTML of .qrcode (if found) ---');
    const qrDiv = page.locator('.qrcode').first();
    if (await qrDiv.count() > 0) {
      const html = await qrDiv.evaluate((el: Element) => el.outerHTML.slice(0, 500));
      console.log(html);
    } else {
      console.log(' .qrcode not found');
    }
  } finally {
    // Release the Chromium process even if navigation or a probe failed.
    await browser.close();
  }

  console.log('\nDone.');
}

main().catch((err) => {
  console.error('Fatal:', err);
  process.exit(1);
});
+110
View File
@@ -0,0 +1,110 @@
/**
* Standalone diagnostic script for search:
* npx tsx scripts/debug-search.ts [keyword]
*/
import { chromium } from 'rebrowser-playwright';
import { readFileSync } from 'node:fs';
/** Search term: the first CLI argument, defaulting to '美食'. */
const keyword = process.argv[2] ?? '美食';
/** Search results page for `keyword` (URL-encoded). */
const SEARCH_URL = `https://www.xiaohongshu.com/search_result?keyword=${encodeURIComponent(keyword)}`;
/**
 * Storage-state file (cookies) loaded by main() when present.
 * HOME is not set on every platform (e.g. Windows), so fall back to
 * USERPROFILE instead of interpolating the literal string "undefined".
 */
const COOKIE_FILE = `${process.env.HOME ?? process.env.USERPROFILE ?? ''}/.social-mcp/xiaohongshu/cookies.json`;
/**
 * Recursively logs every non-empty array found in `obj`, with its dotted path,
 * length and up to six keys of its first element.
 *
 * Recursion stops below depth 3. Arrays themselves are not descended into.
 * A `null` first element is reported as "null": `typeof null` is 'object', so
 * it must be excluded before calling Object.keys, which throws on null.
 */
function findArrays(obj: unknown, path: string, depth = 0): void {
  if (depth > 3) return;
  if (!obj || typeof obj !== 'object') return;

  for (const [k, v] of Object.entries(obj)) {
    const fullPath = path ? `${path}.${k}` : k;
    if (Array.isArray(v)) {
      if (v.length === 0) continue;
      const sample: unknown = v[0];
      let sampleKeys: string;
      if (sample === null) {
        sampleKeys = 'null';
      } else if (typeof sample === 'object') {
        sampleKeys = Object.keys(sample).slice(0, 6).join(', ');
      } else {
        sampleKeys = typeof sample;
      }
      console.log(` Array found: ${fullPath} (length=${v.length}) sample keys: [${sampleKeys}]`);
    } else if (v && typeof v === 'object') {
      findArrays(v, fullPath, depth + 1);
    }
  }
}

/**
 * Diagnostic run for the search results page.
 *
 * Optionally loads the saved storage state from COOKIE_FILE, launches headless
 * Chromium, opens SEARCH_URL, waits 5 s for the SPA, then prints the structure
 * of `window.__INITIAL_STATE__`, the match count of a set of candidate DOM
 * selectors, and saves a screenshot to /tmp/xhs-search-debug.png.
 *
 * The browser is always closed, including when a step throws.
 */
async function main(): Promise<void> {
  // Load cookies if available; any read/parse failure means "no session".
  let storageState: object | undefined;
  try {
    const raw = JSON.parse(readFileSync(COOKIE_FILE, 'utf-8'));
    storageState = raw;
    console.log(`Loaded cookies (${raw.cookies?.length ?? 0} cookies)`);
  } catch {
    console.log('No cookies found — running without session');
  }

  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
  });

  try {
    const ctx = await browser.newContext(storageState ? { storageState: storageState as any } : {});
    const page = await ctx.newPage();

    console.log(`\nNavigating to: ${SEARCH_URL}`);
    await page.goto(SEARCH_URL, { waitUntil: 'domcontentloaded' });
    console.log(`title: "${await page.title()}"`);
    console.log(`url: ${page.url()}`);

    console.log('\nWaiting 5s for SPA...');
    await page.waitForTimeout(5000);
    console.log(`title after wait: "${await page.title()}"`);
    console.log(`url after wait: ${page.url()}`);

    // --- __INITIAL_STATE__ ---
    console.log('\n--- __INITIAL_STATE__ top-level keys ---');
    const state: any = await page.evaluate('window.__INITIAL_STATE__').catch(() => null);
    if (!state) {
      console.log(' __INITIAL_STATE__ not found!');
    } else {
      console.log(` keys: ${Object.keys(state).join(', ')}`);

      // Print structure of each key
      for (const [k, v] of Object.entries(state)) {
        if (v && typeof v === 'object') {
          const summary = Object.entries(v)
            .map(([sk, sv]) => `${sk}: ${Array.isArray(sv) ? `Array(${sv.length})` : typeof sv}`)
            .join(', ');
          console.log(` [${k}]: { ${summary} }`);
        } else {
          console.log(` [${k}]: ${typeof v}`);
        }
      }

      // Try to find feeds/notes arrays
      console.log('\n--- Looking for feed/note arrays in __INITIAL_STATE__ ---');
      findArrays(state, '');
    }

    // --- DOM selectors ---
    console.log('\n--- DOM selector probe ---');
    const selectors = [
      '.feeds-container .note-item',
      '.note-item',
      '#global-search-result-container',
      '.search-result',
      '.result-container',
      '[class*="note"]',
      '[class*="feed"]',
      '[class*="result"]',
    ];
    for (const sel of selectors) {
      const count = await page.locator(sel).count();
      if (count > 0) console.log(` ✓ [${count}] ${sel}`);
      else console.log(` ✗ [0] ${sel}`);
    }

    // Screenshot
    const screenshotPath = '/tmp/xhs-search-debug.png';
    await page.screenshot({ path: screenshotPath });
    console.log(`\nScreenshot → ${screenshotPath}`);
  } finally {
    // Release the Chromium process even if navigation or a probe failed.
    await browser.close();
  }
}

main().catch((e) => { console.error(e); process.exit(1); });
+1 -1
View File
@@ -332,7 +332,7 @@ async function scrapeSearchResultsFromDom(page: Page): Promise<Feed[]> {
.$eval('a.cover', (el) => el.getAttribute('href') ?? '')
.catch(() => '');
const idMatch = href.match(/\/explore\/([a-f0-9]+)/);
const idMatch = href.match(/\/(?:explore|search_result)\/([a-f0-9]+)/);
const tokenMatch = href.match(/xsec_token=([^&]+)/);
const id = idMatch?.[1] ?? '';
const xsecToken = tokenMatch?.[1] ?? '';
+9 -3
View File
@@ -129,9 +129,15 @@ export async function getUserProfile(
userId: string,
xsecToken: string,
): Promise<UserProfile> {
const url = `${USER_PROFILE_BASE_URL}/${userId}?xsec_token=${encodeURIComponent(xsecToken)}&xsec_source=pc_note`;
const url = `${USER_PROFILE_BASE_URL}/${userId}?xsec_token=${encodeURIComponent(xsecToken)}&xsec_source=pc_feed`;
log.debug({ userId, url }, 'Navigating to user profile page');
// XHS applies stricter bot detection on profile pages than on search pages.
// Visiting the explore page first establishes a natural session context that
// allows the subsequent profile navigation to pass the IP-risk check.
await page.goto('https://www.xiaohongshu.com/explore', { waitUntil: 'domcontentloaded' });
await page.waitForTimeout(1000);
await page.goto(url, { waitUntil: 'domcontentloaded' });
// Wait for the user profile header to appear.
@@ -378,9 +384,9 @@ async function scrapeProfileFromDom(
.$eval('a.cover', (el) => el.getAttribute('href') ?? '')
.catch(() => '');
const idMatch = href.match(/\/explore\/([a-f0-9]+)/);
const idMatch = href.match(/\/(?:explore|search_result)\/([a-f0-9]+)|\/user\/profile\/[^/]+\/([a-f0-9]+)/);
const tokenMatch = href.match(/xsec_token=([^&]+)/);
const id = idMatch?.[1] ?? '';
const id = idMatch?.[1] ?? idMatch?.[2] ?? '';
const noteXsecToken = tokenMatch?.[1] ?? '';
if (!id) continue;