修复搜索和用户主页的笔记 ID 提取及风控绕过问题
- 搜索页笔记 href 格式为 /search_result/<id>,修正正则以兼容 /explore/ 和 /search_result/
- 用户主页笔记 href 格式为 /user/profile/<userId>/<noteId>,扩展正则并取正确捕获组
- 用户主页访问前先 warm-up 到 /explore,绕过 XHS headless IP 风控(code 300012)
- xsec_source 改为 pc_feed 以匹配用户从 feed 页获取的 token 类型
- 新增 debug-search.ts / debug-qrcode.ts / debug-profile.ts 诊断脚本
This commit is contained in:
@@ -0,0 +1,39 @@
|
|||||||
|
import { chromium } from 'rebrowser-playwright';
|
||||||
|
import { readFileSync } from 'node:fs';
|
||||||
|
const COOKIE_FILE = `${process.env.HOME}/.social-mcp/xiaohongshu/cookies.json`;
|
||||||
|
const userId = '5b29b622e8ac2b5a12ae97fc';
|
||||||
|
const xsecToken = 'ABrhIpSL55O66wuekMtlJUxsX4EpaNTlfCYwDo6UfKrrM=';
|
||||||
|
|
||||||
|
/**
 * Diagnostic entry point for the user-profile page.
 *
 * Launches headless Chromium with the storage state parsed from COOKIE_FILE,
 * visits the explore page first, then opens the profile URL built from
 * `userId` / `xsecToken` with `xsec_source=pc_feed`, and logs the page title,
 * URL, nickname, note-item count and the first note's href.
 *
 * The browser is closed in a `finally` block so that a failed navigation or
 * page query does not skip the shutdown.
 */
async function main(): Promise<void> {
  const raw = JSON.parse(readFileSync(COOKIE_FILE, 'utf-8'));
  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
  });

  try {
    const ctx = await browser.newContext({ storageState: raw });
    const page = await ctx.newPage();

    // Warm up: visit explore first
    console.log('Warming up: visiting explore...');
    await page.goto('https://www.xiaohongshu.com/explore', { waitUntil: 'domcontentloaded' });
    await page.waitForTimeout(2000);
    console.log('Explore title:', await page.title());

    // Now try profile with pc_feed source (matching the token's source)
    const url = `https://www.xiaohongshu.com/user/profile/${userId}?xsec_token=${encodeURIComponent(xsecToken)}&xsec_source=pc_feed`;
    console.log('\nNavigating to profile (xsec_source=pc_feed)...');
    await page.goto(url, { waitUntil: 'domcontentloaded' });
    await page.waitForTimeout(2000);
    console.log('title:', await page.title());
    // Only the first 80 characters are printed to keep the log line short.
    console.log('url:', page.url().slice(0, 80));

    // A missing element makes $eval reject; report that as 'NOT FOUND' instead of aborting.
    const nickname = await page
      .$eval('.user-info .user-name', el => el.textContent?.trim() ?? '')
      .catch(() => 'NOT FOUND');
    console.log('nickname:', nickname);

    const feeds = await page.$$('.feeds-container .note-item');
    console.log('note items:', feeds.length);

    // Destructure instead of `feeds[0]!` so the empty case is handled by a real guard.
    const [firstNote] = feeds;
    if (firstNote) {
      const href = await firstNote
        .$eval('a.cover', el => el.getAttribute('href') ?? '')
        .catch(() => '');
      console.log('first note href:', href);
    }
  } finally {
    await browser.close();
  }
}
|
||||||
|
main().catch(e => { console.error(e); process.exit(1); });
|
||||||
@@ -0,0 +1,106 @@
|
|||||||
|
/**
|
||||||
|
* Standalone diagnostic script — run with:
|
||||||
|
* npx tsx scripts/debug-qrcode.ts
|
||||||
|
*
|
||||||
|
* Opens XHS explore page in a headed browser (headless: false) and dumps:
|
||||||
|
* - page title & URL after navigation
|
||||||
|
* - all element counts for candidate selectors
|
||||||
|
* - screenshot saved to /tmp/xhs-debug.png
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { chromium } from 'rebrowser-playwright';
|
||||||
|
import { writeFileSync } from 'node:fs';
|
||||||
|
|
||||||
|
const EXPLORE_URL = 'https://www.xiaohongshu.com/explore';
|
||||||
|
const WAIT_MS = 15_000; // wait 15s for SPA to settle
|
||||||
|
|
||||||
|
const SELECTORS_TO_PROBE = [
|
||||||
|
// Current (fixed)
|
||||||
|
'img.qrcode-img',
|
||||||
|
'.qrcode-img',
|
||||||
|
// Original (broken)
|
||||||
|
'.login-container .qrcode-img',
|
||||||
|
// Login button candidates
|
||||||
|
'.login-btn',
|
||||||
|
'button.login-btn',
|
||||||
|
// Logged-in indicator candidates
|
||||||
|
'.user .link-wrapper .channel',
|
||||||
|
// Modal/container
|
||||||
|
'.container',
|
||||||
|
'div.container',
|
||||||
|
// QR area
|
||||||
|
'.code-area',
|
||||||
|
'.qrcode',
|
||||||
|
// Phone input (also in modal)
|
||||||
|
'input[name="xhs-pc-web-phone"]',
|
||||||
|
];
|
||||||
|
|
||||||
|
/**
 * Diagnostic entry point for the QR-code login selectors.
 *
 * Launches Chromium with `headless: false`, opens EXPLORE_URL, waits WAIT_MS
 * for the page to settle, then logs:
 *   - title and URL before and after the wait,
 *   - the match count for every selector in SELECTORS_TO_PROBE (plus a `src`
 *     preview for selectors containing "qrcode"),
 *   - class and `src` preview of every `<img>` whose src starts with `data:`,
 *   - the first 500 characters of the outer HTML of the first `.qrcode` element.
 * A viewport screenshot is written to /tmp/xhs-debug.png.
 *
 * The browser is closed in a `finally` block so that a failure part-way
 * through does not skip the shutdown.
 */
async function main(): Promise<void> {
  console.log('Launching browser (headless: false)...');
  const browser = await chromium.launch({
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
  });

  try {
    const ctx = await browser.newContext();
    const page = await ctx.newPage();

    console.log(`Navigating to ${EXPLORE_URL} ...`);
    const t0 = Date.now();
    await page.goto(EXPLORE_URL, { waitUntil: 'domcontentloaded' });
    console.log(`  domcontentloaded in ${Date.now() - t0}ms`);
    console.log(`  title: "${await page.title()}"`);
    console.log(`  url:   ${page.url()}`);

    console.log(`\nWaiting ${WAIT_MS / 1000}s for SPA to render...`);
    await page.waitForTimeout(WAIT_MS);
    console.log(`  title after wait: "${await page.title()}"`);
    console.log(`  url after wait:   ${page.url()}`);

    console.log('\n--- Selector probe results ---');
    for (const sel of SELECTORS_TO_PROBE) {
      const matches = page.locator(sel);
      const count = await matches.count();
      const marker = count > 0 ? '✓' : '✗';
      console.log(`  ${marker} [${count}] ${sel}`);
      if (count > 0 && sel.includes('qrcode')) {
        // Print src attribute if it's an img
        try {
          const src = await matches.first().getAttribute('src');
          const preview = src ? src.slice(0, 60) + '...' : '(null)';
          console.log(`       src: ${preview}`);
        } catch {
          /* not an img — best-effort probe, deliberately ignored */
        }
      }
    }

    // Dump all img srcs that look like QR codes (data URIs)
    console.log('\n--- All <img> with data: src on page ---');
    const imgs = await page.locator('img[src^="data:"]').all();
    for (const img of imgs) {
      const cls = await img.getAttribute('class').catch(() => '');
      const src = await img.getAttribute('src').catch(() => '');
      console.log(`  class="${cls}" src="${src?.slice(0, 80)}..."`);
    }

    // Save screenshot
    const screenshotPath = '/tmp/xhs-debug.png';
    await page.screenshot({ path: screenshotPath, fullPage: false });
    console.log(`\nScreenshot saved → ${screenshotPath}`);

    // Also dump page HTML around any element matching qrcode
    console.log('\n--- Outer HTML of .qrcode (if found) ---');
    const qrDiv = page.locator('.qrcode').first();
    if ((await qrDiv.count()) > 0) {
      const html = await qrDiv.evaluate((el: Element) => el.outerHTML.slice(0, 500));
      console.log(html);
    } else {
      console.log('  .qrcode not found');
    }
  } finally {
    await browser.close();
  }

  console.log('\nDone.');
}
|
||||||
|
|
||||||
|
main().catch((err) => {
|
||||||
|
console.error('Fatal:', err);
|
||||||
|
process.exit(1);
|
||||||
|
});
|
||||||
@@ -0,0 +1,110 @@
|
|||||||
|
/**
|
||||||
|
* Standalone diagnostic script for search:
|
||||||
|
* npx tsx scripts/debug-search.ts [keyword]
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { chromium } from 'rebrowser-playwright';
|
||||||
|
import { readFileSync } from 'node:fs';
|
||||||
|
|
||||||
|
const keyword = process.argv[2] ?? '美食';
|
||||||
|
const SEARCH_URL = `https://www.xiaohongshu.com/search_result?keyword=${encodeURIComponent(keyword)}`;
|
||||||
|
const COOKIE_FILE = `${process.env.HOME}/.social-mcp/xiaohongshu/cookies.json`;
|
||||||
|
|
||||||
|
/**
 * Recursively walks plain (non-array) objects and logs every non-empty array
 * it encounters, together with up to six keys of the array's first element
 * (or that element's type when it is not an object).
 *
 * Recursion stops once `depth` exceeds 3; arrays themselves are not descended into.
 *
 * @param obj   Value to inspect; non-objects and null are ignored.
 * @param path  Dotted path of `obj` from the root, '' for the root itself.
 * @param depth Current nesting level (0 for the root).
 */
function findArrays(obj: any, path: string, depth = 0): void {
  if (depth > 3) return;
  if (!obj || typeof obj !== 'object') return;
  for (const [k, v] of Object.entries(obj)) {
    const fullPath = path ? `${path}.${k}` : k;
    if (Array.isArray(v) && v.length > 0) {
      const sample: unknown = v[0];
      let sampleKeys: string;
      if (sample === null) {
        // `typeof null` is 'object'; Object.keys(null) would throw a TypeError.
        sampleKeys = 'null';
      } else if (typeof sample === 'object') {
        sampleKeys = Object.keys(sample).slice(0, 6).join(', ');
      } else {
        sampleKeys = typeof sample;
      }
      console.log(`  Array found: ${fullPath} (length=${v.length}) sample keys: [${sampleKeys}]`);
    } else if (v && typeof v === 'object' && !Array.isArray(v)) {
      findArrays(v, fullPath, depth + 1);
    }
  }
}

/**
 * Diagnostic entry point for the search-result page.
 *
 * Loads storage state from COOKIE_FILE when it can be read and parsed
 * (otherwise runs without a session), opens SEARCH_URL in headless Chromium,
 * waits 5 seconds, then logs the structure of `window.__INITIAL_STATE__`,
 * the match count of several candidate DOM selectors, and saves a screenshot
 * to /tmp/xhs-search-debug.png.
 *
 * The browser is closed in a `finally` block so that a failure part-way
 * through does not skip the shutdown.
 */
async function main(): Promise<void> {
  // Load cookies if available
  let storageState: object | undefined;
  try {
    const raw = JSON.parse(readFileSync(COOKIE_FILE, 'utf-8'));
    storageState = raw;
    console.log(`Loaded cookies (${raw.cookies?.length ?? 0} cookies)`);
  } catch {
    console.log('No cookies found — running without session');
  }

  const browser = await chromium.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
  });

  try {
    // The parsed JSON is not validated here; it is handed to Playwright as-is.
    const ctx = await browser.newContext(storageState ? { storageState: storageState as any } : {});
    const page = await ctx.newPage();

    console.log(`\nNavigating to: ${SEARCH_URL}`);
    await page.goto(SEARCH_URL, { waitUntil: 'domcontentloaded' });
    console.log(`title: "${await page.title()}"`);
    console.log(`url:   ${page.url()}`);

    console.log('\nWaiting 5s for SPA...');
    await page.waitForTimeout(5000);
    console.log(`title after wait: "${await page.title()}"`);
    console.log(`url after wait:   ${page.url()}`);

    // --- __INITIAL_STATE__ ---
    console.log('\n--- __INITIAL_STATE__ top-level keys ---');
    const state: any = await page.evaluate('window.__INITIAL_STATE__').catch(() => null);
    if (!state) {
      console.log('  __INITIAL_STATE__ not found!');
    } else {
      const keys = Object.keys(state);
      console.log(`  keys: ${keys.join(', ')}`);

      // Print structure of each key
      for (const k of keys) {
        const v = state[k];
        if (v && typeof v === 'object') {
          const summary = Object.keys(v)
            .map((sk: string) => {
              const sv = (v as any)[sk];
              return `${sk}: ${Array.isArray(sv) ? `Array(${sv.length})` : typeof sv}`;
            })
            .join(', ');
          console.log(`  [${k}]: { ${summary} }`);
        } else {
          console.log(`  [${k}]: ${typeof v}`);
        }
      }

      // Try to find feeds/notes arrays
      console.log('\n--- Looking for feed/note arrays in __INITIAL_STATE__ ---');
      findArrays(state, '');
    }

    // --- DOM selectors ---
    console.log('\n--- DOM selector probe ---');
    const selectors = [
      '.feeds-container .note-item',
      '.note-item',
      '#global-search-result-container',
      '.search-result',
      '.result-container',
      '[class*="note"]',
      '[class*="feed"]',
      '[class*="result"]',
    ];
    for (const sel of selectors) {
      const count = await page.locator(sel).count();
      if (count > 0) console.log(`  ✓ [${count}] ${sel}`);
      else console.log(`  ✗ [0] ${sel}`);
    }

    // Screenshot
    const screenshotPath = '/tmp/xhs-search-debug.png';
    await page.screenshot({ path: screenshotPath });
    console.log(`\nScreenshot → ${screenshotPath}`);
  } finally {
    await browser.close();
  }
}
|
||||||
|
|
||||||
|
main().catch((e) => { console.error(e); process.exit(1); });
|
||||||
@@ -332,7 +332,7 @@ async function scrapeSearchResultsFromDom(page: Page): Promise<Feed[]> {
|
|||||||
.$eval('a.cover', (el) => el.getAttribute('href') ?? '')
|
.$eval('a.cover', (el) => el.getAttribute('href') ?? '')
|
||||||
.catch(() => '');
|
.catch(() => '');
|
||||||
|
|
||||||
const idMatch = href.match(/\/explore\/([a-f0-9]+)/);
|
const idMatch = href.match(/\/(?:explore|search_result)\/([a-f0-9]+)/);
|
||||||
const tokenMatch = href.match(/xsec_token=([^&]+)/);
|
const tokenMatch = href.match(/xsec_token=([^&]+)/);
|
||||||
const id = idMatch?.[1] ?? '';
|
const id = idMatch?.[1] ?? '';
|
||||||
const xsecToken = tokenMatch?.[1] ?? '';
|
const xsecToken = tokenMatch?.[1] ?? '';
|
||||||
|
|||||||
@@ -129,9 +129,15 @@ export async function getUserProfile(
|
|||||||
userId: string,
|
userId: string,
|
||||||
xsecToken: string,
|
xsecToken: string,
|
||||||
): Promise<UserProfile> {
|
): Promise<UserProfile> {
|
||||||
const url = `${USER_PROFILE_BASE_URL}/${userId}?xsec_token=${encodeURIComponent(xsecToken)}&xsec_source=pc_note`;
|
const url = `${USER_PROFILE_BASE_URL}/${userId}?xsec_token=${encodeURIComponent(xsecToken)}&xsec_source=pc_feed`;
|
||||||
log.debug({ userId, url }, 'Navigating to user profile page');
|
log.debug({ userId, url }, 'Navigating to user profile page');
|
||||||
|
|
||||||
|
// XHS applies stricter bot detection on profile pages than on search pages.
|
||||||
|
// Visiting the explore page first establishes a natural session context that
|
||||||
|
// allows the subsequent profile navigation to pass the IP-risk check.
|
||||||
|
await page.goto('https://www.xiaohongshu.com/explore', { waitUntil: 'domcontentloaded' });
|
||||||
|
await page.waitForTimeout(1000);
|
||||||
|
|
||||||
await page.goto(url, { waitUntil: 'domcontentloaded' });
|
await page.goto(url, { waitUntil: 'domcontentloaded' });
|
||||||
|
|
||||||
// Wait for the user profile header to appear.
|
// Wait for the user profile header to appear.
|
||||||
@@ -378,9 +384,9 @@ async function scrapeProfileFromDom(
|
|||||||
.$eval('a.cover', (el) => el.getAttribute('href') ?? '')
|
.$eval('a.cover', (el) => el.getAttribute('href') ?? '')
|
||||||
.catch(() => '');
|
.catch(() => '');
|
||||||
|
|
||||||
const idMatch = href.match(/\/explore\/([a-f0-9]+)/);
|
const idMatch = href.match(/\/(?:explore|search_result)\/([a-f0-9]+)|\/user\/profile\/[^/]+\/([a-f0-9]+)/);
|
||||||
const tokenMatch = href.match(/xsec_token=([^&]+)/);
|
const tokenMatch = href.match(/xsec_token=([^&]+)/);
|
||||||
const id = idMatch?.[1] ?? '';
|
const id = idMatch?.[1] ?? idMatch?.[2] ?? '';
|
||||||
const noteXsecToken = tokenMatch?.[1] ?? '';
|
const noteXsecToken = tokenMatch?.[1] ?? '';
|
||||||
|
|
||||||
if (!id) continue;
|
if (!id) continue;
|
||||||
|
|||||||
Reference in New Issue
Block a user