修复搜索和用户主页的笔记 ID 提取及风控绕过问题

- 搜索页笔记 href 格式为 /search_result/<id>，修正正则以同时兼容 /explore/ 和 /search_result/ 两种路径
- 用户主页笔记 href 格式为 /user/profile/<userId>/<noteId>，扩展正则以匹配该格式，并从对应的捕获组中取出 noteId
- 访问用户主页前先打开 /explore 页面进行预热（warm-up），以绕过 XHS 的 headless IP 风控（code 300012）
- xsec_source 改为 pc_feed 以匹配用户从 feed 页获取的 token 类型
- 新增 debug-search.ts / debug-qrcode.ts / debug-profile.ts 诊断脚本
This commit is contained in:
2026-03-01 22:44:45 +08:00
parent 8b39520ec7
commit 56d5a6de96
5 changed files with 265 additions and 4 deletions
+1 -1
View File
@@ -332,7 +332,7 @@ async function scrapeSearchResultsFromDom(page: Page): Promise<Feed[]> {
.$eval('a.cover', (el) => el.getAttribute('href') ?? '')
.catch(() => '');
const idMatch = href.match(/\/explore\/([a-f0-9]+)/);
const idMatch = href.match(/\/(?:explore|search_result)\/([a-f0-9]+)/);
const tokenMatch = href.match(/xsec_token=([^&]+)/);
const id = idMatch?.[1] ?? '';
const xsecToken = tokenMatch?.[1] ?? '';
+9 -3
View File
@@ -129,9 +129,15 @@ export async function getUserProfile(
userId: string,
xsecToken: string,
): Promise<UserProfile> {
const url = `${USER_PROFILE_BASE_URL}/${userId}?xsec_token=${encodeURIComponent(xsecToken)}&xsec_source=pc_note`;
const url = `${USER_PROFILE_BASE_URL}/${userId}?xsec_token=${encodeURIComponent(xsecToken)}&xsec_source=pc_feed`;
log.debug({ userId, url }, 'Navigating to user profile page');
// XHS applies stricter bot detection on profile pages than on search pages.
// Visiting the explore page first establishes a natural session context that
// allows the subsequent profile navigation to pass the IP-risk check.
await page.goto('https://www.xiaohongshu.com/explore', { waitUntil: 'domcontentloaded' });
await page.waitForTimeout(1000);
await page.goto(url, { waitUntil: 'domcontentloaded' });
// Wait for the user profile header to appear.
@@ -378,9 +384,9 @@ async function scrapeProfileFromDom(
.$eval('a.cover', (el) => el.getAttribute('href') ?? '')
.catch(() => '');
const idMatch = href.match(/\/explore\/([a-f0-9]+)/);
const idMatch = href.match(/\/(?:explore|search_result)\/([a-f0-9]+)|\/user\/profile\/[^/]+\/([a-f0-9]+)/);
const tokenMatch = href.match(/xsec_token=([^&]+)/);
const id = idMatch?.[1] ?? '';
const id = idMatch?.[1] ?? idMatch?.[2] ?? '';
const noteXsecToken = tokenMatch?.[1] ?? '';
if (!id) continue;