修复搜索和用户主页的笔记 ID 提取及风控绕过问题
- 搜索页笔记 href 格式为 /search_result/<id>,修正正则以兼容 /explore/ 和 /search_result/
- 用户主页笔记 href 格式为 /user/profile/<userId>/<noteId>,扩展正则并取正确捕获组
- 用户主页访问前先 warm-up 到 /explore,绕过 XHS headless IP 风控(code 300012)
- xsec_source 改为 pc_feed 以匹配用户从 feed 页获取的 token 类型
- 新增 debug-search.ts / debug-qrcode.ts / debug-profile.ts 诊断脚本
This commit is contained in:
@@ -332,7 +332,7 @@ async function scrapeSearchResultsFromDom(page: Page): Promise<Feed[]> {
|
||||
.$eval('a.cover', (el) => el.getAttribute('href') ?? '')
|
||||
.catch(() => '');
|
||||
|
||||
const idMatch = href.match(/\/explore\/([a-f0-9]+)/);
|
||||
const idMatch = href.match(/\/(?:explore|search_result)\/([a-f0-9]+)/);
|
||||
const tokenMatch = href.match(/xsec_token=([^&]+)/);
|
||||
const id = idMatch?.[1] ?? '';
|
||||
const xsecToken = tokenMatch?.[1] ?? '';
|
||||
|
||||
@@ -129,9 +129,15 @@ export async function getUserProfile(
|
||||
userId: string,
|
||||
xsecToken: string,
|
||||
): Promise<UserProfile> {
|
||||
const url = `${USER_PROFILE_BASE_URL}/${userId}?xsec_token=${encodeURIComponent(xsecToken)}&xsec_source=pc_note`;
|
||||
const url = `${USER_PROFILE_BASE_URL}/${userId}?xsec_token=${encodeURIComponent(xsecToken)}&xsec_source=pc_feed`;
|
||||
log.debug({ userId, url }, 'Navigating to user profile page');
|
||||
|
||||
// XHS applies stricter bot detection on profile pages than on search pages.
|
||||
// Visiting the explore page first establishes a natural session context that
|
||||
// allows the subsequent profile navigation to pass the IP-risk check.
|
||||
await page.goto('https://www.xiaohongshu.com/explore', { waitUntil: 'domcontentloaded' });
|
||||
await page.waitForTimeout(1000);
|
||||
|
||||
await page.goto(url, { waitUntil: 'domcontentloaded' });
|
||||
|
||||
// Wait for the user profile header to appear.
|
||||
@@ -378,9 +384,9 @@ async function scrapeProfileFromDom(
|
||||
.$eval('a.cover', (el) => el.getAttribute('href') ?? '')
|
||||
.catch(() => '');
|
||||
|
||||
const idMatch = href.match(/\/explore\/([a-f0-9]+)/);
|
||||
const idMatch = href.match(/\/(?:explore|search_result)\/([a-f0-9]+)|\/user\/profile\/[^/]+\/([a-f0-9]+)/);
|
||||
const tokenMatch = href.match(/xsec_token=([^&]+)/);
|
||||
const id = idMatch?.[1] ?? '';
|
||||
const id = idMatch?.[1] ?? idMatch?.[2] ?? '';
|
||||
const noteXsecToken = tokenMatch?.[1] ?? '';
|
||||
|
||||
if (!id) continue;
|
||||
|
||||
Reference in New Issue
Block a user