import type { Page } from 'rebrowser-playwright';
import { logger } from '@social/core/utils/logger.js';
import { XHS_SELECTORS } from './selectors.js';
import { extractInitialState, parseCountString, ensureHttps } from './feeds.js';
import type { UserProfile, Feed } from './types.js';

// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------

const USER_PROFILE_BASE_URL = 'https://www.xiaohongshu.com/user/profile';
const EXPLORE_URL = 'https://www.xiaohongshu.com/explore';
const SEL = XHS_SELECTORS.userProfile;
const log = logger.child({ module: 'xhs-user-profile' });

/**
 * Matches the note id in a note link. Capture group 1 is set for
 * `/explore/<id>` and `/search_result/<id>` links, capture group 2 for
 * `/user/profile/<userId>/<id>` links.
 */
const NOTE_ID_PATTERN =
  /\/(?:explore|search_result)\/([a-f0-9]+)|\/user\/profile\/[^/]+\/([a-f0-9]+)/;

/** Matches the raw (still URL-encoded) `xsec_token` query value in a link. */
const XSEC_TOKEN_PATTERN = /xsec_token=([^&#]+)/;

// ---------------------------------------------------------------------------
// __INITIAL_STATE__ raw types for user profile
// ---------------------------------------------------------------------------

/**
 * Loosely-typed view of the page's `__INITIAL_STATE__`. Every field is
 * optional and both camelCase and snake_case spellings are listed, because
 * the parsers below probe several alternative locations and spellings.
 */
interface RawProfileState {
  user?: {
    userPageData?: RawUserPageData;
    userInfo?: RawUserInfo;
  };
  userProfile?: {
    userInfo?: RawUserInfo;
    notes?: RawProfileNote[];
  };
  [key: string]: unknown;
}

interface RawUserPageData {
  basicInfo?: RawUserInfo;
  interactions?: RawInteractions;
  notes?: RawProfileNote[];
  noteCount?: number | string;
  note_count?: number | string;
}

interface RawUserInfo {
  userId?: string;
  user_id?: string;
  nickname?: string;
  nick_name?: string;
  nickName?: string;
  avatar?: string;
  avatarUrl?: string;
  avatar_url?: string;
  images?: string;
  desc?: string;
  description?: string;
  gender?: number | string;
  ipLocation?: string;
  ip_location?: string;
  fstatus?: string;
  follows?: number | string;
  fans?: number | string;
  interaction?: number | string;
  noteCount?: number | string;
  note_count?: number | string;
}

interface RawInteractions {
  follows?: string | number;
  fans?: string | number;
  interaction?: string | number;
}

/** Like-count container; shared by the `interactInfo` / `interact_info` spellings. */
interface RawInteractInfo {
  likedCount?: string;
  liked_count?: string;
  likeCount?: string;
  like_count?: string;
}

interface RawProfileNote {
  id?: string;
  noteId?: string;
  note_id?: string;
  xsecToken?: string;
  xsec_token?: string;
  displayTitle?: string;
  display_title?: string;
  title?: string;
  desc?: string;
  type?: string;
  cover?: {
    url?: string;
    urlPre?: string;
    url_pre?: string;
    urlDefault?: string;
    url_default?: string;
    infoList?: Array<{ url?: string }>;
    info_list?: Array<{ url?: string }>;
  };
  user?: {
    userId?: string;
    user_id?: string;
    nickname?: string;
    nick_name?: string;
    avatar?: string;
  };
  interactInfo?: RawInteractInfo;
  interact_info?: RawInteractInfo;
  likedCount?: string;
  liked_count?: string;
}

// ---------------------------------------------------------------------------
// getUserProfile
// ---------------------------------------------------------------------------

/**
 * Navigate to a Xiaohongshu user profile page and extract their information,
 * including basic info, follower/following counts, and recent notes.
 *
 * Extraction first tries the page's `__INITIAL_STATE__`; when that yields no
 * user info, the rendered DOM is scraped instead.
 *
 * @param page - A Playwright Page managed by BrowserManager.
 * @param userId - The user ID.
 * @param xsecToken - Security token required to access the profile page.
 * @returns A UserProfile object with the user's data.
 */
export async function getUserProfile(
  page: Page,
  userId: string,
  xsecToken: string,
): Promise<UserProfile> {
  // Both dynamic parts are percent-encoded so that unusual characters cannot
  // alter the path or query structure; plain hex ids pass through unchanged.
  const url =
    `${USER_PROFILE_BASE_URL}/${encodeURIComponent(userId)}` +
    `?xsec_token=${encodeURIComponent(xsecToken)}&xsec_source=pc_feed`;

  log.debug({ userId, url }, 'Navigating to user profile page');

  // XHS applies stricter bot detection on profile pages than on search pages.
  // Visiting the explore page first establishes a natural session context that
  // allows the subsequent profile navigation to pass the IP-risk check.
  await page.goto(EXPLORE_URL, { waitUntil: 'domcontentloaded' });
  await page.waitForTimeout(1000);
  await page.goto(url, { waitUntil: 'domcontentloaded' });

  // Wait for the user profile header to appear. A timeout is deliberately
  // non-fatal: extraction is still attempted on whatever has rendered.
  await page
    .waitForSelector(SEL.headerContainer, { timeout: 15_000 })
    .catch(() => {
      log.warn({ userId }, 'User profile header not found within timeout, proceeding');
    });

  // Allow render to settle.
  await page.waitForTimeout(1500);

  // -----------------------------------------------------------------------
  // Strategy 1: Extract from __INITIAL_STATE__
  // -----------------------------------------------------------------------
  const initialState = (await extractInitialState(page)) as RawProfileState | null;

  if (initialState) {
    const stateProfile = parseProfileFromState(initialState, userId, xsecToken);
    if (stateProfile) {
      log.info(
        { userId, feedCount: stateProfile.feeds.length },
        'Extracted user profile from __INITIAL_STATE__',
      );
      return stateProfile;
    }
    log.debug('__INITIAL_STATE__ found but no profile data extracted, falling back to DOM');
  }

  // -----------------------------------------------------------------------
  // Strategy 2: Fall back to DOM scraping
  // -----------------------------------------------------------------------
  log.debug({ userId }, 'Falling back to DOM scraping for user profile');
  const domProfile = await scrapeProfileFromDom(page, userId, xsecToken);
  log.info({ userId, feedCount: domProfile.feeds.length }, 'Extracted user profile from DOM');
  return domProfile;
}

// ---------------------------------------------------------------------------
// __INITIAL_STATE__ parsing
// ---------------------------------------------------------------------------

/**
 * Parse user profile data from __INITIAL_STATE__.
 *
 * @param state - The raw state object read from the page.
 * @param userId - Requested user ID; used when the state carries no ID.
 * @param _xsecToken - Currently unused; kept so the signature mirrors
 *   scrapeProfileFromDom.
 * @returns The parsed profile, or `null` when none of the known locations
 *   contains a user-info object.
 */
function parseProfileFromState(
  state: RawProfileState,
  userId: string,
  _xsecToken: string,
): UserProfile | null {
  // Try multiple known locations for user data.
  const userPageData = state.user?.userPageData;
  const userInfo =
    userPageData?.basicInfo ?? state.user?.userInfo ?? state.userProfile?.userInfo;

  if (!userInfo) {
    return null;
  }

  const id = userInfo.userId ?? userInfo.user_id ?? userId;
  const nickname = userInfo.nickname ?? userInfo.nick_name ?? userInfo.nickName ?? '';
  const avatar =
    userInfo.avatar ?? userInfo.avatarUrl ?? userInfo.avatar_url ?? userInfo.images ?? '';
  const description = userInfo.desc ?? userInfo.description ?? '';

  // Gender: 0=unknown, 1=male, 2=female
  const genderRaw = userInfo.gender;
  let gender = '';
  if (genderRaw === 1 || genderRaw === '1') gender = 'male';
  else if (genderRaw === 2 || genderRaw === '2') gender = 'female';

  const ipLocation = userInfo.ipLocation ?? userInfo.ip_location ?? '';

  // Follower / following / interaction counts. The dedicated `interactions`
  // object wins over the same-named fields on the user-info object.
  const interactions = userPageData?.interactions;
  const follows = toNumber(interactions?.follows ?? userInfo.follows ?? 0);
  const fans = toNumber(interactions?.fans ?? userInfo.fans ?? 0);
  const interaction = toNumber(interactions?.interaction ?? userInfo.interaction ?? 0);

  // Notes / feeds on the profile page. Notes without an id are dropped.
  const rawNotes: RawProfileNote[] = userPageData?.notes ?? state.userProfile?.notes ?? [];
  const feeds = rawNotes
    .map((note) => parseProfileNote(note, userId))
    .filter((f): f is Feed => f !== null);

  return {
    id,
    nickname,
    avatar: avatar ? ensureHttps(avatar) : '',
    description,
    gender,
    ipLocation,
    follows,
    fans,
    interaction,
    feeds,
  };
}

/**
 * Parse a note from the user profile state into a Feed object.
 *
 * @param raw - One raw note entry from the state.
 * @param ownerUserId - ID used for the feed's user when the note carries none.
 * @returns The parsed feed, or `null` when the note has no id.
 */
function parseProfileNote(raw: RawProfileNote, ownerUserId: string): Feed | null {
  const id = raw.id ?? raw.noteId ?? raw.note_id ?? '';
  if (!id) return null;

  const noteXsecToken = raw.xsecToken ?? raw.xsec_token ?? '';
  const title = raw.displayTitle ?? raw.display_title ?? raw.title ?? '';
  const description = raw.desc ?? '';

  const rawType = raw.type ?? '';
  const type: 'normal' | 'video' = rawType.toLowerCase().includes('video') ? 'video' : 'normal';

  // Cover image: prefer the direct URL fields, then the first info-list entry.
  let coverUrl = '';
  if (raw.cover) {
    coverUrl =
      raw.cover.url ??
      raw.cover.urlPre ??
      raw.cover.url_pre ??
      raw.cover.urlDefault ??
      raw.cover.url_default ??
      '';

    if (!coverUrl) {
      const infoList = raw.cover.infoList ?? raw.cover.info_list;
      if (infoList && infoList.length > 0 && infoList[0]?.url) {
        coverUrl = infoList[0].url;
      }
    }

    if (coverUrl) coverUrl = ensureHttps(coverUrl);
  }

  // Like count.
  const interact = raw.interactInfo ?? raw.interact_info;
  const likeCountStr =
    interact?.likedCount ??
    interact?.liked_count ??
    interact?.likeCount ??
    interact?.like_count ??
    raw.likedCount ??
    raw.liked_count ??
    '0';
  const likeCount = parseCountString(likeCountStr);

  // User.
  const rawUser = raw.user;
  const user = {
    id: rawUser?.userId ?? rawUser?.user_id ?? ownerUserId,
    nickname: rawUser?.nickname ?? rawUser?.nick_name ?? '',
    avatar: rawUser?.avatar ?? '',
  };

  return {
    id,
    xsecToken: noteXsecToken,
    title,
    description,
    type,
    coverUrl,
    likeCount,
    user,
  };
}

// ---------------------------------------------------------------------------
// DOM scraping fallback — uses Playwright Node-side API exclusively
// ---------------------------------------------------------------------------

/**
 * Scrape user profile data from the rendered DOM using Playwright's
 * Node-side APIs to avoid needing DOM lib types.
 *
 * Every individual lookup is best-effort: a missing element yields an empty
 * string (or a zero count) instead of failing the whole scrape.
 *
 * @param page - Page already showing the profile.
 * @param userId - Used as the profile id and as each feed's user id.
 * @param xsecToken - Token assigned to feeds whose link carries none.
 * @returns The scraped profile.
 */
async function scrapeProfileFromDom(
  page: Page,
  userId: string,
  xsecToken: string,
): Promise<UserProfile> {
  // Nickname
  const nickname = await page
    .$eval(SEL.nickname, (el) => el.textContent?.trim() ?? '')
    .catch(() => '');

  // Avatar
  const avatar = await page
    .$eval(SEL.avatar, (img) => img.getAttribute('src') ?? '')
    .catch(() => '');

  // Description / bio
  const description = await page
    .$eval(SEL.description, (el) => el.textContent?.trim() ?? '')
    .catch(() => '');

  // Gender — try the gender icon class. The class attribute is read via
  // getAttribute because `className` is not a string on SVG elements.
  const gender = await page
    .$eval(SEL.gender, (el) => {
      const cls = (el.getAttribute('class') ?? '').toLowerCase();
      // 'female' contains 'male', so it has to be tested first.
      if (cls.includes('female')) return 'female';
      if (cls.includes('male')) return 'male';
      return '';
    })
    .catch(() => '');

  // IP location
  const ipLocation = await page
    .$eval(SEL.ipLocation, (el) => el.textContent?.trim() ?? '')
    .catch(() => '');

  // Follower / following / interaction counts.
  // These are typically in a row of .data-item elements.
  const dataCounts = await page
    .$$eval(SEL.followCount, (items) =>
      items.map((item) => {
        const countEl = item.querySelector('.count');
        return countEl?.textContent?.trim() ?? '0';
      }),
    )
    .catch(() => [] as string[]);

  // Positional mapping: first item is follows, second fans, third interaction.
  const follows = parseCountString(dataCounts[0] ?? '0');
  const fans = parseCountString(dataCounts[1] ?? '0');
  const interaction = parseCountString(dataCounts[2] ?? '0');

  // Scrape feed items on the profile page.
  const feedElements = await page.$$(SEL.feedItem);
  const feeds: Feed[] = [];

  for (const card of feedElements) {
    try {
      const href = await card
        .$eval('a.cover', (el) => el.getAttribute('href') ?? '')
        .catch(() => '');

      const idMatch = href.match(NOTE_ID_PATTERN);
      const id = idMatch?.[1] ?? idMatch?.[2] ?? '';
      if (!id) continue;

      // The href value is URL-encoded; decode it so the token is stored in the
      // same raw form that getUserProfile later passes to encodeURIComponent.
      const tokenMatch = href.match(XSEC_TOKEN_PATTERN);
      const noteXsecToken = safeDecodeURIComponent(tokenMatch?.[1] ?? '');

      const coverUrl = await card
        .$eval('a.cover img', (el) => el.getAttribute('src') ?? el.getAttribute('data-src') ?? '')
        .catch(() => '');

      const feedTitle = await card
        .$eval('.footer .title', (el) => el.textContent?.trim() ?? '')
        .catch(() => '');

      const likeText = await card
        .$eval('.footer .like-wrapper .count', (el) => el.textContent?.trim() ?? '0')
        .catch(() => '0');

      const hasVideoIcon = await card
        .$('.play-icon')
        .then((el) => el !== null)
        .catch(() => false);

      feeds.push({
        id,
        xsecToken: noteXsecToken || xsecToken,
        title: feedTitle,
        description: '',
        type: hasVideoIcon ? 'video' : 'normal',
        coverUrl,
        likeCount: parseCountString(likeText),
        user: { id: userId, nickname: '', avatar: '' },
      });
    } catch {
      // Best-effort: a card that cannot be read is skipped, not fatal.
      continue;
    }
  }

  return {
    id: userId,
    nickname,
    avatar,
    description,
    gender,
    ipLocation,
    follows,
    fans,
    interaction,
    feeds,
  };
}

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/**
 * Convert a string or number to a number. Numbers are returned unchanged;
 * strings are delegated to parseCountString.
 */
function toNumber(val: string | number): number {
  if (typeof val === 'number') return val;
  return parseCountString(val);
}

/**
 * Percent-decode a URL component, returning the input unchanged when it is
 * not validly encoded (decodeURIComponent throws URIError in that case).
 */
function safeDecodeURIComponent(value: string): string {
  try {
    return decodeURIComponent(value);
  } catch {
    return value;
  }
}