// Source: social-mcp/apps/xhs-mcp/src/platforms/xiaohongshu/user-profile.ts

import type { Page } from 'rebrowser-playwright';
import { logger } from '@social/core/utils/logger.js';
import { XHS_SELECTORS } from './selectors.js';
import { extractInitialState, parseCountString, ensureHttps } from './feeds.js';
import type { UserProfile, Feed } from './types.js';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
// Base URL of a user profile page; the user ID is appended as a path segment.
const USER_PROFILE_BASE_URL = 'https://www.xiaohongshu.com/user/profile';
// Selector group for the user profile page, taken from the shared selector table.
const SEL = XHS_SELECTORS.userProfile;
// Child logger created with the field `module: 'xhs-user-profile'`.
const log = logger.child({ module: 'xhs-user-profile' });
// ---------------------------------------------------------------------------
// __INITIAL_STATE__ raw types for user profile
// ---------------------------------------------------------------------------
/**
* Loosely-typed view of the page's __INITIAL_STATE__ object, limited to the
* branches that the profile parser reads. Every field is optional because the
* value is cast from page data rather than validated.
*/
interface RawProfileState {
// First place the parser looks: `userPageData.basicInfo`, then `userInfo`.
user?: {
userPageData?: RawUserPageData;
userInfo?: RawUserInfo;
};
// Alternative location, consulted after the `user` branch.
userProfile?: {
userInfo?: RawUserInfo;
notes?: RawProfileNote[];
};
// Any other top-level keys are allowed but left untyped.
[key: string]: unknown;
}
/**
* Shape of `user.userPageData` inside __INITIAL_STATE__, as read by the
* profile parser.
*/
interface RawUserPageData {
// Preferred source of the user's basic info.
basicInfo?: RawUserInfo;
// Preferred source of the follows / fans / interaction counts.
interactions?: RawInteractions;
// Preferred source of the notes shown on the profile page.
notes?: RawProfileNote[];
// Declared for completeness; not read by the code visible in this section.
noteCount?: number | string;
note_count?: number | string;
}
/**
* User info record as found in __INITIAL_STATE__. The same logical value can
* appear under several key spellings (camelCase / snake_case); within each
* group below the parser tries the keys in the order listed.
*/
interface RawUserInfo {
// User ID aliases.
userId?: string;
user_id?: string;
// Display name aliases.
nickname?: string;
nick_name?: string;
nickName?: string;
// Avatar URL aliases; `images` is consulted last.
avatar?: string;
avatarUrl?: string;
avatar_url?: string;
images?: string;
// Bio text aliases.
desc?: string;
description?: string;
// 1 (or '1') is mapped to 'male', 2 (or '2') to 'female', anything else to ''.
gender?: number | string;
// IP location aliases.
ipLocation?: string;
ip_location?: string;
// Declared for completeness; not read by the code visible in this section.
fstatus?: string;
// Count fallbacks, used when `userPageData.interactions` does not supply the value.
follows?: number | string;
fans?: number | string;
interaction?: number | string;
// Declared for completeness; not read by the code visible in this section.
noteCount?: number | string;
note_count?: number | string;
}
/**
* Follow / fan / interaction counts from `userPageData.interactions`.
* Each value may be a number or a string; the parser converts it with
* `toNumber`.
*/
interface RawInteractions {
follows?: string | number;
fans?: string | number;
interaction?: string | number;
}
/**
* A single note entry from the profile state, before conversion to a Feed.
* The same logical value can appear under several key spellings; within each
* group below the parser tries the keys in the order listed.
*/
interface RawProfileNote {
// Note ID aliases; an entry without any ID is dropped by the parser.
id?: string;
noteId?: string;
note_id?: string;
// Per-note security token aliases.
xsecToken?: string;
xsec_token?: string;
// Title aliases.
displayTitle?: string;
display_title?: string;
title?: string;
// Note description text.
desc?: string;
// A value containing "video" (case-insensitive) marks the note as a video.
type?: string;
// Cover image: direct URL aliases first, then the first entry of the info list.
cover?: {
url?: string;
urlPre?: string;
url_pre?: string;
urlDefault?: string;
url_default?: string;
infoList?: Array<{ url?: string }>;
info_list?: Array<{ url?: string }>;
};
// Note author; when the ID is absent the parser falls back to the profile owner's ID.
user?: {
userId?: string;
user_id?: string;
nickname?: string;
nick_name?: string;
avatar?: string;
};
// Like count container (camelCase spelling), preferred over `interact_info`.
interactInfo?: {
likedCount?: string;
liked_count?: string;
likeCount?: string;
like_count?: string;
};
// Like count container (snake_case spelling).
interact_info?: {
likedCount?: string;
liked_count?: string;
likeCount?: string;
like_count?: string;
};
// Top-level like count, used only when neither container supplies one.
likedCount?: string;
liked_count?: string;
}
// ---------------------------------------------------------------------------
// getUserProfile
// ---------------------------------------------------------------------------
/**
 * Load a Xiaohongshu user profile page and return the user's details:
 * basic info, follow / fan / interaction counts, and the notes found on the
 * page.
 *
 * The explore page is opened first, then the profile URL. Data is read from
 * `__INITIAL_STATE__` when it yields a profile; otherwise the rendered DOM is
 * scraped.
 *
 * @param page - A Playwright Page managed by BrowserManager.
 * @param userId - The user ID. It is URL-encoded before being placed in the
 *   profile URL path.
 * @param xsecToken - Security token required to access the profile page.
 *   Pass it unencoded; it is URL-encoded here.
 * @returns A UserProfile object with the user's data.
 */
export async function getUserProfile(
  page: Page,
  userId: string,
  xsecToken: string,
): Promise<UserProfile> {
  // Encode both dynamic parts so an unexpected character in either value
  // cannot change the path or the query string of the request.
  const url =
    `${USER_PROFILE_BASE_URL}/${encodeURIComponent(userId)}` +
    `?xsec_token=${encodeURIComponent(xsecToken)}&xsec_source=pc_feed`;
  log.debug({ userId, url }, 'Navigating to user profile page');

  // XHS applies stricter bot detection on profile pages than on search pages.
  // Visiting the explore page first establishes a natural session context that
  // allows the subsequent profile navigation to pass the IP-risk check.
  await page.goto('https://www.xiaohongshu.com/explore', { waitUntil: 'domcontentloaded' });
  await page.waitForTimeout(1000);
  await page.goto(url, { waitUntil: 'domcontentloaded' });

  // Wait for the user profile header to appear. A missing header is not
  // fatal: extraction is still attempted afterwards.
  try {
    await page.waitForSelector(SEL.headerContainer, { timeout: 15_000 });
  } catch {
    log.warn({ userId }, 'User profile header not found within timeout, proceeding');
  }

  // Allow render to settle.
  await page.waitForTimeout(1500);

  // Strategy 1: extract from __INITIAL_STATE__.
  const initialState = (await extractInitialState(page)) as RawProfileState | null;
  if (initialState) {
    const fromState = parseProfileFromState(initialState, userId, xsecToken);
    if (fromState) {
      // Same `feedCount` key as the DOM path below, so both outcomes can be
      // queried with one field name.
      log.info(
        { userId, feedCount: fromState.feeds.length },
        'Extracted user profile from __INITIAL_STATE__',
      );
      return fromState;
    }
    log.debug('__INITIAL_STATE__ found but no profile data extracted, falling back to DOM');
  }

  // Strategy 2: fall back to DOM scraping.
  log.debug({ userId }, 'Falling back to DOM scraping for user profile');
  const fromDom = await scrapeProfileFromDom(page, userId, xsecToken);
  log.info({ userId, feedCount: fromDom.feeds.length }, 'Extracted user profile from DOM');
  return fromDom;
}
// ---------------------------------------------------------------------------
// __INITIAL_STATE__ parsing
// ---------------------------------------------------------------------------
/**
 * Build a UserProfile from the page's `__INITIAL_STATE__` object.
 *
 * User info is taken from the first location that has it:
 * `user.userPageData.basicInfo`, then `user.userInfo`, then
 * `userProfile.userInfo`.
 *
 * @param state - The raw state object extracted from the page.
 * @param userId - Requested user ID; used when the state carries no ID and
 *   passed to the note parser as the fallback author ID.
 * @param _xsecToken - Currently unused; kept so the signature stays stable.
 * @returns The parsed profile, or `null` when no user info is present.
 */
function parseProfileFromState(
  state: RawProfileState,
  userId: string,
  _xsecToken: string,
): UserProfile | null {
  const userPageData = state.user?.userPageData;
  const userInfo =
    userPageData?.basicInfo ?? state.user?.userInfo ?? state.userProfile?.userInfo;
  if (!userInfo) {
    return null;
  }

  const avatar =
    userInfo.avatar ?? userInfo.avatarUrl ?? userInfo.avatar_url ?? userInfo.images ?? '';

  // Gender codes: 1 = male, 2 = female; anything else is reported as ''.
  let gender = '';
  if (userInfo.gender === 1 || userInfo.gender === '1') {
    gender = 'male';
  } else if (userInfo.gender === 2 || userInfo.gender === '2') {
    gender = 'female';
  }

  // Counts: the dedicated interactions record wins over the user info record.
  const interactions = userPageData?.interactions;

  // The state is page data that was only cast to RawProfileState, so `notes`
  // is not guaranteed to be an array (or to hold objects) at runtime. Treat
  // anything unexpected as "no note" instead of letting the parse throw.
  const notesCandidate: unknown = userPageData?.notes ?? state.userProfile?.notes;
  const rawNotes: unknown[] = Array.isArray(notesCandidate) ? notesCandidate : [];
  const feeds: Feed[] = [];
  for (const entry of rawNotes) {
    if (entry === null || typeof entry !== 'object') continue;
    const feed = parseProfileNote(entry as RawProfileNote, userId);
    if (feed !== null) feeds.push(feed);
  }

  return {
    id: userInfo.userId ?? userInfo.user_id ?? userId,
    nickname: userInfo.nickname ?? userInfo.nick_name ?? userInfo.nickName ?? '',
    avatar: avatar ? ensureHttps(avatar) : '',
    description: userInfo.desc ?? userInfo.description ?? '',
    gender,
    ipLocation: userInfo.ipLocation ?? userInfo.ip_location ?? '',
    follows: toNumber(interactions?.follows ?? userInfo.follows ?? 0),
    fans: toNumber(interactions?.fans ?? userInfo.fans ?? 0),
    interaction: toNumber(interactions?.interaction ?? userInfo.interaction ?? 0),
    feeds,
  };
}
/**
 * Convert one raw note entry from the profile state into a Feed.
 *
 * @param raw - The raw note entry.
 * @param ownerUserId - ID of the profile owner; used as the author ID when the
 *   note carries none.
 * @returns The Feed, or `null` when the entry has no usable note ID.
 */
function parseProfileNote(
  raw: RawProfileNote,
  ownerUserId: string,
): Feed | null {
  // `||` rather than `??` on purpose for the alias chains below: an empty
  // string means "missing" here (the checks that follow already treat it that
  // way), so it must not stop the search before a later alias is tried.
  const id = raw.id || raw.noteId || raw.note_id || '';
  if (!id) return null;

  const noteXsecToken = raw.xsecToken || raw.xsec_token || '';
  const title = raw.displayTitle || raw.display_title || raw.title || '';

  // The payload is untyped page data, so guard against a non-string `type`.
  const isVideo = typeof raw.type === 'string' && raw.type.toLowerCase().includes('video');

  // Cover image: direct URL aliases first, then the first info-list entry.
  let coverUrl = '';
  const cover = raw.cover;
  if (cover) {
    coverUrl =
      cover.url ||
      cover.urlPre ||
      cover.url_pre ||
      cover.urlDefault ||
      cover.url_default ||
      (cover.infoList ?? cover.info_list)?.[0]?.url ||
      '';
    if (coverUrl) coverUrl = ensureHttps(coverUrl);
  }

  // Like count: nested containers first, then the top-level aliases.
  // `toNumber` is used so a numeric value in the payload is accepted as well
  // as a string one.
  const interact = raw.interactInfo ?? raw.interact_info;
  const likeRaw =
    interact?.likedCount ??
    interact?.liked_count ??
    interact?.likeCount ??
    interact?.like_count ??
    raw.likedCount ??
    raw.liked_count ??
    '0';

  // Author; the avatar is normalised like the other image URLs in this file.
  const rawUser = raw.user;
  const authorAvatar = rawUser?.avatar ?? '';

  return {
    id,
    xsecToken: noteXsecToken,
    title,
    description: raw.desc ?? '',
    type: isVideo ? 'video' : 'normal',
    coverUrl,
    likeCount: toNumber(likeRaw),
    user: {
      id: rawUser?.userId ?? rawUser?.user_id ?? ownerUserId,
      nickname: rawUser?.nickname ?? rawUser?.nick_name ?? '',
      avatar: authorAvatar ? ensureHttps(authorAvatar) : '',
    },
  };
}
// ---------------------------------------------------------------------------
// DOM scraping fallback — uses Playwright Node-side API exclusively
// ---------------------------------------------------------------------------
/**
 * Scrape user profile data from the rendered DOM using Playwright's
 * Node-side APIs to avoid needing DOM lib types.
 *
 * Each field is read best-effort: a selector that is missing or fails to
 * evaluate yields an empty value ('' or 0) instead of an error.
 *
 * @param page - The Playwright page currently showing the profile.
 * @param userId - ID reported as the profile ID and as the author of every
 *   scraped note.
 * @param xsecToken - Profile token; used for a note when its link carries no
 *   token of its own.
 * @returns The scraped profile, including the notes found on the page.
 */
async function scrapeProfileFromDom(
  page: Page,
  userId: string,
  xsecToken: string,
): Promise<UserProfile> {
  // Compiled once here rather than on every feed card.
  const noteIdPattern =
    /\/(?:explore|search_result)\/([a-f0-9]+)|\/user\/profile\/[^/]+\/([a-f0-9]+)/;
  const tokenPattern = /xsec_token=([^&]+)/;

  // Trimmed text content of the first match, or '' when absent.
  const readText = (selector: string): Promise<string> =>
    page.$eval(selector, (el) => el.textContent?.trim() ?? '').catch(() => '');

  // Normalise image URLs the same way the __INITIAL_STATE__ path does.
  // Inline data/blob sources are left untouched, since they are not
  // http(s) URLs.
  const normalizeImageUrl = (rawUrl: string): string => {
    if (!rawUrl || /^(?:data|blob):/i.test(rawUrl)) return rawUrl;
    return ensureHttps(rawUrl);
  };

  // Tokens inside an href are percent-encoded. Decode them so that callers
  // which later apply encodeURIComponent do not double-encode the value.
  // A malformed sequence falls back to the raw text.
  const decodeToken = (encoded: string): string => {
    try {
      return decodeURIComponent(encoded);
    } catch {
      return encoded;
    }
  };

  const nickname = await readText(SEL.nickname);

  const avatarSrc = await page
    .$eval(SEL.avatar, (img) => img.getAttribute('src') ?? '')
    .catch(() => '');

  const description = await readText(SEL.description);

  // Gender is inferred from the icon's class list. The attribute is read
  // instead of `className`, which is not a string on SVG elements.
  const gender = await page
    .$eval(SEL.gender, (el) => {
      const cls = (el.getAttribute('class') ?? '').toLowerCase();
      // "female" contains "male", so it has to be tested first.
      if (cls.includes('female')) return 'female';
      if (cls.includes('male')) return 'male';
      return '';
    })
    .catch(() => '');

  const ipLocation = await readText(SEL.ipLocation);

  // Follower / following / interaction counts, read positionally from the
  // matched items: index 0 = follows, 1 = fans, 2 = interaction.
  const dataCounts = await page
    .$$eval(SEL.followCount, (items) =>
      items.map((item) => item.querySelector('.count')?.textContent?.trim() ?? '0'),
    )
    .catch(() => [] as string[]);

  // Scrape feed items on the profile page.
  const feedElements = await page.$$(SEL.feedItem);
  const feeds: Feed[] = [];
  for (const card of feedElements) {
    try {
      const href = await card
        .$eval('a.cover', (el) => el.getAttribute('href') ?? '')
        .catch(() => '');
      const idMatch = noteIdPattern.exec(href);
      const id = idMatch?.[1] ?? idMatch?.[2] ?? '';
      if (!id) continue;

      const tokenMatch = tokenPattern.exec(href);
      const noteXsecToken = tokenMatch?.[1] ? decodeToken(tokenMatch[1]) : '';

      const coverSrc = await card
        .$eval('a.cover img', (el) => el.getAttribute('src') ?? el.getAttribute('data-src') ?? '')
        .catch(() => '');
      const feedTitle = await card
        .$eval('.footer .title', (el) => el.textContent?.trim() ?? '')
        .catch(() => '');
      const likeText = await card
        .$eval('.footer .like-wrapper .count', (el) => el.textContent?.trim() ?? '0')
        .catch(() => '0');
      const playIcon = await card.$('.play-icon').catch(() => null);

      feeds.push({
        id,
        xsecToken: noteXsecToken || xsecToken,
        title: feedTitle,
        description: '',
        type: playIcon !== null ? 'video' : 'normal',
        coverUrl: normalizeImageUrl(coverSrc),
        likeCount: parseCountString(likeText),
        user: { id: userId, nickname: '', avatar: '' },
      });
    } catch (err: unknown) {
      // Best-effort: one broken card must not fail the whole profile.
      log.debug({ userId, err }, 'Failed to scrape a profile feed card, skipping');
      continue;
    }
  }

  return {
    id: userId,
    nickname,
    avatar: normalizeImageUrl(avatarSrc),
    description,
    gender,
    ipLocation,
    follows: parseCountString(dataCounts[0] ?? '0'),
    fans: parseCountString(dataCounts[1] ?? '0'),
    interaction: parseCountString(dataCounts[2] ?? '0'),
    feeds,
  };
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Normalise a count that may arrive either as a number or as text.
 * Numbers are returned unchanged; strings are handed to `parseCountString`.
 */
function toNumber(val: string | number): number {
  return typeof val === 'number' ? val : parseCountString(val);
}