426 lines
13 KiB
TypeScript
426 lines
13 KiB
TypeScript
import type { Page } from 'rebrowser-playwright';
|
|
|
|
import { logger } from '@social/core/utils/logger.js';
|
|
import { XHS_SELECTORS } from './selectors.js';
|
|
import { extractInitialState, parseCountString, ensureHttps } from './feeds.js';
|
|
import type { UserProfile, Feed } from './types.js';
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Constants
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/** URL prefix of a user profile page; the user ID is appended as the next path segment. */
const USER_PROFILE_BASE_URL = 'https://www.xiaohongshu.com/user/profile';

/** Selectors for the user profile page, taken from the shared selector table. */
const { userProfile: SEL } = XHS_SELECTORS;

/** Child logger that tags every entry from this module. */
const log = logger.child({ module: 'xhs-user-profile' });
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// __INITIAL_STATE__ raw types for user profile
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * The slice of the page's `__INITIAL_STATE__` object that this module reads.
 *
 * Every field is optional because the value comes straight from the page and
 * is cast, not validated. User data is looked up in several places; see
 * `parseProfileFromState` for the order in which they are consulted.
 */
interface RawProfileState {
  /** Any other top-level state keys; they are not read by this module. */
  [key: string]: unknown;

  /** `user` branch of the state. */
  user?: {
    /** Aggregated profile-page payload (basic info, interactions, notes). */
    userPageData?: RawUserPageData;
    /** User info stored directly under `user`. */
    userInfo?: RawUserInfo;
  };

  /** `userProfile` branch of the state, used as a fallback source. */
  userProfile?: {
    userInfo?: RawUserInfo;
    notes?: RawProfileNote[];
  };
}
|
|
|
|
/**
 * Raw shape of `user.userPageData` inside `__INITIAL_STATE__`.
 *
 * All members are optional; the note-count fields exist in both camelCase and
 * snake_case spellings and may be numeric or string-encoded.
 */
interface RawUserPageData {
  /** Identity fields of the profile owner. */
  basicInfo?: RawUserInfo;
  /** Follow / fan / interaction counters. */
  interactions?: RawInteractions;
  /** Notes listed on the profile page. */
  notes?: RawProfileNote[];

  // Note count, in either spelling.
  noteCount?: string | number;
  note_count?: string | number;
}
|
|
|
|
/**
 * Raw user-info record as it may appear in `__INITIAL_STATE__`.
 *
 * The same logical field is declared under several spellings (camelCase,
 * snake_case and other aliases); `parseProfileFromState` coalesces them.
 * Counters may be numeric or string-encoded.
 */
interface RawUserInfo {
  // User identifier.
  userId?: string;
  user_id?: string;

  // Display name.
  nickname?: string;
  nick_name?: string;
  nickName?: string;

  // Avatar image URL.
  avatar?: string;
  avatarUrl?: string;
  avatar_url?: string;
  images?: string;

  // Bio text.
  desc?: string;
  description?: string;

  /** Gender code; the parser maps 1 to 'male' and 2 to 'female', anything else to ''. */
  gender?: string | number;

  // IP-based location label.
  ipLocation?: string;
  ip_location?: string;

  /** Declared for completeness; not read by the parsers in this module. */
  fstatus?: string;

  // Counters, used when `userPageData.interactions` does not provide them.
  follows?: string | number;
  fans?: string | number;
  interaction?: string | number;

  // Note count, in either spelling.
  noteCount?: string | number;
  note_count?: string | number;
}
|
|
|
|
/**
 * Raw counter block from `userPageData.interactions`.
 * Each counter may be a number or a string.
 */
interface RawInteractions {
  /** Number of accounts the user follows. */
  follows?: number | string;
  /** Number of followers. */
  fans?: number | string;
  /** Interaction total as reported by the page. */
  interaction?: number | string;
}
|
|
|
|
/**
 * Raw note entry from the profile section of `__INITIAL_STATE__`.
 *
 * Fields are declared under every spelling the parser looks for (camelCase
 * and snake_case); `parseProfileNote` picks one value per logical field.
 */
interface RawProfileNote {
  // Note identifier.
  id?: string;
  noteId?: string;
  note_id?: string;

  // Per-note security token.
  xsecToken?: string;
  xsec_token?: string;

  // Title and body text.
  displayTitle?: string;
  display_title?: string;
  title?: string;
  desc?: string;

  /** Note type; the parser treats any value containing "video" as a video note. */
  type?: string;

  /** Cover image, as direct URL fields and/or a list of URL entries. */
  cover?: {
    url?: string;
    urlPre?: string;
    url_pre?: string;
    urlDefault?: string;
    url_default?: string;
    infoList?: Array<{ url?: string }>;
    info_list?: Array<{ url?: string }>;
  };

  /** Author of the note. */
  user?: {
    userId?: string;
    user_id?: string;
    nickname?: string;
    nick_name?: string;
    avatar?: string;
  };

  /** Like counters (camelCase container). */
  interactInfo?: {
    likedCount?: string;
    liked_count?: string;
    likeCount?: string;
    like_count?: string;
  };

  /** Like counters (snake_case container). */
  interact_info?: {
    likedCount?: string;
    liked_count?: string;
    likeCount?: string;
    like_count?: string;
  };

  // Like counters placed directly on the note.
  likedCount?: string;
  liked_count?: string;
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// getUserProfile
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Navigate to a Xiaohongshu user profile page and extract the user's
 * information: basic info, follower/following counts, and recent notes.
 *
 * The explore page is opened first, then the profile page. Data is taken
 * from `__INITIAL_STATE__` when it yields a profile; otherwise the rendered
 * DOM is scraped.
 *
 * @param page - A Playwright Page managed by BrowserManager.
 * @param userId - The user ID. It is percent-encoded before being placed in
 *   the URL path.
 * @param xsecToken - Security token required to access the profile page.
 * @returns A UserProfile object with the user's data.
 */
export async function getUserProfile(
  page: Page,
  userId: string,
  xsecToken: string,
): Promise<UserProfile> {
  // Encode both caller-supplied parts so that characters such as '/', '?' or
  // '#' in either value cannot change the shape of the URL.
  const url =
    `${USER_PROFILE_BASE_URL}/${encodeURIComponent(userId)}` +
    `?xsec_token=${encodeURIComponent(xsecToken)}&xsec_source=pc_feed`;
  log.debug({ userId, url }, 'Navigating to user profile page');

  // XHS applies stricter bot detection on profile pages than on search pages.
  // Visiting the explore page first establishes a natural session context that
  // allows the subsequent profile navigation to pass the IP-risk check.
  await page.goto('https://www.xiaohongshu.com/explore', { waitUntil: 'domcontentloaded' });
  await page.waitForTimeout(1000);

  await page.goto(url, { waitUntil: 'domcontentloaded' });

  // Wait for the user profile header to appear. A timeout is deliberately
  // non-fatal: extraction is still attempted with whatever has loaded.
  await page
    .waitForSelector(SEL.headerContainer, { timeout: 15_000 })
    .catch(() => {
      log.warn({ userId }, 'User profile header not found within timeout, proceeding');
    });

  // Allow render to settle.
  await page.waitForTimeout(1500);

  // -----------------------------------------------------------------------
  // Strategy 1: Extract from __INITIAL_STATE__
  // -----------------------------------------------------------------------
  const initialState = await extractInitialState(page) as RawProfileState | null;

  if (initialState) {
    const stateProfile = parseProfileFromState(initialState, userId, xsecToken);
    if (stateProfile) {
      // Same `feedCount` key as the DOM branch so both log lines can be queried alike.
      log.info(
        { userId, feedCount: stateProfile.feeds.length },
        'Extracted user profile from __INITIAL_STATE__',
      );
      return stateProfile;
    }
    log.debug('__INITIAL_STATE__ found but no profile data extracted, falling back to DOM');
  }

  // -----------------------------------------------------------------------
  // Strategy 2: Fall back to DOM scraping
  // -----------------------------------------------------------------------
  log.debug({ userId }, 'Falling back to DOM scraping for user profile');
  const domProfile = await scrapeProfileFromDom(page, userId, xsecToken);
  log.info({ userId, feedCount: domProfile.feeds.length }, 'Extracted user profile from DOM');
  return domProfile;
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// __INITIAL_STATE__ parsing
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Parse user profile data from `__INITIAL_STATE__`.
 *
 * User info is looked up in `user.userPageData.basicInfo`, then
 * `user.userInfo`, then `userProfile.userInfo`.
 *
 * @param state - The page state, cast but not validated by the caller.
 * @param userId - Requested user ID; used when the state carries no usable ID
 *   and as the owner ID for notes without their own user.
 * @param _xsecToken - Unused; kept so the signature matches the DOM scraper.
 * @returns The parsed profile, or `null` when no user info is present.
 */
function parseProfileFromState(
  state: RawProfileState,
  userId: string,
  _xsecToken: string,
): UserProfile | null {
  const userPageData = state.user?.userPageData;
  const userInfo =
    userPageData?.basicInfo ??
    state.user?.userInfo ??
    state.userProfile?.userInfo;

  if (!userInfo) {
    return null;
  }

  // `||` rather than `??` on purpose: an empty string under one spelling must
  // not hide a populated value under another spelling.
  const id = userInfo.userId || userInfo.user_id || userId;
  const nickname = userInfo.nickname || userInfo.nick_name || userInfo.nickName || '';
  const avatar =
    userInfo.avatar || userInfo.avatarUrl || userInfo.avatar_url || userInfo.images || '';
  const description = userInfo.desc || userInfo.description || '';
  const ipLocation = userInfo.ipLocation || userInfo.ip_location || '';

  // Gender: 0=unknown, 1=male, 2=female
  const genderRaw = userInfo.gender;
  let gender = '';
  if (genderRaw === 1 || genderRaw === '1') gender = 'male';
  else if (genderRaw === 2 || genderRaw === '2') gender = 'female';

  // Follower / following / interaction counts. `??` is correct here because
  // 0 is a legitimate value that must not be skipped.
  const interactions = userPageData?.interactions;
  const follows = toNumber(interactions?.follows ?? userInfo.follows ?? 0);
  const fans = toNumber(interactions?.fans ?? userInfo.fans ?? 0);
  const interaction = toNumber(interactions?.interaction ?? userInfo.interaction ?? 0);

  // Notes / feeds on the profile page. The state is unvalidated page data, so
  // only real arrays and object entries are accepted. Anything else is
  // skipped, because throwing here would abort the caller before it can fall
  // back to DOM scraping.
  const rawNotes: RawProfileNote[] =
    [userPageData?.notes, state.userProfile?.notes].find((candidate) =>
      Array.isArray(candidate),
    ) ?? [];
  const feeds = rawNotes
    .filter((note): note is RawProfileNote => typeof note === 'object' && note !== null)
    .map((note) => parseProfileNote(note, userId))
    .filter((feed): feed is Feed => feed !== null);

  return {
    id,
    nickname,
    avatar: avatar ? ensureHttps(avatar) : '',
    description,
    gender,
    ipLocation,
    follows,
    fans,
    interaction,
    feeds,
  };
}
|
|
|
|
/**
 * Parse a note from the user profile state into a Feed object.
 *
 * @param raw - Raw note entry; each logical field may appear under several
 *   spellings.
 * @param ownerUserId - User ID to attribute the note to when the note itself
 *   carries no user ID.
 * @returns The Feed, or `null` when the note has no usable ID.
 */
function parseProfileNote(
  raw: RawProfileNote,
  ownerUserId: string,
): Feed | null {
  // String aliases are coalesced with `||` rather than `??` on purpose: an
  // empty string under one spelling must not hide a populated value under
  // another spelling.
  const id = raw.id || raw.noteId || raw.note_id || '';
  if (!id) return null;

  const noteXsecToken = raw.xsecToken || raw.xsec_token || '';
  const title = raw.displayTitle || raw.display_title || raw.title || '';
  const description = raw.desc ?? '';
  const type: 'normal' | 'video' =
    (raw.type ?? '').toLowerCase().includes('video') ? 'video' : 'normal';

  // Cover image: direct URL fields first, then the first info-list entry.
  const cover = raw.cover;
  const rawCoverUrl =
    cover?.url ||
    cover?.urlPre ||
    cover?.url_pre ||
    cover?.urlDefault ||
    cover?.url_default ||
    (cover?.infoList ?? cover?.info_list)?.[0]?.url ||
    '';
  const coverUrl = rawCoverUrl ? ensureHttps(rawCoverUrl) : '';

  // Like count: nested containers first, then the fields on the note itself.
  const interact = raw.interactInfo ?? raw.interact_info;
  const likeCountStr =
    interact?.likedCount ||
    interact?.liked_count ||
    interact?.likeCount ||
    interact?.like_count ||
    raw.likedCount ||
    raw.liked_count ||
    '0';
  const likeCount = parseCountString(likeCountStr);

  // Author. The avatar gets the same https normalisation as the cover URL.
  const rawUser = raw.user;
  const user = {
    id: rawUser?.userId || rawUser?.user_id || ownerUserId,
    nickname: rawUser?.nickname || rawUser?.nick_name || '',
    avatar: rawUser?.avatar ? ensureHttps(rawUser.avatar) : '',
  };

  return {
    id,
    xsecToken: noteXsecToken,
    title,
    description,
    type,
    coverUrl,
    likeCount,
    user,
  };
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// DOM scraping fallback — uses Playwright Node-side API exclusively
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Scrape user profile data from the rendered DOM using Playwright's
 * Node-side APIs to avoid needing DOM lib types.
 *
 * Every lookup is best-effort: a missing element yields an empty string or a
 * zero count, and a feed card that cannot be parsed is skipped.
 *
 * @param page - Page already showing the profile.
 * @param userId - Used as the profile ID and as the author ID of every feed.
 * @param xsecToken - Profile token, used for a feed when its own link carries
 *   no `xsec_token`.
 * @returns The scraped profile.
 */
async function scrapeProfileFromDom(
  page: Page,
  userId: string,
  xsecToken: string,
): Promise<UserProfile> {
  // Trimmed text of the first element matching `selector`, or '' when absent.
  const readText = (selector: string): Promise<string> =>
    page
      .$eval(selector, (el) => el.textContent?.trim() ?? '')
      .catch(() => '');

  // Nickname
  const nickname = await readText(SEL.nickname);

  // Avatar
  const avatar = await page
    .$eval(SEL.avatar, (img) => img.getAttribute('src') ?? '')
    .catch(() => '');

  // Description / bio
  const description = await readText(SEL.description);

  // Gender — derived from the gender icon's class names.
  const gender = await page
    .$eval(SEL.gender, (el) => {
      // Read the attribute, not `className`: on SVG elements `className` is
      // an SVGAnimatedString, so calling `.toLowerCase()` on it would throw.
      const cls = (el.getAttribute('class') ?? '').toLowerCase();
      // 'female' contains 'male', so it has to be tested first.
      if (cls.includes('female')) return 'female';
      if (cls.includes('male')) return 'male';
      return '';
    })
    .catch(() => '');

  // IP location
  const ipLocation = await readText(SEL.ipLocation);

  // Follower / following / interaction counts.
  // These are typically in a row of .data-item elements.
  const dataCounts = await page
    .$$eval(SEL.followCount, (items) =>
      items.map((item) => item.querySelector('.count')?.textContent?.trim() ?? '0'),
    )
    .catch(() => [] as string[]);

  const follows = parseCountString(dataCounts[0] ?? '0');
  const fans = parseCountString(dataCounts[1] ?? '0');
  const interaction = parseCountString(dataCounts[2] ?? '0');

  // Scrape feed items on the profile page.
  const feedElements = await page.$$(SEL.feedItem);
  const feeds: Feed[] = [];

  // Built once, outside the loop. Neither pattern is global, so reuse is safe.
  const noteIdPattern =
    /\/(?:explore|search_result)\/([a-f0-9]+)|\/user\/profile\/[^/]+\/([a-f0-9]+)/;
  const tokenPattern = /xsec_token=([^&]+)/;

  for (const card of feedElements) {
    try {
      const href = await card
        .$eval('a.cover', (el) => el.getAttribute('href') ?? '')
        .catch(() => '');

      const idMatch = href.match(noteIdPattern);
      const id = idMatch?.[1] ?? idMatch?.[2] ?? '';
      if (!id) continue;

      // The token is taken from a URL, so it may be percent-encoded. Decode it
      // here; otherwise a caller that encodes it again (as getUserProfile
      // does) would double-encode it. Malformed escapes keep the raw value.
      let noteXsecToken = href.match(tokenPattern)?.[1] ?? '';
      try {
        noteXsecToken = decodeURIComponent(noteXsecToken);
      } catch {
        // Not valid percent-encoding; use the token exactly as found.
      }

      const coverUrl = await card
        .$eval('a.cover img', (el) => el.getAttribute('src') ?? el.getAttribute('data-src') ?? '')
        .catch(() => '');

      const feedTitle = await card
        .$eval('.footer .title', (el) => el.textContent?.trim() ?? '')
        .catch(() => '');

      const likeText = await card
        .$eval('.footer .like-wrapper .count', (el) => el.textContent?.trim() ?? '0')
        .catch(() => '0');

      const playIcon = await card.$('.play-icon').catch(() => null);

      feeds.push({
        id,
        xsecToken: noteXsecToken || xsecToken,
        title: feedTitle,
        description: '',
        type: playIcon !== null ? 'video' : 'normal',
        coverUrl,
        likeCount: parseCountString(likeText),
        user: { id: userId, nickname: '', avatar: '' },
      });
    } catch (err: unknown) {
      // Best-effort: one bad card must not lose the rest, but leave a trace.
      log.debug({ userId, err }, 'Skipping profile feed card that could not be parsed');
      continue;
    }
  }

  return {
    id: userId,
    nickname,
    avatar,
    description,
    gender,
    ipLocation,
    follows,
    fans,
    interaction,
    feeds,
  };
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Convert a string or number to a number, handling abbreviations.
 *
 * @param val - A numeric count, or a count string that is handed to
 *   `parseCountString`.
 * @returns The count. A non-finite numeric input (NaN, ±Infinity) yields 0,
 *   so that such values coming from unvalidated page state do not end up in
 *   count fields.
 */
function toNumber(val: string | number): number {
  if (typeof val === 'number') {
    return Number.isFinite(val) ? val : 0;
  }
  return parseCountString(val);
}
|