重构为Monorepo:拆分xhs/xhh应用与core包并完成双服务部署改造
This commit is contained in:
@@ -0,0 +1,425 @@
|
||||
import type { Page } from 'rebrowser-playwright';
|
||||
|
||||
import { logger } from '@social/core/utils/logger.js';
|
||||
import { XHS_SELECTORS } from './selectors.js';
|
||||
import { extractInitialState, parseCountString, ensureHttps } from './feeds.js';
|
||||
import type { UserProfile, Feed } from './types.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Constants
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Base URL of a user profile page; the user ID is appended as a path segment. */
const USER_PROFILE_BASE_URL = 'https://www.xiaohongshu.com/user/profile';

/** Selectors for the user profile page, taken from the shared XHS selector table. */
const SEL = XHS_SELECTORS.userProfile;

/** Logger derived via `logger.child` with the `module: 'xhs-user-profile'` binding. */
const log = logger.child({ module: 'xhs-user-profile' });
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// __INITIAL_STATE__ raw types for user profile
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Loosely-typed view of the page's `__INITIAL_STATE__`, limited to the
 * branches this module inspects when looking for profile data.
 *
 * Every branch is optional because the state layout is not guaranteed; the
 * parser probes each known location in turn.
 */
interface RawProfileState {
  /** `user` branch: page-level profile data and/or a bare user-info record. */
  user?: {
    userPageData?: RawUserPageData;
    userInfo?: RawUserInfo;
  };
  /** Alternative `userProfile` branch carrying user info and a flat note list. */
  userProfile?: {
    userInfo?: RawUserInfo;
    notes?: RawProfileNote[];
  };
  /** Any other state branch is left untyped. */
  [key: string]: unknown;
}
|
||||
|
||||
/**
 * Shape of `user.userPageData` inside `__INITIAL_STATE__`.
 *
 * The note count appears under both camelCase and snake_case keys, so both
 * spellings are declared.
 */
interface RawUserPageData {
  /** Basic user info; the first location the parser looks at for identity fields. */
  basicInfo?: RawUserInfo;
  /** Follow / fan / interaction counters; preferred over the same fields on the user info. */
  interactions?: RawInteractions;
  /** Notes shown on the profile page. */
  notes?: RawProfileNote[];
  noteCount?: number | string;
  note_count?: number | string;
}
|
||||
|
||||
/**
 * Raw user-info record as it may appear in `__INITIAL_STATE__`.
 *
 * The same logical field is declared under several key spellings (camelCase,
 * snake_case, and a few aliases); the parser takes the first usable one.
 * All fields are optional.
 */
interface RawUserInfo {
  // User identifier.
  userId?: string;
  user_id?: string;

  // Display name.
  nickname?: string;
  nick_name?: string;
  nickName?: string;

  // Avatar URL (`images` is treated as one more alias by the parser).
  avatar?: string;
  avatarUrl?: string;
  avatar_url?: string;
  images?: string;

  // Bio text.
  desc?: string;
  description?: string;

  /** Gender code; the parser maps 1 / '1' to male and 2 / '2' to female. */
  gender?: number | string;

  // IP-derived location label.
  ipLocation?: string;
  ip_location?: string;

  /** Declared for completeness; not read anywhere in the visible code. */
  fstatus?: string;

  // Counters; numeric or string-encoded.
  follows?: number | string;
  fans?: number | string;
  interaction?: number | string;
  noteCount?: number | string;
  note_count?: number | string;
}
|
||||
|
||||
/**
 * Counter block read from `userPageData.interactions`.
 *
 * Each value may be a number or a string; the parser converts them with
 * `toNumber`.
 */
interface RawInteractions {
  follows?: string | number;
  fans?: string | number;
  interaction?: string | number;
}
|
||||
|
||||
/**
 * Like-count fields of a note, under the key spellings the parser accepts.
 * Shared by `interactInfo` and `interact_info` on {@link RawProfileNote}.
 */
type RawNoteLikeCounts = {
  likedCount?: string;
  liked_count?: string;
  likeCount?: string;
  like_count?: string;
};

/**
 * Raw note entry as it may appear in the profile state.
 *
 * Like the other raw types, each logical field is declared under every key
 * spelling the parser accepts, and everything is optional.
 */
interface RawProfileNote {
  // Note identifier.
  id?: string;
  noteId?: string;
  note_id?: string;

  // Per-note security token.
  xsecToken?: string;
  xsec_token?: string;

  // Title and body text.
  displayTitle?: string;
  display_title?: string;
  title?: string;
  desc?: string;

  /** Note type; the parser treats any value containing "video" (case-insensitive) as a video. */
  type?: string;

  /** Cover image: several direct URL variants plus a list of alternatives. */
  cover?: {
    url?: string;
    urlPre?: string;
    url_pre?: string;
    urlDefault?: string;
    url_default?: string;
    infoList?: Array<{ url?: string }>;
    info_list?: Array<{ url?: string }>;
  };

  /** Author of the note. */
  user?: {
    userId?: string;
    user_id?: string;
    nickname?: string;
    nick_name?: string;
    avatar?: string;
  };

  // Like counts, nested under either key spelling...
  interactInfo?: RawNoteLikeCounts;
  interact_info?: RawNoteLikeCounts;

  // ...or directly on the note.
  likedCount?: string;
  liked_count?: string;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// getUserProfile
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Navigate to a Xiaohongshu user profile page and extract the user's
 * information: basic info, follower/following counts, and the notes listed
 * on the page.
 *
 * Extraction is attempted from the page's `__INITIAL_STATE__` first. If the
 * state is missing or yields no profile, the rendered DOM is scraped instead.
 *
 * @param page - A Playwright Page managed by BrowserManager.
 * @param userId - The user ID.
 * @param xsecToken - Security token required to access the profile page.
 * @returns A UserProfile object with the user's data.
 */
export async function getUserProfile(
  page: Page,
  userId: string,
  xsecToken: string,
): Promise<UserProfile> {
  // Both caller-supplied values are encoded so a stray `/`, `?` or `#` cannot
  // change the structure of the URL.
  const url =
    `${USER_PROFILE_BASE_URL}/${encodeURIComponent(userId)}` +
    `?xsec_token=${encodeURIComponent(xsecToken)}&xsec_source=pc_feed`;
  log.debug({ userId, url }, 'Navigating to user profile page');

  // XHS applies stricter bot detection on profile pages than on search pages.
  // Visiting the explore page first establishes a natural session context that
  // allows the subsequent profile navigation to pass the IP-risk check.
  await page.goto('https://www.xiaohongshu.com/explore', { waitUntil: 'domcontentloaded' });
  await page.waitForTimeout(1000);

  await page.goto(url, { waitUntil: 'domcontentloaded' });

  // Wait for the profile header. A timeout is not fatal: the state object may
  // still be present, so extraction is attempted regardless.
  await page
    .waitForSelector(SEL.headerContainer, { timeout: 15_000 })
    .catch(() => {
      log.warn({ userId }, 'User profile header not found within timeout, proceeding');
    });

  // Allow render to settle.
  await page.waitForTimeout(1500);

  // Strategy 1: extract from __INITIAL_STATE__.
  const initialState = await extractInitialState(page) as RawProfileState | null;
  if (initialState) {
    const fromState = parseProfileFromState(initialState, userId, xsecToken);
    if (fromState) {
      log.info(
        { userId, feedCount: fromState.feeds.length },
        'Extracted user profile from __INITIAL_STATE__',
      );
      return fromState;
    }
    log.debug('__INITIAL_STATE__ found but no profile data extracted, falling back to DOM');
  }

  // Strategy 2: fall back to DOM scraping.
  log.debug({ userId }, 'Falling back to DOM scraping for user profile');
  const fromDom = await scrapeProfileFromDom(page, userId, xsecToken);
  log.info({ userId, feedCount: fromDom.feeds.length }, 'Extracted user profile from DOM');
  return fromDom;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// __INITIAL_STATE__ parsing
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Parse user profile data from `__INITIAL_STATE__`.
 *
 * User info is looked up in `user.userPageData.basicInfo`, then
 * `user.userInfo`, then `userProfile.userInfo`. String fields take the first
 * non-empty alias, so an empty value under one key spelling does not hide a
 * populated value under another. Counters prefer `userPageData.interactions`
 * and fall back to the user-info record.
 *
 * @param state - The raw state object extracted from the page.
 * @param userId - Requested user ID; used when the state carries none and as
 *   the fallback author ID for notes.
 * @param _xsecToken - Unused; kept so the signature mirrors the DOM scraper.
 * @returns The parsed profile, or `null` when no user-info record is found.
 */
function parseProfileFromState(
  state: RawProfileState,
  userId: string,
  _xsecToken: string,
): UserProfile | null {
  // Try multiple known locations for user data.
  const userPageData = state.user?.userPageData;
  const userInfo =
    userPageData?.basicInfo ??
    state.user?.userInfo ??
    state.userProfile?.userInfo;

  if (!userInfo) {
    return null;
  }

  // `||` is deliberate for the string fields: an empty string means "absent".
  const id = userInfo.userId || userInfo.user_id || userId;
  const nickname = userInfo.nickname || userInfo.nick_name || userInfo.nickName || '';
  const avatar =
    userInfo.avatar || userInfo.avatarUrl || userInfo.avatar_url || userInfo.images || '';
  const description = userInfo.desc || userInfo.description || '';
  const ipLocation = userInfo.ipLocation || userInfo.ip_location || '';

  // Gender: 0=unknown, 1=male, 2=female
  const genderRaw = userInfo.gender;
  let gender = '';
  if (genderRaw === 1 || genderRaw === '1') {
    gender = 'male';
  } else if (genderRaw === 2 || genderRaw === '2') {
    gender = 'female';
  }

  // Counters keep `??` because 0 is a legitimate value.
  const interactions = userPageData?.interactions;
  const follows = toNumber(interactions?.follows ?? userInfo.follows ?? 0);
  const fans = toNumber(interactions?.fans ?? userInfo.fans ?? 0);
  const interaction = toNumber(interactions?.interaction ?? userInfo.interaction ?? 0);

  // Notes / feeds on the profile page; entries without an ID are dropped.
  const rawNotes: RawProfileNote[] =
    userPageData?.notes ?? state.userProfile?.notes ?? [];
  const feeds = rawNotes
    .map((note) => parseProfileNote(note, userId))
    .filter((f): f is Feed => f !== null);

  return {
    id,
    nickname,
    avatar: avatar ? ensureHttps(avatar) : '',
    description,
    gender,
    ipLocation,
    follows,
    fans,
    interaction,
    feeds,
  };
}
|
||||
|
||||
/**
 * Parse a note from the user profile state into a Feed object.
 *
 * Each string field takes the first non-empty value among its known key
 * spellings. The cover URL tries the direct URL variants first and then the
 * first info-list entry that has a URL. Cover and author avatar URLs are
 * passed through `ensureHttps`.
 *
 * @param raw - The raw note entry from the state.
 * @param ownerUserId - Author ID to use when the note carries no user ID.
 * @returns The parsed feed, or `null` when the note has no ID.
 */
function parseProfileNote(
  raw: RawProfileNote,
  ownerUserId: string,
): Feed | null {
  // `||` is deliberate throughout: an empty string means "absent".
  const id = raw.id || raw.noteId || raw.note_id || '';
  if (!id) return null;

  const noteXsecToken = raw.xsecToken || raw.xsec_token || '';
  const title = raw.displayTitle || raw.display_title || raw.title || '';
  const description = raw.desc ?? '';
  const type: 'normal' | 'video' =
    (raw.type ?? '').toLowerCase().includes('video') ? 'video' : 'normal';

  // Cover image.
  let coverUrl = '';
  const cover = raw.cover;
  if (cover) {
    // Consider both list spellings so an empty `infoList` cannot hide `info_list`.
    const listedUrl = [...(cover.infoList ?? []), ...(cover.info_list ?? [])]
      .find((entry) => entry?.url)?.url;
    coverUrl =
      cover.url || cover.urlPre || cover.url_pre ||
      cover.urlDefault || cover.url_default ||
      listedUrl || '';
    if (coverUrl) coverUrl = ensureHttps(coverUrl);
  }

  // Like count: nested interaction info first, then the note itself.
  const interact = raw.interactInfo ?? raw.interact_info;
  const likeCountStr =
    interact?.likedCount || interact?.liked_count ||
    interact?.likeCount || interact?.like_count ||
    raw.likedCount || raw.liked_count || '0';
  const likeCount = parseCountString(likeCountStr);

  // Author; falls back to the profile owner when the note carries no user ID.
  const author = raw.user;
  const authorAvatar = author?.avatar || '';
  const user = {
    id: author?.userId || author?.user_id || ownerUserId,
    nickname: author?.nickname || author?.nick_name || '',
    avatar: authorAvatar ? ensureHttps(authorAvatar) : '',
  };

  return {
    id,
    xsecToken: noteXsecToken,
    title,
    description,
    type,
    coverUrl,
    likeCount,
    user,
  };
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// DOM scraping fallback — uses Playwright Node-side API exclusively
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Scrape user profile data from the rendered DOM using Playwright's
 * Node-side APIs to avoid needing DOM lib types.
 *
 * Every lookup is best-effort: a missing element yields an empty string (or
 * zero for counters) instead of an error, so the function always resolves
 * with a profile object.
 *
 * @param page - Page already showing the user's profile.
 * @param userId - User ID; used as the profile ID and as each feed's author ID.
 * @param xsecToken - Profile token; used for a feed when its link carries none.
 * @returns The scraped profile.
 */
async function scrapeProfileFromDom(
  page: Page,
  userId: string,
  xsecToken: string,
): Promise<UserProfile> {
  // Patterns are created once here rather than on every loop iteration.
  const noteIdPattern =
    /\/(?:explore|search_result)\/([a-f0-9]+)|\/user\/profile\/[^/]+\/([a-f0-9]+)/;
  const tokenPattern = /xsec_token=([^&#]+)/;

  // A token taken from an href is percent-encoded. Decode it so callers that
  // re-encode it when building URLs do not double-encode. Malformed escapes
  // are returned as-is.
  const decodeToken = (encoded: string): string => {
    try {
      return decodeURIComponent(encoded);
    } catch {
      return encoded;
    }
  };

  // Nickname
  const nickname = await page
    .$eval(SEL.nickname, (el) => el.textContent?.trim() ?? '')
    .catch(() => '');

  // Avatar
  const avatar = await page
    .$eval(SEL.avatar, (img) => img.getAttribute('src') ?? '')
    .catch(() => '');

  // Description / bio
  const description = await page
    .$eval(SEL.description, (el) => el.textContent?.trim() ?? '')
    .catch(() => '');

  // Gender: inferred from the icon's class list. The `class` attribute is read
  // instead of `className`, which is an SVGAnimatedString (not a string) on
  // SVG elements. "female" is tested first because it contains "male".
  const gender = await page
    .$eval(SEL.gender, (el) => {
      const cls = (el.getAttribute('class') ?? '').toLowerCase();
      if (cls.includes('female')) return 'female';
      if (cls.includes('male')) return 'male';
      return '';
    })
    .catch(() => '');

  // IP location
  const ipLocation = await page
    .$eval(SEL.ipLocation, (el) => el.textContent?.trim() ?? '')
    .catch(() => '');

  // Follower / following / interaction counts.
  // These are typically in a row of .data-item elements, read positionally.
  const dataCounts = await page
    .$$eval(SEL.followCount, (items) =>
      items.map((item) => item.querySelector('.count')?.textContent?.trim() ?? '0'),
    )
    .catch(() => [] as string[]);

  const follows = parseCountString(dataCounts[0] ?? '0');
  const fans = parseCountString(dataCounts[1] ?? '0');
  const interaction = parseCountString(dataCounts[2] ?? '0');

  // Scrape feed items on the profile page.
  const feedElements = await page.$$(SEL.feedItem);
  const feeds: Feed[] = [];

  for (const card of feedElements) {
    try {
      const href = await card
        .$eval('a.cover', (el) => el.getAttribute('href') ?? '')
        .catch(() => '');

      const idMatch = href.match(noteIdPattern);
      const id = idMatch?.[1] ?? idMatch?.[2] ?? '';
      if (!id) continue;

      const rawToken = href.match(tokenPattern)?.[1] ?? '';
      const noteXsecToken = rawToken ? decodeToken(rawToken) : '';

      const coverUrl = await card
        .$eval('a.cover img', (el) => el.getAttribute('src') ?? el.getAttribute('data-src') ?? '')
        .catch(() => '');

      const feedTitle = await card
        .$eval('.footer .title', (el) => el.textContent?.trim() ?? '')
        .catch(() => '');

      const likeText = await card
        .$eval('.footer .like-wrapper .count', (el) => el.textContent?.trim() ?? '0')
        .catch(() => '0');

      const hasVideoIcon = await card
        .$('.play-icon')
        .then((el) => el !== null)
        .catch(() => false);

      feeds.push({
        id,
        xsecToken: noteXsecToken || xsecToken,
        title: feedTitle,
        description: '',
        type: hasVideoIcon ? 'video' : 'normal',
        coverUrl,
        likeCount: parseCountString(likeText),
        user: { id: userId, nickname: '', avatar: '' },
      });
    } catch (err) {
      // One bad card must not abort the scrape; log it instead of swallowing it.
      log.debug({ userId, err }, 'Skipping profile feed card that could not be scraped');
    } finally {
      // Release the element handle so the page can garbage-collect the node.
      await card.dispose().catch(() => undefined);
    }
  }

  return {
    id: userId,
    nickname,
    avatar,
    description,
    gender,
    ipLocation,
    follows,
    fans,
    interaction,
    feeds,
  };
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Convert a count that may be a number or a string into a number.
 *
 * Numbers are returned unchanged. Strings are delegated to
 * `parseCountString`, which is what deals with abbreviated counts.
 *
 * @param val - The raw count.
 * @returns The numeric count.
 */
function toNumber(val: string | number): number {
  return typeof val === 'number' ? val : parseCountString(val);
}
|
||||
Reference in New Issue
Block a user