feat: social-mcp 初始实现

多平台社交自动化 MCP 服务,首批支持小红书。

- 13 个 MCP 工具:登录管理、内容浏览、发布、互动
- 13 个 REST API 端点,支持 Bearer token 认证和限流
- BrowserManager:串行队列、背压、崩溃恢复
- Cookie 持久化:原子写入、0600 权限
- 安全:DNS rebinding 防御、错误脱敏、深层日志 redact
- Docker 部署支持
- 28 个单元测试全部通过
This commit is contained in:
2026-02-28 22:57:22 +08:00
commit 8da5f40c9f
38 changed files with 11273 additions and 0 deletions
+374
View File
@@ -0,0 +1,374 @@
import {
chromium,
type Browser,
type BrowserContext,
type Page,
} from 'rebrowser-playwright';
import { config } from '../config/index.js';
import { cookieStore } from '../cookie/store.js';
import { logger } from '../utils/logger.js';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/**
 * Safety-net delay in milliseconds (5 minutes). A page handed out by
 * `acquirePage()` is closed automatically after this long if the caller has
 * not released it.
 */
const ACQUIRE_SAFETY_TIMEOUT_MS = 5 * 60_000;
// ---------------------------------------------------------------------------
// BrowserManager
// ---------------------------------------------------------------------------
/**
 * Owns the Chromium process and one long-lived `BrowserContext` per platform.
 *
 * - `withPage()` runs short operations serialised per platform, with
 *   back-pressure and a hard timeout.
 * - `acquirePage()` hands out a manually managed page (not queued).
 * - `saveCookies()`, `drain()` and `close()` support persistence and shutdown.
 */
export class BrowserManager {
  // -- State ----------------------------------------------------------------

  /** Live browser instance, or `null` when nothing is launched/connected. */
  private browser: Browser | null = null;

  /** Ready-to-use contexts keyed by platform identifier. */
  private contexts = new Map<string, BrowserContext>();

  /**
   * Context creations that are still in flight, keyed by platform. Sharing
   * the promise stops two concurrent callers (e.g. `withPage` and
   * `acquirePage`) from each creating a context for the same platform, which
   * would orphan one of them.
   */
  private pendingContexts = new Map<string, Promise<BrowserContext>>();

  /** Tail of each platform's serialised promise chain. */
  private queues = new Map<string, Promise<void>>();

  /** Number of queued + running `withPage` tasks per platform. */
  private queueDepths = new Map<string, number>();

  /**
   * Mutex-style promise that prevents concurrent browser launches.
   * While a launch is in-flight every caller awaits the same promise.
   */
  private launchPromise: Promise<Browser> | null = null;

  /** True while `close()` is tearing things down on purpose. */
  private closing = false;

  // -- Public API -----------------------------------------------------------

  /**
   * Execute `fn` on a fresh page inside the platform-specific context.
   *
   * Operations are serialised per-platform through a promise chain so that
   * at most one `withPage` page is active per platform at any time.
   * Back-pressure is enforced via `config.maxQueueDepth`.
   *
   * @param platform  - Platform identifier (e.g. "twitter", "xiaohongshu").
   * @param fn        - Async callback that receives the page.
   * @param timeoutMs - Optional timeout override (defaults to the
   *                    `operationTimeouts.default` value from config, then
   *                    60 seconds).
   * @returns The value returned by `fn`.
   * @throws Error when the platform queue is full, the browser disconnected
   *         while the task was queued, or the operation timed out; also
   *         rethrows whatever `fn` throws.
   */
  async withPage<T>(
    platform: string,
    fn: (page: Page) => Promise<T>,
    timeoutMs?: number,
  ): Promise<T> {
    // -- Back-pressure check ------------------------------------------------
    const currentDepth = this.queueDepths.get(platform) ?? 0;
    if (currentDepth >= config.maxQueueDepth) {
      throw new Error(
        `Queue full for platform "${platform}" (depth=${currentDepth}, ` +
          `max=${config.maxQueueDepth}). Try again later.`,
      );
    }
    this.queueDepths.set(platform, currentDepth + 1);

    // -- Resolve effective timeout ------------------------------------------
    const effectiveTimeout =
      timeoutMs ?? config.operationTimeouts['default'] ?? 60_000;

    // -- Build the task and chain onto the per-platform queue ---------------
    const previous = this.queues.get(platform) ?? Promise.resolve();
    const task: Promise<T> = previous.then(async () => {
      const browser = await this.ensureBrowser();
      // The browser may have disconnected while this task was queued.
      if (!browser.isConnected()) {
        throw new Error('Browser disconnected while waiting in queue');
      }

      const ctx = await this.getContext(platform);
      const page = await ctx.newPage();
      page.setDefaultTimeout(effectiveTimeout);
      page.setDefaultNavigationTimeout(effectiveTimeout);

      // Race the user function against a hard timeout. The timer is
      // cancelled in `finally` so finished operations do not leave
      // multi-minute timers pending.
      const timeout = this.createTimeout<T>(effectiveTimeout, platform);
      try {
        return await Promise.race<T>([fn(page), timeout.promise]);
      } finally {
        timeout.cancel();
        await page.close().catch((err: unknown) => {
          logger.warn({ err, platform }, 'Failed to close page');
        });
      }
    });

    // Swallow errors so the promise chain continues for the next caller.
    // The actual rejection is still returned to **this** caller via `task`.
    const chainContinuation = task.then(
      () => {},
      () => {},
    );

    // Decrement queue depth when this task settles, regardless of outcome.
    void chainContinuation.finally(() => {
      const depth = this.queueDepths.get(platform) ?? 1;
      if (depth <= 1) {
        this.queueDepths.delete(platform);
      } else {
        this.queueDepths.set(platform, depth - 1);
      }
    });

    this.queues.set(platform, chainContinuation);
    return task;
  }

  /**
   * Acquire a page that the caller manages manually (e.g. for interactive
   * login flows). The caller **must** call `release()` when finished.
   *
   * Unlike `withPage()`, this does not go through the per-platform queue.
   * A safety-net timer auto-releases the page after
   * `ACQUIRE_SAFETY_TIMEOUT_MS` to prevent resource leaks if the caller
   * forgets.
   *
   * @param platform - Platform identifier.
   * @returns Object with `page` and an idempotent `release` function.
   */
  async acquirePage(
    platform: string,
  ): Promise<{ page: Page; release: () => Promise<void> }> {
    await this.ensureBrowser();
    const ctx = await this.getContext(platform);
    const page = await ctx.newPage();

    let released = false;
    const release = async (): Promise<void> => {
      if (released) return;
      released = true;
      clearTimeout(safetyTimer);
      await page.close().catch((err: unknown) => {
        logger.warn({ err, platform }, 'Failed to close acquired page');
      });
    };

    const safetyTimer = setTimeout(() => {
      if (!released) {
        logger.warn(
          { platform },
          `acquirePage safety timeout: auto-releasing page after ${ACQUIRE_SAFETY_TIMEOUT_MS}ms`,
        );
        void release();
      }
    }, ACQUIRE_SAFETY_TIMEOUT_MS);
    // Prevent the timer from keeping the Node.js process alive.
    if (typeof safetyTimer === 'object' && 'unref' in safetyTimer) {
      safetyTimer.unref();
    }

    return { page, release };
  }

  /**
   * Save the current cookie / storage state of a platform's browser context
   * to disk via the CookieStore. Logs a warning and does nothing when the
   * platform has no context yet.
   *
   * @param platform - Platform identifier whose context should be persisted.
   */
  async saveCookies(platform: string): Promise<void> {
    const ctx = this.contexts.get(platform);
    if (!ctx) {
      logger.warn(
        { platform },
        'saveCookies called but no context exists for platform',
      );
      return;
    }
    const state = await ctx.storageState();
    // Playwright's storageState() return type is structurally compatible with
    // our CookieStore's StorageState interface.
    await cookieStore.save(platform, state);
    logger.debug({ platform }, 'Cookies saved');
  }

  /**
   * Wait for every platform queue that exists at call time to settle.
   * Useful during graceful shutdown so that running operations finish
   * before teardown. Tasks enqueued after the snapshot are not awaited.
   */
  async drain(): Promise<void> {
    const pending = [...this.queues.values()];
    if (pending.length === 0) return;
    logger.info(
      { queueCount: pending.length },
      'Draining browser operation queues',
    );
    await Promise.allSettled(pending);
    logger.info('All browser operation queues drained');
  }

  /**
   * Close all browser contexts and the browser itself, then reset internal
   * state. Safe to call multiple times.
   */
  async close(): Promise<void> {
    // Mark the teardown as intentional so the 'disconnected' handler does not
    // report it as a crash.
    this.closing = true;
    try {
      // Close every context individually so callers that need to flush
      // storageState can do so before calling close().
      const contextClosePromises = [...this.contexts.values()].map((ctx) =>
        ctx.close().catch((err: unknown) => {
          logger.warn({ err }, 'Error closing browser context during shutdown');
        }),
      );
      await Promise.all(contextClosePromises);

      if (this.browser) {
        await this.browser.close().catch((err: unknown) => {
          logger.warn({ err }, 'Error closing browser during shutdown');
        });
      }
    } finally {
      this.browser = null;
      this.contexts.clear();
      this.pendingContexts.clear();
      this.queues.clear();
      this.queueDepths.clear();
      this.launchPromise = null;
      this.closing = false;
    }
    logger.info('BrowserManager closed');
  }

  // -- Private helpers ------------------------------------------------------

  /**
   * Ensure the browser is launched and connected. Uses a launch mutex so
   * that concurrent callers share a single launch attempt instead of
   * spawning multiple browser processes.
   */
  private async ensureBrowser(): Promise<Browser> {
    if (this.browser?.isConnected()) {
      return this.browser;
    }
    // If another caller is already launching, piggy-back on that promise.
    if (this.launchPromise) {
      return this.launchPromise;
    }
    this.launchPromise = this.launchBrowser();
    try {
      return await this.launchPromise;
    } finally {
      // Cleared on success and failure alike so a failed launch can be retried.
      this.launchPromise = null;
    }
  }

  /**
   * Launch a Chromium instance via rebrowser-playwright and register the
   * crash-recovery handler.
   */
  private async launchBrowser(): Promise<Browser> {
    logger.info(
      { headless: config.headless, browserBin: config.browserBin ?? 'default' },
      'Launching browser',
    );
    const browser = await chromium.launch({
      headless: config.headless,
      ...(config.browserBin ? { executablePath: config.browserBin } : {}),
    });

    // React to disconnects. Only an unexpected disconnect of the *current*
    // browser resets state: an intentional close() cleans up on its own, and
    // a late event from an already-replaced browser must not wipe the state
    // of its successor.
    browser.on('disconnected', () => {
      if (this.closing || this.browser !== browser) {
        logger.debug('Browser disconnected (intentional close or stale instance)');
        return;
      }
      logger.error('Browser disconnected unexpectedly');
      this.browser = null;
      this.contexts.clear();
      // The next ensureBrowser() call sees `browser === null` and relaunches.
    });

    this.browser = browser;
    logger.info('Browser launched successfully');
    return browser;
  }

  /**
   * Get (or lazily create) a BrowserContext for the given platform.
   *
   * Concurrent callers for the same platform share one in-flight creation,
   * so exactly one context is created per platform.
   */
  private async getContext(platform: string): Promise<BrowserContext> {
    const existing = this.contexts.get(platform);
    if (existing) return existing;

    const inFlight = this.pendingContexts.get(platform);
    if (inFlight) return inFlight;

    // No await between the checks above and the `set` below, so the
    // check-then-create sequence cannot interleave with another caller.
    const creation: Promise<BrowserContext> = this.createContext(
      platform,
    ).finally(() => {
      // Only remove our own entry; close() may have replaced the map content.
      if (this.pendingContexts.get(platform) === creation) {
        this.pendingContexts.delete(platform);
      }
    });
    this.pendingContexts.set(platform, creation);
    return creation;
  }

  /**
   * Create a new context for `platform`, restoring cookies from the
   * CookieStore when available so that sessions survive process restarts.
   *
   * @throws Error when no browser is currently launched.
   */
  private async createContext(platform: string): Promise<BrowserContext> {
    // Capture the instance up front: `this.browser` can be nulled by the
    // 'disconnected' handler while we await the cookie load below.
    const browser = this.browser;
    if (!browser) {
      throw new Error('Cannot create context: browser is not launched');
    }

    // Attempt to restore a previous session's storage state from disk.
    let storageState:
      | Awaited<ReturnType<BrowserContext['storageState']>>
      | undefined;
    try {
      const loaded = await cookieStore.load(platform);
      if (loaded) {
        storageState = loaded;
        logger.debug(
          { platform, cookieCount: loaded.cookies.length },
          'Restoring saved cookies into new context',
        );
      }
    } catch (err: unknown) {
      // Cookie load failure should never prevent context creation.
      logger.warn(
        { err, platform },
        'Failed to load saved cookies -- creating fresh context',
      );
    }

    const ctx = await browser.newContext(storageState ? { storageState } : {});
    this.contexts.set(platform, ctx);
    logger.debug({ platform }, 'Browser context created');
    return ctx;
  }

  /**
   * Create the timeout arm for `Promise.race` inside `withPage`.
   *
   * @returns `promise`, which rejects after `ms` milliseconds, and `cancel`,
   *          which stops the timer once the race has been decided. The timer
   *          is also `unref()`'d so it cannot keep the Node.js event loop
   *          alive during graceful shutdown.
   */
  private createTimeout<T>(
    ms: number,
    platform: string,
  ): { promise: Promise<T>; cancel: () => void } {
    let timer: ReturnType<typeof setTimeout> | undefined;
    const promise = new Promise<T>((_resolve, reject) => {
      const handle = setTimeout(() => {
        reject(
          new Error(
            `Operation timed out after ${ms}ms for platform "${platform}"`,
          ),
        );
      }, ms);
      // Prevent the timeout from keeping the process alive during shutdown.
      if (typeof handle === 'object' && 'unref' in handle) {
        handle.unref();
      }
      timer = handle;
    });
    const cancel = (): void => {
      if (timer !== undefined) clearTimeout(timer);
    };
    return { promise, cancel };
  }
}
// ---------------------------------------------------------------------------
// Singleton export
// ---------------------------------------------------------------------------
/** Module-level singleton instance of BrowserManager shared by importers. */
export const browserManager = new BrowserManager();
+112
View File
@@ -0,0 +1,112 @@
import os from 'node:os';
import path from 'node:path';
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Read a string environment variable.
 *
 * @param key      - Name of the environment variable.
 * @param fallback - Value returned when the variable is not set at all.
 * @returns The variable's value (an empty string is returned as-is), or
 *          `fallback` when it is undefined.
 */
function envString(key: string, fallback: string): string {
  const value = process.env[key];
  return value === undefined ? fallback : value;
}
/**
 * Read an integer environment variable.
 *
 * The value must consist solely of an optional sign followed by decimal
 * digits (surrounding whitespace is ignored). `Number.parseInt` alone would
 * accept "12abc" as 12, "3.9" as 3 and "1e3" as 1, silently turning typos
 * into surprising configuration.
 *
 * @param key      - Name of the environment variable.
 * @param fallback - Value returned when the variable is unset or invalid.
 * @returns The parsed integer, or `fallback` (after logging to stderr) when
 *          the value is not a valid safe integer.
 */
function envInt(key: string, fallback: number): number {
  const raw = process.env[key];
  if (raw === undefined) return fallback;

  const trimmed = raw.trim();
  if (/^[+-]?\d+$/.test(trimmed)) {
    const parsed = Number(trimmed);
    if (Number.isSafeInteger(parsed)) return parsed;
  }

  // eslint-disable-next-line no-console
  console.error(`[config] Invalid integer for ${key}="${raw}", using default ${fallback}`);
  return fallback;
}
/**
 * Read a boolean environment variable.
 *
 * Accepts `true`/`1`/`yes` and `false`/`0`/`no`, case-insensitively and
 * ignoring surrounding whitespace. Any other value is reported on stderr
 * (the same way `envInt` reports bad integers) instead of being ignored
 * silently, then `fallback` is used.
 *
 * @param key      - Name of the environment variable.
 * @param fallback - Value returned when the variable is unset or unrecognised.
 */
function envBool(key: string, fallback: boolean): boolean {
  const raw = process.env[key];
  if (raw === undefined) return fallback;

  const normalized = raw.trim().toLowerCase();
  if (['true', '1', 'yes'].includes(normalized)) return true;
  if (['false', '0', 'no'].includes(normalized)) return false;

  // eslint-disable-next-line no-console
  console.error(`[config] Invalid boolean for ${key}="${raw}", using default ${fallback}`);
  return fallback;
}
// ---------------------------------------------------------------------------
// HOST safety check — must run before exporting config
// ---------------------------------------------------------------------------
// Bind address, defaulting to loopback. Binding to a wildcard address
// ("0.0.0.0" or "::") is refused unless ALLOW_REMOTE carries the exact
// opt-in phrase.
const host = envString('HOST', '127.0.0.1');
if (
  ['0.0.0.0', '::'].includes(host) &&
  process.env['ALLOW_REMOTE'] !== 'yes-i-understand-the-risk'
) {
  // Use console.error directly — the logger module depends on config,
  // so it is not available yet at this point.
  // eslint-disable-next-line no-console
  console.error(
    [
      `[FATAL] HOST is set to "${host}" which exposes the service to the network.`,
      `If you really intend to do this, set ALLOW_REMOTE=yes-i-understand-the-risk`,
      `Refusing to start.`,
    ].join('\n'),
  );
  process.exit(1);
}
// ---------------------------------------------------------------------------
// Operation timeouts (milliseconds)
// Matches the tiers described in PLAN.md section 6.1
// ---------------------------------------------------------------------------
// Per-operation timeouts in milliseconds, written as seconds/minutes × 1000
// so each tier is readable at a glance. `default` is the fallback entry.
const operationTimeouts: Record<string, number> = {
  // Quick interactions
  like: 15 * 1_000,
  favorite: 15 * 1_000,
  comment: 20 * 1_000,
  reply: 20 * 1_000,
  // Page load + extraction
  feed_list: 30 * 1_000,
  search: 30 * 1_000,
  // Includes scroll loading
  feed_detail: 60 * 1_000,
  user_profile: 60 * 1_000,
  // Slow uploads / human interaction
  publish: 5 * 60 * 1_000,
  login: 5 * 60 * 1_000,
  // Fallback
  default: 60 * 1_000,
};
// ---------------------------------------------------------------------------
// Config type
// ---------------------------------------------------------------------------
/** Shape of the application configuration singleton exported as `config`. */
export interface AppConfig {
/** HTTP port the server listens on. */
port: number;
/** HTTP bind address. */
host: string;
/** Run the browser in headless mode. */
headless: boolean;
/** Custom browser executable path; `undefined` when not configured. */
browserBin: string | undefined;
/** Pino log level. */
logLevel: string;
/** Value of NODE_ENV. */
nodeEnv: string;
/** Directory for per-platform cookie storage. */
cookieDir: string;
/** Maximum pending operations per platform queue. */
maxQueueDepth: number;
/** Per-operation-type timeout in milliseconds, keyed by operation name. */
operationTimeouts: Record<string, number>;
}
// ---------------------------------------------------------------------------
// Exported config singleton
// ---------------------------------------------------------------------------
/**
 * Application configuration, resolved once at module load from environment
 * variables with the defaults shown below.
 */
export const config: AppConfig = {
port: envInt('PORT', 3000),
// Already validated by the HOST safety check earlier in this module.
host,
headless: envBool('HEADLESS', true),
// `||` (not `??`) so that an empty BROWSER_BIN is treated as "not set".
browserBin: process.env['BROWSER_BIN'] || undefined,
logLevel: envString('LOG_LEVEL', 'info'),
nodeEnv: envString('NODE_ENV', 'development'),
// Defaults to ~/.social-mcp in the current user's home directory.
cookieDir: envString('COOKIE_DIR', path.join(os.homedir(), '.social-mcp')),
maxQueueDepth: envInt('MAX_QUEUE_DEPTH', 10),
operationTimeouts,
};
+171
View File
@@ -0,0 +1,171 @@
import fs from 'node:fs/promises';
import path from 'node:path';
import { config } from '../config/index.js';
import { logger } from '../utils/logger.js';
// ---------------------------------------------------------------------------
// Types — mirrors Playwright's BrowserContext.storageState() shape
// ---------------------------------------------------------------------------
/**
 * A single browser cookie as persisted on disk.
 * Field set mirrors the cookie entries of Playwright's storage state.
 */
export interface Cookie {
name: string;
value: string;
domain: string;
path: string;
// NOTE(review): presumably a Unix timestamp in seconds, as in Playwright — confirm.
expires: number;
httpOnly: boolean;
secure: boolean;
sameSite: 'Strict' | 'Lax' | 'None';
}
/**
 * Persisted browser session: all cookies plus per-origin localStorage
 * entries.
 */
export interface StorageState {
cookies: Cookie[];
// One entry per origin, each carrying that origin's localStorage pairs.
origins: Array<{
origin: string;
localStorage: Array<{ name: string; value: string }>;
}>;
}
// ---------------------------------------------------------------------------
// CookieStore
// ---------------------------------------------------------------------------
// Child logger that tags every entry from this module with `module: 'cookie-store'`.
const log = logger.child({ module: 'cookie-store' });
/**
 * Persists per-platform browser storage state as JSON files under
 * `config.cookieDir/<platform>/cookies.json`.
 */
export class CookieStore {
  /**
   * Return the path to the cookies.json for a given platform
   * (`config.cookieDir` joined with the platform name).
   */
  getPath(platform: string): string {
    return path.join(config.cookieDir, platform, 'cookies.json');
  }

  /**
   * Load the persisted storage state for a platform.
   *
   * @returns The stored state, or `null` when no cookie file exists yet or
   *          the file does not have the expected top-level structure.
   * @throws Any read or JSON parse error other than "file not found".
   */
  async load(platform: string): Promise<StorageState | null> {
    const filePath = this.getPath(platform);
    try {
      const raw = await fs.readFile(filePath, 'utf-8');
      const parsed: unknown = JSON.parse(raw);
      // Minimal structural validation so we don't blindly trust disk data.
      if (!isStorageState(parsed)) {
        log.warn({ platform, filePath }, 'Cookie file failed validation, treating as absent');
        return null;
      }
      log.debug({ platform, cookieCount: parsed.cookies.length }, 'Loaded cookies from disk');
      return parsed;
    } catch (err: unknown) {
      if (isNodeError(err) && err.code === 'ENOENT') {
        log.debug({ platform }, 'No cookie file found');
        return null;
      }
      log.error({ err, platform, filePath }, 'Failed to load cookie file');
      throw err;
    }
  }

  /**
   * Persist a storage state for a platform using an atomic write.
   *
   * Strategy:
   * 1. Ensure the platform directory exists (mode 0o700 when created).
   * 2. Write to a uniquely named temporary file inside the same directory.
   * 3. Explicitly set the file permissions to 0o600.
   * 4. Atomically rename the temp file to the final path.
   *
   * The temp name includes the pid, a timestamp and a random suffix. A name
   * based on the pid alone is shared by concurrent `save()` calls in the
   * same process: their writes would interleave, one rename would fail with
   * ENOENT, and the error cleanup could unlink the other call's file.
   *
   * Because rename is atomic on the same filesystem, readers will never
   * observe a partially-written cookies.json.
   *
   * @throws The underlying filesystem error when any step fails.
   */
  async save(platform: string, state: StorageState): Promise<void> {
    const filePath = this.getPath(platform);
    const dir = path.dirname(filePath);
    const uniqueSuffix = `${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2)}`;
    const tmpPath = path.join(dir, `.tmp.${uniqueSuffix}`);
    try {
      // Ensure directory exists with restricted permissions.
      await fs.mkdir(dir, { recursive: true, mode: 0o700 });
      const json = JSON.stringify(state, null, 2);
      // Write to temp file, pin its permissions, then atomically rename.
      await fs.writeFile(tmpPath, json, { encoding: 'utf-8', mode: 0o600 });
      await fs.chmod(tmpPath, 0o600);
      await fs.rename(tmpPath, filePath);
      log.debug(
        { platform, cookieCount: state.cookies.length },
        'Saved cookies to disk',
      );
    } catch (err: unknown) {
      log.error({ err, platform, filePath }, 'Failed to save cookie file');
      // Best-effort cleanup of the temp file.
      try {
        await fs.unlink(tmpPath);
      } catch {
        // Ignore — the temp file may not have been created.
      }
      throw err;
    }
  }

  /**
   * Delete the cookie file for a platform.
   * Silently succeeds when no file exists.
   *
   * @throws Any unlink error other than "file not found".
   */
  async delete(platform: string): Promise<void> {
    const filePath = this.getPath(platform);
    try {
      await fs.unlink(filePath);
      log.debug({ platform }, 'Deleted cookie file');
    } catch (err: unknown) {
      if (isNodeError(err) && err.code === 'ENOENT') {
        log.debug({ platform }, 'Cookie file already absent, nothing to delete');
        return;
      }
      log.error({ err, platform, filePath }, 'Failed to delete cookie file');
      throw err;
    }
  }
}
// ---------------------------------------------------------------------------
// Singleton
// ---------------------------------------------------------------------------
/** Module-level singleton instance of CookieStore shared by importers. */
export const cookieStore = new CookieStore();
// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------
/** An Error that may carry a string `code` property (e.g. 'ENOENT'). */
interface NodeError extends Error {
code?: string;
}
/**
 * Type guard that narrows an unknown caught value to `NodeError`.
 * It only checks `instanceof Error`; `code` stays optional, so callers must
 * still compare `err.code` themselves.
 */
function isNodeError(err: unknown): err is NodeError {
return err instanceof Error;
}
/**
* Lightweight runtime check that the parsed JSON matches the StorageState
* shape we expect. This is intentionally lenient — we only verify the
* top-level structure so that forward-compatible fields are not rejected.
*/
/**
 * Runtime type guard for parsed cookie-file JSON.
 *
 * Deliberately shallow: the value must be a non-null object whose `cookies`
 * and `origins` properties are both arrays. Individual entries and any
 * additional fields are not inspected.
 */
function isStorageState(value: unknown): value is StorageState {
  if (value === null || typeof value !== 'object') return false;
  const candidate = value as Record<string, unknown>;
  return ['cookies', 'origins'].every((key) => Array.isArray(candidate[key]));
}
+89
View File
@@ -0,0 +1,89 @@
import { logger } from './utils/logger.js';
import { browserManager } from './browser/manager.js';
import { AppServer } from './server/app.js';
import { xiaohongshuPlugin } from './platforms/xiaohongshu/index.js';
// ---------------------------------------------------------------------------
// Bootstrap
// ---------------------------------------------------------------------------
// Application server instance; also used by gracefulShutdown() below.
const appServer = new AppServer();
// -- Platform plugins -------------------------------------------------------
// Plugins are registered before start() is called.
appServer.registerPlugin(xiaohongshuPlugin);
// -- Start ------------------------------------------------------------------
// A startup failure is fatal: log it and exit with a non-zero status.
appServer.start().catch((err: unknown) => {
logger.fatal({ err }, 'Failed to start server');
process.exit(1);
});
// ---------------------------------------------------------------------------
// Graceful shutdown
// ---------------------------------------------------------------------------
/** Set once shutdown has begun so repeated signals/errors are ignored. */
let shuttingDown = false;

/**
 * Shut the process down in an orderly fashion: drain browser queues, close
 * the browser, close the HTTP server, flush logs, exit.
 *
 * The exit status is 0 for ordinary shutdown requests and 1 when the
 * shutdown was triggered by `uncaughtException` or `unhandledRejection`, so
 * supervisors can tell a crash from a clean stop.
 *
 * @param signal - Name of the signal or event that triggered the shutdown.
 */
async function gracefulShutdown(signal: string): Promise<void> {
  if (shuttingDown) return;
  shuttingDown = true;

  const triggeredByError =
    signal === 'unhandledRejection' || signal === 'uncaughtException';
  const exitCode = triggeredByError ? 1 : 0;

  logger.info({ signal }, 'Received shutdown signal — starting graceful shutdown');

  // Safety net: if graceful shutdown takes too long, force exit.
  const forceExitTimer = setTimeout(() => {
    logger.fatal('Graceful shutdown timed out after 45s — forcing exit');
    process.exit(1);
  }, 45_000);
  // Prevent the safety-net timer from keeping the process alive on its own.
  if (typeof forceExitTimer === 'object' && 'unref' in forceExitTimer) {
    forceExitTimer.unref();
  }

  try {
    // Step 1: Drain browser queues so in-flight operations finish (max 30s).
    logger.info('Shutdown step 1/5: draining browser queues');
    await Promise.race([
      browserManager.drain(),
      new Promise<void>((resolve) => setTimeout(resolve, 30_000).unref()),
    ]);

    // Step 2: Close the browser and all contexts.
    logger.info('Shutdown step 2/5: closing browser');
    await browserManager.close();

    // Step 3: Close the HTTP server (stop accepting new connections).
    logger.info('Shutdown step 3/5: closing HTTP server');
    await appServer.close();

    // Steps 4 and 5: write the final messages first, then flush, so the
    // last log line is not emitted after the flush.
    logger.info('Shutdown step 4/5: flushing logger');
    logger.info('Shutdown step 5/5: exiting');
    logger.flush();
    process.exit(exitCode);
  } catch (err: unknown) {
    logger.fatal({ err }, 'Error during graceful shutdown');
    process.exit(1);
  }
}
// Termination signals start a graceful shutdown.
for (const signal of ['SIGINT', 'SIGTERM'] as const) {
  process.on(signal, () => void gracefulShutdown(signal));
}

// ---------------------------------------------------------------------------
// Global error handlers
// ---------------------------------------------------------------------------
// Unhandled failures are logged at fatal level, then the process shuts down.
process.on('unhandledRejection', (reason: unknown) => {
  logger.fatal({ err: reason }, 'Unhandled promise rejection');
  void gracefulShutdown('unhandledRejection');
});

process.on('uncaughtException', (err: Error) => {
  logger.fatal({ err }, 'Uncaught exception');
  void gracefulShutdown('uncaughtException');
});
+322
View File
@@ -0,0 +1,322 @@
import type { Page } from 'rebrowser-playwright';
import { logger } from '../../utils/logger.js';
import { XHS_SELECTORS } from './selectors.js';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Base URL of a note's detail page; the feed ID is appended as a path segment. */
const FEED_DETAIL_URL = 'https://www.xiaohongshu.com/explore';
/** Wait (ms) after typing comment text before submitting. */
const TYPE_SETTLE_MS = 500;
/** Wait (ms) after the submit click before verifying success. */
const SUBMIT_SETTLE_MS = 2_000;
// Selector groups for the comment widgets and the note detail page.
const selComment = XHS_SELECTORS.comment;
const selDetail = XHS_SELECTORS.feedDetail;
// Child logger that tags every entry from this module with `module: 'xhs-comment'`.
const log = logger.child({ module: 'xhs-comment' });
// ---------------------------------------------------------------------------
// postComment
// ---------------------------------------------------------------------------
/**
* Post a top-level comment on a Xiaohongshu note.
*
* @param page - Playwright Page managed by BrowserManager.
* @param feedId - The note / feed ID to comment on.
* @param xsecToken - Security token for accessing the feed page.
* @param content - Comment text to post.
* @returns Object indicating whether the comment was posted successfully.
*/
/**
 * Post a top-level comment on a Xiaohongshu note.
 *
 * @param page      - Playwright Page managed by BrowserManager.
 * @param feedId    - The note / feed ID to comment on.
 * @param xsecToken - Security token for accessing the feed page.
 * @param content   - Comment text to post. Blank content is rejected.
 * @returns `{ success: true }` when the start of the comment text is found
 *          on the page after submitting, `{ success: false }` otherwise.
 */
export async function postComment(
  page: Page,
  feedId: string,
  xsecToken: string,
  content: string,
): Promise<{ success: boolean }> {
  log.info({ feedId }, 'Posting comment on note');

  // Nothing to post. Without this guard the verification below would look
  // for an empty string, which always "matches", and report a false success.
  const snippet = content.trim().slice(0, 20);
  if (snippet.length === 0) {
    log.warn({ feedId }, 'Refusing to post an empty comment');
    return { success: false };
  }

  // -------------------------------------------------------------------------
  // 1. Navigate to the feed detail page
  // -------------------------------------------------------------------------
  const feedUrl = buildFeedUrl(feedId, xsecToken);
  await page.goto(feedUrl, { waitUntil: 'domcontentloaded' });
  // Wait for the note container to be visible.
  await page.waitForSelector(selDetail.noteContainer, { timeout: 10_000 });
  await page.waitForTimeout(1_000);

  // -------------------------------------------------------------------------
  // 2. Find and focus the comment input
  // -------------------------------------------------------------------------
  const commentInput = await findCommentInput(page);
  if (!commentInput) {
    log.warn('Comment input not found on feed detail page');
    return { success: false };
  }

  // -------------------------------------------------------------------------
  // 3. Type the comment content
  // -------------------------------------------------------------------------
  await commentInput.click();
  await page.waitForTimeout(300);
  await page.keyboard.type(content, { delay: 30 });
  await page.waitForTimeout(TYPE_SETTLE_MS);

  // -------------------------------------------------------------------------
  // 4. Submit the comment
  // -------------------------------------------------------------------------
  const submitted = await submitComment(page);
  if (!submitted) {
    log.warn('Failed to submit comment — submit button not found or click failed');
    return { success: false };
  }

  // -------------------------------------------------------------------------
  // 5. Verify the comment was posted
  // -------------------------------------------------------------------------
  await page.waitForTimeout(SUBMIT_SETTLE_MS);
  // Look for the start of the comment in the serialized HTML first. HTML
  // serialization escapes `&`, `<` and `>`, so also check the unescaped text
  // content before declaring failure.
  let success = (await page.content()).includes(snippet);
  if (!success) {
    const bodyText = await page
      .locator('body')
      .textContent()
      .catch(() => null);
    success = bodyText?.includes(snippet) ?? false;
  }

  log.info({ feedId, success }, 'Comment post complete');
  return { success };
}
// ---------------------------------------------------------------------------
// replyComment
// ---------------------------------------------------------------------------
/**
* Reply to an existing comment on a Xiaohongshu note.
*
* @param page - Playwright Page managed by BrowserManager.
* @param feedId - The note / feed ID.
* @param xsecToken - Security token for accessing the feed page.
* @param content - Reply text to post.
* @param commentId - Optional ID of the comment to reply to (for targeting).
* @param userId - Optional user ID of the comment author (for @ mention).
* @returns Object indicating whether the reply was posted successfully.
*/
/**
 * Reply to an existing comment on a Xiaohongshu note.
 *
 * Target selection:
 * - With `commentId`: reply to that comment. If it cannot be located, the
 *   reply is NOT redirected to some other comment; it falls back to the main
 *   comment input (prefixed with `@userId` when available).
 * - Without `commentId`: reply to the first comment on the page, falling
 *   back to the main comment input when no reply button is available.
 *
 * @param page      - Playwright Page managed by BrowserManager.
 * @param feedId    - The note / feed ID.
 * @param xsecToken - Security token for accessing the feed page.
 * @param content   - Reply text to post. Blank content is rejected.
 * @param commentId - Optional ID of the comment to reply to (for targeting).
 * @param userId    - Optional user ID of the comment author (for @ mention).
 * @returns `{ success: true }` when the start of the reply text is found on
 *          the page after submitting, `{ success: false }` otherwise.
 */
export async function replyComment(
  page: Page,
  feedId: string,
  xsecToken: string,
  content: string,
  commentId?: string,
  userId?: string,
): Promise<{ success: boolean }> {
  log.info({ feedId, commentId, userId }, 'Replying to comment on note');

  // Nothing to post. Without this guard the verification below would look
  // for an empty string, which always "matches", and report a false success.
  const snippet = content.trim().slice(0, 20);
  if (snippet.length === 0) {
    log.warn({ feedId, commentId }, 'Refusing to post an empty reply');
    return { success: false };
  }

  // -------------------------------------------------------------------------
  // 1. Navigate to the feed detail page
  // -------------------------------------------------------------------------
  const feedUrl = buildFeedUrl(feedId, xsecToken);
  await page.goto(feedUrl, { waitUntil: 'domcontentloaded' });
  await page.waitForSelector(selDetail.noteContainer, { timeout: 10_000 });
  await page.waitForTimeout(1_000);

  // -------------------------------------------------------------------------
  // 2. Find the target comment and click its reply button
  // -------------------------------------------------------------------------
  let replyInputFound = false;
  if (commentId) {
    // Try to find the comment by its ID attribute and click its reply button.
    replyInputFound = await clickReplyOnComment(page, commentId);
    if (!replyInputFound) {
      // Replying to whichever comment happens to be first would address the
      // wrong person, so do not fall back to another comment here.
      log.warn(
        { feedId, commentId },
        'Target comment not found; will not reply to a different comment',
      );
    }
  } else {
    // No specific target requested: use the first comment's reply button.
    const commentItems = await page.$$(selComment.commentItem);
    const firstComment = commentItems[0];
    if (firstComment) {
      // Hover to reveal the reply button (some UIs show it on hover).
      await firstComment.hover();
      await page.waitForTimeout(300);
      const replyBtn = await firstComment.$(selComment.commentReplyButton);
      if (replyBtn) {
        await replyBtn.click();
        await page.waitForTimeout(500);
        replyInputFound = true;
      }
    }
  }

  // If we still haven't activated a reply input, fall back to the main
  // comment input and prefix with @userId if available.
  if (!replyInputFound) {
    log.debug('Reply button not found, falling back to main comment input');
  }

  // -------------------------------------------------------------------------
  // 3. Type the reply content
  // -------------------------------------------------------------------------
  // Try to find the reply-specific input first, then fall back to the
  // main comment input.
  const replyInput =
    (await page.$(selComment.replyInput)) ?? (await findCommentInput(page));
  if (!replyInput) {
    log.warn('Reply input not found');
    return { success: false };
  }
  await replyInput.click();
  await page.waitForTimeout(300);

  // If we're using the main comment input as fallback, prefix with @user.
  const textToType =
    !replyInputFound && userId ? `@${userId} ${content}` : content;
  await page.keyboard.type(textToType, { delay: 30 });
  await page.waitForTimeout(TYPE_SETTLE_MS);

  // -------------------------------------------------------------------------
  // 4. Submit the reply
  // -------------------------------------------------------------------------
  const submitted = await submitComment(page);
  if (!submitted) {
    log.warn('Failed to submit reply');
    return { success: false };
  }

  // -------------------------------------------------------------------------
  // 5. Verify
  // -------------------------------------------------------------------------
  await page.waitForTimeout(SUBMIT_SETTLE_MS);
  // Look for the start of the reply in the serialized HTML first. HTML
  // serialization escapes `&`, `<` and `>`, so also check the unescaped text
  // content before declaring failure.
  let success = (await page.content()).includes(snippet);
  if (!success) {
    const bodyText = await page
      .locator('body')
      .textContent()
      .catch(() => null);
    success = bodyText?.includes(snippet) ?? false;
  }

  log.info({ feedId, commentId, success }, 'Reply post complete');
  return { success };
}
// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------
/**
* Build the URL for a feed detail page.
*/
/**
 * Build the URL for a feed detail page.
 *
 * Both dynamic parts are percent-encoded: `feedId` becomes a single path
 * segment (so characters like `/`, `?` or `#` cannot change the path or
 * inject query parameters) and `xsecToken` becomes a query value.
 */
function buildFeedUrl(feedId: string, xsecToken: string): string {
  const noteSegment = encodeURIComponent(feedId);
  const token = encodeURIComponent(xsecToken);
  return `${FEED_DETAIL_URL}/${noteSegment}?xsec_token=${token}&xsec_source=pc_search`;
}
/**
* Find the main comment input element. Tries the primary selector first,
* then the alternative contenteditable selector.
*/
/**
 * Locate the main comment input element.
 *
 * Lookup order: the primary input selector, then the alternative
 * contenteditable selector. If neither matches, click the comment area
 * (some UIs only render the input after a click), wait briefly and try both
 * selectors once more.
 *
 * @returns The input's element handle, or `null` when none could be found.
 */
async function findCommentInput(page: Page) {
  // Primary selector first, alternative second.
  const queryInput = async () =>
    (await page.$(selComment.commentInput)) ??
    (await page.$(selComment.commentInputAlt));

  const immediate = await queryInput();
  if (immediate) return immediate;

  // Last resort: click in the comment zone to activate the input.
  const commentArea = await page.$('.comment-area, .comments-container');
  if (!commentArea) return null;

  await commentArea.click();
  await page.waitForTimeout(500);
  return queryInput();
}
/**
* Find a specific comment by its ID and click its reply button.
*/
/**
 * Find a specific comment by its ID and click its reply button.
 *
 * The ID is escaped before being embedded in the CSS attribute selectors, so
 * a value containing quotes or backslashes cannot break or alter the
 * selector.
 *
 * @returns `true` when the reply button was clicked, `false` when either the
 *          comment or its reply button could not be found.
 */
async function clickReplyOnComment(
  page: Page,
  commentId: string,
): Promise<boolean> {
  // Escape characters that would terminate or corrupt a double-quoted CSS
  // string: backslash, double quote, and line breaks.
  const safeId = commentId
    .replace(/["\\]/g, '\\$&')
    .replace(/\r?\n/g, '\\a ');

  // Try to find comment by ID or data attribute.
  const commentEl =
    (await page.$(`[id="comment-${safeId}"]`)) ??
    (await page.$(`[data-comment-id="${safeId}"]`));
  if (!commentEl) {
    log.debug({ commentId }, 'Target comment element not found by ID');
    return false;
  }

  // Hover to reveal the reply button.
  await commentEl.hover();
  await page.waitForTimeout(300);

  const replyBtn = await commentEl.$(selComment.commentReplyButton);
  if (!replyBtn) {
    log.debug({ commentId }, 'Reply button not found on target comment');
    return false;
  }

  await replyBtn.click();
  await page.waitForTimeout(500);
  return true;
}
/**
 * Submit the comment that has been typed into the input.
 *
 * - No submit button on the page: press Control+Enter and report success.
 * - Button present but disabled: wait one second and look it up again; if it
 *   has disappeared, report failure. The re-queried button is clicked without
 *   a second disabled check.
 * - Otherwise: click the button.
 *
 * A failure of the `isDisabled()` probe itself is treated as "not disabled".
 *
 * @returns `true` after a click or keyboard submit was issued, `false` when
 *   the button vanished while waiting.
 */
async function submitComment(page: Page): Promise<boolean> {
  const initialBtn = await page.$(selComment.commentSubmitButton);
  if (!initialBtn) {
    // Some UIs submit on Ctrl+Enter / Cmd+Enter.
    log.debug('Submit button not found, trying keyboard shortcut');
    await page.keyboard.press('Control+Enter');
    return true;
  }

  let target = initialBtn;
  const disabled = await initialBtn.isDisabled().catch(() => false);
  if (disabled) {
    log.debug('Submit button is disabled, waiting briefly');
    await page.waitForTimeout(1_000);
    // The button may have been re-rendered in the meantime.
    const refreshed = await page.$(selComment.commentSubmitButton);
    if (!refreshed) return false;
    target = refreshed;
  }

  await target.click();
  return true;
}
+724
View File
@@ -0,0 +1,724 @@
import type { Page, ElementHandle } from 'rebrowser-playwright';
import { logger } from '../../utils/logger.js';
import { XHS_SELECTORS } from './selectors.js';
import { extractInitialState, parseCountString, ensureHttps } from './feeds.js';
import type { FeedDetail, Comment } from './types.js';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Base URL of note pages; getFeedDetail() appends `/<feedId>` and the query string. */
const FEED_DETAIL_BASE_URL = 'https://www.xiaohongshu.com/explore';
/** Maximum number of "show more" clicks to load comments. */
const MAX_LOAD_MORE_CLICKS = 20;
/** Delay (in milliseconds) between "show more" clicks to let the page render. */
const LOAD_MORE_DELAY_MS = 1500;
/** Selector table for the feed detail page. */
const SEL = XHS_SELECTORS.feedDetail;
/** Child logger tagged with this module's name. */
const log = logger.child({ module: 'xhs-feed-detail' });
// ---------------------------------------------------------------------------
// __INITIAL_STATE__ raw types for feed detail
// ---------------------------------------------------------------------------
/**
 * Partial shape of `__INITIAL_STATE__` on a note detail page. Only the paths
 * that parseDetailFromState() probes for note data are declared; any other
 * top-level key is covered by the index signature.
 */
interface RawDetailState {
// Probed first: noteData.data.noteData, then noteData.noteData.
noteData?: {
data?: {
noteData?: RawNoteData;
};
noteData?: RawNoteData;
};
// Probed next: note.noteDetailMap[feedId].note, then note.note, then note.noteData.
note?: {
noteDetailMap?: Record<string, { note?: RawNoteData }>;
note?: RawNoteData;
noteData?: RawNoteData;
};
[key: string]: unknown;
}
/**
 * Raw note object as found in page state. Most values are declared under both
 * a camelCase and a snake_case key; parseDetailFromState() reads the camelCase
 * key first and falls back to the snake_case one. Every field is optional.
 */
interface RawNoteData {
// Note ID: `noteId` is preferred over `id`.
noteId?: string;
id?: string;
title?: string;
// Body text: `desc` is preferred over `description`.
desc?: string;
description?: string;
// The note is classified as a video when this contains "video" (case-insensitive).
type?: string;
imageList?: RawNoteImage[];
image_list?: RawNoteImage[];
video?: RawNoteVideo;
tagList?: RawNoteTag[];
tag_list?: RawNoteTag[];
interactInfo?: RawNoteInteract;
interact_info?: RawNoteInteract;
// Creation time, read in the order time, createTime, create_time. Values below
// 1e12 are treated as seconds by the parser, larger ones as milliseconds.
time?: number;
createTime?: number;
create_time?: number;
// Last update time; same seconds/milliseconds handling as the creation time.
lastUpdateTime?: number;
last_update_time?: number;
ipLocation?: string;
ip_location?: string;
user?: RawNoteUser;
// When present, takes precedence over the token the caller supplied.
xsecToken?: string;
xsec_token?: string;
// Comments embedded in the state; may be absent.
comments?: RawCommentData[];
}
/**
 * Raw image entry of a note. parseDetailFromState() uses the first non-empty
 * value among url, urlPre, urlDefault, url_pre, url_default, and otherwise the
 * `url` of the first entry in infoList / info_list.
 */
interface RawNoteImage {
url?: string;
urlPre?: string;
urlDefault?: string;
url_pre?: string;
url_default?: string;
// List of image variants; only the first entry is consulted.
infoList?: Array<{ url?: string }>;
info_list?: Array<{ url?: string }>;
}
/**
 * Raw video object of a note. parseDetailFromState() looks for a playable URL
 * in `url`, then in the first `media.stream.h264` entry (masterUrl /
 * master_url), then in `media.video.url`.
 */
interface RawNoteVideo {
url?: string;
// Declared for completeness; not read by parseDetailFromState().
consumer?: {
originVideoKey?: string;
origin_video_key?: string;
};
media?: {
stream?: {
// Only the first entry of this array is consulted.
h264?: Array<{
masterUrl?: string;
master_url?: string;
}>;
};
video?: {
url?: string;
};
};
}
/** Raw tag entry of a note. parseDetailFromState() keeps only the `name`. */
interface RawNoteTag {
id?: string;
name?: string;
type?: string;
}
/**
 * Raw interaction counters of a note. The values are declared as strings and
 * are converted to numbers with parseCountString(); the camelCase key is read
 * before the snake_case one.
 */
interface RawNoteInteract {
likedCount?: string;
liked_count?: string;
collectedCount?: string;
collected_count?: string;
commentCount?: string;
comment_count?: string;
shareCount?: string;
share_count?: string;
}
/**
 * Raw author object of a note. Read order in parseDetailFromState():
 * id = userId, user_id; nickname = nickname, nick_name, nickName;
 * avatar = avatar, avatarUrl, avatar_url.
 */
interface RawNoteUser {
userId?: string;
user_id?: string;
nickname?: string;
nick_name?: string;
nickName?: string;
avatar?: string;
avatarUrl?: string;
avatar_url?: string;
}
/**
 * Raw comment object embedded in page state. The structure is recursive:
 * replies use the same shape via subComments / sub_comments.
 */
interface RawCommentData {
// A comment without an ID is discarded by parseRawComment().
id?: string;
// Author ID: the top-level keys are read before the ones inside `userInfo`.
userId?: string;
user_id?: string;
userInfo?: {
userId?: string;
user_id?: string;
nickname?: string;
nick_name?: string;
// Avatar URL: `image` is preferred over `avatar`.
image?: string;
avatar?: string;
};
content?: string;
// Strings go through parseCountString(); numbers are used as-is.
likeCount?: string | number;
like_count?: string | number;
// Values below 1e12 are treated as seconds by the parser, larger ones as milliseconds.
createTime?: number;
create_time?: number;
ipLocation?: string;
ip_location?: string;
subComments?: RawCommentData[];
sub_comments?: RawCommentData[];
// Declared for completeness; not read by parseRawComment().
subCommentCount?: number | string;
sub_comment_count?: number | string;
}
// ---------------------------------------------------------------------------
// getFeedDetail
// ---------------------------------------------------------------------------
/**
 * Open a Xiaohongshu note detail page and collect its data.
 *
 * Steps:
 *  1. Navigate to the note URL (token percent-encoded, `xsec_source=pc_feed`).
 *  2. Wait up to 15 s for the note container, title or description to appear;
 *     a timeout is logged and extraction continues anyway.
 *  3. Parse `__INITIAL_STATE__`; if that yields nothing, scrape the DOM.
 *  4. Scrape comments from the DOM when the detail has none or when
 *     `loadAllComments` is set. Scraped comments replace the existing ones
 *     only if at least one was found.
 *
 * @param page - A Playwright Page managed by BrowserManager.
 * @param feedId - The note (feed) ID.
 * @param xsecToken - Security token required to access the note.
 * @param loadAllComments - When true, "show more" is clicked repeatedly to
 *   load as many comments as possible.
 * @returns The assembled FeedDetail.
 */
export async function getFeedDetail(
  page: Page,
  feedId: string,
  xsecToken: string,
  loadAllComments = false,
): Promise<FeedDetail> {
  const encodedToken = encodeURIComponent(xsecToken);
  const url = `${FEED_DETAIL_BASE_URL}/${feedId}?xsec_token=${encodedToken}&xsec_source=pc_feed`;
  log.debug({ feedId, url }, 'Navigating to feed detail page');
  await page.goto(url, { waitUntil: 'domcontentloaded' });

  // Any of these elements appearing means the note has started rendering.
  const readySelector = `${SEL.noteContainer}, ${SEL.title}, ${SEL.description}`;
  try {
    await page.waitForSelector(readySelector, { timeout: 15_000 });
  } catch {
    log.warn({ feedId }, 'Note container not found within timeout, proceeding with extraction');
  }

  // Give client-side rendering a moment to settle.
  await page.waitForTimeout(1500);

  // Preferred source: the hydration state embedded in the page.
  const initialState = (await extractInitialState(page)) as RawDetailState | null;
  const fromState = initialState
    ? parseDetailFromState(initialState, feedId, xsecToken)
    : null;

  let detail: FeedDetail;
  if (fromState) {
    log.debug({ feedId }, 'Extracted feed detail from __INITIAL_STATE__');
    detail = fromState;
  } else {
    // Fallback source: the rendered DOM.
    log.debug({ feedId }, 'Falling back to DOM scraping for feed detail');
    detail = await scrapeDetailFromDom(page, feedId, xsecToken);
  }

  // The state may not carry comments, so they are read from the DOM on demand.
  if (detail.comments.length === 0 || loadAllComments) {
    const scraped = await scrapeComments(page, loadAllComments);
    if (scraped.length > 0) {
      detail.comments = scraped;
    }
  }

  const summary = {
    feedId,
    commentCount: detail.comments.length,
    imageCount: detail.images.length,
  };
  log.info(summary, 'Feed detail extraction complete');
  return detail;
}
// ---------------------------------------------------------------------------
// __INITIAL_STATE__ parsing
// ---------------------------------------------------------------------------
/**
 * Convert a raw timestamp from page state into an ISO-8601 string.
 *
 * Numbers below 1e12 are treated as Unix seconds and scaled to milliseconds;
 * larger numbers are used as milliseconds. Digit-only strings are handled like
 * numbers; any other string is handed to `Date` for parsing.
 *
 * @param raw - Timestamp as found in the state object. Typed loosely because
 *   the state is cast from unvalidated page data.
 * @returns The ISO string, or `''` when the value is missing, zero, or does
 *   not yield a valid date (`Date#toISOString` throws on invalid dates).
 */
function toIsoTimestamp(raw: unknown): string {
  let value: number | string;
  if (typeof raw === 'number') {
    value = raw;
  } else if (typeof raw === 'string') {
    const trimmed = raw.trim();
    value = /^\d+$/.test(trimmed) ? Number(trimmed) : trimmed;
  } else {
    return '';
  }
  if (!value) return '';

  const millis = typeof value === 'number' && value < 1e12 ? value * 1000 : value;
  const date = new Date(millis);
  return Number.isNaN(date.getTime()) ? '' : date.toISOString();
}

/**
 * Parse feed detail from `__INITIAL_STATE__`.
 *
 * Note data is looked up in these locations, first hit wins:
 *  1. `state.noteData.data.noteData`
 *  2. `state.noteData.noteData`
 *  3. `state.note.noteDetailMap[feedId].note`
 *  4. `state.note.note`, then `state.note.noteData`
 *
 * @param state - The page's initial state object.
 * @param feedId - Note ID; used for the map lookup and as the fallback ID.
 * @param xsecToken - Token used when the note data does not carry its own.
 * @returns The parsed detail, or `null` when no note data was found.
 */
function parseDetailFromState(
  state: RawDetailState,
  feedId: string,
  xsecToken: string,
): FeedDetail | null {
  const noteData: RawNoteData | undefined =
    state.noteData?.data?.noteData ??
    state.noteData?.noteData ??
    state.note?.noteDetailMap?.[feedId]?.note ??
    state.note?.note ??
    state.note?.noteData;
  if (!noteData) {
    return null;
  }

  const rawType = noteData.type ?? '';
  const type: 'normal' | 'video' =
    rawType.toLowerCase().includes('video') ? 'video' : 'normal';

  // Images: first non-empty direct URL field, else the first variant's URL.
  const images = (noteData.imageList ?? noteData.image_list ?? [])
    .map((img) => {
      const direct =
        img.url || img.urlPre || img.urlDefault || img.url_pre || img.url_default;
      const candidate = direct || (img.infoList ?? img.info_list)?.[0]?.url;
      return candidate ? ensureHttps(candidate) : '';
    })
    .filter((url) => url !== '');

  // Video URL: first non-empty candidate. `||` (not `??`) so that an empty
  // `url` does not hide a usable stream URL further down the chain.
  const video = noteData.video;
  const firstH264 = video?.media?.stream?.h264?.[0];
  const rawVideoUrl =
    video?.url ||
    firstH264?.masterUrl ||
    firstH264?.master_url ||
    video?.media?.video?.url;
  const videoUrl = rawVideoUrl ? ensureHttps(rawVideoUrl) : undefined;

  const tags = (noteData.tagList ?? noteData.tag_list ?? [])
    .map((tag) => tag.name ?? '')
    .filter((name) => name !== '');

  // Interaction stats; a missing counter is parsed from '0'.
  const interact = noteData.interactInfo ?? noteData.interact_info;
  const likeCount = parseCountString(
    interact?.likedCount ?? interact?.liked_count ?? '0',
  );
  const collectCount = parseCountString(
    interact?.collectedCount ?? interact?.collected_count ?? '0',
  );
  const commentCount = parseCountString(
    interact?.commentCount ?? interact?.comment_count ?? '0',
  );
  const shareCount = parseCountString(
    interact?.shareCount ?? interact?.share_count ?? '0',
  );

  const rawUser = noteData.user;
  const user = {
    id: rawUser?.userId ?? rawUser?.user_id ?? '',
    nickname: rawUser?.nickname ?? rawUser?.nick_name ?? rawUser?.nickName ?? '',
    avatar: rawUser?.avatar ?? rawUser?.avatarUrl ?? rawUser?.avatar_url ?? '',
  };

  // Comments embedded in the state (may be empty).
  const comments = (noteData.comments ?? [])
    .map((rawComment) => parseRawComment(rawComment))
    .filter((c): c is Comment => c !== null);

  return {
    id: noteData.noteId ?? noteData.id ?? feedId,
    xsecToken: noteData.xsecToken ?? noteData.xsec_token ?? xsecToken,
    title: noteData.title ?? '',
    description: noteData.desc ?? noteData.description ?? '',
    type,
    images,
    videoUrl,
    tags,
    likeCount,
    collectCount,
    commentCount,
    shareCount,
    createTime: toIsoTimestamp(
      noteData.time ?? noteData.createTime ?? noteData.create_time,
    ),
    lastUpdateTime: toIsoTimestamp(
      noteData.lastUpdateTime ?? noteData.last_update_time,
    ),
    ipLocation: noteData.ipLocation ?? noteData.ip_location ?? '',
    user,
    comments,
  };
}

/**
 * Parse a raw comment from `__INITIAL_STATE__` into a Comment object.
 *
 * Replies are parsed recursively with the same rules.
 *
 * @returns The parsed comment, or `null` when the raw entry has no ID.
 */
function parseRawComment(raw: RawCommentData): Comment | null {
  const id = raw.id ?? '';
  if (!id) return null;

  const userInfo = raw.userInfo;

  // The like count may arrive as a display string or as a plain number.
  const likeCountRaw = raw.likeCount ?? raw.like_count ?? 0;
  const likeCount =
    typeof likeCountRaw === 'string' ? parseCountString(likeCountRaw) : likeCountRaw;

  const subComments = (raw.subComments ?? raw.sub_comments ?? [])
    .map((reply) => parseRawComment(reply))
    .filter((c): c is Comment => c !== null);

  return {
    id,
    userId: raw.userId ?? raw.user_id ?? userInfo?.userId ?? userInfo?.user_id ?? '',
    nickname: userInfo?.nickname ?? userInfo?.nick_name ?? '',
    avatar: userInfo?.image ?? userInfo?.avatar ?? '',
    content: raw.content ?? '',
    likeCount,
    createTime: toIsoTimestamp(raw.createTime ?? raw.create_time),
    ipLocation: raw.ipLocation ?? raw.ip_location ?? '',
    subComments,
  };
}
// ---------------------------------------------------------------------------
// DOM scraping fallback — uses Playwright Node-side API exclusively
// ---------------------------------------------------------------------------
/**
 * Build a FeedDetail from the rendered page when the hydration state is not
 * usable. Only Playwright's Node-side `$eval` / `$$eval` calls are used.
 *
 * Every lookup is best-effort: when a selector matches nothing (or the
 * evaluation fails) the corresponding field falls back to `''`, `[]` or `0`.
 * `lastUpdateTime` is always `''` and `comments` is always empty here.
 *
 * @param page - Page already showing the note.
 * @param feedId - Used verbatim as the resulting `id`.
 * @param xsecToken - Passed through unchanged.
 */
async function scrapeDetailFromDom(
  page: Page,
  feedId: string,
  xsecToken: string,
): Promise<FeedDetail> {
  // Small readers; each resolves to '' when `$eval` rejects.
  const readText = (selector: string): Promise<string> =>
    page.$eval(selector, (el) => el.textContent?.trim() ?? '').catch(() => '');
  const readSrc = (selector: string): Promise<string> =>
    page.$eval(selector, (el) => el.getAttribute('src') ?? '').catch(() => '');

  const title = await readText(SEL.title);
  const description = await readText(SEL.description);

  // Images: the image list first, the hero image only when the list is empty.
  const listedImages: string[] = await page
    .$$eval(SEL.images, (imgs) =>
      imgs.map((img) => img.getAttribute('src') ?? '').filter(Boolean),
    )
    .catch(() => [] as string[]);
  let images = listedImages;
  if (listedImages.length === 0) {
    const heroSrc = await readSrc(SEL.heroImage);
    if (heroSrc) images = [heroSrc];
  }

  // Video: the <video> element's own src, otherwise its <source> child.
  const videoSrc = (await readSrc(SEL.video)) || (await readSrc(SEL.videoSource));
  const videoUrl: string | undefined = videoSrc || undefined;
  const type: 'normal' | 'video' = videoUrl ? 'video' : 'normal';

  // Tags are returned without a leading '#'.
  const tags: string[] = await page
    .$$eval(SEL.tags, (els) =>
      els
        .map((el) => el.textContent?.trim().replace(/^#/, '') ?? '')
        .filter(Boolean),
    )
    .catch(() => [] as string[]);

  const likeCount = await extractCount(page, SEL.likeCount);
  const collectCount = await extractCount(page, SEL.collectCount);
  const commentCount = await extractCount(page, SEL.commentCount);
  const shareCount = await extractCount(page, SEL.shareCount);

  // Time and location are kept as the text shown on the page.
  const createTime = await readText(SEL.createTime);
  const ipLocation = await readText(SEL.ipLocation);

  const nickname = await readText(SEL.authorName);
  const avatar = await readSrc(SEL.authorAvatar);

  // The author ID is the hex segment of the profile link.
  const authorLinkHref = await page
    .$eval(SEL.authorLink, (el) => el.getAttribute('href') ?? '')
    .catch(() => '');
  const authorId = authorLinkHref.match(/\/user\/profile\/([a-f0-9]+)/)?.[1] ?? '';

  return {
    id: feedId,
    xsecToken,
    title,
    description,
    type,
    images,
    videoUrl,
    tags,
    likeCount,
    collectCount,
    commentCount,
    shareCount,
    createTime,
    lastUpdateTime: '',
    ipLocation,
    user: { id: authorId, nickname, avatar },
    comments: [],
  };
}
// ---------------------------------------------------------------------------
// Comment scraping from DOM — uses Playwright Node-side API exclusively
// ---------------------------------------------------------------------------
/**
 * Read the comments currently rendered on a note detail page.
 *
 * The comment area is scrolled into view first (or the page is scrolled to
 * the bottom when the area is missing). With `loadAllComments` set, the
 * "show more" control is clicked until it is gone, hidden, or
 * MAX_LOAD_MORE_CLICKS is reached; click failures are ignored.
 *
 * @param page - The current Playwright page (already on the detail URL).
 * @param loadAllComments - Whether to expand the list before reading it.
 * @returns The comments that could be parsed; elements that fail to parse
 *   are skipped.
 */
async function scrapeComments(
  page: Page,
  loadAllComments: boolean,
): Promise<Comment[]> {
  // Passed as a string so the browser-side code needs no DOM typings here.
  await page.evaluate(`
(() => {
const commentsArea = document.querySelector('.comments-container');
if (commentsArea) {
commentsArea.scrollIntoView({ behavior: 'smooth' });
} else {
window.scrollTo(0, document.body.scrollHeight);
}
})()
`);
  await page.waitForTimeout(1500);

  if (loadAllComments) {
    let clicks: number;
    for (clicks = 0; clicks < MAX_LOAD_MORE_CLICKS; clicks++) {
      const moreBtn = await page.$(SEL.showMoreComments);
      if (!moreBtn) break;
      const visible = await moreBtn.isVisible().catch(() => false);
      if (!visible) break;
      await moreBtn.click().catch(() => {});
      await page.waitForTimeout(LOAD_MORE_DELAY_MS);
    }
    if (clicks > 0) {
      log.debug({ clicks }, 'Clicked "show more comments" button');
    }
  }

  // Parse whatever is rendered now; a failing element is simply left out.
  const parsed: Comment[] = [];
  for (const handle of await page.$$(SEL.commentItem)) {
    const comment = await parseCommentElement(handle).catch(() => null);
    if (comment) parsed.push(comment);
  }
  return parsed;
}
/**
 * Read the fields of one comment (or reply) element.
 *
 * Selector lookups are best-effort and fall back to `''` (`'0'` for the like
 * count). Reading the element's own attributes for the ID is not guarded, so
 * a failure there rejects the returned promise.
 *
 * @returns A Comment with an empty `subComments` list.
 */
async function readCommentFields(el: ElementHandle): Promise<Comment> {
  const readText = (selector: string): Promise<string> =>
    el.$eval(selector, (node) => node.textContent?.trim() ?? '').catch(() => '');

  const content = await readText(SEL.commentContent);
  const nickname = await readText(SEL.commentAuthor);
  const avatar = await el
    .$eval(SEL.commentAvatar, (node) => node.getAttribute('src') ?? '')
    .catch(() => '');
  const likeText = await el
    .$eval(SEL.commentLikeCount, (node) => node.textContent?.trim() ?? '0')
    .catch(() => '0');
  const createTime = await readText(SEL.commentTime);
  const ipLocation = await readText(SEL.commentIpLocation);

  // The comment ID is taken from whichever of these attributes is present.
  const id = await el.evaluate(
    (node) =>
      node.getAttribute('id') ??
      node.getAttribute('data-id') ??
      node.getAttribute('data-comment-id') ??
      '',
  );

  // The user ID is the hex segment of the first profile link inside the element.
  const authorHref = await el
    .$eval('a[href*="/user/profile/"]', (node) => node.getAttribute('href') ?? '')
    .catch(() => '');
  const userId = authorHref.match(/\/user\/profile\/([a-f0-9]+)/)?.[1] ?? '';

  return {
    id,
    userId,
    nickname,
    avatar,
    content,
    likeCount: parseCountString(likeText),
    createTime,
    ipLocation,
    subComments: [],
  };
}

/**
 * Parse a single comment element, including its direct replies, into a
 * Comment object using Playwright's Node-side API.
 *
 * Replies are read with the same field rules as the parent and always have an
 * empty `subComments` list. A reply that fails to parse is skipped; a failure
 * while reading the parent itself rejects the promise.
 *
 * @param commentEl - Handle of the comment's root element.
 * @returns The parsed comment. The `null` in the return type is kept for
 *   callers, but this implementation never returns it.
 */
async function parseCommentElement(
  commentEl: ElementHandle,
): Promise<Comment | null> {
  const comment = await readCommentFields(commentEl);

  const subComments: Comment[] = [];
  for (const subEl of await commentEl.$$(SEL.subCommentItem)) {
    try {
      subComments.push(await readCommentFields(subEl));
    } catch {
      // Skip replies that fail to parse.
      continue;
    }
  }

  return { ...comment, subComments };
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Read the text of the first element matching `selector` and convert it to a
 * number with parseCountString(). When the element cannot be read, the text
 * `'0'` is parsed instead.
 */
async function extractCount(page: Page, selector: string): Promise<number> {
  let rawText: string;
  try {
    rawText = await page.$eval(selector, (el) => el.textContent?.trim() ?? '0');
  } catch {
    rawText = '0';
  }
  return parseCountString(rawText);
}
+401
View File
@@ -0,0 +1,401 @@
import type { Page } from 'rebrowser-playwright';
import { logger } from '../../utils/logger.js';
import type { Feed } from './types.js';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** URL of the explore (home) page that listFeeds() navigates to. */
const EXPLORE_URL = 'https://www.xiaohongshu.com/explore';
/** Child logger tagged with this module's name. */
const log = logger.child({ module: 'xhs-feeds' });
// ---------------------------------------------------------------------------
// __INITIAL_STATE__ raw types (partial — only the fields we care about)
// ---------------------------------------------------------------------------
/**
 * Shape of a single feed item inside __INITIAL_STATE__.homeFeed.feeds.
 *
 * The data may be flat on the item or nested under `noteCard`;
 * parseRawFeedItem() reads the top-level keys first and falls back to the
 * card. Every field is optional.
 */
interface RawFeedItem {
// Feed ID, read in the order id, noteId, note_id.
id?: string;
noteId?: string;
note_id?: string;
xsecToken?: string;
xsec_token?: string;
// Title, read in the order displayTitle, display_title, title.
displayTitle?: string;
display_title?: string;
title?: string;
desc?: string;
description?: string;
// `type` is preferred over `model_type`; a value containing "video" marks a video feed.
type?: string;
noteCard?: RawNoteCard;
model_type?: string;
cover?: RawImage;
user?: RawUser;
interactInfo?: RawInteractInfo;
interact_info?: RawInteractInfo;
// Like count directly on the item; consulted only after interactInfo.
likedCount?: string;
liked_count?: string;
}
/**
 * Nested card object of a feed item. parseRawFeedItem() uses these fields only
 * as fallbacks for values missing on the item itself.
 */
interface RawNoteCard {
noteId?: string;
displayTitle?: string;
display_title?: string;
title?: string;
desc?: string;
type?: string;
cover?: RawImage;
user?: RawUser;
interactInfo?: RawInteractInfo;
interact_info?: RawInteractInfo;
xsecToken?: string;
xsec_token?: string;
}
/**
 * Raw cover image object. extractImageUrl() returns the first non-empty value
 * among url, urlPre, urlDefault, url_pre, url_default, and otherwise the `url`
 * of the first entry in infoList / info_list.
 */
interface RawImage {
url?: string;
urlPre?: string;
urlDefault?: string;
url_pre?: string;
url_default?: string;
// List of image variants; only the first entry is consulted.
infoList?: Array<{ url?: string }>;
info_list?: Array<{ url?: string }>;
}
/**
 * Raw author object of a feed item. Read order in parseRawFeedItem():
 * id = userId, user_id; nickname = nickname, nick_name, nickName;
 * avatar = avatar, avatarUrl, avatar_url.
 */
interface RawUser {
userId?: string;
user_id?: string;
nickname?: string;
nick_name?: string;
nickName?: string;
avatar?: string;
avatarUrl?: string;
avatar_url?: string;
}
/**
 * Raw like counter of a feed item under its known key spellings.
 * parseRawFeedItem() reads them in the order likedCount, liked_count,
 * likeCount, like_count and converts the value with parseCountString().
 */
interface RawInteractInfo {
likedCount?: string;
liked_count?: string;
likeCount?: string;
like_count?: string;
}
/**
 * Partial shape of the __INITIAL_STATE__ global variable.
 * Xiaohongshu places SSR data here for hydration.
 *
 * parseFeedsFromState() looks for the feed list under homeFeed, feed and
 * explore (in that order). The index signature lets it also scan any other
 * top-level key for an object carrying a `feeds` array.
 */
interface InitialState {
homeFeed?: {
feeds?: RawFeedItem[];
};
feed?: {
feeds?: RawFeedItem[];
};
explore?: {
feeds?: RawFeedItem[];
};
[key: string]: unknown;
}
// ---------------------------------------------------------------------------
// listFeeds — extract feeds from the explore page
// ---------------------------------------------------------------------------
/**
 * Load the Xiaohongshu explore (home) page and return its feed list.
 *
 * The feeds are taken from the `__INITIAL_STATE__` global when it is present
 * and yields at least one feed; otherwise the rendered DOM is scraped.
 *
 * @param page - A Playwright Page managed by BrowserManager.
 * @returns The extracted feeds (possibly empty).
 */
export async function listFeeds(page: Page): Promise<Feed[]> {
  log.debug('Navigating to explore page');
  await page.goto(EXPLORE_URL, { waitUntil: 'domcontentloaded' });

  // Give client-side hydration a moment to settle.
  await page.waitForTimeout(2000);

  // Preferred source: the hydration state.
  const state = await extractInitialState(page);
  if (state) {
    const stateFeeds = parseFeedsFromState(state);
    if (stateFeeds.length > 0) {
      log.info({ count: stateFeeds.length }, 'Extracted feeds from __INITIAL_STATE__');
      return stateFeeds;
    }
    log.debug('__INITIAL_STATE__ found but no feeds extracted, falling back to DOM');
  }

  // Fallback source: the rendered feed cards.
  log.debug('Falling back to DOM scraping for feed list');
  const domFeeds = await scrapeFeedsFromDom(page);
  log.info({ count: domFeeds.length }, 'Extracted feeds from DOM');
  return domFeeds;
}
// ---------------------------------------------------------------------------
// __INITIAL_STATE__ extraction
// ---------------------------------------------------------------------------
/**
 * Read `window.__INITIAL_STATE__` from the page.
 *
 * The expression is evaluated as a string in the browser and the result is
 * received as `unknown`, then cast on the Node side, so no DOM lib types are
 * needed.
 *
 * @returns The state object, or `null` when the evaluation fails (logged as a
 *   warning) or the value is missing / not an object (logged at debug level).
 */
async function extractInitialState(page: Page): Promise<InitialState | null> {
  let state: unknown;
  try {
    state = await page.evaluate('window.__INITIAL_STATE__');
  } catch (err: unknown) {
    log.warn({ err }, 'Failed to extract __INITIAL_STATE__');
    return null;
  }

  if (!state || typeof state !== 'object') {
    log.debug('__INITIAL_STATE__ is not present or not an object');
    return null;
  }
  return state as InitialState;
}
// ---------------------------------------------------------------------------
// Feed parsing from __INITIAL_STATE__
// ---------------------------------------------------------------------------
/**
 * Parse the raw initial state into structured Feed objects.
 *
 * Candidate feed lists are tried in this order and the first one that yields
 * at least one valid feed wins:
 *  1. `state.homeFeed.feeds`, `state.feed.feeds`, `state.explore.feeds`
 *  2. the `feeds` array of any other top-level object in the state
 *
 * A candidate that is empty, is not an array, or contains no parsable item is
 * skipped, so an empty primary list cannot hide a populated secondary one.
 *
 * @param state - The page's initial state object.
 * @returns The parsed feeds, or `[]` when no candidate produced any.
 */
function parseFeedsFromState(state: InitialState): Feed[] {
  // Parse one candidate; anything that is not an array of objects yields [].
  const toFeeds = (candidate: unknown): Feed[] => {
    if (!Array.isArray(candidate)) return [];
    return (candidate as unknown[])
      .filter((item): item is RawFeedItem => typeof item === 'object' && item !== null)
      .map((item) => parseRawFeedItem(item))
      .filter((f): f is Feed => f !== null);
  };

  // Known locations, in priority order.
  const knownLists: unknown[] = [
    state.homeFeed?.feeds,
    state.feed?.feeds,
    state.explore?.feeds,
  ];
  for (const candidate of knownLists) {
    const feeds = toFeeds(candidate);
    if (feeds.length > 0) return feeds;
  }

  // Unknown layout: look for any top-level object that carries a `feeds` array.
  for (const value of Object.values(state)) {
    if (value && typeof value === 'object' && !Array.isArray(value)) {
      const feeds = toFeeds((value as Record<string, unknown>)['feeds']);
      if (feeds.length > 0) return feeds;
    }
  }

  return [];
}
/**
 * Turn one raw feed item into a Feed.
 *
 * Values are read from the item itself first and from its nested `noteCard`
 * second. Missing strings become `''`, a missing like count is parsed from
 * `'0'`, and the type is `'video'` only when the raw type contains "video".
 *
 * @returns The Feed, or `null` when no ID could be determined.
 */
function parseRawFeedItem(raw: RawFeedItem): Feed | null {
  const card = raw.noteCard;
  const author = raw.user ?? card?.user;
  const interact =
    raw.interactInfo ?? raw.interact_info ?? card?.interactInfo ?? card?.interact_info;

  // Like count: interaction info first, then the counters on the item itself.
  const likeText =
    interact?.likedCount ??
    interact?.liked_count ??
    interact?.likeCount ??
    interact?.like_count ??
    raw.likedCount ??
    raw.liked_count ??
    '0';

  const typeText = raw.type ?? raw.model_type ?? card?.type ?? '';
  const isVideo = typeText.toLowerCase().includes('video');

  const feed: Feed = {
    id: raw.id ?? raw.noteId ?? raw.note_id ?? card?.noteId ?? '',
    xsecToken:
      raw.xsecToken ?? raw.xsec_token ?? card?.xsecToken ?? card?.xsec_token ?? '',
    title:
      raw.displayTitle ??
      raw.display_title ??
      raw.title ??
      card?.displayTitle ??
      card?.display_title ??
      card?.title ??
      '',
    description: raw.desc ?? raw.description ?? card?.desc ?? '',
    type: isVideo ? 'video' : 'normal',
    coverUrl: extractImageUrl(raw.cover ?? card?.cover),
    likeCount: parseCountString(likeText),
    user: {
      id: author?.userId ?? author?.user_id ?? '',
      nickname: author?.nickname ?? author?.nick_name ?? author?.nickName ?? '',
      avatar: author?.avatar ?? author?.avatarUrl ?? author?.avatar_url ?? '',
    },
  };

  // An item without an ID is not a usable feed.
  return feed.id ? feed : null;
}
// ---------------------------------------------------------------------------
// DOM scraping fallback — uses Playwright Node-side API exclusively
// ---------------------------------------------------------------------------
/**
 * Scrape feed data using Playwright's Node-side selectors (`page.$$`,
 * `$eval`) to avoid needing DOM lib types in our TypeScript config.
 *
 * Waits up to 10 s for a `.note-item` card, then reads each card. Cards
 * without a recognisable note ID in their cover link, and cards that fail to
 * parse, are skipped. `description` is always `''` on this path.
 *
 * The `xsec_token` taken from the cover link is percent-decoded, because
 * consumers of `Feed.xsecToken` encode it again when building URLs; keeping
 * the encoded form would double-encode it.
 *
 * @param page - Page already showing the explore view.
 * @returns The feeds found in the DOM (possibly empty).
 */
async function scrapeFeedsFromDom(page: Page): Promise<Feed[]> {
  // Decode a query value; a malformed escape sequence leaves it unchanged.
  const decodeQueryValue = (value: string): string => {
    try {
      return decodeURIComponent(value);
    } catch {
      return value;
    }
  };

  // Wait for at least one feed card to appear.
  await page.waitForSelector('.note-item', { timeout: 10_000 }).catch(() => null);

  const cardElements = await page.$$('.note-item');
  const feeds: Feed[] = [];

  for (const card of cardElements) {
    try {
      // The cover link carries both the note ID and the xsec_token.
      const href = await card
        .$eval('a.cover', (el) => el.getAttribute('href') ?? '')
        .catch(() => '');
      const id = href.match(/\/explore\/([a-f0-9]+)/)?.[1] ?? '';
      if (!id) continue;

      // Stop at '&' or '#' so a URL fragment is never part of the token.
      const rawToken = href.match(/xsec_token=([^&#]+)/)?.[1] ?? '';
      const xsecToken = decodeQueryValue(rawToken);

      // Cover image
      const coverUrl = await card
        .$eval('a.cover img', (el) => el.getAttribute('src') ?? '')
        .catch(() => '');

      // Title
      const title = await card
        .$eval('.footer .title', (el) => el.textContent?.trim() ?? '')
        .catch(() => '');

      // Author name
      const nickname = await card
        .$eval('.footer .author-wrapper .name', (el) => el.textContent?.trim() ?? '')
        .catch(() => '');

      // Author avatar
      const avatar = await card
        .$eval('.footer .author-wrapper .author-head img', (el) => el.getAttribute('src') ?? '')
        .catch(() => '');

      // Author ID from the profile link
      const authorHref = await card
        .$eval('.footer .author-wrapper a', (el) => el.getAttribute('href') ?? '')
        .catch(() => '');
      const userId = authorHref.match(/\/user\/profile\/([a-f0-9]+)/)?.[1] ?? '';

      // Like count
      const likeText = await card
        .$eval('.footer .like-wrapper .count', (el) => el.textContent?.trim() ?? '0')
        .catch(() => '0');
      const likeCount = parseCountString(likeText);

      // Type: a play icon on the card marks a video feed.
      const hasVideoIcon = await card
        .$('.play-icon')
        .then((el) => el !== null)
        .catch(() => false);

      feeds.push({
        id,
        xsecToken,
        title,
        description: '',
        type: hasVideoIcon ? 'video' : 'normal',
        coverUrl,
        likeCount,
        user: { id: userId, nickname, avatar },
      });
    } catch {
      // Skip cards that fail to parse.
      continue;
    }
  }

  return feeds;
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Pick a usable image URL out of a raw cover object.
 *
 * The first non-empty value among `url`, `urlPre`, `urlDefault`, `url_pre`
 * and `url_default` is used. Failing that, the `url` of the first entry of
 * `infoList` (or `info_list` when `infoList` is absent) is used. The chosen
 * URL is passed through ensureHttps().
 *
 * @returns The normalised URL, or `''` when nothing usable was found.
 */
function extractImageUrl(raw: RawImage | undefined): string {
  if (!raw) return '';

  const direct =
    raw.url || raw.urlPre || raw.urlDefault || raw.url_pre || raw.url_default;
  if (direct) return ensureHttps(direct);

  // Image variants: only the first entry is considered.
  const variants = raw.infoList ?? raw.info_list;
  const firstVariantUrl =
    variants && variants.length > 0 ? variants[0]?.url : undefined;
  return firstVariantUrl ? ensureHttps(firstVariantUrl) : '';
}
/**
 * Normalise a URL to the https scheme.
 *
 * Protocol-relative URLs (`//host/...`) get `https:` prepended and a leading
 * `http://` is replaced with `https://`. Anything else is returned unchanged.
 */
function ensureHttps(url: string): string {
  const insecurePrefix = 'http://';
  if (url.startsWith('//')) {
    return 'https:' + url;
  }
  return url.startsWith(insecurePrefix)
    ? 'https://' + url.slice(insecurePrefix.length)
    : url;
}
/**
 * Parse a display count into a number.
 *
 * Supported forms (commas and surrounding whitespace are ignored, matching is
 * case-insensitive):
 *  - plain integers: `"1,234"` gives 1234
 *  - thousands: `"3k"` / `"3千"` gives 3000
 *  - ten-thousands: `"1.2w"` / `"1.2万"` gives 12000
 *  - hundred-millions: `"1.2亿"` gives 120000000
 *
 * @param str - The text to parse. A number passed at runtime (the raw page
 *   data is not validated) is accepted as well.
 * @returns The parsed count, or 0 when the input is empty or has no leading
 *   number (this function never returns NaN).
 */
function parseCountString(str: string): number {
  if (!str) return 0;

  // String() guards against numeric values slipping through from raw JSON.
  const cleaned = String(str).replace(/,/g, '').trim().toLowerCase();

  let multiplier = 1;
  if (cleaned.includes('亿')) {
    multiplier = 100_000_000;
  } else if (cleaned.includes('w') || cleaned.includes('万')) {
    multiplier = 10_000;
  } else if (cleaned.includes('k') || cleaned.includes('千')) {
    multiplier = 1_000;
  }

  // Abbreviated counts may carry a fraction ("1.2w"); plain counts are integers.
  const base = multiplier === 1 ? parseInt(cleaned, 10) : parseFloat(cleaned);
  return Number.isFinite(base) ? Math.round(base * multiplier) : 0;
}
// Re-export for use by other modules (search, user-profile, feed-detail)
// that need the same extraction / parsing helpers.
export { extractInitialState, parseCountString, ensureHttps };
+537
View File
@@ -0,0 +1,537 @@
import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
import type { Router } from 'express';
import type { BrowserManager } from '../../browser/manager.js';
import { config } from '../../config/index.js';
import { withErrorHandling } from '../../utils/errors.js';
import { validateMediaPath } from '../../utils/downloader.js';
import { checkLoginStatus, getLoginQRCode, deleteCookies } from './login.js';
import { listFeeds } from './feeds.js';
import { searchFeeds } from './search.js';
import { getFeedDetail } from './feed-detail.js';
import { getUserProfile } from './user-profile.js';
import { publishImageNote } from './publish.js';
import { publishVideoNote } from './publish-video.js';
import { postComment, replyComment } from './comment.js';
import { toggleLike, toggleFavorite } from './interaction.js';
import { createXhsRoutes } from './routes.js';
import {
CheckLoginSchema,
GetLoginQRCodeSchema,
DeleteCookiesSchema,
ListFeedsSchema,
SearchSchema,
GetFeedDetailSchema,
GetUserProfileSchema,
PublishImageSchema,
PublishVideoSchema,
PostCommentSchema,
ReplyCommentSchema,
LikeSchema,
FavoriteSchema,
} from './schemas.js';
import type { SearchFilters } from './types.js';
import type { PlatformPlugin } from '../../server/app.js';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Platform identifier: used as the plugin name and passed to every `browser.withPage` call in this module. */
const PLATFORM = 'xiaohongshu';
/** Maximum file size for video uploads, in MB (500 MB); passed as `maxSizeMB` to validateMediaPath. */
const VIDEO_MAX_SIZE_MB = 500;
/**
 * Maximum file size for image uploads, in MB (20 MB); passed as `maxSizeMB`
 * to validateMediaPath.
 * NOTE(review): said to equal validateMediaPath's own default — confirm there.
 */
const IMAGE_MAX_SIZE_MB = 20;
// ---------------------------------------------------------------------------
// PlatformPlugin implementation
// ---------------------------------------------------------------------------
/**
 * Xiaohongshu platform plugin.
 *
 * `registerRoutes` mounts the REST router built by `createXhsRoutes`, and
 * `registerTools` registers the thirteen `xhs_*` MCP tools. Every tool runs
 * inside `withErrorHandling` and answers with a single text content item
 * holding the JSON-serialised result.
 */
export const xiaohongshuPlugin: PlatformPlugin = {
  name: PLATFORM,

  /** Mount the Xiaohongshu REST routes at the root of the given router. */
  registerRoutes(router: Router, browser: BrowserManager): void {
    router.use('/', createXhsRoutes(browser));
  },

  /** Register all Xiaohongshu MCP tools on the given server. */
  registerTools(server: McpServer, browser: BrowserManager): void {
    /** Wrap a payload as a one-item text result containing its JSON form. */
    const asJsonText = (payload: unknown) => ({
      content: [{ type: 'text' as const, text: JSON.stringify(payload) }],
    });

    /**
     * Resolve an operation timeout: the operation-specific value if set,
     * else the configured `default`, else the hard-coded fallback.
     */
    const timeoutOr = (specific: number | undefined, fallbackMs: number): number =>
      specific ?? config.operationTimeouts['default'] ?? fallbackMs;

    // ---- Login management ------------------------------------------------

    // Reports whether the stored session is logged in.
    server.tool(
      'xhs_check_login',
      'Check Xiaohongshu login status',
      CheckLoginSchema,
      async () =>
        withErrorHandling('xhs_check_login', async () => {
          const status = await browser.withPage(
            PLATFORM,
            async (page) => checkLoginStatus(page),
            timeoutOr(config.operationTimeouts['login'], 60_000),
          );
          return asJsonText(status);
        }),
    );

    // Returns the login QR code; getLoginQRCode manages its own page.
    server.tool(
      'xhs_get_login_qrcode',
      'Get Xiaohongshu login QR code (user scans with phone)',
      GetLoginQRCodeSchema,
      async () =>
        withErrorHandling('xhs_get_login_qrcode', async () => {
          const qr = await getLoginQRCode(browser);
          return asJsonText(qr);
        }),
    );

    // Removes the persisted cookies.
    server.tool(
      'xhs_delete_cookies',
      'Delete Xiaohongshu cookies and reset login session',
      DeleteCookiesSchema,
      async () =>
        withErrorHandling('xhs_delete_cookies', async () => {
          await deleteCookies(browser);
          return asJsonText({ success: true, message: 'Cookies deleted' });
        }),
    );

    // ---- Content browsing ------------------------------------------------

    // Lists the feeds shown on the explore page.
    server.tool(
      'xhs_list_feeds',
      'Get Xiaohongshu explore page recommended feed list',
      ListFeedsSchema,
      async () =>
        withErrorHandling('xhs_list_feeds', async () => {
          const feeds = await browser.withPage(
            PLATFORM,
            async (page) => listFeeds(page),
            timeoutOr(config.operationTimeouts['feed_list'], 60_000),
          );
          return asJsonText(feeds);
        }),
    );

    // Keyword search; filters are forwarded only when the caller supplied them.
    server.tool(
      'xhs_search',
      'Search Xiaohongshu notes by keyword with optional filters (sort, type, time range)',
      SearchSchema,
      async (args) =>
        withErrorHandling('xhs_search', async () => {
          const timeoutMs = timeoutOr(config.operationTimeouts['search'], 60_000);

          let filters: SearchFilters | undefined;
          if (args.filters) {
            const { sort, type, time } = args.filters;
            filters = { sort, type, time };
          }

          const feeds = await browser.withPage(
            PLATFORM,
            async (page) => searchFeeds(page, args.keyword, filters),
            timeoutMs,
          );
          return asJsonText(feeds);
        }),
    );

    // Fetches one note's detail.
    server.tool(
      'xhs_get_feed_detail',
      'Get Xiaohongshu note detail including content, images, stats, and comments',
      GetFeedDetailSchema,
      async (args) =>
        withErrorHandling('xhs_get_feed_detail', async () => {
          const detail = await browser.withPage(
            PLATFORM,
            async (page) =>
              getFeedDetail(page, args.feed_id, args.xsec_token, args.load_all_comments),
            timeoutOr(config.operationTimeouts['feed_detail'], 60_000),
          );
          return asJsonText(detail);
        }),
    );

    // Fetches a user's profile.
    server.tool(
      'xhs_get_user_profile',
      'Get Xiaohongshu user profile information including bio, stats, and recent notes',
      GetUserProfileSchema,
      async (args) =>
        withErrorHandling('xhs_get_user_profile', async () => {
          const profile = await browser.withPage(
            PLATFORM,
            async (page) => getUserProfile(page, args.user_id, args.xsec_token),
            timeoutOr(config.operationTimeouts['user_profile'], 60_000),
          );
          return asJsonText(profile);
        }),
    );

    // ---- Content publishing ----------------------------------------------

    // Publishes an image note from local image files.
    server.tool(
      'xhs_publish_image',
      'Publish an image note on Xiaohongshu. Provide local file paths for images.',
      PublishImageSchema,
      async (args) =>
        withErrorHandling('xhs_publish_image', async () => {
          // Fail fast: every path is validated, one after another, before a
          // browser page is requested.
          const validatedPaths: string[] = [];
          for (const candidate of args.images) {
            validatedPaths.push(
              await validateMediaPath(candidate, { maxSizeMB: IMAGE_MAX_SIZE_MB }),
            );
          }

          const timeoutMs = timeoutOr(config.operationTimeouts['publish'], 300_000);
          const outcome = await browser.withPage(
            PLATFORM,
            async (page) =>
              publishImageNote(page, args.title, args.content, validatedPaths, {
                tags: args.tags,
                scheduleAt: args.schedule_at,
                isOriginal: args.is_original,
                visibility: args.visibility,
              }),
            timeoutMs,
          );
          return asJsonText(outcome);
        }),
    );

    // Publishes a video note from a local video file.
    server.tool(
      'xhs_publish_video',
      'Publish a video note on Xiaohongshu. Provide a local file path for the video.',
      PublishVideoSchema,
      async (args) =>
        withErrorHandling('xhs_publish_video', async () => {
          // Fail fast: the path is validated before a browser page is requested.
          const validatedPath = await validateMediaPath(args.video, {
            maxSizeMB: VIDEO_MAX_SIZE_MB,
          });

          const timeoutMs = timeoutOr(config.operationTimeouts['publish'], 300_000);
          const outcome = await browser.withPage(
            PLATFORM,
            async (page) =>
              publishVideoNote(page, args.title, args.content, validatedPath, {
                tags: args.tags,
                scheduleAt: args.schedule_at,
                visibility: args.visibility,
              }),
            timeoutMs,
          );
          return asJsonText(outcome);
        }),
    );

    // ---- Interactions ----------------------------------------------------

    // Posts a top-level comment on a note.
    server.tool(
      'xhs_post_comment',
      'Post a comment on a Xiaohongshu note',
      PostCommentSchema,
      async (args) =>
        withErrorHandling('xhs_post_comment', async () => {
          const outcome = await browser.withPage(
            PLATFORM,
            async (page) =>
              postComment(page, args.feed_id, args.xsec_token, args.content),
            timeoutOr(config.operationTimeouts['comment'], 20_000),
          );
          return asJsonText(outcome);
        }),
    );

    // Replies to an existing comment on a note.
    server.tool(
      'xhs_reply_comment',
      'Reply to a comment on a Xiaohongshu note',
      ReplyCommentSchema,
      async (args) =>
        withErrorHandling('xhs_reply_comment', async () => {
          const outcome = await browser.withPage(
            PLATFORM,
            async (page) =>
              replyComment(
                page,
                args.feed_id,
                args.xsec_token,
                args.content,
                args.comment_id,
                args.user_id,
              ),
            timeoutOr(config.operationTimeouts['reply'], 20_000),
          );
          return asJsonText(outcome);
        }),
    );

    // Likes or unlikes a note.
    server.tool(
      'xhs_like',
      'Like or unlike a Xiaohongshu note',
      LikeSchema,
      async (args) =>
        withErrorHandling('xhs_like', async () => {
          const outcome = await browser.withPage(
            PLATFORM,
            async (page) =>
              toggleLike(page, args.feed_id, args.xsec_token, args.unlike),
            timeoutOr(config.operationTimeouts['like'], 15_000),
          );
          return asJsonText(outcome);
        }),
    );

    // Favorites or unfavorites a note.
    server.tool(
      'xhs_favorite',
      'Favorite or unfavorite a Xiaohongshu note',
      FavoriteSchema,
      async (args) =>
        withErrorHandling('xhs_favorite', async () => {
          const outcome = await browser.withPage(
            PLATFORM,
            async (page) =>
              toggleFavorite(page, args.feed_id, args.xsec_token, args.unfavorite),
            timeoutOr(config.operationTimeouts['favorite'], 15_000),
          );
          return asJsonText(outcome);
        }),
    );
  },
};
+214
View File
@@ -0,0 +1,214 @@
import type { Page } from 'rebrowser-playwright';
import { logger } from '../../utils/logger.js';
import { XHS_SELECTORS } from './selectors.js';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Base URL for note detail pages; buildFeedUrl appends the note ID and query string to it. */
const FEED_DETAIL_URL = 'https://www.xiaohongshu.com/explore';
/** Wait after clicking like/favorite to let the state update. */
const TOGGLE_SETTLE_MS = 1_000;
/** Selectors for the like / favorite buttons and their "active" variants. */
const selInteraction = XHS_SELECTORS.interaction;
/** Selectors for the note detail page; this module only uses the note container. */
const selDetail = XHS_SELECTORS.feedDetail;
/** Child logger tagged with this module's name. */
const log = logger.child({ module: 'xhs-interaction' });
// ---------------------------------------------------------------------------
// toggleLike
// ---------------------------------------------------------------------------
/**
 * Bring the like state of a Xiaohongshu note to the requested state.
 *
 * The note's detail page is opened, the current state is read from the
 * presence of the "active" like selector, the like button is clicked only
 * when the state differs from the requested one, and the state is read again
 * afterwards to verify the click took effect.
 *
 * @param page - Playwright Page managed by BrowserManager.
 * @param feedId - The note / feed ID.
 * @param xsecToken - Security token for accessing the feed page.
 * @param unlike - When true the note should end up not liked; otherwise liked.
 * @returns `success` says whether the final state matches the request;
 *          `liked` is the state observed after the click, the requested state
 *          when no click was needed, or the prior state when the button was
 *          not found.
 */
export async function toggleLike(
  page: Page,
  feedId: string,
  xsecToken: string,
  unlike?: boolean,
): Promise<{ success: boolean; liked: boolean }> {
  const wantLiked = !unlike;
  log.info({ feedId, unlike: unlike ?? false }, 'Toggling like on note');

  // Open the detail page and wait until the note container is rendered.
  await page.goto(buildFeedUrl(feedId, xsecToken), { waitUntil: 'domcontentloaded' });
  await page.waitForSelector(selDetail.noteContainer, { timeout: 10_000 });
  await page.waitForTimeout(1_000);

  // Read the state before touching anything.
  const likedBefore = await isElementActive(page, selInteraction.likeButtonActive);
  log.debug(
    { isCurrentlyLiked: likedBefore, desiredUnlike: unlike ?? false },
    'Current like state',
  );

  // Nothing to do when the note is already in the requested state.
  if (likedBefore === wantLiked) {
    log.info(
      { feedId, liked: wantLiked, alreadyInState: true },
      'Like already in desired state',
    );
    return { success: true, liked: wantLiked };
  }

  // Click the button and give the UI time to update.
  const button = await page.$(selInteraction.likeButton);
  if (!button) {
    log.warn('Like button not found on feed detail page');
    return { success: false, liked: likedBefore };
  }
  await button.click();
  await page.waitForTimeout(TOGGLE_SETTLE_MS);

  // Verify the resulting state.
  const likedAfter = await isElementActive(page, selInteraction.likeButtonActive);
  const success = likedAfter === wantLiked;
  log.info({ feedId, liked: likedAfter, success }, 'Like toggle complete');
  return { success, liked: likedAfter };
}
// ---------------------------------------------------------------------------
// toggleFavorite
// ---------------------------------------------------------------------------
/**
 * Bring the favorite state of a Xiaohongshu note to the requested state.
 *
 * The note's detail page is opened, the current state is read from the
 * presence of the "active" favorite selector, the favorite button is clicked
 * only when the state differs from the requested one, and the state is read
 * again afterwards to verify the click took effect.
 *
 * @param page - Playwright Page managed by BrowserManager.
 * @param feedId - The note / feed ID.
 * @param xsecToken - Security token for accessing the feed page.
 * @param unfavorite - When true the note should end up not favorited;
 *                     otherwise favorited.
 * @returns `success` says whether the final state matches the request;
 *          `favorited` is the state observed after the click, the requested
 *          state when no click was needed, or the prior state when the button
 *          was not found.
 */
export async function toggleFavorite(
  page: Page,
  feedId: string,
  xsecToken: string,
  unfavorite?: boolean,
): Promise<{ success: boolean; favorited: boolean }> {
  const wantFavorited = !unfavorite;
  log.info({ feedId, unfavorite: unfavorite ?? false }, 'Toggling favorite on note');

  // Open the detail page and wait until the note container is rendered.
  await page.goto(buildFeedUrl(feedId, xsecToken), { waitUntil: 'domcontentloaded' });
  await page.waitForSelector(selDetail.noteContainer, { timeout: 10_000 });
  await page.waitForTimeout(1_000);

  // Read the state before touching anything.
  const favoritedBefore = await isElementActive(
    page,
    selInteraction.favoriteButtonActive,
  );
  log.debug(
    { isCurrentlyFavorited: favoritedBefore, desiredUnfavorite: unfavorite ?? false },
    'Current favorite state',
  );

  // Nothing to do when the note is already in the requested state.
  if (favoritedBefore === wantFavorited) {
    log.info(
      { feedId, favorited: wantFavorited, alreadyInState: true },
      'Favorite already in desired state',
    );
    return { success: true, favorited: wantFavorited };
  }

  // Click the button and give the UI time to update.
  const button = await page.$(selInteraction.favoriteButton);
  if (!button) {
    log.warn('Favorite button not found on feed detail page');
    return { success: false, favorited: favoritedBefore };
  }
  await button.click();
  await page.waitForTimeout(TOGGLE_SETTLE_MS);

  // Verify the resulting state.
  const favoritedAfter = await isElementActive(
    page,
    selInteraction.favoriteButtonActive,
  );
  const success = favoritedAfter === wantFavorited;
  log.info({ feedId, favorited: favoritedAfter, success }, 'Favorite toggle complete');
  return { success, favorited: favoritedAfter };
}
// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------
/**
 * Build the URL for a feed detail page.
 *
 * Both caller-supplied values are percent-encoded: `feedId` is a path
 * segment and `xsecToken` a query value, so a value containing `/`, `?`,
 * `#` or `&` cannot change which path or parameters are navigated to.
 * IDs made only of letters and digits are left exactly as they are.
 *
 * @param feedId - The note / feed ID.
 * @param xsecToken - Security token for accessing the feed page.
 * @returns The absolute detail-page URL.
 */
function buildFeedUrl(feedId: string, xsecToken: string): string {
  const id = encodeURIComponent(feedId);
  const token = encodeURIComponent(xsecToken);
  return `${FEED_DETAIL_URL}/${id}?xsec_token=${token}&xsec_source=pc_search`;
}
/**
 * Report whether at least one element on the page matches `selector`.
 *
 * The like / favorite code passes the selector of a button's "active"
 * variant, so a match means the button is currently toggled on.
 *
 * @param page - Page to query.
 * @param selector - Selector to look for.
 * @returns `true` when `page.$` finds a match, `false` when it yields `null`.
 */
async function isElementActive(page: Page, selector: string): Promise<boolean> {
  return (await page.$(selector)) !== null;
}
+186
View File
@@ -0,0 +1,186 @@
import type { Page } from 'rebrowser-playwright';
import type { BrowserManager } from '../../browser/manager.js';
import { logger } from '../../utils/logger.js';
import { cookieStore } from '../../cookie/store.js';
import { XHS_SELECTORS } from './selectors.js';
import type { LoginStatus, QRCodeResult } from './types.js';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Platform identifier passed to BrowserManager and the cookie store. */
const PLATFORM = 'xiaohongshu';
/** Page opened both for the login check and for showing the QR code. */
const EXPLORE_URL = 'https://www.xiaohongshu.com/explore';
/** How long to wait for the user to scan the QR code (4 minutes). */
const QR_SCAN_TIMEOUT_MS = 4 * 60 * 1000;
/** Child logger tagged with this module's name. */
const log = logger.child({ module: 'xhs-login' });
// ---------------------------------------------------------------------------
// checkLoginStatus
// ---------------------------------------------------------------------------
/**
 * Open the explore page and decide whether the session is logged in by
 * waiting up to 5 seconds for the logged-in indicator element.
 *
 * @param page - A Playwright Page already managed by the caller.
 * @returns `{ loggedIn: false }` when the indicator does not show up;
 *          otherwise `{ loggedIn: true }`, plus `username` when the
 *          indicator has non-empty text content.
 */
export async function checkLoginStatus(page: Page): Promise<LoginStatus> {
  await page.goto(EXPLORE_URL, { waitUntil: 'domcontentloaded' });

  // A timeout (or any other failure) while waiting counts as "not logged in".
  let indicator;
  try {
    indicator = await page.waitForSelector(XHS_SELECTORS.login.loggedInIndicator, {
      timeout: 5_000,
    });
  } catch {
    return { loggedIn: false };
  }
  if (!indicator) {
    return { loggedIn: false };
  }

  // Best effort: the indicator's text is reported as the username.
  let username: string | null;
  try {
    username = await indicator.textContent();
  } catch {
    username = null;
  }

  return username
    ? { loggedIn: true, username: username.trim() }
    : { loggedIn: true };
}
// ---------------------------------------------------------------------------
// getLoginQRCode
// ---------------------------------------------------------------------------
/**
 * Produce the login QR code for the user to scan.
 *
 * A page is obtained with `acquirePage` rather than `withPage` because it
 * has to stay open after this function returns: once the QR code has been
 * read, a background task keeps waiting for the scan and releases the page
 * when it is done. On every path that does not reach that hand-off, the page
 * is released here.
 *
 * @param browser - The shared BrowserManager instance.
 * @returns The QR code image `src` with a `'4m'` timeout hint, or an
 *          `alreadyLoggedIn` result with empty QR data.
 * @throws When the QR code does not appear or its `src` attribute is empty.
 */
export async function getLoginQRCode(
  browser: BrowserManager,
): Promise<QRCodeResult> {
  const { page, release } = await browser.acquirePage(PLATFORM);

  /** Resolve to whether `selector` shows up within `timeoutMs`; never rejects. */
  const appearsWithin = async (selector: string, timeoutMs: number): Promise<boolean> => {
    try {
      await page.waitForSelector(selector, { timeout: timeoutMs });
      return true;
    } catch {
      return false;
    }
  };

  try {
    await page.goto(EXPLORE_URL, { waitUntil: 'domcontentloaded' });

    // Nothing to scan when the session is already logged in.
    if (await appearsWithin(XHS_SELECTORS.login.loggedInIndicator, 3_000)) {
      await release();
      return { qrcodeData: '', alreadyLoggedIn: true, timeout: '0' };
    }

    // Open the login modal when the QR code is not on screen yet.
    const qrAlreadyShown = await appearsWithin(XHS_SELECTORS.login.qrCodeImage, 3_000);
    if (!qrAlreadyShown) {
      const loginBtn = await page
        .waitForSelector(XHS_SELECTORS.login.loginButton, { timeout: 5_000 })
        .catch(() => null);
      if (loginBtn) {
        await loginBtn.click();
      }
    }

    // This wait is allowed to fail: without a QR code there is nothing to return.
    const qrImage = await page.waitForSelector(XHS_SELECTORS.login.qrCodeImage, {
      timeout: 10_000,
    });
    const qrcodeData = await qrImage.getAttribute('src');
    if (!qrcodeData) {
      await release();
      throw new Error('QR code image src attribute is empty');
    }

    // Hand the page over to the background wait. It is deliberately not
    // awaited; the `.catch()` keeps an unexpected failure from becoming an
    // unhandled rejection.
    waitForLoginAndRelease(page, browser, release).catch((err: unknown) => {
      log.error({ err }, 'Login wait flow encountered an unexpected error');
    });

    return { qrcodeData, alreadyLoggedIn: false, timeout: '4m' };
  } catch (err) {
    // Failed before the hand-off: the page must not stay acquired.
    await release();
    throw err;
  }
}
// ---------------------------------------------------------------------------
// deleteCookies
// ---------------------------------------------------------------------------
/**
 * Delete the persisted cookies for the Xiaohongshu platform by calling
 * `cookieStore.delete`, then log that it happened.
 *
 * NOTE(review): only the cookie store is touched here. Whether an already
 * open browser context keeps its in-memory cookies (and could persist them
 * again later) cannot be told from this function — confirm against
 * BrowserManager.
 *
 * @param _browser - The shared BrowserManager instance. Currently unused;
 *                   accepted so the signature matches the other login
 *                   helpers.
 */
export async function deleteCookies(_browser: BrowserManager): Promise<void> {
await cookieStore.delete(PLATFORM);
log.info('Xiaohongshu cookies deleted');
}
// ---------------------------------------------------------------------------
// Internal: waitForLoginAndRelease
// ---------------------------------------------------------------------------
/**
 * Background task that waits for the logged-in indicator to appear (meaning
 * the user has scanned the QR code) and then persists the cookies.
 *
 * The two failure modes are kept apart:
 * - the wait ending without the indicator (timeout, page closed) is an
 *   expected outcome and is logged at debug level;
 * - a failure while saving cookies after a detected login is logged at
 *   error level, so it is not mistaken for "the user did not scan".
 *
 * The page is released in all cases via `finally`. The original notes state
 * that `release` is idempotent, so a concurrent release by the acquirePage
 * safety timer is expected to be harmless.
 *
 * @param page - The page showing the QR code.
 * @param browser - The shared BrowserManager instance, used to save cookies.
 * @param release - Callback that releases the acquired page.
 */
async function waitForLoginAndRelease(
  page: Page,
  browser: BrowserManager,
  release: () => Promise<void>,
): Promise<void> {
  try {
    try {
      await page.waitForSelector(XHS_SELECTORS.login.loggedInIndicator, {
        timeout: QR_SCAN_TIMEOUT_MS,
      });
    } catch {
      // Timeout or page closed — the user did not scan in time, or the page
      // was released from elsewhere.
      log.debug('Login wait ended without successful scan');
      return;
    }

    log.info('QR code scanned — login detected, saving cookies');
    try {
      await browser.saveCookies(PLATFORM);
    } catch (err: unknown) {
      // The login itself succeeded; only persisting it failed. Surface that
      // instead of reporting a missed scan.
      log.error({ err }, 'Login detected but saving cookies failed');
    }
  } finally {
    await release();
  }
}
+313
View File
@@ -0,0 +1,313 @@
import type { Page } from 'rebrowser-playwright';
import { logger } from '../../utils/logger.js';
import { XHS_SELECTORS } from './selectors.js';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Creator-center page that publishVideoNote navigates to. */
const CREATOR_PUBLISH_URL = 'https://creator.xiaohongshu.com/publish/publish';
/**
 * Maximum time to wait for video upload to complete.
 * Videos can be up to 500 MB, so we allow up to 4 minutes for upload.
 */
const VIDEO_UPLOAD_TIMEOUT_MS = 240_000;
/** Wait after upload completes to let the UI settle. */
const UPLOAD_SETTLE_MS = 2_000;
/** Wait after filling form fields. */
const FIELD_SETTLE_MS = 500;
/**
 * Wait after clicking publish before checking result.
 * NOTE(review): not referenced in the part of this module visible here —
 * presumably used by waitForPublishResult; confirm.
 */
const PUBLISH_SETTLE_MS = 3_000;
/** Selectors for the publish form. */
const sel = XHS_SELECTORS.publish;
/** Child logger tagged with this module's name. */
const log = logger.child({ module: 'xhs-publish-video' });
// ---------------------------------------------------------------------------
// publishVideoNote
// ---------------------------------------------------------------------------
/** Optional settings accepted by publishVideoNote. */
export interface PublishVideoOptions {
/** Hashtags / topics; each is typed into the content editor as `#<tag>`. */
tags?: string[];
/**
 * Scheduled publish time, typed verbatim into the schedule input.
 * NOTE(review): the expected format is not visible here — confirm against
 * the tool schema.
 */
scheduleAt?: string;
/**
 * `'private'` or `'friends'` selects the matching visibility option. When
 * omitted or `'public'`, the visibility control is not touched; any other
 * value selects the public option.
 */
visibility?: string;
}
/**
 * Publish a video note through the Xiaohongshu creator-center form.
 *
 * The steps run strictly in order: open the publish page, upload the video,
 * type the title, type the body, then apply the optional tags, visibility
 * and schedule, click publish, and wait for the publish result.
 *
 * @param page - Playwright Page managed by BrowserManager.
 * @param title - Note title.
 * @param content - Note body / description text.
 * @param videoPath - Local path to the video file (already validated by caller).
 * @param options - Optional tags, schedule, and visibility.
 * @returns Whatever `waitForPublishResult` reports: a success flag and,
 *          optionally, a note ID.
 */
export async function publishVideoNote(
  page: Page,
  title: string,
  content: string,
  videoPath: string,
  options?: PublishVideoOptions,
): Promise<{ success: boolean; noteId?: string }> {
  const { tags, visibility, scheduleAt }: PublishVideoOptions = options ?? {};
  log.info({ hasOptions: Boolean(options) }, 'Starting video note publish');

  // Open the publish page and give the SPA a moment to hydrate.
  await page.goto(CREATOR_PUBLISH_URL, { waitUntil: 'domcontentloaded' });
  await page.waitForTimeout(2_000);

  // Upload: the file input used here is the same one used for images.
  const uploadInput = await page.waitForSelector(sel.imageFileInput, {
    timeout: 10_000,
  });
  await uploadInput.setInputFiles(videoPath);
  log.debug('Video file set on input element, waiting for upload to complete');

  // The uploaded-video element showing up marks the end of the upload, which
  // is why this wait gets the long video timeout.
  await page.waitForSelector(sel.uploadedVideoItem, {
    timeout: VIDEO_UPLOAD_TIMEOUT_MS,
  });
  await page.waitForTimeout(UPLOAD_SETTLE_MS);
  log.debug('Video uploaded successfully');

  // Title: focus, clear, then type with a per-key delay.
  const titleField = await page.waitForSelector(sel.titleInput, { timeout: 5_000 });
  await titleField.click();
  await titleField.fill('');
  await page.keyboard.type(title, { delay: 30 });
  await page.waitForTimeout(FIELD_SETTLE_MS);

  // Body text.
  const bodyEditor = await page.waitForSelector(sel.contentEditor, { timeout: 5_000 });
  await bodyEditor.click();
  await page.keyboard.type(content, { delay: 20 });
  await page.waitForTimeout(FIELD_SETTLE_MS);

  // Optional extras, applied only when requested.
  if (tags && tags.length > 0) {
    await addTags(page, tags);
  }
  if (visibility && visibility !== 'public') {
    await setVisibility(page, visibility);
  }
  if (scheduleAt) {
    await setSchedule(page, scheduleAt);
  }

  // Submit and wait for the outcome.
  const submitButton = await page.waitForSelector(sel.publishButton, {
    timeout: 5_000,
  });
  await submitButton.click();
  log.debug('Publish button clicked, waiting for success');

  const outcome = await waitForPublishResult(page);
  log.info({ result: outcome }, 'Video note publish complete');
  return outcome;
}
// ---------------------------------------------------------------------------
// Internal helpers (shared patterns with publish.ts — kept separate to
// avoid circular imports and keep each module self-contained)
// ---------------------------------------------------------------------------
/**
 * Type each tag into the content editor as `#<tag>`.
 *
 * After typing a tag, the first suggestion item is clicked if one appears
 * within 3 seconds; otherwise Enter is pressed.
 *
 * @param page - Page showing the publish form.
 * @param tags - Tag names without the leading `#`.
 */
async function addTags(page: Page, tags: string[]): Promise<void> {
  for (const tagName of tags) {
    // Put the focus back into the editor when it can be found.
    const editorHandle = await page.$(sel.contentEditor);
    if (editorHandle) {
      await editorHandle.click();
      await page.waitForTimeout(300);
    }

    await page.keyboard.type(`#${tagName}`, { delay: 50 });
    await page.waitForTimeout(800);

    // A missing suggestion list is not an error; Enter is pressed instead.
    const suggestion = await page
      .waitForSelector(sel.tagSuggestionItem, { timeout: 3_000 })
      .catch(() => null);
    if (!suggestion) {
      await page.keyboard.press('Enter');
    } else {
      await suggestion.click();
    }

    await page.waitForTimeout(FIELD_SETTLE_MS);
  }
}
/**
 * Pick a visibility option for the note being composed.
 *
 * Opens the visibility control, then clicks the option matching
 * `visibility`: `'private'` and `'friends'` map to their dedicated selectors,
 * any other value maps to the public option.
 *
 * NOTE(review): when the control or the option cannot be found this only logs
 * at debug level and returns; the caller then carries on and clicks publish,
 * so the requested visibility may not have been applied — confirm that this
 * best-effort behaviour is intended.
 *
 * @param page - Page that already shows the publish form.
 * @param visibility - Requested visibility keyword.
 */
async function setVisibility(page: Page, visibility: string): Promise<void> {
  const trigger = await page.$(sel.visibilityButton);
  if (trigger === null) {
    log.debug('Visibility button not found, skipping');
    return;
  }

  await trigger.click();
  await page.waitForTimeout(FIELD_SETTLE_MS);

  const targetSelector: string =
    visibility === 'private'
      ? sel.visibilityPrivate
      : visibility === 'friends'
        ? sel.visibilityFriends
        : sel.visibilityPublic;

  const choice = await page.$(targetSelector);
  if (choice === null) {
    log.debug({ visibility }, 'Visibility option not found');
    return;
  }

  await choice.click();
  await page.waitForTimeout(FIELD_SETTLE_MS);
}
/**
 * Open the schedule picker and enter the requested publish time.
 *
 * The schedule input is cleared, `scheduleAt` is typed into it verbatim and
 * Enter is pressed to confirm.
 *
 * NOTE(review): if the schedule button or input is missing this logs at debug
 * level and returns; the caller still clicks publish afterwards, so the note
 * may go out without the requested schedule — confirm this is intended.
 *
 * @param page - Page that already shows the publish form.
 * @param scheduleAt - Text typed into the schedule input, unmodified.
 */
async function setSchedule(page: Page, scheduleAt: string): Promise<void> {
  const opener = await page.$(sel.scheduleButton);
  if (opener === null) {
    log.debug('Schedule button not found, skipping');
    return;
  }

  await opener.click();
  await page.waitForTimeout(FIELD_SETTLE_MS);

  const dateField = await page.$(sel.scheduleInput);
  if (dateField === null) {
    log.debug('Schedule input not found');
    return;
  }

  // Focus and clear the field first so the typed value replaces any default.
  await dateField.click();
  await dateField.fill('');
  await page.keyboard.type(scheduleAt, { delay: 30 });
  await page.keyboard.press('Enter');
  await page.waitForTimeout(FIELD_SETTLE_MS);
}
/**
 * Wait for evidence that the publish click succeeded.
 *
 * Two indicators are watched concurrently, each for up to 30 seconds: the URL
 * matching the success pattern, and the success element appearing. Either
 * one is sufficient, so the wait ends as soon as the first indicator turns
 * positive. Awaiting both with `Promise.all` would force every successful
 * publish that triggers only one indicator to sit out the other indicator's
 * full timeout.
 *
 * If neither indicator fires, the page HTML is searched for the success texts
 * as a last resort.
 *
 * @param page - Page on which the publish button was just clicked.
 * @returns `{ success: false }` when no indicator was found; otherwise
 *   `{ success: true, noteId }`, where `noteId` is taken from the current URL
 *   and may be `undefined`.
 */
async function waitForPublishResult(
  page: Page,
): Promise<{ success: boolean; noteId?: string }> {
  // Both watchers swallow their timeout and report `false`, so they never reject.
  const urlChanged = page
    .waitForURL(sel.publishSuccessUrlPattern, { timeout: 30_000 })
    .then(() => true)
    .catch(() => false);
  const successElementShown = page
    .waitForSelector(sel.publishSuccess, { timeout: 30_000 })
    .then(() => true)
    .catch(() => false);

  // Baseline pause so the click has time to be processed.
  await page.waitForTimeout(PUBLISH_SETTLE_MS);

  // Resolve `true` on the first positive indicator, `false` once all are negative.
  const watchers = [urlChanged, successElementShown];
  const indicatorSeen = await new Promise<boolean>((resolve) => {
    let unresolved = watchers.length;
    for (const watcher of watchers) {
      void watcher.then((seen) => {
        unresolved -= 1;
        if (seen) {
          resolve(true);
        } else if (unresolved === 0) {
          resolve(false);
        }
      });
    }
  });

  if (indicatorSeen) {
    // The success element can show up before the redirect; give the URL a
    // short, bounded grace period so the noteId can still be read from it.
    await Promise.race([urlChanged, page.waitForTimeout(PUBLISH_SETTLE_MS)]);
  } else {
    const pageContent = await page.content();
    const hasSuccessText =
      pageContent.includes('发布成功') || pageContent.includes('已发布');
    if (!hasSuccessText) {
      log.warn('No success indicator found after video publish');
      return { success: false };
    }
  }

  const noteId = extractNoteIdFromUrl(page.url());
  return { success: true, noteId };
}
/**
 * Try to read a note / post ID out of a URL.
 *
 * Lookup order: a non-empty `noteId` query parameter, then a lowercase
 * hexadecimal segment following `/note/` in the path.
 *
 * @param url - Absolute URL string to inspect.
 * @returns The ID, or `undefined` when the URL cannot be parsed or carries none.
 */
function extractNoteIdFromUrl(url: string): string | undefined {
  let target: URL;
  try {
    target = new URL(url);
  } catch {
    // Unparseable URL — the noteId is optional, so simply report none.
    return undefined;
  }

  const fromQuery = target.searchParams.get('noteId');
  if (fromQuery !== null && fromQuery !== '') {
    return fromQuery;
  }

  const fromPath = /\/note\/([a-f0-9]+)/.exec(target.pathname);
  return fromPath?.[1];
}
+375
View File
@@ -0,0 +1,375 @@
import type { Page } from 'rebrowser-playwright';
import { logger } from '../../utils/logger.js';
import { XHS_SELECTORS } from './selectors.js';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Creator-center page that publishImageNote() navigates to before uploading. */
const CREATOR_PUBLISH_URL = 'https://creator.xiaohongshu.com/publish/publish';
/** Maximum time to wait for image uploads to finish (60 seconds). */
const UPLOAD_WAIT_TIMEOUT_MS = 60_000;
/** Polling interval (ms) between checks for uploaded-image thumbnails. */
const UPLOAD_POLL_INTERVAL_MS = 500;
/** Pause (ms) after all thumbnails are present, to let the UI settle. */
const UPLOAD_SETTLE_MS = 1_500;
/** Wait after filling form fields to let debounce / auto-save settle. */
const FIELD_SETTLE_MS = 500;
/** Wait after clicking publish before checking result. */
const PUBLISH_SETTLE_MS = 3_000;
/** Selector group for the publish form (the `publish` entry of XHS_SELECTORS). */
const sel = XHS_SELECTORS.publish;
/** Child logger whose entries are tagged with module 'xhs-publish'. */
const log = logger.child({ module: 'xhs-publish' });
// ---------------------------------------------------------------------------
// publishImageNote
// ---------------------------------------------------------------------------
/** Optional settings accepted by publishImageNote(). */
export interface PublishImageOptions {
/** Tag texts; each one is typed into the content editor prefixed with `#`. */
tags?: string[];
/** Publish time; typed verbatim into the schedule input when provided. */
scheduleAt?: string;
/** When true, the "original content" checkbox is ticked if it is present. */
isOriginal?: boolean;
/** Requested visibility; the visibility picker is only opened for values other than 'public'. */
visibility?: string;
}
/**
 * Publish an image note on Xiaohongshu via the creator center UI.
 *
 * The steps run strictly in order: navigate to the publish page, upload the
 * images, fill in title and content, apply the optional settings (tags,
 * original flag, visibility, schedule), click publish, and finally wait for a
 * success indicator.
 *
 * @param page - Playwright Page managed by BrowserManager.
 * @param title - Note title (required, non-empty).
 * @param content - Note body / description text.
 * @param imagePaths - Array of local file paths (already validated by caller).
 * @param options - Optional tags, schedule, original flag, and visibility.
 * @returns Object indicating success and an optional noteId if detectable.
 * @throws Error when the uploaded-image thumbnails do not all appear within
 *   UPLOAD_WAIT_TIMEOUT_MS (raised by waitForUploadedImages). The
 *   `waitForSelector` calls carry explicit timeouts and presumably reject
 *   when a required form element never shows up — confirm against Playwright.
 */
export async function publishImageNote(
page: Page,
title: string,
content: string,
imagePaths: string[],
options?: PublishImageOptions,
): Promise<{ success: boolean; noteId?: string }> {
// Only counts and flags are logged here, not the title or content.
log.info(
{ imageCount: imagePaths.length, hasOptions: !!options },
'Starting image note publish',
);
// -------------------------------------------------------------------------
// 1. Navigate to the creator publish page
// -------------------------------------------------------------------------
await page.goto(CREATOR_PUBLISH_URL, { waitUntil: 'domcontentloaded' });
// Allow the SPA to hydrate.
await page.waitForTimeout(2_000);
// -------------------------------------------------------------------------
// 2. Upload images via the file input
// -------------------------------------------------------------------------
const fileInput = await page.waitForSelector(sel.imageFileInput, {
timeout: 10_000,
});
// Playwright's setInputFiles supports multiple files at once.
await fileInput.setInputFiles(imagePaths);
log.debug({ count: imagePaths.length }, 'Files set on input element');
// Wait for all image thumbnails to appear (one per uploaded image).
// Poll using page.$$ (Node-side API) to avoid needing browser-context
// DOM types which are not available in our TypeScript lib config.
await waitForUploadedImages(page, imagePaths.length);
// Give the UI a moment to settle after all uploads.
await page.waitForTimeout(UPLOAD_SETTLE_MS);
log.debug('All images uploaded successfully');
// -------------------------------------------------------------------------
// 3. Fill in the title
// -------------------------------------------------------------------------
const titleInput = await page.waitForSelector(sel.titleInput, {
timeout: 5_000,
});
// Focus and clear the field, then type key by key with a small delay.
await titleInput.click();
await titleInput.fill('');
await page.keyboard.type(title, { delay: 30 });
await page.waitForTimeout(FIELD_SETTLE_MS);
// -------------------------------------------------------------------------
// 4. Fill in the content / description
// -------------------------------------------------------------------------
const contentEditor = await page.waitForSelector(sel.contentEditor, {
timeout: 5_000,
});
// The editor is clicked for focus but, unlike the title, is not cleared first.
await contentEditor.click();
await page.keyboard.type(content, { delay: 20 });
await page.waitForTimeout(FIELD_SETTLE_MS);
// -------------------------------------------------------------------------
// 5. Add tags (optional)
// -------------------------------------------------------------------------
if (options?.tags && options.tags.length > 0) {
await addTags(page, options.tags);
}
// -------------------------------------------------------------------------
// 6. Set original flag (optional)
// -------------------------------------------------------------------------
if (options?.isOriginal) {
await setOriginal(page);
}
// -------------------------------------------------------------------------
// 7. Set visibility (optional)
// -------------------------------------------------------------------------
// 'public' is skipped on purpose: the picker is only touched for other values.
if (options?.visibility && options.visibility !== 'public') {
await setVisibility(page, options.visibility);
}
// -------------------------------------------------------------------------
// 8. Set schedule (optional)
// -------------------------------------------------------------------------
if (options?.scheduleAt) {
await setSchedule(page, options.scheduleAt);
}
// -------------------------------------------------------------------------
// 9. Click the publish button
// -------------------------------------------------------------------------
const publishBtn = await page.waitForSelector(sel.publishButton, {
timeout: 5_000,
});
await publishBtn.click();
log.debug('Publish button clicked, waiting for success');
// -------------------------------------------------------------------------
// 10. Wait for success indicator
// -------------------------------------------------------------------------
const result = await waitForPublishResult(page);
log.info({ result }, 'Image note publish complete');
return result;
}
// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------
/**
 * Block until at least `expectedCount` uploaded-image thumbnails are present.
 *
 * The thumbnail count is read through `page.$$` (a Node-side query), so no
 * browser-context DOM typings are needed. The check repeats every
 * UPLOAD_POLL_INTERVAL_MS until UPLOAD_WAIT_TIMEOUT_MS has elapsed.
 *
 * @param page - Page showing the publish form.
 * @param expectedCount - Number of thumbnails that must be present.
 * @throws Error when the deadline passes before enough thumbnails appear.
 */
async function waitForUploadedImages(
  page: Page,
  expectedCount: number,
): Promise<void> {
  const giveUpAt = Date.now() + UPLOAD_WAIT_TIMEOUT_MS;

  while (Date.now() < giveUpAt) {
    const uploadedSoFar = (await page.$$(sel.uploadedImageItem)).length;
    if (uploadedSoFar < expectedCount) {
      await page.waitForTimeout(UPLOAD_POLL_INTERVAL_MS);
      continue;
    }
    return;
  }

  throw new Error(
    `Timed out waiting for ${String(expectedCount)} uploaded images after ${String(UPLOAD_WAIT_TIMEOUT_MS)}ms`,
  );
}
/**
 * Add hashtag / topic tags by typing them into the content editor.
 *
 * For every tag the content editor is focused (when it can be found),
 * `#<tag>` is typed to trigger the topic selector, and the first suggestion
 * item is clicked if one shows up within 3 seconds; otherwise Enter is
 * pressed to commit the raw text.
 *
 * Tags are normalised before typing: surrounding whitespace and any leading
 * `#` characters are removed, so a caller passing `"#travel"` does not end up
 * typing `##travel`. Tags that are empty after normalisation are skipped
 * instead of typing a bare `#`.
 *
 * @param page - Page that already shows the publish form.
 * @param tags - Tag texts, with or without a leading `#`.
 */
async function addTags(page: Page, tags: string[]): Promise<void> {
  for (const rawTag of tags) {
    const tag = rawTag.trim().replace(/^#+/, '').trim();
    if (tag.length === 0) {
      continue;
    }

    // Click the content editor to make sure the keystrokes land in it.
    const editor = await page.$(sel.contentEditor);
    if (editor) {
      await editor.click();
      await page.waitForTimeout(300);
    }

    // The `#` prefix is what opens the topic dropdown.
    await page.keyboard.type(`#${tag}`, { delay: 50 });
    await page.waitForTimeout(800);

    // Prefer the first suggestion; without a dropdown, commit with Enter.
    const suggestion = await page
      .waitForSelector(sel.tagSuggestionItem, { timeout: 3_000 })
      .catch(() => null);
    if (suggestion) {
      await suggestion.click();
    } else {
      await page.keyboard.press('Enter');
    }
    await page.waitForTimeout(FIELD_SETTLE_MS);
  }
}
/**
 * Tick the "original content" checkbox when it exists and is still unticked.
 *
 * A missing checkbox is logged at debug level and otherwise ignored; an
 * already-ticked checkbox is left alone.
 *
 * @param page - Page that already shows the publish form.
 */
async function setOriginal(page: Page): Promise<void> {
  const originalBox = await page.$(sel.originalCheckbox);
  if (!originalBox) {
    log.debug('Original checkbox not found, skipping');
    return;
  }

  if (await originalBox.isChecked()) {
    return;
  }

  await originalBox.click();
  await page.waitForTimeout(FIELD_SETTLE_MS);
}
/**
 * Pick a visibility option for the note being composed.
 *
 * Opens the visibility control, then clicks the option matching
 * `visibility`: `'private'` and `'friends'` map to their dedicated selectors,
 * any other value maps to the public option.
 *
 * NOTE(review): when the control or the option cannot be found this only logs
 * at debug level and returns; publishImageNote() then carries on and clicks
 * publish, so the requested visibility may not have been applied — confirm
 * that this best-effort behaviour is intended.
 *
 * @param page - Page that already shows the publish form.
 * @param visibility - Requested visibility keyword.
 */
async function setVisibility(page: Page, visibility: string): Promise<void> {
  const trigger = await page.$(sel.visibilityButton);
  if (trigger === null) {
    log.debug('Visibility button not found, skipping');
    return;
  }

  await trigger.click();
  await page.waitForTimeout(FIELD_SETTLE_MS);

  const targetSelector: string =
    visibility === 'private'
      ? sel.visibilityPrivate
      : visibility === 'friends'
        ? sel.visibilityFriends
        : sel.visibilityPublic;

  const choice = await page.$(targetSelector);
  if (choice === null) {
    log.debug({ visibility }, 'Visibility option not found');
    return;
  }

  await choice.click();
  await page.waitForTimeout(FIELD_SETTLE_MS);
}
/**
 * Open the schedule picker and enter the requested publish time.
 *
 * The schedule input is cleared, `scheduleAt` is typed into it verbatim and
 * Enter is pressed to confirm.
 *
 * NOTE(review): if the schedule button or input is missing this logs at debug
 * level and returns; publishImageNote() still clicks publish afterwards, so
 * the note may go out without the requested schedule — confirm this is
 * intended.
 *
 * @param page - Page that already shows the publish form.
 * @param scheduleAt - Text typed into the schedule input, unmodified.
 */
async function setSchedule(page: Page, scheduleAt: string): Promise<void> {
  const opener = await page.$(sel.scheduleButton);
  if (opener === null) {
    log.debug('Schedule button not found, skipping');
    return;
  }

  await opener.click();
  await page.waitForTimeout(FIELD_SETTLE_MS);

  const dateField = await page.$(sel.scheduleInput);
  if (dateField === null) {
    log.debug('Schedule input not found');
    return;
  }

  // Focus and clear the field first so the typed value replaces any default.
  await dateField.click();
  await dateField.fill('');
  await page.keyboard.type(scheduleAt, { delay: 30 });
  await page.keyboard.press('Enter');
  await page.waitForTimeout(FIELD_SETTLE_MS);
}
/**
 * Wait for the publish success indicator (URL change or success element).
 *
 * Both indicators are watched concurrently, each for up to 30 seconds. Either
 * one is sufficient, so the wait ends as soon as the first indicator turns
 * positive. Awaiting both with `Promise.all` would force every successful
 * publish that triggers only one indicator to sit out the other indicator's
 * full timeout.
 *
 * If neither indicator fires, the page HTML is searched for the success texts
 * as a final fallback.
 *
 * @param page - Page on which the publish button was just clicked.
 * @returns `{ success: false }` when no indicator was found; otherwise
 *   `{ success: true, noteId }`, where `noteId` is extracted from the current
 *   URL and may be `undefined`.
 */
async function waitForPublishResult(
  page: Page,
): Promise<{ success: boolean; noteId?: string }> {
  // Strategy 1: the URL changes to a success page.
  // Strategy 2: a success element appears.
  // Both watchers swallow their timeout and report `false`, so they never reject.
  const urlChanged = page
    .waitForURL(sel.publishSuccessUrlPattern, { timeout: 30_000 })
    .then(() => true)
    .catch(() => false);
  const successElementShown = page
    .waitForSelector(sel.publishSuccess, { timeout: 30_000 })
    .then(() => true)
    .catch(() => false);

  // Baseline pause so the button click has time to be processed.
  await page.waitForTimeout(PUBLISH_SETTLE_MS);

  // Resolve `true` on the first positive indicator, `false` once all are negative.
  const watchers = [urlChanged, successElementShown];
  const indicatorSeen = await new Promise<boolean>((resolve) => {
    let unresolved = watchers.length;
    for (const watcher of watchers) {
      void watcher.then((seen) => {
        unresolved -= 1;
        if (seen) {
          resolve(true);
        } else if (unresolved === 0) {
          resolve(false);
        }
      });
    }
  });

  if (indicatorSeen) {
    // The success element can show up before the redirect; give the URL a
    // short, bounded grace period so the noteId can still be read from it.
    await Promise.race([urlChanged, page.waitForTimeout(PUBLISH_SETTLE_MS)]);
  } else {
    // Final fallback: check whether the page content indicates success.
    const pageContent = await page.content();
    const hasSuccessText =
      pageContent.includes('发布成功') || pageContent.includes('已发布');
    if (!hasSuccessText) {
      log.warn('No success indicator found after publish');
      return { success: false };
    }
  }

  // Try to extract the note ID from the current URL if available.
  const noteId = extractNoteIdFromUrl(page.url());
  return { success: true, noteId };
}
/**
 * Try to read a note / post ID out of a URL.
 *
 * Two URL shapes are recognised, checked in this order:
 *   1. a non-empty `noteId` query parameter (e.g. `/publish/success?noteId=xxx`);
 *   2. a lowercase hexadecimal segment following `/note/` in the path.
 *
 * @param url - Absolute URL string to inspect.
 * @returns The ID, or `undefined` when the URL cannot be parsed or carries none.
 */
function extractNoteIdFromUrl(url: string): string | undefined {
  let location: URL;
  try {
    location = new URL(url);
  } catch {
    // URL parsing failed — not a problem, the noteId is optional.
    return undefined;
  }

  const queryId = location.searchParams.get('noteId');
  if (queryId !== null && queryId.length > 0) {
    return queryId;
  }

  const pathId = /\/note\/([a-f0-9]+)/.exec(location.pathname);
  return pathId === null ? undefined : pathId[1];
}
+590
View File
@@ -0,0 +1,590 @@
import { Router } from 'express';
import { z, ZodError } from 'zod';
import type { BrowserManager } from '../../browser/manager.js';
import { config } from '../../config/index.js';
import { logger } from '../../utils/logger.js';
import { classifyError, sanitizeErrorMessage } from '../../utils/errors.js';
import { validateMediaPath } from '../../utils/downloader.js';
import { rateLimiter } from '../../server/middleware.js';
import { checkLoginStatus, getLoginQRCode, deleteCookies } from './login.js';
import { listFeeds } from './feeds.js';
import { searchFeeds } from './search.js';
import { getFeedDetail } from './feed-detail.js';
import { getUserProfile } from './user-profile.js';
import { publishImageNote } from './publish.js';
import { publishVideoNote } from './publish-video.js';
import { postComment, replyComment } from './comment.js';
import { toggleLike, toggleFavorite } from './interaction.js';
import {
SearchSchema,
GetFeedDetailSchema,
GetUserProfileSchema,
PublishImageSchema,
PublishVideoSchema,
PostCommentSchema,
ReplyCommentSchema,
LikeSchema,
FavoriteSchema,
} from './schemas.js';
import type { SearchFilters } from './types.js';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Platform key passed to BrowserManager.withPage() by every route below. */
const PLATFORM = 'xiaohongshu';
/** Maximum file size for video uploads (500 MB). */
const VIDEO_MAX_SIZE_MB = 500;
/** Maximum file size for image uploads (20 MB). */
const IMAGE_MAX_SIZE_MB = 20;
/** Child logger whose entries are tagged with module 'xhs-routes'. */
const log = logger.child({ module: 'xhs-routes' });
// ---------------------------------------------------------------------------
// Zod schemas for REST body validation
//
// The MCP schemas in schemas.ts are "shape" objects (plain objects with zod
// fields). For REST validation we wrap them in z.object() where needed.
// ---------------------------------------------------------------------------
/** Request body for POST /search. */
const SearchBodySchema = z.object({
keyword: SearchSchema.keyword,
filters: SearchSchema.filters,
});
/** Request body for POST /feeds/detail. */
const FeedDetailBodySchema = z.object({
feed_id: GetFeedDetailSchema.feed_id,
xsec_token: GetFeedDetailSchema.xsec_token,
load_all_comments: GetFeedDetailSchema.load_all_comments,
});
/** Request body for POST /user/profile. */
const UserProfileBodySchema = z.object({
user_id: GetUserProfileSchema.user_id,
xsec_token: GetUserProfileSchema.xsec_token,
});
/** Request body for POST /publish/image. */
const PublishImageBodySchema = z.object({
title: PublishImageSchema.title,
content: PublishImageSchema.content,
images: PublishImageSchema.images,
tags: PublishImageSchema.tags,
schedule_at: PublishImageSchema.schedule_at,
is_original: PublishImageSchema.is_original,
visibility: PublishImageSchema.visibility,
});
/** Request body for POST /publish/video. */
const PublishVideoBodySchema = z.object({
title: PublishVideoSchema.title,
content: PublishVideoSchema.content,
video: PublishVideoSchema.video,
tags: PublishVideoSchema.tags,
schedule_at: PublishVideoSchema.schedule_at,
visibility: PublishVideoSchema.visibility,
});
/** Request body for POST /comment. */
const PostCommentBodySchema = z.object({
feed_id: PostCommentSchema.feed_id,
xsec_token: PostCommentSchema.xsec_token,
content: PostCommentSchema.content,
});
/** Request body for POST /comment/reply. */
const ReplyCommentBodySchema = z.object({
feed_id: ReplyCommentSchema.feed_id,
xsec_token: ReplyCommentSchema.xsec_token,
content: ReplyCommentSchema.content,
comment_id: ReplyCommentSchema.comment_id,
user_id: ReplyCommentSchema.user_id,
});
/** Request body for POST /like. */
const LikeBodySchema = z.object({
feed_id: LikeSchema.feed_id,
xsec_token: LikeSchema.xsec_token,
unlike: LikeSchema.unlike,
});
/** Request body for POST /favorite. */
const FavoriteBodySchema = z.object({
feed_id: FavoriteSchema.feed_id,
xsec_token: FavoriteSchema.xsec_token,
unfavorite: FavoriteSchema.unfavorite,
});
// ---------------------------------------------------------------------------
// Standard JSON response helpers
// ---------------------------------------------------------------------------
/** Envelope returned by a route that completed successfully. */
interface ApiSuccessResponse<T> {
  success: true;
  data: T;
}

/** Envelope returned by a route that failed. */
interface ApiErrorResponse {
  success: false;
  error: {
    code: string;
    message: string;
  };
}

/** Either envelope; `success` is the discriminant. */
type ApiResponse<T> = ApiSuccessResponse<T> | ApiErrorResponse;

/**
 * Wrap a payload in the success envelope.
 *
 * @param data - Payload to expose under `data`.
 * @returns `{ success: true, data }`.
 */
function successResponse<T>(data: T): ApiSuccessResponse<T> {
  const envelope: ApiSuccessResponse<T> = { success: true, data };
  return envelope;
}

/**
 * Build the error envelope.
 *
 * @param code - Machine-readable error code.
 * @param message - Human-readable description.
 * @returns `{ success: false, error: { code, message } }`.
 */
function errorResponse(code: string, message: string): ApiErrorResponse {
  const error = { code, message };
  return { success: false, error };
}
// ---------------------------------------------------------------------------
// Rate limiters
// ---------------------------------------------------------------------------
/**
 * Limiter attached to the read-style routes (login status, QR code, feeds,
 * search, feed detail, user profile).
 * NOTE(review): presumably 60 requests per 60-second window — confirm
 * against the rateLimiter() implementation.
 */
const readRateLimiter = rateLimiter({ windowMs: 60_000, maxRequests: 60 });
/**
 * Limiter attached to the state-changing routes (cookie deletion, publishing,
 * comments, likes, favorites).
 * NOTE(review): presumably 10 requests per 60-second window — confirm
 * against the rateLimiter() implementation.
 */
const writeRateLimiter = rateLimiter({ windowMs: 60_000, maxRequests: 10 });
// ---------------------------------------------------------------------------
// Route factory
// ---------------------------------------------------------------------------
/**
 * Look up the timeout (in milliseconds) for a named operation.
 *
 * Resolution order: `config.operationTimeouts[operation]`, then
 * `config.operationTimeouts['default']`, then `fallbackMs`.
 *
 * @param operation - Key into `config.operationTimeouts`.
 * @param fallbackMs - Value used when neither config entry is set.
 */
function resolveOperationTimeout(
  operation: string,
  fallbackMs: number,
): number {
  return (
    config.operationTimeouts[operation] ??
    config.operationTimeouts['default'] ??
    fallbackMs
  );
}

/**
 * Turn an async action into an Express handler.
 *
 * The action's resolved value is sent as a success envelope. Anything thrown,
 * including body validation errors raised inside the action, is routed to
 * handleError(). The promise is deliberately detached with `void`, so
 * Express never sees a rejected handler.
 *
 * @param action - Produces the payload for the response.
 */
function jsonRoute<T>(
  action: (req: import('express').Request) => Promise<T>,
): import('express').RequestHandler {
  return (req, res) => {
    void (async () => {
      try {
        const body: ApiResponse<T> = successResponse(await action(req));
        res.json(body);
      } catch (err) {
        handleError(res, err);
      }
    })();
  };
}

/**
 * Create and return an Express Router with all Xiaohongshu REST API routes.
 *
 * Every handler calls the SAME action functions used by the MCP tools so
 * that business logic is never duplicated. Request bodies are validated with
 * the zod body schemas before a browser page is requested, and each
 * page-based operation is given a timeout from resolveOperationTimeout().
 *
 * @param browser - BrowserManager used to run page-based operations.
 * @returns Router exposing the login, browsing, publishing and interaction
 *   endpoints.
 */
export function createXhsRoutes(browser: BrowserManager): Router {
  const router = Router();

  // =========================================================================
  // Login management
  // =========================================================================

  // GET /login/status
  router.get(
    '/login/status',
    readRateLimiter,
    jsonRoute(async () =>
      browser.withPage(
        PLATFORM,
        async (page) => checkLoginStatus(page),
        resolveOperationTimeout('login', 60_000),
      ),
    ),
  );

  // GET /login/qrcode — getLoginQRCode() is handed the manager itself.
  router.get(
    '/login/qrcode',
    readRateLimiter,
    jsonRoute(async () => getLoginQRCode(browser)),
  );

  // DELETE /login/cookies
  router.delete(
    '/login/cookies',
    writeRateLimiter,
    jsonRoute(async () => {
      await deleteCookies(browser);
      return { message: 'Cookies deleted' };
    }),
  );

  // =========================================================================
  // Content browsing
  // =========================================================================

  // GET /feeds
  router.get(
    '/feeds',
    readRateLimiter,
    jsonRoute(async () =>
      browser.withPage(
        PLATFORM,
        async (page) => listFeeds(page),
        resolveOperationTimeout('feed_list', 60_000),
      ),
    ),
  );

  // POST /search
  router.post(
    '/search',
    readRateLimiter,
    jsonRoute(async (req) => {
      const body = SearchBodySchema.parse(req.body);
      const filters: SearchFilters | undefined = body.filters
        ? {
            sort: body.filters.sort,
            type: body.filters.type,
            time: body.filters.time,
          }
        : undefined;
      return browser.withPage(
        PLATFORM,
        async (page) => searchFeeds(page, body.keyword, filters),
        resolveOperationTimeout('search', 60_000),
      );
    }),
  );

  // POST /feeds/detail
  router.post(
    '/feeds/detail',
    readRateLimiter,
    jsonRoute(async (req) => {
      const body = FeedDetailBodySchema.parse(req.body);
      return browser.withPage(
        PLATFORM,
        async (page) =>
          getFeedDetail(
            page,
            body.feed_id,
            body.xsec_token,
            body.load_all_comments,
          ),
        resolveOperationTimeout('feed_detail', 60_000),
      );
    }),
  );

  // POST /user/profile
  router.post(
    '/user/profile',
    readRateLimiter,
    jsonRoute(async (req) => {
      const body = UserProfileBodySchema.parse(req.body);
      return browser.withPage(
        PLATFORM,
        async (page) => getUserProfile(page, body.user_id, body.xsec_token),
        resolveOperationTimeout('user_profile', 60_000),
      );
    }),
  );

  // =========================================================================
  // Content publishing
  // =========================================================================

  // POST /publish/image
  router.post(
    '/publish/image',
    writeRateLimiter,
    jsonRoute(async (req) => {
      const body = PublishImageBodySchema.parse(req.body);

      // Validate all image paths before acquiring a browser page. Done one
      // at a time so the first invalid path aborts the request.
      const validatedPaths: string[] = [];
      for (const imagePath of body.images) {
        const resolved = await validateMediaPath(imagePath, {
          maxSizeMB: IMAGE_MAX_SIZE_MB,
        });
        validatedPaths.push(resolved);
      }

      return browser.withPage(
        PLATFORM,
        async (page) =>
          publishImageNote(page, body.title, body.content, validatedPaths, {
            tags: body.tags,
            scheduleAt: body.schedule_at,
            isOriginal: body.is_original,
            visibility: body.visibility,
          }),
        resolveOperationTimeout('publish', 300_000),
      );
    }),
  );

  // POST /publish/video
  router.post(
    '/publish/video',
    writeRateLimiter,
    jsonRoute(async (req) => {
      const body = PublishVideoBodySchema.parse(req.body);

      // Validate the video path before acquiring a browser page.
      const validatedPath = await validateMediaPath(body.video, {
        maxSizeMB: VIDEO_MAX_SIZE_MB,
      });

      return browser.withPage(
        PLATFORM,
        async (page) =>
          publishVideoNote(page, body.title, body.content, validatedPath, {
            tags: body.tags,
            scheduleAt: body.schedule_at,
            visibility: body.visibility,
          }),
        resolveOperationTimeout('publish', 300_000),
      );
    }),
  );

  // =========================================================================
  // Interactions
  // =========================================================================

  // POST /comment
  router.post(
    '/comment',
    writeRateLimiter,
    jsonRoute(async (req) => {
      const body = PostCommentBodySchema.parse(req.body);
      return browser.withPage(
        PLATFORM,
        async (page) =>
          postComment(page, body.feed_id, body.xsec_token, body.content),
        resolveOperationTimeout('comment', 20_000),
      );
    }),
  );

  // POST /comment/reply
  router.post(
    '/comment/reply',
    writeRateLimiter,
    jsonRoute(async (req) => {
      const body = ReplyCommentBodySchema.parse(req.body);
      return browser.withPage(
        PLATFORM,
        async (page) =>
          replyComment(
            page,
            body.feed_id,
            body.xsec_token,
            body.content,
            body.comment_id,
            body.user_id,
          ),
        resolveOperationTimeout('reply', 20_000),
      );
    }),
  );

  // POST /like
  router.post(
    '/like',
    writeRateLimiter,
    jsonRoute(async (req) => {
      const body = LikeBodySchema.parse(req.body);
      return browser.withPage(
        PLATFORM,
        async (page) =>
          toggleLike(page, body.feed_id, body.xsec_token, body.unlike),
        resolveOperationTimeout('like', 15_000),
      );
    }),
  );

  // POST /favorite
  router.post(
    '/favorite',
    writeRateLimiter,
    jsonRoute(async (req) => {
      const body = FavoriteBodySchema.parse(req.body);
      return browser.withPage(
        PLATFORM,
        async (page) =>
          toggleFavorite(
            page,
            body.feed_id,
            body.xsec_token,
            body.unfavorite,
          ),
        resolveOperationTimeout('favorite', 15_000),
      );
    }),
  );

  return router;
}
// ---------------------------------------------------------------------------
// Error handling helper
// ---------------------------------------------------------------------------
/** HTTP status per error category; categories not listed here map to 500. */
const STATUS_BY_ERROR_CATEGORY: ReadonlyMap<string, number> = new Map([
  ['AUTH_REQUIRED', 401],
  ['TIMEOUT', 504],
  ['NETWORK', 502],
]);

/**
 * Unified error handler for REST route handlers.
 *
 * - ZodError        -> 400 with code VALIDATION_ERROR and one
 *                      `path: message` entry per issue, joined by "; "
 * - Everything else -> logged, classified via classifyError(), and answered
 *                      with 401 / 504 / 502 for AUTH_REQUIRED / TIMEOUT /
 *                      NETWORK, or 500 for any other category
 *
 * The message sent to the client is passed through sanitizeErrorMessage();
 * the full error object goes to the log only.
 *
 * @param res - Express response to write to.
 * @param err - Whatever was thrown; non-Error values are wrapped in an Error.
 */
function handleError(res: import('express').Response, err: unknown): void {
  if (err instanceof ZodError) {
    const details: string[] = [];
    for (const issue of err.issues) {
      details.push(`${issue.path.join('.')}: ${issue.message}`);
    }
    res
      .status(400)
      .json(errorResponse('VALIDATION_ERROR', details.join('; ')));
    return;
  }

  const error = err instanceof Error ? err : new Error(String(err));
  const category = classifyError(error);
  const message = sanitizeErrorMessage(error.message);
  log.error({ err: error, category }, 'REST API handler error');

  const statusCode = STATUS_BY_ERROR_CATEGORY.get(category) ?? 500;
  res.status(statusCode).json(errorResponse(category, message));
}
+148
View File
@@ -0,0 +1,148 @@
import { z } from 'zod';
// ---------------------------------------------------------------------------
// MCP tool parameter schemas for all 13 Xiaohongshu tools.
//
// Phase 2 tools (login) have no parameters — their schemas are empty objects.
// Phase 3/4 schemas are defined here so that the full tool surface is
// established upfront and types can be inferred with z.infer<>.
// ---------------------------------------------------------------------------
// -- Phase 2: Login management (3 tools) -----------------------------------
/** Parameter shape for xhs_check_login — the tool takes no parameters. */
export const CheckLoginSchema = {};
/** Parameter shape for xhs_get_login_qrcode — the tool takes no parameters. */
export const GetLoginQRCodeSchema = {};
/** Parameter shape for xhs_delete_cookies — the tool takes no parameters. */
export const DeleteCookiesSchema = {};
// -- Phase 3: Content browsing (4 tools) -----------------------------------
/** Parameter shape for xhs_list_feeds — the tool takes no parameters. */
export const ListFeedsSchema = {};
/** xhs_search */
export const SearchSchema = {
keyword: z.string().describe('Search keyword'),
filters: z
.object({
sort: z
.enum(['general', 'time_descending', 'popularity_descending'])
.optional()
.describe('Sort order'),
type: z
.enum(['all', 'note', 'video'])
.optional()
.describe('Content type filter'),
time: z
.enum(['all', 'day', 'week', 'half_year'])
.optional()
.describe('Time range filter'),
})
.optional()
.describe('Optional search filters'),
};
/**
 * xhs_get_feed_detail — parameters for fetching one note's detail.
 *
 * `feed_id` and `xsec_token` are required strings (no length or format
 * check). `load_all_comments` may be omitted; `.default(false)` makes the
 * parsed value `false` in that case.
 */
export const GetFeedDetailSchema = {
feed_id: z.string().describe('Feed (note) ID'),
xsec_token: z.string().describe('Security token for the feed'),
load_all_comments: z
.boolean()
.optional()
.default(false)
.describe('Whether to scroll and load all comments'),
};
/**
 * xhs_get_user_profile — parameters for fetching a user profile.
 *
 * Both fields are required strings; neither is checked for length or format.
 */
export const GetUserProfileSchema = {
user_id: z.string().describe('User ID'),
xsec_token: z.string().describe('Security token for the user page'),
};
// -- Phase 4: Content publishing (2 tools) ---------------------------------
/**
 * xhs_publish_image — parameters for publishing an image note.
 *
 * - `title` must be at least one character; `content` may be empty.
 * - `images` needs at least one entry; each entry is an unconstrained string.
 * - `schedule_at` is only checked to be a string. The ISO 8601 format named
 *   in its description is not validated by this shape.
 * - `is_original` and `visibility` fall back to `false` and `'public'` when
 *   omitted.
 */
export const PublishImageSchema = {
title: z.string().min(1).describe('Note title'),
content: z.string().describe('Note body text'),
images: z
.array(z.string())
.min(1)
.describe('Array of image file paths or URLs'),
tags: z.array(z.string()).optional().describe('Hashtags to attach'),
schedule_at: z
.string()
.optional()
.describe('ISO 8601 datetime for scheduled publishing'),
is_original: z
.boolean()
.optional()
.default(false)
.describe('Mark as original content'),
visibility: z
.enum(['public', 'private', 'friends'])
.optional()
.default('public')
.describe('Visibility setting'),
};
/**
 * xhs_publish_video — parameters for publishing a video note.
 *
 * - `title` must be at least one character; `content` may be empty.
 * - `video` is a single required string with no further validation.
 * - `schedule_at` is only checked to be a string. The ISO 8601 format named
 *   in its description is not validated by this shape.
 * - `visibility` falls back to `'public'` when omitted.
 *
 * Unlike the image schema, this shape has no `is_original` field.
 */
export const PublishVideoSchema = {
title: z.string().min(1).describe('Note title'),
content: z.string().describe('Note body text'),
video: z.string().describe('Video file path or URL'),
tags: z.array(z.string()).optional().describe('Hashtags to attach'),
schedule_at: z
.string()
.optional()
.describe('ISO 8601 datetime for scheduled publishing'),
visibility: z
.enum(['public', 'private', 'friends'])
.optional()
.default('public')
.describe('Visibility setting'),
};
// -- Phase 4: Interactions (4 tools) ---------------------------------------
/**
 * xhs_post_comment — parameters for posting a top-level comment.
 *
 * All three fields are required; `content` must be at least one character.
 */
export const PostCommentSchema = {
feed_id: z.string().describe('Feed ID to comment on'),
xsec_token: z.string().describe('Security token for the feed'),
content: z.string().min(1).describe('Comment text'),
};
/**
 * xhs_reply_comment — parameters for replying to an existing comment.
 *
 * `feed_id`, `xsec_token` and `content` are required; `content` must be at
 * least one character. `comment_id` and `user_id` are both optional, and
 * nothing in this shape requires that at least one of them be supplied.
 * NOTE(review): presumably the handler uses whichever is present to locate
 * the target comment — confirm and validate there.
 */
export const ReplyCommentSchema = {
feed_id: z.string().describe('Feed ID'),
xsec_token: z.string().describe('Security token for the feed'),
comment_id: z.string().optional().describe('Comment ID to reply to'),
user_id: z.string().optional().describe('User ID of the comment author'),
content: z.string().min(1).describe('Reply text'),
};
/**
 * xhs_like — parameters for liking or unliking a note.
 *
 * `unlike` may be omitted; `.default(false)` makes the parsed value `false`
 * in that case.
 */
export const LikeSchema = {
feed_id: z.string().describe('Feed ID to like'),
xsec_token: z.string().describe('Security token for the feed'),
unlike: z
.boolean()
.optional()
.default(false)
.describe('Set to true to unlike'),
};
/**
 * xhs_favorite — parameters for favoriting or unfavoriting a note.
 *
 * `unfavorite` may be omitted; `.default(false)` makes the parsed value
 * `false` in that case.
 */
export const FavoriteSchema = {
feed_id: z.string().describe('Feed ID to favorite'),
xsec_token: z.string().describe('Security token for the feed'),
unfavorite: z
.boolean()
.optional()
.default(false)
.describe('Set to true to unfavorite'),
};
+387
View File
@@ -0,0 +1,387 @@
import type { Page } from 'rebrowser-playwright';
import { logger } from '../../utils/logger.js';
import { extractInitialState, parseCountString, ensureHttps } from './feeds.js';
import type { Feed, SearchFilters } from './types.js';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Search results page URL; buildSearchUrl appends the query string to it. */
const SEARCH_BASE_URL = 'https://www.xiaohongshu.com/search_result';
/** Child logger that tags every entry with this module's name. */
const log = logger.child({ module: 'xhs-search' });
// ---------------------------------------------------------------------------
// Sort value mapping
// ---------------------------------------------------------------------------
/**
 * Map our public sort enum values to the URL query parameter values.
 * NOTE(review): the numeric codes are assumed to be what the site expects —
 * confirm against the live search page.
 */
const SORT_PARAM: Record<string, string> = {
general: '0',
time_descending: '1',
popularity_descending: '2',
};
/**
 * Map our note type filter values to the URL query parameter values.
 * Every value is a non-empty string, so the truthiness check in
 * buildSearchUrl treats '0' as present.
 */
const TYPE_PARAM: Record<string, string> = {
all: '0',
note: '1',
video: '2',
};
/**
 * Map time range filter values to URL query parameter values.
 * Keys match the `time` literals of the SearchFilters interface.
 */
const TIME_PARAM: Record<string, string> = {
all: '0',
day: '1',
week: '2',
half_year: '3',
};
// ---------------------------------------------------------------------------
// __INITIAL_STATE__ raw types for search results
// ---------------------------------------------------------------------------
/**
 * Loosely-typed shape of one search result entry read from
 * `__INITIAL_STATE__`.
 *
 * Every field is optional and most appear under several spellings
 * (camelCase and snake_case) because parseRawSearchItem probes each
 * alternative in turn. Data may sit on the entry itself or on the nested
 * `noteCard`.
 */
interface RawSearchFeedItem {
id?: string;
noteId?: string;
note_id?: string;
xsecToken?: string;
xsec_token?: string;
displayTitle?: string;
display_title?: string;
title?: string;
name?: string;
desc?: string;
description?: string;
noteCard?: RawSearchNoteCard;
type?: string;
model_type?: string;
cover?: RawSearchImage;
user?: RawSearchUser;
interactInfo?: RawSearchInteractInfo;
interact_info?: RawSearchInteractInfo;
likedCount?: string;
liked_count?: string;
}
/**
 * Nested note card that may accompany a search result entry.
 * parseRawSearchItem consults it only after the matching top-level fields
 * of RawSearchFeedItem.
 */
interface RawSearchNoteCard {
noteId?: string;
displayTitle?: string;
display_title?: string;
title?: string;
desc?: string;
type?: string;
cover?: RawSearchImage;
user?: RawSearchUser;
interactInfo?: RawSearchInteractInfo;
interact_info?: RawSearchInteractInfo;
xsecToken?: string;
xsec_token?: string;
}
/**
 * Raw cover image object. extractSearchImageUrl returns the first non-empty
 * direct URL field, then falls back to the first entry of the info list.
 */
interface RawSearchImage {
url?: string;
urlPre?: string;
urlDefault?: string;
url_pre?: string;
url_default?: string;
infoList?: Array<{ url?: string }>;
info_list?: Array<{ url?: string }>;
}
/** Raw author object of a search result, with alternative key spellings. */
interface RawSearchUser {
userId?: string;
user_id?: string;
nickname?: string;
nick_name?: string;
nickName?: string;
avatar?: string;
avatarUrl?: string;
avatar_url?: string;
}
/**
 * Raw interaction counters of a search result. Only the like count is read
 * here; it is a string that parseRawSearchItem passes to parseCountString.
 */
interface RawSearchInteractInfo {
likedCount?: string;
liked_count?: string;
likeCount?: string;
like_count?: string;
}
/**
 * The parts of `__INITIAL_STATE__` that may hold search results.
 *
 * The named members are the locations parseSearchFeedsFromState checks
 * first. The index signature lets it also walk any other top-level key.
 */
interface SearchInitialState {
searchNotes?: {
feeds?: RawSearchFeedItem[];
};
searchResult?: {
notes?: RawSearchFeedItem[];
feeds?: RawSearchFeedItem[];
};
search?: {
feeds?: RawSearchFeedItem[];
notes?: RawSearchFeedItem[];
};
[key: string]: unknown;
}
// ---------------------------------------------------------------------------
// searchFeeds
// ---------------------------------------------------------------------------
/**
 * Run a keyword search on Xiaohongshu and return the matching notes.
 *
 * Loads the search results page, pauses briefly so it can render, then reads
 * the results from the page's `__INITIAL_STATE__`. When that state is
 * missing or yields no feeds, the rendered DOM is scraped instead.
 *
 * @param page - A Playwright Page managed by BrowserManager.
 * @param keyword - The search term.
 * @param filters - Optional sorting, type, and time range filters.
 * @returns The feeds found; empty when neither strategy finds any.
 */
export async function searchFeeds(
  page: Page,
  keyword: string,
  filters?: SearchFilters,
): Promise<Feed[]> {
  const searchUrl = buildSearchUrl(keyword, filters);
  log.debug({ keyword, filters, url: searchUrl }, 'Navigating to search page');
  await page.goto(searchUrl, { waitUntil: 'domcontentloaded' });

  // Fixed pause that gives client-side rendering time to populate results.
  await page.waitForTimeout(2000);

  // Preferred source: the serialised page state.
  const pageState = (await extractInitialState(page)) as SearchInitialState | null;
  const stateFeeds = pageState ? parseSearchFeedsFromState(pageState) : [];
  if (stateFeeds.length > 0) {
    log.info(
      { keyword, count: stateFeeds.length },
      'Extracted search results from __INITIAL_STATE__',
    );
    return stateFeeds;
  }
  if (pageState) {
    log.debug('__INITIAL_STATE__ found but no search feeds extracted, falling back to DOM');
  }

  // Secondary source: the rendered result cards.
  log.debug('Falling back to DOM scraping for search results');
  const domFeeds = await scrapeSearchResultsFromDom(page);
  log.info({ keyword, count: domFeeds.length }, 'Extracted search results from DOM');
  return domFeeds;
}
// ---------------------------------------------------------------------------
// URL construction
// ---------------------------------------------------------------------------
/**
 * Assemble the search page URL for a keyword and optional filters.
 *
 * `keyword` is always present in the query string. Each filter is added only
 * when it is set and has an entry in its lookup table.
 */
function buildSearchUrl(keyword: string, filters?: SearchFilters): string {
  const query = new URLSearchParams({ keyword });

  // Query parameter name paired with its translated code (if any).
  const translated: Array<[string, string | undefined]> = [
    ['sort', filters?.sort ? SORT_PARAM[filters.sort] : undefined],
    ['type', filters?.type ? TYPE_PARAM[filters.type] : undefined],
    ['time', filters?.time ? TIME_PARAM[filters.time] : undefined],
  ];
  for (const [name, code] of translated) {
    if (code) {
      query.set(name, code);
    }
  }

  return `${SEARCH_BASE_URL}?${query.toString()}`;
}
// ---------------------------------------------------------------------------
// __INITIAL_STATE__ parsing for search results
// ---------------------------------------------------------------------------
/**
 * Parse search results from the `__INITIAL_STATE__` data.
 *
 * Candidate arrays are tried in priority order: first the known locations,
 * then any `feeds` / `notes` / `items` array found one level below a
 * top-level key. The first candidate that yields at least one valid feed
 * wins.
 *
 * The state comes from the page and is only cast to `SearchInitialState`,
 * so every candidate is checked at runtime. A value that is not an array,
 * or an entry that is not an object, is skipped rather than allowed to
 * throw, because a throw here would stop the caller from reaching its DOM
 * fallback.
 *
 * @param state - The page state, as returned by extractInitialState.
 * @returns The parsed feeds, or an empty array when none were found.
 */
function parseSearchFeedsFromState(state: SearchInitialState): Feed[] {
  // Convert one candidate value into feeds; malformed input yields [].
  const toFeeds = (candidate: unknown): Feed[] => {
    if (!Array.isArray(candidate)) return [];
    const feeds: Feed[] = [];
    for (const item of candidate) {
      if (item === null || typeof item !== 'object') continue;
      const feed = parseRawSearchItem(item as RawSearchFeedItem);
      if (feed !== null) feeds.push(feed);
    }
    return feeds;
  };

  // Known locations, most specific first.
  const knownLocations: unknown[] = [
    state.searchNotes?.feeds,
    state.searchResult?.notes,
    state.searchResult?.feeds,
    state.search?.feeds,
    state.search?.notes,
  ];
  for (const candidate of knownLocations) {
    const feeds = toFeeds(candidate);
    if (feeds.length > 0) return feeds;
  }

  // Generic walk: look one level down for an array that resembles feeds.
  const fallbackKeys = ['feeds', 'notes', 'items'];
  for (const value of Object.values(state)) {
    if (value === null || typeof value !== 'object' || Array.isArray(value)) {
      continue;
    }
    const container = value as Record<string, unknown>;
    for (const key of fallbackKeys) {
      const feeds = toFeeds(container[key]);
      if (feeds.length > 0) return feeds;
    }
  }

  return [];
}
/**
 * Convert a single raw search result item into a structured Feed.
 *
 * Each field is looked up under its alternative spellings, first on the item
 * itself and then on its nested `noteCard`. Missing text fields become `''`
 * and a missing like count is parsed from `'0'`.
 *
 * @param raw - One entry of the raw search results.
 * @returns The Feed, or `null` when the entry carries no id.
 */
function parseRawSearchItem(raw: RawSearchFeedItem): Feed | null {
  const card = raw.noteCard;

  const id = raw.id ?? raw.noteId ?? raw.note_id ?? card?.noteId ?? '';
  // An entry without an id cannot be used, so stop before doing more work.
  if (!id) return null;

  const xsecToken =
    raw.xsecToken ?? raw.xsec_token ?? card?.xsecToken ?? card?.xsec_token ?? '';
  const title =
    raw.displayTitle ?? raw.display_title ?? raw.title ?? raw.name ??
    card?.displayTitle ?? card?.display_title ?? card?.title ?? '';
  const description = raw.desc ?? raw.description ?? card?.desc ?? '';

  // Check every type field, not just the first one that is defined, so a
  // value on the entry itself cannot hide a `video` type on the note card.
  // The typeof guard protects against non-string values in page data.
  const isVideo = [raw.type, raw.model_type, card?.type].some(
    (value) => typeof value === 'string' && value.toLowerCase().includes('video'),
  );
  const type: 'normal' | 'video' = isVideo ? 'video' : 'normal';

  const coverUrl = extractSearchImageUrl(raw.cover ?? card?.cover);

  const rawUser = raw.user ?? card?.user;
  const userId = rawUser?.userId ?? rawUser?.user_id ?? '';
  const nickname =
    rawUser?.nickname ?? rawUser?.nick_name ?? rawUser?.nickName ?? '';
  const avatar =
    rawUser?.avatar ?? rawUser?.avatarUrl ?? rawUser?.avatar_url ?? '';

  const interactInfo =
    raw.interactInfo ?? raw.interact_info ?? card?.interactInfo ?? card?.interact_info;
  const likeCountStr =
    interactInfo?.likedCount ?? interactInfo?.liked_count ??
    interactInfo?.likeCount ?? interactInfo?.like_count ??
    raw.likedCount ?? raw.liked_count ?? '0';
  const likeCount = parseCountString(likeCountStr);

  return {
    id,
    xsecToken,
    title,
    description,
    type,
    coverUrl,
    likeCount,
    user: { id: userId, nickname, avatar },
  };
}
/**
 * Pick the image URL out of a raw search cover object.
 *
 * The direct URL fields are tried in a fixed order and the first non-empty
 * one wins. Failing that, the URL of the first info-list entry is used. The
 * chosen URL is passed through ensureHttps.
 *
 * @returns The URL, or `''` when the cover is absent or carries no URL.
 */
function extractSearchImageUrl(raw: RawSearchImage | undefined): string {
  if (!raw) return '';

  const directUrl = [
    raw.url,
    raw.urlPre,
    raw.urlDefault,
    raw.url_pre,
    raw.url_default,
  ].find((candidate) => Boolean(candidate));
  if (directUrl) return ensureHttps(directUrl);

  const firstInfoUrl = (raw.infoList ?? raw.info_list)?.[0]?.url;
  return firstInfoUrl ? ensureHttps(firstInfoUrl) : '';
}
// ---------------------------------------------------------------------------
// DOM scraping fallback — uses Playwright Node-side API exclusively
// ---------------------------------------------------------------------------
/**
 * Scrape search results from the rendered result cards, using Playwright's
 * Node-side API to avoid needing DOM lib types.
 *
 * Waits up to ten seconds for the first card, then reads each card's link,
 * cover, title, author and like count. Every read is best-effort: a missing
 * element yields an empty value, and a card without a recognisable note id
 * is skipped.
 *
 * @param page - The page currently showing search results.
 * @returns One Feed per usable card; `description` is always `''` because
 *   the cards do not expose it.
 */
async function scrapeSearchResultsFromDom(page: Page): Promise<Feed[]> {
  const noteItemSelector = '.feeds-container .note-item';

  // Compiled once rather than once per card.
  // NOTE(review): search cards are believed to link via /search_result/<id>
  // while explore cards use /explore/<id>; both are accepted — confirm
  // against live markup.
  const noteIdPattern = /\/(?:explore|search_result)\/([a-f0-9]+)/;
  const tokenPattern = /xsec_token=([^&]+)/;
  const authorIdPattern = /\/user\/profile\/([a-f0-9]+)/;

  // Wait for the search result note items to appear; a timeout is tolerated.
  await page
    .waitForSelector(noteItemSelector, { timeout: 10_000 })
    .catch(() => null);

  const cardElements = await page.$$(noteItemSelector);
  type Card = (typeof cardElements)[number];

  // Read an attribute from the first match inside a card ('' when absent).
  const readAttribute = (card: Card, selector: string, attribute: string): Promise<string> =>
    card
      .$eval(selector, (el, name) => el.getAttribute(name) ?? '', attribute)
      .catch(() => '');

  // Read trimmed text from the first match inside a card.
  const readText = (card: Card, selector: string, fallback = ''): Promise<string> =>
    card
      .$eval(selector, (el, fb) => el.textContent?.trim() ?? fb, fallback)
      .catch(() => fallback);

  // The token in an href is percent-encoded. Callers encode it again when
  // building URLs, so hand back the decoded form, as the state-based parser
  // does. Malformed escapes leave the value untouched.
  const decodeToken = (encoded: string): string => {
    try {
      return decodeURIComponent(encoded);
    } catch {
      return encoded;
    }
  };

  const feeds: Feed[] = [];
  for (const card of cardElements) {
    try {
      const href = await readAttribute(card, 'a.cover', 'href');
      const id = href.match(noteIdPattern)?.[1] ?? '';
      if (!id) continue;
      const xsecToken = decodeToken(href.match(tokenPattern)?.[1] ?? '');

      const coverUrl = await readAttribute(card, 'a.cover img', 'src');
      const title = await readText(card, '.footer .title');
      const nickname = await readText(card, '.footer .author-wrapper .name');
      const avatar = await readAttribute(
        card,
        '.footer .author-wrapper .author-head img',
        'src',
      );
      const authorHref = await readAttribute(card, '.footer .author-wrapper a', 'href');
      const userId = authorHref.match(authorIdPattern)?.[1] ?? '';

      const likeText = await readText(card, '.footer .like-wrapper .count', '0');
      const likeCount = parseCountString(likeText);

      const hasVideoIcon = await card
        .$('.play-icon')
        .then((el) => el !== null)
        .catch(() => false);

      feeds.push({
        id,
        xsecToken,
        title,
        description: '',
        type: hasVideoIcon ? 'video' : 'normal',
        coverUrl,
        likeCount,
        user: { id: userId, nickname, avatar },
      });
    } catch (err) {
      // Still best-effort, but leave a trace instead of failing silently.
      log.debug({ err }, 'Skipping search result card that could not be scraped');
    }
  }
  return feeds;
}
+203
View File
@@ -0,0 +1,203 @@
// ---------------------------------------------------------------------------
// CSS Selectors — centralised so that UI changes only require edits here.
// ---------------------------------------------------------------------------
/**
 * Selector catalogue for Xiaohongshu pages, grouped by page or feature.
 *
 * Every value is a selector string except `publish.publishSuccessUrlPattern`,
 * which is a RegExp. Some publish selectors use the `:has-text()`
 * pseudo-class, which is Playwright selector syntax rather than standard
 * CSS, so those strings only work through Playwright's selector engine.
 *
 * `as const` keeps each value as a readonly literal type.
 */
export const XHS_SELECTORS = {
login: {
/** QR code image on the login modal / page. */
qrCodeImage: '.login-container .qrcode-img',
/** Element present only when the user is logged in (sidebar channel link). */
loggedInIndicator: '.user .link-wrapper .channel',
/** The "login" button that opens the QR code modal (if not already shown). */
loginButton: '.login-btn',
},
feed: {
/** Container for each feed card on the explore page. */
feedCard: '.note-item',
/** The cover image within a feed card. */
coverImage: '.note-item a.cover img',
/** The title/footer within a feed card. */
footerTitle: '.note-item .footer .title',
/** Author name within a feed card. */
authorName: '.note-item .footer .author-wrapper .name',
/** Author avatar within a feed card. */
authorAvatar: '.note-item .footer .author-wrapper .author-head img',
/** Like count within a feed card. */
likeCount: '.note-item .footer .like-wrapper .count',
},
search: {
/** Search result container. */
resultContainer: '#global-search-result-container',
/** Individual search result note items. */
noteItem: '.feeds-container .note-item',
/** Search result cover image. */
coverImage: '.feeds-container .note-item a.cover img',
/** Search result title. */
title: '.feeds-container .note-item .footer .title',
/** Search result author name. */
authorName: '.feeds-container .note-item .footer .author-wrapper .name',
/** Search result author avatar. */
authorAvatar: '.feeds-container .note-item .footer .author-wrapper .author-head img',
/** Search result like count. */
likeCount: '.feeds-container .note-item .footer .like-wrapper .count',
},
feedDetail: {
/** The main content container for a note detail page. */
noteContainer: '#noteContainer',
/** The title of the note. */
title: '#detail-title',
/** The description / body content of the note. */
description: '#detail-desc',
/** Individual images in an image note. */
images: '.note-image-list .note-image img',
/** The single hero image (some notes use this instead of a list). */
heroImage: '.note-hero img',
/** Video player element. */
video: '#videoplayer video',
/** Video player source. */
videoSource: '#videoplayer video source',
/** Tag links within the note body. */
tags: '#detail-desc a.tag',
/** Like count. */
likeCount: '.engage-bar .like-wrapper .count',
/** Collect (favorite) count. */
collectCount: '.engage-bar .collect-wrapper .count',
/** Comment count. */
commentCount: '.engage-bar .chat-wrapper .count',
/** Share count. */
shareCount: '.engage-bar .share-wrapper .count',
/** Publish / create time text. */
createTime: '.note-scroller .bottom-container .date',
/** IP location. */
ipLocation: '.note-scroller .bottom-container .ip-location',
/** Author nickname on the detail page. */
authorName: '.author-container .info .name',
/** Author avatar on the detail page. */
authorAvatar: '.author-container .info .avatar img',
/** Author user ID link. */
authorLink: '.author-container .info a',
/** Comment list container. */
commentListContainer: '.comments-container .list-container',
/** Individual top-level comment items. */
commentItem: '.comments-container .list-container .list-item',
// NOTE(review): the comment* selectors below carry no container prefix;
// presumably they are meant to be resolved relative to a comment item
// element rather than the whole page — confirm against the caller.
/** Parent comment content text. */
commentContent: '.content',
/** Comment author name. */
commentAuthor: '.author .name',
/** Comment author avatar. */
commentAvatar: '.author .avatar img',
/** Comment like count. */
commentLikeCount: '.like .count',
/** Comment publish time. */
commentTime: '.date',
/** Comment IP location. */
commentIpLocation: '.ip-location',
/** Sub-comment (reply) items. */
subCommentItem: '.sub-comment-list .sub-comment-item',
/** "Show more comments" button. */
showMoreComments: '.comments-container .show-more',
/** "Load more replies" button within a comment thread. */
loadMoreReplies: '.sub-comment-list .show-more',
},
userProfile: {
/** Profile header container. */
headerContainer: '.user-info',
/** User nickname. */
nickname: '.user-info .user-name',
/** User avatar image. */
avatar: '.user-info .user-image img',
/** User bio / description text. */
description: '.user-info .user-desc',
/** User gender icon or text. */
gender: '.user-info .gender-icon',
/** IP location. */
ipLocation: '.user-info .user-ip',
/** Follower / following / interaction count elements. */
followCount: '.user-info .data-area .data-item',
/** Note count (displayed somewhere on the profile page). */
noteCountTab: '.reds-tab-item',
/** Individual feed items on the user profile. */
feedItem: '.feeds-container .note-item',
},
// -- Phase 4: Publish -----------------------------------------------------
publish: {
/** The file input element for uploading images on the creator publish page. */
imageFileInput: 'input[type="file"]',
/** Title input field on the publish form. */
titleInput: '#note-title',
/** Content / body editor area on the publish form (contenteditable). */
contentEditor: '#note-content',
/** The tag / topic button that opens the topic input. */
tagButton: '#topicBtn',
/** Tag / topic input field for typing hashtags. */
tagInput: '#topicBtn input',
/** Topic / hashtag suggestion dropdown item. */
tagSuggestionItem: '.publish-topic-item, .topic-item',
/** "Publish" / submit button. */
publishButton: '.publishBtn',
/** Schedule / timing selector button. */
scheduleButton: '.timing-btn, button:has-text("定时")',
/** Schedule date/time input field. */
scheduleInput: '.timing-input input, .schedule-input input',
/** Original content declaration checkbox. */
originalCheckbox: '.original-checkbox input, input[type="checkbox"][name="original"]',
/** Visibility / permission setting button. */
visibilityButton: '.permission-btn, button:has-text("可见")',
/** Visibility option for public. */
visibilityPublic: '.permission-option:has-text("公开"), .visibility-option:has-text("公开")',
/** Visibility option for private. */
visibilityPrivate: '.permission-option:has-text("私密"), .visibility-option:has-text("私密")',
/** Visibility option for friends only. */
visibilityFriends: '.permission-option:has-text("好友"), .visibility-option:has-text("好友")',
/** Upload complete indicator (images uploaded and thumbnails visible). */
uploadedImageItem: '.upload-item img, .img-item img, .image-item img',
/** Video upload complete indicator (video thumbnail visible). */
uploadedVideoItem: '.upload-video video, .video-item video, .video-container video',
/** Success indicator shown after publish completes. */
publishSuccess: '.success-panel, .publish-success, .note-success',
/** URL in the address bar after successful publish (used as a fallback check). */
publishSuccessUrlPattern: /\/publish\/success/,
},
// -- Phase 4: Comment / Reply ---------------------------------------------
comment: {
/** The comment input field / textarea on the feed detail page. */
commentInput: '#content-textarea',
/** Alternative comment input (contenteditable div). */
commentInputAlt: '[contenteditable][data-placeholder]',
/** Comment submit / send button. */
commentSubmitButton: '.comment-submit, button.submit, .btn-send',
/** Parent comment element (used to find specific comment by ID). */
commentItem: '.comment-item, .note-comment-item, [id^="comment-"]',
/** Reply button on an individual comment. */
commentReplyButton: '.reply-btn, .comment-reply',
/** Reply input that appears after clicking reply. */
replyInput: '.reply-input textarea, .reply-content [contenteditable], .reply-area textarea',
},
// -- Phase 4: Interaction (Like / Favorite) --------------------------------
interaction: {
/** Like button on the feed detail page. */
likeButton: '.engage-bar .like-wrapper, span.like-wrapper',
/** Like button in active/liked state. */
likeButtonActive: '.engage-bar .like-wrapper.active, span.like-wrapper.active',
/** Like count element next to the like button. */
likeCount: '.engage-bar .like-wrapper .count',
/** Favorite / collect button on the feed detail page. */
favoriteButton: '.engage-bar .collect-wrapper, span.collect-wrapper',
/** Favorite button in active/favorited state. */
favoriteButtonActive: '.engage-bar .collect-wrapper.active, span.collect-wrapper.active',
/** Favorite count element next to the favorite button. */
favoriteCount: '.engage-bar .collect-wrapper .count',
/** Container for the interaction bar at the bottom of a feed detail. */
interactionBar: '.interact-container, .engage-bar',
},
} as const;
+98
View File
@@ -0,0 +1,98 @@
// ---------------------------------------------------------------------------
// Xiaohongshu domain types
// ---------------------------------------------------------------------------
// -- Login -----------------------------------------------------------------
/** Result of a login check. */
export interface LoginStatus {
/** Whether a logged-in session was detected. */
loggedIn: boolean;
/** Account name; optional. NOTE(review): presumably set only when logged in — confirm. */
username?: string;
}
/** Outcome of requesting a login QR code. */
export interface QRCodeResult {
/** Base64 data URI of the QR code image. */
qrcodeData: string;
/** Whether the user was already logged in (no QR code needed). */
alreadyLoggedIn: boolean;
/** Human-readable timeout hint (e.g. "4m"). */
timeout: string;
}
// -- Feed -----------------------------------------------------------------
/**
 * Author summary attached to a feed. All fields are required strings; the
 * parsers in this package use `''` when a value is unavailable.
 */
export interface FeedUser {
id: string;
nickname: string;
avatar: string;
}
/**
 * Summary of one note as it appears in a list (search results, the notes on
 * a user profile).
 */
export interface Feed {
/** Note id. */
id: string;
/** Security token for the note; `''` when the source did not provide one. */
xsecToken: string;
title: string;
description: string;
/** `'video'` for video notes, `'normal'` for everything else. */
type: 'normal' | 'video';
/** Cover image URL, or `''` when none was found. */
coverUrl: string;
/** Like count as a number. */
likeCount: number;
user: FeedUser;
}
// -- Feed Detail ----------------------------------------------------------
/**
 * Full detail of a single note: content, media, counters, author and
 * comments.
 */
export interface FeedDetail {
id: string;
xsecToken: string;
title: string;
description: string;
type: 'normal' | 'video';
/** Image URLs of the note. */
images: string[];
/** Video URL; optional. NOTE(review): presumably set only for video notes — confirm. */
videoUrl?: string;
tags: string[];
likeCount: number;
collectCount: number;
commentCount: number;
shareCount: number;
/** Creation time as a string; the format is not fixed by this type. */
createTime: string;
/** Last update time as a string; the format is not fixed by this type. */
lastUpdateTime: string;
ipLocation: string;
user: FeedUser;
/** Top-level comments; replies hang off each comment's `subComments`. */
comments: Comment[];
}
// -- Comment --------------------------------------------------------------
/**
 * A comment on a note. The type is recursive: replies are themselves
 * `Comment` values stored in `subComments`.
 */
export interface Comment {
id: string;
userId: string;
nickname: string;
avatar: string;
content: string;
likeCount: number;
/** Creation time as a string; the format is not fixed by this type. */
createTime: string;
ipLocation: string;
/** Replies to this comment. */
subComments: Comment[];
}
// -- User Profile ---------------------------------------------------------
/** A user's profile: basic information, counters and the notes listed on the page. */
export interface UserProfile {
id: string;
nickname: string;
avatar: string;
description: string;
/** The state-based parser yields `'male'`, `'female'`, or `''` when unknown. */
gender: string;
ipLocation: string;
/** Number of accounts this user follows. */
follows: number;
/** Number of followers. */
fans: number;
/** Interaction counter as reported by the profile page. */
interaction: number;
/** Note count as reported by the profile data; may differ from `feeds.length`. */
feedCount: number;
/** Notes found on the profile page. */
feeds: Feed[];
}
// -- Search Filters -------------------------------------------------------
/**
 * Optional search filters. Each member may be omitted; the literals match
 * the enum values accepted by the xhs_search tool schema.
 */
export interface SearchFilters {
sort?: 'general' | 'time_descending' | 'popularity_descending';
type?: 'all' | 'note' | 'video';
time?: 'all' | 'day' | 'week' | 'half_year';
}
+442
View File
@@ -0,0 +1,442 @@
import type { Page } from 'rebrowser-playwright';
import { logger } from '../../utils/logger.js';
import { XHS_SELECTORS } from './selectors.js';
import { extractInitialState, parseCountString, ensureHttps } from './feeds.js';
import type { UserProfile, Feed } from './types.js';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Profile page URL prefix; getUserProfile appends the user id and query string. */
const USER_PROFILE_BASE_URL = 'https://www.xiaohongshu.com/user/profile';
/** Shorthand for the user-profile group of the shared selector catalogue. */
const SEL = XHS_SELECTORS.userProfile;
/** Child logger that tags every entry with this module's name. */
const log = logger.child({ module: 'xhs-user-profile' });
// ---------------------------------------------------------------------------
// __INITIAL_STATE__ raw types for user profile
// ---------------------------------------------------------------------------
/**
 * The parts of `__INITIAL_STATE__` that may hold profile data.
 *
 * parseProfileFromState looks for user info at
 * `user.userPageData.basicInfo`, then `user.userInfo`, then
 * `userProfile.userInfo`.
 */
interface RawProfileState {
user?: {
userPageData?: RawUserPageData;
userInfo?: RawUserInfo;
};
userProfile?: {
userInfo?: RawUserInfo;
notes?: RawProfileNote[];
};
[key: string]: unknown;
}
/**
 * Raw page data for a profile: basic info, counters, notes and note count.
 * The note count may arrive as a number or a string and under either
 * spelling.
 */
interface RawUserPageData {
basicInfo?: RawUserInfo;
interactions?: RawInteractions;
notes?: RawProfileNote[];
noteCount?: number | string;
note_count?: number | string;
}
/**
 * Raw user info with alternative key spellings. Counters and `gender` may be
 * numbers or strings.
 */
interface RawUserInfo {
userId?: string;
user_id?: string;
nickname?: string;
nick_name?: string;
nickName?: string;
avatar?: string;
avatarUrl?: string;
avatar_url?: string;
/** Used by parseProfileFromState as the last-resort avatar source. */
images?: string;
desc?: string;
description?: string;
/** parseProfileFromState maps 1 / '1' to male and 2 / '2' to female. */
gender?: number | string;
ipLocation?: string;
ip_location?: string;
fstatus?: string;
follows?: number | string;
fans?: number | string;
interaction?: number | string;
noteCount?: number | string;
note_count?: number | string;
}
/**
 * Raw follow / fan / interaction counters. When present, these take
 * precedence over the same-named fields on RawUserInfo.
 */
interface RawInteractions {
follows?: string | number;
fans?: string | number;
interaction?: string | number;
}
/**
 * Raw shape of one note listed on a profile page. Every field is optional
 * and most appear under camelCase and snake_case spellings, which
 * parseProfileNote probes in turn.
 */
interface RawProfileNote {
id?: string;
noteId?: string;
note_id?: string;
xsecToken?: string;
xsec_token?: string;
displayTitle?: string;
display_title?: string;
title?: string;
desc?: string;
type?: string;
/** Cover image: direct URL fields plus an info list used as a fallback. */
cover?: {
url?: string;
urlPre?: string;
url_pre?: string;
urlDefault?: string;
url_default?: string;
infoList?: Array<{ url?: string }>;
info_list?: Array<{ url?: string }>;
};
/** Note author; parseProfileNote falls back to the profile owner's id when absent. */
user?: {
userId?: string;
user_id?: string;
nickname?: string;
nick_name?: string;
avatar?: string;
};
interactInfo?: {
likedCount?: string;
liked_count?: string;
likeCount?: string;
like_count?: string;
};
interact_info?: {
likedCount?: string;
liked_count?: string;
likeCount?: string;
like_count?: string;
};
likedCount?: string;
liked_count?: string;
}
// ---------------------------------------------------------------------------
// getUserProfile
// ---------------------------------------------------------------------------
/**
 * Navigate to a Xiaohongshu user profile page and extract the user's
 * information: basic info, follower/following counts, and listed notes.
 *
 * The page's `__INITIAL_STATE__` is tried first. When it is missing or holds
 * no user info, the rendered DOM is scraped instead.
 *
 * @param page - A Playwright Page managed by BrowserManager.
 * @param userId - The user ID.
 * @param xsecToken - Security token required to access the profile page.
 * @returns A UserProfile object with the user's data.
 */
export async function getUserProfile(
  page: Page,
  userId: string,
  xsecToken: string,
): Promise<UserProfile> {
  // Both values are caller-supplied, so each is percent-encoded before it is
  // placed in the URL. An id containing `/`, `?` or `#` can then no longer
  // alter the path or query being requested.
  const url =
    `${USER_PROFILE_BASE_URL}/${encodeURIComponent(userId)}` +
    `?xsec_token=${encodeURIComponent(xsecToken)}&xsec_source=pc_note`;
  log.debug({ userId, url }, 'Navigating to user profile page');
  await page.goto(url, { waitUntil: 'domcontentloaded' });

  // Wait for the profile header; a timeout is logged and tolerated because
  // the state-based extraction below does not depend on it.
  await page
    .waitForSelector(SEL.headerContainer, { timeout: 15_000 })
    .catch(() => {
      log.warn({ userId }, 'User profile header not found within timeout, proceeding');
    });

  // Allow render to settle.
  await page.waitForTimeout(1500);

  // Strategy 1: extract from __INITIAL_STATE__.
  const initialState = (await extractInitialState(page)) as RawProfileState | null;
  if (initialState) {
    const profile = parseProfileFromState(initialState, userId, xsecToken);
    if (profile) {
      log.info(
        { userId, feedCount: profile.feeds.length },
        'Extracted user profile from __INITIAL_STATE__',
      );
      return profile;
    }
    log.debug('__INITIAL_STATE__ found but no profile data extracted, falling back to DOM');
  }

  // Strategy 2: fall back to DOM scraping.
  log.debug({ userId }, 'Falling back to DOM scraping for user profile');
  const profile = await scrapeProfileFromDom(page, userId, xsecToken);
  log.info({ userId, feedCount: profile.feeds.length }, 'Extracted user profile from DOM');
  return profile;
}
// ---------------------------------------------------------------------------
// __INITIAL_STATE__ parsing
// ---------------------------------------------------------------------------
/**
 * Parse user profile data from `__INITIAL_STATE__`.
 *
 * User info is read from the first location that has it:
 * `user.userPageData.basicInfo`, `user.userInfo`, then
 * `userProfile.userInfo`. Counters prefer `userPageData.interactions` and
 * fall back to the user info object.
 *
 * @param state - The page state, as returned by extractInitialState.
 * @param userId - The requested user id; used when the state carries none
 *   and as the fallback author id for the user's notes.
 * @param _xsecToken - Unused; kept so the signature mirrors the DOM scraper.
 * @returns The profile, or `null` when no user info object was found.
 */
function parseProfileFromState(
  state: RawProfileState,
  userId: string,
  _xsecToken: string,
): UserProfile | null {
  const userPageData = state.user?.userPageData;
  const userInfo =
    userPageData?.basicInfo ??
    state.user?.userInfo ??
    state.userProfile?.userInfo;
  if (!userInfo) {
    return null;
  }

  const id = userInfo.userId ?? userInfo.user_id ?? userId;
  const nickname = userInfo.nickname ?? userInfo.nick_name ?? userInfo.nickName ?? '';
  const avatar =
    userInfo.avatar ?? userInfo.avatarUrl ?? userInfo.avatar_url ?? userInfo.images ?? '';
  const description = userInfo.desc ?? userInfo.description ?? '';

  // Gender codes: 1 = male, 2 = female; anything else is reported as ''.
  let gender = '';
  switch (userInfo.gender) {
    case 1:
    case '1':
      gender = 'male';
      break;
    case 2:
    case '2':
      gender = 'female';
      break;
    default:
      break;
  }

  const ipLocation = userInfo.ipLocation ?? userInfo.ip_location ?? '';

  // Follower / following / interaction counts.
  const interactions = userPageData?.interactions;
  const follows = toNumber(interactions?.follows ?? userInfo.follows ?? 0);
  const fans = toNumber(interactions?.fans ?? userInfo.fans ?? 0);
  const interaction = toNumber(interactions?.interaction ?? userInfo.interaction ?? 0);

  // Note count.
  const feedCount = toNumber(
    userPageData?.noteCount ?? userPageData?.note_count ??
    userInfo.noteCount ?? userInfo.note_count ?? 0,
  );

  // Notes on the profile page. The state is page data that was only cast to
  // its type, so its shape is checked at runtime. A non-array `notes` value
  // or a non-object entry is ignored rather than allowed to throw, because a
  // throw would stop the caller from reaching its DOM fallback.
  const candidateNotes: unknown = userPageData?.notes ?? state.userProfile?.notes;
  const feeds: Feed[] = [];
  if (Array.isArray(candidateNotes)) {
    for (const note of candidateNotes) {
      if (note === null || typeof note !== 'object') continue;
      const feed = parseProfileNote(note as RawProfileNote, userId);
      if (feed !== null) feeds.push(feed);
    }
  }

  return {
    id,
    nickname,
    avatar: avatar ? ensureHttps(avatar) : '',
    description,
    gender,
    ipLocation,
    follows,
    fans,
    interaction,
    feedCount,
    feeds,
  };
}
/**
 * Parse a note from the user profile state into a Feed object.
 *
 * @param raw - One raw note entry from the profile state.
 * @param ownerUserId - Id of the profile owner, used as the author id when
 *   the note carries no user id of its own.
 * @returns The Feed, or `null` when the note has no id.
 */
function parseProfileNote(
  raw: RawProfileNote,
  ownerUserId: string,
): Feed | null {
  const id = raw.id ?? raw.noteId ?? raw.note_id ?? '';
  if (!id) return null;

  const noteXsecToken = raw.xsecToken ?? raw.xsec_token ?? '';
  const title = raw.displayTitle ?? raw.display_title ?? raw.title ?? '';
  const description = raw.desc ?? '';

  const rawType = raw.type ?? '';
  const type: 'normal' | 'video' =
    rawType.toLowerCase().includes('video') ? 'video' : 'normal';

  // Cover image. `||` is deliberate here: a field that is present but empty
  // must not stop the search, which `??` would do. This matches how
  // extractSearchImageUrl treats the same kind of object.
  const cover = raw.cover;
  const coverCandidate =
    cover?.url ||
    cover?.urlPre ||
    cover?.url_pre ||
    cover?.urlDefault ||
    cover?.url_default ||
    (cover?.infoList ?? cover?.info_list)?.[0]?.url ||
    '';
  const coverUrl = coverCandidate ? ensureHttps(coverCandidate) : '';

  // Like count: nested interaction info first, then the top-level fields.
  const interact = raw.interactInfo ?? raw.interact_info;
  const likeCountStr =
    interact?.likedCount ?? interact?.liked_count ??
    interact?.likeCount ?? interact?.like_count ??
    raw.likedCount ?? raw.liked_count ?? '0';
  const likeCount = parseCountString(likeCountStr);

  // Author; defaults to the profile owner when the note names no user.
  const rawUser = raw.user;
  const user = {
    id: rawUser?.userId ?? rawUser?.user_id ?? ownerUserId,
    nickname: rawUser?.nickname ?? rawUser?.nick_name ?? '',
    avatar: rawUser?.avatar ?? '',
  };

  return {
    id,
    xsecToken: noteXsecToken,
    title,
    description,
    type,
    coverUrl,
    likeCount,
    user,
  };
}
// ---------------------------------------------------------------------------
// DOM scraping fallback — uses Playwright Node-side API exclusively
// ---------------------------------------------------------------------------
/**
* Fallback scraper: build a `UserProfile` by querying the rendered profile
* page through Playwright's `$eval` / `$$eval` / `$` / `$$` helpers.
*
* Each individual field lookup swallows its own failure and falls back to an
* empty string or zero, so a missing element never aborts the scrape. The one
* exception is the initial `page.$$(SEL.feedItem)` query, which has no
* fallback and can therefore reject.
*
* @param page - Page that is expected to already show the user's profile.
* @param userId - Returned as the profile id and used as the author id of every feed.
* @param xsecToken - Used for a feed when its link carries no `xsec_token` of its own.
* @returns The scraped profile; fields that could not be read are empty / zero.
*/
async function scrapeProfileFromDom(
page: Page,
userId: string,
xsecToken: string,
): Promise<UserProfile> {
// Nickname
const nickname = await page
.$eval(SEL.nickname, (el) => el.textContent?.trim() ?? '')
.catch(() => '');
// Avatar (raw `src` attribute, returned as found in the DOM)
const avatar = await page
.$eval(SEL.avatar, (img) => img.getAttribute('src') ?? '')
.catch(() => '');
// Description / bio
const description = await page
.$eval(SEL.description, (el) => el.textContent?.trim() ?? '')
.catch(() => '');
// Gender — derived from the class name of the gender icon. "female"
// contains "male", so the male test explicitly excludes it.
const gender = await page
.$eval(SEL.gender, (el) => {
const cls = el.className.toLowerCase();
if (cls.includes('male') && !cls.includes('female')) return 'male';
if (cls.includes('female')) return 'female';
return '';
})
.catch(() => '');
// IP location
const ipLocation = await page
.$eval(SEL.ipLocation, (el) => el.textContent?.trim() ?? '')
.catch(() => '');
// Follower / following / interaction counts.
// Read the `.count` text of every element matching SEL.followCount; the
// values are then taken by position: [0] follows, [1] fans, [2] interaction.
const dataCounts = await page.$$eval(SEL.followCount, (items) =>
items.map((item) => {
const countEl = item.querySelector('.count');
return countEl?.textContent?.trim() ?? '0';
}),
).catch(() => [] as string[]);
const follows = parseCountString(dataCounts[0] ?? '0');
const fans = parseCountString(dataCounts[1] ?? '0');
const interaction = parseCountString(dataCounts[2] ?? '0');
// Note count: find the first tab whose text contains "笔记" (written as
// \u7B14\u8BB0 below) and take the first run of digits in it; 0 when no
// such tab or no digits are found.
const feedCount = await page
.$$eval(SEL.noteCountTab, (tabs) => {
for (const tab of tabs) {
const text = tab.textContent ?? '';
if (text.includes('\u7B14\u8BB0')) {
const match = text.match(/\d+/);
return match ? parseInt(match[0], 10) : 0;
}
}
return 0;
})
.catch(() => 0);
// Scrape feed items on the profile page. Cards are processed one at a time;
// a card without a recognisable note id in its link is skipped.
const feedElements = await page.$$(SEL.feedItem);
const feeds: Feed[] = [];
for (const card of feedElements) {
try {
const href = await card
.$eval('a.cover', (el) => el.getAttribute('href') ?? '')
.catch(() => '');
// The note id is the hex segment after /explore/; the token is the raw
// (not URL-decoded) value of the xsec_token query parameter.
const idMatch = href.match(/\/explore\/([a-f0-9]+)/);
const tokenMatch = href.match(/xsec_token=([^&]+)/);
const id = idMatch?.[1] ?? '';
const noteXsecToken = tokenMatch?.[1] ?? '';
if (!id) continue;
const coverUrl = await card
.$eval('a.cover img', (el) => el.getAttribute('src') ?? el.getAttribute('data-src') ?? '')
.catch(() => '');
const feedTitle = await card
.$eval('.footer .title', (el) => el.textContent?.trim() ?? '')
.catch(() => '');
const likeText = await card
.$eval('.footer .like-wrapper .count', (el) => el.textContent?.trim() ?? '0')
.catch(() => '0');
// A `.play-icon` element inside the card marks the note as a video.
const hasVideoIcon = await card.$('.play-icon').then((el) => el !== null).catch(() => false);
feeds.push({
id,
xsecToken: noteXsecToken || xsecToken,
title: feedTitle,
description: '',
type: hasVideoIcon ? 'video' : 'normal',
coverUrl,
likeCount: parseCountString(likeText),
// Author nickname and avatar are not read per card; only the id is known.
user: { id: userId, nickname: '', avatar: '' },
});
} catch {
// Best effort: a card that fails unexpectedly is dropped, the rest continue.
continue;
}
}
return {
id: userId,
nickname,
avatar,
description,
gender,
ipLocation,
follows,
fans,
interaction,
feedCount,
feeds,
};
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Coerce a count that may arrive as either a number or a string.
 *
 * Numbers are returned untouched; strings are delegated to
 * `parseCountString` (presumably where abbreviated counts are handled —
 * confirm against that helper).
 */
function toNumber(val: string | number): number {
  return typeof val === 'number' ? val : parseCountString(val);
}
+342
View File
@@ -0,0 +1,342 @@
import http from 'node:http';
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
import { SSEServerTransport } from '@modelcontextprotocol/sdk/server/sse.js';
import express from 'express';
import { config } from '../config/index.js';
import { BrowserManager, browserManager } from '../browser/manager.js';
import { logger } from '../utils/logger.js';
import {
dnsRebindingGuard,
shutdownGuard,
errorHandler,
bearerAuth,
initBearerToken,
} from './middleware.js';
// ---------------------------------------------------------------------------
// Package version -- reported by the /health endpoint and in the MCP server
// info. It is a hard-coded string literal rather than a value imported from
// JSON (which would require `resolveJsonModule` + ESM assertion gymnastics).
// NOTE(review): presumably this has to be kept in step with package.json by
// hand -- confirm.
// ---------------------------------------------------------------------------
const PACKAGE_VERSION = '0.1.0';
// ---------------------------------------------------------------------------
// PlatformPlugin interface
// ---------------------------------------------------------------------------
/**
* Contract that every platform adapter (Twitter, Xiaohongshu, etc.) must
* implement to plug into the application.
*
* - `registerTools` is the minimum requirement: expose MCP tools.
* - The optional hooks let a plugin mount REST routes, perform async
*   initialisation, clean up on shutdown, and report its health. A plugin
*   that omits a hook is simply skipped for that step.
*/
export interface PlatformPlugin {
/** Human-readable name used in logs and as the key in health-check output. */
name: string;
/** Register MCP tools on the shared McpServer instance. */
registerTools(server: McpServer, browser: BrowserManager): void;
/** Optionally add REST routes to the Express router handed in by the server. */
registerRoutes?(router: express.Router, browser: BrowserManager): void;
/** Async initialisation (cookie restore, feature detection, etc.). */
init?(): Promise<void>;
/** Teardown hook called during graceful shutdown. */
shutdown?(): Promise<void>;
/** Return platform-specific health information; `message` is optional detail. */
healthCheck?(): Promise<{ healthy: boolean; message?: string }>;
}
// ---------------------------------------------------------------------------
// AppServer
// ---------------------------------------------------------------------------
/**
 * HTTP front end of the service.
 *
 * Wraps an Express app that exposes the MCP server over SSE (`GET /sse` plus
 * `POST /messages`), a `/health` endpoint, and the Bearer-protected `/api`
 * namespace in which plugins mount their REST routes.
 *
 * Intended lifecycle: construct, call `registerPlugin()` for each platform,
 * then `start()`, and finally `close()`.
 */
export class AppServer {
  // -- Public surface -------------------------------------------------------

  /** The underlying Express application -- use for plugin route mounting. */
  readonly app: express.Express;

  /** The MCP server instance -- use for plugin tool registration. */
  readonly mcpServer: McpServer;

  // -- Internal state -------------------------------------------------------

  /** Set by `start()`, cleared by `close()`. */
  private httpServer: http.Server | null = null;

  /** Flipped once by `close()`; read by the shutdown guard and `/health`. */
  private shuttingDown = false;

  /** Plugins in registration order. */
  private readonly plugins: PlatformPlugin[] = [];

  /**
   * SSE transports keyed by session ID so that POST /messages can route
   * incoming JSON-RPC messages to the correct transport instance.
   */
  private readonly transports = new Map<string, SSEServerTransport>();

  // -- Constructor ----------------------------------------------------------

  /**
   * Build the Express pipeline. Registration order matters: body parsing and
   * the two guards run before every route, and the Bearer check is mounted on
   * `/api` before any plugin router is added there.
   */
  constructor() {
    // 1. Express app + body parsing
    this.app = express();
    this.app.use(express.json());

    // 2. Security & availability middleware
    this.app.use(dnsRebindingGuard);
    this.app.use(shutdownGuard(() => this.shuttingDown));

    // 3. MCP server
    this.mcpServer = new McpServer(
      { name: 'social-mcp', version: PACKAGE_VERSION },
    );

    // 4. SSE transport endpoints
    this.setupSseEndpoints();

    // 5. Health endpoint
    this.setupHealthEndpoint();

    // 6. Bearer token auth for /api/* routes
    initBearerToken();
    this.app.use('/api', bearerAuth);

    // 7. Error handler (must be registered last -- re-registered after plugins)
    this.app.use(errorHandler);
  }

  // -- Plugin registration --------------------------------------------------

  /**
   * Register a platform plugin. Call this **before** `start()` so that all
   * tools and routes are wired up before the server begins accepting
   * connections.
   *
   * @param plugin - The platform adapter to wire into the MCP server and app.
   */
  registerPlugin(plugin: PlatformPlugin): void {
    logger.info({ plugin: plugin.name }, 'Registering platform plugin');
    plugin.registerTools(this.mcpServer, browserManager);

    if (plugin.registerRoutes) {
      const router = express.Router();
      plugin.registerRoutes(router, browserManager);
      // Mount REST API routes under /api/xhs (for xiaohongshu).
      // NOTE(review): the mount point is the same for every plugin, so a
      // second platform with routes would share this prefix.
      this.app.use(`/api/xhs`, router);
    }

    this.plugins.push(plugin);
  }

  // -- Lifecycle ------------------------------------------------------------

  /**
   * Initialise all plugins and start listening for HTTP connections on
   * `config.host:config.port`.
   *
   * @returns A promise that resolves once the server is listening and rejects
   *   if a plugin's `init()` throws or the listener emits an error.
   */
  async start(): Promise<void> {
    // Initialise plugins (sequentially so order is deterministic).
    for (const plugin of this.plugins) {
      if (plugin.init) {
        logger.info({ plugin: plugin.name }, 'Initialising plugin');
        await plugin.init();
      }
    }

    // Re-register the error handler so it sits after any plugin routes.
    this.app.use(errorHandler);

    return new Promise<void>((resolve, reject) => {
      this.httpServer = this.app
        .listen(config.port, config.host, () => {
          logger.info(
            { host: config.host, port: config.port },
            'AppServer listening',
          );
          resolve();
        })
        .on('error', (err: Error) => {
          reject(err);
        });
    });
  }

  /**
   * Initiate graceful shutdown:
   * 1. Set the shutting-down flag (new requests get 503).
   * 2. Shut down every plugin.
   * 3. Close all SSE transports and the MCP server.
   * 4. Close the HTTP server.
   *
   * Failures in steps 2 and 3 are logged and do not stop the sequence.
   * Calling `close()` again after shutdown has begun is a no-op.
   */
  async close(): Promise<void> {
    if (this.shuttingDown) return;
    this.shuttingDown = true;
    logger.info('AppServer shutting down');

    // Shut down plugins
    for (const plugin of this.plugins) {
      if (plugin.shutdown) {
        try {
          await plugin.shutdown();
        } catch (err: unknown) {
          logger.warn({ err, plugin: plugin.name }, 'Error shutting down plugin');
        }
      }
    }

    // Close all SSE transports
    for (const [sessionId, transport] of this.transports) {
      try {
        await transport.close();
      } catch (err: unknown) {
        logger.warn({ err, sessionId }, 'Error closing SSE transport');
      }
    }
    this.transports.clear();

    // Close the MCP server
    try {
      await this.mcpServer.close();
    } catch (err: unknown) {
      logger.warn({ err }, 'Error closing MCP server');
    }

    // Close the HTTP server
    if (this.httpServer) {
      await new Promise<void>((resolve) => {
        this.httpServer!.close(() => {
          resolve();
        });
      });
      this.httpServer = null;
    }

    logger.info('AppServer shut down complete');
  }

  // -- Private: SSE endpoints -----------------------------------------------

  /** Wire up `GET /sse` (open a session) and `POST /messages` (send to it). */
  private setupSseEndpoints(): void {
    // GET /sse -- establish a new SSE connection
    this.app.get('/sse', (req, res) => {
      logger.debug({ ip: req.ip }, 'New SSE connection request');
      const transport = new SSEServerTransport('/messages', res);
      const sessionId = transport.sessionId;
      this.transports.set(sessionId, transport);
      logger.info({ sessionId }, 'SSE transport created');

      // Clean up when the client disconnects.
      res.on('close', () => {
        logger.info({ sessionId }, 'SSE client disconnected');
        this.transports.delete(sessionId);
      });

      // Connect the transport to the MCP server. This starts the SSE
      // stream and sends the initial endpoint event to the client.
      void this.mcpServer.connect(transport).catch((err: unknown) => {
        logger.error({ err, sessionId }, 'Failed to connect SSE transport to MCP server');
        this.transports.delete(sessionId);
      });
    });

    // POST /messages -- receive JSON-RPC messages for an existing session
    this.app.post('/messages', (req, res) => {
      // A repeated query parameter arrives as an array; only a single,
      // non-empty string identifies a session.
      const sessionId = req.query['sessionId'];
      if (typeof sessionId !== 'string' || sessionId === '') {
        res.status(400).json({ error: 'Missing sessionId query parameter' });
        return;
      }

      const transport = this.transports.get(sessionId);
      if (!transport) {
        res.status(404).json({ error: 'Unknown or expired session' });
        return;
      }

      // The global express.json() middleware has already consumed the request
      // stream, so the transport cannot read the body itself. Hand it the
      // parsed body and let it route the JSON-RPC message to the MCP server.
      void transport.handlePostMessage(req, res, req.body).catch((err: unknown) => {
        logger.error({ err, sessionId }, 'Error handling POST /messages');
        if (!res.headersSent) {
          res.status(500).json({ error: 'Internal server error' });
        }
      });
    });
  }

  // -- Private: Health endpoint ---------------------------------------------

  /** Wire up `GET /health`: 200 when healthy, 503 when not, 500 on failure. */
  private setupHealthEndpoint(): void {
    this.app.get('/health', (_req, res) => {
      void this.buildHealthResponse()
        .then((body) => {
          const status = body.healthy ? 200 : 503;
          res.status(status).json(body);
        })
        .catch((err: unknown) => {
          logger.error({ err }, 'Health check failed unexpectedly');
          res.status(500).json({ healthy: false, error: 'Health check error' });
        });
    });
  }

  /**
   * Collect the health payload: memory usage in MiB, number of open SSE
   * sessions, and one entry per plugin. A plugin without a `healthCheck`
   * counts as healthy; one whose check throws counts as unhealthy with the
   * error message attached. The service as a whole is healthy only when every
   * plugin is and shutdown has not started.
   */
  private async buildHealthResponse(): Promise<Record<string, unknown>> {
    // Memory usage
    const mem = process.memoryUsage();
    const memoryMb = {
      rss: Math.round(mem.rss / 1024 / 1024),
      heapUsed: Math.round(mem.heapUsed / 1024 / 1024),
      heapTotal: Math.round(mem.heapTotal / 1024 / 1024),
      external: Math.round(mem.external / 1024 / 1024),
    };

    // Active SSE sessions
    const activeSessions = this.transports.size;

    // Plugin health checks
    const pluginHealth: Record<string, { healthy: boolean; message?: string }> = {};
    let allPluginsHealthy = true;
    for (const plugin of this.plugins) {
      if (plugin.healthCheck) {
        try {
          const result = await plugin.healthCheck();
          pluginHealth[plugin.name] = result;
          if (!result.healthy) {
            allPluginsHealthy = false;
          }
        } catch (err: unknown) {
          const message = err instanceof Error ? err.message : String(err);
          pluginHealth[plugin.name] = { healthy: false, message };
          allPluginsHealthy = false;
        }
      } else {
        pluginHealth[plugin.name] = { healthy: true };
      }
    }

    const healthy = allPluginsHealthy && !this.shuttingDown;
    return {
      healthy,
      version: PACKAGE_VERSION,
      uptime: Math.round(process.uptime()),
      shuttingDown: this.shuttingDown,
      activeSessions,
      plugins: pluginHealth,
      memory: memoryMb,
    };
  }
}
+278
View File
@@ -0,0 +1,278 @@
import crypto from 'node:crypto';
import fs from 'node:fs';
import path from 'node:path';
import type { Request, Response, NextFunction } from 'express';
import { config } from '../config/index.js';
import { logger } from '../utils/logger.js';
import { sanitizeErrorMessage } from '../utils/errors.js';
// ---------------------------------------------------------------------------
// Allowed hosts for DNS rebinding protection
//
// Exact `Host` header values that are accepted: the two loopback names, each
// with and without the configured port. The set is built once at module load
// from `config.port`.
// ---------------------------------------------------------------------------
const allowedHosts = new Set<string>([
'127.0.0.1',
'localhost',
`127.0.0.1:${config.port}`,
`localhost:${config.port}`,
]);
// ---------------------------------------------------------------------------
// 1. DNS Rebinding Guard
// ---------------------------------------------------------------------------
/**
 * Reject requests whose `Host` header does not match an expected localhost
 * value. This prevents DNS rebinding attacks from reaching the service when
 * it is bound to the loopback interface.
 *
 * Host names are case-insensitive, so the header is lower-cased before it is
 * looked up in `allowedHosts` (whose entries are all lower-case).
 *
 * Responds with 403 `{ error: 'Forbidden' }` and logs a warning when the
 * header is missing or not allowed; otherwise calls `next()`.
 */
export function dnsRebindingGuard(
  req: Request,
  res: Response,
  next: NextFunction,
): void {
  const host = req.headers.host;
  const normalizedHost = host?.toLowerCase();

  if (!normalizedHost || !allowedHosts.has(normalizedHost)) {
    // Log the header exactly as received to keep the audit trail faithful.
    logger.warn(
      { host, ip: req.ip, method: req.method, url: req.originalUrl },
      'DNS rebinding guard: blocked request with disallowed Host header',
    );
    res.status(403).json({ error: 'Forbidden' });
    return;
  }

  next();
}
// ---------------------------------------------------------------------------
// 2. Shutdown Guard (factory)
// ---------------------------------------------------------------------------
/**
 * Build a middleware that turns requests away with 503 once graceful
 * shutdown has begun and lets them through untouched before that.
 *
 * The flag is read through a callback on every request, so the middleware
 * always sees the current state.
 *
 * @param getShuttingDown - Returns `true` while shutdown is in progress.
 * @returns Express middleware implementing the check.
 */
export function shutdownGuard(
  getShuttingDown: () => boolean,
): (req: Request, res: Response, next: NextFunction) => void {
  return function rejectDuringShutdown(
    _req: Request,
    res: Response,
    next: NextFunction,
  ): void {
    if (!getShuttingDown()) {
      next();
      return;
    }
    res.status(503).json({ error: 'Server is shutting down' });
  };
}
// ---------------------------------------------------------------------------
// 3. Error Handler
// ---------------------------------------------------------------------------
/**
 * Express error-handling middleware (four-argument signature).
 *
 * Logs the full error internally while returning a sanitized message to the
 * client so that internal filesystem paths, tokens, and stack traces are
 * never exposed.
 *
 * If the response has already started (headers sent), a JSON body can no
 * longer be written; the error is then passed on to `next` so that Express'
 * default handler can terminate the connection.
 *
 * @param err - The error raised somewhere in the request pipeline.
 * @param req - Request being processed (method and URL are logged).
 * @param res - Response that receives the 500 JSON body.
 * @param next - Used only to delegate when the response has already started.
 */
export function errorHandler(
  err: Error,
  req: Request,
  res: Response,
  next: NextFunction,
): void {
  logger.error(
    { err, method: req.method, url: req.originalUrl },
    'Unhandled error in request pipeline',
  );

  if (res.headersSent) {
    next(err);
    return;
  }

  const message = sanitizeErrorMessage(err.message || 'Internal server error');
  res.status(500).json({ error: message });
}
// ---------------------------------------------------------------------------
// 4. Bearer Token Authentication
// ---------------------------------------------------------------------------
/** File name of the persisted API token inside `config.cookieDir`. */
const TOKEN_FILENAME = '.api-token';
/**
* Cached token once loaded/generated. Stays `null` until `initBearerToken()`
* has run; `bearerAuth` answers 500 while it is still `null`.
*/
let cachedToken: string | null = null;
/**
 * Load or generate the Bearer API token.
 *
 * - On first start, generates a random 32-byte token (64 hex characters).
 * - Stores it at `config.cookieDir/.api-token` with 0o600 permissions.
 * - On subsequent starts, reads the existing token from disk; a stored value
 *   shorter than 32 characters is ignored and replaced.
 * - Prints the token to stdout so the user can copy it.
 *
 * Must be called once during server startup. Later calls return the cached
 * value without touching the disk.
 *
 * @returns The active API token.
 * @throws If a new token has to be generated and cannot be written to disk.
 */
export function initBearerToken(): string {
  if (cachedToken) return cachedToken;

  const tokenPath = path.join(config.cookieDir, TOKEN_FILENAME);

  // Deliberately bypasses the structured logger: the operator needs the raw
  // token, and the logger would redact or reformat it.
  const announce = (token: string): void => {
    // eslint-disable-next-line no-console
    console.log(`\n REST API Bearer Token: ${token}\n`);
  };

  // Ensure the directory exists. With `recursive: true` an existing directory
  // is not an error, so a failure here is a real problem (e.g. permissions).
  // Stay best-effort -- an existing token may still be readable -- but say so.
  try {
    fs.mkdirSync(config.cookieDir, { recursive: true, mode: 0o700 });
  } catch (err: unknown) {
    logger.warn({ err }, 'Could not create API token directory');
  }

  // Try to read an existing token.
  try {
    const existing = fs.readFileSync(tokenPath, 'utf-8').trim();
    if (existing.length >= 32) {
      cachedToken = existing;
      logger.info('API Bearer token loaded from disk');
      announce(existing);
      return existing;
    }
  } catch {
    // File does not exist or is unreadable — generate a new token.
  }

  // Generate a new token.
  const fresh = crypto.randomBytes(32).toString('hex');
  cachedToken = fresh;
  fs.writeFileSync(tokenPath, fresh + '\n', { mode: 0o600 });

  // `mode` is only honoured when the file is created. If a too-short token
  // file already existed with looser permissions, tighten them explicitly.
  try {
    fs.chmodSync(tokenPath, 0o600);
  } catch (err: unknown) {
    logger.warn({ err }, 'Could not restrict permissions on API token file');
  }

  logger.info('New API Bearer token generated and saved');
  announce(fresh);
  return fresh;
}
/**
 * Express middleware guarding `/api/*`: the request must carry an
 * `Authorization: Bearer <token>` header whose token equals the stored one.
 *
 * Outcomes, checked in this order:
 * - 401 `UNAUTHORIZED` — header missing or not starting with `Bearer `.
 * - 500 `INTERNAL` — no token has been initialised yet.
 * - 403 `FORBIDDEN` — token differs (compared with `crypto.timingSafeEqual`
 *   after a length check, since that function requires equal lengths).
 * - otherwise `next()` is called.
 */
export function bearerAuth(
  req: Request,
  res: Response,
  next: NextFunction,
): void {
  const scheme = 'Bearer ';
  const deny = (status: number, code: string, message: string): void => {
    res.status(status).json({ success: false, error: { code, message } });
  };

  const header = req.headers.authorization ?? '';
  if (!header.startsWith(scheme)) {
    deny(401, 'UNAUTHORIZED', 'Missing or invalid Authorization header');
    return;
  }
  const provided = header.slice(scheme.length);

  if (!cachedToken) {
    deny(500, 'INTERNAL', 'API token not initialized');
    return;
  }

  // Constant-time comparison so response timing does not leak token contents.
  const actual = Buffer.from(provided, 'utf-8');
  const expected = Buffer.from(cachedToken, 'utf-8');
  const tokensMatch =
    actual.length === expected.length &&
    crypto.timingSafeEqual(actual, expected);
  if (!tokensMatch) {
    deny(403, 'FORBIDDEN', 'Invalid Bearer token');
    return;
  }

  next();
}
// ---------------------------------------------------------------------------
// 5. Rate Limiter (in-memory, per-IP)
// ---------------------------------------------------------------------------
/** Configuration accepted by `rateLimiter()`. */
interface RateLimiterOptions {
/** Length of the sliding time window in milliseconds. */
windowMs: number;
/** Maximum number of requests allowed per client within one window. */
maxRequests: number;
}
/** Per-client bookkeeping kept by the rate limiter. */
interface RateLimiterEntry {
/** Arrival times (ms since epoch) of accepted requests still inside the window. */
timestamps: number[];
}
/**
 * Build an in-memory, per-client sliding-window rate limiter.
 *
 * Clients are keyed by `req.ip`, then the socket's remote address, then the
 * literal `'unknown'`. Only accepted requests are recorded. A rejected
 * request gets a 429 with a `Retry-After` header giving the number of whole
 * seconds until the oldest recorded request leaves the window.
 *
 * A sweep runs every 60 seconds and drops clients with no requests left in
 * the window, so the map does not grow without bound. The sweep timer is
 * unref'ed and therefore does not keep the process alive.
 *
 * @param opts - Window length and request budget.
 * @returns Express middleware enforcing the limit.
 */
export function rateLimiter(opts: RateLimiterOptions) {
  const hitsByClient = new Map<string, RateLimiterEntry>();

  // Predicate factory: keeps timestamps that are still inside the window.
  const stillInWindow = (now: number) => (stamp: number): boolean =>
    now - stamp < opts.windowMs;

  // Periodic sweep of clients that have gone quiet.
  const sweeper = setInterval(() => {
    const now = Date.now();
    for (const [client, record] of hitsByClient) {
      record.timestamps = record.timestamps.filter(stillInWindow(now));
      if (record.timestamps.length === 0) {
        hitsByClient.delete(client);
      }
    }
  }, 60_000);

  // Do not let the sweep timer keep the process alive during shutdown.
  if (typeof sweeper === 'object' && 'unref' in sweeper) {
    sweeper.unref();
  }

  return (req: Request, res: Response, next: NextFunction): void => {
    const client = req.ip ?? req.socket.remoteAddress ?? 'unknown';
    const now = Date.now();

    let record = hitsByClient.get(client);
    if (record === undefined) {
      record = { timestamps: [] };
      hitsByClient.set(client, record);
    }

    // Forget everything that has fallen out of the current window.
    const recent = record.timestamps.filter(stillInWindow(now));
    record.timestamps = recent;

    if (recent.length < opts.maxRequests) {
      recent.push(now);
      next();
      return;
    }

    // Budget exhausted: tell the client when the oldest hit expires.
    const oldest = recent[0] ?? now;
    const retryAfterSec = Math.ceil((opts.windowMs - (now - oldest)) / 1000);
    res.set('Retry-After', String(retryAfterSec));
    res.status(429).json({
      success: false,
      error: {
        code: 'RATE_LIMITED',
        message: `Too many requests. Try again in ${String(retryAfterSec)} seconds.`,
      },
    });
  };
}
+242
View File
@@ -0,0 +1,242 @@
import { open, stat, unlink, writeFile, mkdir } from "node:fs/promises";
import path from "node:path";
import { randomUUID } from "node:crypto";
import { logger } from "./logger.js";
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/** Size limit applied by `validateMediaPath` when the caller passes none. */
const DEFAULT_MAX_SIZE_MB = 20;
/** Bytes per mebibyte, used to convert the MB limits into byte counts. */
const BYTES_PER_MB = 1024 * 1024;
/**
* Minimum bytes we need to read to identify all supported formats. The
* longest signature checked is WebP's, which ends at byte 11.
*/
const MAGIC_BYTES_LEN = 12;
/**
* Map Content-Type values to file extensions. Used as a fallback when the
* URL does not contain a recognisable extension.
*/
const MIME_TO_EXT: Record<string, string> = {
"image/jpeg": ".jpg",
"image/png": ".png",
"image/webp": ".webp",
"video/mp4": ".mp4",
};
// ---------------------------------------------------------------------------
// Magic-byte detection
// ---------------------------------------------------------------------------
/**
 * Identify a media type from the first bytes of a file.
 *
 * Signatures are tried in the order JPEG, PNG, WebP, MP4. The MP4 test only
 * looks for "ftyp" at offset 4 and does not inspect the brand that follows.
 *
 * @param header - Leading bytes of the file; shorter buffers simply fail to match.
 * @returns The MIME type, or `undefined` when no signature matches.
 */
function detectMimeType(header: Buffer): string | undefined {
  // True when `expected` appears in `header` starting at `offset`. Reading
  // past the end yields `undefined`, which never equals a byte value.
  const hasBytesAt = (offset: number, expected: readonly number[]): boolean =>
    expected.every((byte, i) => header[offset + i] === byte);

  // JPEG: FF D8 FF
  if (hasBytesAt(0, [0xff, 0xd8, 0xff])) {
    return "image/jpeg";
  }

  // PNG: 89 "PNG"
  if (hasBytesAt(0, [0x89, 0x50, 0x4e, 0x47])) {
    return "image/png";
  }

  // WebP: "RIFF" at 0 and "WEBP" at 8 (bytes 4-7 hold the chunk size).
  const isRiff = hasBytesAt(0, [0x52, 0x49, 0x46, 0x46]);
  if (isRiff && hasBytesAt(8, [0x57, 0x45, 0x42, 0x50])) {
    return "image/webp";
  }

  // MP4: "ftyp" at byte offset 4
  if (hasBytesAt(4, [0x66, 0x74, 0x79, 0x70])) {
    return "video/mp4";
  }

  return undefined;
}
// ---------------------------------------------------------------------------
// validateMediaPath
// ---------------------------------------------------------------------------
/**
 * Validate that a local media file exists, is within size limits, and is one
 * of the supported media types (JPEG, PNG, WebP, MP4).
 *
 * The type is determined from the file's leading bytes, not from its name.
 *
 * @param filePath - Path to the file; relative paths are resolved against the
 *   current working directory.
 * @param opts - Optional overrides; `maxSizeMB` defaults to `DEFAULT_MAX_SIZE_MB`.
 * @returns The resolved absolute path to the file.
 * @throws If the path contains a ".." segment, the file is missing, is not a
 *   regular file, is too large, is too small to identify, or is of an
 *   unsupported type.
 */
export async function validateMediaPath(
  filePath: string,
  opts?: { maxSizeMB?: number },
): Promise<string> {
  // Guard against path traversal: reject ".." *segments* in the raw input
  // (checked before resolution, which would silently collapse them). Both
  // separator styles are treated as separators so the check is equally strict
  // on every platform, while ordinary names that merely contain two dots
  // ("trip..2024.jpg") remain valid.
  if (filePath.split(/[\\/]+/).includes("..")) {
    throw new Error(
      `Path traversal detected: the path must not contain ".." segments`,
    );
  }

  const resolved = path.resolve(filePath);

  // Existence & size check
  let stats: Awaited<ReturnType<typeof stat>>;
  try {
    stats = await stat(resolved);
  } catch {
    throw new Error(`File not found: ${resolved}`);
  }
  if (!stats.isFile()) {
    throw new Error(`Not a regular file: ${resolved}`);
  }

  const maxBytes = (opts?.maxSizeMB ?? DEFAULT_MAX_SIZE_MB) * BYTES_PER_MB;
  if (stats.size > maxBytes) {
    const sizeMB = (stats.size / BYTES_PER_MB).toFixed(2);
    const limitMB = (maxBytes / BYTES_PER_MB).toFixed(0);
    throw new Error(
      `File too large: ${sizeMB} MB exceeds the ${limitMB} MB limit`,
    );
  }
  if (stats.size < MAGIC_BYTES_LEN) {
    throw new Error(`File too small to identify media type (${stats.size} bytes)`);
  }

  // MIME type check via magic bytes
  const fd = await open(resolved, "r");
  try {
    const buf = Buffer.alloc(MAGIC_BYTES_LEN);
    await fd.read(buf, 0, MAGIC_BYTES_LEN, 0);
    const mime = detectMimeType(buf);
    if (mime === undefined) {
      throw new Error(
        `Unsupported media type for file: ${resolved}. ` +
          `Supported types: JPEG, PNG, WebP, MP4`,
      );
    }
    logger.debug({ path: resolved, mime, bytes: stats.size }, "媒体文件校验通过");
  } finally {
    // Always release the descriptor, including on the unsupported-type path.
    await fd.close();
  }

  return resolved;
}
// ---------------------------------------------------------------------------
// downloadFile
// ---------------------------------------------------------------------------
/**
 * Derive a file extension from the URL path or the Content-Type header.
 *
 * The URL wins when its pathname ends in an extension of at most five
 * characters (dot included); the result is lower-cased. Otherwise the
 * Content-Type (parameters such as `; charset=…` ignored) is looked up in
 * `MIME_TO_EXT`.
 *
 * @param url - Source URL; an unparsable URL simply skips the first step.
 * @param contentType - Raw Content-Type header value, or `null` if absent.
 * @returns The extension including the leading dot, or an empty string when
 *   neither source yields a known extension.
 */
function deriveExtension(url: string, contentType: string | null): string {
  // Try to pull an extension from the URL pathname first.
  try {
    const pathname = new URL(url).pathname;
    const ext = path.extname(pathname).toLowerCase();
    if (ext && ext.length <= 5) {
      return ext;
    }
  } catch {
    // URL parsing failed -- fall through to Content-Type.
  }

  // Fall back to Content-Type header.
  const baseMime = contentType?.split(";")[0]?.trim().toLowerCase();
  // The header comes from a remote server. Only own keys of the table count:
  // a plain index lookup would also find inherited members such as
  // "constructor" or "toString" and return a function instead of a string.
  if (baseMime && Object.prototype.hasOwnProperty.call(MIME_TO_EXT, baseMime)) {
    const ext = MIME_TO_EXT[baseMime];
    if (ext) {
      return ext;
    }
  }

  return "";
}
/**
 * Fetch `url` and store the response body in `destDir` under a random
 * (UUID-based) file name, created with 0o600 permissions.
 *
 * The destination directory is created first if necessary. The extension is
 * chosen by `deriveExtension` from the URL or the Content-Type header. The
 * whole body is buffered in memory before it is written.
 *
 * @param url - Remote resource to download.
 * @param destDir - Directory to save into; resolved to an absolute path.
 * @returns The absolute path to the downloaded file.
 * @throws If the response status is not OK, or if fetching or writing fails.
 */
export async function downloadFile(
  url: string,
  destDir: string,
): Promise<string> {
  const targetDir = path.resolve(destDir);

  // Recursive so that missing parent directories are created too.
  await mkdir(targetDir, { recursive: true });
  logger.debug({ url, destDir: targetDir }, "开始下载文件");

  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(
      `Download failed: HTTP ${String(res.status)} ${res.statusText} for ${url}`,
    );
  }

  const mime = res.headers.get("content-type");
  const suffix = deriveExtension(url, mime);
  const targetPath = path.join(targetDir, randomUUID() + suffix);

  const payload = Buffer.from(await res.arrayBuffer());
  await writeFile(targetPath, payload, { mode: 0o600 });

  logger.debug(
    { path: targetPath, bytes: payload.length, mime },
    "文件下载完成",
  );
  return targetPath;
}
// ---------------------------------------------------------------------------
// cleanupFile
// ---------------------------------------------------------------------------
/**
 * Remove a local file, treating "already gone" as success.
 *
 * @param filePath - File to delete.
 * @throws Any unlink failure other than ENOENT (e.g. the path is a directory).
 */
export async function cleanupFile(filePath: string): Promise<void> {
  try {
    await unlink(filePath);
  } catch (err: unknown) {
    const alreadyGone =
      err instanceof Error && "code" in err && err.code === "ENOENT";
    if (!alreadyGone) {
      throw err;
    }
    // Nothing was deleted, so there is nothing to report.
    return;
  }
  logger.debug({ path: filePath }, "临时文件已清理");
}
+136
View File
@@ -0,0 +1,136 @@
import { logger } from './logger.js';
// ---------------------------------------------------------------------------
// Error classification
// ---------------------------------------------------------------------------
/**
* Coarse error categories reported to MCP clients. Each member's value is
* its own name, so the category serialises as a readable string.
*/
export enum ErrorCategory {
/** The error name or message mentions a timeout. */
TIMEOUT = 'TIMEOUT',
/** The message mentions logging in ("login" / "登录"). */
AUTH_REQUIRED = 'AUTH_REQUIRED',
/** An expected element was not found ("waiting for selector" / "找不到元素"). */
SELECTOR_NOT_FOUND = 'SELECTOR_NOT_FOUND',
/** A browser network failure (message contains "net::ERR_"). */
NETWORK = 'NETWORK',
/** NOTE(review): not produced by `classifyError` in this file; presumably set by platform code. */
PLATFORM_ERROR = 'PLATFORM_ERROR',
/** Fallback for everything that matches no other category. */
INTERNAL = 'INTERNAL',
}
/**
 * Map an error onto an `ErrorCategory` by searching its name and message
 * (case-insensitively) for tell-tale phrases.
 *
 * Checks run in a fixed order and the first hit wins:
 * selector → timeout → network → auth → internal (fallback).
 *
 * @param err - The error to classify.
 * @returns The first matching category, or `INTERNAL` when nothing matches.
 */
export function classifyError(err: Error): ErrorCategory {
  const text = `${err.name} ${err.message}`.toLowerCase();
  const mentions = (...phrases: string[]): boolean =>
    phrases.some((phrase) => text.includes(phrase));

  // Selector before timeout: Playwright's selector timeout message
  // ("Timeout waiting for selector ...") contains both keywords, and the
  // more specific category has to win.
  if (mentions('waiting for selector', '找不到元素')) {
    return ErrorCategory.SELECTOR_NOT_FOUND;
  }

  if (mentions('timeout') || err.name === 'TimeoutError') {
    return ErrorCategory.TIMEOUT;
  }

  if (mentions('net::err_')) {
    return ErrorCategory.NETWORK;
  }

  if (mentions('login', '登录')) {
    return ErrorCategory.AUTH_REQUIRED;
  }

  return ErrorCategory.INTERNAL;
}
// ---------------------------------------------------------------------------
// Message sanitization
// ---------------------------------------------------------------------------
/**
 * Scrub an error message before it leaves the process.
 *
 * Redactions are applied in this order, then the result is cut to at most
 * 200 characters:
 *
 * 1. URLs (`http://` / `https://`)                          -> `[url]`
 * 2. Unix-style paths with at least two segments (`/a/b`)   -> `[path]`
 * 3. Runs of 32 or more hexadecimal characters              -> `[hash]`
 *
 * URLs go first so that the path pattern cannot match part of one.
 *
 * @param message - Raw error message.
 * @returns The redacted, length-limited message.
 */
export function sanitizeErrorMessage(message: string): string {
  const redactions: ReadonlyArray<readonly [RegExp, string]> = [
    [/https?:\/\/[^\s)'"]+/g, '[url]'],
    [/\/(?:[^\s/]+\/)+[^\s/)'":]*/g, '[path]'],
    [/[0-9a-fA-F]{32,}/g, '[hash]'],
  ];

  const redacted = redactions.reduce(
    (text, [pattern, label]) => text.replace(pattern, label),
    message,
  );

  // slice() is a no-op for strings that are already short enough.
  return redacted.slice(0, 200);
}
// ---------------------------------------------------------------------------
// MCP tool result type
// ---------------------------------------------------------------------------
/**
* Result shape returned by MCP tool handlers in this module: a list of text
* parts plus an optional error flag (set to `true` by `withErrorHandling`
* when the handler threw).
*/
export type McpToolResult = {
content: Array<{ type: 'text'; text: string }>;
isError?: boolean;
};
// ---------------------------------------------------------------------------
// Error-handling wrapper
// ---------------------------------------------------------------------------
/**
 * Run an MCP tool handler and convert anything it throws into a structured
 * MCP error result instead of letting the exception escape.
 *
 * On failure the thrown value is wrapped in an `Error` if it is not one
 * already, classified, sanitized, and logged in full; the caller receives a
 * single text part containing the JSON `{ tool, error, message }` with
 * `isError: true`.
 *
 * Usage:
 * ```ts
 * const result = await withErrorHandling('publish_post', async () => {
 *   // ... tool logic that returns McpToolResult
 * });
 * ```
 *
 * @param toolName - Tool name included in the log entry and the error payload.
 * @param fn - The handler to execute.
 * @returns The handler's own result, or the error result described above.
 */
export async function withErrorHandling(
  toolName: string,
  fn: () => Promise<McpToolResult>,
): Promise<McpToolResult> {
  let thrown: unknown;
  try {
    return await fn();
  } catch (caught: unknown) {
    thrown = caught;
  }

  // Only reached when the handler failed.
  const err = thrown instanceof Error ? thrown : new Error(String(thrown));
  const category = classifyError(err);
  const message = sanitizeErrorMessage(err.message);

  // The log keeps the original error; only the client payload is sanitized.
  logger.error(
    { tool: toolName, category, err },
    '工具执行失败',
  );

  return {
    content: [
      {
        type: 'text',
        text: JSON.stringify({ tool: toolName, error: category, message }),
      },
    ],
    isError: true,
  };
}
+68
View File
@@ -0,0 +1,68 @@
import pino from "pino";
/** True only when NODE_ENV is exactly "production". */
const isProduction = process.env["NODE_ENV"] === "production";
// In production, remove the DEBUG environment variable at module load so
// that Playwright debug output, which bypasses pino, is suppressed.
if (isProduction) {
delete process.env["DEBUG"];
}
/**
* Paths handed to pino's `redact` option; matching values are replaced by
* the censor string configured on the logger.
*
* NOTE(review): pino's documented redact path syntax uses `*` as a
* single-level wildcard. Confirm that the `**` prefix used here is accepted
* and matches at any depth with the installed pino version, and that keys
* containing a hyphen (`set-cookie`) do not need bracket notation.
*/
const redactPaths: string[] = [
// Auth & credentials
"**.cookie",
"**.cookies",
"**.set-cookie",
"**.authorization",
"**.password",
"**.secret",
// Tokens
"**.token",
"**.xsec_token",
"**.access_token",
"**.refresh_token",
// API keys
"**.api_key",
"**.apikey",
// Sessions
"**.sessionid",
"**.session_id",
// Playwright StorageState structures
"**.cookies[*].value",
"**.origins[*].localStorage[*].value",
];
/**
 * Serializer used for the `err` / `error` log fields.
 *
 * Emits the error's constructor name (or "Error" when it has none) and its
 * message. The stack trace is added only outside production, and only when
 * the error actually has one.
 */
const errorSerializer = (err: Error): Record<string, unknown> => {
  const includeStack = !isProduction && Boolean(err.stack);
  return {
    type: err.constructor?.name ?? "Error",
    message: err.message,
    ...(includeStack ? { stack: err.stack } : {}),
  };
};
/**
* Shared pino logger.
*
* - Level comes from LOG_LEVEL and defaults to "info".
* - Values at `redactPaths` are replaced with "[REDACTED]".
* - `err` and `error` fields go through `errorSerializer`.
* - Outside production, output is routed through the "pino-pretty"
*   transport; in production no transport is configured.
*/
export const logger: pino.Logger = pino({
level: process.env["LOG_LEVEL"] ?? "info",
redact: {
paths: redactPaths,
censor: "[REDACTED]",
},
serializers: {
err: errorSerializer,
error: errorSerializer,
},
// Spread an empty object in production so that no `transport` key is set.
...(isProduction
? {}
: {
transport: {
target: "pino-pretty",
},
}),
});