Files
social-mcp/apps/xhh-mcp/src/platforms/xiaoheihe/feeds.ts
T

269 lines
9.1 KiB
TypeScript

import type { Page } from 'rebrowser-playwright';
import { logger } from '@social/core/utils/logger.js';
import { XHH_SELECTORS } from './selectors.js';
import type { Feed } from './types.js';
import {
detectCaptchaText,
extractLinkIdFromUrl,
firstNonEmpty,
parseCountString,
} from './extractors.js';
// Home page of the Xiaoheihe BBS; listFeeds navigates here before extracting feeds.
const HOME_URL = 'https://www.xiaoheihe.cn/app/bbs/home';
// Module-scoped child logger created with the binding `module: 'xhh-feeds'`.
const log = logger.child({ module: 'xhh-feeds' });
/**
 * Loosely-typed feed record as produced by the Nuxt and DOM extractors,
 * before `dedupeAndNormalize` turns it into a `Feed`.
 *
 * Every field is optional because either extractor may fail to find it.
 */
interface RawFeedCandidate {
  // --- identity ---
  /** Post identifier when the source exposes one; otherwise it is derived from `linkUrl`. */
  id?: string;
  /** Absolute, root-relative or scheme-less URL of the post. */
  linkUrl?: string;

  // --- content ---
  title?: string;
  description?: string;
  coverUrl?: string;

  // --- counters (unparsed; handed to `parseCountString` during normalization) ---
  likeCount?: string | number;
  commentCount?: string | number;

  // --- author ---
  /** Either a raw user id (Nuxt data) or the `href` of the author's profile link (DOM). */
  userId?: string;
  nickname?: string;
  avatar?: string;
}
/**
 * Opens the BBS home page and returns the feeds found on it.
 *
 * Candidates are gathered from the Nuxt payload first and from the rendered
 * DOM second; `dedupeAndNormalize` keeps the first candidate seen per post id.
 *
 * @param page - Playwright page used for navigation and in-page evaluation.
 * @returns The normalized, de-duplicated feeds (possibly empty).
 * @throws Error whose message starts with `CAPTCHA_REQUIRED` when the page
 *   body text is recognised by `detectCaptchaText`.
 */
export async function listFeeds(page: Page): Promise<Feed[]> {
  await page.goto(HOME_URL, { waitUntil: 'domcontentloaded' });
  // Fixed pause after DOMContentLoaded — presumably to let client-side rendering settle.
  await page.waitForTimeout(1_500);

  // A failure to read the body is treated as "no text", i.e. no captcha detected.
  const bodyText = await page.textContent('body').catch(() => '');
  if (bodyText && detectCaptchaText(bodyText)) {
    throw new Error('CAPTCHA_REQUIRED: captcha detected on feeds page');
  }

  // Run the extractors one after the other; Nuxt candidates go first so they
  // win when both sources describe the same post.
  const fromNuxt = await extractFeedsFromNuxt(page);
  const fromDom = await extractFeedsFromDom(page);
  const feeds = dedupeAndNormalize(fromNuxt.concat(fromDom));

  log.info({ count: feeds.length }, 'xhh feeds extracted');
  return feeds;
}
/**
 * Searches feeds for `keyword`.
 *
 * Navigates to the search route, extracts whatever feeds are present and keeps
 * those whose title, description or author nickname contains the keyword
 * (case-insensitive). When the search page yields no feeds at all, the home
 * feeds are loaded via `listFeeds` and filtered the same way.
 *
 * @param page - Playwright page used for navigation and in-page evaluation.
 * @param keyword - Search term; URL-encoded for the query string.
 * @returns Feeds mentioning the keyword (possibly empty).
 * @throws Error whose message starts with `CAPTCHA_REQUIRED` when the page
 *   body text is recognised by `detectCaptchaText`.
 */
export async function searchFeeds(page: Page, keyword: string): Promise<Feed[]> {
  const query = encodeURIComponent(keyword);
  await page.goto(`https://www.xiaoheihe.cn/app/bbs/search?keyword=${query}`, {
    waitUntil: 'domcontentloaded',
  });
  await page.waitForTimeout(1_200);

  const bodyText = await page.textContent('body').catch(() => '');
  if (bodyText && detectCaptchaText(bodyText)) {
    throw new Error('CAPTCHA_REQUIRED: captcha detected on search page');
  }

  // Shared predicate for both the search results and the home-feed fallback.
  const mentionsKeyword = (feed: Feed): boolean => {
    const haystack = `${feed.title} ${feed.description} ${feed.user.nickname}`.toLowerCase();
    return haystack.includes(keyword.toLowerCase());
  };

  const fromNuxt = await extractFeedsFromNuxt(page);
  const fromDom = await extractFeedsFromDom(page);
  const searchResults = dedupeAndNormalize([...fromNuxt, ...fromDom]);

  // Fallback: when search route structure changes, use home feeds and filter.
  const pool = searchResults.length > 0 ? searchResults : await listFeeds(page);
  return pool.filter(mentionsKeyword);
}
/**
 * Collects feed candidates from the page's Nuxt payload.
 *
 * Inside the page, `window.__NUXT_DATA__` (or `window.__NUXT__.data`) is walked
 * depth-first; every plain object that carries a non-empty string id, a
 * non-empty string title, or a URL containing `/app/bbs/link/` is collected,
 * capped at the first 500 hits. Back in Node each hit is projected onto
 * `RawFeedCandidate`.
 *
 * Any failure of the in-page evaluation is swallowed and yields an empty list.
 *
 * @param page - Playwright page that has already navigated to the target URL.
 * @returns Raw candidates in traversal order (parents before their children).
 */
async function extractFeedsFromNuxt(page: Page): Promise<RawFeedCandidate[]> {
  const harvested = await page
    .evaluate(() => {
      // This callback runs in the browser page, so it cannot use anything
      // from the enclosing module; all helpers are declared locally.
      const ID_KEYS = ['link_id', 'linkid', 'id', 'post_id'];
      const URL_KEYS = ['link_url', 'url'];
      const TITLE_KEYS = ['title', 'subject'];

      /** First non-empty string found under `keys`, or '' when there is none. */
      function firstFilledString(record: Record<string, unknown>, keys: string[]): string {
        for (const key of keys) {
          const candidate = record[key];
          if (typeof candidate === 'string' && candidate) return candidate;
        }
        return '';
      }

      const host = window as { __NUXT_DATA__?: unknown; __NUXT__?: { data?: unknown } };
      const payload: unknown = host.__NUXT_DATA__ ?? host.__NUXT__?.data ?? null;

      const collected: Array<Record<string, unknown>> = [];
      // Guards against cycles and repeated references in the payload graph.
      const seen = new Set<unknown>();

      function visit(node: unknown): void {
        if (!node || typeof node !== 'object' || seen.has(node)) return;
        seen.add(node);

        if (Array.isArray(node)) {
          node.forEach((child) => visit(child));
          return;
        }

        const record = node as Record<string, unknown>;
        const looksLikeFeed =
          firstFilledString(record, ID_KEYS) !== '' ||
          firstFilledString(record, URL_KEYS).includes('/app/bbs/link/') ||
          firstFilledString(record, TITLE_KEYS) !== '';
        // Record the object before descending so parents precede children.
        if (looksLikeFeed) collected.push(record);

        Object.values(record).forEach((child) => visit(child));
      }

      visit(payload);
      return collected.slice(0, 500);
    })
    .catch(() => []);

  const rows = harvested as Array<Record<string, unknown>>;
  return rows.map((row) => {
    // Author details may live under `user` or `author`.
    const author = (row['user'] ?? row['author']) as Record<string, unknown> | undefined;

    const id = firstNonEmpty(
      valueString(row['link_id']),
      valueString(row['linkid']),
      valueString(row['post_id']),
      valueString(row['id']),
    );
    const title = firstNonEmpty(valueString(row['title']), valueString(row['subject']));
    const description = firstNonEmpty(
      valueString(row['description']),
      valueString(row['content']),
      valueString(row['desc']),
    );
    const coverUrl = firstNonEmpty(
      valueString(row['cover']),
      valueString(row['cover_url']),
      valueString(row['image']),
    );
    const likeCount = valueString(row['like_count']) || valueString(row['likes']);
    const commentCount = valueString(row['comment_count']) || valueString(row['comments']);
    const userId = firstNonEmpty(
      valueString(author?.['userid']),
      valueString(author?.['user_id']),
      valueString(row['userid']),
    );
    const nickname = firstNonEmpty(
      valueString(author?.['nickname']),
      valueString(author?.['name']),
      valueString(row['nickname']),
    );
    const avatar = firstNonEmpty(
      valueString(author?.['avatar']),
      valueString(author?.['avatar_url']),
    );
    const linkUrl = firstNonEmpty(
      valueString(row['link_url']),
      valueString(row['url']),
      valueString(row['jump_url']),
    );

    return {
      id,
      title,
      description,
      coverUrl,
      likeCount,
      commentCount,
      userId,
      nickname,
      avatar,
      linkUrl,
    };
  });
}
/**
 * Collects feed candidates from the rendered DOM using `XHH_SELECTORS`.
 *
 * Every anchor matching `selectors.feed.link` that has an `href` becomes one
 * candidate. The surrounding card (closest card-selector match, else the
 * anchor's parent) is searched for title, description, cover, author and
 * counters; anything missing is reported as an empty string.
 *
 * Any failure of the in-page evaluation is swallowed and yields an empty list.
 *
 * @param page - Playwright page that has already navigated to the target URL.
 * @returns One raw candidate per matching anchor, in document order.
 */
async function extractFeedsFromDom(page: Page): Promise<RawFeedCandidate[]> {
  return page
    .evaluate((selectors) => {
      // This callback runs in the browser page; helpers must be declared here.
      const { feed } = selectors;

      // Selector lists are OR-ed together into a single CSS selector each.
      const cardSelector = feed.card.join(', ');
      const titleSelector = feed.title.join(', ');
      const descSelector = feed.description.join(', ');
      const userNameSelector = feed.userName.join(', ');
      const likeSelector = feed.likeCount.join(', ');
      const commentSelector = feed.commentCount.join(', ');

      /** Trimmed text of the first match inside `scope`, or '' when absent. */
      function trimmedText(scope: Element | null, selector: string): string {
        return (scope?.querySelector(selector)?.textContent ?? '').trim();
      }

      /** `src` of the first image matching `selector` inside `scope`, or ''. */
      function imageSrc(scope: Element | null, selector: string): string {
        return (scope?.querySelector(selector) as HTMLImageElement | null)?.src ?? '';
      }

      return [...document.querySelectorAll<HTMLAnchorElement>(feed.link)]
        .filter((anchor) => Boolean(anchor.getAttribute('href')))
        .map((anchor): RawFeedCandidate => {
          const card = anchor.closest(cardSelector) ?? anchor.parentElement;
          const authorLink = card?.querySelector(feed.userLink) as HTMLAnchorElement | null;

          return {
            linkUrl: anchor.getAttribute('href') ?? '',
            title: trimmedText(card, titleSelector),
            description: trimmedText(card, descSelector),
            coverUrl: imageSrc(card, feed.cover),
            // The profile href is stored as-is; the id is extracted during normalization.
            userId: authorLink?.getAttribute('href') ?? '',
            nickname: trimmedText(card, userNameSelector),
            // NOTE(review): this is the first <img> in the card, which may be
            // the cover rather than the author's avatar — confirm.
            avatar: imageSrc(card, 'img'),
            likeCount: trimmedText(card, likeSelector),
            commentCount: trimmedText(card, commentSelector),
          };
        });
    }, XHH_SELECTORS)
    .catch(() => []);
}
/**
 * Converts raw candidates into `Feed` objects, one per post id.
 *
 * The post id is the candidate's own `id`, falling back to the id parsed out
 * of its `linkUrl`. Candidates without any id are dropped, and when several
 * candidates share an id only the first one is kept.
 *
 * @param items - Raw candidates in priority order (earlier wins on conflict).
 * @returns Normalized feeds in order of first appearance.
 */
function dedupeAndNormalize(items: RawFeedCandidate[]): Feed[] {
  // A Map iterates in insertion order, so it doubles as the "seen" set and
  // as the ordered output.
  const feedsById = new Map<string, Feed>();

  for (const raw of items) {
    const idFromUrl = raw.linkUrl ? extractLinkIdFromUrl(raw.linkUrl) ?? '' : '';
    const feedId = firstNonEmpty(raw.id, idFromUrl);
    if (!feedId || feedsById.has(feedId)) continue;

    const authorId = raw.userId ? extractUserIdFromMaybeHref(raw.userId) : '';

    feedsById.set(feedId, {
      id: feedId,
      title: raw.title?.trim() ?? '',
      description: raw.description?.trim() ?? '',
      coverUrl: raw.coverUrl?.trim() ?? '',
      likeCount: parseCountString(raw.likeCount),
      commentCount: parseCountString(raw.commentCount),
      user: {
        id: authorId,
        nickname: raw.nickname?.trim() ?? '',
        avatar: raw.avatar?.trim() ?? '',
      },
      linkUrl: normalizeLinkUrl(raw.linkUrl, feedId),
    });
  }

  return [...feedsById.values()];
}
/**
 * Turns whatever URL a candidate carried into an absolute `http(s)` URL.
 *
 * - empty / missing      -> canonical post URL built from `linkId`
 * - `http://` / `https://` -> returned unchanged (after trimming)
 * - `//host/path`        -> protocol-relative; `https:` is prepended
 * - `/path`              -> resolved against `https://www.xiaoheihe.cn`
 * - anything else        -> treated as scheme-less and prefixed with `https://`
 *
 * @param rawUrl - URL as found in the page data or an anchor's `href`.
 * @param linkId - Post id used to build the fallback URL.
 * @returns An absolute URL string.
 */
function normalizeLinkUrl(rawUrl: string | undefined, linkId: string): string {
  const origin = 'https://www.xiaoheihe.cn';
  const trimmed = rawUrl?.trim() ?? '';

  if (!trimmed) return `${origin}/app/bbs/link/${linkId}`;
  if (/^https?:\/\//i.test(trimmed)) return trimmed;
  // Must be checked before the single-slash case: a protocol-relative URL
  // already names its host and only lacks the scheme.
  if (trimmed.startsWith('//')) return `https:${trimmed}`;
  if (trimmed.startsWith('/')) return `${origin}${trimmed}`;
  return `https://${trimmed}`;
}
/**
 * Extracts a numeric user id from a value that is either the id itself or a
 * profile link.
 *
 * The DOM extractor supplies the `href` of the author link
 * (`/app/user/profile/<id>`), whereas the Nuxt extractor supplies the bare id.
 * Both forms are accepted; anything else yields an empty string.
 *
 * @param raw - Bare numeric id, or a relative/absolute profile URL.
 * @returns The digits of the user id, or '' when none can be found.
 */
function extractUserIdFromMaybeHref(raw: string): string {
  const value = raw.trim();
  // Already a bare id: return it instead of discarding it for not being a URL.
  if (/^\d+$/.test(value)) return value;
  // The pattern is path-based, so it matches relative and absolute URLs alike.
  return value.match(/\/app\/user\/profile\/(\d+)/)?.[1] ?? '';
}
/**
 * Coerces a loosely-typed payload value to a string.
 *
 * Strings pass through unchanged, numbers are stringified, and every other
 * type (including `null`, `undefined`, booleans and objects) becomes ''.
 *
 * @param value - Arbitrary value read from untyped page data.
 * @returns The string form, or '' when the value is neither string nor number.
 */
function valueString(value: unknown): string {
  switch (typeof value) {
    case 'string':
      return value;
    case 'number':
      return String(value);
    default:
      return '';
  }
}