// File listing metadata: 269 lines, 9.1 KiB, TypeScript
import type { Page } from 'rebrowser-playwright';
|
|
|
|
import { logger } from '@social/core/utils/logger.js';
|
|
import { XHH_SELECTORS } from './selectors.js';
|
|
import type { Feed } from './types.js';
|
|
import {
|
|
detectCaptchaText,
|
|
extractLinkIdFromUrl,
|
|
firstNonEmpty,
|
|
parseCountString,
|
|
} from './extractors.js';
|
|
|
|
/** Child logger created with the `module: 'xhh-feeds'` binding. */
const log = logger.child({ module: 'xhh-feeds' });

/** URL that `listFeeds` navigates to before extracting feeds. */
const HOME_URL = 'https://www.xiaoheihe.cn/app/bbs/home';
|
|
|
|
/**
 * Un-normalised feed record produced by the extractors in this file, before
 * `dedupeAndNormalize` turns it into a `Feed`.
 *
 * Every field is optional: the extractors fill in whatever they can find and
 * leave the rest empty.
 */
interface RawFeedCandidate {
  // --- post identity -------------------------------------------------------
  /** Post/link identifier, when the source exposes one directly. */
  id?: string;
  /** Link to the post; may be absolute, site-relative, or missing. */
  linkUrl?: string;

  // --- post content --------------------------------------------------------
  /** Post title. */
  title?: string;
  /** Post body / summary text. */
  description?: string;
  /** Cover image URL. */
  coverUrl?: string;

  // --- counters (raw text or number; parsed later by `parseCountString`) ---
  likeCount?: string | number;
  commentCount?: string | number;

  // --- author --------------------------------------------------------------
  /** Either a raw user id or the `href` of a user link, depending on the extractor. */
  userId?: string;
  /** Author display name. */
  nickname?: string;
  /** Author avatar image URL. */
  avatar?: string;
}
|
|
|
|
/**
 * Opens the home feed page and returns the feeds found on it.
 *
 * Candidates are gathered from the Nuxt payload first and from the rendered
 * DOM second, then passed through `dedupeAndNormalize`.
 *
 * @param page - Playwright page used for navigation and extraction.
 * @returns The de-duplicated, normalised feeds.
 * @throws Error whose message starts with `CAPTCHA_REQUIRED:` when
 *   `detectCaptchaText` flags the page body text.
 */
export async function listFeeds(page: Page): Promise<Feed[]> {
  await page.goto(HOME_URL, { waitUntil: 'domcontentloaded' });
  // Fixed pause after DOMContentLoaded before the page is inspected.
  await page.waitForTimeout(1_500);

  // Best effort: a failure to read the body is treated as "no text".
  const bodyText = await page.textContent('body').catch(() => '');
  if (bodyText) {
    if (detectCaptchaText(bodyText)) {
      throw new Error('CAPTCHA_REQUIRED: captcha detected on feeds page');
    }
  }

  const fromNuxt = await extractFeedsFromNuxt(page);
  const fromDom = await extractFeedsFromDom(page);

  // Nuxt candidates come first, so they win when both sources share a link id.
  const feeds = dedupeAndNormalize(fromNuxt.concat(fromDom));
  log.info({ count: feeds.length }, 'xhh feeds extracted');
  return feeds;
}
|
|
|
|
/**
 * Searches feeds for `keyword`.
 *
 * Loads the search route, extracts candidates from the Nuxt payload and the
 * DOM, and keeps those whose title, description or author nickname contains
 * the keyword (case-insensitive). When the search page yields no feeds at
 * all, falls back to filtering the home feeds from `listFeeds`.
 *
 * @param page - Playwright page used for navigation and extraction.
 * @param keyword - Text to search for; URL-encoded into the search route.
 * @returns Feeds matching the keyword (possibly empty).
 * @throws Error whose message starts with `CAPTCHA_REQUIRED:` when
 *   `detectCaptchaText` flags the page body text (here or in `listFeeds`).
 */
export async function searchFeeds(page: Page, keyword: string): Promise<Feed[]> {
  const targetUrl = `https://www.xiaoheihe.cn/app/bbs/search?keyword=${encodeURIComponent(keyword)}`;
  await page.goto(targetUrl, { waitUntil: 'domcontentloaded' });
  await page.waitForTimeout(1_200);

  // Best effort: a failure to read the body is treated as "no text".
  const text = await page.textContent('body').catch(() => '');
  if (text && detectCaptchaText(text)) {
    throw new Error('CAPTCHA_REQUIRED: captcha detected on search page');
  }

  // Lower-case the keyword once and share one predicate between both paths.
  const needle = keyword.toLowerCase();
  const matchesKeyword = (item: Feed): boolean =>
    `${item.title} ${item.description} ${item.user.nickname}`.toLowerCase().includes(needle);

  const combined = dedupeAndNormalize([
    ...(await extractFeedsFromNuxt(page)),
    ...(await extractFeedsFromDom(page)),
  ]);

  if (combined.length > 0) {
    return combined.filter(matchesKeyword);
  }

  // Fallback: when search route structure changes, use home feeds and filter.
  const homeFeeds = await listFeeds(page);
  return homeFeeds.filter(matchesKeyword);
}
|
|
|
|
/**
 * Collects feed-like objects from the page's Nuxt state.
 *
 * Inside the page, the value at `window.__NUXT_DATA__` (or, failing that,
 * `window.__NUXT__.data`) is walked depth-first. Any plain object that has a
 * non-empty string id (`link_id`, `linkid`, `id`, `post_id`), a non-empty
 * string title (`title`, `subject`), or a `link_url`/`url` containing
 * `/app/bbs/link/` is collected, up to 500 objects. Each collected object is
 * then mapped to a `RawFeedCandidate`.
 *
 * NOTE(review): on Nuxt 3 pages `__NUXT_DATA__` is normally the id of a JSON
 * `<script>` element rather than a plain global — confirm this lookup yields
 * payload data on the target site.
 *
 * @param page - Playwright page whose window state is inspected.
 * @returns The mapped candidates; an empty array if the in-page evaluation fails.
 */
async function extractFeedsFromNuxt(page: Page): Promise<RawFeedCandidate[]> {
  const candidates: Array<Record<string, unknown>> = await page
    .evaluate(() => {
      // Everything in this callback runs in the browser and must be self-contained.
      const maxCandidates = 500;

      const root: unknown =
        (window as { __NUXT_DATA__?: unknown }).__NUXT_DATA__ ??
        (window as { __NUXT__?: { data?: unknown } }).__NUXT__?.data ??
        null;

      const out: Array<Record<string, unknown>> = [];
      // Guards against cycles and shared references in the payload.
      const visited = new Set<unknown>();

      /** First non-empty string found under `keys`, or ''. */
      function firstString(obj: Record<string, unknown>, keys: string[]): string {
        for (const key of keys) {
          const candidate = obj[key];
          if (typeof candidate === 'string' && candidate) return candidate;
        }
        return '';
      }

      function walk(value: unknown): void {
        // Stop descending once the cap is reached; nothing further would be kept.
        if (out.length >= maxCandidates) return;
        if (!value || typeof value !== 'object') return;
        if (visited.has(value)) return;
        visited.add(value);

        if (Array.isArray(value)) {
          for (const item of value) walk(item);
          return;
        }

        const obj = value as Record<string, unknown>;
        const id = firstString(obj, ['link_id', 'linkid', 'id', 'post_id']);
        const url = firstString(obj, ['link_url', 'url']);
        const title = firstString(obj, ['title', 'subject']);

        if (id || title || url.includes('/app/bbs/link/')) {
          out.push(obj);
        }

        for (const next of Object.values(obj)) {
          walk(next);
        }
      }

      walk(root);
      return out;
    })
    // Best effort: any evaluation failure yields no Nuxt candidates.
    .catch(() => []);

  return candidates.map((item) => {
    // The author may be under `user` or `author`; anything that is not an
    // object is treated as "no author fields".
    const rawUser = item['user'] ?? item['author'];
    const user: Record<string, unknown> =
      rawUser !== null && typeof rawUser === 'object' ? (rawUser as Record<string, unknown>) : {};

    return {
      id: firstNonEmpty(
        valueString(item['link_id']),
        valueString(item['linkid']),
        valueString(item['post_id']),
        valueString(item['id']),
      ),
      title: firstNonEmpty(valueString(item['title']), valueString(item['subject'])),
      description: firstNonEmpty(
        valueString(item['description']),
        valueString(item['content']),
        valueString(item['desc']),
      ),
      coverUrl: firstNonEmpty(
        valueString(item['cover']),
        valueString(item['cover_url']),
        valueString(item['image']),
      ),
      likeCount: valueString(item['like_count']) || valueString(item['likes']),
      commentCount: valueString(item['comment_count']) || valueString(item['comments']),
      userId: firstNonEmpty(
        valueString(user['userid']),
        valueString(user['user_id']),
        valueString(item['userid']),
      ),
      nickname: firstNonEmpty(
        valueString(user['nickname']),
        valueString(user['name']),
        valueString(item['nickname']),
      ),
      avatar: firstNonEmpty(valueString(user['avatar']), valueString(user['avatar_url'])),
      linkUrl: firstNonEmpty(
        valueString(item['link_url']),
        valueString(item['url']),
        valueString(item['jump_url']),
      ),
    };
  });
}
|
|
|
|
/**
 * Collects feed candidates from the rendered DOM.
 *
 * For every anchor matching `XHH_SELECTORS.feed.link` that has an `href`, the
 * enclosing card (closest card-selector match, else the anchor's parent) is
 * queried for title, description, cover, author and counters.
 *
 * The avatar is taken from an `<img>` inside the user link when there is one,
 * and otherwise from the first `<img>` in the card.
 *
 * @param page - Playwright page whose DOM is inspected.
 * @returns One candidate per matching anchor; an empty array if the in-page
 *   evaluation fails.
 */
async function extractFeedsFromDom(page: Page): Promise<RawFeedCandidate[]> {
  return page
    .evaluate((selectors) => {
      // Everything in this callback runs in the browser and must be self-contained.
      const cardSelector = selectors.feed.card.join(', ');
      const titleSelector = selectors.feed.title.join(', ');
      const descSelector = selectors.feed.description.join(', ');
      const userNameSelector = selectors.feed.userName.join(', ');
      const likeSelector = selectors.feed.likeCount.join(', ');
      const commentSelector = selectors.feed.commentCount.join(', ');

      /** Trimmed text of the first match under `scope`, or '' when absent. */
      const textOf = (scope: Element | null, selector: string): string =>
        (scope?.querySelector(selector)?.textContent ?? '').trim();

      const anchors = [...document.querySelectorAll<HTMLAnchorElement>(selectors.feed.link)].filter(
        (a) => Boolean(a.getAttribute('href')),
      );

      const feeds: RawFeedCandidate[] = [];

      for (const link of anchors) {
        const card = link.closest(cardSelector) ?? link.parentElement;
        const userNode = card?.querySelector<HTMLAnchorElement>(selectors.feed.userLink) ?? null;
        // Prefer the image inside the author link so the cover is not mistaken
        // for the avatar; fall back to the first image in the card.
        const avatarNode = userNode?.querySelector('img') ?? card?.querySelector('img') ?? null;

        feeds.push({
          linkUrl: link.getAttribute('href') ?? '',
          title: textOf(card, titleSelector),
          description: textOf(card, descSelector),
          coverUrl: card?.querySelector<HTMLImageElement>(selectors.feed.cover)?.src ?? '',
          // Raw href of the user link; the id is extracted from it later.
          userId: userNode?.getAttribute('href') ?? '',
          nickname: textOf(card, userNameSelector),
          avatar: avatarNode?.src ?? '',
          likeCount: textOf(card, likeSelector),
          commentCount: textOf(card, commentSelector),
        });
      }

      return feeds;
    }, XHH_SELECTORS)
    // Best effort: any evaluation failure yields no DOM candidates.
    .catch(() => []);
}
|
|
|
|
/**
 * Converts raw candidates into `Feed` objects, keeping one per link id.
 *
 * The link id is the candidate's `id` or, failing that, whatever
 * `extractLinkIdFromUrl` finds in its `linkUrl`. Candidates without a link id
 * are skipped, and only the first candidate for each id is kept. Input order
 * is preserved.
 *
 * @param items - Raw candidates, in priority order.
 * @returns Normalised feeds with trimmed text fields and parsed counters.
 */
function dedupeAndNormalize(items: RawFeedCandidate[]): Feed[] {
  const tidy = (text: string | undefined): string => text?.trim() ?? '';

  const emittedIds = new Set<string>();
  const feeds: Feed[] = [];

  items.forEach((candidate) => {
    const idFromUrl = candidate.linkUrl ? extractLinkIdFromUrl(candidate.linkUrl) ?? '' : '';
    const linkId = firstNonEmpty(candidate.id, idFromUrl);

    if (!linkId) return;
    if (emittedIds.has(linkId)) return;
    emittedIds.add(linkId);

    const canonicalUrl = normalizeLinkUrl(candidate.linkUrl, linkId);
    const authorId = candidate.userId ? extractUserIdFromMaybeHref(candidate.userId) : '';

    feeds.push({
      id: linkId,
      title: tidy(candidate.title),
      description: tidy(candidate.description),
      coverUrl: tidy(candidate.coverUrl),
      likeCount: parseCountString(candidate.likeCount),
      commentCount: parseCountString(candidate.commentCount),
      user: {
        id: authorId,
        nickname: tidy(candidate.nickname),
        avatar: tidy(candidate.avatar),
      },
      linkUrl: canonicalUrl,
    });
  });

  return feeds;
}
|
|
|
|
/**
 * Turns a possibly partial post URL into an absolute one.
 *
 * - empty / missing  -> canonical link URL built from `linkId`
 * - `http(s)://...`  -> returned unchanged
 * - `//host/path`    -> protocol-relative, prefixed with `https:`
 * - `/path`          -> resolved against `https://www.xiaoheihe.cn`
 * - anything else    -> prefixed with `https://`
 *
 * @param rawUrl - URL as scraped; may be undefined, blank or relative.
 * @param linkId - Link id used only when `rawUrl` is empty.
 * @returns An absolute URL string.
 */
function normalizeLinkUrl(rawUrl: string | undefined, linkId: string): string {
  const origin = 'https://www.xiaoheihe.cn';
  const trimmed = rawUrl?.trim() ?? '';

  if (!trimmed) return `${origin}/app/bbs/link/${linkId}`;
  if (/^https?:\/\//i.test(trimmed)) return trimmed;
  // Must come before the single-slash check: `//host/path` already names a
  // host and would otherwise be glued onto the site origin.
  if (trimmed.startsWith('//')) return `https:${trimmed}`;
  if (trimmed.startsWith('/')) return `${origin}${trimmed}`;
  return `https://${trimmed}`;
}
|
|
|
|
/**
 * Extracts a numeric user id from either a bare id or a profile link.
 *
 * Callers in this file pass two shapes: the `href` of a user link (from the
 * DOM extractor) and a raw id value (from the Nuxt extractor). A value made
 * up only of digits is returned as-is; otherwise the id is taken from a
 * `/app/user/profile/<digits>` path segment.
 *
 * @param raw - Bare user id, or an absolute/relative profile URL.
 * @returns The digits of the user id, or '' when none can be found.
 */
function extractUserIdFromMaybeHref(raw: string): string {
  const value = raw.trim();
  // Bare ids carry no profile path and would otherwise be discarded.
  if (/^\d+$/.test(value)) return value;
  return value.match(/\/app\/user\/profile\/(\d+)/)?.[1] ?? '';
}
|
|
|
|
/**
 * Coerces a loosely-typed payload value to a string.
 *
 * Strings pass through unchanged and finite numbers are stringified.
 * Everything else — including `NaN` and `±Infinity`, which would otherwise
 * surface as the literal text "NaN"/"Infinity" — yields ''.
 *
 * @param value - Arbitrary value read from scraped data.
 * @returns The string form, or '' when the value is not a usable scalar.
 */
function valueString(value: unknown): string {
  if (typeof value === 'string') return value;
  if (typeof value === 'number' && Number.isFinite(value)) return String(value);
  return '';
}
|
|
|