重构为Monorepo:拆分xhs/xhh应用与core包并完成双服务部署改造
This commit is contained in:
@@ -0,0 +1,268 @@
|
||||
import type { Page } from 'rebrowser-playwright';
|
||||
|
||||
import { logger } from '@social/core/utils/logger.js';
|
||||
import { XHH_SELECTORS } from './selectors.js';
|
||||
import type { Feed } from './types.js';
|
||||
import {
|
||||
detectCaptchaText,
|
||||
extractLinkIdFromUrl,
|
||||
firstNonEmpty,
|
||||
parseCountString,
|
||||
} from './extractors.js';
|
||||
|
||||
/** Home feed URL on xiaoheihe.cn; `listFeeds` navigates here before scraping. */
const HOME_URL = 'https://www.xiaoheihe.cn/app/bbs/home';
|
||||
/** Child logger whose bindings tag entries from this file with `module: 'xhh-feeds'`. */
const log = logger.child({ module: 'xhh-feeds' });
|
||||
|
||||
/**
 * Loosely-typed feed record as scraped from the page, before validation.
 *
 * Every field is optional because the extractors fill in whatever they can
 * find; `dedupeAndNormalize` turns these into strict `Feed` objects.
 */
interface RawFeedCandidate {
  // --- post identity ---
  /** Post id, when the source exposes one directly. */
  id?: string;
  /** Post URL; may be absolute, site-relative, or missing. */
  linkUrl?: string;

  // --- post content ---
  title?: string;
  description?: string;
  coverUrl?: string;

  // --- counters (unparsed; handed to `parseCountString` later) ---
  likeCount?: string | number;
  commentCount?: string | number;

  // --- author ---
  /** Either a bare user id (Nuxt state) or a profile-link href (DOM). */
  userId?: string;
  nickname?: string;
  avatar?: string;
}
|
||||
|
||||
/**
 * Loads the home page and returns the feeds found on it.
 *
 * Candidates are gathered from the Nuxt state first and from the rendered DOM
 * second, then de-duplicated by link id (the first occurrence of an id wins).
 *
 * @param page Playwright page to drive; it is navigated to the home URL.
 * @returns Normalized, de-duplicated feeds.
 * @throws Error with a `CAPTCHA_REQUIRED` message when the body text looks like a captcha wall.
 */
export async function listFeeds(page: Page): Promise<Feed[]> {
  await page.goto(HOME_URL, { waitUntil: 'domcontentloaded' });
  // Fixed grace period after DOMContentLoaded before reading the page.
  await page.waitForTimeout(1_500);

  // Best effort: a failure to read the body is treated as "no text".
  const bodyText = (await page.textContent('body').catch(() => '')) ?? '';
  if (bodyText !== '' && detectCaptchaText(bodyText)) {
    throw new Error('CAPTCHA_REQUIRED: captcha detected on feeds page');
  }

  const fromNuxt = await extractFeedsFromNuxt(page);
  const fromDom = await extractFeedsFromDom(page);
  const feeds = dedupeAndNormalize(fromNuxt.concat(fromDom));

  log.info({ count: feeds.length }, 'xhh feeds extracted');
  return feeds;
}
|
||||
|
||||
/**
 * Searches feeds by keyword.
 *
 * Opens the search route, scrapes it like the home page, and keeps only feeds
 * whose title, description or author nickname contains the keyword
 * (case-insensitive substring match). If the search page yields no feeds at
 * all, the home feeds are scraped and filtered the same way instead.
 *
 * @param page Playwright page to drive.
 * @param keyword Search term; URL-encoded into the query string.
 * @returns Feeds matching the keyword (possibly empty).
 * @throws Error with a `CAPTCHA_REQUIRED` message when a captcha wall is detected.
 */
export async function searchFeeds(page: Page, keyword: string): Promise<Feed[]> {
  const searchUrl = `https://www.xiaoheihe.cn/app/bbs/search?keyword=${encodeURIComponent(keyword)}`;
  await page.goto(searchUrl, { waitUntil: 'domcontentloaded' });
  await page.waitForTimeout(1_200);

  // Best effort: a failure to read the body is treated as "no text".
  const bodyText = (await page.textContent('body').catch(() => '')) ?? '';
  if (bodyText !== '' && detectCaptchaText(bodyText)) {
    throw new Error('CAPTCHA_REQUIRED: captcha detected on search page');
  }

  const needle = keyword.toLowerCase();
  const matchesKeyword = (feed: Feed): boolean =>
    `${feed.title} ${feed.description} ${feed.user.nickname}`.toLowerCase().includes(needle);

  const fromNuxt = await extractFeedsFromNuxt(page);
  const fromDom = await extractFeedsFromDom(page);
  const searchFeedsFound = dedupeAndNormalize([...fromNuxt, ...fromDom]);

  if (searchFeedsFound.length === 0) {
    // Fallback: when the search route structure changes, use home feeds and filter.
    const homeFeeds = await listFeeds(page);
    return homeFeeds.filter(matchesKeyword);
  }

  return searchFeedsFound.filter(matchesKeyword);
}
|
||||
|
||||
/**
 * Collects feed-like objects from the page's Nuxt state and maps them onto
 * raw feed candidates.
 *
 * Inside the page, the state (`window.__NUXT_DATA__`, else `window.__NUXT__.data`)
 * is walked depth-first; any plain object carrying a string id, a post link
 * URL or a title is collected, up to 500 objects. Back in Node, the known
 * field aliases of each object are collapsed into a `RawFeedCandidate`.
 *
 * Any failure while evaluating in the page yields an empty list.
 *
 * NOTE(review): in Nuxt 3, `__NUXT_DATA__` is normally the id of a JSON
 * `<script>` element, so `window.__NUXT_DATA__` may resolve to that element
 * (named access on `window`) and shadow the `__NUXT__` fallback — confirm
 * against the live page.
 *
 * @param page Playwright page that has already been navigated.
 * @returns Unvalidated candidates; may contain non-feed objects and duplicates.
 */
async function extractFeedsFromNuxt(page: Page): Promise<RawFeedCandidate[]> {
  const data: unknown = await page
    .evaluate(() => {
      // Upper bound on collected objects; the walk stops once it is reached.
      const MAX_CANDIDATES = 500;

      const root: unknown =
        (window as { __NUXT_DATA__?: unknown }).__NUXT_DATA__ ??
        (window as { __NUXT__?: { data?: unknown } }).__NUXT__?.data ??
        null;

      const out: Array<Record<string, unknown>> = [];
      // Guards against cycles and shared sub-objects in the state graph.
      const visited = new Set<unknown>();

      function walk(value: unknown): void {
        // Stop descending as soon as the cap is hit instead of walking the
        // whole state and truncating afterwards; the collected items are the
        // same first MAX_CANDIDATES in visit order.
        if (out.length >= MAX_CANDIDATES) return;
        if (!value || typeof value !== 'object') return;
        if (visited.has(value)) return;
        visited.add(value);

        if (Array.isArray(value)) {
          for (const item of value) walk(item);
          return;
        }

        const obj = value as Record<string, unknown>;

        const id =
          (typeof obj['link_id'] === 'string' && obj['link_id']) ||
          (typeof obj['linkid'] === 'string' && obj['linkid']) ||
          (typeof obj['id'] === 'string' && obj['id']) ||
          (typeof obj['post_id'] === 'string' && obj['post_id']) ||
          '';
        const url =
          (typeof obj['link_url'] === 'string' && obj['link_url']) ||
          (typeof obj['url'] === 'string' && obj['url']) ||
          '';
        const title =
          (typeof obj['title'] === 'string' && obj['title']) ||
          (typeof obj['subject'] === 'string' && obj['subject']) ||
          '';

        const hasLink = typeof url === 'string' && url.includes('/app/bbs/link/');
        if (id || hasLink || title) {
          out.push(obj);
        }

        // Keep descending: feed objects can be nested inside other matches.
        for (const next of Object.values(obj)) {
          walk(next);
        }
      }

      walk(root);
      return out;
    })
    .catch(() => []);

  // Playwright documents that `evaluate` resolves to `undefined` when the
  // returned value is not serializable; treat anything but an array as empty
  // rather than throwing on `.map`.
  if (!Array.isArray(data)) return [];

  return (data as Array<Record<string, unknown>>).map((item) => {
    const linkUrl = firstNonEmpty(
      valueString(item['link_url']),
      valueString(item['url']),
      valueString(item['jump_url']),
    );
    // Author info may live under `user` or `author`.
    const user = (item['user'] ?? item['author']) as Record<string, unknown> | undefined;
    return {
      id: firstNonEmpty(
        valueString(item['link_id']),
        valueString(item['linkid']),
        valueString(item['post_id']),
        valueString(item['id']),
      ),
      title: firstNonEmpty(valueString(item['title']), valueString(item['subject'])),
      description: firstNonEmpty(
        valueString(item['description']),
        valueString(item['content']),
        valueString(item['desc']),
      ),
      coverUrl: firstNonEmpty(
        valueString(item['cover']),
        valueString(item['cover_url']),
        valueString(item['image']),
      ),
      likeCount: valueString(item['like_count']) || valueString(item['likes']),
      commentCount: valueString(item['comment_count']) || valueString(item['comments']),
      userId: firstNonEmpty(
        valueString(user?.['userid']),
        valueString(user?.['user_id']),
        valueString(item['userid']),
      ),
      nickname: firstNonEmpty(
        valueString(user?.['nickname']),
        valueString(user?.['name']),
        valueString(item['nickname']),
      ),
      avatar: firstNonEmpty(
        valueString(user?.['avatar']),
        valueString(user?.['avatar_url']),
      ),
      linkUrl,
    };
  });
}
|
||||
|
||||
/**
 * Scrapes feed candidates from the rendered DOM.
 *
 * For every anchor matching the feed-link selector that has an `href`, the
 * enclosing card (or, failing that, the anchor's parent element) is queried
 * for title, description, cover, author and counters. Counters are returned
 * as the trimmed text found in the page.
 *
 * Any failure while evaluating in the page yields an empty list.
 *
 * @param page Playwright page that has already been navigated.
 * @returns One unvalidated candidate per matching anchor (duplicates possible).
 */
async function extractFeedsFromDom(page: Page): Promise<RawFeedCandidate[]> {
  return page
    .evaluate((selectors) => {
      const feedSelectors = selectors.feed;

      // List-valued selectors are combined into a single selector group.
      const cardSel = feedSelectors.card.join(', ');
      const titleSel = feedSelectors.title.join(', ');
      const descSel = feedSelectors.description.join(', ');
      const userNameSel = feedSelectors.userName.join(', ');
      const likeSel = feedSelectors.likeCount.join(', ');
      const commentSel = feedSelectors.commentCount.join(', ');

      return Array.from(document.querySelectorAll<HTMLAnchorElement>(feedSelectors.link))
        .filter((anchor) => Boolean(anchor.getAttribute('href')))
        .map((anchor): RawFeedCandidate => {
          const card = anchor.closest(cardSel) ?? anchor.parentElement;
          const authorLink = card?.querySelector<HTMLAnchorElement>(feedSelectors.userLink) ?? null;

          return {
            linkUrl: anchor.getAttribute('href') ?? '',
            title: card?.querySelector(titleSel)?.textContent?.trim() ?? '',
            description: card?.querySelector(descSel)?.textContent?.trim() ?? '',
            coverUrl: card?.querySelector<HTMLImageElement>(feedSelectors.cover)?.src ?? '',
            // Raw profile href; the numeric id is extracted during normalization.
            userId: authorLink?.getAttribute('href') ?? '',
            nickname: card?.querySelector(userNameSel)?.textContent?.trim() ?? '',
            // First <img> in the card.
            avatar: card?.querySelector<HTMLImageElement>('img')?.src ?? '',
            likeCount: card?.querySelector(likeSel)?.textContent?.trim() ?? '',
            commentCount: card?.querySelector(commentSel)?.textContent?.trim() ?? '',
          };
        });
    }, XHH_SELECTORS)
    .catch(() => []);
}
|
||||
|
||||
/**
 * Converts raw candidates into `Feed` objects, dropping unusable and duplicate ones.
 *
 * A candidate's id is its explicit `id`, or else the id extracted from its
 * link URL; candidates with no id are skipped. When several candidates share
 * an id, only the first one is kept. Output order follows input order.
 *
 * @param items Raw candidates in priority order.
 * @returns Normalized feeds with trimmed strings, parsed counters and absolute link URLs.
 */
function dedupeAndNormalize(items: RawFeedCandidate[]): Feed[] {
  // Map keeps insertion order, so the result preserves first-seen ordering.
  const feedsById = new Map<string, Feed>();

  for (const candidate of items) {
    const idFromUrl = candidate.linkUrl ? extractLinkIdFromUrl(candidate.linkUrl) ?? '' : '';
    const linkId = firstNonEmpty(candidate.id, idFromUrl);
    if (!linkId || feedsById.has(linkId)) continue;

    const authorId = candidate.userId ? extractUserIdFromMaybeHref(candidate.userId) : '';

    feedsById.set(linkId, {
      id: linkId,
      title: candidate.title?.trim() ?? '',
      description: candidate.description?.trim() ?? '',
      coverUrl: candidate.coverUrl?.trim() ?? '',
      likeCount: parseCountString(candidate.likeCount),
      commentCount: parseCountString(candidate.commentCount),
      user: {
        id: authorId,
        nickname: candidate.nickname?.trim() ?? '',
        avatar: candidate.avatar?.trim() ?? '',
      },
      linkUrl: normalizeLinkUrl(candidate.linkUrl, linkId),
    });
  }

  return [...feedsById.values()];
}
|
||||
|
||||
/**
 * Turns a scraped post URL into an absolute `http(s)` URL.
 *
 * - empty / missing      -> canonical post URL built from `linkId`
 * - `http://` / `https://` -> returned as is (after trimming)
 * - `//host/path`        -> `https://host/path` (protocol-relative)
 * - `/path`              -> resolved against `https://www.xiaoheihe.cn`
 * - anything else        -> assumed to be a scheme-less `host/path`, prefixed with `https://`
 *
 * @param rawUrl URL as found in the page or Nuxt state; may be undefined or blank.
 * @param linkId Post id used to build the canonical URL when `rawUrl` is empty.
 * @returns Absolute URL string.
 */
function normalizeLinkUrl(rawUrl: string | undefined, linkId: string): string {
  const origin = 'https://www.xiaoheihe.cn';
  const trimmed = rawUrl?.trim() ?? '';

  if (!trimmed) return `${origin}/app/bbs/link/${linkId}`;
  if (/^https?:\/\//i.test(trimmed)) return trimmed;
  // Must be checked before the single-slash case: a protocol-relative URL
  // already carries its own host and must not be appended to the site origin.
  if (trimmed.startsWith('//')) return `https:${trimmed}`;
  if (trimmed.startsWith('/')) return `${origin}${trimmed}`;
  return `https://${trimmed}`;
}
|
||||
|
||||
/**
 * Extracts a numeric user id from a value that is either a bare id or a
 * profile link.
 *
 * Candidates built from Nuxt state carry the id itself (e.g. `"12345"`),
 * while DOM candidates carry the author link's `href`
 * (e.g. `/app/user/profile/12345`). Both forms are accepted.
 *
 * @param raw Bare numeric id, or a relative/absolute URL containing `/app/user/profile/<digits>`.
 * @returns The digits of the user id, or `''` when none can be found.
 */
function extractUserIdFromMaybeHref(raw: string): string {
  const value = raw.trim();
  // Already a bare id: return it instead of discarding it for not looking like a URL.
  if (/^\d+$/.test(value)) return value;
  return value.match(/\/app\/user\/profile\/(\d+)/)?.[1] ?? '';
}
|
||||
|
||||
/**
 * Coerces a loosely-typed value to a string.
 *
 * @param value Arbitrary value.
 * @returns The value itself for strings, the decimal text for numbers, and `''` for everything else.
 */
function valueString(value: unknown): string {
  switch (typeof value) {
    case 'string':
      return value;
    case 'number':
      return String(value);
    default:
      return '';
  }
}
|
||||
|
||||
Reference in New Issue
Block a user