import type { Page } from 'rebrowser-playwright';
import { logger } from '@social/core/utils/logger.js';
import { XHH_SELECTORS } from './selectors.js';
import type { Feed } from './types.js';
import {
  detectCaptchaText,
  extractLinkIdFromUrl,
  firstNonEmpty,
  parseCountString,
} from './extractors.js';

const SITE_ORIGIN = 'https://www.xiaoheihe.cn';
const HOME_URL = 'https://www.xiaoheihe.cn/app/bbs/home';

const log = logger.child({ module: 'xhh-feeds' });

/**
 * Loosely-typed feed entry as scraped from either the Nuxt payload or the DOM,
 * before de-duplication and normalization into a {@link Feed}.
 *
 * `userId` holds whatever the source provided: a bare id for Nuxt-sourced
 * entries, or the `href` of the user link for DOM-sourced entries.
 */
interface RawFeedCandidate {
  id?: string;
  title?: string;
  description?: string;
  coverUrl?: string;
  likeCount?: string | number;
  commentCount?: string | number;
  userId?: string;
  nickname?: string;
  avatar?: string;
  linkUrl?: string;
}

/**
 * Navigates to the home feed page and returns the feeds found there.
 *
 * @param page - Playwright page used for navigation and extraction.
 * @returns De-duplicated, normalized feeds (Nuxt payload entries take precedence
 *   over DOM entries with the same id).
 * @throws Error with a `CAPTCHA_REQUIRED:` prefix when the page body text is
 *   recognized as a captcha by `detectCaptchaText`.
 */
export async function listFeeds(page: Page): Promise<Feed[]> {
  await page.goto(HOME_URL, { waitUntil: 'domcontentloaded' });
  await page.waitForTimeout(1_500);
  await assertNoCaptcha(page, 'feeds');

  const result = await collectFeeds(page);
  log.info({ count: result.length }, 'xhh feeds extracted');
  return result;
}

/**
 * Navigates to the search page for `keyword` and returns matching feeds.
 *
 * Extracted feeds are filtered client-side by a case-insensitive substring
 * match on title, description and author nickname. When the search page yields
 * no feeds at all, the home feeds are loaded and filtered instead.
 *
 * @param page - Playwright page used for navigation and extraction.
 * @param keyword - Search term; URL-encoded into the search route.
 * @returns Feeds whose title/description/nickname contain `keyword`.
 * @throws Error with a `CAPTCHA_REQUIRED:` prefix when a captcha is detected on
 *   the search page (or on the home page during the fallback).
 */
export async function searchFeeds(page: Page, keyword: string): Promise<Feed[]> {
  const targetUrl = `${SITE_ORIGIN}/app/bbs/search?keyword=${encodeURIComponent(keyword)}`;
  await page.goto(targetUrl, { waitUntil: 'domcontentloaded' });
  await page.waitForTimeout(1_200);
  await assertNoCaptcha(page, 'search');

  const combined = await collectFeeds(page);
  if (combined.length > 0) {
    return filterByKeyword(combined, keyword);
  }

  // Fallback: when search route structure changes, use home feeds and filter.
  const homeFeeds = await listFeeds(page);
  return filterByKeyword(homeFeeds, keyword);
}

/** Throws a `CAPTCHA_REQUIRED` error when the page body text looks like a captcha. */
async function assertNoCaptcha(page: Page, pageLabel: string): Promise<void> {
  // A failure to read the body is treated as "no text", i.e. no captcha detected.
  const text = await page.textContent('body').catch(() => '');
  if (text && detectCaptchaText(text)) {
    throw new Error(`CAPTCHA_REQUIRED: captcha detected on ${pageLabel} page`);
  }
}

/** Runs both extractors on the current page and merges their results (Nuxt first). */
async function collectFeeds(page: Page): Promise<Feed[]> {
  const nuxtFeeds = await extractFeedsFromNuxt(page);
  const domFeeds = await extractFeedsFromDom(page);
  return dedupeAndNormalize([...nuxtFeeds, ...domFeeds]);
}

/** Keeps feeds whose title, description or nickname contains `keyword` (case-insensitive). */
function filterByKeyword(feeds: Feed[], keyword: string): Feed[] {
  const needle = keyword.toLowerCase();
  return feeds.filter((item) => {
    const haystack = `${item.title} ${item.description} ${item.user.nickname}`.toLowerCase();
    return haystack.includes(needle);
  });
}

/**
 * Collects feed candidates from the page's Nuxt state (`window.__NUXT_DATA__`,
 * falling back to `window.__NUXT__.data`).
 *
 * The state is walked recursively; any object that has a string id-like field,
 * a link URL containing `/app/bbs/link/`, or a title is kept (at most 500).
 * Evaluation failures are logged at debug level and yield an empty list.
 */
async function extractFeedsFromNuxt(page: Page): Promise<RawFeedCandidate[]> {
  const data = await page
    .evaluate(() => {
      // NOTE: this callback runs in the browser context, so it must be
      // self-contained and cannot reference anything from module scope.
      const MAX_CANDIDATES = 500;

      const root: unknown =
        (window as { __NUXT_DATA__?: unknown }).__NUXT_DATA__ ??
        (window as { __NUXT__?: { data?: unknown } }).__NUXT__?.data ??
        null;

      const out: Array<Record<string, unknown>> = [];
      const visited = new Set<object>();

      // First key whose value is a non-empty string, or ''.
      const firstString = (obj: Record<string, unknown>, keys: string[]): string => {
        for (const key of keys) {
          const candidate = obj[key];
          if (typeof candidate === 'string' && candidate) return candidate;
        }
        return '';
      };

      function walk(value: unknown): void {
        // Candidates are pushed in pre-order, so stopping at the cap yields the
        // same entries as walking everything and slicing afterwards.
        if (out.length >= MAX_CANDIDATES) return;
        if (!value || typeof value !== 'object') return;
        // Guard against cycles and shared references.
        if (visited.has(value)) return;
        visited.add(value);

        if (Array.isArray(value)) {
          for (const item of value) walk(item);
          return;
        }

        const obj = value as Record<string, unknown>;
        const id = firstString(obj, ['link_id', 'linkid', 'id', 'post_id']);
        const url = firstString(obj, ['link_url', 'url']);
        const title = firstString(obj, ['title', 'subject']);
        const hasLink = url.includes('/app/bbs/link/');

        if (id || hasLink || title) {
          out.push(obj);
        }

        for (const next of Object.values(obj)) {
          walk(next);
        }
      }

      walk(root);
      return out.slice(0, MAX_CANDIDATES);
    })
    .catch((err: unknown) => {
      // Best-effort source: keep going with no Nuxt candidates, but leave a trace.
      log.debug({ err }, 'xhh nuxt feed extraction failed');
      return [] as Array<Record<string, unknown>>;
    });

  return (data as Array<Record<string, unknown>>).map((item): RawFeedCandidate => {
    const linkUrl = firstNonEmpty(
      valueString(item['link_url']),
      valueString(item['url']),
      valueString(item['jump_url']),
    );
    const user = (item['user'] ?? item['author']) as Record<string, unknown> | undefined;

    return {
      id: firstNonEmpty(
        valueString(item['link_id']),
        valueString(item['linkid']),
        valueString(item['post_id']),
        valueString(item['id']),
      ),
      title: firstNonEmpty(valueString(item['title']), valueString(item['subject'])),
      description: firstNonEmpty(
        valueString(item['description']),
        valueString(item['content']),
        valueString(item['desc']),
      ),
      coverUrl: firstNonEmpty(
        valueString(item['cover']),
        valueString(item['cover_url']),
        valueString(item['image']),
      ),
      likeCount: valueString(item['like_count']) || valueString(item['likes']),
      commentCount: valueString(item['comment_count']) || valueString(item['comments']),
      userId: firstNonEmpty(
        valueString(user?.['userid']),
        valueString(user?.['user_id']),
        valueString(item['userid']),
      ),
      nickname: firstNonEmpty(
        valueString(user?.['nickname']),
        valueString(user?.['name']),
        valueString(item['nickname']),
      ),
      avatar: firstNonEmpty(
        valueString(user?.['avatar']),
        valueString(user?.['avatar_url']),
      ),
      linkUrl,
    };
  });
}

/**
 * Collects feed candidates from the rendered DOM using `XHH_SELECTORS`.
 *
 * Every anchor matching the feed link selector (with a non-empty `href`) yields
 * one candidate; the remaining fields are read from the closest card element,
 * or from the anchor's parent when no card matches. Evaluation failures are
 * logged at debug level and yield an empty list.
 */
async function extractFeedsFromDom(page: Page): Promise<RawFeedCandidate[]> {
  return page
    .evaluate((selectors) => {
      // NOTE: runs in the browser context; only `selectors` crosses the boundary.
      const anchors = [...document.querySelectorAll<HTMLAnchorElement>(selectors.feed.link)]
        .filter((a) => Boolean(a.getAttribute('href')));

      const feeds: RawFeedCandidate[] = [];
      const cardSelector = selectors.feed.card.join(', ');
      const titleSelector = selectors.feed.title.join(', ');
      const descSelector = selectors.feed.description.join(', ');
      const userNameSelector = selectors.feed.userName.join(', ');
      const likeSelector = selectors.feed.likeCount.join(', ');
      const commentSelector = selectors.feed.commentCount.join(', ');

      for (const link of anchors) {
        const href = link.getAttribute('href') ?? '';
        const card = link.closest(cardSelector) ?? link.parentElement;

        // Trimmed text of the first descendant of the card matching `selector`.
        const textOf = (selector: string): string =>
          (card?.querySelector(selector)?.textContent ?? '').trim();

        const cover =
          (card?.querySelector(selectors.feed.cover) as HTMLImageElement | null)?.src ?? '';
        const userNode = card?.querySelector(selectors.feed.userLink) as HTMLAnchorElement | null;

        feeds.push({
          linkUrl: href,
          title: textOf(titleSelector),
          description: textOf(descSelector),
          coverUrl: cover,
          // Raw href of the user link; the id is extracted during normalization.
          userId: userNode?.getAttribute('href') ?? '',
          nickname: textOf(userNameSelector),
          avatar: (card?.querySelector('img') as HTMLImageElement | null)?.src ?? '',
          likeCount: textOf(likeSelector),
          commentCount: textOf(commentSelector),
        });
      }

      return feeds;
    }, XHH_SELECTORS)
    .catch((err: unknown) => {
      // Best-effort source: keep going with no DOM candidates, but leave a trace.
      log.debug({ err }, 'xhh dom feed extraction failed');
      return [] as RawFeedCandidate[];
    });
}

/**
 * Converts raw candidates into {@link Feed} objects.
 *
 * A candidate's id is its explicit `id`, or else the id extracted from its
 * `linkUrl`. Candidates without an id are dropped, and for duplicate ids only
 * the first occurrence is kept.
 */
function dedupeAndNormalize(items: RawFeedCandidate[]): Feed[] {
  const output: Feed[] = [];
  const seen = new Set<string>();

  for (const item of items) {
    const linkId = firstNonEmpty(
      item.id,
      item.linkUrl ? extractLinkIdFromUrl(item.linkUrl) ?? '' : '',
    );
    if (!linkId || seen.has(linkId)) continue;
    seen.add(linkId);

    const linkUrl = normalizeLinkUrl(item.linkUrl, linkId);
    const userId = item.userId ? extractUserIdFromMaybeHref(item.userId) : '';

    output.push({
      id: linkId,
      title: item.title?.trim() ?? '',
      description: item.description?.trim() ?? '',
      coverUrl: item.coverUrl?.trim() ?? '',
      likeCount: parseCountString(item.likeCount),
      commentCount: parseCountString(item.commentCount),
      user: {
        id: userId,
        nickname: item.nickname?.trim() ?? '',
        avatar: item.avatar?.trim() ?? '',
      },
      linkUrl,
    });
  }

  return output;
}

/**
 * Produces an absolute URL for a feed link.
 *
 * - empty/missing input → canonical link URL built from `linkId`
 * - `http(s)://…` → returned unchanged
 * - `//host/…` (protocol-relative) → prefixed with `https:`
 * - `/path` → resolved against the site origin
 * - anything else → prefixed with `https://`
 */
function normalizeLinkUrl(rawUrl: string | undefined, linkId: string): string {
  const trimmed = rawUrl?.trim() ?? '';
  if (!trimmed) return `${SITE_ORIGIN}/app/bbs/link/${linkId}`;
  if (/^https?:\/\//i.test(trimmed)) return trimmed;
  // Must be checked before the single-slash case, which it would otherwise match.
  if (trimmed.startsWith('//')) return `https:${trimmed}`;
  if (trimmed.startsWith('/')) return `${SITE_ORIGIN}${trimmed}`;
  return `https://${trimmed}`;
}

/**
 * Extracts a numeric user id from either a bare id or a profile href.
 *
 * Nuxt-sourced candidates supply the id directly, while DOM-sourced candidates
 * supply the user link's `href` (`…/app/user/profile/<id>`); both are accepted.
 *
 * @returns The id digits, or `''` when none can be found.
 */
function extractUserIdFromMaybeHref(raw: string): string {
  const trimmed = raw.trim();
  if (/^\d+$/.test(trimmed)) return trimmed;
  return trimmed.match(/\/app\/user\/profile\/(\d+)/)?.[1] ?? '';
}

/** Returns strings unchanged, numbers as their decimal string, and `''` for anything else. */
function valueString(value: unknown): string {
  if (typeof value === 'string') return value;
  if (typeof value === 'number') return String(value);
  return '';
}