#!/usr/bin/env node
/**
 * NAC trade-rule crawler cron runner
 * Runs daily at 02:30 (avoiding the 02:00 Git backup)
 *
 * Usage:
 *   node /opt/nac/services/nac-admin/scripts/runCrawlerCron.js
 *
 * Log:
 *   /opt/nac/services/nac-admin/logs/crawler.log
 */

import { fileURLToPath } from 'url';
import { dirname, join } from 'path';
import { mkdirSync, appendFileSync } from 'fs';

const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// ─── Logging helpers ───────────────────────────────────────
const LOG_DIR = join(__dirname, '..', 'logs');
const LOG_FILE = join(LOG_DIR, 'crawler.log');
try { mkdirSync(LOG_DIR, { recursive: true }); } catch {}

// Single synchronous logger: writes to stdout and appends to the log file.
function writeLog(level, msg) {
  const ts = new Date().toISOString();
  const line = `[${ts}] [${level}] ${msg}\n`;
  process.stdout.write(line);
  try { appendFileSync(LOG_FILE, line); } catch {}
}

// ─── Main logic: trigger the crawler via the HTTP API ──────
const SERVICE_URL = process.env.NAC_SERVICE_URL || 'http://localhost:9560';
const CRAWLER_ENDPOINT = `${SERVICE_URL}/api/trpc/crawler.runFullCrawl`;

writeLog('INFO', '========================================');
writeLog('INFO', 'NAC trade-rule crawler cron job started');
writeLog('INFO', `Target service: ${SERVICE_URL}`);
writeLog('INFO', `Run time: ${new Date().toLocaleString('zh-CN', { timeZone: 'Asia/Shanghai' })}`);

async function runCrawler() {
  try {
    // Mode 1: trigger via the tRPC API (if the route exists)
    writeLog('INFO', 'Attempting to trigger the crawler via the tRPC API...');
    const response = await fetch(CRAWLER_ENDPOINT, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ json: { tier: 1, maxSources: 10 } }),
      signal: AbortSignal.timeout(300000), // 5-minute timeout
    });

    if (response.ok) {
      const data = await response.json();
      const result = data?.result?.data?.json;
      writeLog('INFO', 'Crawler run succeeded');
      writeLog('INFO', `New rules: ${result?.newRules || 0}`);
      writeLog('INFO', `Updated rules: ${result?.updatedRules || 0}`);
      writeLog('INFO', `Sources processed: ${result?.sourcesProcessed || 0}`);
      writeLog('INFO', `Errors: ${result?.errors || 0}`);
      return true;
    } else {
      writeLog('WARN', `Unexpected API response: ${response.status} ${response.statusText}`);
      return false;
    }
  } catch (err) {
    writeLog('WARN', `tRPC API trigger failed: ${err.message}`);
    writeLog('INFO', 'Falling back to direct MongoDB write mode...');
    return await runDirectCrawl();
  }
}

async function runDirectCrawl() {
  // Mode 2: connect to MongoDB directly and run the crawl logic
  try {
    const { MongoClient } = await import('mongodb');
    const MONGO_URL = process.env.MONGO_URL ||
      'mongodb://root:idP0ZaRGyLsTUA3a@localhost:27017/nac_knowledge_engine?authSource=admin';
    const client = new MongoClient(MONGO_URL);
    await client.connect();
    const db = client.db('nac_knowledge_engine');
    // Target collection for parsed rules; direct mode currently only
    // records crawl logs, so nothing is written here yet.
    const col = db.collection('compliance_rules');

    writeLog('INFO', 'Connected to MongoDB, starting direct crawl...');

    // Official data sources to crawl
    const sources = [
      { name: 'SEC EDGAR', url: 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&type=D&dateb=&owner=include&count=10&search_text=', jurisdiction: 'US' },
      { name: 'SFC HK', url: 'https://www.sfc.hk/en/Rules-and-standards/Codes-and-guidelines', jurisdiction: 'HK' },
      { name: 'MAS Singapore', url: 'https://www.mas.gov.sg/regulation/digital-assets', jurisdiction: 'SG' },
    ];

    // Direct mode only checks source availability; it does not parse
    // rules, so newCount stays 0 here.
    let newCount = 0;
    let errorCount = 0;

    for (const source of sources) {
      try {
        writeLog('INFO', `Crawling: ${source.name} (${source.url})`);
        const res = await fetch(source.url, {
          headers: { 'User-Agent': 'NAC-Compliance-Bot/1.0 (compliance@newassetchain.io)' },
          signal: AbortSignal.timeout(15000),
        });
        if (res.ok) {
          const html = await res.text();
          const charCount = html.length;
          writeLog('INFO', `  ✅ ${source.name}: fetched ${charCount} characters`);
          // Record the crawl result in MongoDB
          await db.collection('crawler_logs').insertOne({
            source: source.name,
            jurisdiction: source.jurisdiction,
            url: source.url,
            status: 'success',
            contentLength: charCount,
            crawledAt: new Date(),
          });
        } else {
          writeLog('WARN', `  ⚠️ ${source.name}: HTTP ${res.status}`);
          errorCount++;
        }
      } catch (e) {
        writeLog('WARN', `  ❌ ${source.name}: ${e.message}`);
        errorCount++;
      }
    }

    // Record a summary of this run
    await db.collection('crawler_runs').insertOne({
      runAt: new Date(),
      newRules: newCount,
      errors: errorCount,
      sourcesProcessed: sources.length - errorCount,
      mode: 'direct',
    });

    writeLog('INFO', `Direct crawl finished: ${newCount} new rules, ${errorCount} errors`);
    await client.close();
    return true;
  } catch (err) {
    writeLog('ERROR', `Direct crawl failed: ${err.message}`);
    return false;
  }
}

// ─── Run ───────────────────────────────────────────────────
try {
  const success = await runCrawler();
  writeLog('INFO', `Crawler job finished: ${success ? 'success' : 'failure'}`);
  writeLog('INFO', '========================================');
  process.exit(success ? 0 : 1);
} catch (err) {
  writeLog('ERROR', `Uncaught exception: ${err.message}`);
  writeLog('INFO', '========================================');
  process.exit(1);
}
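
// ─── Scheduling note ────────────────────────────────────────
// A minimal crontab entry matching the 02:30 schedule described in the
// header. This is a sketch: the node binary path and the stdout redirect
// are assumptions, not part of this script; adjust them for the actual
// deployment user and environment. The redirect target is the same file
// this script appends to via writeLog.
//
//   30 2 * * * /usr/bin/node /opt/nac/services/nac-admin/scripts/runCrawlerCron.js >> /opt/nac/services/nac-admin/logs/crawler.log 2>&1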