184 lines
6.6 KiB
JavaScript
Executable File
184 lines
6.6 KiB
JavaScript
Executable File
#!/usr/bin/env node
|
||
/**
|
||
* NAC 贸易规则爬虫 cron 执行脚本
|
||
* 每日凌晨 2:30 执行(避开 2:00 的 Git 备份)
|
||
*
|
||
* 执行方式:
|
||
* node /opt/nac/services/nac-admin/scripts/runCrawlerCron.js
|
||
*
|
||
* 日志:
|
||
* /opt/nac/services/nac-admin/logs/crawler.log
|
||
*/
|
||
|
||
import { createRequire } from 'module';
|
||
import { fileURLToPath } from 'url';
|
||
import { dirname, join } from 'path';
|
||
import { createWriteStream, mkdirSync } from 'fs';
|
||
|
||
// Reconstruct __filename/__dirname (not provided in ES modules).
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// ─── Logging helpers ───────────────────────────────────────
// Log file lives in <package root>/logs/crawler.log, one level above scripts/.
const LOG_DIR = join(__dirname, '..', 'logs');
const LOG_FILE = join(LOG_DIR, 'crawler.log');

// Best-effort directory creation; on failure, logging degrades to stdout only.
try { mkdirSync(LOG_DIR, { recursive: true }); } catch {}
|
||
|
||
/**
 * Write one timestamped log line to stdout and append it to LOG_FILE.
 *
 * Fix: the original used `await import('fs')` inside this NON-async
 * function, which is a SyntaxError and prevented the entire module from
 * parsing. The file already imports `appendFileSync` at top level
 * (hoisted module import), so it is used synchronously here.
 *
 * @param {string} level - severity tag, e.g. 'INFO', 'WARN', 'ERROR'
 * @param {string} msg   - message text
 */
function log(level, msg) {
  const ts = new Date().toISOString();
  const line = `[${ts}] [${level}] ${msg}\n`;
  process.stdout.write(line);
  // File append is best-effort: logging must never crash the cron job.
  try {
    appendFileSync(LOG_FILE, line);
  } catch {}
}
|
||
|
||
/**
 * Synchronous variant of the logger: timestamped line to stdout plus a
 * best-effort append to LOG_FILE.
 *
 * Fix: the original used `await import('fs')` inside this NON-async
 * function — a SyntaxError that broke module parsing. It now uses the
 * top-level `appendFileSync` import directly, which also matches the
 * function's name ("Sync").
 *
 * @param {string} level - severity tag, e.g. 'INFO', 'WARN', 'ERROR'
 * @param {string} msg   - message text
 */
function logSync(level, msg) {
  const ts = new Date().toISOString();
  const line = `[${ts}] [${level}] ${msg}\n`;
  process.stdout.write(line);
  // Swallow file errors deliberately: stdout already carries the line.
  try {
    appendFileSync(LOG_FILE, line);
  } catch {}
}
|
||
|
||
// ─── 同步日志(不用 async)──────────────────────────────────
|
||
import { appendFileSync } from 'fs';
|
||
|
||
/**
 * Emit one timestamped log line: always to stdout, and best-effort to
 * LOG_FILE.
 *
 * @param {string} level - severity tag ('INFO' | 'WARN' | 'ERROR')
 * @param {string} msg   - message text
 */
function writeLog(level, msg) {
  const entry = `[${new Date().toISOString()}] [${level}] ${msg}\n`;
  process.stdout.write(entry);
  try {
    appendFileSync(LOG_FILE, entry);
  } catch {
    // File logging is best-effort; stdout already has the line.
  }
}
|
||
|
||
// ─── Main logic: trigger the crawler via HTTP API ──────────
// Base URL of the local nac-admin service; overridable via env for other deployments.
const SERVICE_URL = process.env.NAC_SERVICE_URL || 'http://localhost:9560';
// tRPC mutation endpoint that runs a full crawl.
const CRAWLER_ENDPOINT = `${SERVICE_URL}/api/trpc/crawler.runFullCrawl`;

// Startup banner; execution time rendered in Asia/Shanghai for the ops team.
writeLog('INFO', '========================================');
writeLog('INFO', 'NAC 贸易规则爬虫 cron 任务启动');
writeLog('INFO', `目标服务: ${SERVICE_URL}`);
writeLog('INFO', `执行时间: ${new Date().toLocaleString('zh-CN', { timeZone: 'Asia/Shanghai' })}`);
|
||
|
||
/**
 * Trigger a full crawl.
 *
 * Strategy 1: POST to the nac-admin tRPC endpoint and report its summary.
 * Strategy 2: degrade to runDirectCrawl() (direct MongoDB mode) when the
 *             API call throws (network error / timeout) OR answers non-2xx
 *             (e.g. the route is not deployed).
 *
 * Fix: the original only fell back on a thrown error. `fetch` does NOT
 * throw on HTTP error statuses, so a 404/500 from a missing tRPC route
 * made the documented MongoDB fallback unreachable. Non-ok responses now
 * degrade as well.
 *
 * @returns {Promise<boolean>} true when either strategy succeeds
 */
async function runCrawler() {
  try {
    // Strategy 1: trigger via the tRPC API.
    writeLog('INFO', '尝试通过 tRPC API 触发爬虫...');

    const response = await fetch(CRAWLER_ENDPOINT, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ json: { tier: 1, maxSources: 10 } }),
      signal: AbortSignal.timeout(300000), // 5-minute timeout
    });

    if (response.ok) {
      const data = await response.json();
      // tRPC wraps payloads as { result: { data: { json: ... } } }.
      const result = data?.result?.data?.json;
      writeLog('INFO', `爬虫执行成功`);
      writeLog('INFO', `新增规则: ${result?.newRules || 0} 条`);
      writeLog('INFO', `更新规则: ${result?.updatedRules || 0} 条`);
      writeLog('INFO', `爬取来源: ${result?.sourcesProcessed || 0} 个`);
      writeLog('INFO', `错误数量: ${result?.errors || 0} 个`);
      return true;
    }

    // fetch does not throw on HTTP errors — degrade explicitly.
    writeLog('WARN', `API 响应异常: ${response.status} ${response.statusText}`);
    writeLog('INFO', '降级到直接 MongoDB 写入模式...');
    return await runDirectCrawl();
  } catch (err) {
    writeLog('WARN', `tRPC API 触发失败: ${err.message}`);
    writeLog('INFO', '降级到直接 MongoDB 写入模式...');
    return await runDirectCrawl();
  }
}
|
||
|
||
/**
 * Fallback crawl path: connect to MongoDB directly, fetch each official
 * source over HTTP, record one `crawler_logs` document per successful
 * fetch, and write a `crawler_runs` summary document at the end.
 *
 * Fixes over the original:
 *  - `client.close()` now runs in a `finally` block, so a failure in the
 *    middle of the loop (e.g. a rejected insertOne) no longer leaks the
 *    MongoDB connection.
 *  - Removed the unused `compliance_rules` collection handle.
 *
 * NOTE(security): the default MONGO_URL embeds credentials in source
 * code. Rotate this password and require MONGO_URL from the environment
 * or a secret store instead — flagged, not changed, to preserve behavior.
 *
 * @returns {Promise<boolean>} true when the run summary was written
 */
async function runDirectCrawl() {
  try {
    const { MongoClient } = await import('mongodb');

    // FIXME: hard-coded fallback credentials — move to env/secret store.
    const MONGO_URL = process.env.MONGO_URL || 'mongodb://root:idP0ZaRGyLsTUA3a@localhost:27017/nac_knowledge_engine?authSource=admin';
    const client = new MongoClient(MONGO_URL);
    await client.connect();

    try {
      const db = client.db('nac_knowledge_engine');

      writeLog('INFO', '已连接 MongoDB,开始直接爬取...');

      // Official data sources to poll.
      const sources = [
        { name: 'SEC EDGAR', url: 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&type=D&dateb=&owner=include&count=10&search_text=', jurisdiction: 'US' },
        { name: 'SFC HK', url: 'https://www.sfc.hk/en/Rules-and-standards/Codes-and-guidelines', jurisdiction: 'HK' },
        { name: 'MAS Singapore', url: 'https://www.mas.gov.sg/regulation/digital-assets', jurisdiction: 'SG' },
      ];

      // newCount is kept for the summary schema; direct mode currently
      // fetches pages but does not extract rules, so it stays 0.
      let newCount = 0;
      let errorCount = 0;

      for (const source of sources) {
        try {
          writeLog('INFO', `爬取: ${source.name} (${source.url})`);

          const res = await fetch(source.url, {
            headers: { 'User-Agent': 'NAC-Compliance-Bot/1.0 (compliance@newassetchain.io)' },
            signal: AbortSignal.timeout(15000), // 15s per source
          });

          if (res.ok) {
            const html = await res.text();
            const wordCount = html.length;
            writeLog('INFO', `  ✅ ${source.name}: 获取 ${wordCount} 字符`);

            // Audit trail: one document per successful fetch.
            await db.collection('crawler_logs').insertOne({
              source: source.name,
              jurisdiction: source.jurisdiction,
              url: source.url,
              status: 'success',
              contentLength: wordCount,
              crawledAt: new Date(),
            });
          } else {
            writeLog('WARN', `  ⚠️ ${source.name}: HTTP ${res.status}`);
            errorCount++;
          }
        } catch (e) {
          // Per-source failures are tolerated; the run continues.
          writeLog('WARN', `  ❌ ${source.name}: ${e.message}`);
          errorCount++;
        }
      }

      // Run summary for dashboards/alerting.
      await db.collection('crawler_runs').insertOne({
        runAt: new Date(),
        newRules: newCount,
        errors: errorCount,
        sourcesProcessed: sources.length - errorCount,
        mode: 'direct',
      });

      writeLog('INFO', `直接爬取完成: 新增 ${newCount} 条规则,${errorCount} 个错误`);
      return true;
    } finally {
      // Always release the connection, even when an insert/fetch threw.
      await client.close();
    }
  } catch (err) {
    writeLog('ERROR', `直接爬取失败: ${err.message}`);
    return false;
  }
}
|
||
|
||
// ─── Execution ─────────────────────────────────────────────
// Top-level await is valid here because this file is an ES module.
try {
  const success = await runCrawler();
  writeLog('INFO', `爬虫任务${success ? '成功' : '失败'}完成`);
  writeLog('INFO', '========================================');
  // Exit code feeds cron/monitoring: 0 = success, 1 = failure.
  process.exit(success ? 0 : 1);
} catch (err) {
  // Last-resort guard: log and exit non-zero on any unhandled error.
  writeLog('ERROR', `未捕获异常: ${err.message}`);
  writeLog('INFO', '========================================');
  process.exit(1);
}
|