NAC_Blockchain/services/nac-admin/scripts/runCrawlerCron.js

#!/usr/bin/env node
/**
 * NAC trade-rule crawler cron script
 * Runs daily at 02:30 (avoiding the 02:00 Git backup)
 *
 * Usage:
 *   node /opt/nac/services/nac-admin/scripts/runCrawlerCron.js
 *
 * Log file:
 *   /opt/nac/services/nac-admin/logs/crawler.log
 */
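/*
 * Example crontab entry (a sketch: the schedule matches the 02:30 run time
 * documented above, but the node binary path is an assumption for your host):
 *
 *   30 2 * * * /usr/bin/node /opt/nac/services/nac-admin/scripts/runCrawlerCron.js >> /opt/nac/services/nac-admin/logs/crawler.log 2>&1
 *
 * The script already appends to crawler.log itself; the shell redirect only
 * catches output written before the logger is set up.
 */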
import { fileURLToPath } from 'url';
import { dirname, join } from 'path';
import { appendFileSync, mkdirSync } from 'fs';
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// ─── Logging setup ─────────────────────────────────────────
const LOG_DIR = join(__dirname, '..', 'logs');
const LOG_FILE = join(LOG_DIR, 'crawler.log');
try { mkdirSync(LOG_DIR, { recursive: true }); } catch {}
// Synchronous logger: appendFileSync keeps log lines from being lost if the
// process exits before an async write flushes.
function writeLog(level, msg) {
  const ts = new Date().toISOString();
  const line = `[${ts}] [${level}] ${msg}\n`;
  process.stdout.write(line);
  try { appendFileSync(LOG_FILE, line); } catch {}
}
// ─── Main logic: trigger the crawler via the HTTP API ──────
const SERVICE_URL = process.env.NAC_SERVICE_URL || 'http://localhost:9560';
const CRAWLER_ENDPOINT = `${SERVICE_URL}/api/trpc/crawler.runFullCrawl`;
writeLog('INFO', '========================================');
writeLog('INFO', 'NAC trade-rule crawler cron job started');
writeLog('INFO', `Target service: ${SERVICE_URL}`);
writeLog('INFO', `Run time: ${new Date().toLocaleString('zh-CN', { timeZone: 'Asia/Shanghai' })}`);
async function runCrawler() {
  try {
    // Option 1: trigger via the tRPC API (if the route exists)
    writeLog('INFO', 'Attempting to trigger the crawler via the tRPC API...');
    const response = await fetch(CRAWLER_ENDPOINT, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ json: { tier: 1, maxSources: 10 } }),
      signal: AbortSignal.timeout(300000), // 5-minute timeout
    });
    if (response.ok) {
      const data = await response.json();
      // tRPC (with the superjson transformer) wraps mutation results as
      // { result: { data: { json: <payload> } } }, hence the unwrap below.
      const result = data?.result?.data?.json;
      writeLog('INFO', 'Crawler run succeeded');
      writeLog('INFO', `New rules: ${result?.newRules || 0}`);
      writeLog('INFO', `Updated rules: ${result?.updatedRules || 0}`);
      writeLog('INFO', `Sources processed: ${result?.sourcesProcessed || 0}`);
      writeLog('INFO', `Errors: ${result?.errors || 0}`);
      return true;
    } else {
      writeLog('WARN', `Unexpected API response: ${response.status} ${response.statusText}`);
      return false;
    }
  } catch (err) {
    writeLog('WARN', `tRPC API trigger failed: ${err.message}`);
    writeLog('INFO', 'Falling back to direct MongoDB mode...');
    return await runDirectCrawl();
  }
}
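/*
 * Manual trigger for testing, using the same endpoint and payload as above
 * (assumes the service is running on the default port; the { json: ... }
 * wrapper matches tRPC's superjson input format):
 *
 *   curl -X POST http://localhost:9560/api/trpc/crawler.runFullCrawl \
 *     -H 'Content-Type: application/json' \
 *     -d '{"json":{"tier":1,"maxSources":10}}'
 */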
async function runDirectCrawl() {
  // Option 2: connect to MongoDB directly and run the crawl logic
  try {
    const { MongoClient } = await import('mongodb');
    const MONGO_URL = process.env.MONGO_URL || 'mongodb://root:idP0ZaRGyLsTUA3a@localhost:27017/nac_knowledge_engine?authSource=admin';
    const client = new MongoClient(MONGO_URL);
    await client.connect();
    const db = client.db('nac_knowledge_engine');
    const col = db.collection('compliance_rules'); // not yet written to in this mode
    writeLog('INFO', 'Connected to MongoDB, starting direct crawl...');
    // Crawl each official data source
    const sources = [
      { name: 'SEC EDGAR', url: 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&type=D&dateb=&owner=include&count=10&search_text=', jurisdiction: 'US' },
      { name: 'SFC HK', url: 'https://www.sfc.hk/en/Rules-and-standards/Codes-and-guidelines', jurisdiction: 'HK' },
      { name: 'MAS Singapore', url: 'https://www.mas.gov.sg/regulation/digital-assets', jurisdiction: 'SG' },
    ];
    let newCount = 0; // stays 0 for now: pages are fetched and logged, not parsed into rules
    let errorCount = 0;
    for (const source of sources) {
      try {
        writeLog('INFO', `Crawling: ${source.name} (${source.url})`);
        const res = await fetch(source.url, {
          headers: { 'User-Agent': 'NAC-Compliance-Bot/1.0 (compliance@newassetchain.io)' },
          signal: AbortSignal.timeout(15000),
        });
        if (res.ok) {
          const html = await res.text();
          const charCount = html.length;
          writeLog('INFO', `${source.name}: fetched ${charCount} characters`);
          // Record the crawl result in MongoDB
          await db.collection('crawler_logs').insertOne({
            source: source.name,
            jurisdiction: source.jurisdiction,
            url: source.url,
            status: 'success',
            contentLength: charCount,
            crawledAt: new Date(),
          });
        } else {
          writeLog('WARN', `⚠️ ${source.name}: HTTP ${res.status}`);
          errorCount++;
        }
      } catch (e) {
        writeLog('WARN', `${source.name}: ${e.message}`);
        errorCount++;
      }
    }
    // Record a summary of this run
    await db.collection('crawler_runs').insertOne({
      runAt: new Date(),
      newRules: newCount,
      errors: errorCount,
      sourcesProcessed: sources.length - errorCount,
      mode: 'direct',
    });
    writeLog('INFO', `Direct crawl finished: ${newCount} new rules, ${errorCount} errors`);
    await client.close();
    return true;
  } catch (err) {
    writeLog('ERROR', `Direct crawl failed: ${err.message}`);
    return false;
  }
}
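/*
 * Sketch only (hypothetical; direct mode has no rule parser yet): once the
 * fetched HTML is parsed into rule objects, they could be upserted into the
 * compliance_rules collection (`col`) declared above, e.g.:
 *
 *   const result = await col.updateOne(
 *     { jurisdiction: source.jurisdiction, title: rule.title }, // dedup key (assumed schema)
 *     { $set: { ...rule, updatedAt: new Date() } },
 *     { upsert: true },
 *   );
 *   if (result.upsertedCount > 0) newCount++;
 */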
// ─── Run ───────────────────────────────────────────────────
try {
  const success = await runCrawler();
  writeLog('INFO', `Crawler job finished: ${success ? 'success' : 'failure'}`);
  writeLog('INFO', '========================================');
  process.exit(success ? 0 : 1);
} catch (err) {
  writeLog('ERROR', `Uncaught exception: ${err.message}`);
  writeLog('INFO', '========================================');
  process.exit(1);
}