NAC_Blockchain/services/nac-admin/server/regulatoryCrawler.ts

822 lines
28 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

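/**
 * NAC Public Chain - Regulatory Rules Auto-Crawler
 *
 * Coverage:
 * - Tier 1 jurisdictions (20): US/CA/EU/GB/CH/DE/FR/NL/IE/LU/JP/KR/SG/HK/AU/AE/IL
 * - Tier 2 jurisdictions (25): CN/TW/MY/TH/IN/IT/ES/TR/SA/QA/KW/BH/BR/CL/AR/ZA, etc.
 * - Tier 3 jurisdictions (15): ID/PH/VN/PK/BD/OM/CO/PE/VE/UY/PY/NG/EG/KE/MA/RU/KZ/UA
 *
 * Asset classes: 20 top-level categories and 100+ sub-categories (GNACS standard)
 *
 * Crawl strategy:
 * 1. Official RSS/Atom feeds (real-time updates)
 * 2. Official API endpoints (SEC EDGAR, ESMA FIRDS, etc.)
 * 3. Official website HTML parsing (used when no API is available)
 * 4. Parse and extract → structured rules → write to MongoDB
 */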
import https from "https";
import http from "http";
import { URL } from "url";
import { MongoClient } from "mongodb";
// ─── Type definitions ─────────────────────────────────────────────
/**
 * A compliance rule in the structured form handled by this module.
 *
 * Rules derived from feed items are built as `Partial<CrawledRule>`, so the
 * optional requirement sections may be absent on crawled records.
 */
export interface CrawledRule {
  /** Identifier of the rule. */
  ruleId: string;
  /** Jurisdiction code the rule belongs to (for example "US" or "HK"). */
  jurisdiction: string;
  /** Asset class the rule applies to (for example "Equity" or "Bonds"). */
  assetClass: string;
  /** Category of the rule. */
  ruleType:
    | "ownership_verification"
    | "trading_rules"
    | "compliance_general"
    | "tax_rules"
    | "aml_kyc";
  /** Short display name of the rule. */
  ruleName: string;
  /** Full text of the rule. */
  content: string;
  /** Legal basis / citation for the rule. */
  legalBasis: string;
  /** Optional details about proving and transferring ownership. */
  ownershipRequirements?: {
    proofDocuments?: string[];
    registrationAuthority?: string;
    transferMechanism?: string;
    chainRecognition?: string;
    foreignOwnershipRestriction?: string;
    disputeResolution?: string;
  };
  /** Optional details about how the asset may be traded. */
  tradingRequirements?: {
    minimumInvestor?: string;
    settlementPeriod?: string;
    allowedCurrencies?: string[];
    tradingPlatform?: string;
    reportingRequirements?: string;
  };
  /** URL the rule was obtained from. */
  sourceUrl: string;
  /** Name of the publishing authority. */
  sourceName: string;
  /** When the crawler produced this record. */
  crawledAt: Date;
  /** When the rule was last updated. */
  lastUpdated: Date;
  /** Tier number of the jurisdiction/source. */
  tier: number;
  /** Free-form tags attached to the rule. */
  tags: string[];
  /** How binding the rule is. */
  complianceLevel: "mandatory" | "recommended" | "informational";
}
/**
 * Configuration of one official data source the crawler can visit.
 */
export interface CrawlerSource {
  /** Jurisdiction code of the publishing authority (for example "US"). */
  jurisdiction: string;
  /** Display name of the authority. */
  sourceName: string;
  /** Home page of the authority. */
  sourceUrl: string;
  /** Feed URL; fetched when `parseStrategy` is "rss". */
  rssUrl?: string;
  /** API URL; fetched when `parseStrategy` is "api". */
  apiUrl?: string;
  /** Asset classes this source publishes rules about. */
  assetClasses: string[];
  /** Tier number of the jurisdiction. */
  tier: number;
  /** How the source's content is retrieved and parsed. */
  parseStrategy: "rss" | "api" | "html" | "json";
  /** Delay in milliseconds between requests. */
  rateLimit?: number;
}
/**
 * Outcome of crawling a single source.
 */
export interface CrawlerResult {
  /** Jurisdiction code of the crawled source. */
  jurisdiction: string;
  /** Display name of the crawled source. */
  sourceName: string;
  /** Number of items retrieved from the source. */
  rulesFound: number;
  /** Number of rules newly inserted into storage. */
  rulesInserted: number;
  /** Number of already-stored rules that were updated. */
  rulesUpdated: number;
  /** Human-readable error messages collected during the crawl. */
  errors: string[];
  /** When the crawl of this source started. */
  crawledAt: Date;
}
// ─── Official data source list ────────────────────────────────────
/**
 * Static registry of the official regulator sources the crawler visits,
 * grouped by region. Each entry names the authority, its home page, the
 * feed/API URL to fetch, the asset classes it covers, its tier, the parse
 * strategy and a per-source delay in milliseconds.
 *
 * NOTE(review): crawlSource only fetches `rssUrl` (strategy "rss") or
 * `apiUrl` (strategy "api"). Entries below that set `apiUrl` while using the
 * "rss" strategy never have that API URL requested, and the entry using the
 * "html" strategy has no matching fetch branch.
 */
export const REGULATORY_SOURCES: CrawlerSource[] = [
// ══════════════════════════════════════════════════════════════
// North America
// ══════════════════════════════════════════════════════════════
{
jurisdiction: "US",
sourceName: "SEC (美国证券交易委员会)",
sourceUrl: "https://www.sec.gov",
rssUrl: "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&type=&dateb=&owner=include&count=40&search_text=&output=atom",
apiUrl: "https://efts.sec.gov/LATEST/search-index?q=%22RWA%22+%22tokenization%22&dateRange=custom&startdt=2023-01-01&forms=S-1,8-K",
assetClasses: ["Equity", "Bonds", "RealEstate", "Commodities", "Funds"],
tier: 1,
parseStrategy: "rss",
rateLimit: 1000,
},
{
jurisdiction: "US",
sourceName: "FinCEN (美国金融犯罪执法网络)",
sourceUrl: "https://www.fincen.gov",
rssUrl: "https://www.fincen.gov/news/rss.xml",
assetClasses: ["ALL"],
tier: 1,
parseStrategy: "rss",
rateLimit: 1000,
},
{
jurisdiction: "CA",
sourceName: "CSA (加拿大证券管理局)",
sourceUrl: "https://www.securities-administrators.ca",
rssUrl: "https://www.securities-administrators.ca/news/rss",
assetClasses: ["Equity", "Bonds", "Funds", "Crypto"],
tier: 1,
parseStrategy: "rss",
rateLimit: 1000,
},
// ══════════════════════════════════════════════════════════════
// Europe
// ══════════════════════════════════════════════════════════════
{
jurisdiction: "EU",
sourceName: "ESMA (欧洲证券和市场管理局)",
sourceUrl: "https://www.esma.europa.eu",
rssUrl: "https://www.esma.europa.eu/press-news/rss-feeds",
apiUrl: "https://registers.esma.europa.eu/publication/searchRegister?core=esma_registers_firds_ir",
assetClasses: ["Equity", "Bonds", "Derivatives", "Funds", "Crypto"],
tier: 1,
parseStrategy: "rss",
rateLimit: 1000,
},
{
jurisdiction: "GB",
sourceName: "FCA (英国金融行为监管局)",
sourceUrl: "https://www.fca.org.uk",
rssUrl: "https://www.fca.org.uk/news/rss.xml",
assetClasses: ["Equity", "Bonds", "RealEstate", "Funds", "Crypto"],
tier: 1,
parseStrategy: "rss",
rateLimit: 1000,
},
{
jurisdiction: "CH",
sourceName: "FINMA (瑞士金融市场监管局)",
sourceUrl: "https://www.finma.ch",
rssUrl: "https://www.finma.ch/en/news/rss/",
assetClasses: ["Equity", "Bonds", "Funds", "Crypto", "RealEstate"],
tier: 1,
parseStrategy: "rss",
rateLimit: 1000,
},
{
jurisdiction: "DE",
sourceName: "BaFin (德国联邦金融监管局)",
sourceUrl: "https://www.bafin.de",
rssUrl: "https://www.bafin.de/SiteGlobals/Functions/RSSFeed/EN/RSSNewsfeed_Veroeffentlichungen/RSSNewsfeed_Veroeffentlichungen_node.html",
assetClasses: ["Equity", "Bonds", "Funds", "RealEstate"],
tier: 1,
parseStrategy: "rss",
rateLimit: 1000,
},
{
jurisdiction: "FR",
sourceName: "AMF (法国金融市场管理局)",
sourceUrl: "https://www.amf-france.org",
rssUrl: "https://www.amf-france.org/en/rss/news",
assetClasses: ["Equity", "Bonds", "Funds", "Crypto"],
tier: 1,
parseStrategy: "rss",
rateLimit: 1000,
},
{
jurisdiction: "NL",
sourceName: "AFM (荷兰金融市场管理局)",
sourceUrl: "https://www.afm.nl",
rssUrl: "https://www.afm.nl/en/nieuws/rss",
assetClasses: ["Equity", "Bonds", "Funds"],
tier: 1,
parseStrategy: "rss",
rateLimit: 1000,
},
{
jurisdiction: "LU",
sourceName: "CSSF (卢森堡金融监管委员会)",
sourceUrl: "https://www.cssf.lu",
rssUrl: "https://www.cssf.lu/en/news/rss/",
assetClasses: ["Funds", "Bonds", "Equity"],
tier: 1,
parseStrategy: "rss",
rateLimit: 1000,
},
// ══════════════════════════════════════════════════════════════
// Asia-Pacific
// ══════════════════════════════════════════════════════════════
{
jurisdiction: "HK",
sourceName: "SFC (香港证券及期货事务监察委员会)",
sourceUrl: "https://www.sfc.hk",
rssUrl: "https://www.sfc.hk/en/rss/news",
assetClasses: ["Equity", "Bonds", "RealEstate", "Funds", "Crypto"],
tier: 1,
parseStrategy: "rss",
rateLimit: 1000,
},
{
jurisdiction: "HK",
sourceName: "HKEX (香港交易所)",
sourceUrl: "https://www.hkex.com.hk",
rssUrl: "https://www.hkex.com.hk/eng/newsconsul/hkexnews/rss/news.xml",
assetClasses: ["Equity", "Bonds", "Derivatives", "Funds"],
tier: 1,
parseStrategy: "rss",
rateLimit: 1000,
},
{
jurisdiction: "SG",
sourceName: "MAS (新加坡金融管理局)",
sourceUrl: "https://www.mas.gov.sg",
rssUrl: "https://www.mas.gov.sg/news/rss",
assetClasses: ["Equity", "Bonds", "RealEstate", "Funds", "Crypto"],
tier: 1,
parseStrategy: "rss",
rateLimit: 1000,
},
{
jurisdiction: "JP",
sourceName: "FSA (日本金融厅)",
sourceUrl: "https://www.fsa.go.jp",
rssUrl: "https://www.fsa.go.jp/en/news/rss.xml",
assetClasses: ["Equity", "Bonds", "RealEstate", "Funds"],
tier: 1,
parseStrategy: "rss",
rateLimit: 1500,
},
{
jurisdiction: "KR",
sourceName: "FSC (韩国金融委员会)",
sourceUrl: "https://www.fsc.go.kr",
rssUrl: "https://www.fsc.go.kr/eng/rss/news.xml",
assetClasses: ["Equity", "Bonds", "Funds", "Crypto"],
tier: 1,
parseStrategy: "rss",
rateLimit: 1500,
},
{
jurisdiction: "AU",
sourceName: "ASIC (澳大利亚证券和投资委员会)",
sourceUrl: "https://asic.gov.au",
rssUrl: "https://asic.gov.au/about-asic/news-centre/rss-feeds/",
assetClasses: ["Equity", "Bonds", "RealEstate", "Commodities", "Funds"],
tier: 1,
parseStrategy: "rss",
rateLimit: 1000,
},
{
// NOTE(review): this entry uses the "html" strategy, for which crawlSource
// has no fetch branch — confirm whether an HTML parser is still to be added.
jurisdiction: "CN",
sourceName: "CSRC (中国证券监督管理委员会)",
sourceUrl: "http://www.csrc.gov.cn",
rssUrl: "http://www.csrc.gov.cn/csrc/c100028/common_list.shtml",
assetClasses: ["Equity", "Bonds", "Funds", "RealEstate"],
tier: 2,
parseStrategy: "html",
rateLimit: 2000,
},
{
jurisdiction: "CN",
sourceName: "PBOC (中国人民银行)",
sourceUrl: "http://www.pbc.gov.cn",
rssUrl: "http://www.pbc.gov.cn/rss/index.xml",
assetClasses: ["Bonds", "Forex", "Crypto"],
tier: 2,
parseStrategy: "rss",
rateLimit: 2000,
},
{
jurisdiction: "IN",
sourceName: "SEBI (印度证券交易委员会)",
sourceUrl: "https://www.sebi.gov.in",
rssUrl: "https://www.sebi.gov.in/rss/news.xml",
assetClasses: ["Equity", "Bonds", "Funds", "Commodities"],
tier: 2,
parseStrategy: "rss",
rateLimit: 1500,
},
{
jurisdiction: "MY",
sourceName: "SC (马来西亚证券委员会)",
sourceUrl: "https://www.sc.com.my",
rssUrl: "https://www.sc.com.my/api/documentcentre/rss",
assetClasses: ["Equity", "Bonds", "Funds", "IslamicFinance"],
tier: 2,
parseStrategy: "rss",
rateLimit: 1500,
},
{
jurisdiction: "TH",
sourceName: "SEC Thailand (泰国证券交易委员会)",
sourceUrl: "https://www.sec.or.th",
rssUrl: "https://www.sec.or.th/EN/Pages/News/rss.aspx",
assetClasses: ["Equity", "Bonds", "Funds", "Crypto"],
tier: 2,
parseStrategy: "rss",
rateLimit: 1500,
},
// ══════════════════════════════════════════════════════════════
// Middle East
// ══════════════════════════════════════════════════════════════
{
jurisdiction: "AE",
sourceName: "DFSA (迪拜金融服务局)",
sourceUrl: "https://www.dfsa.ae",
rssUrl: "https://www.dfsa.ae/news/rss",
assetClasses: ["Equity", "Bonds", "RealEstate", "Funds", "Crypto"],
tier: 1,
parseStrategy: "rss",
rateLimit: 1000,
},
{
jurisdiction: "AE",
sourceName: "ADGM (阿布扎比全球市场)",
sourceUrl: "https://www.adgm.com",
rssUrl: "https://www.adgm.com/news/rss",
assetClasses: ["Equity", "Bonds", "Funds", "Crypto", "RealEstate"],
tier: 1,
parseStrategy: "rss",
rateLimit: 1000,
},
{
jurisdiction: "SA",
sourceName: "CMA Saudi (沙特资本市场管理局)",
sourceUrl: "https://cma.org.sa",
rssUrl: "https://cma.org.sa/en/News/Pages/rss.aspx",
assetClasses: ["Equity", "Bonds", "Funds", "RealEstate"],
tier: 2,
parseStrategy: "rss",
rateLimit: 1500,
},
{
jurisdiction: "QA",
sourceName: "QFMA (卡塔尔金融市场管理局)",
sourceUrl: "https://www.qfma.org.qa",
rssUrl: "https://www.qfma.org.qa/English/News/rss.aspx",
assetClasses: ["Equity", "Bonds", "Funds"],
tier: 2,
parseStrategy: "rss",
rateLimit: 1500,
},
{
jurisdiction: "IL",
sourceName: "ISA (以色列证券局)",
sourceUrl: "https://www.isa.gov.il",
rssUrl: "https://www.isa.gov.il/en/news/rss",
assetClasses: ["Equity", "Bonds", "Funds", "Crypto"],
tier: 1,
parseStrategy: "rss",
rateLimit: 1000,
},
// ══════════════════════════════════════════════════════════════
// South America
// ══════════════════════════════════════════════════════════════
{
jurisdiction: "BR",
sourceName: "CVM (巴西证券委员会)",
sourceUrl: "https://www.gov.br/cvm",
rssUrl: "https://www.gov.br/cvm/pt-br/assuntos/noticias/rss.xml",
assetClasses: ["Equity", "Bonds", "Funds", "Commodities"],
tier: 2,
parseStrategy: "rss",
rateLimit: 1500,
},
{
jurisdiction: "CL",
sourceName: "CMF (智利金融市场委员会)",
sourceUrl: "https://www.cmfchile.cl",
rssUrl: "https://www.cmfchile.cl/sitio/rss/noticias.xml",
assetClasses: ["Equity", "Bonds", "Funds", "Commodities"],
tier: 2,
parseStrategy: "rss",
rateLimit: 1500,
},
// ══════════════════════════════════════════════════════════════
// Africa
// ══════════════════════════════════════════════════════════════
{
jurisdiction: "ZA",
sourceName: "FSCA (南非金融行业监管局)",
sourceUrl: "https://www.fsca.co.za",
rssUrl: "https://www.fsca.co.za/News/Pages/rss.aspx",
assetClasses: ["Equity", "Bonds", "Funds", "Commodities"],
tier: 2,
parseStrategy: "rss",
rateLimit: 1500,
},
];
// ─── HTTP request utility ─────────────────────────────────────────
/**
 * Performs an HTTP(S) GET and resolves with the response body decoded as UTF-8.
 *
 * Redirects (301/302/303/307/308) are followed. The `Location` header is
 * resolved against the URL that produced it, so relative targets and
 * non-default ports are handled, and at most `maxRedirects` hops are followed
 * so a redirect loop cannot recurse without bound.
 *
 * @param url - Absolute `http://` or `https://` URL to fetch.
 * @param timeoutMs - Value passed as the request's `timeout` option, in milliseconds.
 * @param maxRedirects - Maximum number of redirects to follow before rejecting.
 * @returns The response body.
 * @throws Error when the URL is invalid or not http(s), the final status is
 *   400 or above, too many redirects occur, the request times out, or the
 *   connection fails.
 */
async function fetchUrl(url: string, timeoutMs = 15000, maxRedirects = 5): Promise<string> {
  return new Promise((resolve, reject) => {
    // A malformed URL throws here, which rejects the promise.
    const parsedUrl = new URL(url);
    if (parsedUrl.protocol !== "https:" && parsedUrl.protocol !== "http:") {
      reject(new Error(`Unsupported protocol: ${url}`));
      return;
    }
    const isHttps = parsedUrl.protocol === "https:";
    const protocol = isHttps ? https : http;
    const options = {
      hostname: parsedUrl.hostname,
      port: parsedUrl.port || (isHttps ? 443 : 80),
      path: parsedUrl.pathname + parsedUrl.search,
      method: "GET",
      headers: {
        "User-Agent": "NAC-Regulatory-Crawler/1.0 (NAC Public Chain Compliance; https://newassetchain.io)",
        "Accept": "application/rss+xml, application/atom+xml, application/xml, text/xml, text/html, application/json",
        "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8",
        "Cache-Control": "no-cache",
      },
      timeout: timeoutMs,
    };
    const req = protocol.request(options, (res) => {
      const status = res.statusCode ?? 0;
      const location = res.headers.location;
      // Follow redirects.
      if ([301, 302, 303, 307, 308].includes(status) && location) {
        // Discard the redirect body so the socket is released.
        res.resume();
        if (maxRedirects <= 0) {
          reject(new Error(`Too many redirects: ${url}`));
          return;
        }
        let redirectUrl: string;
        try {
          // Resolves absolute, root-relative and path-relative locations,
          // keeping the port of the current URL.
          redirectUrl = new URL(location, parsedUrl).toString();
        } catch {
          reject(new Error(`Invalid redirect location "${location}" from ${url}`));
          return;
        }
        fetchUrl(redirectUrl, timeoutMs, maxRedirects - 1).then(resolve, reject);
        return;
      }
      if (status >= 400) {
        res.resume();
        reject(new Error(`HTTP ${status}: ${url}`));
        return;
      }
      const chunks: Buffer[] = [];
      res.on("data", (chunk: Buffer) => chunks.push(chunk));
      res.on("end", () => resolve(Buffer.concat(chunks).toString("utf-8")));
      res.on("error", reject);
    });
    req.on("timeout", () => {
      // Destroying with an error routes the failure through the "error"
      // handler below, so the promise is rejected exactly once.
      req.destroy(new Error(`Timeout fetching: ${url}`));
    });
    req.on("error", reject);
    req.end();
  });
}
// ─── RSS/Atom parser ──────────────────────────────────────────────
/**
 * One entry of an RSS 2.0 or Atom feed, reduced to the fields the crawler uses.
 */
interface RSSItem {
  /** Entry title. */
  title: string;
  /** URL of the entry. */
  link: string;
  /** Entry summary / description text. */
  description: string;
  /** Publication or update date as it appeared in the feed. */
  pubDate: string;
  /** Category of the entry, when the feed provides one. */
  category?: string;
}
/**
 * Parses an RSS 2.0 or Atom document into at most 50 feed items.
 *
 * The document is treated as Atom when it contains `<feed` and declares the
 * Atom default namespace (single or double quoted); otherwise it is scanned
 * for RSS `<item>` elements. Entries lacking a title or a link are skipped.
 * Titles and descriptions are passed through `cleanHtml`; descriptions are cut
 * to 500 characters. Links have their XML character escapes (such as `&amp;`)
 * decoded so the stored URL is usable.
 *
 * @param xmlContent - Raw feed document.
 * @returns The parsed items, in document order.
 */
function parseRSSFeed(xmlContent: string): RSSItem[] {
  const MAX_ITEMS = 50;
  const xmlEscapes = new Map<string, string>([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
  ]);
  // Single pass, so "&amp;lt;" becomes "&lt;" rather than "<".
  const decodeLink = (raw: string): string =>
    raw.trim().replace(/&(amp|lt|gt|quot|apos);/g, (whole: string, name: string) => xmlEscapes.get(name) ?? whole);

  const items: RSSItem[] = [];
  const isAtom =
    xmlContent.includes("<feed") &&
    /xmlns=["']http:\/\/www\.w3\.org\/2005\/Atom["']/.test(xmlContent);

  if (isAtom) {
    // Atom format
    for (const entryMatch of xmlContent.matchAll(/<entry(?:\s[^>]*)?>([\s\S]*?)<\/entry>/gi)) {
      if (items.length >= MAX_ITEMS) break;
      const entry = entryMatch[1] ?? "";
      const title = extractXmlTag(entry, "title") || "";
      const link = decodeLink(extractAtomLink(entry));
      const summary = extractXmlTag(entry, "summary") || extractXmlTag(entry, "content") || "";
      const updated = extractXmlTag(entry, "updated") || extractXmlTag(entry, "published") || "";
      if (title && link) {
        items.push({
          title: cleanHtml(title),
          link,
          description: cleanHtml(summary).slice(0, 500),
          pubDate: updated,
        });
      }
    }
  } else {
    // RSS 2.0 format
    for (const itemMatch of xmlContent.matchAll(/<item(?:\s[^>]*)?>([\s\S]*?)<\/item>/gi)) {
      if (items.length >= MAX_ITEMS) break;
      const item = itemMatch[1] ?? "";
      const title = extractXmlTag(item, "title") || "";
      const link = decodeLink(extractXmlTag(item, "link") || extractXmlTag(item, "guid") || "");
      const description = extractXmlTag(item, "description") || "";
      const pubDate = extractXmlTag(item, "pubDate") || extractXmlTag(item, "dc:date") || "";
      const category = extractXmlTag(item, "category") || "";
      if (title && link) {
        items.push({
          title: cleanHtml(title),
          link,
          description: cleanHtml(description).slice(0, 500),
          pubDate,
          category: category || undefined,
        });
      }
    }
  }
  return items;
}
/**
 * Returns the trimmed text content of the first `<tag>…</tag>` element in `xml`.
 *
 * The tag name is matched exactly (case-insensitively), so asking for
 * `content` does not pick up `<contentType>`. A CDATA section wrapping the
 * content is unwrapped, with or without whitespace around it. Self-closing
 * elements are not treated as opening tags.
 *
 * @param xml - XML fragment to search.
 * @param tag - Element name, optionally with a prefix such as `dc:date`.
 * @returns The element's content, or an empty string when it is not found.
 */
function extractXmlTag(xml: string, tag: string): string {
  // Escape regex metacharacters so the tag name is matched literally.
  const name = tag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // Opening tag: bare, or with attributes that do not end in "/" (self-closing).
  const open = `<${name}(?:\\s[^>]*[^>/]|\\s*)>`;
  const close = `<\\/${name}\\s*>`;
  const pattern = new RegExp(
    `${open}(?:\\s*<!\\[CDATA\\[([\\s\\S]*?)\\]\\]>\\s*|([\\s\\S]*?))${close}`,
    "i"
  );
  const match = pattern.exec(xml);
  if (!match) return "";
  // Group 1 is CDATA content, group 2 is plain content.
  return (match[1] ?? match[2] ?? "").trim();
}
/**
 * Returns the URL of an Atom entry.
 *
 * Among the entry's `<link href="…">` elements, one with `rel="alternate"` or
 * with no `rel` attribute is preferred, so a leading `rel="self"` or
 * `rel="enclosure"` link is not mistaken for the entry's page. When no such
 * link exists, the first link with an `href` is used; when no link has an
 * `href`, the text content of `<link>…</link>` is used.
 *
 * @param xml - Inner XML of an Atom `<entry>`.
 * @returns The trimmed link, or an empty string when none is present.
 */
function extractAtomLink(xml: string): string {
  const linkTags = xml.match(/<link(?=[\s/>])[^>]*>/gi) ?? [];
  let firstHref = "";
  for (const tagText of linkTags) {
    const hrefMatch = /\bhref\s*=\s*(?:"([^"]*)"|'([^']*)')/i.exec(tagText);
    const href = (hrefMatch?.[1] ?? hrefMatch?.[2] ?? "").trim();
    if (!href) continue;
    const relMatch = /\brel\s*=\s*(?:"([^"]*)"|'([^']*)')/i.exec(tagText);
    // A link without "rel" is treated as "alternate".
    const rel = (relMatch?.[1] ?? relMatch?.[2] ?? "alternate").trim().toLowerCase();
    if (rel === "alternate") return href;
    if (!firstHref) firstHref = href;
  }
  if (firstHref) return firstHref;
  const textMatch = /<link[^>]*>([^<]+)<\/link>/i.exec(xml);
  return (textMatch?.[1] ?? "").trim();
}
/**
 * Converts an HTML/XML fragment to plain text.
 *
 * Steps: remove tags, decode character references in a single pass (so
 * `&amp;lt;` becomes the text `&lt;`, not `<`), remove tag-like markup that was
 * only revealed by decoding (feeds commonly ship descriptions as
 * entity-escaped HTML), then collapse whitespace and trim.
 *
 * Decoded references: `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&nbsp;`
 * and numeric forms such as `&#39;` or `&#x27;`. Unknown references are left
 * unchanged.
 *
 * @param html - Fragment to clean.
 * @returns Plain text with single spaces between words.
 */
function cleanHtml(html: string): string {
  const namedEntities = new Map<string, string>([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
    ["nbsp", " "],
  ]);
  const decoded = html
    .replace(/<[^>]+>/g, " ")
    .replace(/&(#x[0-9a-fA-F]+|#[0-9]+|[a-zA-Z][a-zA-Z0-9]*);/g, (whole: string, body: string) => {
      if (body.startsWith("#")) {
        const isHex = body.charAt(1) === "x";
        const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
        // String.fromCodePoint throws on out-of-range values; keep those as-is.
        return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
      }
      return namedEntities.get(body) ?? whole;
    });
  return decoded
    // Only "<" directly followed by a letter (or "/letter") is treated as
    // markup, so decoded text such as "a < b" is preserved.
    .replace(/<\/?[a-zA-Z][^>]*>/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}
// ─── Rule extractor ───────────────────────────────────────────────
/** Keywords that mark a feed item as regulation-related. All lowercase. */
const RELEVANCE_KEYWORDS: readonly string[] = [
  "regulation", "rule", "guidance", "circular", "notice", "directive",
  "compliance", "requirement", "framework", "standard", "policy",
  "tokeniz", "digital asset", "crypto", "blockchain", "rwa", "real world asset",
  "securities", "license", "registration", "approval", "permit",
  "监管", "规则", "指引", "通知", "合规", "要求", "框架", "标准",
  "代币化", "数字资产", "加密", "区块链", "证券", "许可", "注册",
];

/** Asset classes with their keywords, in priority order (first match wins). */
const ASSET_CLASS_KEYWORDS: ReadonlyArray<readonly [string, readonly string[]]> = [
  ["RealEstate", ["real estate", "property", "reits", "mortgage", "land", "房地产", "不动产", "房产", "土地"]],
  ["Equity", ["equity", "stock", "share", "ipo", "listing", "股权", "股票", "股份", "上市"]],
  ["Bonds", ["bond", "debt", "fixed income", "treasury", "debenture", "债券", "债务", "国债", "票据"]],
  ["Commodities", ["commodity", "gold", "silver", "oil", "gas", "wheat", "大宗商品", "黄金", "白银", "石油"]],
  ["Funds", ["fund", "etf", "mutual fund", "hedge fund", "基金", "投资基金"]],
  ["Crypto", ["crypto", "bitcoin", "ethereum", "token", "digital asset", "加密货币", "代币", "数字资产"]],
  ["CarbonCredits", ["carbon", "emission", "esg", "green", "碳", "排放", "绿色"]],
  ["IP", ["intellectual property", "patent", "copyright", "trademark", "知识产权", "专利", "版权", "商标"]],
  ["Infrastructure", ["infrastructure", "highway", "railway", "airport", "基础设施", "高速公路", "铁路", "机场"]],
];

/**
 * Reports whether lowercase `text` contains `keyword`.
 *
 * A keyword that starts with an ASCII letter or digit must begin at the start
 * of a word, so "land" matches "land registry" and "landlord" but not
 * "thailand", and "rwa" does not match "forward". Prefix matching is kept, so
 * "tokeniz" still matches "tokenization". Other keywords (the Chinese ones)
 * use plain substring matching because that script has no word separators.
 */
function containsKeyword(text: string, keyword: string): boolean {
  const needsWordStart = /^[a-z0-9]/.test(keyword);
  let index = text.indexOf(keyword);
  while (index !== -1) {
    if (!needsWordStart || index === 0 || !/[a-z0-9]/.test(text.charAt(index - 1))) {
      return true;
    }
    index = text.indexOf(keyword, index + 1);
  }
  return false;
}

/**
 * Builds a structured rule from a feed item using keyword matching on the
 * item's title and description.
 *
 * The item is ignored (returns `null`) when none of the relevance keywords
 * occurs. Otherwise the asset class is the first class in
 * `ASSET_CLASS_KEYWORDS` with a matching keyword ("General" when none
 * matches), and the rule type is the first matching category in the order
 * ownership, trading, tax, AML/KYC ("compliance_general" when none matches).
 *
 * @param item - Parsed feed item.
 * @param source - Source the item came from; supplies jurisdiction, name and tier.
 * @returns The extracted rule fields, or `null` for an irrelevant item.
 */
function extractRuleFromRSSItem(
  item: RSSItem,
  source: CrawlerSource
): Partial<CrawledRule> | null {
  const text = `${item.title} ${item.description}`.toLowerCase();

  // Skip items unrelated to RWA / regulation.
  if (!RELEVANCE_KEYWORDS.some((kw) => containsKeyword(text, kw))) return null;

  // Identify the asset class.
  let assetClass = "General";
  for (const [cls, keywords] of ASSET_CLASS_KEYWORDS) {
    if (keywords.some((kw) => containsKeyword(text, kw))) {
      assetClass = cls;
      break;
    }
  }

  // Identify the rule type. Latin alternatives are anchored at a word start
  // so that e.g. "entitled" does not count as "title".
  let ruleType: CrawledRule["ruleType"] = "compliance_general";
  if (/\b(?:ownership|title|deed|register)|登记|所有权|产权|确权/.test(text)) {
    ruleType = "ownership_verification";
  } else if (/\b(?:trading|settlement|transaction|exchange)|交易|结算|买卖/.test(text)) {
    ruleType = "trading_rules";
  } else if (/\b(?:tax|duty|stamp|withholding)|税|关税|印花税|预扣税/.test(text)) {
    ruleType = "tax_rules";
  } else if (/\b(?:kyc|aml|anti.money|fatf)|反洗钱|客户尽职/.test(text)) {
    ruleType = "aml_kyc";
  }

  // Generate the rule ID.
  const ruleId = `${source.jurisdiction}-${assetClass.toUpperCase().slice(0, 4)}-CRAWL-${Date.now()}-${Math.random().toString(36).slice(2, 6)}`;

  // An unparseable pubDate would yield an Invalid Date; fall back to now.
  const published = item.pubDate ? new Date(item.pubDate) : undefined;
  const lastUpdated = published && !Number.isNaN(published.getTime()) ? published : new Date();

  return {
    ruleId,
    jurisdiction: source.jurisdiction,
    assetClass,
    ruleType,
    ruleName: item.title.slice(0, 100),
    content: `${item.title}\n\n${item.description}`,
    legalBasis: `${source.sourceName} - ${item.pubDate || "最新发布"}`,
    sourceUrl: item.link,
    sourceName: source.sourceName,
    crawledAt: new Date(),
    lastUpdated,
    tier: source.tier,
    tags: [source.jurisdiction, assetClass, ruleType, "auto-crawled"],
    complianceLevel: "informational",
  };
}
// ─── Crawler core logic ───────────────────────────────────────────
/**
 * MongoDB connection string, taken from the NAC_MONGO_URL environment variable.
 *
 * SECURITY: the previous fallback embedded a database root password in source
 * code. Credentials must be supplied only through NAC_MONGO_URL; the password
 * that was committed here should be rotated. Without the variable, the
 * fallback is a local connection string that carries no credentials.
 */
const MONGO_URL = process.env.NAC_MONGO_URL || "mongodb://localhost:27017/nac_knowledge_engine";
/** Database that holds the crawled rules. */
const DB_NAME = "nac_knowledge_engine";
/** Collection that holds the crawled rules. */
const COLLECTION_NAME = "compliance_rules";
/** Returns a printable message for a caught value of unknown type. */
function describeCrawlError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Downloads and parses the items of one source according to its parse strategy.
 *
 * - "rss": fetches `rssUrl`; on failure waits `rateLimit` ms (when set) and
 *   then tries to parse the source's home page as a feed.
 * - "api": fetches `apiUrl` and maps `hits.hits[]._source` records to items.
 * - anything else, or a missing URL for the chosen strategy: records an error.
 *
 * Failures are appended to `errors` instead of being thrown.
 */
async function fetchSourceItems(source: CrawlerSource, errors: string[]): Promise<RSSItem[]> {
  if (source.parseStrategy === "rss" && source.rssUrl) {
    try {
      const feedXml = await fetchUrl(source.rssUrl);
      const feedItems = parseRSSFeed(feedXml);
      console.log(`[Crawler] ${source.sourceName}: 获取到 ${feedItems.length} 条 RSS 条目`);
      return feedItems;
    } catch (e) {
      errors.push(`RSS 获取失败: ${describeCrawlError(e)}`);
    }
    // Fall back to the home page; throttle between the two HTTP requests.
    if (source.rateLimit) {
      await new Promise<void>((resolve) => setTimeout(resolve, source.rateLimit));
    }
    try {
      return parseRSSFeed(await fetchUrl(source.sourceUrl));
    } catch (e) {
      errors.push(`主页降级获取失败: ${describeCrawlError(e)}`);
      return [];
    }
  }

  if (source.parseStrategy === "api" && source.apiUrl) {
    try {
      const payload = JSON.parse(await fetchUrl(source.apiUrl)) as { hits?: { hits?: unknown } } | null;
      const hits = payload?.hits?.hits;
      if (!Array.isArray(hits)) return [];
      return hits.map((hit: Record<string, unknown>) => {
        const src = (hit._source || {}) as Record<string, unknown>;
        return {
          title: String(src.display_names || src.entity_name || src.file_date || ""),
          link: `https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=${src.entity_id || ""}`,
          description: String(src.period_of_report || src.file_date || ""),
          pubDate: String(src.file_date || ""),
        };
      });
    } catch (e) {
      errors.push(`API 获取失败: ${describeCrawlError(e)}`);
      return [];
    }
  }

  // Previously such sources silently produced zero items.
  errors.push(`不支持的解析策略或缺少对应的 URL: ${source.parseStrategy}`);
  return [];
}

/**
 * Crawls one source and stores the extracted rules in MongoDB.
 *
 * Items are keyed by `sourceUrl`: an item whose URL is already stored updates
 * that document (keeping its original `ruleId`), any other item is inserted
 * with a `createdAt` timestamp. Errors are collected in the result rather
 * than thrown.
 *
 * @param source - Source to crawl.
 * @returns Counts of found/inserted/updated rules plus any error messages.
 */
async function crawlSource(source: CrawlerSource): Promise<CrawlerResult> {
  const result: CrawlerResult = {
    jurisdiction: source.jurisdiction,
    sourceName: source.sourceName,
    rulesFound: 0,
    rulesInserted: 0,
    rulesUpdated: 0,
    errors: [],
    crawledAt: new Date(),
  };
  let client: MongoClient | undefined;
  try {
    // Constructed inside the try so an invalid connection string is reported
    // in the result instead of rejecting the whole crawl.
    client = new MongoClient(MONGO_URL);
    await client.connect();
    const collection = client.db(DB_NAME).collection(COLLECTION_NAME);

    const items = await fetchSourceItems(source, result.errors);
    result.rulesFound = items.length;

    for (const item of items) {
      const rule = extractRuleFromRSSItem(item, source);
      if (!rule) continue;
      try {
        // Check whether the rule already exists (keyed by sourceUrl).
        const existing = await collection.findOne({ sourceUrl: rule.sourceUrl });
        if (existing) {
          // Keep the stored ruleId: the extractor generates a new random ID
          // on every run, and overwriting it would break references.
          const refreshed: Partial<CrawledRule> = { ...rule };
          delete refreshed.ruleId;
          await collection.updateOne(
            { sourceUrl: rule.sourceUrl },
            { $set: { ...refreshed, lastUpdated: new Date() } }
          );
          result.rulesUpdated++;
        } else {
          await collection.insertOne({ ...rule, createdAt: new Date() });
          result.rulesInserted++;
        }
      } catch (e) {
        result.errors.push(`规则写入失败: ${describeCrawlError(e)}`);
      }
      // No per-rule delay: rateLimit throttles HTTP requests, and no request
      // is made while rules are being written.
    }
  } catch (e) {
    result.errors.push(`连接失败: ${describeCrawlError(e)}`);
  } finally {
    if (client) {
      try {
        await client.close();
      } catch (e) {
        result.errors.push(`关闭连接失败: ${describeCrawlError(e)}`);
      }
    }
  }
  return result;
}
/**
 * Crawls the configured sources one after another and returns one result per
 * source.
 *
 * @param options - Optional filters:
 *   `jurisdictions` keeps only sources whose jurisdiction is listed (ignored
 *   when empty); `tier` keeps sources whose tier is less than or equal to the
 *   given value; `dryRun` skips the actual crawl and reports a placeholder
 *   result for each selected source.
 * @returns The per-source results, in crawl order.
 */
export async function runFullCrawl(options?: {
  jurisdictions?: string[];
  tier?: number;
  dryRun?: boolean;
}): Promise<CrawlerResult[]> {
  const jurisdictionFilter = options?.jurisdictions ?? [];
  const tierCeiling = options?.tier;
  const isDryRun = Boolean(options?.dryRun);

  // Apply the filters.
  let selected = REGULATORY_SOURCES;
  if (jurisdictionFilter.length > 0) {
    selected = selected.filter(({ jurisdiction }) => jurisdictionFilter.includes(jurisdiction));
  }
  if (tierCeiling !== undefined) {
    selected = selected.filter(({ tier }) => tier <= tierCeiling);
  }

  console.log(`[Crawler] 开始爬取 ${selected.length} 个数据源...`);

  const collected: CrawlerResult[] = [];
  for (const current of selected) {
    console.log(`[Crawler] 正在爬取: ${current.sourceName} (${current.jurisdiction})`);

    if (isDryRun) {
      const placeholder: CrawlerResult = {
        jurisdiction: current.jurisdiction,
        sourceName: current.sourceName,
        rulesFound: 0,
        rulesInserted: 0,
        rulesUpdated: 0,
        errors: ["DRY_RUN: 跳过实际爬取"],
        crawledAt: new Date(),
      };
      collected.push(placeholder);
      continue;
    }

    const outcome = await crawlSource(current);
    collected.push(outcome);
    console.log(`[Crawler] ${current.sourceName}: 找到 ${outcome.rulesFound} 条, 新增 ${outcome.rulesInserted} 条, 更新 ${outcome.rulesUpdated}`);
    if (outcome.errors.length > 0) {
      console.warn(`[Crawler] ${current.sourceName} 错误: ${outcome.errors.join("; ")}`);
    }

    // Pause between sources.
    await new Promise((done) => setTimeout(done, 500));
  }
  return collected;
}
/**
 * Quick mode: runs the crawl with the tier filter set to 1.
 *
 * @returns The per-source results reported by `runFullCrawl`.
 */
export async function runTier1Crawl(): Promise<CrawlerResult[]> {
  const tierOneOnly = { tier: 1 };
  return runFullCrawl(tierOneOnly);
}
/**
 * Lists the configured crawler sources without crawling anything.
 *
 * @param tier - When given, only sources whose tier is less than or equal to
 *   this value are returned.
 * @returns The matching sources; without `tier`, the registry array itself.
 */
export function getCrawlerSources(tier?: number): CrawlerSource[] {
  return tier === undefined
    ? REGULATORY_SOURCES
    : REGULATORY_SOURCES.filter((candidate) => candidate.tier <= tier);
}
/**
 * Summarises the source registry: total number of sources, how many are
 * tier 1 and tier 2, and the number of distinct jurisdictions and asset-class
 * labels.
 *
 * NOTE(review): every distinct `assetClasses` string is counted, including
 * the "ALL" label used by one source — confirm whether that is intended.
 */
export function getCrawlerStats(): {
  totalSources: number;
  tier1Sources: number;
  tier2Sources: number;
  jurisdictionCount: number;
  assetClassCount: number;
} {
  let tier1Sources = 0;
  let tier2Sources = 0;
  const seenJurisdictions = new Set<string>();
  const seenAssetClasses = new Set<string>();

  for (const entry of REGULATORY_SOURCES) {
    if (entry.tier === 1) {
      tier1Sources += 1;
    } else if (entry.tier === 2) {
      tier2Sources += 1;
    }
    seenJurisdictions.add(entry.jurisdiction);
    for (const label of entry.assetClasses) {
      seenAssetClasses.add(label);
    }
  }

  return {
    totalSources: REGULATORY_SOURCES.length,
    tier1Sources,
    tier2Sources,
    jurisdictionCount: seenJurisdictions.size,
    assetClassCount: seenAssetClasses.size,
  };
}