822 lines
28 KiB
TypeScript
822 lines
28 KiB
TypeScript
/**
 * NAC Public Chain - Regulatory Rules Auto-Crawler
 *
 * Coverage:
 * - Tier 1 jurisdictions (20): US/CA/EU/GB/CH/DE/FR/NL/IE/LU/JP/KR/SG/HK/AU/AE/IL
 * - Tier 2 jurisdictions (25): CN/TW/MY/TH/IN/IT/ES/TR/SA/QA/KW/BH/BR/CL/AR/ZA, etc.
 * - Tier 3 jurisdictions (15): ID/PH/VN/PK/BD/OM/CO/PE/VE/UY/PY/NG/EG/KE/MA/RU/KZ/UA
 *
 * Asset classes: 20 major categories, 100+ sub-categories (GNACS standard)
 *
 * Crawl strategy:
 * 1. Official RSS/Atom feeds (real-time updates)
 * 2. Official API endpoints (SEC EDGAR, ESMA FIRDS, etc.)
 * 3. Official website HTML parsing (used when no API is available)
 * 4. Parse and extract → structured rules → write to MongoDB
 */
|
||
|
||
import https from "https";
|
||
import http from "http";
|
||
import { URL } from "url";
|
||
import { MongoClient } from "mongodb";
|
||
|
||
// ─── 类型定义 ─────────────────────────────────────────────────────
|
||
|
||
/**
 * A structured compliance rule as stored in the rules collection.
 *
 * Instances produced by this module come from `extractRuleFromRSSItem`,
 * which fills every required field and leaves the two optional
 * requirement groups unset.
 */
export interface CrawledRule {
  /** Rule identifier; the crawler generates it as `<jurisdiction>-<asset prefix>-CRAWL-<timestamp>-<random>`. */
  ruleId: string;
  /** Jurisdiction code copied from the source definition (e.g. "US", "HK"). */
  jurisdiction: string;
  /** Asset class label (e.g. "Equity", "RealEstate", or "General" when nothing matched). */
  assetClass: string;
  /** Category of the rule. */
  ruleType: "ownership_verification" | "trading_rules" | "compliance_general" | "tax_rules" | "aml_kyc";
  /** Short display name of the rule. */
  ruleName: string;
  /** Full rule text. */
  content: string;
  /** Legal basis / citation string. */
  legalBasis: string;
  /** Optional ownership-verification details. */
  ownershipRequirements?: {
    proofDocuments?: string[];
    registrationAuthority?: string;
    transferMechanism?: string;
    chainRecognition?: string;
    foreignOwnershipRestriction?: string;
    disputeResolution?: string;
  };
  /** Optional trading details. */
  tradingRequirements?: {
    minimumInvestor?: string;
    settlementPeriod?: string;
    allowedCurrencies?: string[];
    tradingPlatform?: string;
    reportingRequirements?: string;
  };
  /** URL of the item the rule was extracted from; used as the de-duplication key when storing. */
  sourceUrl: string;
  /** Display name of the publishing authority. */
  sourceName: string;
  /** When the crawler extracted the rule. */
  crawledAt: Date;
  /** Last update timestamp of the rule. */
  lastUpdated: Date;
  /** Tier of the source the rule came from. */
  tier: number;
  /** Free-form tags. */
  tags: string[];
  /** How binding the rule is. */
  complianceLevel: "mandatory" | "recommended" | "informational";
}
|
||
|
||
/** Static description of one regulator / data source the crawler can visit. */
export interface CrawlerSource {
  /** Jurisdiction code (e.g. "US", "EU"). */
  jurisdiction: string;
  /** Display name of the authority. */
  sourceName: string;
  /** Home page of the authority; fetched as a fallback when the RSS request fails. */
  sourceUrl: string;
  /** Feed URL used when `parseStrategy` is "rss". */
  rssUrl?: string;
  /** API URL used when `parseStrategy` is "api". */
  apiUrl?: string;
  /** Asset classes this source is relevant for. */
  assetClasses: string[];
  /** Source tier; tier filters select sources with `tier <= requested tier`. */
  tier: number;
  /** How the source should be fetched and parsed. */
  parseStrategy: "rss" | "api" | "html" | "json";
  /** Delay in milliseconds between requests. */
  rateLimit?: number;
}
|
||
|
||
/** Outcome summary of crawling a single source. */
export interface CrawlerResult {
  /** Jurisdiction code of the crawled source. */
  jurisdiction: string;
  /** Display name of the crawled source. */
  sourceName: string;
  /** Number of feed/API items retrieved from the source. */
  rulesFound: number;
  /** Number of rules newly inserted into the collection. */
  rulesInserted: number;
  /** Number of already-stored rules that were updated. */
  rulesUpdated: number;
  /** Human-readable error (or dry-run) messages collected during the crawl. */
  errors: string[];
  /** When the crawl of this source started. */
  crawledAt: Date;
}
|
||
|
||
// ─── 官方数据源清单 ───────────────────────────────────────────────
|
||
|
||
/**
 * Catalogue of official regulator sources the crawler visits, grouped by region.
 *
 * Entries are data only; how an entry is fetched is decided by its
 * `parseStrategy` together with `rssUrl` / `apiUrl`.
 */
export const REGULATORY_SOURCES: CrawlerSource[] = [
  // ══════════════════════════════════════════════════════════════
  // North America
  // ══════════════════════════════════════════════════════════════
  {
    jurisdiction: "US",
    sourceName: "SEC (美国证券交易委员会)",
    sourceUrl: "https://www.sec.gov",
    rssUrl: "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&type=&dateb=&owner=include&count=40&search_text=&output=atom",
    apiUrl: "https://efts.sec.gov/LATEST/search-index?q=%22RWA%22+%22tokenization%22&dateRange=custom&startdt=2023-01-01&forms=S-1,8-K",
    assetClasses: ["Equity", "Bonds", "RealEstate", "Commodities", "Funds"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "US",
    sourceName: "FinCEN (美国金融犯罪执法网络)",
    sourceUrl: "https://www.fincen.gov",
    rssUrl: "https://www.fincen.gov/news/rss.xml",
    assetClasses: ["ALL"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "CA",
    sourceName: "CSA (加拿大证券管理局)",
    sourceUrl: "https://www.securities-administrators.ca",
    rssUrl: "https://www.securities-administrators.ca/news/rss",
    assetClasses: ["Equity", "Bonds", "Funds", "Crypto"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  // ══════════════════════════════════════════════════════════════
  // Europe
  // ══════════════════════════════════════════════════════════════
  {
    jurisdiction: "EU",
    sourceName: "ESMA (欧洲证券和市场管理局)",
    sourceUrl: "https://www.esma.europa.eu",
    rssUrl: "https://www.esma.europa.eu/press-news/rss-feeds",
    apiUrl: "https://registers.esma.europa.eu/publication/searchRegister?core=esma_registers_firds_ir",
    assetClasses: ["Equity", "Bonds", "Derivatives", "Funds", "Crypto"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "GB",
    sourceName: "FCA (英国金融行为监管局)",
    sourceUrl: "https://www.fca.org.uk",
    rssUrl: "https://www.fca.org.uk/news/rss.xml",
    assetClasses: ["Equity", "Bonds", "RealEstate", "Funds", "Crypto"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "CH",
    sourceName: "FINMA (瑞士金融市场监管局)",
    sourceUrl: "https://www.finma.ch",
    rssUrl: "https://www.finma.ch/en/news/rss/",
    assetClasses: ["Equity", "Bonds", "Funds", "Crypto", "RealEstate"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "DE",
    sourceName: "BaFin (德国联邦金融监管局)",
    sourceUrl: "https://www.bafin.de",
    rssUrl: "https://www.bafin.de/SiteGlobals/Functions/RSSFeed/EN/RSSNewsfeed_Veroeffentlichungen/RSSNewsfeed_Veroeffentlichungen_node.html",
    assetClasses: ["Equity", "Bonds", "Funds", "RealEstate"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "FR",
    sourceName: "AMF (法国金融市场管理局)",
    sourceUrl: "https://www.amf-france.org",
    rssUrl: "https://www.amf-france.org/en/rss/news",
    assetClasses: ["Equity", "Bonds", "Funds", "Crypto"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "NL",
    sourceName: "AFM (荷兰金融市场管理局)",
    sourceUrl: "https://www.afm.nl",
    rssUrl: "https://www.afm.nl/en/nieuws/rss",
    assetClasses: ["Equity", "Bonds", "Funds"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "LU",
    sourceName: "CSSF (卢森堡金融监管委员会)",
    sourceUrl: "https://www.cssf.lu",
    rssUrl: "https://www.cssf.lu/en/news/rss/",
    assetClasses: ["Funds", "Bonds", "Equity"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  // ══════════════════════════════════════════════════════════════
  // Asia-Pacific
  // ══════════════════════════════════════════════════════════════
  {
    jurisdiction: "HK",
    sourceName: "SFC (香港证券及期货事务监察委员会)",
    sourceUrl: "https://www.sfc.hk",
    rssUrl: "https://www.sfc.hk/en/rss/news",
    assetClasses: ["Equity", "Bonds", "RealEstate", "Funds", "Crypto"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "HK",
    sourceName: "HKEX (香港交易所)",
    sourceUrl: "https://www.hkex.com.hk",
    rssUrl: "https://www.hkex.com.hk/eng/newsconsul/hkexnews/rss/news.xml",
    assetClasses: ["Equity", "Bonds", "Derivatives", "Funds"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "SG",
    sourceName: "MAS (新加坡金融管理局)",
    sourceUrl: "https://www.mas.gov.sg",
    rssUrl: "https://www.mas.gov.sg/news/rss",
    assetClasses: ["Equity", "Bonds", "RealEstate", "Funds", "Crypto"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "JP",
    sourceName: "FSA (日本金融厅)",
    sourceUrl: "https://www.fsa.go.jp",
    rssUrl: "https://www.fsa.go.jp/en/news/rss.xml",
    assetClasses: ["Equity", "Bonds", "RealEstate", "Funds"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
  {
    jurisdiction: "KR",
    sourceName: "FSC (韩国金融委员会)",
    sourceUrl: "https://www.fsc.go.kr",
    rssUrl: "https://www.fsc.go.kr/eng/rss/news.xml",
    assetClasses: ["Equity", "Bonds", "Funds", "Crypto"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
  {
    jurisdiction: "AU",
    sourceName: "ASIC (澳大利亚证券和投资委员会)",
    sourceUrl: "https://asic.gov.au",
    rssUrl: "https://asic.gov.au/about-asic/news-centre/rss-feeds/",
    assetClasses: ["Equity", "Bonds", "RealEstate", "Commodities", "Funds"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "CN",
    sourceName: "CSRC (中国证券监督管理委员会)",
    sourceUrl: "http://www.csrc.gov.cn",
    rssUrl: "http://www.csrc.gov.cn/csrc/c100028/common_list.shtml",
    assetClasses: ["Equity", "Bonds", "Funds", "RealEstate"],
    tier: 2,
    parseStrategy: "html",
    rateLimit: 2000,
  },
  {
    jurisdiction: "CN",
    sourceName: "PBOC (中国人民银行)",
    sourceUrl: "http://www.pbc.gov.cn",
    rssUrl: "http://www.pbc.gov.cn/rss/index.xml",
    assetClasses: ["Bonds", "Forex", "Crypto"],
    tier: 2,
    parseStrategy: "rss",
    rateLimit: 2000,
  },
  {
    jurisdiction: "IN",
    sourceName: "SEBI (印度证券交易委员会)",
    sourceUrl: "https://www.sebi.gov.in",
    rssUrl: "https://www.sebi.gov.in/rss/news.xml",
    assetClasses: ["Equity", "Bonds", "Funds", "Commodities"],
    tier: 2,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
  {
    jurisdiction: "MY",
    sourceName: "SC (马来西亚证券委员会)",
    sourceUrl: "https://www.sc.com.my",
    rssUrl: "https://www.sc.com.my/api/documentcentre/rss",
    assetClasses: ["Equity", "Bonds", "Funds", "IslamicFinance"],
    tier: 2,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
  {
    jurisdiction: "TH",
    sourceName: "SEC Thailand (泰国证券交易委员会)",
    sourceUrl: "https://www.sec.or.th",
    rssUrl: "https://www.sec.or.th/EN/Pages/News/rss.aspx",
    assetClasses: ["Equity", "Bonds", "Funds", "Crypto"],
    tier: 2,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
  // ══════════════════════════════════════════════════════════════
  // Middle East
  // ══════════════════════════════════════════════════════════════
  {
    jurisdiction: "AE",
    sourceName: "DFSA (迪拜金融服务局)",
    sourceUrl: "https://www.dfsa.ae",
    rssUrl: "https://www.dfsa.ae/news/rss",
    assetClasses: ["Equity", "Bonds", "RealEstate", "Funds", "Crypto"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "AE",
    sourceName: "ADGM (阿布扎比全球市场)",
    sourceUrl: "https://www.adgm.com",
    rssUrl: "https://www.adgm.com/news/rss",
    assetClasses: ["Equity", "Bonds", "Funds", "Crypto", "RealEstate"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "SA",
    sourceName: "CMA Saudi (沙特资本市场管理局)",
    sourceUrl: "https://cma.org.sa",
    rssUrl: "https://cma.org.sa/en/News/Pages/rss.aspx",
    assetClasses: ["Equity", "Bonds", "Funds", "RealEstate"],
    tier: 2,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
  {
    jurisdiction: "QA",
    sourceName: "QFMA (卡塔尔金融市场管理局)",
    sourceUrl: "https://www.qfma.org.qa",
    rssUrl: "https://www.qfma.org.qa/English/News/rss.aspx",
    assetClasses: ["Equity", "Bonds", "Funds"],
    tier: 2,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
  {
    jurisdiction: "IL",
    sourceName: "ISA (以色列证券局)",
    sourceUrl: "https://www.isa.gov.il",
    rssUrl: "https://www.isa.gov.il/en/news/rss",
    assetClasses: ["Equity", "Bonds", "Funds", "Crypto"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  // ══════════════════════════════════════════════════════════════
  // South America
  // ══════════════════════════════════════════════════════════════
  {
    jurisdiction: "BR",
    sourceName: "CVM (巴西证券委员会)",
    sourceUrl: "https://www.gov.br/cvm",
    rssUrl: "https://www.gov.br/cvm/pt-br/assuntos/noticias/rss.xml",
    assetClasses: ["Equity", "Bonds", "Funds", "Commodities"],
    tier: 2,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
  {
    jurisdiction: "CL",
    sourceName: "CMF (智利金融市场委员会)",
    sourceUrl: "https://www.cmfchile.cl",
    rssUrl: "https://www.cmfchile.cl/sitio/rss/noticias.xml",
    assetClasses: ["Equity", "Bonds", "Funds", "Commodities"],
    tier: 2,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
  // ══════════════════════════════════════════════════════════════
  // Africa
  // ══════════════════════════════════════════════════════════════
  {
    jurisdiction: "ZA",
    sourceName: "FSCA (南非金融行业监管局)",
    sourceUrl: "https://www.fsca.co.za",
    rssUrl: "https://www.fsca.co.za/News/Pages/rss.aspx",
    assetClasses: ["Equity", "Bonds", "Funds", "Commodities"],
    tier: 2,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
];
|
||
|
||
// ─── HTTP 请求工具 ────────────────────────────────────────────────
|
||
|
||
/**
 * Fetches a URL over HTTP(S) with GET and resolves with the body decoded as UTF-8.
 *
 * Redirects (301/302/303/307/308) are followed. The `Location` header is
 * resolved against the current URL, so relative locations and non-default
 * ports work, and at most `maxRedirects` hops are followed so a redirect
 * loop cannot recurse forever.
 *
 * @param url          Absolute http: or https: URL.
 * @param timeoutMs    Value passed as the request's `timeout` option (milliseconds).
 * @param maxRedirects Maximum number of redirect hops before rejecting.
 * @returns The response body as a string.
 * @throws Rejects on an invalid URL or unsupported protocol, HTTP status >= 400,
 *         timeout, too many redirects, or a network error.
 */
async function fetchUrl(url: string, timeoutMs = 15000, maxRedirects = 5): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    // A malformed URL throws here; the Promise executor turns that into a rejection.
    const parsedUrl = new URL(url);
    if (parsedUrl.protocol !== "https:" && parsedUrl.protocol !== "http:") {
      reject(new Error(`Unsupported protocol "${parsedUrl.protocol}": ${url}`));
      return;
    }

    const isHttps = parsedUrl.protocol === "https:";
    const protocol = isHttps ? https : http;

    const options = {
      hostname: parsedUrl.hostname,
      port: parsedUrl.port || (isHttps ? 443 : 80),
      path: parsedUrl.pathname + parsedUrl.search,
      method: "GET",
      headers: {
        "User-Agent": "NAC-Regulatory-Crawler/1.0 (NAC Public Chain Compliance; https://newassetchain.io)",
        "Accept": "application/rss+xml, application/atom+xml, application/xml, text/xml, text/html, application/json",
        "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8",
        "Cache-Control": "no-cache",
      },
      timeout: timeoutMs,
    };

    const req = protocol.request(options, (res) => {
      const status = res.statusCode ?? 0;
      const location = res.headers.location;

      // Follow redirects.
      if ([301, 302, 303, 307, 308].includes(status) && location) {
        // Discard the redirect body so the socket is released.
        res.resume();
        if (maxRedirects <= 0) {
          reject(new Error(`Too many redirects: ${url}`));
          return;
        }
        let redirectUrl: string;
        try {
          // Handles absolute, host-relative and path-relative locations, and keeps the port.
          redirectUrl = new URL(location, parsedUrl).toString();
        } catch {
          reject(new Error(`Invalid redirect location "${location}": ${url}`));
          return;
        }
        fetchUrl(redirectUrl, timeoutMs, maxRedirects - 1).then(resolve, reject);
        return;
      }

      if (status >= 400) {
        res.resume();
        reject(new Error(`HTTP ${status}: ${url}`));
        return;
      }

      const chunks: Buffer[] = [];
      res.on("data", (chunk: Buffer) => chunks.push(chunk));
      res.on("end", () => resolve(Buffer.concat(chunks).toString("utf-8")));
      res.on("error", reject);
    });

    // Destroying with an error makes the "error" handler below reject with it.
    req.on("timeout", () => {
      req.destroy(new Error(`Timeout fetching: ${url}`));
    });

    req.on("error", reject);
    req.end();
  });
}
|
||
|
||
// ─── RSS/Atom 解析器 ──────────────────────────────────────────────
|
||
|
||
/** One normalized entry of an RSS 2.0 or Atom feed. */
interface RSSItem {
  /** Entry title. */
  title: string;
  /** Entry URL. */
  link: string;
  /** Entry summary / description text. */
  description: string;
  /** Publication or update date exactly as it appeared in the feed (may be empty). */
  pubDate: string;
  /** Category of the entry, when the feed supplies one. */
  category?: string;
}
|
||
|
||
/**
 * Parses an RSS 2.0 or Atom document into normalized feed items.
 *
 * The document is treated as Atom when its `<feed>` element declares the
 * Atom default namespace (single or double quotes, optional spaces around
 * `=`); otherwise `<item>` elements are parsed as RSS 2.0. Entries without
 * both a title and a link are skipped. Descriptions are stripped of markup
 * and truncated to 500 characters.
 *
 * @param xmlContent Raw feed text.
 * @returns At most 50 items, in document order.
 */
function parseRSSFeed(xmlContent: string): RSSItem[] {
  const MAX_ITEMS = 50;
  const items: RSSItem[] = [];

  const isAtom = /<feed\b[^>]*\bxmlns\s*=\s*(["'])http:\/\/www\.w3\.org\/2005\/Atom\1/i.test(xmlContent);

  if (isAtom) {
    // Atom format
    const entryRegex = /<entry(?:\s[^>]*)?>([\s\S]*?)<\/entry>/gi;
    let entryMatch: RegExpExecArray | null;
    while (items.length < MAX_ITEMS && (entryMatch = entryRegex.exec(xmlContent)) !== null) {
      const entry = entryMatch[1] ?? "";
      const title = extractXmlTag(entry, "title");
      const link = extractAtomLink(entry);
      if (!title || !link) continue;

      const summary = extractXmlTag(entry, "summary") || extractXmlTag(entry, "content");
      const updated = extractXmlTag(entry, "updated") || extractXmlTag(entry, "published");
      items.push({
        title: cleanHtml(title),
        link,
        description: cleanHtml(summary).slice(0, 500),
        pubDate: updated,
      });
    }
  } else {
    // RSS 2.0 format
    const itemRegex = /<item(?:\s[^>]*)?>([\s\S]*?)<\/item>/gi;
    let itemMatch: RegExpExecArray | null;
    while (items.length < MAX_ITEMS && (itemMatch = itemRegex.exec(xmlContent)) !== null) {
      const item = itemMatch[1] ?? "";
      const title = extractXmlTag(item, "title");
      const link = extractXmlTag(item, "link") || extractXmlTag(item, "guid");
      if (!title || !link) continue;

      const description = extractXmlTag(item, "description");
      const pubDate = extractXmlTag(item, "pubDate") || extractXmlTag(item, "dc:date");
      const category = extractXmlTag(item, "category");
      items.push({
        title: cleanHtml(title),
        link: link.trim(),
        description: cleanHtml(description).slice(0, 500),
        pubDate,
        category: category || undefined,
      });
    }
  }

  return items;
}
|
||
|
||
/**
 * Returns the trimmed text content of the first `<tag>…</tag>` element in `xml`.
 *
 * A CDATA-wrapped body is unwrapped, including when whitespace surrounds the
 * CDATA section. The tag name is matched exactly (so `title` does not match
 * `<titles>`) and is escaped before being placed in the pattern.
 *
 * @param xml XML fragment to search.
 * @param tag Element name, optionally with a namespace prefix (e.g. "dc:date").
 * @returns The element text, or "" when the element is absent.
 */
function extractXmlTag(xml: string, tag: string): string {
  const name = tag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // Opening tag: the name must be followed by ">" or by whitespace plus attributes.
  const open = `<${name}(?:\\s[^>]*)?>`;
  const close = `<\\/${name}\\s*>`;

  const cdataMatch = xml.match(new RegExp(`${open}\\s*<!\\[CDATA\\[([\\s\\S]*?)\\]\\]>\\s*${close}`, "i"));
  const match = cdataMatch ?? xml.match(new RegExp(`${open}([\\s\\S]*?)${close}`, "i"));
  return match?.[1]?.trim() ?? "";
}
|
||
|
||
/**
 * Extracts the URL of an Atom entry.
 *
 * Among `<link>` elements carrying an `href`, the first one whose `rel` is
 * absent or "alternate" wins; if none qualifies, the first `href` found is
 * used (for example when only a `rel="self"` link exists). When no `<link>`
 * has an `href`, the text content of a `<link>…</link>` element is used.
 *
 * @param xml Entry fragment to search.
 * @returns The link URL, or "" when none is found.
 */
function extractAtomLink(xml: string): string {
  const linkTags = xml.match(/<link\b[^>]*>/gi) ?? [];
  let firstHref = "";

  for (const tagText of linkTags) {
    const href = /\shref\s*=\s*["']([^"']+)["']/i.exec(tagText)?.[1]?.trim();
    if (!href) continue;

    const rel = /\srel\s*=\s*["']([^"']*)["']/i.exec(tagText)?.[1]?.trim().toLowerCase();
    if (rel === undefined || rel === "alternate") return href;
    if (!firstHref) firstHref = href;
  }
  if (firstHref) return firstHref;

  const textLink = xml.match(/<link\b[^>]*>([^<]+)<\/link>/i);
  return textLink?.[1]?.trim() ?? "";
}
|
||
|
||
/**
 * Converts an HTML/XML snippet to plain text.
 *
 * CDATA wrappers are unwrapped, tags are replaced by spaces, the common
 * character entities (amp, lt, gt, quot, #39, apos, nbsp) are decoded in a
 * single pass so already-escaped text is not decoded twice, and runs of
 * whitespace are collapsed.
 *
 * @param html Markup or text to clean.
 * @returns Trimmed plain text.
 */
function cleanHtml(html: string): string {
  const entityText: Record<string, string> = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    "#39": "'",
    apos: "'",
    nbsp: " ",
  };

  return html
    .replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, "$1")
    .replace(/<[^>]+>/g, " ")
    .replace(/&(amp|lt|gt|quot|#39|apos|nbsp);/g, (entity: string, name: string) => entityText[name] ?? entity)
    .replace(/\s+/g, " ")
    .trim();
}
|
||
|
||
// ─── 规则提取器 ───────────────────────────────────────────────────
|
||
|
||
/**
 * Builds a structured rule from a feed item using keyword matching on the
 * lower-cased title and description.
 *
 * Steps: (1) drop items containing none of the relevance keywords,
 * (2) pick the first asset class whose keyword list matches (default
 * "General"), (3) pick the rule type from the first matching pattern
 * (default "compliance_general"), (4) assemble the rule with a freshly
 * generated `ruleId`.
 *
 * Matching is plain substring matching, so short keywords also hit longer
 * words (e.g. "land" inside "Thailand").
 *
 * @param item   Normalized feed item.
 * @param source Source the item came from; supplies jurisdiction, name and tier.
 * @returns The extracted rule, or `null` when the item is not relevant.
 */
function extractRuleFromRSSItem(
  item: RSSItem,
  source: CrawlerSource
): Partial<CrawledRule> | null {
  const text = `${item.title} ${item.description}`.toLowerCase();

  // Is the item related to RWA / regulatory rules at all?
  const relevantKeywords = [
    "regulation", "rule", "guidance", "circular", "notice", "directive",
    "compliance", "requirement", "framework", "standard", "policy",
    "tokeniz", "digital asset", "crypto", "blockchain", "rwa", "real world asset",
    "securities", "license", "registration", "approval", "permit",
    "监管", "规则", "指引", "通知", "合规", "要求", "框架", "标准",
    "代币化", "数字资产", "加密", "区块链", "证券", "许可", "注册",
  ];
  if (!relevantKeywords.some((kw) => text.includes(kw))) return null;

  // Asset class: the first class (in declaration order) with a matching keyword wins.
  const assetKeywords: Record<string, string[]> = {
    "RealEstate": ["real estate", "property", "reits", "mortgage", "land", "房地产", "不动产", "房产", "土地"],
    "Equity": ["equity", "stock", "share", "ipo", "listing", "股权", "股票", "股份", "上市"],
    "Bonds": ["bond", "debt", "fixed income", "treasury", "debenture", "债券", "债务", "国债", "票据"],
    "Commodities": ["commodity", "gold", "silver", "oil", "gas", "wheat", "大宗商品", "黄金", "白银", "石油"],
    "Funds": ["fund", "etf", "mutual fund", "hedge fund", "基金", "投资基金"],
    "Crypto": ["crypto", "bitcoin", "ethereum", "token", "digital asset", "加密货币", "代币", "数字资产"],
    "CarbonCredits": ["carbon", "emission", "esg", "green", "碳", "排放", "绿色"],
    "IP": ["intellectual property", "patent", "copyright", "trademark", "知识产权", "专利", "版权", "商标"],
    "Infrastructure": ["infrastructure", "highway", "railway", "airport", "基础设施", "高速公路", "铁路", "机场"],
  };
  const matchedClass = Object.entries(assetKeywords).find(([, keywords]) =>
    keywords.some((kw) => text.includes(kw))
  );
  const assetClass = matchedClass ? matchedClass[0] : "General";

  // Rule type: patterns are tried in this order; the first hit wins.
  let ruleType: CrawledRule["ruleType"] = "compliance_general";
  if (/ownership|title|deed|register|登记|所有权|产权|确权/.test(text)) {
    ruleType = "ownership_verification";
  } else if (/trading|settlement|transaction|exchange|交易|结算|买卖/.test(text)) {
    ruleType = "trading_rules";
  } else if (/tax|duty|stamp|withholding|税|关税|印花税|预扣税/.test(text)) {
    ruleType = "tax_rules";
  } else if (/kyc|aml|anti.money|fatf|反洗钱|客户尽职/.test(text)) {
    ruleType = "aml_kyc";
  }

  // Rule ID: jurisdiction, first four letters of the asset class, timestamp and a random suffix.
  const ruleId = `${source.jurisdiction}-${assetClass.toUpperCase().slice(0, 4)}-CRAWL-${Date.now()}-${Math.random().toString(36).slice(2, 6)}`;

  // Feeds use assorted date formats; fall back to "now" instead of storing an Invalid Date.
  const crawledAt = new Date();
  const published = item.pubDate ? new Date(item.pubDate) : new Date(crawledAt.getTime());
  const lastUpdated = Number.isNaN(published.getTime()) ? new Date(crawledAt.getTime()) : published;

  return {
    ruleId,
    jurisdiction: source.jurisdiction,
    assetClass,
    ruleType,
    ruleName: item.title.slice(0, 100),
    content: `${item.title}\n\n${item.description}`,
    legalBasis: `${source.sourceName} - ${item.pubDate || "最新发布"}`,
    sourceUrl: item.link,
    sourceName: source.sourceName,
    crawledAt,
    lastUpdated,
    tier: source.tier,
    tags: [source.jurisdiction, assetClass, ruleType, "auto-crawled"],
    complianceLevel: "informational",
  };
}
|
||
|
||
// ─── 爬虫主逻辑 ───────────────────────────────────────────────────
|
||
|
||
// MongoDB connection settings.
// SECURITY NOTE(review): the fallback URI below embeds a root username and
// password in source control. The value is left unchanged so existing
// deployments keep working, but the credential should be rotated and the
// fallback removed in favour of requiring NAC_MONGO_URL.
const MONGO_URL = process.env.NAC_MONGO_URL || "mongodb://root:idP0ZaRGyLsTUA3a@localhost:27017/nac_knowledge_engine?authSource=admin";
// Database that holds the crawled rules.
const DB_NAME = "nac_knowledge_engine";
// Collection that holds the crawled rules.
const COLLECTION_NAME = "compliance_rules";
|
||
|
||
/** Returns a printable message for any thrown value. */
function describeCrawlError(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

/**
 * Converts a JSON search response shaped like `{ hits: { hits: [{ _source }] } }`
 * into feed items. Returns an empty list when the payload has no such array.
 */
function parseApiSearchHits(content: string): RSSItem[] {
  const data: unknown = JSON.parse(content);
  const hits = (data as { hits?: { hits?: unknown } } | null)?.hits?.hits;
  if (!Array.isArray(hits)) return [];

  return hits.map((hit: unknown): RSSItem => {
    const src = (hit as { _source?: Record<string, unknown> } | null)?._source || {};
    return {
      title: String(src.display_names || src.entity_name || src.file_date || ""),
      link: `https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=${src.entity_id || ""}`,
      description: String(src.period_of_report || src.file_date || ""),
      pubDate: String(src.file_date || ""),
    };
  });
}

/**
 * Crawls one source and stores the extracted rules in MongoDB.
 *
 * Fetching: "rss" sources are read from `rssUrl`; if that fails, the
 * source home page is tried after waiting `rateLimit` ms. "api" sources are
 * read from `apiUrl`. Any other strategy (or a missing URL) is reported in
 * `errors` because no fetcher exists for it.
 *
 * Storing: rules are keyed by `sourceUrl`. An existing rule is refreshed
 * but keeps its original `ruleId`, since the extractor generates a new
 * random id on every run. A new rule is inserted with `createdAt`.
 *
 * `rateLimit` is applied between HTTP requests only, not between database
 * writes.
 *
 * @param source Source definition to crawl.
 * @returns Counters and error messages for this source. Failures are
 *          collected in `errors` rather than thrown, except a failure while
 *          closing the client.
 */
async function crawlSource(source: CrawlerSource): Promise<CrawlerResult> {
  const result: CrawlerResult = {
    jurisdiction: source.jurisdiction,
    sourceName: source.sourceName,
    rulesFound: 0,
    rulesInserted: 0,
    rulesUpdated: 0,
    errors: [],
    crawledAt: new Date(),
  };

  const client = new MongoClient(MONGO_URL);

  try {
    await client.connect();
    const collection = client.db(DB_NAME).collection(COLLECTION_NAME);

    let items: RSSItem[] = [];

    // Pick the fetch method from the configured strategy.
    if (source.parseStrategy === "rss" && source.rssUrl) {
      try {
        items = parseRSSFeed(await fetchUrl(source.rssUrl));
        console.log(`[Crawler] ${source.sourceName}: 获取到 ${items.length} 条 RSS 条目`);
      } catch (e) {
        result.errors.push(`RSS 获取失败: ${describeCrawlError(e)}`);
        // Fall back to the home page, spacing the second request by the rate limit.
        try {
          if (source.rateLimit) {
            await new Promise<void>((resolve) => setTimeout(resolve, source.rateLimit));
          }
          items = parseRSSFeed(await fetchUrl(source.sourceUrl));
        } catch (fallbackError) {
          result.errors.push(`主页降级获取失败: ${describeCrawlError(fallbackError)}`);
        }
      }
    } else if (source.parseStrategy === "api" && source.apiUrl) {
      try {
        items = parseApiSearchHits(await fetchUrl(source.apiUrl));
      } catch (e) {
        result.errors.push(`API 获取失败: ${describeCrawlError(e)}`);
      }
    } else {
      result.errors.push(`不支持的解析策略或缺少对应 URL: ${source.parseStrategy}`);
    }

    result.rulesFound = items.length;

    // Store every relevant item.
    for (const item of items) {
      const rule = extractRuleFromRSSItem(item, source);
      if (!rule) continue;

      try {
        // De-duplicate on sourceUrl.
        const existing = await collection.findOne({ sourceUrl: rule.sourceUrl });

        if (existing) {
          // Keep the stored ruleId stable: drop the freshly generated one from the update.
          const { ruleId: _regeneratedId, ...refreshed } = rule;
          await collection.updateOne(
            { sourceUrl: rule.sourceUrl },
            { $set: { ...refreshed, lastUpdated: new Date() } }
          );
          result.rulesUpdated++;
        } else {
          await collection.insertOne({ ...rule, createdAt: new Date() });
          result.rulesInserted++;
        }
      } catch (e) {
        result.errors.push(`规则写入失败: ${describeCrawlError(e)}`);
      }
    }
  } catch (e) {
    result.errors.push(`连接失败: ${describeCrawlError(e)}`);
  } finally {
    await client.close();
  }

  return result;
}
|
||
|
||
/**
 * Runs the crawler over the configured sources, one after another.
 *
 * @param options.jurisdictions When non-empty, only sources in these jurisdictions are crawled.
 * @param options.tier          When set, only sources with `tier <= options.tier` are crawled.
 * @param options.dryRun        When true, no source is fetched; each selected source yields a
 *                              placeholder result whose `errors` contains a DRY_RUN marker.
 * @returns One result per selected source, in catalogue order.
 */
export async function runFullCrawl(options?: {
  jurisdictions?: string[];
  tier?: number;
  dryRun?: boolean;
}): Promise<CrawlerResult[]> {
  const results: CrawlerResult[] = [];

  // Narrow the options once so the filters need no non-null assertions.
  const jurisdictions = options?.jurisdictions;
  const maxTier = options?.tier;

  let sources = REGULATORY_SOURCES;
  if (jurisdictions && jurisdictions.length > 0) {
    sources = sources.filter((s) => jurisdictions.includes(s.jurisdiction));
  }
  if (maxTier !== undefined) {
    sources = sources.filter((s) => s.tier <= maxTier);
  }

  console.log(`[Crawler] 开始爬取 ${sources.length} 个数据源...`);

  for (const source of sources) {
    console.log(`[Crawler] 正在爬取: ${source.sourceName} (${source.jurisdiction})`);

    if (options?.dryRun) {
      results.push({
        jurisdiction: source.jurisdiction,
        sourceName: source.sourceName,
        rulesFound: 0,
        rulesInserted: 0,
        rulesUpdated: 0,
        errors: ["DRY_RUN: 跳过实际爬取"],
        crawledAt: new Date(),
      });
      continue;
    }

    const result = await crawlSource(source);
    results.push(result);

    console.log(`[Crawler] ${source.sourceName}: 找到 ${result.rulesFound} 条, 新增 ${result.rulesInserted} 条, 更新 ${result.rulesUpdated} 条`);
    if (result.errors.length > 0) {
      console.warn(`[Crawler] ${source.sourceName} 错误: ${result.errors.join("; ")}`);
    }

    // Pause between sources.
    await new Promise((resolve) => setTimeout(resolve, 500));
  }

  return results;
}
|
||
|
||
/**
 * Quick mode: crawls only the sources selected by `runFullCrawl({ tier: 1 })`.
 *
 * @returns One result per crawled source.
 */
export async function runTier1Crawl(): Promise<CrawlerResult[]> {
  return runFullCrawl({ tier: 1 });
}
|
||
|
||
/**
 * Lists the configured sources without crawling anything.
 *
 * @param tier When given, only sources with `tier <= tier` are returned (as a new array).
 * @returns The matching sources. Without `tier`, this is the shared
 *          `REGULATORY_SOURCES` array itself, not a copy.
 */
export function getCrawlerSources(tier?: number): CrawlerSource[] {
  if (tier === undefined) {
    return REGULATORY_SOURCES;
  }
  return REGULATORY_SOURCES.filter((source) => source.tier <= tier);
}
|
||
|
||
/**
 * Summarizes the source catalogue.
 *
 * @returns Total number of sources, the number of sources whose tier is
 *          exactly 1 and exactly 2, and the number of distinct jurisdictions
 *          and asset-class labels across all sources.
 */
export function getCrawlerStats(): {
  totalSources: number;
  tier1Sources: number;
  tier2Sources: number;
  jurisdictionCount: number;
  assetClassCount: number;
} {
  const countTier = (tier: number): number =>
    REGULATORY_SOURCES.filter((source) => source.tier === tier).length;

  const jurisdictions = new Set(REGULATORY_SOURCES.map((source) => source.jurisdiction));
  const assetClasses = new Set(REGULATORY_SOURCES.flatMap((source) => source.assetClasses));

  return {
    totalSources: REGULATORY_SOURCES.length,
    tier1Sources: countTier(1),
    tier2Sources: countTier(2),
    jurisdictionCount: jurisdictions.size,
    assetClassCount: assetClasses.size,
  };
}
|