/**
 * NAC Public Chain - Regulatory Rules Auto-Crawler
 *
 * Intended coverage:
 * - Tier 1 jurisdictions (20): US/CA/EU/GB/CH/DE/FR/NL/IE/LU/JP/KR/SG/HK/AU/AE/IL
 * - Tier 2 jurisdictions (25): CN/TW/MY/TH/IN/IT/ES/TR/SA/QA/KW/BH/BR/CL/AR/ZA etc.
 * - Tier 3 jurisdictions (15): ID/PH/VN/PK/BD/OM/CO/PE/VE/UY/PY/NG/EG/KE/MA/RU/KZ/UA
 *
 * Asset classes: 20 major categories, 100+ sub-categories (GNACS standard)
 *
 * Crawl strategy:
 * 1. Official RSS/Atom feeds (real-time updates)
 * 2. Official API endpoints (SEC EDGAR, ESMA FIRDS, ...)
 * 3. Official website HTML parsing (when no API is available)
 * 4. Parse and extract -> structured rules -> write to MongoDB
 *
 * NOTE(review): REGULATORY_SOURCES below only lists Tier 1 and Tier 2 sources, and
 * only the "rss" and "api" parse strategies have an implementation in this file.
 */

import https from "https";
import http from "http";
import { URL } from "url";
import { MongoClient } from "mongodb";

// ─── Type definitions ─────────────────────────────────────────────

/** A structured compliance rule produced by the crawler and stored in MongoDB. */
export interface CrawledRule {
  ruleId: string;
  jurisdiction: string;
  assetClass: string;
  ruleType: "ownership_verification" | "trading_rules" | "compliance_general" | "tax_rules" | "aml_kyc";
  ruleName: string;
  content: string;
  legalBasis: string;
  ownershipRequirements?: {
    proofDocuments?: string[];
    registrationAuthority?: string;
    transferMechanism?: string;
    chainRecognition?: string;
    foreignOwnershipRestriction?: string;
    disputeResolution?: string;
  };
  tradingRequirements?: {
    minimumInvestor?: string;
    settlementPeriod?: string;
    allowedCurrencies?: string[];
    tradingPlatform?: string;
    reportingRequirements?: string;
  };
  sourceUrl: string;
  sourceName: string;
  crawledAt: Date;
  lastUpdated: Date;
  tier: number;
  tags: string[];
  complianceLevel: "mandatory" | "recommended" | "informational";
}

/** One official data source the crawler knows how to poll. */
export interface CrawlerSource {
  jurisdiction: string;
  sourceName: string;
  sourceUrl: string;
  rssUrl?: string;
  apiUrl?: string;
  assetClasses: string[];
  tier: number;
  parseStrategy: "rss" | "api" | "html" | "json";
  rateLimit?: number; // ms between requests
}

/** Outcome of crawling a single source. */
export interface CrawlerResult {
  jurisdiction: string;
  sourceName: string;
  /** Number of feed/API items fetched, counted before relevance filtering. */
  rulesFound: number;
  rulesInserted: number;
  rulesUpdated: number;
  errors: string[];
  crawledAt: Date;
}

// ─── Official data source list ────────────────────────────────────

/** Registry of regulator feeds polled by the crawler. */
export const REGULATORY_SOURCES: CrawlerSource[] = [
  // ══════════════════════════════════════════════════════════════
  // North America
  // ══════════════════════════════════════════════════════════════
  {
    jurisdiction: "US",
    sourceName: "SEC (美国证券交易委员会)",
    sourceUrl: "https://www.sec.gov",
    rssUrl: "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&type=&dateb=&owner=include&count=40&search_text=&output=atom",
    apiUrl: "https://efts.sec.gov/LATEST/search-index?q=%22RWA%22+%22tokenization%22&dateRange=custom&startdt=2023-01-01&forms=S-1,8-K",
    assetClasses: ["Equity", "Bonds", "RealEstate", "Commodities", "Funds"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "US",
    sourceName: "FinCEN (美国金融犯罪执法网络)",
    sourceUrl: "https://www.fincen.gov",
    rssUrl: "https://www.fincen.gov/news/rss.xml",
    assetClasses: ["ALL"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "CA",
    sourceName: "CSA (加拿大证券管理局)",
    sourceUrl: "https://www.securities-administrators.ca",
    rssUrl: "https://www.securities-administrators.ca/news/rss",
    assetClasses: ["Equity", "Bonds", "Funds", "Crypto"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  // ══════════════════════════════════════════════════════════════
  // Europe
  // ══════════════════════════════════════════════════════════════
  {
    jurisdiction: "EU",
    sourceName: "ESMA (欧洲证券和市场管理局)",
    sourceUrl: "https://www.esma.europa.eu",
    rssUrl: "https://www.esma.europa.eu/press-news/rss-feeds",
    apiUrl: "https://registers.esma.europa.eu/publication/searchRegister?core=esma_registers_firds_ir",
    assetClasses: ["Equity", "Bonds", "Derivatives", "Funds", "Crypto"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "GB",
    sourceName: "FCA (英国金融行为监管局)",
    sourceUrl: "https://www.fca.org.uk",
    rssUrl: "https://www.fca.org.uk/news/rss.xml",
    assetClasses: ["Equity", "Bonds", "RealEstate", "Funds", "Crypto"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "CH",
    sourceName: "FINMA (瑞士金融市场监管局)",
    sourceUrl: "https://www.finma.ch",
    rssUrl: "https://www.finma.ch/en/news/rss/",
    assetClasses: ["Equity", "Bonds", "Funds", "Crypto", "RealEstate"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "DE",
    sourceName: "BaFin (德国联邦金融监管局)",
    sourceUrl: "https://www.bafin.de",
    rssUrl: "https://www.bafin.de/SiteGlobals/Functions/RSSFeed/EN/RSSNewsfeed_Veroeffentlichungen/RSSNewsfeed_Veroeffentlichungen_node.html",
    assetClasses: ["Equity", "Bonds", "Funds", "RealEstate"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "FR",
    sourceName: "AMF (法国金融市场管理局)",
    sourceUrl: "https://www.amf-france.org",
    rssUrl: "https://www.amf-france.org/en/rss/news",
    assetClasses: ["Equity", "Bonds", "Funds", "Crypto"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "NL",
    sourceName: "AFM (荷兰金融市场管理局)",
    sourceUrl: "https://www.afm.nl",
    rssUrl: "https://www.afm.nl/en/nieuws/rss",
    assetClasses: ["Equity", "Bonds", "Funds"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "LU",
    sourceName: "CSSF (卢森堡金融监管委员会)",
    sourceUrl: "https://www.cssf.lu",
    rssUrl: "https://www.cssf.lu/en/news/rss/",
    assetClasses: ["Funds", "Bonds", "Equity"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  // ══════════════════════════════════════════════════════════════
  // Asia-Pacific
  // ══════════════════════════════════════════════════════════════
  {
    jurisdiction: "HK",
    sourceName: "SFC (香港证券及期货事务监察委员会)",
    sourceUrl: "https://www.sfc.hk",
    rssUrl: "https://www.sfc.hk/en/rss/news",
    assetClasses: ["Equity", "Bonds", "RealEstate", "Funds", "Crypto"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "HK",
    sourceName: "HKEX (香港交易所)",
    sourceUrl: "https://www.hkex.com.hk",
    rssUrl: "https://www.hkex.com.hk/eng/newsconsul/hkexnews/rss/news.xml",
    assetClasses: ["Equity", "Bonds", "Derivatives", "Funds"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "SG",
    sourceName: "MAS (新加坡金融管理局)",
    sourceUrl: "https://www.mas.gov.sg",
    rssUrl: "https://www.mas.gov.sg/news/rss",
    assetClasses: ["Equity", "Bonds", "RealEstate", "Funds", "Crypto"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "JP",
    sourceName: "FSA (日本金融厅)",
    sourceUrl: "https://www.fsa.go.jp",
    rssUrl: "https://www.fsa.go.jp/en/news/rss.xml",
    assetClasses: ["Equity", "Bonds", "RealEstate", "Funds"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
  {
    jurisdiction: "KR",
    sourceName: "FSC (韩国金融委员会)",
    sourceUrl: "https://www.fsc.go.kr",
    rssUrl: "https://www.fsc.go.kr/eng/rss/news.xml",
    assetClasses: ["Equity", "Bonds", "Funds", "Crypto"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
  {
    jurisdiction: "AU",
    sourceName: "ASIC (澳大利亚证券和投资委员会)",
    sourceUrl: "https://asic.gov.au",
    rssUrl: "https://asic.gov.au/about-asic/news-centre/rss-feeds/",
    assetClasses: ["Equity", "Bonds", "RealEstate", "Commodities", "Funds"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "CN",
    sourceName: "CSRC (中国证券监督管理委员会)",
    sourceUrl: "http://www.csrc.gov.cn",
    rssUrl: "http://www.csrc.gov.cn/csrc/c100028/common_list.shtml",
    assetClasses: ["Equity", "Bonds", "Funds", "RealEstate"],
    tier: 2,
    parseStrategy: "html",
    rateLimit: 2000,
  },
  {
    jurisdiction: "CN",
    sourceName: "PBOC (中国人民银行)",
    sourceUrl: "http://www.pbc.gov.cn",
    rssUrl: "http://www.pbc.gov.cn/rss/index.xml",
    assetClasses: ["Bonds", "Forex", "Crypto"],
    tier: 2,
    parseStrategy: "rss",
    rateLimit: 2000,
  },
  {
    jurisdiction: "IN",
    sourceName: "SEBI (印度证券交易委员会)",
    sourceUrl: "https://www.sebi.gov.in",
    rssUrl: "https://www.sebi.gov.in/rss/news.xml",
    assetClasses: ["Equity", "Bonds", "Funds", "Commodities"],
    tier: 2,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
  {
    jurisdiction: "MY",
    sourceName: "SC (马来西亚证券委员会)",
    sourceUrl: "https://www.sc.com.my",
    rssUrl: "https://www.sc.com.my/api/documentcentre/rss",
    assetClasses: ["Equity", "Bonds", "Funds", "IslamicFinance"],
    tier: 2,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
  {
    jurisdiction: "TH",
    sourceName: "SEC Thailand (泰国证券交易委员会)",
    sourceUrl: "https://www.sec.or.th",
    rssUrl: "https://www.sec.or.th/EN/Pages/News/rss.aspx",
    assetClasses: ["Equity", "Bonds", "Funds", "Crypto"],
    tier: 2,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
  // ══════════════════════════════════════════════════════════════
  // Middle East
  // ══════════════════════════════════════════════════════════════
  {
    jurisdiction: "AE",
    sourceName: "DFSA (迪拜金融服务局)",
    sourceUrl: "https://www.dfsa.ae",
    rssUrl: "https://www.dfsa.ae/news/rss",
    assetClasses: ["Equity", "Bonds", "RealEstate", "Funds", "Crypto"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "AE",
    sourceName: "ADGM (阿布扎比全球市场)",
    sourceUrl: "https://www.adgm.com",
    rssUrl: "https://www.adgm.com/news/rss",
    assetClasses: ["Equity", "Bonds", "Funds", "Crypto", "RealEstate"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  {
    jurisdiction: "SA",
    sourceName: "CMA Saudi (沙特资本市场管理局)",
    sourceUrl: "https://cma.org.sa",
    rssUrl: "https://cma.org.sa/en/News/Pages/rss.aspx",
    assetClasses: ["Equity", "Bonds", "Funds", "RealEstate"],
    tier: 2,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
  {
    jurisdiction: "QA",
    sourceName: "QFMA (卡塔尔金融市场管理局)",
    sourceUrl: "https://www.qfma.org.qa",
    rssUrl: "https://www.qfma.org.qa/English/News/rss.aspx",
    assetClasses: ["Equity", "Bonds", "Funds"],
    tier: 2,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
  {
    jurisdiction: "IL",
    sourceName: "ISA (以色列证券局)",
    sourceUrl: "https://www.isa.gov.il",
    rssUrl: "https://www.isa.gov.il/en/news/rss",
    assetClasses: ["Equity", "Bonds", "Funds", "Crypto"],
    tier: 1,
    parseStrategy: "rss",
    rateLimit: 1000,
  },
  // ══════════════════════════════════════════════════════════════
  // South America
  // ══════════════════════════════════════════════════════════════
  {
    jurisdiction: "BR",
    sourceName: "CVM (巴西证券委员会)",
    sourceUrl: "https://www.gov.br/cvm",
    rssUrl: "https://www.gov.br/cvm/pt-br/assuntos/noticias/rss.xml",
    assetClasses: ["Equity", "Bonds", "Funds", "Commodities"],
    tier: 2,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
  {
    jurisdiction: "CL",
    sourceName: "CMF (智利金融市场委员会)",
    sourceUrl: "https://www.cmfchile.cl",
    rssUrl: "https://www.cmfchile.cl/sitio/rss/noticias.xml",
    assetClasses: ["Equity", "Bonds", "Funds", "Commodities"],
    tier: 2,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
  // ══════════════════════════════════════════════════════════════
  // Africa
  // ══════════════════════════════════════════════════════════════
  {
    jurisdiction: "ZA",
    sourceName: "FSCA (南非金融行业监管局)",
    sourceUrl: "https://www.fsca.co.za",
    rssUrl: "https://www.fsca.co.za/News/Pages/rss.aspx",
    assetClasses: ["Equity", "Bonds", "Funds", "Commodities"],
    tier: 2,
    parseStrategy: "rss",
    rateLimit: 1500,
  },
];

// ─── Small shared helpers ─────────────────────────────────────────

/** Extracts a printable message from whatever a `catch` clause received. */
function errorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => setTimeout(resolve, ms));
}

/** Escapes regex metacharacters so `text` can be embedded in a RegExp literally. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/** Narrows an unknown value to a plain key/value record. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null;
}

// ─── HTTP request utility ─────────────────────────────────────────

/** Upper bound on followed redirects, so a redirect loop cannot recurse forever. */
const MAX_REDIRECTS = 5;

const REDIRECT_STATUS_CODES = [301, 302, 303, 307, 308];

/**
 * Performs a GET request and resolves with the response body decoded as UTF-8.
 *
 * @param url - Absolute http(s) URL to fetch.
 * @param timeoutMs - Socket timeout in milliseconds.
 * @param redirectsLeft - Remaining redirects that may still be followed.
 * @returns The response body.
 * @throws Rejects on unsupported protocols, HTTP status >= 400, timeouts, network
 *   errors, invalid redirect targets, or when the redirect budget is exhausted.
 */
function fetchUrl(url: string, timeoutMs = 15000, redirectsLeft = MAX_REDIRECTS): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    // A malformed URL throws here; the Promise constructor turns that into a rejection.
    const parsedUrl = new URL(url);
    if (parsedUrl.protocol !== "https:" && parsedUrl.protocol !== "http:") {
      reject(new Error(`Unsupported protocol: ${url}`));
      return;
    }

    const isHttps = parsedUrl.protocol === "https:";
    const transport = isHttps ? https : http;
    const options = {
      hostname: parsedUrl.hostname,
      port: parsedUrl.port || (isHttps ? 443 : 80),
      path: parsedUrl.pathname + parsedUrl.search,
      method: "GET",
      headers: {
        "User-Agent": "NAC-Regulatory-Crawler/1.0 (NAC Public Chain Compliance; https://newassetchain.io)",
        "Accept": "application/rss+xml, application/atom+xml, application/xml, text/xml, text/html, application/json",
        "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8",
        "Cache-Control": "no-cache",
      },
      timeout: timeoutMs,
    };

    const req = transport.request(options, (res) => {
      const status = res.statusCode ?? 0;
      const location = res.headers.location;

      // Follow redirects.
      if (REDIRECT_STATUS_CODES.includes(status) && location) {
        res.resume(); // Discard the body so the socket is released.
        if (redirectsLeft <= 0) {
          reject(new Error(`Too many redirects: ${url}`));
          return;
        }
        let redirectUrl: string;
        try {
          // Resolving against the current URL handles absolute, root-relative,
          // path-relative and protocol-relative targets and keeps a custom port.
          redirectUrl = new URL(location, url).toString();
        } catch {
          reject(new Error(`Invalid redirect location "${location}" from ${url}`));
          return;
        }
        fetchUrl(redirectUrl, timeoutMs, redirectsLeft - 1).then(resolve, reject);
        return;
      }

      if (status >= 400) {
        res.resume();
        reject(new Error(`HTTP ${status}: ${url}`));
        return;
      }

      const chunks: Buffer[] = [];
      res.on("data", (chunk: Buffer) => chunks.push(chunk));
      res.on("end", () => resolve(Buffer.concat(chunks).toString("utf-8")));
      res.on("error", reject);
    });

    req.on("timeout", () => {
      req.destroy();
      reject(new Error(`Timeout fetching: ${url}`));
    });
    req.on("error", reject);
    req.end();
  });
}

// ─── RSS/Atom parser ──────────────────────────────────────────────

interface RSSItem {
  title: string;
  link: string;
  description: string;
  pubDate: string;
  category?: string;
}

/** Maximum number of feed entries returned per feed. */
const MAX_FEED_ITEMS = 50;

/** Maximum number of characters kept from an entry's description. */
const MAX_DESCRIPTION_LENGTH = 500;

/**
 * Parses an RSS 2.0 or Atom document into a flat list of items.
 *
 * Entries without both a title and a link are skipped. At most
 * `MAX_FEED_ITEMS` items are returned. Content that contains neither
 * `<entry>` nor `<item>` elements (for example an HTML page) yields `[]`.
 *
 * @param xmlContent - Raw feed document.
 */
function parseRSSFeed(xmlContent: string): RSSItem[] {
  const items: RSSItem[] = [];
  const isAtom = /<(?:feed|entry)[\s>]/i.test(xmlContent);

  if (isAtom) {
    // Atom format
    for (const match of xmlContent.matchAll(/<entry(?:\s[^>]*)?>([\s\S]*?)<\/entry>/gi)) {
      const entry = match[1] ?? "";
      const title = extractXmlTag(entry, "title");
      const link = extractAtomLink(entry);
      const summary = extractXmlTag(entry, "summary") || extractXmlTag(entry, "content");
      const updated = extractXmlTag(entry, "updated") || extractXmlTag(entry, "published");
      if (title && link) {
        items.push({
          title: cleanHtml(title),
          link,
          description: cleanHtml(summary).slice(0, MAX_DESCRIPTION_LENGTH),
          pubDate: updated,
        });
      }
    }
  } else {
    // RSS 2.0 format
    for (const match of xmlContent.matchAll(/<item(?:\s[^>]*)?>([\s\S]*?)<\/item>/gi)) {
      const item = match[1] ?? "";
      const title = extractXmlTag(item, "title");
      const link = extractXmlTag(item, "link") || extractXmlTag(item, "guid");
      const description = extractXmlTag(item, "description");
      const pubDate = extractXmlTag(item, "pubDate") || extractXmlTag(item, "dc:date");
      const category = extractXmlTag(item, "category");
      if (title && link) {
        items.push({
          title: cleanHtml(title),
          link: link.trim(),
          description: cleanHtml(description).slice(0, MAX_DESCRIPTION_LENGTH),
          pubDate,
          category: category || undefined,
        });
      }
    }
  }

  return items.slice(0, MAX_FEED_ITEMS);
}

/**
 * Returns the trimmed inner text of the first `<tag>…</tag>` element in `xml`,
 * with any CDATA wrappers removed, or `""` when the element is absent.
 *
 * @param xml - XML fragment to search.
 * @param tag - Element name (may contain a namespace prefix such as `dc:date`).
 */
function extractXmlTag(xml: string, tag: string): string {
  const name = escapeRegExp(tag);
  // `(?:\s[^>]*)?` allows attributes but prevents `<item` from matching `<items>`.
  const pattern = new RegExp(`<${name}(?:\\s[^>]*)?>([\\s\\S]*?)<\\/${name}\\s*>`, "i");
  const match = pattern.exec(xml);
  if (!match) return "";
  return (match[1] ?? "").replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, "$1").trim();
}

/**
 * Returns the target of the first `<link>` in an Atom entry: the `href`
 * attribute when present, otherwise the element's text content.
 *
 * NOTE(review): the value is returned as written in the feed (entity references
 * such as `&amp;` are not decoded) because it is used as the de-duplication key.
 */
function extractAtomLink(xml: string): string {
  const match =
    /<link\b[^>]+href=["']([^"']+)["'][^>]*\/?>/i.exec(xml) ??
    /<link\b[^>]*>([^<]+)<\/link>/i.exec(xml);
  return (match?.[1] ?? "").trim();
}

/** Matches HTML/XML comments and anything shaped like an opening or closing tag. */
const MARKUP_PATTERN = /<!--[\s\S]*?-->|<\/?[a-zA-Z][^>]*>/g;

const NAMED_ENTITIES: Record<string, string> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
  nbsp: " ",
};

/**
 * Decodes named and numeric character references in a single pass, so that
 * `&amp;lt;` becomes `&lt;` rather than `<`. Unknown references are kept.
 */
function decodeEntities(text: string): string {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (whole: string, body: string) => {
    if (body.startsWith("#")) {
      const isHex = body[1] === "x" || body[1] === "X";
      const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      const isValid = Number.isInteger(codePoint) && codePoint > 0 && codePoint <= 0x10ffff;
      return isValid ? String.fromCodePoint(codePoint) : whole;
    }
    return NAMED_ENTITIES[body.toLowerCase()] ?? whole;
  });
}

/**
 * Converts an HTML/XML fragment to single-line plain text.
 *
 * Markup is stripped both before and after entity decoding because feed
 * descriptions frequently carry entity-escaped HTML (`&lt;p&gt;…`).
 */
function cleanHtml(html: string): string {
  const withoutMarkup = html.replace(MARKUP_PATTERN, " ");
  const decoded = decodeEntities(withoutMarkup);
  return decoded.replace(MARKUP_PATTERN, " ").replace(/\s+/g, " ").trim();
}

// ─── Rule extractor ───────────────────────────────────────────────

/** An item is kept only when its text contains at least one of these (substring match). */
const RELEVANT_KEYWORDS: readonly string[] = [
  "regulation", "rule", "guidance", "circular", "notice", "directive",
  "compliance", "requirement", "framework", "standard", "policy",
  "tokeniz", "digital asset", "crypto", "blockchain", "rwa", "real world asset",
  "securities", "license", "registration", "approval", "permit",
  "监管", "规则", "指引", "通知", "合规", "要求", "框架", "标准",
  "代币化", "数字资产", "加密", "区块链", "证券", "许可", "注册",
];

/** Asset-class keywords; classes are tried in declaration order and the first hit wins. */
const ASSET_KEYWORDS: Record<string, string[]> = {
  "RealEstate": ["real estate", "property", "reits", "mortgage", "land", "房地产", "不动产", "房产", "土地"],
  "Equity": ["equity", "stock", "share", "ipo", "listing", "股权", "股票", "股份", "上市"],
  "Bonds": ["bond", "debt", "fixed income", "treasury", "debenture", "债券", "债务", "国债", "票据"],
  "Commodities": ["commodity", "gold", "silver", "oil", "gas", "wheat", "大宗商品", "黄金", "白银", "石油"],
  "Funds": ["fund", "etf", "mutual fund", "hedge fund", "基金", "投资基金"],
  "Crypto": ["crypto", "bitcoin", "ethereum", "token", "digital asset", "加密货币", "代币", "数字资产"],
  "CarbonCredits": ["carbon", "emission", "esg", "green", "碳", "排放", "绿色"],
  "IP": ["intellectual property", "patent", "copyright", "trademark", "知识产权", "专利", "版权", "商标"],
  "Infrastructure": ["infrastructure", "highway", "railway", "airport", "基础设施", "高速公路", "铁路", "机场"],
};

/**
 * Builds one regex matching any of `keywords` in lower-cased text.
 *
 * Keywords that start with an ASCII letter or digit must begin at a word
 * boundary, so "land" no longer matches "thailand" and "oil" no longer matches
 * "turmoil", while plurals and derived forms ("bonds", "tokenized") still
 * match. Other keywords (CJK) are matched as plain substrings because `\b`
 * has no meaning between CJK characters.
 */
function buildKeywordPattern(keywords: readonly string[]): RegExp {
  const alternatives = keywords.map((keyword) => {
    const escaped = escapeRegExp(keyword);
    return /^[a-z0-9]/.test(keyword) ? `\\b${escaped}` : escaped;
  });
  return new RegExp(alternatives.join("|"));
}

/** Pre-compiled asset-class matchers, in the declaration order of `ASSET_KEYWORDS`. */
const ASSET_CLASS_PATTERNS: ReadonlyArray<readonly [string, RegExp]> = Object.entries(ASSET_KEYWORDS).map(
  ([assetClass, keywords]) => [assetClass, buildKeywordPattern(keywords)] as const
);

/** Rule-type matchers, tried in order; the first hit wins. */
const RULE_TYPE_PATTERNS: ReadonlyArray<readonly [CrawledRule["ruleType"], RegExp]> = [
  ["ownership_verification", /\b(?:ownership|title|deed|register)|登记|所有权|产权|确权/],
  ["trading_rules", /\b(?:trading|settlement|transaction|exchange)|交易|结算|买卖/],
  ["tax_rules", /\b(?:tax|duty|stamp|withholding)|税|关税|印花税|预扣税/],
  ["aml_kyc", /\b(?:kyc|aml|anti.money|fatf)|反洗钱|客户尽职/],
];

/**
 * Derives a structured rule from a feed item by keyword matching on its
 * lower-cased title and description.
 *
 * @param item - Parsed feed item.
 * @param source - Source the item came from; supplies jurisdiction, tier and name.
 * @returns The extracted rule, or `null` when the item matches no relevance keyword.
 */
function extractRuleFromRSSItem(
  item: RSSItem,
  source: CrawlerSource
): Partial<CrawledRule> | null {
  const text = `${item.title} ${item.description}`.toLowerCase();

  // Keep only items that look related to RWA / regulatory rules.
  const isRelevant = RELEVANT_KEYWORDS.some((keyword) => text.includes(keyword));
  if (!isRelevant) return null;

  // Identify the asset class.
  const assetClass = ASSET_CLASS_PATTERNS.find(([, pattern]) => pattern.test(text))?.[0] ?? "General";

  // Identify the rule type.
  const ruleType: CrawledRule["ruleType"] =
    RULE_TYPE_PATTERNS.find(([, pattern]) => pattern.test(text))?.[0] ?? "compliance_general";

  // Generate the rule ID (timestamp plus a short random suffix).
  const ruleId = `${source.jurisdiction}-${assetClass.toUpperCase().slice(0, 4)}-CRAWL-${Date.now()}-${Math.random().toString(36).slice(2, 6)}`;

  const crawledAt = new Date();
  // An unparseable publication date would yield an Invalid Date; use the crawl time instead.
  const published = item.pubDate ? new Date(item.pubDate) : crawledAt;
  const lastUpdated = Number.isNaN(published.getTime()) ? crawledAt : published;

  return {
    ruleId,
    jurisdiction: source.jurisdiction,
    assetClass,
    ruleType,
    ruleName: item.title.slice(0, 100),
    content: `${item.title}\n\n${item.description}`,
    legalBasis: `${source.sourceName} - ${item.pubDate || "最新发布"}`,
    sourceUrl: item.link,
    sourceName: source.sourceName,
    crawledAt,
    lastUpdated,
    tier: source.tier,
    tags: [source.jurisdiction, assetClass, ruleType, "auto-crawled"],
    complianceLevel: "informational",
  };
}

// ─── Crawler main logic ───────────────────────────────────────────

// SECURITY(review): the fallback below embeds database root credentials in source
// control. It is kept unchanged so existing deployments keep working, but the
// password should be rotated and the fallback removed once NAC_MONGO_URL is set
// in every environment.
const MONGO_URL = process.env.NAC_MONGO_URL || "mongodb://root:idP0ZaRGyLsTUA3a@localhost:27017/nac_knowledge_engine?authSource=admin";
const DB_NAME = "nac_knowledge_engine";
const COLLECTION_NAME = "compliance_rules";

/**
 * Downloads and parses the items of one source according to its parse strategy.
 * Never throws: failures are appended to `errors` and an empty list is returned.
 *
 * @param source - Source to fetch.
 * @param errors - Error sink (mutated); normally the `errors` array of the source's result.
 */
async function fetchSourceItems(source: CrawlerSource, errors: string[]): Promise<RSSItem[]> {
  if (source.parseStrategy === "rss" && source.rssUrl) {
    try {
      const content = await fetchUrl(source.rssUrl);
      const items = parseRSSFeed(content);
      console.log(`[Crawler] ${source.sourceName}: 获取到 ${items.length} 条 RSS 条目`);
      return items;
    } catch (e) {
      errors.push(`RSS 获取失败: ${errorMessage(e)}`);
    }

    // Fall back to the home page in case it serves feed markup.
    try {
      // Space out the second request to the same source.
      if (source.rateLimit) await sleep(source.rateLimit);
      const content = await fetchUrl(source.sourceUrl);
      return parseRSSFeed(content);
    } catch (e) {
      errors.push(`主页降级获取失败: ${errorMessage(e)}`);
      return [];
    }
  }

  if (source.parseStrategy === "api" && source.apiUrl) {
    try {
      const content = await fetchUrl(source.apiUrl);
      // JSON API parsing. The link template below points at SEC EDGAR.
      const data: unknown = JSON.parse(content);
      const outerHits = isRecord(data) ? data.hits : undefined;
      const hits = isRecord(outerHits) ? outerHits.hits : undefined;
      if (!Array.isArray(hits)) return [];
      return hits.map((hit: unknown): RSSItem => {
        const rawSource = isRecord(hit) ? hit._source : undefined;
        const src = isRecord(rawSource) ? rawSource : {};
        return {
          title: String(src.display_names || src.entity_name || src.file_date || ""),
          link: `https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=${src.entity_id || ""}`,
          description: String(src.period_of_report || src.file_date || ""),
          pubDate: String(src.file_date || ""),
        };
      });
    } catch (e) {
      errors.push(`API 获取失败: ${errorMessage(e)}`);
      return [];
    }
  }

  // Only "rss" and "api" are implemented; report instead of silently returning nothing.
  errors.push(`不支持的解析策略或缺少对应 URL: ${source.parseStrategy}`);
  return [];
}

/**
 * Crawls one source and writes the extracted rules to MongoDB, keyed by
 * `sourceUrl`. Never throws: every failure is recorded in the result's `errors`.
 *
 * @param source - Source to crawl.
 * @returns Counters and errors for this source.
 */
async function crawlSource(source: CrawlerSource): Promise<CrawlerResult> {
  const result: CrawlerResult = {
    jurisdiction: source.jurisdiction,
    sourceName: source.sourceName,
    rulesFound: 0,
    rulesInserted: 0,
    rulesUpdated: 0,
    errors: [],
    crawledAt: new Date(),
  };

  const client = new MongoClient(MONGO_URL);
  try {
    await client.connect();
    const collection = client.db(DB_NAME).collection(COLLECTION_NAME);

    const items = await fetchSourceItems(source, result.errors);
    result.rulesFound = items.length;

    // Process each item. No delay is inserted between writes: `rateLimit`
    // throttles HTTP requests, and persisting a rule issues none.
    for (const item of items) {
      const rule = extractRuleFromRSSItem(item, source);
      if (!rule) continue;

      try {
        // Check whether the rule already exists (keyed by sourceUrl).
        const existing = await collection.findOne({ sourceUrl: rule.sourceUrl });
        if (existing) {
          // Update the stored rule but keep its original ruleId stable; the
          // freshly generated one is random and would change on every crawl.
          const { ruleId: _generatedId, ...updatableFields } = rule;
          await collection.updateOne(
            { sourceUrl: rule.sourceUrl },
            { $set: { ...updatableFields, lastUpdated: new Date() } }
          );
          result.rulesUpdated++;
        } else {
          // Insert a new rule.
          await collection.insertOne({ ...rule, createdAt: new Date() });
          result.rulesInserted++;
        }
      } catch (e) {
        result.errors.push(`规则写入失败: ${errorMessage(e)}`);
      }
    }
  } catch (e) {
    result.errors.push(`连接失败: ${errorMessage(e)}`);
  } finally {
    await client.close();
  }

  return result;
}

/**
 * Runs the crawler over every registered source, sequentially.
 *
 * @param options - Optional filters.
 *   `jurisdictions` keeps only sources whose jurisdiction is listed;
 *   `tier` keeps sources with `tier <= options.tier`;
 *   `dryRun` skips all network and database access and returns placeholder results.
 * @returns One result per selected source, in registry order.
 */
export async function runFullCrawl(options?: {
  jurisdictions?: string[];
  tier?: number;
  dryRun?: boolean;
}): Promise<CrawlerResult[]> {
  const results: CrawlerResult[] = [];
  const { jurisdictions, tier, dryRun } = options ?? {};

  // Apply filters.
  let sources = REGULATORY_SOURCES;
  if (jurisdictions && jurisdictions.length > 0) {
    sources = sources.filter((s) => jurisdictions.includes(s.jurisdiction));
  }
  if (tier !== undefined) {
    sources = sources.filter((s) => s.tier <= tier);
  }

  console.log(`[Crawler] 开始爬取 ${sources.length} 个数据源...`);

  for (const source of sources) {
    console.log(`[Crawler] 正在爬取: ${source.sourceName} (${source.jurisdiction})`);

    if (dryRun) {
      results.push({
        jurisdiction: source.jurisdiction,
        sourceName: source.sourceName,
        rulesFound: 0,
        rulesInserted: 0,
        rulesUpdated: 0,
        errors: ["DRY_RUN: 跳过实际爬取"],
        crawledAt: new Date(),
      });
      continue;
    }

    const result = await crawlSource(source);
    results.push(result);

    console.log(`[Crawler] ${source.sourceName}: 找到 ${result.rulesFound} 条, 新增 ${result.rulesInserted} 条, 更新 ${result.rulesUpdated} 条`);
    if (result.errors.length > 0) {
      console.warn(`[Crawler] ${source.sourceName} 错误: ${result.errors.join("; ")}`);
    }

    // Pause between sources.
    await sleep(500);
  }

  return results;
}

/**
 * Runs the crawler for Tier 1 jurisdictions only (fast mode).
 */
export async function runTier1Crawl(): Promise<CrawlerResult[]> {
  return runFullCrawl({ tier: 1 });
}

/**
 * Returns the registered sources without crawling them.
 *
 * @param tier - When given, only sources with `tier <= tier` are returned.
 */
export function getCrawlerSources(tier?: number): CrawlerSource[] {
  if (tier !== undefined) {
    return REGULATORY_SOURCES.filter((s) => s.tier <= tier);
  }
  return REGULATORY_SOURCES;
}

/**
 * Returns summary statistics about the source registry.
 */
export function getCrawlerStats(): {
  totalSources: number;
  tier1Sources: number;
  tier2Sources: number;
  jurisdictionCount: number;
  assetClassCount: number;
} {
  const tier1 = REGULATORY_SOURCES.filter((s) => s.tier === 1);
  const tier2 = REGULATORY_SOURCES.filter((s) => s.tier === 2);
  const jurisdictions = new Set(REGULATORY_SOURCES.map((s) => s.jurisdiction));
  const assetClasses = new Set(REGULATORY_SOURCES.flatMap((s) => s.assetClasses));

  return {
    totalSources: REGULATORY_SOURCES.length,
    tier1Sources: tier1.length,
    tier2Sources: tier2.length,
    jurisdictionCount: jurisdictions.size,
    assetClassCount: assetClasses.size,
  };
}