/**
 * NAC Knowledge Engine - RAG (retrieval-augmented generation) module v2.0
 *
 * Purpose: retrieve the compliance rules most relevant to a user question from
 * the MongoDB knowledge base and format them as context for the AI agent's
 * prompt, so that answers are more accurate and traceable to a source rule.
 *
 * Retrieval pipeline:
 *   1. Intent detection   - extract jurisdiction / asset class / rule type.
 *   2. Structured match   - exact filter on the detected intent.
 *   3. Full-text search   - MongoDB `$text` index (only if < 3 rules so far).
 *   4. Regex keyword match - covers full-text misses (only if < 3 rules so far).
 *   5. Semantic search    - supplements keyword hits, or stands alone when
 *                           the keyword stages found nothing.
 *   6. Random sampling    - last resort so that some context is always returned.
 *
 * Two rule document formats are supported:
 *   - legacy: { ruleId, ruleName, jurisdiction, category, content }
 *   - trade rules: { ruleId, ruleName, jurisdiction, assetClass, ruleType,
 *       content, contentEn, ownershipRequirements, tradingRequirements }
 */
import { getMongoDb, COLLECTIONS } from "./mongodb";
import { semanticSearch, buildVectorIndex, getEmbeddingStatus } from "./embeddingRetrieval";

// ─── Type definitions ─────────────────────────────────────────────

export interface RetrievedRule {
  ruleId: string;
  ruleName: string;
  jurisdiction: string;
  category: string;
  assetClass?: string;
  ruleType?: string;
  content: string;
  description?: string;
  score: number;
  source: string;
  // Trade-rule extension fields
  ownershipRequirements?: Record<string, unknown>;
  tradingRequirements?: Record<string, unknown>;
  legalBasis?: string;
  sourceUrl?: string;
  complianceLevel?: string;
  tags?: string[];
}

export interface RAGContext {
  rules: RetrievedRule[];
  totalFound: number;
  retrievalMethod: "fulltext" | "regex" | "structured" | "sample" | "semantic" | "hybrid" | "none";
  queryKeywords: string[];
  detectedJurisdiction?: string;
  detectedAssetClass?: string;
  detectedRuleType?: string;
}

// ─── Tunables ─────────────────────────────────────────────────────

/** Number of rules returned when the caller gives no (or an invalid) `maxResults`. */
const DEFAULT_MAX_RESULTS = 6;
/** Keyword stages keep running while fewer than this many rules were found. */
const MIN_KEYWORD_HITS = 3;
/** Maximum number of keywords extracted from a query. */
const MAX_KEYWORDS = 8;
/** Maximum number of keywords combined (AND) in the regex stage. */
const MAX_REGEX_KEYWORDS = 4;
/** Rule content is truncated to this many characters (trade rules are long). */
const MAX_CONTENT_LENGTH = 800;

// ─── Intent detection: jurisdiction / asset class / rule type maps ─

// Insertion order matters: detection returns the FIRST entry that matches.
const JURISDICTION_MAP: Record<string, string> = {
  // Chinese name → code
  "美国": "US", "美利坚": "US",
  "欧盟": "EU", "欧洲": "EU",
  "英国": "GB", "英格兰": "GB",
  "香港": "HK", "港": "HK",
  "新加坡": "SG", "狮城": "SG",
  "迪拜": "AE", "阿联酋": "AE", "阿布扎比": "AE",
  "中国": "CN", "大陆": "CN", "内地": "CN",
  "日本": "JP",
  "澳大利亚": "AU", "澳洲": "AU",
  "瑞士": "CH",
  "韩国": "KR",
  "加拿大": "CA",
  "德国": "DE",
  "法国": "FR",
  "开曼": "KY", "开曼群岛": "KY",
  "英属维尔京": "VG", "BVI": "VG",
  "巴哈马": "BS",
  "百慕大": "BM",
  // English codes map to themselves. "USA" is listed explicitly because codes
  // are matched as whole words, so "US" alone no longer matches inside "USA".
  "USA": "US",
  "US": "US", "EU": "EU", "GB": "GB", "HK": "HK", "SG": "SG", "AE": "AE",
  "CN": "CN", "JP": "JP", "AU": "AU", "CH": "CH", "KR": "KR", "CA": "CA",
  "DE": "DE", "FR": "FR",
};

const ASSET_CLASS_MAP: Record<string, string> = {
  // Real estate
  "房地产": "RealEstate", "不动产": "RealEstate", "房产": "RealEstate",
  "商业地产": "RealEstate", "住宅": "RealEstate", "写字楼": "RealEstate",
  "REITs": "RealEstate", "REIT": "RealEstate", "房地产投资信托": "RealEstate",
  // Financial securities
  "证券": "FinancialSecurities", "股票": "FinancialSecurities", "债券": "FinancialSecurities",
  "基金": "FinancialSecurities", "期货": "FinancialSecurities", "期权": "FinancialSecurities",
  "security": "FinancialSecurities", "securities": "FinancialSecurities",
  // Commodities
  "大宗商品": "Commodities", "黄金": "Commodities", "白银": "Commodities",
  "原油": "Commodities", "铜": "Commodities", "铁矿石": "Commodities",
  "农产品": "Commodities", "粮食": "Commodities",
  // Digital assets
  "数字资产": "DigitalAssets", "加密资产": "DigitalAssets", "虚拟资产": "DigitalAssets",
  "代币": "DigitalAssets", "Token": "DigitalAssets", "NFT": "DigitalAssets",
  "稳定币": "DigitalAssets", "XTZH": "DigitalAssets",
  // Carbon emission rights
  "碳排放": "EnvironmentalRights", "碳信用": "EnvironmentalRights",
  "碳权": "EnvironmentalRights", "排放权": "EnvironmentalRights",
  // Intellectual property
  "知识产权": "IntellectualProperty", "专利": "IntellectualProperty",
  "商标": "IntellectualProperty", "版权": "IntellectualProperty",
  // Infrastructure
  "基础设施": "Infrastructure", "电力": "Infrastructure", "水务": "Infrastructure",
  // Receivables
  "应收账款": "Receivables", "贸易融资": "Receivables",
};

const RULE_TYPE_MAP: Record<string, string> = {
  "所有权": "ownership_verification", "产权": "ownership_verification",
  "确权": "ownership_verification", "所有人": "ownership_verification",
  "交易": "trading_permission", "买卖": "trading_permission",
  "上市": "trading_permission", "流通": "trading_permission",
  "KYC": "kyc_aml", "AML": "kyc_aml", "反洗钱": "kyc_aml",
  "身份验证": "kyc_aml", "尽职调查": "kyc_aml",
  "税": "tax_compliance", "税务": "tax_compliance", "纳税": "tax_compliance",
  "跨境": "cross_border", "境外": "cross_border", "外资": "cross_border",
  "结算": "settlement", "清算": "settlement",
  "托管": "custody", "保管": "custody",
  "披露": "disclosure", "信息披露": "disclosure",
  "牌照": "licensing", "许可证": "licensing", "执照": "licensing",
};

/** Terms consisting only of ASCII letters/digits (e.g. "US", "BVI"). */
const ASCII_TERM = /^[A-Za-z0-9]+$/;

/**
 * Pre-built jurisdiction matchers, in JURISDICTION_MAP order.
 *
 * ASCII codes are matched on word boundaries (case-sensitively) instead of as
 * raw substrings; otherwise "USDT" would be detected as US, "CHINA" as CH
 * (Switzerland) and "DEFI" as DE. CJK characters are non-word characters for
 * `\b`, so a code embedded in Chinese text ("在US发行") still matches.
 * Non-ASCII terms keep plain substring matching.
 */
const JURISDICTION_MATCHERS: Array<{ code: string; matches: (query: string) => boolean }> =
  Object.entries(JURISDICTION_MAP).map(([term, code]) => {
    if (ASCII_TERM.test(term)) {
      const pattern = new RegExp(`\\b${term}\\b`);
      return { code, matches: (query: string) => pattern.test(query) };
    }
    return { code, matches: (query: string) => query.includes(term) };
  });

/** Words that carry no retrieval signal; compared against lower-cased terms. */
const STOP_WORDS = new Set([
  "的", "了", "是", "在", "我", "有", "和", "就", "不", "人", "都", "一", "一个",
  "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好",
  "自己", "这", "那", "什么", "如何", "怎么", "请问", "帮我", "告诉", "介绍",
  "关于", "对于", "针对", "需要", "可以", "应该", "必须", "规定", "要求",
  "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
  "have", "has", "had", "do", "does", "did", "will", "would", "could", "should",
  "what", "how", "when", "where", "why", "which", "who",
]);

/**
 * Detect the structured intent (jurisdiction / asset class / rule type) of a
 * user question and extract search keywords from it.
 *
 * For each dimension the first map entry (in declaration order) that occurs in
 * the query wins. Asset-class terms are matched case-insensitively; rule-type
 * terms case-sensitively; jurisdiction codes as whole words.
 *
 * @param query Raw user question.
 * @returns Detected codes (each `undefined` when nothing matched) and up to
 *   8 de-duplicated keywords: runs of 2-8 CJK characters and ASCII words of
 *   3+ letters, minus stop words.
 */
function detectQueryIntent(query: string): {
  jurisdiction?: string;
  assetClass?: string;
  ruleType?: string;
  keywords: string[];
} {
  const jurisdiction = JURISDICTION_MATCHERS.find(m => m.matches(query))?.code;

  const lowerQuery = query.toLowerCase();
  const assetClass = Object.entries(ASSET_CLASS_MAP)
    .find(([term]) => lowerQuery.includes(term.toLowerCase()))?.[1];

  const ruleType = Object.entries(RULE_TYPE_MAP)
    .find(([term]) => query.includes(term))?.[1];

  const chineseTerms = query.match(/[\u4e00-\u9fa5]{2,8}/g) || [];
  const englishTerms = query.match(/[a-zA-Z]{3,}/g) || [];
  const candidates = [...chineseTerms, ...englishTerms]
    .filter(term => !STOP_WORDS.has(term.toLowerCase()));
  const keywords = Array.from(new Set(candidates)).slice(0, MAX_KEYWORDS);

  return { jurisdiction, assetClass, ruleType, keywords };
}

// ─── Internal helpers ─────────────────────────────────────────────

/** Extract a printable message from an arbitrary thrown value. */
function errorMessage(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

/** Drop documents whose rule id is already present in `known`. */
function excludeKnownRules<T extends Record<string, unknown>>(docs: T[], known: RetrievedRule[]): T[] {
  const knownIds = new Set(known.map(rule => rule.ruleId));
  return docs.filter(doc => !knownIds.has(String(doc.ruleId || doc._id)));
}

/** Document fields searched by the regex keyword stage. */
const REGEX_SEARCH_FIELDS = [
  "ruleName", "ruleNameEn", "description", "content", "contentEn", "tags", "translations.zh",
];

// ─── Main retrieval function ──────────────────────────────────────

/**
 * Retrieve the rules relevant to `query` from the MongoDB knowledge base
 * (core RAG function).
 *
 * Each stage is wrapped in its own try/catch: a failing stage is logged with
 * `console.warn` and the pipeline continues with the next one, so this
 * function does not reject because of a query failure in a single stage.
 *
 * @param query Raw user question.
 * @param options.maxResults Maximum number of rules to return (default 6).
 *   Non-finite values fall back to the default; values below 1 are raised to 1,
 *   because a limit of 0 means "no limit" to MongoDB.
 * @param options.jurisdictions Explicit jurisdiction codes. When non-empty they
 *   take precedence over the jurisdiction detected from the query.
 * @param options.categories Legacy `category` values; only applied when no
 *   asset class was detected from the query.
 * @param options.language "zh" (default) prefers Chinese content, anything
 *   else prefers English content.
 * @returns The retrieved rules plus metadata about how they were found. When
 *   no database handle is available the result is empty with method "none".
 */
export async function retrieveRelevantRules(
  query: string,
  options: {
    maxResults?: number;
    jurisdictions?: string[];
    categories?: string[];
    language?: string;
  } = {}
): Promise<RAGContext> {
  const { maxResults = DEFAULT_MAX_RESULTS, jurisdictions, categories, language = "zh" } = options;
  // Guard against 0 / negative / NaN: cursor.limit(0) is unbounded and
  // $sample rejects a non-positive size.
  const limit = Number.isFinite(maxResults)
    ? Math.max(1, Math.floor(maxResults))
    : DEFAULT_MAX_RESULTS;

  const db = await getMongoDb();
  if (!db) {
    return { rules: [], totalFound: 0, retrievalMethod: "none", queryKeywords: [] };
  }

  // Intent detection
  const intent = detectQueryIntent(query);
  const keywords = intent.keywords;
  const collection = db.collection(COLLECTIONS.COMPLIANCE_RULES);

  // Base filter, compatible with both the legacy and the trade-rule format.
  const baseFilter: Record<string, unknown> = {};

  // An explicit, non-empty caller list wins; otherwise fall back to the
  // detected jurisdiction plus the "GLOBAL" pseudo-jurisdiction.
  const targetJurisdictions =
    jurisdictions && jurisdictions.length > 0
      ? jurisdictions
      : intent.jurisdiction
        ? [intent.jurisdiction, "GLOBAL"]
        : undefined;
  if (targetJurisdictions) {
    baseFilter.jurisdiction = { $in: targetJurisdictions };
  }

  // Asset class filter: the trade-rule format stores `assetClass`, the legacy
  // format stores `category`.
  if (intent.assetClass) {
    baseFilter.$or = [
      { assetClass: { $in: [intent.assetClass, "ALL"] } },
      { category: { $regex: intent.assetClass, $options: "i" } },
    ];
  } else if (categories && categories.length > 0) {
    baseFilter.category = { $in: categories };
  }

  // Rule type filter
  if (intent.ruleType) {
    baseFilter.ruleType = intent.ruleType;
  }

  let rules: RetrievedRule[] = [];
  let retrievalMethod: RAGContext["retrievalMethod"] = "none";

  // ── Stage 1: structured exact match (only when intent was detected) ──
  // NOTE(review): caller-supplied jurisdictions/categories alone do not
  // trigger this stage, and later stages do not apply them — confirm intended.
  if (intent.jurisdiction || intent.assetClass || intent.ruleType) {
    try {
      const structuredResults = await collection
        .find(baseFilter)
        .sort({ tier: 1, relevance: -1 })
        .limit(limit)
        .toArray();

      if (structuredResults.length > 0) {
        rules = structuredResults.map((doc, idx) => {
          // Exact matches get a high base score, decaying from 1.0 towards 0.7.
          const baseScore = Math.max(0.6, 1.0 - (idx / Math.max(1, structuredResults.length)) * 0.3);
          return formatRule(doc, language, idx, structuredResults.length, baseScore);
        });
        retrievalMethod = "structured";
      }
    } catch (e) {
      console.warn("[RAG] 结构化检索失败:", errorMessage(e));
    }
  }

  // ── Stage 2: MongoDB full-text search ──
  if (rules.length < MIN_KEYWORD_HITS && keywords.length > 0) {
    try {
      const textFilter: Record<string, unknown> = {
        $text: { $search: keywords.join(" ") },
      };
      // Deliberately no jurisdiction filter here, to widen full-text recall.
      if (intent.assetClass) {
        textFilter.$or = [
          { assetClass: { $in: [intent.assetClass, "ALL"] } },
          { category: { $regex: intent.assetClass, $options: "i" } },
        ];
      }

      const textResults = await collection
        .find(textFilter, {
          projection: {
            score: { $meta: "textScore" },
            ruleId: 1,
            ruleName: 1,
            jurisdiction: 1,
            category: 1,
            assetClass: 1,
            ruleType: 1,
            content: 1,
            contentEn: 1,
            description: 1,
            ownershipRequirements: 1,
            tradingRequirements: 1,
            legalBasis: 1,
            sourceUrl: 1,
            complianceLevel: 1,
            tags: 1,
            "translations.zh": 1,
            "translations.en": 1,
          },
        })
        .sort({ score: { $meta: "textScore" } })
        .limit(limit)
        .toArray();

      if (textResults.length > 0) {
        const newRules = excludeKnownRules(textResults, rules).map((doc, idx) => {
          // textScore may be missing or NaN; when usable, scale it by 1/10 and
          // cap at 1.0. Otherwise let formatRule derive a rank-based score.
          const textScore =
            typeof doc.score === "number" && !Number.isNaN(doc.score)
              ? Math.min(1.0, doc.score / 10)
              : undefined;
          return formatRule(doc, language, idx, textResults.length, textScore);
        });
        rules = [...rules, ...newRules].slice(0, limit);
        if (retrievalMethod === "none") retrievalMethod = "fulltext";
      }
    } catch (e) {
      console.warn("[RAG] 全文检索失败,降级到正则检索:", errorMessage(e));
    }
  }

  // ── Stage 3: regex keyword match ──
  if (rules.length < MIN_KEYWORD_HITS && keywords.length > 0) {
    try {
      // Every keyword must match at least one searchable field. Keywords only
      // contain CJK characters or ASCII letters (see detectQueryIntent), so
      // they are safe to use as regex sources without escaping.
      const regexConditions = keywords.slice(0, MAX_REGEX_KEYWORDS).map(kw => ({
        $or: REGEX_SEARCH_FIELDS.map(field => ({ [field]: { $regex: kw, $options: "i" } })),
      }));
      const regexFilter: Record<string, unknown> = { $and: regexConditions };

      const regexResults = await collection
        .find(regexFilter)
        .limit(limit)
        .toArray();

      if (regexResults.length > 0) {
        const newRules = excludeKnownRules(regexResults, rules).map((doc, idx) => {
          // Keyword matches get a medium score, decaying from 0.9 towards 0.5.
          const baseScore = Math.max(0.5, 0.9 - (idx / Math.max(1, regexResults.length)) * 0.4);
          return formatRule(doc, language, idx, regexResults.length, baseScore);
        });
        rules = [...rules, ...newRules].slice(0, limit);
        if (retrievalMethod === "none") retrievalMethod = "regex";
      }
    } catch (e) {
      console.warn("[RAG] 正则检索失败:", errorMessage(e));
    }
  }

  // ── Stage 4: semantic vector search (enhancement layer) ──
  // Always attempted. It must run BEFORE the random-sampling fallback:
  // sampling fills every slot with arbitrary rules, which would leave no room
  // for semantic results exactly when the keyword stages found nothing.
  try {
    // Warm up the vector index without blocking; deliberately best-effort —
    // a failing search is reported by the surrounding catch.
    void buildVectorIndex().catch(() => {});

    const semanticResults = await semanticSearch(query, {
      topK: limit,
      jurisdiction: intent.jurisdiction,
      assetClass: intent.assetClass,
      ruleType: intent.ruleType,
      minScore: 0.05,
    });

    if (semanticResults.length > 0) {
      const semanticRules: RetrievedRule[] = semanticResults.map(r => ({
        ruleId: r.ruleId,
        ruleName: r.ruleName,
        jurisdiction: r.jurisdiction,
        category: r.assetClass,
        assetClass: r.assetClass,
        ruleType: r.ruleType,
        content: r.content,
        score: r.score,
        source: `${r.jurisdiction}·${r.assetClass}·${r.ruleName.slice(0, 20)}`,
        legalBasis: r.legalBasis,
        ownershipRequirements: r.ownershipRequirements,
        tradingRequirements: r.tradingRequirements,
        sourceUrl: r.sourceUrl,
        complianceLevel: r.complianceLevel,
        tags: r.tags,
      }));

      if (rules.length === 0) {
        // Keyword stages found nothing: use the semantic results as-is.
        rules = semanticRules;
        retrievalMethod = "semantic";
      } else {
        // Hybrid: append semantic hits not already present, up to the limit,
        // slightly down-weighted relative to keyword hits.
        const existingIds = new Set(rules.map(rule => rule.ruleId));
        const supplements = semanticRules
          .filter(rule => !existingIds.has(rule.ruleId))
          .slice(0, Math.max(0, limit - rules.length))
          .map(rule => ({ ...rule, score: rule.score * 0.9 }));
        if (supplements.length > 0) {
          rules = [...rules, ...supplements];
          retrievalMethod = "hybrid";
        }
      }
    }
  } catch (e) {
    console.warn("[RAG] 语义检索失败(降级到关键词结果):", errorMessage(e));
  }

  // ── Stage 5: random sampling (last resort) ──
  if (rules.length === 0) {
    try {
      const sampleResults = await collection
        .aggregate([
          { $match: {} },
          { $sample: { size: limit } },
        ])
        .toArray();

      if (sampleResults.length > 0) {
        // Fixed low score: sampled rules have no demonstrated relevance.
        rules = sampleResults.map((doc, idx) => formatRule(doc, language, idx, sampleResults.length, 0.3));
        retrievalMethod = "sample";
      }
    } catch (e) {
      console.warn("[RAG] 随机采样失败:", errorMessage(e));
    }
  }

  return {
    rules,
    totalFound: rules.length,
    retrievalMethod,
    queryKeywords: keywords,
    detectedJurisdiction: intent.jurisdiction,
    detectedAssetClass: intent.assetClass,
    detectedRuleType: intent.ruleType,
  };
}

// ─── Formatting helper ────────────────────────────────────────────

/**
 * Convert a raw rule document (either format) into a `RetrievedRule`.
 *
 * @param doc Raw document as returned by the collection.
 * @param language "zh" prefers Chinese content fields, anything else English.
 * @param idx Zero-based rank of the document within its result list.
 * @param total Size of that result list; used for the rank-based score.
 * @param baseScore Explicit score. It is clamped to [0, 1]; NaN becomes 0.5.
 *   When omitted, the score decays with rank from 1.0 and is floored at 0.4.
 * @returns The normalized rule, with content truncated to 800 characters.
 */
function formatRule(
  doc: Record<string, unknown>,
  language: string,
  idx: number,
  total: number,
  baseScore?: number
): RetrievedRule {
  // Avoid dividing by zero when total is 0.
  const safeTotal = total > 0 ? total : 1;
  let score: number;
  if (baseScore === undefined) {
    score = Math.max(0.4, 1.0 - (idx / safeTotal) * 0.5);
  } else if (Number.isNaN(baseScore)) {
    score = 0.5;
  } else {
    score = Math.min(1.0, Math.max(0.0, baseScore));
  }

  // Content may live in top-level fields or under `translations`; take the
  // first non-empty candidate in the order preferred for the language.
  const translations = doc.translations as Record<string, unknown> | undefined;
  const content =
    language === "zh"
      ? String(doc.content || translations?.zh || doc.contentEn || translations?.en || "")
      : String(doc.contentEn || translations?.en || doc.content || translations?.zh || "");

  const truncatedContent =
    content.length > MAX_CONTENT_LENGTH ? content.slice(0, MAX_CONTENT_LENGTH) + "..." : content;

  const ruleId = String(doc.ruleId || doc._id || "");
  const ruleName = String(doc.ruleName || doc.ruleNameEn || "未命名规则");
  const jurisdiction = String(doc.jurisdiction || "未知");
  const category = String(doc.category || doc.assetClass || "通用");

  return {
    ruleId,
    ruleName,
    jurisdiction,
    category,
    assetClass: doc.assetClass ? String(doc.assetClass) : undefined,
    ruleType: doc.ruleType ? String(doc.ruleType) : undefined,
    content: truncatedContent,
    description: doc.description ? String(doc.description) : undefined,
    score,
    source: `${jurisdiction}·${category}·${ruleName.slice(0, 20)}`,
    ownershipRequirements: doc.ownershipRequirements as Record<string, unknown> | undefined,
    tradingRequirements: doc.tradingRequirements as Record<string, unknown> | undefined,
    legalBasis: doc.legalBasis ? String(doc.legalBasis) : undefined,
    sourceUrl: doc.sourceUrl ? String(doc.sourceUrl) : undefined,
    complianceLevel: doc.complianceLevel ? String(doc.complianceLevel) : undefined,
    tags: Array.isArray(doc.tags) ? doc.tags.map(String) : undefined,
  };
}

// ─── RAG prompt context builder ───────────────────────────────────

/** Human-readable label for each retrieval method, as shown in the prompt. */
const RETRIEVAL_METHOD_LABELS: Record<RAGContext["retrievalMethod"], string> = {
  semantic: "语义向量检索",
  hybrid: "混合检索(关键词+语义)",
  structured: "结构化精确匹配",
  fulltext: "全文关键词检索",
  regex: "正则关键词检索",
  sample: "随机采样(兜底)",
  none: "未知",
};

/**
 * Format the retrieved rules as a context section for the AI prompt.
 *
 * @param ragCtx Result of `retrieveRelevantRules`.
 * @returns A newline-joined block listing every rule with its metadata,
 *   followed by answering instructions; an empty string when there are no rules.
 */
export function buildRAGPromptContext(ragCtx: RAGContext): string {
  if (ragCtx.rules.length === 0) {
    return "";
  }

  const methodLabel = RETRIEVAL_METHOD_LABELS[ragCtx.retrievalMethod] ?? "未知";
  const lines: string[] = [
    "【知识库检索结果】",
    `(共检索到 ${ragCtx.totalFound} 条相关规则,检索方式:${methodLabel})`,
  ];

  if (ragCtx.detectedJurisdiction) {
    lines.push(`(识别到司法辖区:${ragCtx.detectedJurisdiction})`);
  }
  if (ragCtx.detectedAssetClass) {
    lines.push(`(识别到资产类别:${ragCtx.detectedAssetClass})`);
  }
  lines.push("");

  ragCtx.rules.forEach((rule, idx) => {
    lines.push(`【规则 ${idx + 1}】${rule.ruleName}`);

    // A missing or NaN score is displayed as 50%.
    const safeScore = rule.score !== undefined && !Number.isNaN(rule.score) ? rule.score : 0.5;
    lines.push(` 辖区:${rule.jurisdiction} | 类别:${rule.category} | 相关度:${Math.round(safeScore * 100)}%`);
    if (rule.ruleType) lines.push(` 规则类型:${rule.ruleType}`);
    if (rule.legalBasis) lines.push(` 法律依据:${rule.legalBasis}`);
    if (rule.complianceLevel) lines.push(` 合规级别:${rule.complianceLevel}`);
    if (rule.description) lines.push(` 摘要:${rule.description}`);
    lines.push(` 内容:${rule.content}`);

    // Ownership requirements (core fields)
    const ownership = rule.ownershipRequirements;
    if (ownership) {
      if (Array.isArray(ownership.proofDocuments)) {
        lines.push(` 所有权证明文件:${ownership.proofDocuments.join("、")}`);
      }
      if (ownership.registrationAuthority) {
        lines.push(` 登记机构:${ownership.registrationAuthority}`);
      }
      if (ownership.transferMechanism) {
        lines.push(` 转移机制:${ownership.transferMechanism}`);
      }
      if (ownership.chainRecognition) {
        lines.push(` 链上法律认可:${ownership.chainRecognition}`);
      }
      if (ownership.foreignOwnershipRestriction) {
        lines.push(` 外资限制:${ownership.foreignOwnershipRestriction}`);
      }
    }

    // Trading requirements
    const trading = rule.tradingRequirements;
    if (trading) {
      if (trading.minimumInvestor) lines.push(` 最低投资者资质:${trading.minimumInvestor}`);
      if (trading.settlementPeriod) lines.push(` 结算周期:${trading.settlementPeriod}`);
      if (Array.isArray(trading.allowedCurrencies)) {
        lines.push(` 允许结算货币:${trading.allowedCurrencies.join("、")}`);
      }
    }

    if (rule.sourceUrl) lines.push(` 来源:${rule.sourceUrl}`);
    lines.push("");
  });

  lines.push("请严格基于以上知识库规则回答用户问题,并在回答中注明引用的规则编号和来源。");
  lines.push("如果知识库中没有完全匹配的规则,请基于已有规则进行合理推断,并说明推断依据。");

  return lines.join("\n");
}

// Alias export (kept for callers of the previous engine API)
export const retrieveComplianceRules = retrieveRelevantRules;