/**
 * NAC Knowledge Engine - RAG retrieval-augmentation module v3.0
 * (nac-admin/server/ragRetrieval.ts)
 *
 * Purpose: retrieve from the MongoDB knowledge base the compliance-rule
 * articles most relevant to a user question, so they can be injected as
 * context into the AI agent prompt (better accuracy and traceability).
 *
 * Retrieval pipeline (each stage only runs when earlier ones found too little):
 *   0. Intent detection  - extract jurisdiction / asset class / rule type.
 *   1. Structured match  - exact filter on the detected intent fields.
 *   2. MongoDB full-text - `$text` search on the extracted keywords.
 *   3. Regex keywords    - covers cases the text index misses.
 *   4. Random sample     - last resort so the prompt always has some context.
 *
 * v3.0 (ticket NAC-AI-2026-006): jieba Chinese word segmentation
 *   - jieba is invoked through a `python3` child process.
 *   - Results are kept in a 500-entry LRU cache.
 *   - The child process is limited to 500 ms; on any failure the tokenizer
 *     falls back to a plain regex.
 *   - A custom dictionary of NAC domain terms is registered with jieba.
 *
 * Two rule-document shapes are supported:
 *   - legacy: { ruleId, ruleName, jurisdiction, category, content }
 *   - trade rules: { ruleId, ruleName, jurisdiction, assetClass, ruleType,
 *     content, contentEn, ownershipRequirements, tradingRequirements }
 */

import { getMongoDb, COLLECTIONS } from "./mongodb";
import { execFileSync } from "child_process";

// ─── jieba Chinese tokenizer (NAC-AI-2026-006) ─────────────────────

/** Maximum number of tokenization results kept in the cache. */
const JIEBA_CACHE_LIMIT = 500;
/** Hard limit for one python3 invocation, in milliseconds. */
const JIEBA_TIMEOUT_MS = 500;
/** Only this many leading characters of a query are sent to jieba. */
const JIEBA_MAX_INPUT_CHARS = 400;

/** LRU cache: normalized query text -> jieba keywords (Map keeps insertion order). */
const _jiebaCache = new Map<string, string[]>();

/** NAC domain terms registered with jieba so they are not split apart. */
const _NAC_WORDS = [
  "CBPP", "CNNL", "CSNP", "NRPC", "NVM", "Charter", "GNACS", "XTZH", "XIC",
  "ACC-20", "ACC-721", "ACC-20C", "RWA", "NAC公链", "宪法收据", "宪政区块",
  "合规验证", "七层合规", "流体区块", "主权地址", "当铺协议", "量子浏览器",
  "宪法执行引擎", "CEE", "CBP", "DID", "KYC", "AML", "CFT",
  "稳定币", "治理代币", "资产上链", "跨链合规", "司法辖区",
  "香港SFC", "新加坡MAS", "欧盟MiCA", "美国SEC", "阿联酋DFSA",
  "不动产", "大宗商品", "金融资产", "艺术品", "知识产权",
  "SDR锚定", "黄金储备", "储备证明", "熔断机制",
];

/**
 * Python program executed for tokenization. The query text and the custom
 * dictionary arrive as argv[1] / argv[2]; nothing user-controlled is ever
 * spliced into the program text or into a shell command line.
 */
const JIEBA_SCRIPT = [
  "import sys, json, jieba, jieba.analyse",
  "for w in json.loads(sys.argv[2]):",
  "    jieba.add_word(w, freq=1000)",
  "kws = jieba.analyse.extract_tags(sys.argv[1], topK=12, withWeight=False)",
  "print(json.dumps([k for k in kws if len(k) >= 2], ensure_ascii=False))",
].join("\n");

/**
 * Extracts up to 12 keywords (length >= 2) from `text` using jieba.
 *
 * The call is synchronous and blocks for at most JIEBA_TIMEOUT_MS. If python3
 * or jieba is unavailable, the call times out, or the output is malformed,
 * the function falls back to runs of 2-8 CJK characters matched by regex.
 * Only successful jieba results are cached, so a transient failure is retried
 * on the next call.
 *
 * @param text Raw user query.
 * @returns Keyword list (possibly empty).
 */
function _jiebaTokenize(text: string): string[] {
  // The cache key is exactly the text that gets tokenized, so two queries that
  // differ anywhere inside the tokenized window never share an entry.
  const input = text.replace(/\n/g, " ").slice(0, JIEBA_MAX_INPUT_CHARS);
  if (input.trim() === "") return [];

  const cached = _jiebaCache.get(input);
  if (cached !== undefined) {
    // Re-insert to mark the entry as most recently used.
    _jiebaCache.delete(input);
    _jiebaCache.set(input, cached);
    return cached;
  }

  try {
    // execFileSync spawns python3 directly (no shell), so `$()`, backticks and
    // quotes inside the query cannot be interpreted as shell syntax.
    const stdout = execFileSync(
      "python3",
      ["-c", JIEBA_SCRIPT, input, JSON.stringify(_NAC_WORDS)],
      {
        timeout: JIEBA_TIMEOUT_MS,
        encoding: "utf8",
        // jieba logs dictionary loading to stderr on every start; drop it.
        stdio: ["ignore", "pipe", "ignore"],
        // Make sure non-ASCII keywords can be printed regardless of locale.
        env: { ...process.env, PYTHONIOENCODING: "utf-8" },
      }
    );

    const parsed: unknown = JSON.parse(stdout.trim());
    if (!Array.isArray(parsed)) {
      throw new Error("jieba output is not a JSON array");
    }
    const tokens = parsed.filter((t): t is string => typeof t === "string");

    if (_jiebaCache.size >= JIEBA_CACHE_LIMIT) {
      // First key in iteration order is the least recently used one.
      const oldest = _jiebaCache.keys().next().value;
      if (oldest !== undefined) _jiebaCache.delete(oldest);
    }
    _jiebaCache.set(input, tokens);
    return tokens;
  } catch {
    // Deliberate best-effort degradation: regex extraction of CJK runs.
    return text.match(/[\u4e00-\u9fa5]{2,8}/g) || [];
  }
}

// ─── Type definitions ──────────────────────────────────────────────

/** A knowledge-base rule normalized for prompt construction. */
export interface RetrievedRule {
  ruleId: string;
  ruleName: string;
  jurisdiction: string;
  category: string;
  assetClass?: string;
  ruleType?: string;
  content: string;
  description?: string;
  score: number;
  source: string;
  // Extended fields of trade-rule documents
  ownershipRequirements?: Record<string, unknown>;
  tradingRequirements?: Record<string, unknown>;
  legalBasis?: string;
  sourceUrl?: string;
  complianceLevel?: string;
  tags?: string[];
}

/** Result of one retrieval run, including the detected query intent. */
export interface RAGContext {
  rules: RetrievedRule[];
  totalFound: number;
  retrievalMethod: "fulltext" | "regex" | "structured" | "sample" | "none";
  queryKeywords: string[];
  detectedJurisdiction?: string;
  detectedAssetClass?: string;
  detectedRuleType?: string;
}

// ─── Intent detection: jurisdiction / asset class / rule type maps ──

/** Query term -> jurisdiction code. The first matching entry (in this order) wins. */
const JURISDICTION_MAP: Record<string, string> = {
  // Chinese name -> code
  "美国": "US", "美利坚": "US",
  "欧盟": "EU", "欧洲": "EU",
  "英国": "GB", "英格兰": "GB",
  "香港": "HK", "港": "HK",
  "新加坡": "SG", "狮城": "SG",
  "迪拜": "AE", "阿联酋": "AE", "阿布扎比": "AE",
  "中国": "CN", "大陆": "CN", "内地": "CN",
  "日本": "JP",
  "澳大利亚": "AU", "澳洲": "AU",
  "瑞士": "CH",
  "韩国": "KR",
  "加拿大": "CA",
  "德国": "DE",
  "法国": "FR",
  "开曼": "KY", "开曼群岛": "KY",
  "英属维尔京": "VG", "BVI": "VG",
  "巴哈马": "BS",
  "百慕大": "BM",
  // English codes map to themselves
  "US": "US", "EU": "EU", "GB": "GB", "HK": "HK", "SG": "SG",
  "AE": "AE", "CN": "CN", "JP": "JP", "AU": "AU", "CH": "CH",
  "KR": "KR", "CA": "CA", "DE": "DE", "FR": "FR",
};

/**
 * Pre-compiled matchers for JURISDICTION_MAP, in map order.
 *
 * Chinese terms use plain substring search. Pure ASCII terms (country codes,
 * "BVI") must not be adjacent to another ASCII letter or digit; otherwise
 * "USDT" would be read as US, "ESG" as SG, and so on.
 */
const JURISDICTION_MATCHERS: ReadonlyArray<{
  matches: (query: string) => boolean;
  code: string;
}> = Object.entries(JURISDICTION_MAP).map(([term, code]) => {
  if (/^[A-Za-z0-9]+$/.test(term)) {
    const pattern = new RegExp(`(?<![A-Za-z0-9])${term}(?![A-Za-z0-9])`);
    return { matches: (query: string) => pattern.test(query), code };
  }
  return { matches: (query: string) => query.includes(term), code };
});

/** Query term -> asset class. Matched case-insensitively as a substring. */
const ASSET_CLASS_MAP: Record<string, string> = {
  // Real estate
  "房地产": "RealEstate", "不动产": "RealEstate", "房产": "RealEstate",
  "商业地产": "RealEstate", "住宅": "RealEstate", "写字楼": "RealEstate",
  "REITs": "RealEstate", "REIT": "RealEstate", "房地产投资信托": "RealEstate",
  // Financial securities
  "证券": "FinancialSecurities", "股票": "FinancialSecurities", "债券": "FinancialSecurities",
  "基金": "FinancialSecurities", "期货": "FinancialSecurities", "期权": "FinancialSecurities",
  "security": "FinancialSecurities", "securities": "FinancialSecurities",
  // Commodities
  "大宗商品": "Commodities", "黄金": "Commodities", "白银": "Commodities",
  "原油": "Commodities", "铜": "Commodities", "铁矿石": "Commodities",
  "农产品": "Commodities", "粮食": "Commodities",
  // Digital assets
  "数字资产": "DigitalAssets", "加密资产": "DigitalAssets", "虚拟资产": "DigitalAssets",
  "代币": "DigitalAssets", "Token": "DigitalAssets", "NFT": "DigitalAssets",
  "稳定币": "DigitalAssets", "XTZH": "DigitalAssets",
  // Carbon emission rights
  "碳排放": "EnvironmentalRights", "碳信用": "EnvironmentalRights",
  "碳权": "EnvironmentalRights", "排放权": "EnvironmentalRights",
  // Intellectual property
  "知识产权": "IntellectualProperty", "专利": "IntellectualProperty",
  "商标": "IntellectualProperty", "版权": "IntellectualProperty",
  // Infrastructure
  "基础设施": "Infrastructure", "电力": "Infrastructure", "水务": "Infrastructure",
  // Receivables
  "应收账款": "Receivables", "贸易融资": "Receivables",
};

/** Query term -> rule type. Matched case-sensitively as a substring. */
const RULE_TYPE_MAP: Record<string, string> = {
  "所有权": "ownership_verification", "产权": "ownership_verification",
  "确权": "ownership_verification", "所有人": "ownership_verification",
  "交易": "trading_permission", "买卖": "trading_permission",
  "上市": "trading_permission", "流通": "trading_permission",
  "KYC": "kyc_aml", "AML": "kyc_aml", "反洗钱": "kyc_aml",
  "身份验证": "kyc_aml", "尽职调查": "kyc_aml",
  "税": "tax_compliance", "税务": "tax_compliance", "纳税": "tax_compliance",
  "跨境": "cross_border", "境外": "cross_border", "外资": "cross_border",
  "结算": "settlement", "清算": "settlement",
  "托管": "custody", "保管": "custody",
  "披露": "disclosure", "信息披露": "disclosure",
  "牌照": "licensing", "许可证": "licensing", "执照": "licensing",
};

/** Tokens that carry no retrieval value and are dropped from the keyword list. */
const STOP_WORDS: ReadonlySet<string> = new Set([
  "的", "了", "是", "在", "我", "有", "和", "就", "不", "人", "都", "一", "一个",
  "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好",
  "自己", "这", "那", "什么", "如何", "怎么", "请问", "帮我", "告诉", "介绍",
  "关于", "对于", "针对", "需要", "可以", "应该", "必须", "规定", "要求",
  "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
  "have", "has", "had", "do", "does", "did", "will", "would", "could", "should",
  "what", "how", "when", "where", "why", "which", "who",
]);

/**
 * Detects the structured intent of a user question.
 *
 * @param queryInput The user question; non-string values are coerced with
 *   `String()`, and `null`/`undefined` become the empty string.
 * @returns The first matching jurisdiction, asset class and rule type (each
 *   optional) plus up to 10 de-duplicated keywords (jieba tokens, upper-case
 *   acronyms and English words of 4+ letters, minus stop words).
 */
function detectQueryIntent(queryInput: unknown): {
  jurisdiction?: string;
  assetClass?: string;
  ruleType?: string;
  keywords: string[];
} {
  // Type guard: make sure we work on a string.
  const query = typeof queryInput === "string" ? queryInput : String(queryInput ?? "");

  const detectedJurisdiction = JURISDICTION_MATCHERS.find(m => m.matches(query))?.code;

  const loweredQuery = query.toLowerCase();
  const detectedAssetClass = Object.entries(ASSET_CLASS_MAP)
    .find(([term]) => loweredQuery.includes(term.toLowerCase()))?.[1];

  const detectedRuleType = Object.entries(RULE_TYPE_MAP)
    .find(([term]) => query.includes(term))?.[1];

  // v3.0: jieba segmentation instead of a plain regex for the Chinese part.
  const jiebaTokens = _jiebaTokenize(query);
  const acronyms = query.match(/[A-Z][A-Z0-9-]{1,}/g) || [];
  const englishWords = query.match(/[a-zA-Z]{4,}/g) || [];
  const candidates = [...jiebaTokens, ...acronyms, ...englishWords];
  const keywords = Array.from(
    new Set(candidates.filter(t => !STOP_WORDS.has(t.toLowerCase()) && t.length >= 2))
  ).slice(0, 10);

  return {
    jurisdiction: detectedJurisdiction,
    assetClass: detectedAssetClass,
    ruleType: detectedRuleType,
    keywords,
  };
}

// ─── Main retrieval function ───────────────────────────────────────

/** Result count used when `maxResults` is omitted or invalid. */
const DEFAULT_MAX_RESULTS = 6;
/** Later strategies run only while fewer than this many rules were found. */
const MIN_RULES_BEFORE_FALLBACK = 3;

/** Escapes regular-expression metacharacters so `literal` matches itself. */
function escapeRegex(literal: string): string {
  return literal.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/** Extracts a printable message from a caught value of unknown type. */
function errorMessage(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

/**
 * Appends the documents not already present in `existing` (compared by
 * ruleId / _id), formatted as rules, and caps the list at `limit` entries.
 */
function mergeNewRules(
  existing: RetrievedRule[],
  docs: Array<Record<string, unknown>>,
  language: string,
  limit: number
): RetrievedRule[] {
  const seenIds = new Set(existing.map(rule => rule.ruleId));
  const fresh = docs
    .filter(doc => !seenIds.has(String(doc.ruleId || doc._id)))
    .map((doc, idx) => formatRule(doc, language, idx, docs.length));
  return [...existing, ...fresh].slice(0, limit);
}

/**
 * Retrieves the rules most relevant to a question from the MongoDB knowledge
 * base (core RAG function).
 *
 * @param queryInput The user question; non-strings are coerced to string.
 * @param options.maxResults Maximum number of rules returned (default 6).
 *   Values that are not finite or are below 1 fall back to the default,
 *   because MongoDB treats `limit(0)` as "no limit" and `$sample` rejects a
 *   non-positive size.
 * @param options.jurisdictions Jurisdiction codes for the structured filter.
 *   A non-empty list takes precedence over the jurisdiction detected in the
 *   question.
 * @param options.categories Legacy `category` values for the structured
 *   filter; only applied when no asset class is detected in the question.
 * @param options.language `"zh"` (default) prefers Chinese content; any other
 *   value prefers English content.
 * @returns The retrieval context. When the database is unavailable
 *   (`getMongoDb()` yields a falsy value) an empty context with method
 *   `"none"` is returned. Failures of individual strategies are logged with
 *   `console.warn` and the next strategy is tried.
 */
export async function retrieveRelevantRules(
  queryInput: unknown,
  options: {
    maxResults?: number;
    jurisdictions?: string[];
    categories?: string[];
    language?: string;
  } = {}
): Promise<RAGContext> {
  // Type guard: make sure we work on a string.
  const query = typeof queryInput === "string" ? queryInput : String(queryInput ?? "");
  const { maxResults = DEFAULT_MAX_RESULTS, jurisdictions, categories, language = "zh" } = options;
  const limit = Number.isFinite(maxResults) && maxResults >= 1
    ? Math.floor(maxResults)
    : DEFAULT_MAX_RESULTS;

  const db = await getMongoDb();
  if (!db) {
    return { rules: [], totalFound: 0, retrievalMethod: "none", queryKeywords: [] };
  }

  // Intent detection
  const intent = detectQueryIntent(query);
  const keywords = intent.keywords;

  const collection = db.collection(COLLECTIONS.COMPLIANCE_RULES);

  // Base filter for the structured strategy (works for both document shapes).
  const baseFilter: Record<string, unknown> = {};

  // Caller-supplied jurisdictions win; otherwise use the detected one plus
  // the "GLOBAL" rules. An empty caller list is treated as "not supplied".
  const targetJurisdictions = jurisdictions && jurisdictions.length > 0
    ? jurisdictions
    : intent.jurisdiction
      ? [intent.jurisdiction, "GLOBAL"]
      : undefined;
  if (targetJurisdictions) {
    baseFilter.jurisdiction = { $in: targetJurisdictions };
  }

  // Asset class: trade-rule documents use `assetClass`, legacy ones `category`.
  const targetAssetClass = intent.assetClass;
  if (targetAssetClass) {
    baseFilter.$or = [
      { assetClass: { $in: [targetAssetClass, "ALL"] } },
      { category: { $regex: targetAssetClass, $options: "i" } },
    ];
  } else if (categories && categories.length > 0) {
    baseFilter.category = { $in: categories };
  }

  // Rule type
  if (intent.ruleType) {
    baseFilter.ruleType = intent.ruleType;
  }

  let rules: RetrievedRule[] = [];
  let retrievalMethod: RAGContext["retrievalMethod"] = "none";

  // ── Strategy 1: structured exact match (only when intent was detected) ──
  // NOTE(review): caller-supplied jurisdictions/categories are therefore only
  // applied when the question itself yields an intent - confirm this is intended.
  if (intent.jurisdiction || intent.assetClass || intent.ruleType) {
    try {
      const structuredResults = await collection
        .find(baseFilter)
        .sort({ tier: 1, relevance: -1 })
        .limit(limit)
        .toArray();

      if (structuredResults.length > 0) {
        rules = structuredResults.map((doc, idx) =>
          formatRule(doc, language, idx, structuredResults.length));
        retrievalMethod = "structured";
      }
    } catch (e) {
      console.warn("[RAG] 结构化检索失败:", errorMessage(e));
    }
  }

  // ── Strategy 2: MongoDB full-text search ───────────────────────────────
  if (rules.length < MIN_RULES_BEFORE_FALLBACK && keywords.length > 0) {
    try {
      const textFilter: Record<string, unknown> = {
        $text: { $search: keywords.join(" ") },
      };
      // No jurisdiction filter here on purpose: widen the full-text scope.
      if (intent.assetClass) {
        textFilter.$or = [
          { assetClass: { $in: [intent.assetClass, "ALL"] } },
          { category: { $regex: intent.assetClass, $options: "i" } },
        ];
      }

      const textResults = await collection
        .find(textFilter, {
          projection: {
            score: { $meta: "textScore" },
            ruleId: 1, ruleName: 1, jurisdiction: 1, category: 1, assetClass: 1,
            ruleType: 1, content: 1, contentEn: 1, description: 1,
            ownershipRequirements: 1, tradingRequirements: 1,
            legalBasis: 1, sourceUrl: 1, complianceLevel: 1, tags: 1,
            "translations.zh": 1, "translations.en": 1,
          },
        })
        .sort({ score: { $meta: "textScore" } })
        .limit(limit)
        .toArray();

      if (textResults.length > 0) {
        rules = mergeNewRules(rules, textResults, language, limit);
        if (retrievalMethod === "none") retrievalMethod = "fulltext";
      }
    } catch (e) {
      console.warn("[RAG] 全文检索失败,降级到正则检索:", errorMessage(e));
    }
  }

  // ── Strategy 3: regex keyword match ────────────────────────────────────
  if (rules.length < MIN_RULES_BEFORE_FALLBACK && keywords.length > 0) {
    try {
      // Every one of the first four keywords must occur in at least one field.
      const regexConditions = keywords.slice(0, 4).map(kw => {
        // Keywords come from user text: match them literally, not as patterns.
        const literal = { $regex: escapeRegex(kw), $options: "i" };
        return {
          $or: [
            { ruleName: literal },
            { ruleNameEn: literal },
            { description: literal },
            { content: literal },
            { contentEn: literal },
            { tags: literal },
            { "translations.zh": literal },
          ],
        };
      });

      const regexFilter: Record<string, unknown> = { $and: regexConditions };

      const regexResults = await collection
        .find(regexFilter)
        .limit(limit)
        .toArray();

      if (regexResults.length > 0) {
        rules = mergeNewRules(rules, regexResults, language, limit);
        if (retrievalMethod === "none") retrievalMethod = "regex";
      }
    } catch (e) {
      console.warn("[RAG] 正则检索失败:", errorMessage(e));
    }
  }

  // ── Strategy 4: random sample (last resort) ────────────────────────────
  if (rules.length === 0) {
    try {
      const sampleResults = await collection
        .aggregate([
          { $match: {} },
          { $sample: { size: limit } },
        ])
        .toArray();

      if (sampleResults.length > 0) {
        // Fixed low score: these rules are not known to be relevant.
        rules = sampleResults.map((doc, idx) =>
          formatRule(doc, language, idx, sampleResults.length, 0.3));
        retrievalMethod = "sample";
      }
    } catch (e) {
      console.warn("[RAG] 随机采样失败:", errorMessage(e));
    }
  }

  return {
    rules,
    totalFound: rules.length,
    retrievalMethod,
    queryKeywords: keywords,
    detectedJurisdiction: intent.jurisdiction,
    detectedAssetClass: intent.assetClass,
    detectedRuleType: intent.ruleType,
  };
}

// ─── Formatting helper ─────────────────────────────────────────────

/**
 * Converts a raw knowledge-base document into a RetrievedRule.
 *
 * @param doc Raw document in either the legacy or the trade-rule shape.
 * @param language `"zh"` prefers `content` / `translations.zh`; anything else
 *   prefers `contentEn` / `translations.en`. The other language is the fallback.
 * @param idx Rank of the document within its result list (0-based).
 * @param total Length of that result list.
 * @param baseScore Optional fixed score; when omitted the score is derived
 *   from the rank as `max(0.4, 1 - (idx / total) * 0.5)`.
 * @returns The normalized rule; `content` is cut to 800 characters plus "...".
 */
function formatRule(
  doc: Record<string, unknown>,
  language: string,
  idx: number,
  total: number,
  baseScore?: number
): RetrievedRule {
  const score = baseScore !== undefined
    ? baseScore
    : Math.max(0.4, 1.0 - (idx / total) * 0.5);

  // Content field differs between the two document shapes.
  const translations = doc.translations as Record<string, unknown> | undefined;
  let content = "";
  if (language === "zh") {
    content = String(doc.content || translations?.zh || doc.contentEn || translations?.en || "");
  } else {
    content = String(doc.contentEn || translations?.en || doc.content || translations?.zh || "");
  }

  // Trade-rule texts are long; cap them at 800 characters.
  const truncatedContent = content.length > 800
    ? content.slice(0, 800) + "..."
    : content;

  const ruleId = String(doc.ruleId || doc._id || "");
  const ruleName = String(doc.ruleName || doc.ruleNameEn || "未命名规则");
  const jurisdiction = String(doc.jurisdiction || "未知");
  const category = String(doc.category || doc.assetClass || "通用");
  const description = doc.description ? String(doc.description) : undefined;

  return {
    ruleId,
    ruleName,
    jurisdiction,
    category,
    assetClass: doc.assetClass ? String(doc.assetClass) : undefined,
    ruleType: doc.ruleType ? String(doc.ruleType) : undefined,
    content: truncatedContent,
    description,
    score,
    source: `${jurisdiction}·${category}·${ruleName.slice(0, 20)}`,
    ownershipRequirements: doc.ownershipRequirements as Record<string, unknown> | undefined,
    tradingRequirements: doc.tradingRequirements as Record<string, unknown> | undefined,
    legalBasis: doc.legalBasis ? String(doc.legalBasis) : undefined,
    sourceUrl: doc.sourceUrl ? String(doc.sourceUrl) : undefined,
    complianceLevel: doc.complianceLevel ? String(doc.complianceLevel) : undefined,
    tags: Array.isArray(doc.tags) ? doc.tags.map(String) : undefined,
  };
}

// ─── RAG prompt context builder ────────────────────────────────────

/**
 * Renders retrieved rules as the context section of an AI prompt.
 *
 * @param ragCtx Result of retrieveRelevantRules.
 * @returns A newline-joined text block: a header with the rule count, the
 *   retrieval method and the detected jurisdiction / asset class, then one
 *   section per rule, then two closing instructions for the model. Returns
 *   the empty string when there are no rules.
 */
export function buildRAGPromptContext(ragCtx: RAGContext): string {
  if (ragCtx.rules.length === 0) {
    return "";
  }

  const lines: string[] = [
    "【知识库检索结果】",
    `(共检索到 ${ragCtx.totalFound} 条相关规则,检索方式:${ragCtx.retrievalMethod})`,
  ];

  if (ragCtx.detectedJurisdiction) {
    lines.push(`(识别到司法辖区:${ragCtx.detectedJurisdiction})`);
  }
  if (ragCtx.detectedAssetClass) {
    lines.push(`(识别到资产类别:${ragCtx.detectedAssetClass})`);
  }
  lines.push("");

  ragCtx.rules.forEach((rule, idx) => {
    lines.push(`【规则 ${idx + 1}】${rule.ruleName}`);
    lines.push(` 辖区:${rule.jurisdiction} | 类别:${rule.category} | 相关度:${Math.round(rule.score * 100)}%`);
    if (rule.ruleType) lines.push(` 规则类型:${rule.ruleType}`);
    if (rule.legalBasis) lines.push(` 法律依据:${rule.legalBasis}`);
    if (rule.complianceLevel) lines.push(` 合规级别:${rule.complianceLevel}`);
    if (rule.description) lines.push(` 摘要:${rule.description}`);
    lines.push(` 内容:${rule.content}`);

    // Ownership requirements (core fields)
    const ownership = rule.ownershipRequirements;
    if (ownership) {
      if (ownership.proofDocuments && Array.isArray(ownership.proofDocuments)) {
        lines.push(` 所有权证明文件:${ownership.proofDocuments.join("、")}`);
      }
      if (ownership.registrationAuthority) {
        lines.push(` 登记机构:${ownership.registrationAuthority}`);
      }
      if (ownership.transferMechanism) {
        lines.push(` 转移机制:${ownership.transferMechanism}`);
      }
      if (ownership.chainRecognition) {
        lines.push(` 链上法律认可:${ownership.chainRecognition}`);
      }
      if (ownership.foreignOwnershipRestriction) {
        lines.push(` 外资限制:${ownership.foreignOwnershipRestriction}`);
      }
    }

    // Trading requirements
    const trading = rule.tradingRequirements;
    if (trading) {
      if (trading.minimumInvestor) lines.push(` 最低投资者资质:${trading.minimumInvestor}`);
      if (trading.settlementPeriod) lines.push(` 结算周期:${trading.settlementPeriod}`);
      if (trading.allowedCurrencies && Array.isArray(trading.allowedCurrencies)) {
        lines.push(` 允许结算货币:${trading.allowedCurrencies.join("、")}`);
      }
    }

    if (rule.sourceUrl) lines.push(` 来源:${rule.sourceUrl}`);
    lines.push("");
  });

  lines.push("请严格基于以上知识库规则回答用户问题,并在回答中注明引用的规则编号和来源。");
  lines.push("如果知识库中没有完全匹配的规则,请基于已有规则进行合理推断,并说明推断依据。");

  return lines.join("\n");
}

// ─── Backward-compatible alias ─────────────────────────────────────
/**
 * @deprecated Use retrieveRelevantRules instead.
 * Kept so that older production code (nacInferenceEngine.ts, aiAgents.ts),
 * which imports `retrieveComplianceRules`, keeps working.
 */
export const retrieveComplianceRules = retrieveRelevantRules;