// Source path (from the originating patch): nac-ai-inference/src/engine/ragEngine.ts
/**
 * NAC RAG retrieval engine.
 *
 * Retrieves relevant content from the MongoDB knowledge base
 * (`compliance_rules` + `nac_tech_docs`). Supports keyword matching as well
 * as jurisdiction and asset-class filtering.
 */

import type { Db } from "mongodb";

/** A knowledge-base document as returned by either searched collection. */
export interface RAGDocument {
  _id?: unknown;
  ruleId?: string;
  ruleName?: string;
  ruleNameI18n?: Record<string, string>;
  /** Queried on `nac_tech_docs` and used as a display-name fallback. */
  title?: string;
  jurisdiction?: string;
  category?: string;
  assetClass?: string;
  description?: string;
  descriptionI18n?: Record<string, string>;
  content?: string;
  tags?: string[];
  relevance?: number;
  /** Text-search score, projected only on the `$text` query path. */
  score?: number;
  /** Collection the document was read from; set by `retrieveRelevantDocs`. */
  source?: "compliance_rules" | "nac_tech_docs";
}

/** Result of a retrieval: the selected documents plus the derived query facets. */
export interface RAGContext {
  documents: RAGDocument[];
  /** Number of documents fetched before truncation to `maxResults`. */
  totalFound: number;
  queryKeywords: string[];
  jurisdiction?: string;
  assetClass?: string;
}

// ─── Keyword extraction ───────────────────────────────────────────

const STOP_WORDS = new Set<string>([
  "的", "了", "是", "在", "我", "有", "和", "就", "不", "人", "都", "一",
  "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看",
  "自己", "这", "那", "什么", "如何", "怎么", "请问", "帮我", "告诉", "介绍",
  "关于", "对于", "针对", "需要", "可以", "应该", "必须", "规定", "要求",
  "the", "a", "an", "is", "are", "was", "were", "be", "been",
  "have", "has", "had", "do", "does", "did", "will", "would", "could",
  "what", "how", "when", "where", "why", "which", "who",
]);

const JURISDICTION_MAP: Record<string, string> = {
  "香港": "HK", "港": "HK", "HK": "HK",
  "新加坡": "SG", "狮城": "SG", "SG": "SG",
  "迪拜": "AE", "阿联酋": "AE", "AE": "AE",
  "中国": "CN", "大陆": "CN", "CN": "CN",
  "美国": "US", "US": "US",
  "欧盟": "EU", "EU": "EU",
  "日本": "JP", "JP": "JP",
  "澳大利亚": "AU", "AU": "AU",
};

const ASSET_CLASS_MAP: Record<string, string> = {
  "房地产": "RealEstate", "不动产": "RealEstate", "房产": "RealEstate",
  "证券": "FinancialSecurities", "股票": "FinancialSecurities", "债券": "FinancialSecurities",
  "大宗商品": "Commodities", "黄金": "Commodities", "原油": "Commodities",
  "数字资产": "DigitalAssets", "代币": "DigitalAssets", "NFT": "DigitalAssets",
  "碳排放": "EnvironmentalRights", "碳信用": "EnvironmentalRights",
  "知识产权": "IntellectualProperty", "专利": "IntellectualProperty",
  "基础设施": "Infrastructure",
  "应收账款": "Receivables",
};

/** Default number of documents returned when `maxResults` is not supplied. */
const DEFAULT_MAX_RESULTS = 8;
/** Share of `maxResults` that each collection may contribute on its own. */
const PER_COLLECTION_SHARE = 0.6;
/** Maximum number of documents rendered into the context string. */
const CONTEXT_DOC_LIMIT = 6;
/** Maximum number of characters of description rendered per document. */
const CONTEXT_DESC_CHARS = 400;

/** Keywords that name NAC technical concepts; hoisted so it is compiled once. */
const TECH_KEYWORD_PATTERN =
  /charter|nvm|cbpp|csnp|cnnl|gnacs|acc|xtzh|xic|nrpc|rwa|token|合约|虚拟机|共识|协议|治理|宪法|收据|估值|地址|主权|辖区|合规|身份|kyc|aml|erc|solidity|evm|pos|pow|比特币|以太坊|稳定币|代币|资产|上链|流程|注册|浏览器|量子|跨链|互操作|联盟链|公有链|私有链|第四种|私法监管|流体区块|七层|sdr|黄金|香港|新加坡|迪拜|sfc|mas|vara|adgm|difc|acc20|cee|did|nac公链|宪政|神经网络|编程语言|开发者|生态|对比|区别|不同|比较|什么是|如何工作|原理|架构|技术栈/i;

/** Broader trigger words that also cause the tech-doc collection to be searched. */
const TECH_TRIGGER_PATTERN =
  /技术|开发|编程|合约|虚拟机|治理|合规|身份|估值|资产|上链|区别|对比|比较|什么|如何|怎么|为什么|原理|架构|生态|公链|区块链/i;

/** Categories always admitted by the tech-doc query. */
const TECH_DOC_CATEGORIES: readonly string[] = [
  "overview", "comparison", "governance", "consensus", "economics", "compliance",
  "identity", "development", "network", "asset_standard", "process", "valuation", "tools",
];

const ASCII_ONLY = /^[\x20-\x7E]+$/;

/** Escapes regular-expression metacharacters so `text` is matched literally. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Builds a predicate that reports whether `term` occurs in a text.
 *
 * ASCII terms (country codes such as "US") must stand alone: they may not be
 * adjacent to another ASCII letter or digit, so "USDT" or "AUDIT" no longer
 * count as the US / AU jurisdiction. Non-ASCII terms keep plain substring
 * matching because CJK text has no word delimiters.
 */
function buildTermMatcher(term: string): (text: string) => boolean {
  if (!ASCII_ONLY.test(term)) {
    return text => text.includes(term);
  }
  const bounded = new RegExp(`(?<![A-Za-z0-9])${escapeRegExp(term)}(?![A-Za-z0-9])`);
  return text => bounded.test(text);
}

/** Jurisdiction matchers in `JURISDICTION_MAP` declaration order (first hit wins). */
const JURISDICTION_MATCHERS: ReadonlyArray<{ matches: (text: string) => boolean; code: string }> =
  Object.entries(JURISDICTION_MAP).map(([term, code]) => ({ matches: buildTermMatcher(term), code }));

/**
 * Derives search keywords and optional jurisdiction / asset-class facets from
 * a free-text query.
 *
 * Keywords are runs of 2-8 CJK characters and runs of 3+ Latin letters,
 * de-duplicated, with stop words removed, capped at 8 entries.
 *
 * @param query Free-text user query; `null`/`undefined` is treated as "".
 * @returns The keywords plus the first matching jurisdiction code and asset
 *          class (each `undefined` when nothing matches).
 */
function extractKeywords(query: string): { keywords: string[]; jurisdiction?: string; assetClass?: string } {
  const q = String(query ?? "");

  const jurisdiction = JURISDICTION_MATCHERS.find(m => m.matches(q))?.code;

  // Asset-class terms are matched case-insensitively as substrings.
  const lowered = q.toLowerCase();
  const assetClass = Object.entries(ASSET_CLASS_MAP)
    .find(([term]) => lowered.includes(term.toLowerCase()))?.[1];

  const chineseTerms = q.match(/[\u4e00-\u9fa5]{2,8}/g) ?? [];
  const englishTerms = q.match(/[a-zA-Z]{3,}/g) ?? [];
  const keywords = Array.from(new Set<string>([...chineseTerms, ...englishTerms]))
    .filter(t => !STOP_WORDS.has(t.toLowerCase()))
    .slice(0, 8);

  return { keywords, jurisdiction, assetClass };
}

// ─── Collection searches ──────────────────────────────────────────

/**
 * Searches `compliance_rules`. Never rejects: failures are logged and yield [].
 *
 * With keywords it tries a `$text` search ordered by text score and, if that
 * query throws (e.g. the text index is unavailable), falls back to a regex
 * match. Without keywords it returns the filtered rules ordered by `relevance`.
 */
async function searchComplianceRules(
  db: Db,
  keywords: string[],
  filter: Record<string, unknown>,
  limit: number,
): Promise<RAGDocument[]> {
  try {
    const collection = db.collection("compliance_rules");

    if (keywords.length === 0) {
      return await collection
        .find(filter)
        .sort({ relevance: -1 })
        .limit(limit)
        .toArray() as RAGDocument[];
    }

    try {
      return await collection
        .find({ ...filter, $text: { $search: keywords.join(" ") } })
        .project({ score: { $meta: "textScore" } })
        .sort({ score: { $meta: "textScore" } })
        .limit(limit)
        .toArray() as RAGDocument[];
    } catch {
      // Deliberate best-effort: fall back to regex matching when the text
      // search fails. A persistent failure resurfaces below and is logged.
      const regex = new RegExp(keywords.slice(0, 3).map(escapeRegExp).join("|"), "i");
      return await collection
        .find({
          ...filter,
          $or: [
            { ruleName: regex },
            { description: regex },
            { content: regex },
            { tags: { $in: keywords } },
          ],
        })
        .limit(limit)
        .toArray() as RAGDocument[];
    }
  } catch (err) {
    console.warn("[ragEngine] compliance_rules检索失败:", err);
    return [];
  }
}

/**
 * Searches `nac_tech_docs`. Never rejects: failures are logged and yield [].
 *
 * The search runs when a keyword names a NAC technical concept, when a
 * keyword hits the broader trigger list, or when the query is longer than 10
 * characters.
 */
async function searchTechDocs(
  db: Db,
  query: string,
  keywords: string[],
  limit: number,
): Promise<RAGDocument[]> {
  try {
    const techKeywords = keywords.filter(k => TECH_KEYWORD_PATTERN.test(k));
    const shouldSearch =
      techKeywords.length > 0 ||
      keywords.some(k => TECH_TRIGGER_PATTERN.test(k)) ||
      query.length > 10; // any longer query also tries the knowledge base
    if (!shouldSearch) return [];

    // Prefer the technical keywords; otherwise use every extracted keyword.
    const searchTerms = (techKeywords.length > 0 ? techKeywords : keywords).slice(0, 5);

    const clauses: Record<string, unknown>[] = [];
    if (searchTerms.length > 0) {
      // Only add text clauses when there is something to match: an empty
      // pattern would compile to a regex that matches every document.
      const regex = new RegExp(searchTerms.map(escapeRegExp).join("|"), "i");
      clauses.push({ title: regex }, { content: regex });
    }
    if (keywords.length > 0) {
      clauses.push({ tags: { $in: keywords } });
    }
    // NOTE(review): this clause admits any document in a listed category
    // regardless of the keywords, and the query has no sort, so keyword
    // relevance is not guaranteed - confirm this breadth is intended.
    clauses.push({ category: { $in: [...TECH_DOC_CATEGORIES] } });

    return await db.collection("nac_tech_docs")
      .find({ $or: clauses })
      .limit(limit)
      .toArray() as RAGDocument[];
  } catch (err) {
    console.warn("[ragEngine] nac_tech_docs检索失败:", err);
    return [];
  }
}

// ─── Main retrieval function ──────────────────────────────────────

/**
 * Retrieves knowledge-base documents relevant to a free-text query.
 *
 * Both collections are queried concurrently; compliance rules are listed
 * before tech docs, each tagged with its `source`, and the combined list is
 * truncated to `maxResults`. Collection errors are logged, not thrown.
 *
 * @param db      MongoDB database handle.
 * @param query   Free-text user query.
 * @param options Optional tuning:
 *   - `maxResults` (default 8): values below 1 return an empty result without
 *     querying, because a cursor limit of 0 would mean "no limit".
 *   - `jurisdiction` / `assetClass`: override the values detected in `query`.
 *   - `includeCompliance` / `includeTechDocs` (default true): which
 *     collections to search.
 *   - `language`: accepted for interface compatibility; not used by retrieval.
 * @returns The selected documents, the pre-truncation count, the extracted
 *          keywords, and the effective jurisdiction / asset class.
 */
export async function retrieveRelevantDocs(
  db: Db,
  query: string,
  options: {
    maxResults?: number;
    language?: string;
    jurisdiction?: string;
    assetClass?: string;
    includeCompliance?: boolean;
    includeTechDocs?: boolean;
  } = {}
): Promise<RAGContext> {
  const { includeCompliance = true, includeTechDocs = true } = options;

  const requested = options.maxResults ?? DEFAULT_MAX_RESULTS;
  const maxResults = Number.isFinite(requested) ? Math.floor(requested) : DEFAULT_MAX_RESULTS;

  const normalizedQuery = String(query ?? "");
  const { keywords, jurisdiction, assetClass } = extractKeywords(normalizedQuery);
  const effectiveJurisdiction = options.jurisdiction ?? jurisdiction;
  const effectiveAssetClass = options.assetClass ?? assetClass;

  if (maxResults < 1) {
    return {
      documents: [],
      totalFound: 0,
      queryKeywords: keywords,
      jurisdiction: effectiveJurisdiction,
      assetClass: effectiveAssetClass,
    };
  }

  const perCollectionLimit = Math.ceil(maxResults * PER_COLLECTION_SHARE);

  const filter: Record<string, unknown> = {};
  if (effectiveJurisdiction) filter.jurisdiction = effectiveJurisdiction;
  if (effectiveAssetClass) filter.assetClass = effectiveAssetClass;

  // The two lookups are independent and neither helper rejects.
  const [complianceDocs, techDocs] = await Promise.all([
    includeCompliance
      ? searchComplianceRules(db, keywords, filter, perCollectionLimit)
      : Promise.resolve<RAGDocument[]>([]),
    includeTechDocs
      ? searchTechDocs(db, normalizedQuery, keywords, perCollectionLimit)
      : Promise.resolve<RAGDocument[]>([]),
  ]);

  const results: RAGDocument[] = [
    ...complianceDocs.map((doc): RAGDocument => ({ ...doc, source: "compliance_rules" })),
    ...techDocs.map((doc): RAGDocument => ({ ...doc, source: "nac_tech_docs" })),
  ];

  return {
    documents: results.slice(0, maxResults),
    totalFound: results.length,
    queryKeywords: keywords,
    jurisdiction: effectiveJurisdiction,
    assetClass: effectiveAssetClass,
  };
}

// ─── RAG context string builder ───────────────────────────────────

/** Returns the first candidate that is a non-blank string, if any. */
function firstNonEmptyString(...candidates: unknown[]): string | undefined {
  for (const candidate of candidates) {
    if (typeof candidate === "string" && candidate.trim().length > 0) return candidate;
  }
  return undefined;
}

/**
 * Renders retrieved documents as a plain-text block for prompt injection.
 *
 * At most 6 documents are rendered, each with a numbered name line, optional
 * jurisdiction and category lines, and up to 400 characters of description.
 * Localized fields are preferred; blank or non-string values are skipped so
 * the next fallback (default field, then `title` / `content`) is used.
 *
 * @param ragCtx   Retrieval result to render.
 * @param language Key into the `*I18n` maps (default "zh").
 * @returns The rendered block, or "" when there are no documents.
 */
export function buildContextString(ragCtx: RAGContext, language: string = "zh"): string {
  if (ragCtx.documents.length === 0) return "";

  const lines: string[] = ["=== 相关知识库内容 ==="];
  ragCtx.documents.slice(0, CONTEXT_DOC_LIMIT).forEach((doc, i) => {
    const name =
      firstNonEmptyString(doc.ruleNameI18n?.[language], doc.ruleName, doc.title) ?? "未命名";
    const desc =
      firstNonEmptyString(doc.descriptionI18n?.[language], doc.description, doc.content) ?? "";
    lines.push(`\n[${i + 1}] ${name}`);
    if (doc.jurisdiction) lines.push(`辖区: ${doc.jurisdiction}`);
    if (doc.category) lines.push(`类别: ${doc.category}`);
    lines.push(desc.slice(0, CONTEXT_DESC_CHARS));
  });
  return lines.join("\n");
}