NAC_Blockchain/services/nac-admin/server/ragRetrieval.ts

488 lines
19 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

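/**
 * NAC Knowledge Engine - RAG (retrieval-augmented generation) module v2.0
 *
 * Purpose: retrieve from the MongoDB knowledge base the compliance rule clauses
 * most relevant to the user's question, and inject them as context into the
 * AI Agent prompt to improve the accuracy and traceability of its answers.
 *
 * Retrieval strategy (four progressive tiers):
 * 1. Intent detection - extract structured information such as jurisdiction,
 *    asset class and rule type (used for a structured filter query)
 * 2. MongoDB full-text search ($text index) - exact keyword matching
 * 3. Regex keyword matching - covers cases the full-text index does not hit
 * 4. Random sampling - fallback strategy so that some context is always available
 *
 * Two rule document formats are supported:
 * - Legacy format: { ruleId, ruleName, jurisdiction, category, content }
 * - New format (trade rules): { ruleId, ruleName, jurisdiction, assetClass, ruleType, content, contentEn, ownershipRequirements, tradingRequirements }
 */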
import { getMongoDb, COLLECTIONS } from "./mongodb";
// ─── 类型定义 ─────────────────────────────────────────────────────
/**
 * A compliance rule as returned by retrieval, normalized by `formatRule`
 * so that both the legacy and the new (trade-rule) document formats share one shape.
 */
export interface RetrievedRule {
/** Rule identifier; `formatRule` takes the document's `ruleId` and falls back to `_id`. */
ruleId: string;
/** Display name of the rule. */
ruleName: string;
/** Jurisdiction value stored on the document. */
jurisdiction: string;
/** Legacy `category`; `formatRule` falls back to `assetClass` when it is absent. */
category: string;
/** Asset class (new-format documents only). */
assetClass?: string;
/** Rule type (new-format documents only). */
ruleType?: string;
/** Rule text in the requested language, length-limited by `formatRule`. */
content: string;
/** Optional short description of the rule. */
description?: string;
/** Relevance score; rendered as a percentage by `buildRAGPromptContext`. */
score: number;
/** Human-readable provenance label built from jurisdiction, category and rule name. */
source: string;
// Extended fields for trade rules
/** Ownership requirements object copied from the document, if any. */
ownershipRequirements?: Record<string, unknown>;
/** Trading requirements object copied from the document, if any. */
tradingRequirements?: Record<string, unknown>;
/** Legal basis text, if the document provides one. */
legalBasis?: string;
/** URL of the rule's source, if the document provides one. */
sourceUrl?: string;
/** Compliance level, if the document provides one. */
complianceLevel?: string;
/** Tags from the document, each converted to a string. */
tags?: string[];
}
/**
 * Result of one retrieval pass: the rules found plus metadata about how
 * they were found. Consumed by `buildRAGPromptContext`.
 */
export interface RAGContext {
/** Retrieved rules, in the order they were collected. */
rules: RetrievedRule[];
/** Number of rules returned (`retrieveRelevantRules` sets this to `rules.length`). */
totalFound: number;
/** Strategy that first produced results; "none" when nothing was retrieved. */
retrievalMethod: "fulltext" | "regex" | "structured" | "sample" | "none";
/** Keywords extracted from the user query. */
queryKeywords: string[];
/** Jurisdiction code detected in the query, if any. */
detectedJurisdiction?: string;
/** Asset class detected in the query, if any. */
detectedAssetClass?: string;
/** Rule type detected in the query, if any. */
detectedRuleType?: string;
}
// ─── 意图识别:辖区/资产类别/规则类型映射 ──────────────────────────
/**
 * Jurisdiction terms → jurisdiction code.
 * Insertion order is the match priority used by `detectQueryIntent`
 * (the first term found in the query wins).
 */
const JURISDICTION_MAP: Record<string, string> = {
  // Chinese names → code
  "美国": "US", "美利坚": "US",
  "欧盟": "EU", "欧洲": "EU",
  "英国": "GB", "英格兰": "GB",
  "香港": "HK", "港": "HK",
  "新加坡": "SG", "狮城": "SG",
  "迪拜": "AE", "阿联酋": "AE", "阿布扎比": "AE",
  "中国": "CN", "大陆": "CN", "内地": "CN",
  "日本": "JP",
  "澳大利亚": "AU", "澳洲": "AU",
  "瑞士": "CH",
  "韩国": "KR",
  "加拿大": "CA",
  "德国": "DE",
  "法国": "FR",
  "开曼": "KY", "开曼群岛": "KY",
  "英属维尔京": "VG", "BVI": "VG",
  "巴哈马": "BS",
  "百慕大": "BM",
  // English codes map to themselves
  "US": "US", "EU": "EU", "GB": "GB", "HK": "HK", "SG": "SG",
  "AE": "AE", "CN": "CN", "JP": "JP", "AU": "AU", "CH": "CH",
  "KR": "KR", "CA": "CA", "DE": "DE", "FR": "FR",
};

/**
 * Asset-class terms → asset class identifier.
 * Matched case-insensitively as substrings; insertion order is the match priority.
 */
const ASSET_CLASS_MAP: Record<string, string> = {
  // Real estate
  "房地产": "RealEstate", "不动产": "RealEstate", "房产": "RealEstate",
  "商业地产": "RealEstate", "住宅": "RealEstate", "写字楼": "RealEstate",
  "REITs": "RealEstate", "REIT": "RealEstate", "房地产投资信托": "RealEstate",
  // Financial securities
  "证券": "FinancialSecurities", "股票": "FinancialSecurities", "债券": "FinancialSecurities",
  "基金": "FinancialSecurities", "期货": "FinancialSecurities", "期权": "FinancialSecurities",
  "security": "FinancialSecurities", "securities": "FinancialSecurities",
  // Commodities
  "大宗商品": "Commodities", "黄金": "Commodities", "白银": "Commodities",
  "原油": "Commodities", "铜": "Commodities", "铁矿石": "Commodities",
  "农产品": "Commodities", "粮食": "Commodities",
  // Digital assets
  "数字资产": "DigitalAssets", "加密资产": "DigitalAssets", "虚拟资产": "DigitalAssets",
  "代币": "DigitalAssets", "Token": "DigitalAssets", "NFT": "DigitalAssets",
  "稳定币": "DigitalAssets", "XTZH": "DigitalAssets",
  // Carbon emission rights
  "碳排放": "EnvironmentalRights", "碳信用": "EnvironmentalRights",
  "碳权": "EnvironmentalRights", "排放权": "EnvironmentalRights",
  // Intellectual property
  "知识产权": "IntellectualProperty", "专利": "IntellectualProperty",
  "商标": "IntellectualProperty", "版权": "IntellectualProperty",
  // Infrastructure
  "基础设施": "Infrastructure", "电力": "Infrastructure", "水务": "Infrastructure",
  // Receivables
  "应收账款": "Receivables", "贸易融资": "Receivables",
};

/**
 * Rule-type terms → rule type identifier.
 * Matched as case-sensitive substrings; insertion order is the match priority.
 */
const RULE_TYPE_MAP: Record<string, string> = {
  "所有权": "ownership_verification", "产权": "ownership_verification",
  "确权": "ownership_verification", "所有人": "ownership_verification",
  "交易": "trading_permission", "买卖": "trading_permission",
  "上市": "trading_permission", "流通": "trading_permission",
  "KYC": "kyc_aml", "AML": "kyc_aml", "反洗钱": "kyc_aml",
  "身份验证": "kyc_aml", "尽职调查": "kyc_aml",
  "税": "tax_compliance", "税务": "tax_compliance", "纳税": "tax_compliance",
  "跨境": "cross_border", "境外": "cross_border", "外资": "cross_border",
  "结算": "settlement", "清算": "settlement",
  "托管": "custody", "保管": "custody",
  "披露": "disclosure", "信息披露": "disclosure",
  "牌照": "licensing", "许可证": "licensing", "执照": "licensing",
};

/** Words removed from the extracted keywords (compared in lower case). */
const QUERY_STOP_WORDS: ReadonlySet<string> = new Set([
  "的", "了", "是", "在", "我", "有", "和", "就", "不", "人", "都", "一", "一个",
  "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好",
  "自己", "这", "那", "什么", "如何", "怎么", "请问", "帮我", "告诉", "介绍",
  "关于", "对于", "针对", "需要", "可以", "应该", "必须", "规定", "要求",
  "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
  "have", "has", "had", "do", "does", "did", "will", "would", "could", "should",
  "what", "how", "when", "where", "why", "which", "who",
]);

/** Matches a term made only of ASCII letters/digits (e.g. "US", "BVI"). */
const ASCII_ALNUM_TERM = /^[A-Za-z0-9]+$/;
/** Matches a single ASCII letter or digit. */
const ASCII_ALNUM_CHAR = /[A-Za-z0-9]/;

/**
 * Reports whether `term` occurs in `text` as a standalone token, i.e. not
 * directly preceded or followed by an ASCII letter or digit. CJK characters
 * and punctuation count as boundaries, so "US证券" contains "US" but "USDT" does not.
 */
function containsStandaloneAsciiTerm(text: string, term: string): boolean {
  let at = text.indexOf(term);
  while (at !== -1) {
    const before = at > 0 ? text[at - 1] : "";
    const after = text[at + term.length] ?? "";
    if (!ASCII_ALNUM_CHAR.test(before) && !ASCII_ALNUM_CHAR.test(after)) {
      return true;
    }
    at = text.indexOf(term, at + 1);
  }
  return false;
}

/**
 * Detects structured intent (jurisdiction / asset class / rule type) in a user
 * question and extracts up to eight search keywords.
 *
 * Matching rules:
 * - Jurisdiction: Chinese terms match as substrings; ASCII codes such as "US"
 *   or "DE" must stand alone, so tokens like "USDT", "DEX" or "AUDIT" no longer
 *   yield a jurisdiction. Matching is case-sensitive.
 * - Asset class: case-insensitive substring match.
 * - Rule type: case-sensitive substring match.
 * In every table the first matching entry (in insertion order) wins.
 *
 * @param query Raw user question.
 * @returns Detected codes (undefined when nothing matched) and de-duplicated keywords.
 */
function detectQueryIntent(query: string): {
  jurisdiction?: string;
  assetClass?: string;
  ruleType?: string;
  keywords: string[];
} {
  // Lower-case once instead of once per table entry.
  const lowerQuery = query.toLowerCase();

  const jurisdiction = Object.entries(JURISDICTION_MAP).find(([term]) =>
    ASCII_ALNUM_TERM.test(term)
      ? containsStandaloneAsciiTerm(query, term)
      : query.includes(term)
  )?.[1];

  const assetClass = Object.entries(ASSET_CLASS_MAP).find(([term]) =>
    lowerQuery.includes(term.toLowerCase())
  )?.[1];

  const ruleType = Object.entries(RULE_TYPE_MAP).find(([term]) =>
    query.includes(term)
  )?.[1];

  // Keywords: runs of 2-8 CJK characters and Latin words of 3+ letters,
  // minus stop words, de-duplicated, capped at 8.
  const chineseTerms = query.match(/[\u4e00-\u9fa5]{2,8}/g) || [];
  const englishTerms = query.match(/[a-zA-Z]{3,}/g) || [];
  const candidates = [...chineseTerms, ...englishTerms].filter(
    term => !QUERY_STOP_WORDS.has(term.toLowerCase())
  );
  const keywords = Array.from(new Set(candidates)).slice(0, 8);

  return { jurisdiction, assetClass, ruleType, keywords };
}
// ─── 主检索函数 ───────────────────────────────────────────────────
/** Extracts a log-friendly message from a caught value of unknown type. */
function describeRetrievalError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Formats `docs` and appends those whose id is not already in `existing`,
 * capping the combined list at `limit` entries.
 */
function appendUniqueRules(
  existing: RetrievedRule[],
  docs: Record<string, unknown>[],
  language: string,
  limit: number
): RetrievedRule[] {
  const knownIds = new Set(existing.map(rule => rule.ruleId));
  const additions = docs
    .filter(doc => !knownIds.has(String(doc.ruleId || doc._id)))
    .map((doc, idx) => formatRule(doc, language, idx, docs.length));
  return [...existing, ...additions].slice(0, limit);
}

/**
 * Retrieves the rules most relevant to `query` from the MongoDB knowledge base
 * (core RAG function).
 *
 * Strategies are tried in order; later ones only run while fewer than three
 * rules have been collected (random sampling only when none have):
 *   1. structured filter query built from the detected intent,
 *   2. MongoDB full-text search,
 *   3. regex keyword matching,
 *   4. random sampling as a last resort.
 * A failing strategy is logged with `console.warn` and skipped.
 *
 * @param query   Raw user question.
 * @param options maxResults  - maximum number of rules to return (default 6);
 *                              values below 1 or non-finite yield an empty result.
 *                jurisdictions - explicit jurisdiction filter; when non-empty it
 *                              takes precedence over the jurisdiction detected in the query.
 *                categories  - legacy category filter, used only when no asset
 *                              class was detected in the query.
 *                language    - "zh" (default) prefers Chinese content, anything
 *                              else prefers English content.
 * @returns The retrieved rules plus retrieval metadata. `retrievalMethod` is
 *          "none" when the database is unavailable or nothing was found.
 */
export async function retrieveRelevantRules(
  query: string,
  options: {
    maxResults?: number;
    jurisdictions?: string[];
    categories?: string[];
    language?: string;
  } = {}
): Promise<RAGContext> {
  const { maxResults = 6, jurisdictions, categories, language = "zh" } = options;

  const db = await getMongoDb();
  if (!db) {
    return { rules: [], totalFound: 0, retrievalMethod: "none", queryKeywords: [] };
  }

  // Intent detection
  const intent = detectQueryIntent(query);
  const keywords = intent.keywords;

  // MongoDB treats limit(0) as "no limit" and rejects a non-positive $sample
  // size, so a non-positive or invalid maxResults must not reach the queries.
  const limit = Number.isFinite(maxResults) ? Math.floor(maxResults) : 0;
  if (limit < 1) {
    return {
      rules: [],
      totalFound: 0,
      retrievalMethod: "none",
      queryKeywords: keywords,
      detectedJurisdiction: intent.jurisdiction,
      detectedAssetClass: intent.assetClass,
      detectedRuleType: intent.ruleType,
    };
  }

  const collection = db.collection(COLLECTIONS.COMPLIANCE_RULES);

  // Base filter, compatible with both the legacy and the new document format.
  const baseFilter: Record<string, unknown> = {};

  // Caller-supplied jurisdictions win; otherwise fall back to the detected
  // jurisdiction plus "GLOBAL". An empty array counts as "not supplied" so it
  // does not silently discard the detected jurisdiction.
  const explicitJurisdictions =
    jurisdictions && jurisdictions.length > 0 ? jurisdictions : undefined;
  const targetJurisdictions =
    explicitJurisdictions ??
    (intent.jurisdiction ? [intent.jurisdiction, "GLOBAL"] : undefined);
  if (targetJurisdictions) {
    baseFilter.jurisdiction = { $in: targetJurisdictions };
  }

  // Asset-class clauses: new-format documents use `assetClass`, legacy ones `category`.
  const assetClassClauses = intent.assetClass
    ? [
        { assetClass: { $in: [intent.assetClass, "ALL"] } },
        { category: { $regex: intent.assetClass, $options: "i" } },
      ]
    : undefined;
  if (assetClassClauses) {
    baseFilter.$or = assetClassClauses;
  } else if (categories && categories.length > 0) {
    baseFilter.category = { $in: categories };
  }

  // Rule-type filter
  if (intent.ruleType) {
    baseFilter.ruleType = intent.ruleType;
  }

  let rules: RetrievedRule[] = [];
  let retrievalMethod: RAGContext["retrievalMethod"] = "none";

  // ── Strategy 1: structured exact match (only when intent detection hit) ──
  // NOTE(review): caller-supplied jurisdictions/categories therefore only take
  // effect when the query itself yields an intent — confirm this is intended.
  if (intent.jurisdiction || intent.assetClass || intent.ruleType) {
    try {
      const structuredResults = await collection
        .find(baseFilter)
        .sort({ tier: 1, relevance: -1 })
        .limit(limit)
        .toArray();
      if (structuredResults.length > 0) {
        rules = structuredResults.map((doc, idx) =>
          formatRule(doc, language, idx, structuredResults.length)
        );
        retrievalMethod = "structured";
      }
    } catch (e) {
      console.warn("[RAG] 结构化检索失败:", describeRetrievalError(e));
    }
  }

  // ── Strategy 2: MongoDB full-text search ──
  if (rules.length < 3 && keywords.length > 0) {
    try {
      const textFilter: Record<string, unknown> = {
        $text: { $search: keywords.join(" ") },
      };
      // Deliberately no jurisdiction filter, to widen the full-text search.
      if (assetClassClauses) {
        textFilter.$or = assetClassClauses;
      }
      const textResults = await collection
        .find(textFilter, {
          projection: {
            score: { $meta: "textScore" },
            ruleId: 1, ruleName: 1, jurisdiction: 1, category: 1, assetClass: 1,
            ruleType: 1, content: 1, contentEn: 1, description: 1,
            ownershipRequirements: 1, tradingRequirements: 1,
            legalBasis: 1, sourceUrl: 1, complianceLevel: 1, tags: 1,
            "translations.zh": 1, "translations.en": 1,
          },
        })
        .sort({ score: { $meta: "textScore" } })
        .limit(limit)
        .toArray();
      if (textResults.length > 0) {
        rules = appendUniqueRules(rules, textResults, language, limit);
        if (retrievalMethod === "none") retrievalMethod = "fulltext";
      }
    } catch (e) {
      console.warn("[RAG] 全文检索失败,降级到正则检索:", describeRetrievalError(e));
    }
  }

  // ── Strategy 3: regex keyword matching ──
  if (rules.length < 3 && keywords.length > 0) {
    try {
      // Every one of the first four keywords must match at least one text field.
      const regexConditions = keywords.slice(0, 4).map(kw => ({
        $or: [
          { ruleName: { $regex: kw, $options: "i" } },
          { ruleNameEn: { $regex: kw, $options: "i" } },
          { description: { $regex: kw, $options: "i" } },
          { content: { $regex: kw, $options: "i" } },
          { contentEn: { $regex: kw, $options: "i" } },
          { tags: { $regex: kw, $options: "i" } },
          { "translations.zh": { $regex: kw, $options: "i" } },
        ],
      }));
      const regexFilter: Record<string, unknown> = { $and: regexConditions };
      const regexResults = await collection
        .find(regexFilter)
        .limit(limit)
        .toArray();
      if (regexResults.length > 0) {
        rules = appendUniqueRules(rules, regexResults, language, limit);
        if (retrievalMethod === "none") retrievalMethod = "regex";
      }
    } catch (e) {
      console.warn("[RAG] 正则检索失败:", describeRetrievalError(e));
    }
  }

  // ── Strategy 4: random sampling (fallback) ──
  if (rules.length === 0) {
    try {
      const sampleResults = await collection
        .aggregate([
          { $match: {} },
          { $sample: { size: limit } },
        ])
        .toArray();
      if (sampleResults.length > 0) {
        // Sampled rules are unrelated to the query, so they get a fixed low score.
        rules = sampleResults.map((doc, idx) =>
          formatRule(doc, language, idx, sampleResults.length, 0.3)
        );
        retrievalMethod = "sample";
      }
    } catch (e) {
      console.warn("[RAG] 随机采样失败:", describeRetrievalError(e));
    }
  }

  return {
    rules,
    totalFound: rules.length,
    retrievalMethod,
    queryKeywords: keywords,
    detectedJurisdiction: intent.jurisdiction,
    detectedAssetClass: intent.assetClass,
    detectedRuleType: intent.ruleType,
  };
}
// ─── 格式化工具函数 ───────────────────────────────────────────────
/**
 * Returns `text` cut to at most `maxUnits` UTF-16 code units without splitting
 * a surrogate pair: if the cut would land between the two halves of a pair,
 * the whole character is dropped instead.
 */
function truncateWithoutSplittingPair(text: string, maxUnits: number): string {
  if (text.length <= maxUnits) {
    return text;
  }
  const lastKept = text.charCodeAt(maxUnits - 1);
  const endsOnHighSurrogate = lastKept >= 0xd800 && lastKept <= 0xdbff;
  return text.slice(0, endsOnHighSurrogate ? maxUnits - 1 : maxUnits);
}

/**
 * Normalizes a raw rule document (legacy or new format) into a `RetrievedRule`.
 *
 * @param doc       Raw document from the rules collection.
 * @param language  "zh" prefers Chinese content, anything else prefers English;
 *                  each falls back to the other language when missing.
 * @param idx       Position of the document in its result list (0-based).
 * @param total     Size of that result list; used to derive a positional score.
 * @param baseScore Fixed score that overrides the positional score when given.
 * @returns The normalized rule. Content longer than 800 characters is cut and
 *          suffixed with "...".
 */
function formatRule(
  doc: Record<string, unknown>,
  language: string,
  idx: number,
  total: number,
  baseScore?: number
): RetrievedRule {
  // Positional score: 1.0 for the first hit, decreasing towards 0.5, floor 0.4.
  // A non-positive total would divide by zero, so treat it as "first hit".
  const rankFraction = total > 0 ? idx / total : 0;
  const score = baseScore !== undefined
    ? baseScore
    : Math.max(0.4, 1.0 - rankFraction * 0.5);

  // Content lookup compatible with both document formats.
  const translations = doc.translations as Record<string, string> | undefined;
  const content = language === "zh"
    ? String(doc.content || translations?.zh || doc.contentEn || translations?.en || "")
    : String(doc.contentEn || translations?.en || doc.content || translations?.zh || "");

  // Limit content to 800 characters (trade-rule content tends to be long).
  const truncatedContent = content.length > 800
    ? truncateWithoutSplittingPair(content, 800) + "..."
    : content;

  const ruleId = String(doc.ruleId || doc._id || "");
  const ruleName = String(doc.ruleName || doc.ruleNameEn || "未命名规则");
  const jurisdiction = String(doc.jurisdiction || "未知");
  const category = String(doc.category || doc.assetClass || "通用");
  const description = doc.description ? String(doc.description) : undefined;

  return {
    ruleId,
    ruleName,
    jurisdiction,
    category,
    assetClass: doc.assetClass ? String(doc.assetClass) : undefined,
    ruleType: doc.ruleType ? String(doc.ruleType) : undefined,
    content: truncatedContent,
    description,
    score,
    source: `${jurisdiction}·${category}·${truncateWithoutSplittingPair(ruleName, 20)}`,
    ownershipRequirements: doc.ownershipRequirements as Record<string, unknown> | undefined,
    tradingRequirements: doc.tradingRequirements as Record<string, unknown> | undefined,
    legalBasis: doc.legalBasis ? String(doc.legalBasis) : undefined,
    sourceUrl: doc.sourceUrl ? String(doc.sourceUrl) : undefined,
    complianceLevel: doc.complianceLevel ? String(doc.complianceLevel) : undefined,
    tags: Array.isArray(doc.tags) ? doc.tags.map(String) : undefined,
  };
}
// ─── 构建RAG提示词上下文 ─────────────────────────────────────────
/**
 * Renders retrieved rules as the knowledge-base context section of the AI prompt.
 *
 * Layout: a header with the rule count and retrieval method, optional lines for
 * the detected jurisdiction and asset class, one numbered entry per rule
 * ("【规则 N】name" followed by its details), and two closing instructions
 * telling the model to answer from these rules and cite them.
 *
 * @param ragCtx Retrieval result to render.
 * @returns The prompt section, or an empty string when there are no rules.
 */
export function buildRAGPromptContext(ragCtx: RAGContext): string {
  if (ragCtx.rules.length === 0) {
    return "";
  }

  const lines: string[] = [
    "【知识库检索结果】",
    `(共检索到 ${ragCtx.totalFound} 条相关规则,检索方式:${ragCtx.retrievalMethod})`,
  ];
  if (ragCtx.detectedJurisdiction) {
    lines.push(`(识别到司法辖区:${ragCtx.detectedJurisdiction})`);
  }
  if (ragCtx.detectedAssetClass) {
    lines.push(`(识别到资产类别:${ragCtx.detectedAssetClass})`);
  }
  lines.push("");

  ragCtx.rules.forEach((rule, idx) => {
    // The bracket closes after the number so the model can cite "规则 N"
    // unambiguously, as the closing instruction asks it to.
    lines.push(`【规则 ${idx + 1}】${rule.ruleName}`);
    lines.push(` 辖区:${rule.jurisdiction} | 类别:${rule.category} | 相关度:${Math.round(rule.score * 100)}%`);
    if (rule.ruleType) lines.push(` 规则类型:${rule.ruleType}`);
    if (rule.legalBasis) lines.push(` 法律依据:${rule.legalBasis}`);
    if (rule.complianceLevel) lines.push(` 合规级别:${rule.complianceLevel}`);
    if (rule.description) lines.push(` 摘要:${rule.description}`);
    lines.push(` 内容:${rule.content}`);

    // Ownership requirements (core fields)
    const ownership = rule.ownershipRequirements;
    if (ownership) {
      if (ownership.proofDocuments && Array.isArray(ownership.proofDocuments)) {
        lines.push(` 所有权证明文件:${(ownership.proofDocuments as string[]).join("、")}`);
      }
      if (ownership.registrationAuthority) {
        lines.push(` 登记机构:${ownership.registrationAuthority}`);
      }
      if (ownership.transferMechanism) {
        lines.push(` 转移机制:${ownership.transferMechanism}`);
      }
      if (ownership.chainRecognition) {
        lines.push(` 链上法律认可:${ownership.chainRecognition}`);
      }
      if (ownership.foreignOwnershipRestriction) {
        lines.push(` 外资限制:${ownership.foreignOwnershipRestriction}`);
      }
    }

    // Trading requirements
    const trading = rule.tradingRequirements;
    if (trading) {
      if (trading.minimumInvestor) lines.push(` 最低投资者资质:${trading.minimumInvestor}`);
      if (trading.settlementPeriod) lines.push(` 结算周期:${trading.settlementPeriod}`);
      if (trading.allowedCurrencies && Array.isArray(trading.allowedCurrencies)) {
        lines.push(` 允许结算货币:${(trading.allowedCurrencies as string[]).join("、")}`);
      }
    }

    if (rule.sourceUrl) lines.push(` 来源:${rule.sourceUrl}`);
    lines.push("");
  });

  lines.push("请严格基于以上知识库规则回答用户问题,并在回答中注明引用的规则编号和来源。");
  lines.push("如果知识库中没有完全匹配的规则,请基于已有规则进行合理推断,并说明推断依据。");
  return lines.join("\n");
}
// Alias export (kept for compatibility with callers from the older engine version)
export const retrieveComplianceRules = retrieveRelevantRules;