feat: 扩展RAG引擎techKeywords过滤器,覆盖50+NAC技术关键词

This commit is contained in:
nacadmin 2026-03-04 16:33:54 +08:00
parent e8d80c1ee0
commit b489e9417b
1 changed files with 228 additions and 0 deletions

View File

@@ -0,0 +1,228 @@
/**
* NAC RAG检索引擎
*
* MongoDB知识库compliance_rules + nac_tech_docs
*
*/
import { Db } from "mongodb";
/**
 * A knowledge-base document as returned by the retrieval functions in this module.
 *
 * Documents come from two MongoDB collections with different shapes, so every field is
 * optional; `source` records which collection a given document was read from.
 */
export interface RAGDocument {
  /** MongoDB document identifier (left untyped here). */
  _id?: unknown;
  ruleId?: string;
  /** Rule display name; used as the name when no localized name exists. */
  ruleName?: string;
  /** Localized rule names, keyed by language code. */
  ruleNameI18n?: Record<string, string>;
  /** Document title. `nac_tech_docs` is searched by this field and it serves as the display-name fallback. */
  title?: string;
  jurisdiction?: string;
  category?: string;
  assetClass?: string;
  description?: string;
  /** Localized descriptions, keyed by language code. */
  descriptionI18n?: Record<string, string>;
  content?: string;
  tags?: string[];
  /** Stored ranking value; `compliance_rules` is sorted by it (descending) when a query yields no keywords. */
  relevance?: number;
  /** MongoDB text-search score, projected when a `$text` query is used. */
  score?: number;
  /** Collection the document was read from; set by the retrieval function. */
  source?: "compliance_rules" | "nac_tech_docs";
}
/**
 * Result of one retrieval pass: the documents that were found plus the search
 * parameters that were actually applied.
 */
export interface RAGContext {
/** Retrieved documents, truncated to the caller's `maxResults`. */
documents: RAGDocument[];
/** Number of documents fetched from the collections before truncation to `maxResults`. */
totalFound: number;
/** Keywords extracted from the query text. */
queryKeywords: string[];
/** Jurisdiction code applied: the explicit option if given, otherwise the one detected in the query. */
jurisdiction?: string;
/** Asset class applied: the explicit option if given, otherwise the one detected in the query. */
assetClass?: string;
}
// ─── 关键词提取 ───────────────────────────────────────────────────
/**
 * Terms that are discarded when keywords are extracted from a query.
 * Candidates are lower-cased before lookup, so English entries are stored in lower case.
 */
const STOP_WORDS = ((): Set<string> => {
  // Chinese function words, question words and filler phrases.
  const chinese = [
    "的", "了", "是", "在", "我", "有",
    "和", "就", "不", "人", "都", "一",
    "上", "也", "很", "到", "说", "要",
    "去", "你", "会", "着", "没有", "看",
    "自己", "这", "那", "什么", "如何",
    "怎么", "请问", "帮我", "告诉", "介绍",
    "关于", "对于", "针对", "需要", "可以",
    "应该", "必须", "规定", "要求",
  ];
  // English articles, auxiliaries and interrogatives.
  const english = [
    "the", "a", "an",
    "is", "are", "was", "were", "be", "been",
    "have", "has", "had",
    "do", "does", "did",
    "will", "would", "could",
    "what", "how", "when", "where", "why", "which", "who",
  ];
  return new Set<string>([...chinese, ...english]);
})();
/**
 * Lookup from a jurisdiction mention (Chinese name, nickname or ISO-style code) to its code.
 *
 * Insertion order matters: the keyword extractor scans the entries in order and stops at
 * the first term found in the query, so the alias order below must be kept.
 */
const JURISDICTION_MAP: Record<string, string> = (() => {
  const aliasesByCode: Record<string, string[]> = {
    HK: ["香港", "港", "HK"],
    SG: ["新加坡", "狮城", "SG"],
    AE: ["迪拜", "阿联酋", "AE"],
    CN: ["中国", "大陆", "CN"],
    US: ["美国", "US"],
    EU: ["欧盟", "EU"],
    JP: ["日本", "JP"],
    AU: ["澳大利亚", "AU"],
  };
  const lookup: Record<string, string> = {};
  for (const [code, aliases] of Object.entries(aliasesByCode)) {
    for (const alias of aliases) {
      lookup[alias] = code;
    }
  }
  return lookup;
})();
/**
 * Lookup from an asset-class mention in a query to its asset-class identifier.
 *
 * Insertion order matters: the keyword extractor scans the entries in order and stops at
 * the first term found in the query, so the alias order below must be kept.
 */
const ASSET_CLASS_MAP: Record<string, string> = (() => {
  const aliasesByClass: Record<string, string[]> = {
    RealEstate: ["房地产", "不动产", "房产"],
    FinancialSecurities: ["证券", "股票", "债券"],
    Commodities: ["大宗商品", "黄金", "原油"],
    DigitalAssets: ["数字资产", "代币", "NFT"],
    EnvironmentalRights: ["碳排放", "碳信用"],
    IntellectualProperty: ["知识产权", "专利"],
    Infrastructure: ["基础设施"],
    Receivables: ["应收账款"],
  };
  const lookup: Record<string, string> = {};
  for (const [assetClass, aliases] of Object.entries(aliasesByClass)) {
    for (const alias of aliases) {
      lookup[alias] = assetClass;
    }
  }
  return lookup;
})();
/**
 * Derives search parameters from a free-text query.
 *
 * @param query - Raw user query; `null`/`undefined` are treated as an empty string.
 * @returns Up to 8 de-duplicated keywords (runs of 2-8 Chinese characters first, then Latin
 *   words of 3+ letters, minus stop words), plus the first jurisdiction code and asset class
 *   whose term occurs in the query, if any.
 */
function extractKeywords(query: string): { keywords: string[]; jurisdiction?: string; assetClass?: string } {
  const text = String(query ?? "");
  const loweredText = text.toLowerCase();

  // Jurisdiction terms are matched case-sensitively; the first term in map order wins.
  const jurisdictionTerm = Object.keys(JURISDICTION_MAP).find((term) => text.includes(term));
  // Asset-class terms are matched case-insensitively; the first term in map order wins.
  const assetClassTerm = Object.keys(ASSET_CLASS_MAP).find((term) =>
    loweredText.includes(term.toLowerCase()),
  );

  const candidates = new Set<string>([
    ...(text.match(/[\u4e00-\u9fa5]{2,8}/g) || []),
    ...(text.match(/[a-zA-Z]{3,}/g) || []),
  ]);

  const keywords: string[] = [];
  for (const candidate of candidates) {
    if (STOP_WORDS.has(candidate.toLowerCase())) continue;
    keywords.push(candidate);
    if (keywords.length === 8) break;
  }

  return {
    keywords,
    jurisdiction: jurisdictionTerm === undefined ? undefined : JURISDICTION_MAP[jurisdictionTerm],
    assetClass: assetClassTerm === undefined ? undefined : ASSET_CLASS_MAP[assetClassTerm],
  };
}
// ─── 主检索函数 ───────────────────────────────────────────────────
/** Fraction of `maxResults` each collection may contribute before the merged list is truncated. */
const PER_SOURCE_SHARE = 0.6;

/**
 * Case-insensitive substring test for NAC-native technical terms. Keywords that pass are
 * preferred as search terms for `nac_tech_docs`.
 */
const TECH_KEYWORD_PATTERN =
  /charter|nvm|cbpp|csnp|cnnl|gnacs|acc|xtzh|xic|nrpc|rwa|token|合约|虚拟机|共识|协议|治理|宪法|收据|估值|地址|主权|辖区|合规|身份|kyc|aml|erc|solidity|evm|pos|pow|比特币|以太坊|稳定币|代币|资产|上链|流程|注册|浏览器|量子|跨链|互操作|联盟链|公有链|私有链|第四种|私法监管|流体区块|七层|sdr|黄金|香港|新加坡|迪拜|sfc|mas|vara|adgm|difc|acc20|cee|did|nac公链|宪政|神经网络|编程语言|开发者|生态|对比|区别|不同|比较|什么是|如何工作|原理|架构|技术栈/i;

/** Broader trigger: generic technical or question-style wording that still warrants a tech-doc lookup. */
const TECH_TRIGGER_PATTERN =
  /技术|开发|编程|合约|虚拟机|治理|合规|身份|估值|资产|上链|区别|对比|比较|什么|如何|怎么|为什么|原理|架构|生态|公链|区块链/i;

/** `nac_tech_docs` categories used to top up results when keyword matches do not fill the quota. */
const TECH_DOC_CATEGORIES: string[] = [
  "overview", "comparison", "governance", "consensus", "economics", "compliance", "identity",
  "development", "network", "asset_standard", "process", "valuation", "tools",
];

/**
 * Looks up `compliance_rules`. Never rejects: failures are logged and yield an empty list.
 *
 * With keywords it tries a `$text` search ordered by text score and, if that query fails
 * (for example because no text index exists), falls back to a case-insensitive regex over
 * name/description/content plus an exact tag match. Without keywords it returns the
 * highest-`relevance` rules matching `filter`.
 */
async function searchComplianceRules(
  db: Db,
  keywords: string[],
  filter: Record<string, unknown>,
  limit: number,
): Promise<RAGDocument[]> {
  try {
    const rules = db.collection("compliance_rules");

    if (keywords.length === 0) {
      return (await rules
        .find(filter)
        .sort({ relevance: -1 })
        .limit(limit)
        .toArray()) as RAGDocument[];
    }

    try {
      return (await rules
        .find({ ...filter, $text: { $search: keywords.join(" ") } })
        .project({ score: { $meta: "textScore" } })
        .sort({ score: { $meta: "textScore" } })
        .limit(limit)
        .toArray()) as RAGDocument[];
    } catch {
      // Text search unavailable: fall back to regex matching on the first three keywords.
      // Keywords contain only letters / CJK characters, so they are safe to join with "|".
      const regex = new RegExp(keywords.slice(0, 3).join("|"), "i");
      return (await rules
        .find({
          ...filter,
          $or: [
            { ruleName: regex },
            { description: regex },
            { content: regex },
            { tags: { $in: keywords } },
          ],
        })
        .limit(limit)
        .toArray()) as RAGDocument[];
    }
  } catch (err) {
    console.warn("[ragEngine] compliance_rules检索失败:", err);
    return [];
  }
}

/**
 * Looks up `nac_tech_docs`. Never rejects: failures are logged and whatever was fetched
 * before the failure is returned.
 *
 * Keyword matches (title/content regex or exact tag) are fetched first so that they rank
 * ahead of generic documents. Only the remaining quota is then filled from the well-known
 * categories. Putting the category list inside the same `$or` as the keyword clauses would
 * let any categorised document satisfy the query, so results would not depend on the query.
 */
async function searchTechDocs(
  db: Db,
  queryText: string,
  keywords: string[],
  limit: number,
): Promise<RAGDocument[]> {
  const techKeywords = keywords.filter((k: string) => TECH_KEYWORD_PATTERN.test(k));
  const shouldSearch =
    techKeywords.length > 0 ||
    keywords.some((k: string) => TECH_TRIGGER_PATTERN.test(k)) ||
    queryText.length > 10; // any longer query is worth a knowledge-base lookup
  if (!shouldSearch) return [];

  let docs: RAGDocument[] = [];
  try {
    const techDocs = db.collection("nac_tech_docs");

    // Prefer NAC-specific terms; otherwise use every extracted keyword.
    const searchTerms = techKeywords.length > 0 ? techKeywords : keywords;
    if (searchTerms.length > 0) {
      // Guarded on purpose: an empty pattern would compile to a regex that matches everything.
      const regex = new RegExp(searchTerms.slice(0, 5).join("|"), "i");
      docs = (await techDocs
        .find({
          $or: [
            { title: regex },
            { content: regex },
            { tags: { $in: keywords } },
          ],
        })
        .limit(limit)
        .toArray()) as RAGDocument[];
    }

    if (docs.length < limit) {
      const backfillFilter: Record<string, unknown> = {
        category: { $in: TECH_DOC_CATEGORIES },
      };
      if (docs.length > 0) {
        // Do not return a keyword match a second time.
        backfillFilter._id = { $nin: docs.map((d: RAGDocument) => d._id) };
      }
      const backfill = (await techDocs
        .find(backfillFilter)
        .limit(limit - docs.length)
        .toArray()) as RAGDocument[];
      docs = [...docs, ...backfill];
    }
  } catch (err) {
    console.warn("[ragEngine] nac_tech_docs检索失败:", err);
  }
  return docs;
}

/**
 * Retrieves knowledge-base documents relevant to a free-text query.
 *
 * Keywords, jurisdiction and asset class are extracted from the query; explicit
 * `options.jurisdiction` / `options.assetClass` take precedence over detected values.
 * Each enabled collection contributes at most `ceil(maxResults * 0.6)` documents. Compliance
 * rules are placed before tech docs and the merged list is truncated to `maxResults`.
 * The two lookups are independent and run concurrently. A failure in one is logged and
 * does not affect the other.
 *
 * @param db - MongoDB database handle holding `compliance_rules` and `nac_tech_docs`.
 * @param query - Free-text user query; a non-string value is coerced to a string.
 * @param options - `maxResults` (default 8), `jurisdiction` / `assetClass` overrides,
 *   `includeCompliance` / `includeTechDocs` toggles (default `true`). `language` is accepted
 *   for interface compatibility but is not used by retrieval.
 * @returns The retrieved documents (each tagged with its `source`), the number fetched
 *   before truncation, and the search parameters that were applied. When `maxResults` is
 *   not a positive finite number no query is issued and the document list is empty.
 */
export async function retrieveRelevantDocs(
  db: Db,
  query: string,
  options: {
    maxResults?: number;
    language?: string;
    jurisdiction?: string;
    assetClass?: string;
    includeCompliance?: boolean;
    includeTechDocs?: boolean;
  } = {}
): Promise<RAGContext> {
  const {
    maxResults = 8,
    includeCompliance = true,
    includeTechDocs = true,
  } = options;

  // Normalise once so later length checks cannot throw on a non-string query.
  const queryText = String(query ?? "");
  const { keywords, jurisdiction, assetClass } = extractKeywords(queryText);
  const effectiveJurisdiction = options.jurisdiction ?? jurisdiction;
  const effectiveAssetClass = options.assetClass ?? assetClass;

  // MongoDB treats limit(0) as "no limit", so a non-positive budget must skip the
  // queries entirely instead of fetching whole collections that are then discarded.
  const canFetch = Number.isFinite(maxResults) && maxResults > 0;
  const perSourceLimit = Math.ceil(maxResults * PER_SOURCE_SHARE);

  const complianceFilter: Record<string, unknown> = {};
  if (effectiveJurisdiction) complianceFilter.jurisdiction = effectiveJurisdiction;
  if (effectiveAssetClass) complianceFilter.assetClass = effectiveAssetClass;

  const noDocs: RAGDocument[] = [];
  const [complianceDocs, techDocs] = await Promise.all([
    canFetch && includeCompliance
      ? searchComplianceRules(db, keywords, complianceFilter, perSourceLimit)
      : noDocs,
    canFetch && includeTechDocs
      ? searchTechDocs(db, queryText, keywords, perSourceLimit)
      : noDocs,
  ]);

  const results: RAGDocument[] = [
    ...complianceDocs.map((doc: RAGDocument): RAGDocument => ({ ...doc, source: "compliance_rules" })),
    ...techDocs.map((doc: RAGDocument): RAGDocument => ({ ...doc, source: "nac_tech_docs" })),
  ];

  return {
    documents: results.slice(0, maxResults),
    totalFound: results.length,
    queryKeywords: keywords,
    jurisdiction: effectiveJurisdiction,
    assetClass: effectiveAssetClass,
  };
}
// ─── 构建RAG上下文字符串 ──────────────────────────────────────────
/**
 * Renders retrieved documents as a plain-text context block.
 *
 * At most the first 6 documents are rendered. Each entry shows a numbered display name
 * (localized rule name, then `ruleName`, then `title`, then a placeholder), optional
 * jurisdiction and category lines, and a description truncated to 400 UTF-16 code units
 * (localized description, then `description`, then `content`).
 *
 * @param ragCtx - Retrieval result to render.
 * @param language - Key used to select localized name/description; defaults to `"zh"`.
 * @returns The rendered block, or an empty string when there are no documents.
 */
export function buildContextString(ragCtx: RAGContext, language: string = "zh"): string {
  if (ragCtx.documents.length === 0) return "";

  const maxDocs = 6;
  const maxDescLength = 400;
  const lines: string[] = ["=== 相关知识库内容 ==="];

  ragCtx.documents.slice(0, maxDocs).forEach((doc, i) => {
    // Tech docs carry a `title`; read it through a narrow structural cast rather than `any`.
    const title = (doc as { title?: string }).title;
    const name = doc.ruleNameI18n?.[language] ?? doc.ruleName ?? title ?? "未命名";

    // Documents come straight from MongoDB, so do not trust the field to be a string.
    const rawDesc: unknown = doc.descriptionI18n?.[language] ?? doc.description ?? doc.content ?? "";
    const desc = typeof rawDesc === "string" ? rawDesc : "";

    let snippet = desc.slice(0, maxDescLength);
    if (desc.length > maxDescLength) {
      // Do not end on the first half of a surrogate pair; that would emit a broken character.
      const lastUnit = snippet.charCodeAt(snippet.length - 1);
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) snippet = snippet.slice(0, -1);
    }

    lines.push(`\n[${i + 1}] ${name}`);
    if (doc.jurisdiction) lines.push(`辖区: ${doc.jurisdiction}`);
    if (doc.category) lines.push(`类别: ${doc.category}`);
    lines.push(snippet);
  });

  return lines.join("\n");
}