feat: 扩展RAG引擎techKeywords过滤器,覆盖50+NAC技术关键词
This commit is contained in:
parent
e8d80c1ee0
commit
b489e9417b
|
|
@ -0,0 +1,228 @@
|
|||
/**
 * NAC RAG retrieval engine.
 *
 * Retrieves relevant content from the MongoDB knowledge base
 * (compliance_rules + nac_tech_docs).
 * Supports keyword matching, jurisdiction filtering and asset-class filtering.
 */
|
||||
|
||||
import { Db } from "mongodb";
|
||||
|
||||
/**
 * A knowledge-base record as returned by the retrieval functions in this module.
 *
 * Every field is optional because the same shape is used for documents coming
 * from two different collections (`compliance_rules` and `nac_tech_docs`).
 */
export interface RAGDocument {
  /** MongoDB document identifier, passed through untouched. */
  _id?: unknown;
  ruleId?: string;
  /** Default display name of a compliance rule. */
  ruleName?: string;
  /** Display names keyed by language code; preferred over `ruleName` when rendering. */
  ruleNameI18n?: Record<string, string>;
  /** Document title; `nac_tech_docs` is searched by this field and it is used as a display-name fallback. */
  title?: string;
  /** Jurisdiction code the compliance query filters on (e.g. "HK"). */
  jurisdiction?: string;
  category?: string;
  /** Asset-class identifier the compliance query filters on (e.g. "RealEstate"). */
  assetClass?: string;
  description?: string;
  /** Descriptions keyed by language code; preferred over `description` when rendering. */
  descriptionI18n?: Record<string, string>;
  /** Body text; used as the last-resort description when rendering. */
  content?: string;
  /** Tags matched against the extracted query keywords. */
  tags?: string[];
  /** Stored ranking value used to sort compliance rules when the query has no keywords. */
  relevance?: number;
  /** MongoDB text-search score, present only on full-text search hits. */
  score?: number;
  /** Collection the document was retrieved from; set by the retrieval code. */
  source?: "compliance_rules" | "nac_tech_docs";
}
|
||||
|
||||
/** Result of one retrieval run: the selected documents plus the query interpretation that produced them. */
export interface RAGContext {
  /** Retrieved documents, already trimmed to the caller's result limit. */
  documents: RAGDocument[];
  /** Number of documents collected before trimming to the result limit. */
  totalFound: number;
  /** Keywords extracted from the user query. */
  queryKeywords: string[];
  /** Jurisdiction code that was applied as a filter, if any. */
  jurisdiction?: string;
  /** Asset-class identifier that was applied as a filter, if any. */
  assetClass?: string;
}
|
||||
|
||||
// ─── 关键词提取 ───────────────────────────────────────────────────
|
||||
|
||||
/** Filler words (Chinese and English) that are dropped from the extracted keyword list. */
const STOP_WORDS: ReadonlySet<string> = new Set([
  "的", "了", "是", "在", "我", "有", "和", "就", "不", "人", "都", "一",
  "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看",
  "自己", "这", "那", "什么", "如何", "怎么", "请问", "帮我", "告诉", "介绍",
  "关于", "对于", "针对", "需要", "可以", "应该", "必须", "规定", "要求",
  "the", "a", "an", "is", "are", "was", "were", "be", "been",
  "have", "has", "had", "do", "does", "did", "will", "would", "could",
  "what", "how", "when", "where", "why", "which", "who",
]);

/** Query term -> jurisdiction code. Entries are tried in declaration order; the first hit wins. */
const JURISDICTION_MAP: Record<string, string> = {
  "香港": "HK", "港": "HK", "HK": "HK",
  "新加坡": "SG", "狮城": "SG", "SG": "SG",
  "迪拜": "AE", "阿联酋": "AE", "AE": "AE",
  "中国": "CN", "大陆": "CN", "CN": "CN",
  "美国": "US", "US": "US",
  "欧盟": "EU", "EU": "EU",
  "日本": "JP", "JP": "JP",
  "澳大利亚": "AU", "AU": "AU",
};

/** Query term -> asset-class identifier. Entries are tried in declaration order; the first hit wins. */
const ASSET_CLASS_MAP: Record<string, string> = {
  "房地产": "RealEstate", "不动产": "RealEstate", "房产": "RealEstate",
  "证券": "FinancialSecurities", "股票": "FinancialSecurities", "债券": "FinancialSecurities",
  "大宗商品": "Commodities", "黄金": "Commodities", "原油": "Commodities",
  "数字资产": "DigitalAssets", "代币": "DigitalAssets", "NFT": "DigitalAssets",
  "碳排放": "EnvironmentalRights", "碳信用": "EnvironmentalRights",
  "知识产权": "IntellectualProperty", "专利": "IntellectualProperty",
  "基础设施": "Infrastructure",
  "应收账款": "Receivables",
};

/** Matches terms made only of ASCII letters/digits (the two-letter jurisdiction codes). */
const LATIN_TERM = /^[A-Za-z0-9]+$/;

/**
 * Tells whether `text` mentions the jurisdiction `term`.
 *
 * Latin codes must appear as a standalone token: a plain substring test would
 * report "US" for "USDT", "EU" for "EUR" or "AU" for "AUDIT". Non-Latin terms
 * keep the plain substring test because Chinese text has no word separators.
 */
function mentionsJurisdictionTerm(text: string, term: string): boolean {
  if (!LATIN_TERM.test(term)) return text.includes(term);
  return new RegExp(`(?<![A-Za-z0-9])${term}(?![A-Za-z0-9])`).test(text);
}

/**
 * Derives search keywords and optional jurisdiction / asset-class hints from a free-text query.
 *
 * @param query Raw user query; `null`/`undefined` are treated as an empty string.
 * @returns `keywords`: up to 8 de-duplicated terms (runs of 2-8 CJK characters first,
 *   then Latin words of 3+ letters) with stop words removed; `jurisdiction` and
 *   `assetClass`: the first matching entry of the respective map, or `undefined`.
 */
function extractKeywords(query: string): { keywords: string[]; jurisdiction?: string; assetClass?: string } {
  const q = String(query ?? "");
  const lowered = q.toLowerCase();

  // Jurisdiction codes are matched case-sensitively, asset-class terms case-insensitively.
  const jurisdiction = Object.entries(JURISDICTION_MAP)
    .find(([term]) => mentionsJurisdictionTerm(q, term))?.[1];
  const assetClass = Object.entries(ASSET_CLASS_MAP)
    .find(([term]) => lowered.includes(term.toLowerCase()))?.[1];

  // No word segmentation is attempted: a contiguous CJK run is kept as one term,
  // cut into chunks of at most 8 characters by the quantifier.
  const chineseTerms = q.match(/[\u4e00-\u9fa5]{2,8}/g) ?? [];
  const englishTerms = q.match(/[a-zA-Z]{3,}/g) ?? [];

  const keywords = Array.from(new Set([...chineseTerms, ...englishTerms]))
    .filter(term => !STOP_WORDS.has(term.toLowerCase()))
    .slice(0, 8);

  return { keywords, jurisdiction, assetClass };
}
|
||||
|
||||
// ─── 主检索函数 ───────────────────────────────────────────────────
|
||||
|
||||
/** Share of `maxResults` each collection may contribute before the merged list is trimmed. */
const PER_SOURCE_SHARE = 0.6;

/** Keywords matching this pattern are treated as NAC technical terms and drive the tech-doc search. */
const TECH_KEYWORD_PATTERN =
  /charter|nvm|cbpp|csnp|cnnl|gnacs|acc|xtzh|xic|nrpc|rwa|token|合约|虚拟机|共识|协议|治理|宪法|收据|估值|地址|主权|辖区|合规|身份|kyc|aml|erc|solidity|evm|pos|pow|比特币|以太坊|稳定币|代币|资产|上链|流程|注册|浏览器|量子|跨链|互操作|联盟链|公有链|私有链|第四种|私法监管|流体区块|七层|sdr|黄金|香港|新加坡|迪拜|sfc|mas|vara|adgm|difc|acc20|cee|did|nac公链|宪政|神经网络|编程语言|开发者|生态|对比|区别|不同|比较|什么是|如何工作|原理|架构|技术栈/i;

/** Broader trigger: any keyword matching this pattern also enables the tech-doc search. */
const TECH_TRIGGER_PATTERN =
  /技术|开发|编程|合约|虚拟机|治理|合规|身份|估值|资产|上链|区别|对比|比较|什么|如何|怎么|为什么|原理|架构|生态|公链|区块链/i;

/** Categories used to top up the tech-doc results when keyword matches do not fill the quota. */
const TECH_DOC_CATEGORIES = [
  "overview", "comparison", "governance", "consensus", "economics", "compliance", "identity",
  "development", "network", "asset_standard", "process", "valuation", "tools",
];

/** Builds a case-insensitive regex matching any of `terms` literally (metacharacters escaped). */
function buildAlternationRegex(terms: string[]): RegExp {
  const escaped = terms.map(term => term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
  return new RegExp(escaped.join("|"), "i");
}

/**
 * Looks up compliance rules. Never throws: failures are logged and yield `[]`.
 *
 * With keywords, full-text search is tried first and a regex search is used when
 * it fails or finds nothing; without keywords, rules are listed by stored relevance.
 */
async function searchComplianceRules(
  db: Db,
  keywords: string[],
  jurisdiction: string | undefined,
  assetClass: string | undefined,
  limit: number,
): Promise<RAGDocument[]> {
  try {
    const filter: Record<string, unknown> = {};
    if (jurisdiction) filter.jurisdiction = jurisdiction;
    if (assetClass) filter.assetClass = assetClass;

    const rules = db.collection("compliance_rules");

    if (keywords.length === 0) {
      return (await rules
        .find(filter)
        .sort({ relevance: -1 })
        .limit(limit)
        .toArray()) as RAGDocument[];
    }

    let textHits: RAGDocument[] = [];
    try {
      textHits = (await rules
        .find({ ...filter, $text: { $search: keywords.join(" ") } })
        .project({ score: { $meta: "textScore" } })
        .sort({ score: { $meta: "textScore" } })
        .limit(limit)
        .toArray()) as RAGDocument[];
    } catch {
      // Text index unavailable (or the text query failed): fall through to the regex search.
    }
    if (textHits.length > 0) return textHits;

    // The regex search also runs when full-text search returned nothing; unsegmented
    // CJK keyword runs presumably rarely equal an indexed token — NOTE(review): confirm
    // against the deployed text index configuration.
    const pattern = buildAlternationRegex(keywords.slice(0, 3));
    return (await rules
      .find({
        ...filter,
        $or: [
          { ruleName: pattern },
          { description: pattern },
          { content: pattern },
          { tags: { $in: keywords } },
        ],
      })
      .limit(limit)
      .toArray()) as RAGDocument[];
  } catch (err) {
    console.warn("[ragEngine] compliance_rules检索失败:", err);
    return [];
  }
}

/**
 * Looks up NAC technical documents. Never throws: failures are logged and yield `[]`.
 *
 * Documents matching the keywords (title / content / tags) come first; documents from
 * the known categories only fill the remaining slots. Keeping the category clause out
 * of the relevance query stops it from matching nearly every document and crowding
 * out the actual keyword hits under an unsorted `limit`.
 */
async function searchTechDocs(
  db: Db,
  query: string,
  keywords: string[],
  limit: number,
): Promise<RAGDocument[]> {
  try {
    const techKeywords = keywords.filter(k => TECH_KEYWORD_PATTERN.test(k));
    const shouldSearch =
      techKeywords.length > 0 ||
      keywords.some(k => TECH_TRIGGER_PATTERN.test(k)) ||
      query.length > 10; // any longer query is worth a knowledge-base lookup
    if (!shouldSearch) return [];

    const techDocs = db.collection("nac_tech_docs");
    const searchTerms = (techKeywords.length > 0 ? techKeywords : keywords).slice(0, 5);

    let matched: RAGDocument[] = [];
    // Without terms the pattern would be empty and match every document, so skip the phase.
    if (searchTerms.length > 0) {
      const pattern = buildAlternationRegex(searchTerms);
      const relevanceFilter: Record<string, unknown> = {
        $or: [
          { title: pattern },
          { content: pattern },
          { tags: { $in: keywords } },
        ],
      };
      matched = (await techDocs.find(relevanceFilter).limit(limit).toArray()) as RAGDocument[];
    }
    if (matched.length >= limit) return matched;

    const fallbackFilter: Record<string, unknown> = {
      category: { $in: TECH_DOC_CATEGORIES },
    };
    if (matched.length > 0) {
      fallbackFilter._id = { $nin: matched.map(doc => doc._id) };
    }
    const filler = (await techDocs
      .find(fallbackFilter)
      .limit(limit - matched.length)
      .toArray()) as RAGDocument[];

    return [...matched, ...filler];
  } catch (err) {
    console.warn("[ragEngine] nac_tech_docs检索失败:", err);
    return [];
  }
}

/**
 * Retrieves knowledge-base documents relevant to a free-text query.
 *
 * Keywords, jurisdiction and asset class are extracted from the query; explicit
 * `options.jurisdiction` / `options.assetClass` take precedence over the extracted
 * values. Compliance rules and technical documents are fetched in parallel, each
 * capped at 60% of `maxResults`, then merged (compliance rules first) and trimmed.
 *
 * @param db Connected MongoDB database handle.
 * @param query Raw user query.
 * @param options.maxResults Maximum number of documents returned (default 8). Values
 *   that are not greater than zero produce an empty result without touching the
 *   database, because MongoDB interprets `limit(0)` as "no limit".
 * @param options.language Accepted for API compatibility; not read by this function.
 * @param options.includeCompliance Search `compliance_rules` (default true).
 * @param options.includeTechDocs Search `nac_tech_docs` (default true).
 * @returns The retrieval context. Lookup failures are logged and reported as an
 *   empty contribution from the failing collection rather than thrown.
 */
export async function retrieveRelevantDocs(
  db: Db,
  query: string,
  options: {
    maxResults?: number;
    language?: string;
    jurisdiction?: string;
    assetClass?: string;
    includeCompliance?: boolean;
    includeTechDocs?: boolean;
  } = {}
): Promise<RAGContext> {
  const {
    maxResults = 8,
    includeCompliance = true,
    includeTechDocs = true,
  } = options;

  const normalizedQuery = String(query ?? "");
  const { keywords, jurisdiction, assetClass } = extractKeywords(normalizedQuery);
  const effectiveJurisdiction = options.jurisdiction ?? jurisdiction;
  const effectiveAssetClass = options.assetClass ?? assetClass;

  // Also rejects NaN, which would otherwise reach cursor.limit().
  if (!(maxResults > 0)) {
    return {
      documents: [],
      totalFound: 0,
      queryKeywords: keywords,
      jurisdiction: effectiveJurisdiction,
      assetClass: effectiveAssetClass,
    };
  }

  const perSourceLimit = Math.ceil(maxResults * PER_SOURCE_SHARE);

  const [complianceDocs, techDocs] = await Promise.all([
    includeCompliance
      ? searchComplianceRules(db, keywords, effectiveJurisdiction, effectiveAssetClass, perSourceLimit)
      : [],
    includeTechDocs
      ? searchTechDocs(db, normalizedQuery, keywords, perSourceLimit)
      : [],
  ]);

  const results: RAGDocument[] = [
    ...complianceDocs.map((doc): RAGDocument => ({ ...doc, source: "compliance_rules" })),
    ...techDocs.map((doc): RAGDocument => ({ ...doc, source: "nac_tech_docs" })),
  ];

  return {
    documents: results.slice(0, maxResults),
    totalFound: results.length,
    queryKeywords: keywords,
    jurisdiction: effectiveJurisdiction,
    assetClass: effectiveAssetClass,
  };
}
|
||||
|
||||
// ─── 构建RAG上下文字符串 ──────────────────────────────────────────
|
||||
|
||||
/**
 * Renders retrieved documents as a plain-text context block for prompt injection.
 *
 * Each document contributes a numbered heading, optional jurisdiction and category
 * lines, and a truncated description. The heading prefers the localized rule name,
 * then the default rule name, then the document title; the description prefers the
 * localized description, then the default description, then the content body.
 *
 * @param ragCtx Retrieval result to render.
 * @param language Language code used to pick localized names/descriptions (default "zh").
 * @param limits Optional caps: `maxDocuments` (default 6) and `maxCharsPerDocument`
 *   (default 400). Negative values are treated as 0.
 * @returns The rendered block, or an empty string when there are no documents.
 */
export function buildContextString(
  ragCtx: RAGContext,
  language: string = "zh",
  limits: { maxDocuments?: number; maxCharsPerDocument?: number } = {},
): string {
  if (ragCtx.documents.length === 0) return "";

  // Clamp so a negative cap cannot turn into "everything but the last N" via slice().
  const maxDocuments = Math.max(0, Math.floor(limits.maxDocuments ?? 6));
  const maxChars = Math.max(0, Math.floor(limits.maxCharsPerDocument ?? 400));

  const lines: string[] = ["=== 相关知识库内容 ==="];
  ragCtx.documents.slice(0, maxDocuments).forEach((doc, index) => {
    // Tech documents carry `title` rather than `ruleName`; read it through a narrow
    // structural assertion instead of disabling type checking with `any`.
    const title = (doc as { title?: string }).title;
    const name = doc.ruleNameI18n?.[language] ?? doc.ruleName ?? title ?? "未命名";
    const desc = doc.descriptionI18n?.[language] ?? doc.description ?? doc.content ?? "";

    lines.push(`\n[${index + 1}] ${name}`);
    if (doc.jurisdiction) lines.push(`辖区: ${doc.jurisdiction}`);
    if (doc.category) lines.push(`类别: ${doc.category}`);
    lines.push(desc.slice(0, maxChars));
  });

  return lines.join("\n");
}
|
||||
Loading…
Reference in New Issue