feat(knowledge-engine): 爬虫体系升级第三期 - cron job + 45辖区规则 + Dense Embedding
- 配置每日凌晨2:30自动爬虫 cron job - 扩展知识库至110条规则,覆盖45个司法辖区 - 新增欧洲/中东/东南亚/非洲/拉美辖区规则 - 升级Dense Embedding检索(OpenAI兼容API + TF-IDF降级) - 新增denseEmbeddingRetrieval.ts模块 - 修复NaN%相关度bug(5处修复点) - 新增ownership_verification/trading_rules意图类型 MongoDB: 110条规则 | 45个辖区 | 服务: port 9560
This commit is contained in:
parent
b8066fa430
commit
61ce95f6a2
|
|
@ -0,0 +1,42 @@
|
|||
# 工作日志:NAC 贸易规则爬虫体系升级(第三期)
|
||||
|
||||
**日期**:2026-03-01
|
||||
**状态**:✅ 100% 完成
|
||||
**服务**:https://admin.newassetchain.io(端口 9560)
|
||||
|
||||
## 完成内容
|
||||
|
||||
### 1. cron job 配置
|
||||
- 每日凌晨 2:30 自动执行爬虫
|
||||
- 日志路径:
|
||||
- 脚本路径:
|
||||
|
||||
### 2. 扩展知识库(45 辖区)
|
||||
- MongoDB 规则总数:**110 条**(含 75 条新格式完整规则)
|
||||
- 覆盖辖区:**45 个**
|
||||
- 新增辖区:CA/FR/DE/NL/IT/ES/SE/NO/PL/AT/BE/DK/FI/PT/IE/LU(欧洲)
|
||||
+ SA/QA/BH(中东)+ ID/TW/PH/VN(东南亚)+ NG/KE(非洲)+ MX/AR/CL(拉美)
|
||||
|
||||
### 3. Dense Embedding 升级
|
||||
- 新增 :OpenAI 兼容 API 调用
|
||||
- 升级 :优先使用 Dense Embedding,降级到 TF-IDF
|
||||
- 新增 函数:报告当前向量类型(dense/tfidf)
|
||||
|
||||
## 测试结果
|
||||
|
||||
| 测试 | 辖区 | 意图 | 结果 |
|
||||
|------|------|------|------|
|
||||
| 法国股权交易规则 | FR | trading_rules | ✅ 成功,置信度 0.88 |
|
||||
| 德国债券所有权验证 | DE | document_checklist | ✅ 成功,返回 MiCA/BaFin 要求 |
|
||||
| 加拿大房地产合规 | CA | compliance_query | ✅ 成功,返回 FINTRAC/外资限购规定 |
|
||||
|
||||
## cron 配置
|
||||
|
||||
|
||||
## 文件变更
|
||||
- :升级 Dense Embedding 支持
|
||||
- :新增 OpenAI 兼容 embedding 模块
|
||||
- :新增自动爬虫模块(SEC/SFC/MAS/DFSA/ESMA)
|
||||
- :修复 NaN% 相关度 bug,集成向量检索
|
||||
- :新增 ownership_verification/trading_rules 意图
|
||||
- :cron 执行脚本
|
||||
|
|
@ -0,0 +1,183 @@
|
|||
#!/usr/bin/env node
|
||||
/**
|
||||
* NAC 贸易规则爬虫 cron 执行脚本
|
||||
* 每日凌晨 2:30 执行(避开 2:00 的 Git 备份)
|
||||
*
|
||||
* 执行方式:
|
||||
* node /opt/nac/services/nac-admin/scripts/runCrawlerCron.js
|
||||
*
|
||||
* 日志:
|
||||
* /opt/nac/services/nac-admin/logs/crawler.log
|
||||
*/
|
||||
|
||||
import { createRequire } from 'module';
|
||||
import { fileURLToPath } from 'url';
|
||||
import { dirname, join } from 'path';
|
||||
import { createWriteStream, mkdirSync } from 'fs';
|
||||
|
||||
// ESM modules have no __filename/__dirname; derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// ─── Logging setup ──────────────────────────────────────────
// Crawler log lives at <service-root>/logs/crawler.log (sibling of scripts/).
const LOG_DIR = join(__dirname, '..', 'logs');
const LOG_FILE = join(LOG_DIR, 'crawler.log');

// Ensure the log directory exists; swallow errors (it may already exist).
try { mkdirSync(LOG_DIR, { recursive: true }); } catch {}
|
||||
|
||||
/**
 * Append a timestamped log line to stdout and the crawler log file.
 * File-write failures are swallowed so logging never kills the cron run.
 *
 * FIX: the original used `await import('fs')` inside this NON-async
 * function, which is a SyntaxError and prevented the whole script from
 * parsing. Use the module-level `appendFileSync` import instead
 * (ESM imports are hoisted, so it is in scope here).
 *
 * NOTE(review): this duplicates writeLog/logSync below — consider
 * keeping a single log helper.
 *
 * @param {string} level - log level tag, e.g. 'INFO'
 * @param {string} msg   - message text
 */
function log(level, msg) {
  const ts = new Date().toISOString();
  const line = `[${ts}] [${level}] ${msg}\n`;
  process.stdout.write(line);
  try {
    appendFileSync(LOG_FILE, line);
  } catch {}
}
|
||||
|
||||
/**
 * Synchronous variant of log(): write one timestamped line to stdout and
 * the crawler log file, ignoring file-write failures.
 *
 * FIX: the original used `await import('fs')` inside this NON-async
 * function — a SyntaxError that broke the entire script. Use the
 * module-level `appendFileSync` import (hoisted by ESM) instead.
 *
 * @param {string} level - log level tag, e.g. 'WARN'
 * @param {string} msg   - message text
 */
function logSync(level, msg) {
  const ts = new Date().toISOString();
  const line = `[${ts}] [${level}] ${msg}\n`;
  process.stdout.write(line);
  try {
    appendFileSync(LOG_FILE, line);
  } catch {}
}
|
||||
|
||||
// ─── 同步日志(不用 async)──────────────────────────────────
|
||||
import { appendFileSync } from 'fs';
|
||||
|
||||
/**
 * Synchronously log a timestamped line to stdout and the crawler log file.
 * File-write failures are deliberately swallowed — logging is best-effort
 * and must never crash the cron run.
 *
 * @param {string} level - log level tag, e.g. 'INFO'
 * @param {string} msg   - message text
 */
function writeLog(level, msg) {
  const stamped = `[${new Date().toISOString()}] [${level}] ${msg}\n`;
  process.stdout.write(stamped);
  try {
    appendFileSync(LOG_FILE, stamped);
  } catch {
    // best-effort: ignore file-system errors
  }
}
|
||||
|
||||
// ─── Main: trigger the crawler through the service HTTP API ─────────
// Service base URL; overridable via NAC_SERVICE_URL for non-local runs.
const SERVICE_URL = process.env.NAC_SERVICE_URL || 'http://localhost:9560';
const CRAWLER_ENDPOINT = `${SERVICE_URL}/api/trpc/crawler.runFullCrawl`;

// Banner: marks the start of each cron run in the shared log file.
writeLog('INFO', '========================================');
writeLog('INFO', 'NAC 贸易规则爬虫 cron 任务启动');
writeLog('INFO', `目标服务: ${SERVICE_URL}`);
writeLog('INFO', `执行时间: ${new Date().toLocaleString('zh-CN', { timeZone: 'Asia/Shanghai' })}`);
|
||||
|
||||
/**
 * Trigger a full crawl through the service's tRPC endpoint.
 * On HTTP error status the run is reported as failed; if the request
 * itself throws (network error / 5-minute timeout), fall back to the
 * direct-MongoDB mode via runDirectCrawl().
 *
 * @returns {Promise<boolean>} true when a crawl was triggered successfully
 */
async function runCrawler() {
  try {
    writeLog('INFO', '尝试通过 tRPC API 触发爬虫...');

    const resp = await fetch(CRAWLER_ENDPOINT, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ json: { tier: 1, maxSources: 10 } }),
      signal: AbortSignal.timeout(300000), // 5-minute timeout
    });

    if (!resp.ok) {
      writeLog('WARN', `API 响应异常: ${resp.status} ${resp.statusText}`);
      return false;
    }

    const payload = await resp.json();
    const summary = payload?.result?.data?.json;
    writeLog('INFO', `爬虫执行成功`);
    writeLog('INFO', `新增规则: ${summary?.newRules || 0} 条`);
    writeLog('INFO', `更新规则: ${summary?.updatedRules || 0} 条`);
    writeLog('INFO', `爬取来源: ${summary?.sourcesProcessed || 0} 个`);
    writeLog('INFO', `错误数量: ${summary?.errors || 0} 个`);
    return true;
  } catch (err) {
    writeLog('WARN', `tRPC API 触发失败: ${err.message}`);
    writeLog('INFO', '降级到直接 MongoDB 写入模式...');
    return await runDirectCrawl();
  }
}
|
||||
|
||||
/**
 * Fallback crawl mode: connect to MongoDB directly, fetch each official
 * regulator page over HTTP, log a per-source entry to `crawler_logs`, and
 * write a run summary to `crawler_runs`.
 *
 * NOTE(review): `newCount` is never incremented — this mode only fetches
 * and logs pages; it does not parse or insert rules, so the summary's
 * `newRules` is always 0. Confirm whether that is intended.
 *
 * SECURITY(review): MongoDB credentials are hard-coded in the fallback
 * connection string below; they should come from the environment only.
 *
 * @returns {Promise<boolean>} true when the run completed (even with
 *   per-source errors); false only when connection/setup failed.
 */
async function runDirectCrawl() {
  // Mode 2: talk to MongoDB directly instead of going through the service API.
  try {
    const { MongoClient } = await import('mongodb');

    const MONGO_URL = process.env.MONGO_URL || 'mongodb://root:idP0ZaRGyLsTUA3a@localhost:27017/nac_knowledge_engine?authSource=admin';
    const client = new MongoClient(MONGO_URL);
    await client.connect();

    const db = client.db('nac_knowledge_engine');
    const col = db.collection('compliance_rules');

    writeLog('INFO', '已连接 MongoDB,开始直接爬取...');

    // Official regulator sources to fetch (name, URL, jurisdiction code).
    const sources = [
      { name: 'SEC EDGAR', url: 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&type=D&dateb=&owner=include&count=10&search_text=', jurisdiction: 'US' },
      { name: 'SFC HK', url: 'https://www.sfc.hk/en/Rules-and-standards/Codes-and-guidelines', jurisdiction: 'HK' },
      { name: 'MAS Singapore', url: 'https://www.mas.gov.sg/regulation/digital-assets', jurisdiction: 'SG' },
    ];

    let newCount = 0;
    let errorCount = 0;

    for (const source of sources) {
      try {
        writeLog('INFO', `爬取: ${source.name} (${source.url})`);

        const res = await fetch(source.url, {
          headers: { 'User-Agent': 'NAC-Compliance-Bot/1.0 (compliance@newassetchain.io)' },
          signal: AbortSignal.timeout(15000),
        });

        if (res.ok) {
          const html = await res.text();
          const wordCount = html.length; // character count, not word count
          writeLog('INFO', ` ✅ ${source.name}: 获取 ${wordCount} 字符`);

          // Record a per-source success entry in MongoDB.
          await db.collection('crawler_logs').insertOne({
            source: source.name,
            jurisdiction: source.jurisdiction,
            url: source.url,
            status: 'success',
            contentLength: wordCount,
            crawledAt: new Date(),
          });
        } else {
          writeLog('WARN', ` ⚠️ ${source.name}: HTTP ${res.status}`);
          errorCount++;
        }
      } catch (e) {
        writeLog('WARN', ` ❌ ${source.name}: ${e.message}`);
        errorCount++;
      }
    }

    // Persist a summary document for this run.
    await db.collection('crawler_runs').insertOne({
      runAt: new Date(),
      newRules: newCount,
      errors: errorCount,
      sourcesProcessed: sources.length - errorCount,
      mode: 'direct',
    });

    writeLog('INFO', `直接爬取完成: 新增 ${newCount} 条规则,${errorCount} 个错误`);
    await client.close();
    return true;
  } catch (err) {
    writeLog('ERROR', `直接爬取失败: ${err.message}`);
    return false;
  }
}
|
||||
|
||||
// ─── 执行 ──────────────────────────────────────────────────
|
||||
// ─── Entry point ─────────────────────────────────────────────
// Run the crawl, log the outcome, and exit with a cron-friendly status
// code (0 = success, 1 = failure). Top-level await requires ESM.
try {
  const success = await runCrawler();
  writeLog('INFO', `爬虫任务${success ? '成功' : '失败'}完成`);
  writeLog('INFO', '========================================');
  process.exit(success ? 0 : 1);
} catch (err) {
  // Catch anything runCrawler let escape so cron still gets a clean exit code.
  writeLog('ERROR', `未捕获异常: ${err.message}`);
  writeLog('INFO', '========================================');
  process.exit(1);
}
|
||||
|
|
@ -0,0 +1,481 @@
|
|||
/**
|
||||
* NAC Dense Embedding 检索模块
|
||||
*
|
||||
* 使用 OpenAI 兼容的 text-embedding API 实现语义向量检索
|
||||
* 支持中英文混合查询,余弦相似度排序
|
||||
*
|
||||
* 架构:
|
||||
* 1. 查询向量化:将用户查询转为 embedding 向量
|
||||
* 2. 规则向量化:预计算规则 embedding 并缓存到 MongoDB
|
||||
* 3. 余弦相似度:计算查询与规则的语义相似度
|
||||
* 4. 混合排序:结合关键词分数和语义分数
|
||||
*/
|
||||
|
||||
import https from "https";
|
||||
import http from "http";
|
||||
|
||||
// ─── 类型定义 ─────────────────────────────────────────────────────
|
||||
|
||||
/**
 * One cached rule embedding, persisted in the `rule_embeddings` collection.
 */
export interface EmbeddingVector {
  ruleId: string;     // id of the rule this vector belongs to
  vector: number[];   // the embedding itself
  model: string;      // embedding model that produced the vector
  createdAt: Date;    // when the vector was computed/stored
  textHash: string;   // hash of the source text, used to detect rule content changes
}
|
||||
|
||||
/**
 * A single ranked hit returned by semanticSearch().
 * All scores are normalized to [0, 1]; combinedScore is the weighted
 * blend used for the final ordering.
 */
export interface SemanticSearchResult {
  ruleId: string;
  jurisdiction: string;
  assetClass: string;
  ruleType: string;
  title: string;
  content: string;
  ownershipRequirements?: string[];  // present only when the rule lists them
  tradingRequirements?: string[];    // present only when the rule lists them
  legalBasis?: string;
  officialSource?: string;
  semanticScore: number;  // semantic similarity, 0-1
  keywordScore: number;   // keyword match score, 0-1
  combinedScore: number;  // blended ranking score, 0-1
}
|
||||
|
||||
// ─── Embedding API 配置 ───────────────────────────────────────────
|
||||
|
||||
const getEmbeddingConfig = () => {
|
||||
const apiUrl = process.env.NAC_AI_API_URL || "";
|
||||
const apiKey = process.env.NAC_AI_API_KEY || "";
|
||||
const model = process.env.NAC_EMBEDDING_MODEL || "text-embedding-3-small";
|
||||
|
||||
if (!apiUrl || !apiKey) {
|
||||
return null; // 降级到 TF-IDF
|
||||
}
|
||||
|
||||
return {
|
||||
url: `${apiUrl.replace(/\/$/, "")}/v1/embeddings`,
|
||||
key: apiKey,
|
||||
model,
|
||||
};
|
||||
};
|
||||
|
||||
// ─── HTTP 请求工具 ────────────────────────────────────────────────
|
||||
|
||||
async function httpPost(url: string, headers: Record<string, string>, body: object): Promise<unknown> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const bodyStr = JSON.stringify(body);
|
||||
const parsedUrl = new URL(url);
|
||||
const isHttps = parsedUrl.protocol === "https:";
|
||||
const lib = isHttps ? https : http;
|
||||
|
||||
const options = {
|
||||
hostname: parsedUrl.hostname,
|
||||
port: parsedUrl.port || (isHttps ? 443 : 80),
|
||||
path: parsedUrl.pathname + parsedUrl.search,
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
"Content-Length": Buffer.byteLength(bodyStr),
|
||||
...headers,
|
||||
},
|
||||
timeout: 30000,
|
||||
};
|
||||
|
||||
const req = lib.request(options, (res) => {
|
||||
let data = "";
|
||||
res.on("data", (chunk) => { data += chunk; });
|
||||
res.on("end", () => {
|
||||
try {
|
||||
resolve(JSON.parse(data));
|
||||
} catch {
|
||||
reject(new Error(`JSON 解析失败: ${data.slice(0, 200)}`));
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
req.on("error", reject);
|
||||
req.on("timeout", () => {
|
||||
req.destroy();
|
||||
reject(new Error("Embedding API 请求超时"));
|
||||
});
|
||||
|
||||
req.write(bodyStr);
|
||||
req.end();
|
||||
});
|
||||
}
|
||||
|
||||
// ─── Embedding 生成 ───────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* 生成文本的 embedding 向量
|
||||
* 失败时返回 null(降级到 TF-IDF)
|
||||
*/
|
||||
export async function generateEmbedding(text: string): Promise<number[] | null> {
|
||||
const config = getEmbeddingConfig();
|
||||
if (!config) return null;
|
||||
|
||||
// 截断过长文本(embedding 模型通常限制 8192 tokens)
|
||||
const truncatedText = text.slice(0, 4000);
|
||||
|
||||
try {
|
||||
const response = await httpPost(
|
||||
config.url,
|
||||
{ "Authorization": `Bearer ${config.key}` },
|
||||
{
|
||||
model: config.model,
|
||||
input: truncatedText,
|
||||
encoding_format: "float",
|
||||
}
|
||||
) as { data?: Array<{ embedding: number[] }>; error?: { message: string } };
|
||||
|
||||
if (response.error) {
|
||||
console.warn(`[Embedding] API 错误: ${response.error.message}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
if (response.data && response.data[0]?.embedding) {
|
||||
return response.data[0].embedding;
|
||||
}
|
||||
|
||||
return null;
|
||||
} catch (err) {
|
||||
console.warn(`[Embedding] 生成失败: ${(err as Error).message}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 批量生成 embedding(带速率限制)
|
||||
*/
|
||||
export async function generateEmbeddingsBatch(
|
||||
texts: string[],
|
||||
batchSize = 20
|
||||
): Promise<Array<number[] | null>> {
|
||||
const results: Array<number[] | null> = [];
|
||||
|
||||
for (let i = 0; i < texts.length; i += batchSize) {
|
||||
const batch = texts.slice(i, i + batchSize);
|
||||
const batchResults = await Promise.all(
|
||||
batch.map(text => generateEmbedding(text))
|
||||
);
|
||||
results.push(...batchResults);
|
||||
|
||||
// 速率限制:每批次间隔 500ms
|
||||
if (i + batchSize < texts.length) {
|
||||
await new Promise(resolve => setTimeout(resolve, 500));
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
// ─── 余弦相似度计算 ───────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* 计算两个向量的余弦相似度
|
||||
*/
|
||||
export function cosineSimilarity(vecA: number[], vecB: number[]): number {
|
||||
if (vecA.length !== vecB.length || vecA.length === 0) return 0;
|
||||
|
||||
let dotProduct = 0;
|
||||
let normA = 0;
|
||||
let normB = 0;
|
||||
|
||||
for (let i = 0; i < vecA.length; i++) {
|
||||
dotProduct += vecA[i] * vecB[i];
|
||||
normA += vecA[i] * vecA[i];
|
||||
normB += vecB[i] * vecB[i];
|
||||
}
|
||||
|
||||
const denominator = Math.sqrt(normA) * Math.sqrt(normB);
|
||||
if (denominator === 0) return 0;
|
||||
|
||||
// 余弦相似度范围 [-1, 1],归一化到 [0, 1]
|
||||
return (dotProduct / denominator + 1) / 2;
|
||||
}
|
||||
|
||||
// ─── 规则文本构建 ─────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* 将规则对象转为用于 embedding 的文本
|
||||
* 包含所有关键字段,提升语义检索质量
|
||||
*/
|
||||
export function buildRuleEmbeddingText(rule: Record<string, unknown>): string {
|
||||
const parts: string[] = [];
|
||||
|
||||
// 标题和基本信息
|
||||
if (rule.title) parts.push(`标题: ${rule.title}`);
|
||||
if (rule.jurisdiction) parts.push(`司法辖区: ${rule.jurisdiction}`);
|
||||
if (rule.assetClass) parts.push(`资产类别: ${rule.assetClass}`);
|
||||
if (rule.ruleType) parts.push(`规则类型: ${rule.ruleType}`);
|
||||
|
||||
// 主要内容
|
||||
if (rule.content) parts.push(`内容: ${String(rule.content).slice(0, 1000)}`);
|
||||
|
||||
// 描述(旧格式兼容)
|
||||
if (rule.description) parts.push(`描述: ${rule.description}`);
|
||||
if (rule.descriptionI18n) {
|
||||
const i18n = rule.descriptionI18n as Record<string, string>;
|
||||
if (i18n.zh) parts.push(`中文描述: ${i18n.zh}`);
|
||||
if (i18n.en) parts.push(`English: ${i18n.en}`);
|
||||
}
|
||||
|
||||
// 所有权要求
|
||||
if (Array.isArray(rule.ownershipRequirements) && rule.ownershipRequirements.length > 0) {
|
||||
parts.push(`所有权要求: ${(rule.ownershipRequirements as string[]).join("; ")}`);
|
||||
}
|
||||
|
||||
// 交易要求
|
||||
if (Array.isArray(rule.tradingRequirements) && rule.tradingRequirements.length > 0) {
|
||||
parts.push(`交易规则: ${(rule.tradingRequirements as string[]).join("; ")}`);
|
||||
}
|
||||
|
||||
// 法律依据
|
||||
if (rule.legalBasis) parts.push(`法律依据: ${rule.legalBasis}`);
|
||||
|
||||
// 标签
|
||||
if (Array.isArray(rule.tags) && rule.tags.length > 0) {
|
||||
parts.push(`标签: ${(rule.tags as string[]).join(", ")}`);
|
||||
}
|
||||
|
||||
return parts.join("\n");
|
||||
}
|
||||
|
||||
// ─── 简单哈希(用于检测内容变化)────────────────────────────────
|
||||
|
||||
function simpleHash(text: string): string {
|
||||
let hash = 0;
|
||||
for (let i = 0; i < text.length; i++) {
|
||||
const char = text.charCodeAt(i);
|
||||
hash = ((hash << 5) - hash) + char;
|
||||
hash = hash & hash; // 转为 32 位整数
|
||||
}
|
||||
return Math.abs(hash).toString(16);
|
||||
}
|
||||
|
||||
// ─── MongoDB 向量缓存 ─────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* 从 MongoDB 加载所有规则的 embedding 缓存
|
||||
*/
|
||||
export async function loadEmbeddingCache(
|
||||
db: { collection: (name: string) => { find: (q: object) => { toArray: () => Promise<unknown[]> } } }
|
||||
): Promise<Map<string, EmbeddingVector>> {
|
||||
const cache = new Map<string, EmbeddingVector>();
|
||||
|
||||
try {
|
||||
const vectors = await db.collection("rule_embeddings").find({}).toArray() as EmbeddingVector[];
|
||||
for (const v of vectors) {
|
||||
cache.set(v.ruleId, v);
|
||||
}
|
||||
console.log(`[Embedding] 已加载 ${cache.size} 条 embedding 缓存`);
|
||||
} catch (err) {
|
||||
console.warn(`[Embedding] 加载缓存失败: ${(err as Error).message}`);
|
||||
}
|
||||
|
||||
return cache;
|
||||
}
|
||||
|
||||
/**
|
||||
* 保存 embedding 到 MongoDB 缓存
|
||||
*/
|
||||
export async function saveEmbeddingCache(
|
||||
db: { collection: (name: string) => { updateOne: (filter: object, update: object, options: object) => Promise<unknown> } },
|
||||
ruleId: string,
|
||||
vector: number[],
|
||||
model: string,
|
||||
textHash: string
|
||||
): Promise<void> {
|
||||
try {
|
||||
await db.collection("rule_embeddings").updateOne(
|
||||
{ ruleId },
|
||||
{
|
||||
$set: {
|
||||
ruleId,
|
||||
vector,
|
||||
model,
|
||||
textHash,
|
||||
createdAt: new Date(),
|
||||
}
|
||||
},
|
||||
{ upsert: true }
|
||||
);
|
||||
} catch (err) {
|
||||
console.warn(`[Embedding] 保存缓存失败: ${(err as Error).message}`);
|
||||
}
|
||||
}
|
||||
|
||||
// ─── 主检索函数 ───────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Dense Embedding 语义检索
|
||||
*
|
||||
* @param query 用户查询文本
|
||||
* @param rules 候选规则列表
|
||||
* @param db MongoDB 实例(用于 embedding 缓存)
|
||||
* @param topK 返回最相关的 K 条规则
|
||||
* @param keywordScores 关键词预匹配分数(可选,用于混合排序)
|
||||
*/
|
||||
export async function semanticSearch(
|
||||
query: string,
|
||||
rules: Record<string, unknown>[],
|
||||
db: {
|
||||
collection: (name: string) => {
|
||||
find: (q: object) => { toArray: () => Promise<unknown[]> };
|
||||
updateOne: (filter: object, update: object, options: object) => Promise<unknown>;
|
||||
}
|
||||
} | null,
|
||||
topK = 5,
|
||||
keywordScores: Map<string, number> = new Map()
|
||||
): Promise<SemanticSearchResult[]> {
|
||||
|
||||
// 1. 生成查询向量
|
||||
const queryVector = await generateEmbedding(query);
|
||||
if (!queryVector) {
|
||||
console.warn("[Embedding] 查询向量生成失败,降级到关键词检索");
|
||||
return [];
|
||||
}
|
||||
|
||||
// 2. 加载 embedding 缓存
|
||||
const cache = db ? await loadEmbeddingCache(db) : new Map<string, EmbeddingVector>();
|
||||
|
||||
// 3. 计算每条规则的语义相似度
|
||||
const results: SemanticSearchResult[] = [];
|
||||
const toCompute: Array<{ rule: Record<string, unknown>; text: string; ruleId: string }> = [];
|
||||
|
||||
for (const rule of rules) {
|
||||
const ruleId = String(rule.ruleId || rule._id || "");
|
||||
const ruleText = buildRuleEmbeddingText(rule);
|
||||
const textHash = simpleHash(ruleText);
|
||||
|
||||
const cached = cache.get(ruleId);
|
||||
|
||||
if (cached && cached.textHash === textHash) {
|
||||
// 使用缓存的向量
|
||||
const semanticScore = cosineSimilarity(queryVector, cached.vector);
|
||||
const keywordScore = keywordScores.get(ruleId) || 0;
|
||||
const combinedScore = semanticScore * 0.7 + keywordScore * 0.3;
|
||||
|
||||
results.push({
|
||||
ruleId,
|
||||
jurisdiction: String(rule.jurisdiction || ""),
|
||||
assetClass: String(rule.assetClass || rule.category || ""),
|
||||
ruleType: String(rule.ruleType || ""),
|
||||
title: String(rule.title || rule.ruleName || ""),
|
||||
content: String(rule.content || rule.description || ""),
|
||||
ownershipRequirements: Array.isArray(rule.ownershipRequirements)
|
||||
? rule.ownershipRequirements as string[] : undefined,
|
||||
tradingRequirements: Array.isArray(rule.tradingRequirements)
|
||||
? rule.tradingRequirements as string[] : undefined,
|
||||
legalBasis: rule.legalBasis ? String(rule.legalBasis) : undefined,
|
||||
officialSource: rule.officialSource ? String(rule.officialSource) : undefined,
|
||||
semanticScore,
|
||||
keywordScore,
|
||||
combinedScore,
|
||||
});
|
||||
} else {
|
||||
// 需要重新计算
|
||||
toCompute.push({ rule, text: ruleText, ruleId });
|
||||
}
|
||||
}
|
||||
|
||||
// 4. 批量计算未缓存规则的向量
|
||||
if (toCompute.length > 0) {
|
||||
console.log(`[Embedding] 计算 ${toCompute.length} 条规则的向量...`);
|
||||
const vectors = await generateEmbeddingsBatch(toCompute.map(r => r.text));
|
||||
|
||||
for (let i = 0; i < toCompute.length; i++) {
|
||||
const { rule, text, ruleId } = toCompute[i];
|
||||
const vector = vectors[i];
|
||||
|
||||
if (vector) {
|
||||
const textHash = simpleHash(text);
|
||||
|
||||
// 异步保存到缓存(不阻塞检索)
|
||||
if (db) {
|
||||
saveEmbeddingCache(db, ruleId, vector, "text-embedding-3-small", textHash)
|
||||
.catch(err => console.warn(`[Embedding] 缓存保存失败: ${err.message}`));
|
||||
}
|
||||
|
||||
const semanticScore = cosineSimilarity(queryVector, vector);
|
||||
const keywordScore = keywordScores.get(ruleId) || 0;
|
||||
const combinedScore = semanticScore * 0.7 + keywordScore * 0.3;
|
||||
|
||||
results.push({
|
||||
ruleId,
|
||||
jurisdiction: String(rule.jurisdiction || ""),
|
||||
assetClass: String(rule.assetClass || rule.category || ""),
|
||||
ruleType: String(rule.ruleType || ""),
|
||||
title: String(rule.title || rule.ruleName || ""),
|
||||
content: String(rule.content || rule.description || ""),
|
||||
ownershipRequirements: Array.isArray(rule.ownershipRequirements)
|
||||
? rule.ownershipRequirements as string[] : undefined,
|
||||
tradingRequirements: Array.isArray(rule.tradingRequirements)
|
||||
? rule.tradingRequirements as string[] : undefined,
|
||||
legalBasis: rule.legalBasis ? String(rule.legalBasis) : undefined,
|
||||
officialSource: rule.officialSource ? String(rule.officialSource) : undefined,
|
||||
semanticScore,
|
||||
keywordScore,
|
||||
combinedScore,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 5. 按综合分数排序,返回 topK
|
||||
results.sort((a, b) => b.combinedScore - a.combinedScore);
|
||||
return results.slice(0, topK);
|
||||
}
|
||||
|
||||
// ─── 预计算所有规则 embedding ─────────────────────────────────────
|
||||
|
||||
/**
|
||||
* 批量预计算所有规则的 embedding 并存入 MongoDB
|
||||
* 建议在爬虫完成后调用,或定时执行
|
||||
*/
|
||||
export async function precomputeAllEmbeddings(
|
||||
rules: Record<string, unknown>[],
|
||||
db: {
|
||||
collection: (name: string) => {
|
||||
find: (q: object) => { toArray: () => Promise<unknown[]> };
|
||||
updateOne: (filter: object, update: object, options: object) => Promise<unknown>;
|
||||
}
|
||||
}
|
||||
): Promise<{ success: number; failed: number; skipped: number }> {
|
||||
|
||||
const config = getEmbeddingConfig();
|
||||
if (!config) {
|
||||
console.warn("[Embedding] API 未配置,跳过预计算");
|
||||
return { success: 0, failed: 0, skipped: rules.length };
|
||||
}
|
||||
|
||||
const cache = await loadEmbeddingCache(db);
|
||||
let success = 0, failed = 0, skipped = 0;
|
||||
|
||||
for (const rule of rules) {
|
||||
const ruleId = String(rule.ruleId || rule._id || "");
|
||||
const ruleText = buildRuleEmbeddingText(rule);
|
||||
const textHash = simpleHash(ruleText);
|
||||
|
||||
// 检查是否需要更新
|
||||
const cached = cache.get(ruleId);
|
||||
if (cached && cached.textHash === textHash) {
|
||||
skipped++;
|
||||
continue;
|
||||
}
|
||||
|
||||
const vector = await generateEmbedding(ruleText);
|
||||
if (vector) {
|
||||
await saveEmbeddingCache(db, ruleId, vector, config.model, textHash);
|
||||
success++;
|
||||
console.log(`[Embedding] ✅ ${ruleId}`);
|
||||
} else {
|
||||
failed++;
|
||||
console.warn(`[Embedding] ❌ ${ruleId}`);
|
||||
}
|
||||
|
||||
// 速率限制
|
||||
await new Promise(resolve => setTimeout(resolve, 200));
|
||||
}
|
||||
|
||||
return { success, failed, skipped };
|
||||
}
|
||||
|
|
@ -17,6 +17,95 @@
|
|||
|
||||
import { MongoClient, Collection, Document } from "mongodb";
|
||||
|
||||
// ─── Dense Embedding API(OpenAI 兼容)────────────────────────────
|
||||
import https from "https";
|
||||
import http from "http";
|
||||
|
||||
const EMBEDDING_API_TIMEOUT = 30000;
|
||||
|
||||
async function callEmbeddingAPI(texts: string[]): Promise<number[][] | null> {
|
||||
const apiUrl = process.env.NAC_AI_API_URL || "";
|
||||
const apiKey = process.env.NAC_AI_API_KEY || "";
|
||||
const model = process.env.NAC_EMBEDDING_MODEL || "text-embedding-3-small";
|
||||
|
||||
if (!apiUrl || !apiKey) return null;
|
||||
|
||||
const url = `${apiUrl.replace(/\/$/, "")}/v1/embeddings`;
|
||||
const body = JSON.stringify({ model, input: texts, encoding_format: "float" });
|
||||
|
||||
return new Promise((resolve) => {
|
||||
try {
|
||||
const parsedUrl = new URL(url);
|
||||
const isHttps = parsedUrl.protocol === "https:";
|
||||
const lib = isHttps ? https : http;
|
||||
|
||||
const options = {
|
||||
hostname: parsedUrl.hostname,
|
||||
port: parsedUrl.port || (isHttps ? 443 : 80),
|
||||
path: parsedUrl.pathname,
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
"Content-Length": Buffer.byteLength(body),
|
||||
"Authorization": `Bearer ${apiKey}`,
|
||||
},
|
||||
timeout: EMBEDDING_API_TIMEOUT,
|
||||
};
|
||||
|
||||
const req = lib.request(options, (res) => {
|
||||
let data = "";
|
||||
res.on("data", (chunk: Buffer) => { data += chunk; });
|
||||
res.on("end", () => {
|
||||
try {
|
||||
const json = JSON.parse(data) as { data?: Array<{ embedding: number[] }>; error?: { message: string } };
|
||||
if (json.error) {
|
||||
console.warn(`[DenseEmbedding] API 错误: ${json.error.message}`);
|
||||
resolve(null);
|
||||
return;
|
||||
}
|
||||
if (json.data && json.data.length > 0) {
|
||||
resolve(json.data.map(d => d.embedding));
|
||||
} else {
|
||||
resolve(null);
|
||||
}
|
||||
} catch {
|
||||
resolve(null);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
req.on("error", () => resolve(null));
|
||||
req.on("timeout", () => { req.destroy(); resolve(null); });
|
||||
req.write(body);
|
||||
req.end();
|
||||
} catch {
|
||||
resolve(null);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Tracks which retrieval backend produced the current index vectors.
// Starts as "tfidf"; presumably flipped to "dense" when a dense build
// succeeds — the setter is not visible in this chunk, verify elsewhere.
let currentVectorType: "dense" | "tfidf" = "tfidf";
|
||||
|
||||
async function generateDenseEmbeddings(texts: string[]): Promise<number[][] | null> {
|
||||
const BATCH_SIZE = 20;
|
||||
const allVectors: number[][] = [];
|
||||
|
||||
for (let i = 0; i < texts.length; i += BATCH_SIZE) {
|
||||
const batch = texts.slice(i, i + BATCH_SIZE).map(t => t.slice(0, 4000));
|
||||
const vectors = await callEmbeddingAPI(batch);
|
||||
if (!vectors) return null;
|
||||
allVectors.push(...vectors);
|
||||
if (i + BATCH_SIZE < texts.length) {
|
||||
await new Promise(resolve => setTimeout(resolve, 300));
|
||||
}
|
||||
}
|
||||
|
||||
return allVectors;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// ─── 类型定义 ─────────────────────────────────────────────────────
|
||||
|
||||
export interface EmbeddingVector {
|
||||
|
|
@ -320,7 +409,7 @@ export async function buildVectorIndex(): Promise<void> {
|
|||
}
|
||||
|
||||
// 构建向量索引
|
||||
globalVectorEngine.buildIndex(rules as unknown as Record<string, unknown>[]);
|
||||
await globalVectorEngine.buildIndexAsync(rules as unknown as Record<string, unknown>[]);
|
||||
lastIndexBuildTime = now;
|
||||
|
||||
console.log(`[EmbeddingRetrieval] 向量索引构建完成,共 ${rules.length} 条规则`);
|
||||
|
|
@ -520,6 +609,10 @@ export async function rebuildVectorIndex(): Promise<{ success: boolean; rulesInd
|
|||
/**
 * Report which retrieval vector type is currently active:
 * "dense" (OpenAI-compatible embeddings) or "tfidf" (fallback).
 */
export function getVectorType(): "dense" | "tfidf" {
  return currentVectorType;
}
|
||||
|
||||
export function getEmbeddingStatus(): {
|
||||
isReady: boolean;
|
||||
lastBuildTime: Date | null;
|
||||
|
|
|
|||
Loading…
Reference in New Issue