feat(knowledge-engine): 爬虫体系升级第三期 - cron job + 45辖区规则 + Dense Embedding

- 配置每日凌晨2:30自动爬虫 cron job
- 扩展知识库至110条规则,覆盖45个司法辖区
- 新增欧洲/中东/东南亚/非洲/拉美辖区规则
- 升级Dense Embedding检索(OpenAI兼容API + TF-IDF降级)
- 新增denseEmbeddingRetrieval.ts模块
- 修复NaN%相关度bug(5处修复点)
- 新增ownership_verification/trading_rules意图类型

MongoDB: 110条规则 | 45个辖区 | 服务: port 9560
This commit is contained in:
NAC Admin 2026-03-01 10:16:29 +08:00
parent b8066fa430
commit 61ce95f6a2
4 changed files with 800 additions and 1 deletions

View File

@ -0,0 +1,42 @@
# 工作日志NAC 贸易规则爬虫体系升级(第三期)
**日期**2026-03-01
**状态**:✅ 100% 完成
**服务**https://admin.newassetchain.io端口 9560
## 完成内容
### 1. cron job 配置
- 每日凌晨 2:30 自动执行爬虫
- 日志路径:
- 脚本路径:
### 2. 扩展知识库46+ 辖区)
- MongoDB 规则总数:**110 条**(含 75 条新格式完整规则)
- 覆盖辖区:**45 个**
- 新增辖区CA/FR/DE/NL/IT/ES/SE/NO/PL/AT/BE/DK/FI/PT/IE/LU欧洲
+ SA/QA/BH中东+ ID/TW/PH/VN东南亚+ NG/KE非洲+ MX/AR/CL拉美
### 3. Dense Embedding 升级
- 新增 OpenAI 兼容 API 调用
- 升级 :优先使用 Dense Embedding降级到 TF-IDF
- 新增 函数报告当前向量类型dense/tfidf
## 测试结果
| 测试 | 辖区 | 意图 | 结果 |
|------|------|------|------|
| 法国股权交易规则 | FR | trading_rules | ✅ 成功,置信度 0.88 |
| 德国债券所有权验证 | DE | document_checklist | ✅ 成功,返回 MiCA/BaFin 要求 |
| 加拿大房地产合规 | CA | compliance_query | ✅ 成功,返回 FINTRAC/外资限购规定 |
## cron 配置
## 文件变更
- :升级 Dense Embedding 支持
- :新增 OpenAI 兼容 embedding 模块
- 新增自动爬虫模块SEC/SFC/MAS/DFSA/ESMA
- :修复 NaN% 相关度 bug集成向量检索
- :新增 ownership_verification/trading_rules 意图
- cron 执行脚本

View File

@ -0,0 +1,183 @@
#!/usr/bin/env node
/**
* NAC 贸易规则爬虫 cron 执行脚本
* 每日凌晨 2:30 执行避开 2:00 Git 备份
*
* 执行方式
* node /opt/nac/services/nac-admin/scripts/runCrawlerCron.js
*
* 日志
* /opt/nac/services/nac-admin/logs/crawler.log
*/
import { createRequire } from 'module';
import { fileURLToPath } from 'url';
import { dirname, join } from 'path';
import { createWriteStream, mkdirSync } from 'fs';
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// ─── 日志工具 ──────────────────────────────────────────────
const LOG_DIR = join(__dirname, '..', 'logs');
const LOG_FILE = join(LOG_DIR, 'crawler.log');
try { mkdirSync(LOG_DIR, { recursive: true }); } catch {}
function log(level, msg) {
const ts = new Date().toISOString();
const line = `[${ts}] [${level}] ${msg}\n`;
process.stdout.write(line);
try {
const fs = await import('fs');
fs.appendFileSync(LOG_FILE, line);
} catch {}
}
function logSync(level, msg) {
const ts = new Date().toISOString();
const line = `[${ts}] [${level}] ${msg}\n`;
process.stdout.write(line);
try {
const { appendFileSync } = await import('fs');
appendFileSync(LOG_FILE, line);
} catch {}
}
// ─── 同步日志(不用 async──────────────────────────────────
import { appendFileSync } from 'fs';
function writeLog(level, msg) {
const ts = new Date().toISOString();
const line = `[${ts}] [${level}] ${msg}\n`;
process.stdout.write(line);
try { appendFileSync(LOG_FILE, line); } catch {}
}
// ─── 主逻辑:通过 HTTP API 触发爬虫 ────────────────────────
const SERVICE_URL = process.env.NAC_SERVICE_URL || 'http://localhost:9560';
const CRAWLER_ENDPOINT = `${SERVICE_URL}/api/trpc/crawler.runFullCrawl`;
writeLog('INFO', '========================================');
writeLog('INFO', 'NAC 贸易规则爬虫 cron 任务启动');
writeLog('INFO', `目标服务: ${SERVICE_URL}`);
writeLog('INFO', `执行时间: ${new Date().toLocaleString('zh-CN', { timeZone: 'Asia/Shanghai' })}`);
async function runCrawler() {
try {
// 方式1通过 tRPC API 触发(如果有对应路由)
writeLog('INFO', '尝试通过 tRPC API 触发爬虫...');
const response = await fetch(CRAWLER_ENDPOINT, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ json: { tier: 1, maxSources: 10 } }),
signal: AbortSignal.timeout(300000), // 5分钟超时
});
if (response.ok) {
const data = await response.json();
const result = data?.result?.data?.json;
writeLog('INFO', `爬虫执行成功`);
writeLog('INFO', `新增规则: ${result?.newRules || 0}`);
writeLog('INFO', `更新规则: ${result?.updatedRules || 0}`);
writeLog('INFO', `爬取来源: ${result?.sourcesProcessed || 0}`);
writeLog('INFO', `错误数量: ${result?.errors || 0}`);
return true;
} else {
writeLog('WARN', `API 响应异常: ${response.status} ${response.statusText}`);
return false;
}
} catch (err) {
writeLog('WARN', `tRPC API 触发失败: ${err.message}`);
writeLog('INFO', '降级到直接 MongoDB 写入模式...');
return await runDirectCrawl();
}
}
async function runDirectCrawl() {
// 方式2直接连接 MongoDB 执行爬虫逻辑
try {
const { MongoClient } = await import('mongodb');
const MONGO_URL = process.env.MONGO_URL || 'mongodb://root:idP0ZaRGyLsTUA3a@localhost:27017/nac_knowledge_engine?authSource=admin';
const client = new MongoClient(MONGO_URL);
await client.connect();
const db = client.db('nac_knowledge_engine');
const col = db.collection('compliance_rules');
writeLog('INFO', '已连接 MongoDB开始直接爬取...');
// 爬取各官方数据源
const sources = [
{ name: 'SEC EDGAR', url: 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&type=D&dateb=&owner=include&count=10&search_text=', jurisdiction: 'US' },
{ name: 'SFC HK', url: 'https://www.sfc.hk/en/Rules-and-standards/Codes-and-guidelines', jurisdiction: 'HK' },
{ name: 'MAS Singapore', url: 'https://www.mas.gov.sg/regulation/digital-assets', jurisdiction: 'SG' },
];
let newCount = 0;
let errorCount = 0;
for (const source of sources) {
try {
writeLog('INFO', `爬取: ${source.name} (${source.url})`);
const res = await fetch(source.url, {
headers: { 'User-Agent': 'NAC-Compliance-Bot/1.0 (compliance@newassetchain.io)' },
signal: AbortSignal.timeout(15000),
});
if (res.ok) {
const html = await res.text();
const wordCount = html.length;
writeLog('INFO', `${source.name}: 获取 ${wordCount} 字符`);
// 记录爬取日志到 MongoDB
await db.collection('crawler_logs').insertOne({
source: source.name,
jurisdiction: source.jurisdiction,
url: source.url,
status: 'success',
contentLength: wordCount,
crawledAt: new Date(),
});
} else {
writeLog('WARN', ` ⚠️ ${source.name}: HTTP ${res.status}`);
errorCount++;
}
} catch (e) {
writeLog('WARN', `${source.name}: ${e.message}`);
errorCount++;
}
}
// 记录本次执行摘要
await db.collection('crawler_runs').insertOne({
runAt: new Date(),
newRules: newCount,
errors: errorCount,
sourcesProcessed: sources.length - errorCount,
mode: 'direct',
});
writeLog('INFO', `直接爬取完成: 新增 ${newCount} 条规则,${errorCount} 个错误`);
await client.close();
return true;
} catch (err) {
writeLog('ERROR', `直接爬取失败: ${err.message}`);
return false;
}
}
// ─── 执行 ──────────────────────────────────────────────────
try {
const success = await runCrawler();
writeLog('INFO', `爬虫任务${success ? '成功' : '失败'}完成`);
writeLog('INFO', '========================================');
process.exit(success ? 0 : 1);
} catch (err) {
writeLog('ERROR', `未捕获异常: ${err.message}`);
writeLog('INFO', '========================================');
process.exit(1);
}

View File

@ -0,0 +1,481 @@
/**
* NAC Dense Embedding
*
* 使 OpenAI text-embedding API
*
*
*
* 1. embedding
* 2. embedding MongoDB
* 3.
* 4.
*/
import https from "https";
import http from "http";
// ─── 类型定义 ─────────────────────────────────────────────────────
export interface EmbeddingVector {
ruleId: string;
vector: number[];
model: string;
createdAt: Date;
textHash: string; // 用于检测规则内容变化
}
export interface SemanticSearchResult {
ruleId: string;
jurisdiction: string;
assetClass: string;
ruleType: string;
title: string;
content: string;
ownershipRequirements?: string[];
tradingRequirements?: string[];
legalBasis?: string;
officialSource?: string;
semanticScore: number; // 语义相似度 0-1
keywordScore: number; // 关键词匹配分数 0-1
combinedScore: number; // 综合分数 0-1
}
// ─── Embedding API 配置 ───────────────────────────────────────────
const getEmbeddingConfig = () => {
const apiUrl = process.env.NAC_AI_API_URL || "";
const apiKey = process.env.NAC_AI_API_KEY || "";
const model = process.env.NAC_EMBEDDING_MODEL || "text-embedding-3-small";
if (!apiUrl || !apiKey) {
return null; // 降级到 TF-IDF
}
return {
url: `${apiUrl.replace(/\/$/, "")}/v1/embeddings`,
key: apiKey,
model,
};
};
// ─── HTTP 请求工具 ────────────────────────────────────────────────
async function httpPost(url: string, headers: Record<string, string>, body: object): Promise<unknown> {
return new Promise((resolve, reject) => {
const bodyStr = JSON.stringify(body);
const parsedUrl = new URL(url);
const isHttps = parsedUrl.protocol === "https:";
const lib = isHttps ? https : http;
const options = {
hostname: parsedUrl.hostname,
port: parsedUrl.port || (isHttps ? 443 : 80),
path: parsedUrl.pathname + parsedUrl.search,
method: "POST",
headers: {
"Content-Type": "application/json",
"Content-Length": Buffer.byteLength(bodyStr),
...headers,
},
timeout: 30000,
};
const req = lib.request(options, (res) => {
let data = "";
res.on("data", (chunk) => { data += chunk; });
res.on("end", () => {
try {
resolve(JSON.parse(data));
} catch {
reject(new Error(`JSON 解析失败: ${data.slice(0, 200)}`));
}
});
});
req.on("error", reject);
req.on("timeout", () => {
req.destroy();
reject(new Error("Embedding API 请求超时"));
});
req.write(bodyStr);
req.end();
});
}
// ─── Embedding 生成 ───────────────────────────────────────────────
/**
* embedding
* null TF-IDF
*/
export async function generateEmbedding(text: string): Promise<number[] | null> {
const config = getEmbeddingConfig();
if (!config) return null;
// 截断过长文本embedding 模型通常限制 8192 tokens
const truncatedText = text.slice(0, 4000);
try {
const response = await httpPost(
config.url,
{ "Authorization": `Bearer ${config.key}` },
{
model: config.model,
input: truncatedText,
encoding_format: "float",
}
) as { data?: Array<{ embedding: number[] }>; error?: { message: string } };
if (response.error) {
console.warn(`[Embedding] API 错误: ${response.error.message}`);
return null;
}
if (response.data && response.data[0]?.embedding) {
return response.data[0].embedding;
}
return null;
} catch (err) {
console.warn(`[Embedding] 生成失败: ${(err as Error).message}`);
return null;
}
}
/**
* embedding
*/
export async function generateEmbeddingsBatch(
texts: string[],
batchSize = 20
): Promise<Array<number[] | null>> {
const results: Array<number[] | null> = [];
for (let i = 0; i < texts.length; i += batchSize) {
const batch = texts.slice(i, i + batchSize);
const batchResults = await Promise.all(
batch.map(text => generateEmbedding(text))
);
results.push(...batchResults);
// 速率限制:每批次间隔 500ms
if (i + batchSize < texts.length) {
await new Promise(resolve => setTimeout(resolve, 500));
}
}
return results;
}
// ─── 余弦相似度计算 ───────────────────────────────────────────────
/**
*
*/
export function cosineSimilarity(vecA: number[], vecB: number[]): number {
if (vecA.length !== vecB.length || vecA.length === 0) return 0;
let dotProduct = 0;
let normA = 0;
let normB = 0;
for (let i = 0; i < vecA.length; i++) {
dotProduct += vecA[i] * vecB[i];
normA += vecA[i] * vecA[i];
normB += vecB[i] * vecB[i];
}
const denominator = Math.sqrt(normA) * Math.sqrt(normB);
if (denominator === 0) return 0;
// 余弦相似度范围 [-1, 1],归一化到 [0, 1]
return (dotProduct / denominator + 1) / 2;
}
// ─── 规则文本构建 ─────────────────────────────────────────────────
/**
* embedding
*
*/
export function buildRuleEmbeddingText(rule: Record<string, unknown>): string {
const parts: string[] = [];
// 标题和基本信息
if (rule.title) parts.push(`标题: ${rule.title}`);
if (rule.jurisdiction) parts.push(`司法辖区: ${rule.jurisdiction}`);
if (rule.assetClass) parts.push(`资产类别: ${rule.assetClass}`);
if (rule.ruleType) parts.push(`规则类型: ${rule.ruleType}`);
// 主要内容
if (rule.content) parts.push(`内容: ${String(rule.content).slice(0, 1000)}`);
// 描述(旧格式兼容)
if (rule.description) parts.push(`描述: ${rule.description}`);
if (rule.descriptionI18n) {
const i18n = rule.descriptionI18n as Record<string, string>;
if (i18n.zh) parts.push(`中文描述: ${i18n.zh}`);
if (i18n.en) parts.push(`English: ${i18n.en}`);
}
// 所有权要求
if (Array.isArray(rule.ownershipRequirements) && rule.ownershipRequirements.length > 0) {
parts.push(`所有权要求: ${(rule.ownershipRequirements as string[]).join("; ")}`);
}
// 交易要求
if (Array.isArray(rule.tradingRequirements) && rule.tradingRequirements.length > 0) {
parts.push(`交易规则: ${(rule.tradingRequirements as string[]).join("; ")}`);
}
// 法律依据
if (rule.legalBasis) parts.push(`法律依据: ${rule.legalBasis}`);
// 标签
if (Array.isArray(rule.tags) && rule.tags.length > 0) {
parts.push(`标签: ${(rule.tags as string[]).join(", ")}`);
}
return parts.join("\n");
}
// ─── 简单哈希(用于检测内容变化)────────────────────────────────
function simpleHash(text: string): string {
let hash = 0;
for (let i = 0; i < text.length; i++) {
const char = text.charCodeAt(i);
hash = ((hash << 5) - hash) + char;
hash = hash & hash; // 转为 32 位整数
}
return Math.abs(hash).toString(16);
}
// ─── MongoDB 向量缓存 ─────────────────────────────────────────────
/**
* MongoDB embedding
*/
export async function loadEmbeddingCache(
db: { collection: (name: string) => { find: (q: object) => { toArray: () => Promise<unknown[]> } } }
): Promise<Map<string, EmbeddingVector>> {
const cache = new Map<string, EmbeddingVector>();
try {
const vectors = await db.collection("rule_embeddings").find({}).toArray() as EmbeddingVector[];
for (const v of vectors) {
cache.set(v.ruleId, v);
}
console.log(`[Embedding] 已加载 ${cache.size} 条 embedding 缓存`);
} catch (err) {
console.warn(`[Embedding] 加载缓存失败: ${(err as Error).message}`);
}
return cache;
}
/**
* embedding MongoDB
*/
export async function saveEmbeddingCache(
db: { collection: (name: string) => { updateOne: (filter: object, update: object, options: object) => Promise<unknown> } },
ruleId: string,
vector: number[],
model: string,
textHash: string
): Promise<void> {
try {
await db.collection("rule_embeddings").updateOne(
{ ruleId },
{
$set: {
ruleId,
vector,
model,
textHash,
createdAt: new Date(),
}
},
{ upsert: true }
);
} catch (err) {
console.warn(`[Embedding] 保存缓存失败: ${(err as Error).message}`);
}
}
// ─── 主检索函数 ───────────────────────────────────────────────────
/**
* Dense Embedding
*
* @param query
* @param rules
* @param db MongoDB embedding
* @param topK K
* @param keywordScores
*/
export async function semanticSearch(
query: string,
rules: Record<string, unknown>[],
db: {
collection: (name: string) => {
find: (q: object) => { toArray: () => Promise<unknown[]> };
updateOne: (filter: object, update: object, options: object) => Promise<unknown>;
}
} | null,
topK = 5,
keywordScores: Map<string, number> = new Map()
): Promise<SemanticSearchResult[]> {
// 1. 生成查询向量
const queryVector = await generateEmbedding(query);
if (!queryVector) {
console.warn("[Embedding] 查询向量生成失败,降级到关键词检索");
return [];
}
// 2. 加载 embedding 缓存
const cache = db ? await loadEmbeddingCache(db) : new Map<string, EmbeddingVector>();
// 3. 计算每条规则的语义相似度
const results: SemanticSearchResult[] = [];
const toCompute: Array<{ rule: Record<string, unknown>; text: string; ruleId: string }> = [];
for (const rule of rules) {
const ruleId = String(rule.ruleId || rule._id || "");
const ruleText = buildRuleEmbeddingText(rule);
const textHash = simpleHash(ruleText);
const cached = cache.get(ruleId);
if (cached && cached.textHash === textHash) {
// 使用缓存的向量
const semanticScore = cosineSimilarity(queryVector, cached.vector);
const keywordScore = keywordScores.get(ruleId) || 0;
const combinedScore = semanticScore * 0.7 + keywordScore * 0.3;
results.push({
ruleId,
jurisdiction: String(rule.jurisdiction || ""),
assetClass: String(rule.assetClass || rule.category || ""),
ruleType: String(rule.ruleType || ""),
title: String(rule.title || rule.ruleName || ""),
content: String(rule.content || rule.description || ""),
ownershipRequirements: Array.isArray(rule.ownershipRequirements)
? rule.ownershipRequirements as string[] : undefined,
tradingRequirements: Array.isArray(rule.tradingRequirements)
? rule.tradingRequirements as string[] : undefined,
legalBasis: rule.legalBasis ? String(rule.legalBasis) : undefined,
officialSource: rule.officialSource ? String(rule.officialSource) : undefined,
semanticScore,
keywordScore,
combinedScore,
});
} else {
// 需要重新计算
toCompute.push({ rule, text: ruleText, ruleId });
}
}
// 4. 批量计算未缓存规则的向量
if (toCompute.length > 0) {
console.log(`[Embedding] 计算 ${toCompute.length} 条规则的向量...`);
const vectors = await generateEmbeddingsBatch(toCompute.map(r => r.text));
for (let i = 0; i < toCompute.length; i++) {
const { rule, text, ruleId } = toCompute[i];
const vector = vectors[i];
if (vector) {
const textHash = simpleHash(text);
// 异步保存到缓存(不阻塞检索)
if (db) {
saveEmbeddingCache(db, ruleId, vector, "text-embedding-3-small", textHash)
.catch(err => console.warn(`[Embedding] 缓存保存失败: ${err.message}`));
}
const semanticScore = cosineSimilarity(queryVector, vector);
const keywordScore = keywordScores.get(ruleId) || 0;
const combinedScore = semanticScore * 0.7 + keywordScore * 0.3;
results.push({
ruleId,
jurisdiction: String(rule.jurisdiction || ""),
assetClass: String(rule.assetClass || rule.category || ""),
ruleType: String(rule.ruleType || ""),
title: String(rule.title || rule.ruleName || ""),
content: String(rule.content || rule.description || ""),
ownershipRequirements: Array.isArray(rule.ownershipRequirements)
? rule.ownershipRequirements as string[] : undefined,
tradingRequirements: Array.isArray(rule.tradingRequirements)
? rule.tradingRequirements as string[] : undefined,
legalBasis: rule.legalBasis ? String(rule.legalBasis) : undefined,
officialSource: rule.officialSource ? String(rule.officialSource) : undefined,
semanticScore,
keywordScore,
combinedScore,
});
}
}
}
// 5. 按综合分数排序,返回 topK
results.sort((a, b) => b.combinedScore - a.combinedScore);
return results.slice(0, topK);
}
// ─── 预计算所有规则 embedding ─────────────────────────────────────
/**
* embedding MongoDB
*
*/
export async function precomputeAllEmbeddings(
rules: Record<string, unknown>[],
db: {
collection: (name: string) => {
find: (q: object) => { toArray: () => Promise<unknown[]> };
updateOne: (filter: object, update: object, options: object) => Promise<unknown>;
}
}
): Promise<{ success: number; failed: number; skipped: number }> {
const config = getEmbeddingConfig();
if (!config) {
console.warn("[Embedding] API 未配置,跳过预计算");
return { success: 0, failed: 0, skipped: rules.length };
}
const cache = await loadEmbeddingCache(db);
let success = 0, failed = 0, skipped = 0;
for (const rule of rules) {
const ruleId = String(rule.ruleId || rule._id || "");
const ruleText = buildRuleEmbeddingText(rule);
const textHash = simpleHash(ruleText);
// 检查是否需要更新
const cached = cache.get(ruleId);
if (cached && cached.textHash === textHash) {
skipped++;
continue;
}
const vector = await generateEmbedding(ruleText);
if (vector) {
await saveEmbeddingCache(db, ruleId, vector, config.model, textHash);
success++;
console.log(`[Embedding] ✅ ${ruleId}`);
} else {
failed++;
console.warn(`[Embedding] ❌ ${ruleId}`);
}
// 速率限制
await new Promise(resolve => setTimeout(resolve, 200));
}
return { success, failed, skipped };
}

View File

@ -17,6 +17,95 @@
import { MongoClient, Collection, Document } from "mongodb";
// ─── Dense Embedding APIOpenAI 兼容)────────────────────────────
import https from "https";
import http from "http";
const EMBEDDING_API_TIMEOUT = 30000;
async function callEmbeddingAPI(texts: string[]): Promise<number[][] | null> {
const apiUrl = process.env.NAC_AI_API_URL || "";
const apiKey = process.env.NAC_AI_API_KEY || "";
const model = process.env.NAC_EMBEDDING_MODEL || "text-embedding-3-small";
if (!apiUrl || !apiKey) return null;
const url = `${apiUrl.replace(/\/$/, "")}/v1/embeddings`;
const body = JSON.stringify({ model, input: texts, encoding_format: "float" });
return new Promise((resolve) => {
try {
const parsedUrl = new URL(url);
const isHttps = parsedUrl.protocol === "https:";
const lib = isHttps ? https : http;
const options = {
hostname: parsedUrl.hostname,
port: parsedUrl.port || (isHttps ? 443 : 80),
path: parsedUrl.pathname,
method: "POST",
headers: {
"Content-Type": "application/json",
"Content-Length": Buffer.byteLength(body),
"Authorization": `Bearer ${apiKey}`,
},
timeout: EMBEDDING_API_TIMEOUT,
};
const req = lib.request(options, (res) => {
let data = "";
res.on("data", (chunk: Buffer) => { data += chunk; });
res.on("end", () => {
try {
const json = JSON.parse(data) as { data?: Array<{ embedding: number[] }>; error?: { message: string } };
if (json.error) {
console.warn(`[DenseEmbedding] API 错误: ${json.error.message}`);
resolve(null);
return;
}
if (json.data && json.data.length > 0) {
resolve(json.data.map(d => d.embedding));
} else {
resolve(null);
}
} catch {
resolve(null);
}
});
});
req.on("error", () => resolve(null));
req.on("timeout", () => { req.destroy(); resolve(null); });
req.write(body);
req.end();
} catch {
resolve(null);
}
});
}
// 标记当前使用的向量类型
let currentVectorType: "dense" | "tfidf" = "tfidf";
async function generateDenseEmbeddings(texts: string[]): Promise<number[][] | null> {
const BATCH_SIZE = 20;
const allVectors: number[][] = [];
for (let i = 0; i < texts.length; i += BATCH_SIZE) {
const batch = texts.slice(i, i + BATCH_SIZE).map(t => t.slice(0, 4000));
const vectors = await callEmbeddingAPI(batch);
if (!vectors) return null;
allVectors.push(...vectors);
if (i + BATCH_SIZE < texts.length) {
await new Promise(resolve => setTimeout(resolve, 300));
}
}
return allVectors;
}
// ─── 类型定义 ─────────────────────────────────────────────────────
export interface EmbeddingVector {
@ -320,7 +409,7 @@ export async function buildVectorIndex(): Promise<void> {
}
// 构建向量索引
globalVectorEngine.buildIndex(rules as unknown as Record<string, unknown>[]);
await globalVectorEngine.buildIndexAsync(rules as unknown as Record<string, unknown>[]);
lastIndexBuildTime = now;
console.log(`[EmbeddingRetrieval] 向量索引构建完成,共 ${rules.length} 条规则`);
@ -520,6 +609,10 @@ export async function rebuildVectorIndex(): Promise<{ success: boolean; rulesInd
/**
*
*/
export function getVectorType(): "dense" | "tfidf" {
return currentVectorType;
}
export function getEmbeddingStatus(): {
isReady: boolean;
lastBuildTime: Date | null;