Database Schema¶
This document describes the Unifiles database design in detail, covering table structures, indexing strategy, and pgvector configuration.
Database Technology Stack¶
Unifiles uses PostgreSQL as its primary datastore, with the pgvector extension providing vector storage and similarity search.
Core Table Structure¶
Users and Authentication¶
users Table¶
CREATE TABLE users (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
email VARCHAR(255) UNIQUE NOT NULL,
password_hash VARCHAR(255) NOT NULL,
name VARCHAR(100),
-- Quota settings
storage_quota_bytes BIGINT DEFAULT 10737418240, -- 10GB
api_calls_quota INT DEFAULT 10000,
-- Status
is_active BOOLEAN DEFAULT true,
email_verified BOOLEAN DEFAULT false,
-- Timestamps
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
last_login_at TIMESTAMP WITH TIME ZONE
);
-- Note: the UNIQUE constraint on email already creates an index, so idx_users_email is redundant
CREATE INDEX idx_users_email ON users(email);
CREATE INDEX idx_users_active ON users(is_active) WHERE is_active = true;
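The updated_at columns above default to NOW() on insert but are not refreshed automatically on UPDATE. A minimal sketch of a trigger that keeps them current (the function and trigger names are illustrative, not part of the schema above):
CREATE OR REPLACE FUNCTION set_updated_at()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = NOW();
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;
CREATE TRIGGER trg_users_updated_at
BEFORE UPDATE ON users
FOR EACH ROW EXECUTE FUNCTION set_updated_at();
The same trigger function can be reused for any table with an updated_at column (files, knowledge_bases, webhooks).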
api_keys Table¶
CREATE TABLE api_keys (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
user_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE,
-- Key material (only a hash is stored, never the plaintext)
key_prefix VARCHAR(12) NOT NULL, -- e.g. "sk_live_abc"
key_hash VARCHAR(255) NOT NULL, -- bcrypt hash
-- Metadata
name VARCHAR(100),
description TEXT,
-- Permissions and limits
scopes TEXT[] DEFAULT ARRAY['read', 'write'],
rate_limit INT DEFAULT 1000, -- requests per minute
-- Status
is_active BOOLEAN DEFAULT true,
expires_at TIMESTAMP WITH TIME ZONE,
last_used_at TIMESTAMP WITH TIME ZONE,
-- Timestamps
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
revoked_at TIMESTAMP WITH TIME ZONE
);
CREATE INDEX idx_api_keys_user ON api_keys(user_id);
CREATE INDEX idx_api_keys_prefix ON api_keys(key_prefix);
CREATE INDEX idx_api_keys_active ON api_keys(is_active) WHERE is_active = true;
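Because only a bcrypt hash is stored, an incoming key is first narrowed down by its prefix and then verified against key_hash in application code. A minimal lookup sketch (the bind parameters are illustrative):
SELECT id, user_id, key_hash, scopes, rate_limit
FROM api_keys
WHERE key_prefix = $1
  AND is_active = true
  AND revoked_at IS NULL
  AND (expires_at IS NULL OR expires_at > NOW());
-- The application compares the presented key with key_hash using bcrypt
-- and, on success, updates last_used_at.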
Layer 1: File Storage¶
files Table¶
CREATE TABLE files (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
user_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE,
-- File information
filename VARCHAR(255) NOT NULL,
original_filename VARCHAR(255) NOT NULL,
mime_type VARCHAR(100) NOT NULL,
file_size BIGINT NOT NULL,
-- Storage location
storage_bucket VARCHAR(100) NOT NULL,
storage_key VARCHAR(500) NOT NULL,
-- Content hash (used for deduplication)
content_hash VARCHAR(64), -- SHA-256
-- Metadata
metadata JSONB DEFAULT '{}',
tags TEXT[] DEFAULT ARRAY[]::TEXT[],
-- Status
status VARCHAR(20) DEFAULT 'uploaded',
-- uploaded, processing, ready, error, deleted
-- Timestamps
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
deleted_at TIMESTAMP WITH TIME ZONE
);
-- Indexes
CREATE INDEX idx_files_user ON files(user_id);
CREATE INDEX idx_files_status ON files(status);
CREATE INDEX idx_files_hash ON files(content_hash);
CREATE INDEX idx_files_created ON files(created_at DESC);
CREATE INDEX idx_files_metadata ON files USING GIN(metadata);
CREATE INDEX idx_files_tags ON files USING GIN(tags);
-- Soft-delete filtering
CREATE INDEX idx_files_active ON files(user_id, status)
WHERE deleted_at IS NULL;
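content_hash lets an upload be short-circuited when the same bytes already exist for a user. A minimal sketch of the deduplication check (bind parameters illustrative):
SELECT id, storage_bucket, storage_key
FROM files
WHERE user_id = $1
  AND content_hash = $2
  AND deleted_at IS NULL
LIMIT 1;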
Layer 2: Content Extraction¶
extracted_contents Table¶
CREATE TABLE extracted_contents (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
file_id UUID NOT NULL REFERENCES files(id) ON DELETE CASCADE,
-- Extracted content (Markdown is the single source of truth)
markdown_content TEXT NOT NULL,
plain_text TEXT, -- used for full-text search
-- Extraction metadata
extraction_mode VARCHAR(20) NOT NULL, -- simple, normal, advanced
ocr_provider VARCHAR(50),
ocr_config JSONB DEFAULT '{}',
-- Document structure
page_count INT,
word_count INT,
char_count INT,
language VARCHAR(10),
-- Structured data
headings JSONB DEFAULT '[]',
tables JSONB DEFAULT '[]',
images JSONB DEFAULT '[]',
-- Processing information
processing_time_ms INT,
extraction_version VARCHAR(20),
-- Status
status VARCHAR(20) DEFAULT 'pending',
-- pending, processing, completed, failed
error_message TEXT,
-- Timestamps
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
completed_at TIMESTAMP WITH TIME ZONE
);
-- Indexes
CREATE INDEX idx_extractions_file ON extracted_contents(file_id);
CREATE INDEX idx_extractions_status ON extracted_contents(status);
-- Full-text search index
CREATE INDEX idx_extractions_fts ON extracted_contents
USING GIN(to_tsvector('english', plain_text));
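The GIN index supports PostgreSQL full-text search over plain_text. A minimal query sketch that ranks matches for a search phrase (the bind parameter is illustrative; the expression must match the index definition for the index to be used):
SELECT e.file_id,
       ts_rank(to_tsvector('english', e.plain_text),
               plainto_tsquery('english', $1)) AS rank
FROM extracted_contents e
WHERE to_tsvector('english', e.plain_text) @@ plainto_tsquery('english', $1)
ORDER BY rank DESC
LIMIT 20;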
file_processing_logs Table¶
CREATE TABLE file_processing_logs (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
file_id UUID NOT NULL REFERENCES files(id) ON DELETE CASCADE,
extraction_id UUID REFERENCES extracted_contents(id),
-- Processing stage
stage VARCHAR(50) NOT NULL,
-- validation, conversion, ocr, post_processing
-- Status
status VARCHAR(20) NOT NULL,
-- started, completed, failed, skipped
-- Details
message TEXT,
details JSONB DEFAULT '{}',
duration_ms INT,
-- Timestamps
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);
CREATE INDEX idx_processing_logs_file ON file_processing_logs(file_id);
CREATE INDEX idx_processing_logs_extraction ON file_processing_logs(extraction_id);
CREATE INDEX idx_processing_logs_stage ON file_processing_logs(stage);
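The per-stage logs make it easy to see where the extraction pipeline spends its time. A minimal aggregation sketch for one file (bind parameter illustrative):
SELECT stage,
       COUNT(*) AS runs,
       SUM(duration_ms) AS total_ms,
       ROUND(AVG(duration_ms)) AS avg_ms
FROM file_processing_logs
WHERE file_id = $1
GROUP BY stage
ORDER BY total_ms DESC;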
Layer 3: Knowledge Bases¶
knowledge_bases Table¶
CREATE TABLE knowledge_bases (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
user_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE,
-- Basic information
name VARCHAR(100) NOT NULL,
description TEXT,
-- Configuration
embedding_model VARCHAR(100) DEFAULT 'text-embedding-3-small',
embedding_dimensions INT DEFAULT 1536,
-- Default chunking strategy
default_chunking_strategy VARCHAR(20) DEFAULT 'semantic',
default_chunk_size INT DEFAULT 512,
default_chunk_overlap INT DEFAULT 50,
-- Statistics
document_count INT DEFAULT 0,
chunk_count INT DEFAULT 0,
total_tokens INT DEFAULT 0,
-- Metadata
metadata JSONB DEFAULT '{}',
-- Status
is_active BOOLEAN DEFAULT true,
-- Timestamps
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);
CREATE INDEX idx_kb_user ON knowledge_bases(user_id);
CREATE INDEX idx_kb_active ON knowledge_bases(is_active) WHERE is_active = true;
CREATE UNIQUE INDEX idx_kb_user_name ON knowledge_bases(user_id, name);
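document_count, chunk_count, and total_tokens are denormalized counters. A minimal sketch of one way to keep them current when a document finishes indexing (bind parameters illustrative):
UPDATE knowledge_bases
SET document_count = document_count + 1,
    chunk_count    = chunk_count + $2,
    total_tokens   = total_tokens + $3,
    updated_at     = NOW()
WHERE id = $1;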
documents Table¶
CREATE TABLE documents (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
knowledge_base_id UUID NOT NULL REFERENCES knowledge_bases(id) ON DELETE CASCADE,
file_id UUID NOT NULL REFERENCES files(id),
extracted_content_id UUID NOT NULL REFERENCES extracted_contents(id),
-- Chunking configuration (overrides the knowledge base defaults)
chunking_strategy VARCHAR(20) NOT NULL,
chunk_size INT NOT NULL,
chunk_overlap INT NOT NULL,
-- Document metadata
title VARCHAR(255),
metadata JSONB DEFAULT '{}',
tags TEXT[] DEFAULT ARRAY[]::TEXT[],
-- Statistics
chunk_count INT DEFAULT 0,
token_count INT DEFAULT 0,
-- Status
status VARCHAR(20) DEFAULT 'pending',
-- pending, processing, indexed, failed
-- Timestamps
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
indexed_at TIMESTAMP WITH TIME ZONE
);
-- Indexes
CREATE INDEX idx_docs_kb ON documents(knowledge_base_id);
CREATE INDEX idx_docs_file ON documents(file_id);
CREATE INDEX idx_docs_status ON documents(status);
CREATE INDEX idx_docs_metadata ON documents USING GIN(metadata);
CREATE INDEX idx_docs_tags ON documents USING GIN(tags);
-- Prevent the same file from being added to a knowledge base twice
CREATE UNIQUE INDEX idx_docs_kb_file ON documents(knowledge_base_id, file_id);
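The unique index also works as an idempotency guard: re-adding the same file to a knowledge base can be made a no-op. A minimal sketch (bind parameters and literal values are illustrative):
INSERT INTO documents (knowledge_base_id, file_id, extracted_content_id,
                       chunking_strategy, chunk_size, chunk_overlap)
VALUES ($1, $2, $3, 'semantic', 512, 50)
ON CONFLICT (knowledge_base_id, file_id) DO NOTHING
RETURNING id;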
chunks Table (Vector Store)¶
CREATE TABLE chunks (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
knowledge_base_id UUID NOT NULL REFERENCES knowledge_bases(id) ON DELETE CASCADE,
-- Chunk content
content TEXT NOT NULL,
token_count INT NOT NULL,
-- Position information
chunk_index INT NOT NULL,
start_char INT,
end_char INT,
page_numbers INT[],
-- Vector embedding (pgvector)
embedding vector(1536), -- dimensions depend on the embedding model
-- Metadata (inherited from the document)
metadata JSONB DEFAULT '{}',
-- Timestamps
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);
-- Standard indexes
CREATE INDEX idx_chunks_doc ON chunks(document_id);
CREATE INDEX idx_chunks_kb ON chunks(knowledge_base_id);
CREATE INDEX idx_chunks_index ON chunks(document_id, chunk_index);
-- Vector search index (HNSW, recommended for production)
CREATE INDEX idx_chunks_embedding ON chunks
USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64);
-- Alternative: IVFFlat index (suited to very large datasets)
-- CREATE INDEX idx_chunks_embedding_ivf ON chunks
-- USING ivfflat (embedding vector_cosine_ops)
-- WITH (lists = 100);
Webhooks and Events¶
webhooks Table¶
CREATE TABLE webhooks (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
user_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE,
-- Configuration
url VARCHAR(500) NOT NULL,
events TEXT[] NOT NULL,
secret VARCHAR(64) NOT NULL,
-- Metadata
description TEXT,
metadata JSONB DEFAULT '{}',
-- Status
is_active BOOLEAN DEFAULT true,
-- Statistics
total_deliveries INT DEFAULT 0,
successful_deliveries INT DEFAULT 0,
failed_deliveries INT DEFAULT 0,
-- Timestamps
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);
CREATE INDEX idx_webhooks_user ON webhooks(user_id);
CREATE INDEX idx_webhooks_active ON webhooks(is_active) WHERE is_active = true;
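events is a text array, so dispatching an event means finding the active webhooks whose array contains that event type. A minimal sketch (bind parameters illustrative; $2 is the event type name):
SELECT id, url, secret
FROM webhooks
WHERE user_id = $1
  AND is_active = true
  AND $2 = ANY(events);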
webhook_deliveries Table¶
CREATE TABLE webhook_deliveries (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
webhook_id UUID NOT NULL REFERENCES webhooks(id) ON DELETE CASCADE,
-- Event information
event_type VARCHAR(50) NOT NULL,
event_id UUID NOT NULL,
-- Request
request_body JSONB NOT NULL,
request_headers JSONB,
-- Response
response_status INT,
response_body TEXT,
response_time_ms INT,
-- Retry information
attempt_number INT DEFAULT 1,
next_retry_at TIMESTAMP WITH TIME ZONE,
-- Status
status VARCHAR(20) NOT NULL,
-- pending, success, failed, exhausted
error_message TEXT,
-- Timestamps
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
delivered_at TIMESTAMP WITH TIME ZONE
);
CREATE INDEX idx_deliveries_webhook ON webhook_deliveries(webhook_id);
CREATE INDEX idx_deliveries_status ON webhook_deliveries(status);
CREATE INDEX idx_deliveries_retry ON webhook_deliveries(next_retry_at)
WHERE status = 'failed';
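The partial index on next_retry_at exists for the retry worker's polling query. A minimal sketch of picking up deliveries that are due for another attempt:
SELECT id, webhook_id, request_body, attempt_number
FROM webhook_deliveries
WHERE status = 'failed'
  AND next_retry_at <= NOW()
ORDER BY next_retry_at
LIMIT 100;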
pgvector Configuration¶
Extension Installation¶
-- Install the pgvector extension
CREATE EXTENSION IF NOT EXISTS vector;
-- Verify the installation
SELECT * FROM pg_extension WHERE extname = 'vector';
Vector Dimensions¶
Dimension settings for common embedding models:
| Model | Dimensions | Notes |
|---|---|---|
| text-embedding-3-small | 1536 | OpenAI default |
| text-embedding-3-large | 3072 | OpenAI, higher accuracy |
| text-embedding-ada-002 | 1536 | OpenAI, legacy |
| voyage-2 | 1024 | Voyage AI |
| bge-large-zh | 1024 | Optimized for Chinese |
Distance Metrics¶
-- Cosine distance (recommended); similarity = 1 - distance
SELECT * FROM chunks
ORDER BY embedding <=> query_embedding
LIMIT 10;
-- Euclidean (L2) distance
SELECT * FROM chunks
ORDER BY embedding <-> query_embedding
LIMIT 10;
-- Inner product (<#> returns the negative inner product, so ascending order ranks largest first)
SELECT * FROM chunks
ORDER BY embedding <#> query_embedding
LIMIT 10;
Index Selection Guide¶
HNSW Index¶
Suitable when:
- Fewer than ~10 million rows
- High recall is required
- The larger memory footprint is acceptable
CREATE INDEX idx_hnsw ON chunks
USING hnsw (embedding vector_cosine_ops)
WITH (
m = 16, -- maximum number of connections per layer
ef_construction = 64 -- size of the candidate list during index construction
);
-- Set ef_search at query time (higher values improve recall at the cost of speed)
SET hnsw.ef_search = 100;
IVFFlat Index¶
Suitable when:
- More than ~10 million rows
- Memory is constrained
- Slightly lower recall is acceptable
-- The index must be created after the table contains data (list centroids are derived from existing rows)
CREATE INDEX idx_ivf ON chunks
USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100); -- lists ≈ sqrt(row_count) is a common rule of thumb
-- Set probes at query time (higher values improve recall at the cost of speed)
SET ivfflat.probes = 10;
Performance Optimization¶
Partitioning Strategy¶
For large-scale deployments, partition by knowledge base:
-- Create the partitioned table
CREATE TABLE chunks_partitioned (
id UUID NOT NULL,
document_id UUID NOT NULL,
knowledge_base_id UUID NOT NULL,
content TEXT NOT NULL,
token_count INT NOT NULL,
chunk_index INT NOT NULL,
embedding vector(1536),
metadata JSONB DEFAULT '{}',
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
PRIMARY KEY (id, knowledge_base_id)
) PARTITION BY HASH (knowledge_base_id);
-- Create the partitions
CREATE TABLE chunks_p0 PARTITION OF chunks_partitioned
FOR VALUES WITH (MODULUS 4, REMAINDER 0);
CREATE TABLE chunks_p1 PARTITION OF chunks_partitioned
FOR VALUES WITH (MODULUS 4, REMAINDER 1);
CREATE TABLE chunks_p2 PARTITION OF chunks_partitioned
FOR VALUES WITH (MODULUS 4, REMAINDER 2);
CREATE TABLE chunks_p3 PARTITION OF chunks_partitioned
FOR VALUES WITH (MODULUS 4, REMAINDER 3);
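Indexes are not carried over from the original chunks table, so they have to be declared on the partitioned table (PostgreSQL then creates them on each partition). A minimal sketch, assuming pgvector's HNSW access method is used per partition:
CREATE INDEX idx_chunks_part_kb ON chunks_partitioned(knowledge_base_id);
CREATE INDEX idx_chunks_part_embedding ON chunks_partitioned
USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64);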
Connection Pool Configuration¶
# Recommended PostgreSQL connection pool settings
pool_config = {
"min_size": 5,
"max_size": 20,
"max_queries": 50000,
"max_inactive_connection_lifetime": 300,
"command_timeout": 60
}
Query Optimization¶
-- Vector search combined with metadata filtering
SELECT c.id, c.content,
1 - (c.embedding <=> $1) as similarity
FROM chunks c
JOIN documents d ON c.document_id = d.id
WHERE c.knowledge_base_id = $2
AND d.metadata @> $3 -- JSONB containment filter
ORDER BY c.embedding <=> $1
LIMIT $4;
-- Covering index to speed up filtering
-- Note: including a high-dimensional vector column in a B-tree INCLUDE list can exceed
-- the B-tree entry size limit (about one third of a page); for vector(1536) the plain
-- idx_chunks_kb index above is usually the safer choice.
CREATE INDEX idx_chunks_kb_embedding ON chunks(knowledge_base_id)
INCLUDE (embedding);
Data Migration¶
Adding a New Vector Dimension¶
-- Add the new vector column
ALTER TABLE chunks ADD COLUMN embedding_3072 vector(3072);
-- Backfill the embeddings (compute_embedding() is a placeholder; in practice the new
-- vectors are computed by the application and written back in batches)
UPDATE chunks SET embedding_3072 = compute_embedding(content)
WHERE embedding_3072 IS NULL;
-- Create the new index
-- Note: pgvector's HNSW index supports up to 2,000 dimensions for the vector type;
-- 3,072-dimension embeddings may need to be indexed as halfvec or reduced in dimension.
CREATE INDEX CONCURRENTLY idx_chunks_embedding_3072 ON chunks
USING hnsw (embedding_3072 vector_cosine_ops);
-- After switching over, drop the old column
ALTER TABLE chunks DROP COLUMN embedding;
ALTER TABLE chunks RENAME COLUMN embedding_3072 TO embedding;
Versioned Migration Scripts¶
Migration scripts live in the scripts/sql/ directory:
scripts/sql/
├── 011-create-extensions.sql
├── 021-create-users.sql
├── 031-create-file-management.sql
├── 041-create-extractions.sql
├── 051-create-knowledge-bases.sql
├── 061-create-webhooks.sql
└── 071-create-indexes.sql
Execution order: the scripts are applied in ascending order of their numeric filename prefixes, from 011-create-extensions.sql through 071-create-indexes.sql.
Backup Strategy¶
Logical Backups¶
# Full backup
pg_dump -U postgres -d unifiles -F c -f backup.dump
# Schema only
pg_dump -U postgres -d unifiles -s -f schema.sql
# Back up a specific table
pg_dump -U postgres -d unifiles -t chunks -F c -f chunks.dump
Vector Data Export¶
-- Export vectors to CSV
COPY (
SELECT id, document_id,
embedding::text as embedding_text
FROM chunks
) TO '/tmp/chunks_export.csv' WITH CSV HEADER;
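The exported text representation can be cast back to vector on import. A minimal sketch that restores embeddings through a staging table (the table and path names match the export above; otherwise illustrative):
CREATE TEMP TABLE chunks_import (
    id UUID,
    document_id UUID,
    embedding_text TEXT
);
COPY chunks_import FROM '/tmp/chunks_export.csv' WITH CSV HEADER;
UPDATE chunks c
SET embedding = ci.embedding_text::vector
FROM chunks_import ci
WHERE c.id = ci.id;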
Monitoring Queries¶
-- Table size statistics
SELECT
schemaname,
tablename,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) as total_size
FROM pg_tables
WHERE schemaname = 'public'
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC;
-- Index usage
SELECT
indexrelname,
idx_scan,
idx_tup_read,
idx_tup_fetch
FROM pg_stat_user_indexes
ORDER BY idx_scan DESC;
-- Vector index status
SELECT * FROM pg_stat_user_indexes
WHERE indexrelname LIKE '%embedding%';
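Dead tuples in chunks and webhook_deliveries can degrade scan and index performance, so autovacuum activity is worth monitoring as well. A minimal sketch:
-- Dead tuples and autovacuum activity
SELECT relname, n_live_tup, n_dead_tup, last_autovacuum, last_autoanalyze
FROM pg_stat_user_tables
WHERE relname IN ('chunks', 'files', 'webhook_deliveries')
ORDER BY n_dead_tup DESC;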