SQL查询优化详解
本篇系统介绍SQL查询优化的核心方法,涵盖执行计划分析、索引优化、查询改写、连接优化等技巧,帮助大幅提升查询性能。
一、执行计划分析
1.1 EXPLAIN使用
执行计划是优化的第一步,理解执行计划才能对症下药。
-- MySQL
EXPLAIN SELECT * FROM users WHERE id = 100;
EXPLAIN ANALYZE SELECT * FROM users WHERE id = 100; -- 实际执行并计时
-- Oracle
EXPLAIN PLAN FOR SELECT * FROM users WHERE id = 100;
SELECT * FROM TABLE(DBMS_XPLAN.DISPLAY);
-- ClickHouse
EXPLAIN SELECT * FROM users WHERE id = 100;
-- Hive
EXPLAIN SELECT * FROM users WHERE id = 100;
EXPLAIN EXTENDED SELECT * FROM users WHERE id = 100; -- 详细输出
1.2 执行计划关键指标
| 字段 | 说明 | 优化关注点 |
|---|
| type | 连接类型 | 优先ALL → index → range → ref → const |
| possible_keys | 可用索引 | 应有预期索引 |
| key | 实际使用索引 | 需与预期一致 |
| rows | 扫描行数 | 越少越好 |
| Extra | 额外信息 | Using filesort/Using temporary需优化 |
1.3 连接类型对比(MySQL)
-- ❌ ALL 全表扫描
EXPLAIN SELECT * FROM users WHERE age = 25;
-- type: ALL, rows: 1000000
-- ❌ index 索引全扫描
EXPLAIN SELECT id FROM users ORDER BY created_at;
-- type: index
-- ✅ range 索引范围扫描
EXPLAIN SELECT * FROM users WHERE id BETWEEN 100 AND 200;
-- type: range
-- ✅ ref 索引等值查询
EXPLAIN SELECT * FROM users WHERE status = 'active';
-- type: ref
-- ✅ const 主键等值查询
EXPLAIN SELECT * FROM users WHERE id = 100;
-- type: const
1.4 分布式执行计划(ClickHouse)
EXPLAIN SELECT
date,
count() AS c
FROM events
WHERE date >= '2024-01-01'
GROUP BY date
ORDER BY c DESC
LIMIT 10;
-- 输出解读
-- ┌─explain─────────────────────────────────────┐
-- │ Expression (Projection) │ -- 投影阶段
-- │ Limit (LIMIT 10) │ -- 取Top 10
-- │ Sort (ORDER BY count() DESC) │ -- 排序
-- │ Aggregating │ -- 聚合
-- │ Filter (WHERE date >= '2024-01-01') │ -- 过滤
-- │ ReadFromMergeTree (events) │ -- 数据读取
-- └─────────────────────────────────────────────┘
二、索引优化
2.1 索引类型选择
| 索引类型 | 适用场景 | 特点 |
|---|
| B-Tree | 等值查询、范围查询 | 默认类型,平衡读写 |
| Hash | 等值查询 | 精确匹配,不支持范围 |
| Bitmap | 低基数列 | 位运算,并行友好 |
| Full-Text | 文本搜索 | 中文需分词支持 |
| Spatial | 地理坐标 | 空间索引 |
2.2 高效索引设计
-- ❌ 错误示范:索引列被函数包裹
SELECT * FROM orders WHERE YEAR(created_at) = 2024;
-- 索引失效
-- ✅ 正确写法:范围条件放索引列后
SELECT * FROM orders WHERE created_at >= '2024-01-01' AND created_at < '2025-01-01';
-- ❌ 错误示范:隐式类型转换
SELECT * FROM users WHERE phone = 13800138000;
-- phone是VARCHAR类型,触发转换,索引失效
-- ✅ 正确写法:保持类型一致
SELECT * FROM users WHERE phone = '13800138000';
-- 复合索引最佳实践:区分度高的列放前面
-- 索引:(status, created_at, user_id)
SELECT * FROM orders
WHERE status = 'completed'
AND created_at >= '2024-01-01'
AND user_id = 100;
-- 查询只用到了索引前两列,第三列未被使用
-- 考虑是否需要调整索引顺序
2.3 索引覆盖查询
-- 创建覆盖索引
CREATE INDEX idx_user_status_created ON users(status, created_at, name);
-- ✅ 使用覆盖索引,无需回表
SELECT name, created_at FROM users WHERE status = 'active' AND created_at > '2024-01-01';
-- Extra: Using index
-- ❌ 需要回表查询
SELECT * FROM users WHERE status = 'active';
-- 需要访问聚簇索引获取全部列
2.4 索引维护
-- MySQL 查看索引基数
SHOW INDEX FROM users;
ANALYZE TABLE users; -- 刷新索引统计信息
-- Oracle 重建索引
ALTER INDEX idx_name REBUILD;
-- ClickHouse 手动合并分区
OPTIMIZE TABLE events PARTITION ('2024-01');
-- Hive 修复分区元数据
MSCK REPAIR TABLE table_name;
三、查询改写技巧
3.1 子查询优化
-- ❌ 原始:IN子查询(可能多次执行)
SELECT * FROM orders
WHERE user_id IN (SELECT user_id FROM users WHERE status = 'vip');
-- ✅ 改写1:JOIN(通常更高效)
SELECT o.*
FROM orders o
JOIN users u ON o.user_id = u.user_id
WHERE u.status = 'vip';
-- ✅ 改写2:EXISTS(适合判断存在性)
SELECT o.*
FROM orders o
WHERE EXISTS (
SELECT 1 FROM users u
WHERE u.user_id = o.user_id AND u.status = 'vip'
);
-- ❌ 原始:相关子查询(每行执行一次)
SELECT o.*,
(SELECT SUM(amount) FROM order_items WHERE order_id = o.id) AS total
FROM orders o;
-- ✅ 改写:JOIN + 聚合
SELECT o.*, i.total
FROM orders o
LEFT JOIN (
SELECT order_id, SUM(amount) AS total
FROM order_items
GROUP BY order_id
) i ON o.id = i.order_id;
3.2 COUNT优化
-- ❌ 多次COUNT
SELECT
COUNT(*) AS total,
COUNT(status = 'active' OR NULL) AS active,
COUNT(status = 'vip' OR NULL) AS vip
FROM users;
-- ✅ 一次扫描完成所有计数
SELECT
COUNT(*) AS total,
SUM(CASE WHEN status = 'active' THEN 1 ELSE 0 END) AS active,
SUM(CASE WHEN status = 'vip' THEN 1 ELSE 0 END) AS vip
FROM users;
-- ✅ 使用近似函数(大数据量)
SELECT approx_count_distinct(user_id) FROM events;
3.3 分页优化
-- ❌ 传统分页(深度分页极慢)
SELECT * FROM orders ORDER BY created_at DESC LIMIT 1000000, 20;
-- ✅ 优化1:基于上一页ID
SELECT * FROM orders
WHERE id > 1000000
ORDER BY id
LIMIT 20;
-- ✅ 优化2:延迟关联
SELECT * FROM orders
INNER JOIN (
SELECT id FROM orders
ORDER BY created_at DESC
LIMIT 1000000, 20
) t USING (id);
-- ✅ 优化3:游标分页(实时场景)
SELECT * FROM orders
WHERE created_at < '2024-01-15T00:00:00'
ORDER BY created_at DESC
LIMIT 20;
3.4 OR改写UNION
-- ❌ OR可能放弃索引
SELECT * FROM users WHERE status = 'active' OR city = 'Beijing';
-- ✅ 改写为UNION
SELECT * FROM users WHERE status = 'active'
UNION
SELECT * FROM users WHERE city = 'Beijing';
-- 注意:UNION会去重,UNION ALL不去重
-- 如果不需要去重且数据量大,用UNION ALL
四、连接优化
4.1 连接顺序
-- ❌ 小表驱动大表(错误)
SELECT /*+ LEADING(small_table large_table) */
l.*, r.*
FROM large_table l
JOIN small_table s ON l.id = s.id;
-- ✅ 大表驱动小表(正确)
SELECT /*+ LEADING(large_table small_table) */
l.*, r.*
FROM large_table l
JOIN small_table s ON l.id = s.id;
-- Oracle/Hint指定驱动表
SELECT /*+ ORDERED */
l.*, r.*
FROM orders l, order_items r
WHERE l.id = r.order_id;
4.2 连接类型选择
-- 内连接:只保留匹配行
SELECT * FROM orders o
INNER JOIN users u ON o.user_id = u.user_id;
-- 左连接:保留左表所有行
SELECT * FROM orders o
LEFT JOIN users u ON o.user_id = u.user_id
WHERE u.user_id IS NULL; -- 找出无订单用户
-- 多个LEFT JOIN注意
-- ❌ 错误:可能产生笛卡尔积
SELECT * FROM a LEFT JOIN b ON a.id = b.id
LEFT JOIN c ON a.id = c.id;
-- ✅ 正确:逐层连接
SELECT * FROM a
LEFT JOIN (b JOIN c ON b.id = c.id) ON a.id = b.id;
4.3 多表连接优化
-- 8表连接查询优化
SELECT /*+ LEADING(a b c d e f g h) USE_NL(b c d e f g h) */
a.*, b.*, c.*, d.*, e.*, f.*, g.*, h.*
FROM a
JOIN b ON a.id = b.a_id
JOIN c ON b.id = c.b_id
JOIN d ON c.id = d.c_id
JOIN e ON d.id = e.d_id
JOIN f ON e.id = f.e_id
JOIN g ON f.id = g.f_id
JOIN h ON g.id = h.g_id;
五、分区优化
5.1 分区裁剪
-- ✅ 正确:分区字段在过滤条件中
SELECT * FROM events
WHERE date = '2024-01-15'; -- 只扫描单个分区
-- ❌ 错误:分区字段被函数处理
SELECT * FROM events
WHERE DATE_FORMAT(date, '%Y-%m-%d') = '2024-01-15'; -- 全分区扫描
-- ✅ 正确:范围条件
SELECT * FROM events
WHERE date >= '2024-01-01' AND date < '2024-02-01';
-- 分区表关联优化
SELECT /*+ PARTITION(p) */
*
FROM facts f
JOIN dims d ON f.dim_id = d.id
WHERE f.date = '2024-01-15';
5.2 ClickHouse分区优化
-- 查看分区信息
SELECT partition, count() FROM events GROUP BY partition;
-- 手动合并小分区
OPTIMIZE TABLE events PARTITION ('2024-01-01');
-- 删除旧分区
ALTER TABLE events DROP PARTITION ('2023-01-01');
-- 查询时指定分区
SELECT * FROM events PARTITION ('2024-01');
六、大数据平台优化
6.1 Hive优化
-- 开启MapJoin(小表广播)
SET hive.auto.convert.join=true;
SET hive.mapjoin.smalltable.filesize=25000000;
-- 并行执行
SET hive.exec.parallel=true;
SET hive.exec.parallel.thread.number=8;
-- 向量化执行
SET hive.vectorized.execution.enabled=true;
SET hive.vectorized.execution.reduce.enabled=true;
-- 倾斜优化
SET hive.groupby.skewindata=true;
SET hive.optimize.skewjoin=true;
-- 压缩输出
SET hive.exec.compress.output=true;
SET hive.exec.compress.intermediate=true;
6.2 MapReduce调优
-- 调整Map数
SET mapreduce.input.fileinputformat.split.maxsize=256000000;
SET mapreduce.input.fileinputformat.split.minsize=128000000;
-- 调整Reduce数
SET mapreduce.job.reduces=64;
SET hive.exec.reducers.bytes.per.reducer=128000000;
-- 内存设置
SET mapreduce.map.memory.mb=4096;
SET mapreduce.reduce.memory.mb=8192;
6.3 ClickHouse优化
-- 线程数设置
SET max_threads=16;
SET max_parallel_replicas=2;
-- 内存限制
SET max_memory_usage=8000000000;
SET max_bytes_before_external_sort=2000000000;
-- 分布式查询优化
SET distributed_group_by_no_merge=1;
SET optimize_skip_unused_shards=1;
-- 谓词下推
SET enable_optimize_predicate_pushdown=1;
6.4 MaxCompute优化
-- 开启列裁剪
SET odps.sql.allow.fullscan=false;
-- 并行度设置
SET odps.sql.reducer.num=128;
-- 小表广播
SET odps.sql.hive.compatible=true;
-- 动态分区
SET odps.sql.allow.fullscan=true;
SET odps.sql.dynamic.partition=true;
SET odps.sql.dynamic.partition.mode=nonstrict;
七、统计信息与成本模型
7.1 统计信息更新
-- MySQL
ANALYZE TABLE users; -- 更新表统计信息
SHOW TABLE STATUS LIKE 'users';
-- Oracle
EXEC DBMS_STATS.GATHER_TABLE_STATS('SCHEMA', 'USERS');
-- Hive
ANALYZE TABLE users COMPUTE STATISTICS;
ANALYZE TABLE users COMPUTE STATISTICS FOR COLUMNS;
7.2 执行计划缓存
-- MySQL 8.0+ 物化视图
CREATE MATERIALIZED VIEW mv_stats
AS SELECT date, COUNT(*) FROM events GROUP BY date;
-- ClickHouse 物化视图
CREATE MATERIALIZED VIEW mv_daily
ENGINE = SummingMergeTree()
AS SELECT date, count() FROM events GROUP BY date;
-- Oracle 自动重写查询
SELECT /*+ REWRITE */ * FROM sales WHERE EXTRACT(YEAR FROM date) = 2024;
八、常见反模式
8.1 避免全表扫描
-- ❌ 缺少WHERE条件
SELECT * FROM large_table; -- 全部扫描
-- ✅ 添加限制
SELECT * FROM large_table LIMIT 100;
-- ❌ OR导致索引失效
SELECT * FROM t WHERE a = 1 OR b = 2;
-- ✅ 改写UNION
SELECT * FROM t WHERE a = 1
UNION
SELECT * FROM t WHERE b = 2;
-- ❌ IN列表过长
SELECT * FROM t WHERE id IN (1,2,3,...,10000);
-- ✅ 改用JOIN临时表
CREATE TEMPORARY TABLE ids (id INT PRIMARY KEY);
INSERT INTO ids VALUES (1),(2),...,(10000);
SELECT t.* FROM t JOIN ids USING (id);
8.2 避免过多列
-- ❌ SELECT *
SELECT * FROM large_table; -- 读取所有列
-- ✅ 只选需要的列
SELECT id, name, status FROM large_table;
8.3 避免在WHERE中对列运算
-- ❌ 计算导致索引失效
SELECT * FROM orders WHERE amount * 1.1 > 1000;
-- ✅ 移到常量侧
SELECT * FROM orders WHERE amount > 1000 / 1.1;
九、优化检查清单
9.1 上线前检查
-- 1. 执行计划分析
EXPLAIN SELECT ...
-- 2. 确认索引被使用
SHOW CREATE TABLE users;
-- 查看possible_keys和key
-- 3. 验证扫描行数
EXPLAIN ANALYZE SELECT ...
-- 4. 测试不同数据量
-- 测试1%、50%、100%数据量的性能
-- 5. 检查是否有filesort/temporary
EXPLAIN SELECT ... -- 查看Extra列
9.2 监控指标
| 指标 | 告警阈值 | 优化方向 |
|---|
| 全表扫描 | >10000行 | 添加索引 |
| Using filesort | 存在 | 优化排序 |
| Using temporary | 存在 | 优化GROUP BY |
| 慢查询 | >1秒 | 综合优化 |
十、总结
优化原则优先级:
- 先分析:用EXPLAIN定位瓶颈
- 后索引:合理设计索引结构
- 再改写:优化查询语句结构
- 调参数:根据场景调整配置
- 控数据:归档清理历史数据
| 优化手段 | 效果 | 成本 |
|---|
| 添加索引 | 高 | 低 |
| 查询改写 | 高 | 低 |
| 分区裁剪 | 高 | 中 |
| 硬件升级 | 高 | 高 |
| 架构重构 | 最高 | 最高 |