子查询优化
子查询是复杂查询的基础,但不当使用会影响性能。本篇介绍子查询的优化技巧。
📊 支持情况
| 数据库 | 相关子查询 | EXISTS | IN | JOIN替代 | 优化器 |
|---|---|---|---|---|---|
| MySQL | ✅ | ✅ | ✅ | ✅ | 持续优化 |
| Oracle | ✅ | ✅ | ✅ | ✅ | CBO优化 |
| ClickHouse | ✅ | ✅ | ✅ | ✅ | 向量化 |
| Hologres | ✅ | ✅ | ✅ | ✅ | PG优化器 |
| MaxCompute | ✅ | ✅ | ✅ | ✅ | 分布式优化 |
| Hive | ✅ | ✅ | ✅ | ✅ | CBO优化 |
1. 子查询类型
标量子查询
-- 返回单个值
SELECT
name,
(SELECT MAX(salary) FROM employees) AS max_salary
FROM departments;
-- 多个列(返回单行)
SELECT
d.name,
(SELECT name, salary FROM employees
WHERE department_id = d.id
ORDER BY salary DESC LIMIT 1) AS top_employee
FROM departments d;
行子查询
-- 返回单行多列
SELECT *
FROM products
WHERE (category, price) = (
SELECT category, MAX(price)
FROM products
GROUP BY category
);
列子查询
-- 返回一列多行
SELECT *
FROM products
WHERE category IN (
SELECT category FROM top_categories
);
表子查询
-- 返回多行多列
SELECT *
FROM orders o
JOIN (
SELECT user_id, SUM(amount) AS total
FROM orders
GROUP BY user_id
HAVING SUM(amount) > 10000
) high_value ON o.user_id = high_value.user_id;
2. EXISTS vs IN
IN 子查询
-- MySQL
SELECT *
FROM orders
WHERE user_id IN (
SELECT user_id FROM vip_users
);
EXISTS 子查询
-- MySQL - 通常更高效
SELECT *
FROM orders o
WHERE EXISTS (
SELECT 1 FROM vip_users v
WHERE v.user_id = o.user_id
);
选择指南
| 场景 | 推荐使用 | 原因 |
|---|---|---|
| 小结果集 IN 大表 | IN | 缓存小结果集 |
| 大结果集 IN 小表 | EXISTS | 尽早退出 |
| NOT IN 包含NULL | NOT EXISTS | NULL处理 |
| 相关子查询 | EXISTS | 相关性优化 |
3. JOIN 替代子查询
相关子查询转JOIN
-- ❌ 低效:相关子查询(每行执行一次)
SELECT
o.order_id,
o.amount,
(SELECT MAX(amount) FROM orders WHERE user_id = o.user_id) AS max_order
FROM orders o;
-- ✅ 高效:JOIN(只执行一次)
SELECT
o.order_id,
o.amount,
o.user_max.max_amount AS max_order
FROM orders o
JOIN (
SELECT user_id, MAX(amount) AS max_amount
FROM orders
GROUP BY user_id
) o.user_max ON o.user_id = o.user_max.user_id;
IN 转 JOIN
-- ❌ IN子查询
SELECT *
FROM orders
WHERE user_id IN (
SELECT user_id FROM high_value_users
);
-- ✅ JOIN
SELECT o.*
FROM orders o
JOIN high_value_users h ON o.user_id = h.user_id;
NOT IN vs NOT EXISTS
-- NOT IN(可能有问题)
SELECT *
FROM orders
WHERE user_id NOT IN (
SELECT user_id FROM blacklist_users
);
-- 如果子查询返回NULL,结果为空
-- ✅ NOT EXISTS(安全)
SELECT o.*
FROM orders o
WHERE NOT EXISTS (
SELECT 1 FROM blacklist_users b
WHERE b.user_id = o.user_id
);
4. 派生表优化
派生表下推
-- ✅ 建议:在派生表中先过滤和聚合
SELECT *
FROM (
SELECT user_id, SUM(amount) AS total
FROM orders
WHERE order_date >= '2024-01-01' -- 先过滤
GROUP BY user_id
) user_totals
WHERE total > 1000;
派生表索引
-- 确保派生表查询能利用索引
SELECT *
FROM (
SELECT * FROM orders
WHERE order_date >= '2024-01-01'
ORDER BY order_id
LIMIT 100
) t
JOIN products p ON t.product_id = p.id;
5. 实战案例
场景1:复杂报表查询
-- ❌ 低效写法
SELECT
(SELECT COUNT(*) FROM orders WHERE user_id = u.user_id) AS order_count,
(SELECT SUM(amount) FROM orders WHERE user_id = u.user_id) AS total_spent,
(SELECT MAX(order_date) FROM orders WHERE user_id = u.user_id) AS last_order
FROM users u;
-- ✅ 优化写法
SELECT
u.user_id,
COALESCE(o.order_count, 0) AS order_count,
COALESCE(o.total_spent, 0) AS total_spent,
o.last_order
FROM users u
LEFT JOIN (
SELECT
user_id,
COUNT(*) AS order_count,
SUM(amount) AS total_spent,
MAX(order_date) AS last_order
FROM orders
GROUP BY user_id
) o ON u.user_id = o.user_id;
场景2:排名查询
-- ❌ 低效:相关子查询排名
SELECT
u.user_id,
(SELECT COUNT(*) FROM users u2 WHERE u2.score > u.score) + 1 AS rank
FROM users u;
-- ✅ 高效:窗口函数
SELECT
user_id,
score,
RANK() OVER (ORDER BY score DESC) AS rank
FROM users;
场景3:最新记录查询
-- ❌ 低效:相关子查询
SELECT *
FROM orders o1
WHERE o1.order_date = (
SELECT MAX(order_date)
FROM orders o2
WHERE o2.user_id = o1.user_id
);
-- ✅ 高效:窗口函数
SELECT *
FROM (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY order_date DESC) AS rn
FROM orders
) t
WHERE rn = 1;
6. 优化器提示
MySQL
-- 强制使用派生表合并
SELECT /*+ DERIVED_CONCATITION(t) */ ...
FROM (SELECT * FROM large_table) t;
-- 使用索引
SELECT /*+ INDEX(o idx_user_id) */ ...
FROM orders o;
Oracle
-- 强制哈希连接
SELECT /*+ USE_HASH(o u) */ ...
FROM orders o, users u
WHERE o.user_id = u.user_id;
-- 取消子查询展开
SELECT /*+ NO_UNNEST */ ...
FROM orders o
WHERE EXISTS (SELECT 1 FROM users u WHERE u.user_id = o.user_id);
Hive
-- 启用CBO
SET hive.cbo.enable=true;
-- 启用谓词下推
SET hive.optimize.ppd=true;
⚠️ 注意事项
1. NULL值问题
-- ❌ IN遇到NULL
SELECT * FROM t WHERE col IN (1, 2, NULL);
-- 等同于 col = 1 OR col = 2 OR col = NULL,结果总是UNKNOWN
-- ✅ 使用NOT EXISTS
SELECT * FROM t WHERE NOT EXISTS (SELECT 1 FROM other WHERE other.col = t.col);
2. 相关子查询性能
-- 相关子查询会执行多次
SELECT * FROM t1
WHERE EXISTS (SELECT 1 FROM t2 WHERE t2.id = t1.id);
-- 如果数据量大,建议用JOIN
SELECT DISTINCT t1.* FROM t1 JOIN t2 ON t1.id = t2.id;
3. 临时表优化
-- 大数据量时,先物化子查询
CREATE TEMPORARY TABLE temp_users AS
SELECT user_id, level FROM users WHERE status = 'active';
SELECT * FROM orders
WHERE user_id IN (SELECT user_id FROM temp_users);
📝 练习题
-- 建表
CREATE TABLE products (
product_id INT,
category STRING,
price DECIMAL(10,2),
stock INT
);
CREATE TABLE orders (
order_id INT,
user_id INT,
product_id INT,
amount DECIMAL(10,2),
order_date DATE
);
INSERT INTO products VALUES
(1, 'A', 100, 50),
(2, 'A', 150, 30),
(3, 'B', 200, 20),
(4, 'B', 250, 10);
INSERT INTO orders VALUES
(1, 101, 1, 100, '2024-01-14'),
(2, 102, 2, 150, '2024-01-14'),
(3, 101, 3, 200, '2024-01-15'),
(4, 103, 4, 250, '2024-01-15');
要求:
- 优化:查询购买过A类产品的所有用户
- 优化:查询每个类别销量最高的商品
- 优化:查询用户最近一次订单
-- 参考答案
-- 1. IN改JOIN
SELECT DISTINCT o.user_id
FROM orders o
JOIN products p ON o.product_id = p.product_id
WHERE p.category = 'A';
-- 2. 子查询转JOIN
SELECT p.*
FROM products p
JOIN (
SELECT category, MAX(price) AS max_price
FROM products
GROUP BY category
) t ON p.category = t.category AND p.price = t.max_price;
-- 3. 窗口函数替代相关子查询
SELECT *
FROM (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY order_date DESC) AS rn
FROM orders
) t
WHERE rn = 1;
順子の杂货铺


