分组TOP-N查询
分组TOP-N是数据分析中的高频需求,用于从每个分组中取出排名前N的记录。
📊 支持情况
| 数据库 | 支持情况 | 备注 |
|---|---|---|
| MySQL | ✅ 完整支持 | 8.0+窗口函数 |
| Oracle | ✅ 完整支持 | ROWNUM或窗口函数 |
| ClickHouse | ✅ 完整支持 | ALL/GROUP BY技巧 |
| Hologres | ✅ 完整支持 | 窗口函数 |
| MaxCompute | ✅ 完整支持 | 窗口函数 |
| Hive | ✅ 完整支持 | 窗口函数 |
1. 基本分组TOP-N
场景:每个部门薪资最高的前3名员工
MySQL 8.0+
-- 方法1:窗口函数
SELECT *
FROM (
SELECT
emp_id,
dept_id,
salary,
ROW_NUMBER() OVER (
PARTITION BY dept_id
ORDER BY salary DESC
) AS rn
FROM employees
) t
WHERE rn <= 3;
Oracle
-- 方法1:窗口函数(推荐)
SELECT *
FROM (
SELECT
emp_id,
dept_id,
salary,
ROW_NUMBER() OVER (
PARTITION BY dept_id
ORDER BY salary DESC
) AS rn
FROM employees
)
WHERE rn <= 3;
-- 方法2:ROWNUM(旧版本)
SELECT *
FROM (
SELECT emp_id, dept_id, salary
FROM employees
ORDER BY salary DESC
)
WHERE ROWNUM <= 3;
Hive / MaxCompute / Hologres
SELECT *
FROM (
SELECT
emp_id,
dept_id,
salary,
ROW_NUMBER() OVER (
PARTITION BY dept_id
ORDER BY salary DESC
) AS rn
FROM employees
) t
WHERE rn <= 3;
ClickHouse
-- 方法1:窗口函数
SELECT *
FROM (
SELECT
emp_id,
dept_id,
salary,
row_number() OVER (
PARTITION BY dept_id
ORDER BY salary DESC
) AS rn
FROM employees
)
WHERE rn <= 3;
-- 方法2:ARRAY JOIN(更高效)
SELECT
dept_id,
arrayJoin(topK(3)(salary)) AS top_salary,
groupArray(emp_id) AS employees
FROM employees
GROUP BY dept_id;
2. 多字段分组TOP-N
场景:每个区域、每类产品的销售额Top2
-- MySQL / Hive / MaxCompute / Hologres
SELECT *
FROM (
SELECT
region,
category,
product_id,
sales_amount,
ROW_NUMBER() OVER (
PARTITION BY region, category
ORDER BY sales_amount DESC
) AS rn
FROM product_sales
) t
WHERE rn <= 2;
输出示例:
| region | category | product_id | sales_amount | rn |
|---|---|---|---|---|
| 华东 | 电子 | P001 | 50000 | 1 |
| 华东 | 电子 | P002 | 45000 | 2 |
| 华东 | 服装 | P003 | 30000 | 1 |
| 华北 | 电子 | P004 | 40000 | 1 |
3. TOP-N + 聚合
场景:每个部门Top3员工的薪资总和
-- MySQL 8.0+
SELECT
dept_id,
SUM(salary) AS top3_salary_sum
FROM (
SELECT
dept_id,
salary,
ROW_NUMBER() OVER (
PARTITION BY dept_id
ORDER BY salary DESC
) AS rn
FROM employees
) t
WHERE rn <= 3
GROUP BY dept_id;
Oracle
SELECT
dept_id,
SUM(salary) AS top3_salary_sum
FROM (
SELECT
dept_id,
salary,
ROW_NUMBER() OVER (
PARTITION BY dept_id
ORDER BY salary DESC
) AS rn
FROM employees
)
WHERE rn <= 3
GROUP BY dept_id;
4. RANK版本(允许并列)
场景:每个班级成绩前5名(允许并列)
-- MySQL / Hive / MaxCompute / Hologres
-- 使用RANK()而不是ROW_NUMBER(),允许并列
SELECT *
FROM (
SELECT
class_id,
student_id,
score,
RANK() OVER (
PARTITION BY class_id
ORDER BY score DESC
) AS rk
FROM exam_scores
) t
WHERE rk <= 5;
输出示例:
| class_id | student_id | score | rk |
|---|---|---|---|
| 1班 | S001 | 100 | 1 |
| 1班 | S002 | 95 | 2 |
| 1班 | S003 | 95 | 2 |
| 1班 | S004 | 90 | 4 |
| 1班 | S005 | 90 | 4 |
5. 分组取最大/最小一条
场景:每个用户最近一笔订单
-- MySQL / Hive / MaxCompute / Hologres
SELECT *
FROM (
SELECT
user_id,
order_id,
order_date,
amount,
ROW_NUMBER() OVER (
PARTITION BY user_id
ORDER BY order_date DESC
) AS rn
FROM orders
) t
WHERE rn = 1;
Oracle
SELECT *
FROM (
SELECT
user_id,
order_id,
order_date,
amount,
ROW_NUMBER() OVER (
PARTITION BY user_id
ORDER BY order_date DESC
) AS rn
FROM orders
)
WHERE rn = 1;
ClickHouse
SELECT
user_id,
argMax(order_id, order_date) AS latest_order_id,
max(order_date) AS latest_date,
max(amount) AS amount
FROM orders
GROUP BY user_id;
6. 分组TOP-N + 关联查询
场景:获取每个区域销售冠军的详细信息
-- MySQL / Hive
WITH top_sales AS (
SELECT
region,
salesperson,
ROW_NUMBER() OVER (
PARTITION BY region
ORDER BY sales_amount DESC
) AS rn
FROM sales
WHERE YEAR(sale_date) = 2024
)
SELECT
r.region_name,
t.salesperson,
t.sales_amount
FROM top_sales t
JOIN regions r ON t.region = r.region_id
WHERE t.rn = 1;
Oracle
SELECT
r.region_name,
t.salesperson,
t.sales_amount
FROM (
SELECT
region,
salesperson,
sales_amount,
ROW_NUMBER() OVER (
PARTITION BY region
ORDER BY sales_amount DESC
) AS rn
FROM sales
WHERE EXTRACT(YEAR FROM sale_date) = 2024
) t
JOIN regions r ON t.region = r.region_id
WHERE t.rn = 1;
7. 动态TOP-N
场景:根据参数取TopN
-- MySQL 8.0+
SET @top_n = 5;
SELECT *
FROM (
SELECT
category,
product_id,
sales_amount,
ROW_NUMBER() OVER (
PARTITION BY category
ORDER BY sales_amount DESC
) AS rn
FROM product_sales
) t
WHERE rn <= @top_n;
Hive(使用变量)
SET hivevar:top_n=5;
SELECT *
FROM (
SELECT
category,
product_id,
sales_amount,
ROW_NUMBER() OVER (
PARTITION BY category
ORDER BY sales_amount DESC
) AS rn
FROM product_sales
) t
WHERE rn <= ${hivevar:top_n};
8. MAXCOMPUTE特殊语法
使用DISTRIBUTE BY + SORT BY
-- MaxCompute特有的高效写法
SELECT
dept_id,
emp_id,
salary
FROM employees
DISTRIBUTE BY dept_id
SORT BY salary DESC
LIMIT 3 PER GROUP;
使用SAMPLE取样TOP-N
-- 先取样再排序
SELECT
category,
product_id,
sales_amount
FROM product_sales
SAMPLE (1000) -- 采样1000条
DISTRIBUTE BY category
SORT BY sales_amount DESC;
9. ClickHouse特殊技巧
使用ARRAY + ARRAYJOIN
-- 获取每个分区的Top3
SELECT
dept_id,
arrayJoin(topK(3)(salary)) AS top_salary,
groupArray(emp_id) FILTER (salary IN topK(3)(salary)) AS top_employees
FROM employees
GROUP BY dept_id;
使用LIMIT BY
-- ClickHouse特有语法
SELECT
dept_id,
emp_id,
salary
FROM employees
ORDER BY dept_id, salary DESC
LIMIT 3 BY dept_id;
⚠️ 注意事项
1. 性能优化
-- ✅ 建议:在子查询中先过滤数据
SELECT *
FROM (
SELECT
emp_id,
dept_id,
salary,
ROW_NUMBER() OVER (
PARTITION BY dept_id
ORDER BY salary DESC
) AS rn
FROM employees
WHERE hire_date >= '2023-01-01' -- 先过滤
) t
WHERE rn <= 3;
-- ❌ 避免:先排序所有数据再过滤
SELECT *
FROM (
SELECT
emp_id,
dept_id,
salary,
ROW_NUMBER() OVER (
PARTITION BY dept_id
ORDER BY salary DESC
) AS rn
FROM employees
) t
WHERE rn <= 3 AND hire_date >= '2023-01-01'; -- 过滤在窗口计算后
2. 索引利用
-- 确保PARTITION BY和ORDER BY字段有索引
-- MySQL
CREATE INDEX idx_emp_dept_salary ON employees(dept_id, salary DESC);
3. 数据量大时
-- 大数据量建议使用 DISTRIBUTE BY + SORT BY(Hive/MaxCompute)
SELECT /*+ MAPJOIN(dim) */
t.*
FROM (
SELECT
dept_id,
emp_id,
salary,
ROW_NUMBER() OVER (
PARTITION BY dept_id
ORDER BY salary DESC
) AS rn
FROM employees
WHERE dt = '2024-01-14'
) t
JOIN dim_dept d ON t.dept_id = d.dept_id
WHERE t.rn <= 10;
📝 练习题
题目:电商订单分析
-- 建表
CREATE TABLE orders (
order_id INT,
user_id INT,
category STRING,
region STRING,
amount DECIMAL(10,2),
order_date DATE
);
-- 插入数据
INSERT INTO orders VALUES
(1, 101, '电子产品', '华东', 5000, '2024-01-15'),
(2, 102, '电子产品', '华东', 3000, '2024-01-15'),
(3, 103, '服装', '华东', 800, '2024-01-15'),
(4, 104, '电子产品', '华北', 4500, '2024-01-15'),
(5, 101, '服装', '华东', 600, '2024-01-16'),
(6, 102, '服装', '华东', 1200, '2024-01-16'),
(7, 105, '电子产品', '华北', 5500, '2024-01-16'),
(8, 106, '食品', '华东', 300, '2024-01-16');
要求:
- 找出每个区域消费金额最高的前2名用户
- 找出每个类别订单数量最多的前2个区域
- 找出每天消费最高的订单
参考SQL:
-- 1. 每个区域消费最高的前2名用户
SELECT *
FROM (
SELECT
region,
user_id,
SUM(amount) AS total_amount,
ROW_NUMBER() OVER (
PARTITION BY region
ORDER BY SUM(amount) DESC
) AS rn
FROM orders
GROUP BY region, user_id
) t
WHERE rn <= 2;
-- 2. 每个类别订单最多的前2个区域
SELECT *
FROM (
SELECT
category,
region,
COUNT(*) AS order_count,
ROW_NUMBER() OVER (
PARTITION BY category
ORDER BY COUNT(*) DESC
) AS rn
FROM orders
GROUP BY category, region
) t
WHERE rn <= 2;
-- 3. 每天消费最高的订单
SELECT *
FROM (
SELECT
order_date,
order_id,
user_id,
amount,
RANK() OVER (
PARTITION BY order_date
ORDER BY amount DESC
) AS rk
FROM orders
) t
WHERE rk = 1;
順子の杂货铺


