顺子の杂货铺
生命不息,折腾不止,且行且珍惜~

04-分组TOP-N查询

DMIT VPS

分组TOP-N查询

分组TOP-N是数据分析中的高频需求,用于从每个分组中取出排名前N的记录。

📊 支持情况

数据库 支持情况 备注
MySQL ✅ 完整支持 8.0+窗口函数
Oracle ✅ 完整支持 ROWNUM或窗口函数
ClickHouse ✅ 完整支持 ALL/GROUP BY技巧
Hologres ✅ 完整支持 窗口函数
MaxCompute ✅ 完整支持 窗口函数
Hive ✅ 完整支持 窗口函数

1. 基本分组TOP-N

场景:每个部门薪资最高的前3名员工

MySQL 8.0+

-- 方法1:窗口函数
SELECT *
FROM (
    SELECT
        emp_id,
        dept_id,
        salary,
        ROW_NUMBER() OVER (
            PARTITION BY dept_id
            ORDER BY salary DESC
        ) AS rn
    FROM employees
) t
WHERE rn <= 3;

Oracle

-- 方法1:窗口函数(推荐)
SELECT *
FROM (
    SELECT
        emp_id,
        dept_id,
        salary,
        ROW_NUMBER() OVER (
            PARTITION BY dept_id
            ORDER BY salary DESC
        ) AS rn
    FROM employees
)
WHERE rn <= 3;

-- 方法2:ROWNUM(旧版本)
SELECT *
FROM (
    SELECT emp_id, dept_id, salary
    FROM employees
    ORDER BY salary DESC
)
WHERE ROWNUM <= 3;

Hive / MaxCompute / Hologres

SELECT *
FROM (
    SELECT
        emp_id,
        dept_id,
        salary,
        ROW_NUMBER() OVER (
            PARTITION BY dept_id
            ORDER BY salary DESC
        ) AS rn
    FROM employees
) t
WHERE rn <= 3;

ClickHouse

-- 方法1:窗口函数
SELECT *
FROM (
    SELECT
        emp_id,
        dept_id,
        salary,
        row_number() OVER (
            PARTITION BY dept_id
            ORDER BY salary DESC
        ) AS rn
    FROM employees
)
WHERE rn <= 3;

-- 方法2:ARRAY JOIN(更高效)
SELECT
    dept_id,
    arrayJoin(topK(3)(salary)) AS top_salary,
    groupArray(emp_id) AS employees
FROM employees
GROUP BY dept_id;

2. 多字段分组TOP-N

场景:每个区域、每类产品的销售额Top2

-- MySQL / Hive / MaxCompute / Hologres
SELECT *
FROM (
    SELECT
        region,
        category,
        product_id,
        sales_amount,
        ROW_NUMBER() OVER (
            PARTITION BY region, category
            ORDER BY sales_amount DESC
        ) AS rn
    FROM product_sales
) t
WHERE rn <= 2;

输出示例:

region category product_id sales_amount rn
华东 电子 P001 50000 1
华东 电子 P002 45000 2
华东 服装 P003 30000 1
华北 电子 P004 40000 1

3. TOP-N + 聚合

场景:每个部门Top3员工的薪资总和

-- MySQL 8.0+
SELECT
    dept_id,
    SUM(salary) AS top3_salary_sum
FROM (
    SELECT
        dept_id,
        salary,
        ROW_NUMBER() OVER (
            PARTITION BY dept_id
            ORDER BY salary DESC
        ) AS rn
    FROM employees
) t
WHERE rn <= 3
GROUP BY dept_id;

Oracle

SELECT
    dept_id,
    SUM(salary) AS top3_salary_sum
FROM (
    SELECT
        dept_id,
        salary,
        ROW_NUMBER() OVER (
            PARTITION BY dept_id
            ORDER BY salary DESC
        ) AS rn
    FROM employees
)
WHERE rn <= 3
GROUP BY dept_id;

4. RANK版本(允许并列)

场景:每个班级成绩前5名(允许并列)

-- MySQL / Hive / MaxCompute / Hologres
-- 使用RANK()而不是ROW_NUMBER(),允许并列
SELECT *
FROM (
    SELECT
        class_id,
        student_id,
        score,
        RANK() OVER (
            PARTITION BY class_id
            ORDER BY score DESC
        ) AS rk
    FROM exam_scores
) t
WHERE rk <= 5;

输出示例:

class_id student_id score rk
1班 S001 100 1
1班 S002 95 2
1班 S003 95 2
1班 S004 90 4
1班 S005 90 4

5. 分组取最大/最小一条

场景:每个用户最近一笔订单

-- MySQL / Hive / MaxCompute / Hologres
SELECT *
FROM (
    SELECT
        user_id,
        order_id,
        order_date,
        amount,
        ROW_NUMBER() OVER (
            PARTITION BY user_id
            ORDER BY order_date DESC
        ) AS rn
    FROM orders
) t
WHERE rn = 1;

Oracle

SELECT *
FROM (
    SELECT
        user_id,
        order_id,
        order_date,
        amount,
        ROW_NUMBER() OVER (
            PARTITION BY user_id
            ORDER BY order_date DESC
        ) AS rn
    FROM orders
)
WHERE rn = 1;

ClickHouse

SELECT
    user_id,
    argMax(order_id, order_date) AS latest_order_id,
    max(order_date) AS latest_date,
    max(amount) AS amount
FROM orders
GROUP BY user_id;

6. 分组TOP-N + 关联查询

场景:获取每个区域销售冠军的详细信息

-- MySQL / Hive
WITH top_sales AS (
    SELECT
        region,
        salesperson,
        ROW_NUMBER() OVER (
            PARTITION BY region
            ORDER BY sales_amount DESC
        ) AS rn
    FROM sales
    WHERE YEAR(sale_date) = 2024
)
SELECT
    r.region_name,
    t.salesperson,
    t.sales_amount
FROM top_sales t
JOIN regions r ON t.region = r.region_id
WHERE t.rn = 1;

Oracle

SELECT
    r.region_name,
    t.salesperson,
    t.sales_amount
FROM (
    SELECT
        region,
        salesperson,
        sales_amount,
        ROW_NUMBER() OVER (
            PARTITION BY region
            ORDER BY sales_amount DESC
        ) AS rn
    FROM sales
    WHERE EXTRACT(YEAR FROM sale_date) = 2024
) t
JOIN regions r ON t.region = r.region_id
WHERE t.rn = 1;

7. 动态TOP-N

场景:根据参数取TopN

-- MySQL 8.0+
SET @top_n = 5;

SELECT *
FROM (
    SELECT
        category,
        product_id,
        sales_amount,
        ROW_NUMBER() OVER (
            PARTITION BY category
            ORDER BY sales_amount DESC
        ) AS rn
    FROM product_sales
) t
WHERE rn <= @top_n;

Hive(使用变量)

SET hivevar:top_n=5;

SELECT *
FROM (
    SELECT
        category,
        product_id,
        sales_amount,
        ROW_NUMBER() OVER (
            PARTITION BY category
            ORDER BY sales_amount DESC
        ) AS rn
    FROM product_sales
) t
WHERE rn <= ${hivevar:top_n};

8. MAXCOMPUTE特殊语法

使用DISTRIBUTE BY + SORT BY

-- MaxCompute特有的高效写法
SELECT
    dept_id,
    emp_id,
    salary
FROM employees
DISTRIBUTE BY dept_id
SORT BY salary DESC
LIMIT 3 PER GROUP;

使用SAMPLE取样TOP-N

-- 先取样再排序
SELECT
    category,
    product_id,
    sales_amount
FROM product_sales
SAMPLE (1000)  -- 采样1000条
DISTRIBUTE BY category
SORT BY sales_amount DESC;

9. ClickHouse特殊技巧

使用ARRAY + ARRAYJOIN

-- 获取每个分区的Top3
SELECT
    dept_id,
    arrayJoin(topK(3)(salary)) AS top_salary,
    groupArray(emp_id) FILTER (salary IN topK(3)(salary)) AS top_employees
FROM employees
GROUP BY dept_id;

使用LIMIT BY

-- ClickHouse特有语法
SELECT
    dept_id,
    emp_id,
    salary
FROM employees
ORDER BY dept_id, salary DESC
LIMIT 3 BY dept_id;

⚠️ 注意事项

1. 性能优化

-- ✅ 建议:在子查询中先过滤数据
SELECT *
FROM (
    SELECT
        emp_id,
        dept_id,
        salary,
        ROW_NUMBER() OVER (
            PARTITION BY dept_id
            ORDER BY salary DESC
        ) AS rn
    FROM employees
    WHERE hire_date >= '2023-01-01'  -- 先过滤
) t
WHERE rn <= 3;

-- ❌ 避免:先排序所有数据再过滤
SELECT *
FROM (
    SELECT
        emp_id,
        dept_id,
        salary,
        ROW_NUMBER() OVER (
            PARTITION BY dept_id
            ORDER BY salary DESC
        ) AS rn
    FROM employees
) t
WHERE rn <= 3 AND hire_date >= '2023-01-01';  -- 过滤在窗口计算后

2. 索引利用

-- 确保PARTITION BY和ORDER BY字段有索引
-- MySQL
CREATE INDEX idx_emp_dept_salary ON employees(dept_id, salary DESC);

3. 数据量大时

-- 大数据量建议使用 DISTRIBUTE BY + SORT BY(Hive/MaxCompute)
SELECT /*+ MAPJOIN(dim) */
    t.*
FROM (
    SELECT
        dept_id,
        emp_id,
        salary,
        ROW_NUMBER() OVER (
            PARTITION BY dept_id
            ORDER BY salary DESC
        ) AS rn
    FROM employees
    WHERE dt = '2024-01-14'
) t
JOIN dim_dept d ON t.dept_id = d.dept_id
WHERE t.rn <= 10;

📝 练习题

题目:电商订单分析

-- 建表
CREATE TABLE orders (
    order_id INT,
    user_id INT,
    category STRING,
    region STRING,
    amount DECIMAL(10,2),
    order_date DATE
);

-- 插入数据
INSERT INTO orders VALUES
(1, 101, '电子产品', '华东', 5000, '2024-01-15'),
(2, 102, '电子产品', '华东', 3000, '2024-01-15'),
(3, 103, '服装', '华东', 800, '2024-01-15'),
(4, 104, '电子产品', '华北', 4500, '2024-01-15'),
(5, 101, '服装', '华东', 600, '2024-01-16'),
(6, 102, '服装', '华东', 1200, '2024-01-16'),
(7, 105, '电子产品', '华北', 5500, '2024-01-16'),
(8, 106, '食品', '华东', 300, '2024-01-16');

要求

  1. 找出每个区域消费金额最高的前2名用户
  2. 找出每个类别订单数量最多的前2个区域
  3. 找出每天消费最高的订单

参考SQL

-- 1. 每个区域消费最高的前2名用户
SELECT *
FROM (
    SELECT
        region,
        user_id,
        SUM(amount) AS total_amount,
        ROW_NUMBER() OVER (
            PARTITION BY region
            ORDER BY SUM(amount) DESC
        ) AS rn
    FROM orders
    GROUP BY region, user_id
) t
WHERE rn <= 2;

-- 2. 每个类别订单最多的前2个区域
SELECT *
FROM (
    SELECT
        category,
        region,
        COUNT(*) AS order_count,
        ROW_NUMBER() OVER (
            PARTITION BY category
            ORDER BY COUNT(*) DESC
        ) AS rn
    FROM orders
    GROUP BY category, region
) t
WHERE rn <= 2;

-- 3. 每天消费最高的订单
SELECT *
FROM (
    SELECT
        order_date,
        order_id,
        user_id,
        amount,
        RANK() OVER (
            PARTITION BY order_date
            ORDER BY amount DESC
        ) AS rk
    FROM orders
) t
WHERE rk = 1;
赞(0)
未经允许不得转载:順子の杂货铺 » 04-分组TOP-N查询
搬瓦工VPS

评论 抢沙发

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址

分享创造快乐

联系我们联系我们