顺子の杂货铺
生命不息,折腾不止,且行且珍惜~

14-多行合并

DMIT VPS

多行合并

将多行数据合并为一行是数据处理中的常见需求,本篇介绍各数据库的实现方法。

📊 支持情况

数据库 GROUP_CONCAT WM_CONCAT LISTAGG STRING_AGG collect_set/list
MySQL ✅ 8.0+
Oracle
ClickHouse groupArray
Hologres string_agg
MaxCompute concat_ws+collect_set
Hive collect_set/collect_list

1. MySQL GROUP_CONCAT

基本用法

-- 将用户名合并
SELECT
    dept_id,
    GROUP_CONCAT(employee_name) AS employees
FROM employees
GROUP BY dept_id;
-- 输出: 1 张三,李四,王五

-- 指定分隔符
SELECT
    dept_id,
    GROUP_CONCAT(employee_name SEPARATOR ' | ') AS employees
FROM employees
GROUP BY dept_id;
-- 输出: 1 张三 | 李四 | 王五

去重合并

-- 去除重复值
SELECT
    dept_id,
    GROUP_CONCAT(DISTINCT employee_name ORDER BY employee_name) AS employees
FROM employees
GROUP BY dept_id;

排序合并

-- 按薪资降序排列
SELECT
    dept_id,
    GROUP_CONCAT(
        employee_name ORDER BY salary DESC SEPARATOR ', '
    ) AS employees_by_salary
FROM employees
GROUP BY dept_id;

限制长度

-- 设置最大长度
SELECT
    GROUP_CONCAT(employee_name SEPARATOR ', ') AS employees
FROM employees
GROUP BY dept_id;

-- 查看设置
SELECT @@group_concat_max_len;
SET SESSION group_concat_max_len = 10000;

2. Oracle LISTAGG

基本用法

SELECT
    dept_id,
    LISTAGG(employee_name, ', ') WITHIN GROUP (ORDER BY employee_name) AS employees
FROM employees
GROUP BY dept_id;

带回退(Oracle 11g+)

-- OVER子句用于分析函数
SELECT
    dept_id,
    employee_name,
    salary,
    LISTAGG(employee_name, ', ') WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY dept_id) AS employees
FROM employees;

ON OVERFLOW TRUNCATE(Oracle 12c+)

-- 处理溢出(超过4000字符)
SELECT
    LISTAGG(employee_name, ', ')
        WITHIN GROUP (ORDER BY employee_name)
        ON OVERFLOW TRUNCATE '...'
    AS employees
FROM large_table;

3. ClickHouse groupArray

基本用法

SELECT
    dept_id,
    groupArray(employee_name) AS employees
FROM employees
GROUP BY dept_id;
-- 输出: 1 ['张三','李四','王五']

指定分隔符

SELECT
    dept_id,
    arrayStringConcat(groupArray(employee_name), ', ') AS employees
FROM employees
GROUP BY dept_id;

去重

SELECT
    dept_id,
    arrayStringConcat(groupArray(DISTINCT employee_name), ', ') AS employees
FROM employees
GROUP BY dept_id;

4. Hive / MaxCompute collect_set

Hive collect_set

-- 收集为数组(去重)
SELECT
    dept_id,
    collect_set(employee_name) AS employees
FROM employees
GROUP BY dept_id;
-- 输出: ["张三","李四","王五"]

-- 收集为数组(不去重)
SELECT
    dept_id,
    collect_list(employee_name) AS employees
FROM employees
GROUP BY dept_id;

转换为字符串

-- concat_ws + collect_set
SELECT
    dept_id,
    concat_ws(',', collect_set(employee_name)) AS employees
FROM employees
GROUP BY dept_id;

-- 带排序
SELECT
    dept_id,
    concat_ws(',', sort_array(collect_set(employee_name))) AS employees
FROM employees
GROUP BY dept_id;

5. Hologres / PostgreSQL string_agg

string_agg函数

SELECT
    dept_id,
    string_agg(employee_name, ', ' ORDER BY employee_name) AS employees
FROM employees
GROUP BY dept_id;

6. 实战案例

场景1:订单商品列表

-- MySQL
SELECT
    order_id,
    GROUP_CONCAT(
        CONCAT(product_name, ' x', quantity)
        ORDER BY product_name
        SEPARATOR ', '
    ) AS products
FROM order_items
GROUP BY order_id;
-- 输出: iPhone x2, MacBook x1

场景2:用户标签合并

-- Hive
SELECT
    user_id,
    concat_ws('|', collect_set(tag_name)) AS user_tags
FROM user_tag_mapping
GROUP BY user_id;

场景3:部门员工邮箱列表

-- Oracle
SELECT
    d.dept_name,
    LISTAGG(e.email, '; ') WITHIN GROUP (ORDER BY e.name) AS emails
FROM departments d
JOIN employees e ON d.dept_id = e.dept_id
GROUP BY d.dept_name;

场景4:多级分类路径

-- MySQL - 树形路径
SELECT
    cat_id,
    cat_name,
    GROUP_CONCAT(parent_name ORDER BY level SEPARATOR ' > ') AS path
FROM (
    SELECT
        c1.id AS cat_id,
        c1.name AS cat_name,
        c2.name AS parent_name,
        1 AS level
    FROM categories c1
    LEFT JOIN categories c2 ON c1.parent_id = c2.id
    UNION ALL
    SELECT
        c1.id AS cat_id,
        c1.name AS cat_name,
        c3.name AS parent_name,
        2 AS level
    FROM categories c1
    LEFT JOIN categories c2 ON c1.parent_id = c2.id
    LEFT JOIN categories c3 ON c2.parent_id = c3.id
) t
GROUP BY cat_id, cat_name;

7. 嵌套合并

多级合并

-- MySQL - 先合并子分类,再合并父分类
SELECT
    parent_id,
    GROUP_CONCAT(DISTINCT child_name ORDER BY child_name) AS children
FROM (
    SELECT
        c1.id AS parent_id,
        c2.name AS child_name
    FROM categories c1
    JOIN categories c2 ON c2.parent_id = c1.id
) t
GROUP BY parent_id;

⚠️ 注意事项

1. 长度限制

数据库 默认限制 最大限制
MySQL 1024 可配置(group_concat_max_len)
Oracle 4000(VARCHAR2) 32767
ClickHouse 无限制 内存限制
Hive 无限制 内存限制

2. NULL值处理

-- MySQL - NULL被跳过
SELECT GROUP_CONCAT(name) FROM (SELECT NULL AS name UNION SELECT 'a') t;
-- 输出: a

-- Hive - NULL被跳过
SELECT concat_ws(',', collect_set(col)) FROM (SELECT NULL AS col UNION SELECT 'a') t;
-- 输出: a

3. 性能问题

-- ✅ 建议:先过滤再合并
SELECT
    dept_id,
    GROUP_CONCAT(employee_name)
FROM employees
WHERE status = 'active'  -- 先过滤
GROUP BY dept_id;

-- ❌ 避免:合并后再过滤
SELECT
    dept_id,
    GROUP_CONCAT(employee_name)
FROM employees
GROUP BY dept_id
HAVING COUNT(*) > 5;

📝 练习题

-- 建表
CREATE TABLE order_items (
    order_id INT,
    product_name STRING,
    quantity INT,
    price DECIMAL(10,2)
);

INSERT INTO order_items VALUES
(1, 'iPhone 15', 2, 7999),
(1, 'AirPods', 1, 1499),
(2, 'MacBook Pro', 1, 14999),
(2, 'iPad Air', 2, 4399),
(3, 'iPhone 15', 1, 7999);

要求

  1. 查询每个订单的商品列表
  2. 查询每个商品的订单列表
  3. 计算每种商品的销售数量(合并时计算)
-- 参考答案(MySQL)
-- 1. 订单商品列表
SELECT
    order_id,
    GROUP_CONCAT(
        CONCAT(product_name, '(x', quantity, ')')
        ORDER BY product_name
        SEPARATOR ', '
    ) AS products
FROM order_items
GROUP BY order_id;

-- 2. 商品订单列表
SELECT
    product_name,
    GROUP_CONCAT(
        CONCAT('订单#', order_id, '(x', quantity, ')')
        ORDER BY order_id
        SEPARATOR ', '
    ) AS orders
FROM order_items
GROUP BY product_name;

-- 3. 带聚合的合并
SELECT
    order_id,
    GROUP_CONCAT(
        CONCAT(product_name, ' x', quantity)
        ORDER BY product_name
    ) AS products,
    SUM(quantity) AS total_items,
    SUM(quantity * price) AS total_amount
FROM order_items
GROUP BY order_id;

8. SORT_ARRAY – 数组排序

Hive / MaxCompute 数组排序

-- 基本排序
SELECT SORT_ARRAY(ARRAY(3, 1, 2));  -- [1, 2, 3]

-- 带排序的合并
SELECT
    dept_id,
    SORT_ARRAY(COLLECT_SET(employee_name)) AS employees
FROM employees
GROUP BY dept_id;

-- 降序排序
SELECT
    SORT_ARRAY(ARRAY(3, 1, 2), FALSE) AS employees_desc;  -- [3, 2, 1]

Oracle 排序

-- Oracle 12c+ 支持数组排序
SELECT
    dept_id,
    LISTAGG(employee_name, ', ') WITHIN GROUP (ORDER BY employee_name DESC) AS employees
FROM employees
GROUP BY dept_id;

9. 列转行 – lateral view explode

Hive / MaxCompute 列转行

-- 原始数据:每个用户有多个标签
-- user_id | tags
-- 1       | ["vip","male"]
-- 2       | ["female","young"]

-- 使用 lateral view + explode
SELECT
    user_id,
    tag
FROM user_tags
LATERAL VIEW EXPLODE(tags) t AS tag;

-- 输出:
-- user_id | tag
-- 1       | vip
-- 1       | male
-- 2       | female
-- 2       | young

posexplode – 带位置展开

-- posexplode 返回位置索引和值
SELECT
    user_id,
    pos,
    tag
FROM user_tags
LATERAL VIEW POSEXPLODE(tags) t AS pos, tag;

-- 输出:
-- user_id | pos | tag
-- 1       | 0   | vip
-- 1       | 1   | male

outer + explode – 保留空数组

-- 普通explode会过滤空数组
-- 使用outer保留
SELECT
    user_id,
    tag
FROM user_tags
LATERAL VIEW OUTER EXPLODE(tags) t AS tag;

10. 复杂类型展开

展开 Map

-- 原始数据:属性map
-- user_id | attributes
-- 1       | {"city":"北京","age":25}

SELECT
    user_id,
    attr_key,
    attr_value
FROM user_tags
LATERAL VIEW EXPLODE(attributes) t AS attr_key, attr_value;

-- 输出:
-- user_id | attr_key | attr_value
-- 1       | city     | 北京
-- 1       | age      | 25

展开 Struct

-- 假设有STRUCT类型
SELECT
    user_id,
    name,
    age
FROM users
LATERAL VIEW EXPLODE(address) t AS addr;

-- 或者使用 json_tuple
SELECT
    user_id,
    GET_JSON_OBJECT(info, '$.name') AS name,
    GET_JSON_OBJECT(info, '$.age') AS age
FROM users;

11. PostgreSQL / Hologres UNNEST

-- PostgreSQL/Hologres UNNEST展开数组
SELECT
    user_id,
    UNNEST(tags) AS tag
FROM user_tags;

-- 展开多列
SELECT
    user_id,
    UNNEST(tags) WITH ORDINALITY AS t(tag, idx);

WITH ORDINALITY – 带序号

-- PostgreSQL/Hologres
SELECT
    user_id,
    tag,
    idx
FROM user_tags
CROSS JOIN UNNEST(tags) WITH ORDINALITY AS t(tag, idx);

-- 输出:
-- user_id | tag    | idx
-- 1       | vip    | 1
-- 1       | male   | 2

12. ClickHouse 数组展开

-- arrayJoin - 展开数组
SELECT
    user_id,
    arrayJoin(tags) AS tag
FROM user_tags;

-- arrayJoin带序号
SELECT
    user_id,
    arrayEnumerate(tags) AS idx,
    tags[idx] AS tag
FROM user_tags;

-- 展开多维数组
SELECT
    user_id,
    arrayJoin(arrays) AS element
FROM multi_arrays;

13. 组合使用:展开 + 聚合

场景:计算每个标签的用户数

-- Hive - 先展开再聚合
SELECT
    tag,
    COUNT(DISTINCT user_id) AS user_count
FROM user_tags
LATERAL VIEW EXPLODE(tags) t AS tag
GROUP BY tag;

场景:带序号的展开

-- Hive - 按顺序处理
SELECT
    user_id,
    tag,
    ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY pos) AS tag_order
FROM (
    SELECT
        user_id,
        tag,
        POSEXPLODE(tags) AS (pos, tag)
    FROM user_tags
) t;

场景:多列展开

-- Hive - 展开多个数组列
SELECT
    user_id,
    tag,
    score
FROM user_tags
LATERAL VIEW EXPLODE(tags) t1 AS tag
LATERAL VIEW EXPLODE(scores) t2 AS score;

14. WITHIN GROUP 详解

Oracle WITHIN GROUP (ORDER BY)

-- 聚合时排序
SELECT
    dept_id,
    LISTAGG(employee_name, ', ')
        WITHIN GROUP (ORDER BY salary DESC, employee_name) AS employees_by_salary
FROM employees
GROUP BY dept_id;

-- STRING_AGG(PostgreSQL/Hologres)
SELECT
    dept_id,
    STRING_AGG(employee_name, ', ' ORDER BY salary DESC) AS employees
FROM employees
GROUP BY dept_id;

带过滤条件的排序

-- 只合并特定条件的值
SELECT
    dept_id,
    LISTAGG(
        CASE WHEN salary > 10000 THEN employee_name END, ', '
    ) WITHIN GROUP (ORDER BY employee_name) AS high_earners
FROM employees
GROUP BY dept_id;

15. 实战综合案例

场景1:订单商品展开与汇总

-- Hive
SELECT
    order_id,
    product_name,
    quantity
FROM orders
LATERAL VIEW OUTER EXPLODE(items) t AS item
LATERAL VIEW OUTER EXPLODE(item.products) p AS product_name
LATERAL VIEW OUTER EXPLODE(item.quantities) q AS quantity;

场景2:用户行为路径分析

-- 按用户展开行为序列
SELECT
    user_id,
    event_order,
    event_type,
    event_time
FROM (
    SELECT
        user_id,
        EVENTS,
        POSEXPLODE(EVENTS) AS (event_order, event)
    FROM user_sessions
) t
LATERAL VIEW OUTER EXPLODE(event.event_type) et AS event_type
LATERAL VIEW OUTER EXPLODE(event.event_time) et AS event_time;

场景3:多维度标签统计

-- Hive - 展开多个标签列并统计
SELECT
    tag_type,
    tag_value,
    COUNT(DISTINCT user_id) AS user_count
FROM (
    SELECT
        user_id,
        'interest' AS tag_type,
        tag AS tag_value
    FROM users
    LATERAL VIEW EXPLODE(interests) t AS tag
    UNION ALL
    SELECT
        user_id,
        'habit' AS tag_type,
        habit AS tag_value
    FROM users
    LATERAL VIEW EXPLODE(habits) t AS habit
) all_tags
GROUP BY tag_type, tag_value;

⚠️ 注意事项

1. 性能影响

-- ❌ 大数据量展开可能导致数据膨胀
SELECT COUNT(*) FROM (SELECT ... LATERAL VIEW EXPLODE ...) t;

-- ✅ 建议:先过滤再展开
SELECT
    user_id,
    tag
FROM (
    SELECT * FROM user_tags WHERE dt = '2024-01-14'
) t
LATERAL VIEW EXPLODE(tags) t AS tag;

2. 内存使用

-- 展开大数组可能内存溢出
-- 使用分布式处理
SET hive.map.aggr=true;
SET hive.groupby.mapaggr.checkinterval=100000;

3. NULL和空数组

-- EXPLODE会过滤NULL和空数组
-- 使用LATERAL VIEW OUTER保留
SELECT
    user_id,
    tag
FROM users
LATERAL VIEW OUTER EXPLODE(tags) t AS tag;

📝 练习题

-- 建表
CREATE TABLE user_tags (
    user_id INT,
    interests ARRAY,
    habits ARRAY,
    scores ARRAY
);

INSERT INTO user_tags VALUES
(1, ['reading', 'music', 'sports'], ['early_bird', 'coffee'], [85, 90, 78]),
(2, ['gaming', 'music'], ['night_owl'], [70, 65]),
(3, ['reading'], ['early_bird'], [92]);

要求

  1. 展开用户兴趣列表
  2. 计算每个兴趣的用户数
  3. 展开所有标签(兴趣+习惯)
  4. 按用户展开分数并排序
-- 参考答案(Hive)
-- 1. 展开兴趣列表
SELECT user_id, interest
FROM user_tags
LATERAL VIEW EXPLODE(interests) t AS interest;

-- 2. 兴趣用户统计
SELECT
    interest,
    COUNT(DISTINCT user_id) AS user_count
FROM user_tags
LATERAL VIEW EXPLODE(interests) t AS interest
GROUP BY interest;

-- 3. 展开所有标签
SELECT user_id, tag FROM (
    SELECT user_id, interest AS tag FROM user_tags LATERAL VIEW EXPLODE(interests) t AS interest
    UNION ALL
    SELECT user_id, habit AS tag FROM user_tags LATERAL VIEW EXPLODE(habits) t AS habit
) all_tags;

-- 4. 展开分数并排序
SELECT
    user_id,
    score,
    ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY score DESC) AS score_rank
FROM (
    SELECT user_id, score
    FROM user_tags
    LATERAL VIEW EXPLODE(scores) t AS score
) t;
赞(0)
未经允许不得转载:順子の杂货铺 » 14-多行合并
搬瓦工VPS

评论 抢沙发

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址

分享创造快乐

联系我们联系我们