多行合并
将多行数据合并为一行是数据处理中的常见需求,本篇介绍各数据库的实现方法。
📊 支持情况
| 数据库 | GROUP_CONCAT | WM_CONCAT | LISTAGG | STRING_AGG | collect_set/list |
|---|---|---|---|---|---|
| MySQL | ✅ | – | – | ✅ 8.0+ | – |
| Oracle | – | – | ✅ | ✅ | – |
| ClickHouse | – | – | – | – | groupArray |
| Hologres | – | – | – | – | string_agg |
| MaxCompute | – | – | – | – | concat_ws+collect_set |
| Hive | – | ✅ | – | – | collect_set/collect_list |
1. MySQL GROUP_CONCAT
基本用法
-- 将用户名合并
SELECT
dept_id,
GROUP_CONCAT(employee_name) AS employees
FROM employees
GROUP BY dept_id;
-- 输出: 1 张三,李四,王五
-- 指定分隔符
SELECT
dept_id,
GROUP_CONCAT(employee_name SEPARATOR ' | ') AS employees
FROM employees
GROUP BY dept_id;
-- 输出: 1 张三 | 李四 | 王五去重合并
-- 去除重复值
SELECT
dept_id,
GROUP_CONCAT(DISTINCT employee_name ORDER BY employee_name) AS employees
FROM employees
GROUP BY dept_id;排序合并
-- 按薪资降序排列
SELECT
dept_id,
GROUP_CONCAT(
employee_name ORDER BY salary DESC SEPARATOR ', '
) AS employees_by_salary
FROM employees
GROUP BY dept_id;限制长度
-- 设置最大长度
SELECT
GROUP_CONCAT(employee_name SEPARATOR ', ') AS employees
FROM employees
GROUP BY dept_id;
-- 查看设置
SELECT @@group_concat_max_len;
SET SESSION group_concat_max_len = 10000;2. Oracle LISTAGG
基本用法
SELECT
dept_id,
LISTAGG(employee_name, ', ') WITHIN GROUP (ORDER BY employee_name) AS employees
FROM employees
GROUP BY dept_id;带回退(Oracle 11g+)
-- OVER子句用于分析函数
SELECT
dept_id,
employee_name,
salary,
LISTAGG(employee_name, ', ') WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY dept_id) AS employees
FROM employees;ON OVERFLOW TRUNCATE(Oracle 12c+)
-- 处理溢出(超过4000字符)
SELECT
LISTAGG(employee_name, ', ')
WITHIN GROUP (ORDER BY employee_name)
ON OVERFLOW TRUNCATE '...'
AS employees
FROM large_table;3. ClickHouse groupArray
基本用法
SELECT
dept_id,
groupArray(employee_name) AS employees
FROM employees
GROUP BY dept_id;
-- 输出: 1 ['张三','李四','王五']指定分隔符
SELECT
dept_id,
arrayStringConcat(groupArray(employee_name), ', ') AS employees
FROM employees
GROUP BY dept_id;去重
SELECT
dept_id,
arrayStringConcat(groupArray(DISTINCT employee_name), ', ') AS employees
FROM employees
GROUP BY dept_id;4. Hive / MaxCompute collect_set
Hive collect_set
-- 收集为数组(去重)
SELECT
dept_id,
collect_set(employee_name) AS employees
FROM employees
GROUP BY dept_id;
-- 输出: ["张三","李四","王五"]
-- 收集为数组(不去重)
SELECT
dept_id,
collect_list(employee_name) AS employees
FROM employees
GROUP BY dept_id;转换为字符串
-- concat_ws + collect_set
SELECT
dept_id,
concat_ws(',', collect_set(employee_name)) AS employees
FROM employees
GROUP BY dept_id;
-- 带排序
SELECT
dept_id,
concat_ws(',', sort_array(collect_set(employee_name))) AS employees
FROM employees
GROUP BY dept_id;5. Hologres / PostgreSQL string_agg
string_agg函数
SELECT
dept_id,
string_agg(employee_name, ', ' ORDER BY employee_name) AS employees
FROM employees
GROUP BY dept_id;6. 实战案例
场景1:订单商品列表
-- MySQL
SELECT
order_id,
GROUP_CONCAT(
CONCAT(product_name, ' x', quantity)
ORDER BY product_name
SEPARATOR ', '
) AS products
FROM order_items
GROUP BY order_id;
-- 输出: iPhone x2, MacBook x1场景2:用户标签合并
-- Hive
SELECT
user_id,
concat_ws('|', collect_set(tag_name)) AS user_tags
FROM user_tag_mapping
GROUP BY user_id;场景3:部门员工邮箱列表
-- Oracle
SELECT
d.dept_name,
LISTAGG(e.email, '; ') WITHIN GROUP (ORDER BY e.name) AS emails
FROM departments d
JOIN employees e ON d.dept_id = e.dept_id
GROUP BY d.dept_name;场景4:多级分类路径
-- MySQL - 树形路径
SELECT
cat_id,
cat_name,
GROUP_CONCAT(parent_name ORDER BY level SEPARATOR ' > ') AS path
FROM (
SELECT
c1.id AS cat_id,
c1.name AS cat_name,
c2.name AS parent_name,
1 AS level
FROM categories c1
LEFT JOIN categories c2 ON c1.parent_id = c2.id
UNION ALL
SELECT
c1.id AS cat_id,
c1.name AS cat_name,
c3.name AS parent_name,
2 AS level
FROM categories c1
LEFT JOIN categories c2 ON c1.parent_id = c2.id
LEFT JOIN categories c3 ON c2.parent_id = c3.id
) t
GROUP BY cat_id, cat_name;7. 嵌套合并
多级合并
-- MySQL - 先合并子分类,再合并父分类
SELECT
parent_id,
GROUP_CONCAT(DISTINCT child_name ORDER BY child_name) AS children
FROM (
SELECT
c1.id AS parent_id,
c2.name AS child_name
FROM categories c1
JOIN categories c2 ON c2.parent_id = c1.id
) t
GROUP BY parent_id;⚠️ 注意事项
1. 长度限制
| 数据库 | 默认限制 | 最大限制 |
|---|---|---|
| MySQL | 1024 | 可配置(group_concat_max_len) |
| Oracle | 4000(VARCHAR2) | 32767 |
| ClickHouse | 无限制 | 内存限制 |
| Hive | 无限制 | 内存限制 |
2. NULL值处理
-- MySQL - NULL被跳过
SELECT GROUP_CONCAT(name) FROM (SELECT NULL AS name UNION SELECT 'a') t;
-- 输出: a
-- Hive - NULL被跳过
SELECT concat_ws(',', collect_set(col)) FROM (SELECT NULL AS col UNION SELECT 'a') t;
-- 输出: a3. 性能问题
-- ✅ 建议:先过滤再合并
SELECT
dept_id,
GROUP_CONCAT(employee_name)
FROM employees
WHERE status = 'active' -- 先过滤
GROUP BY dept_id;
-- ❌ 避免:合并后再过滤
SELECT
dept_id,
GROUP_CONCAT(employee_name)
FROM employees
GROUP BY dept_id
HAVING COUNT(*) > 5;📝 练习题
-- 建表
CREATE TABLE order_items (
order_id INT,
product_name STRING,
quantity INT,
price DECIMAL(10,2)
);
INSERT INTO order_items VALUES
(1, 'iPhone 15', 2, 7999),
(1, 'AirPods', 1, 1499),
(2, 'MacBook Pro', 1, 14999),
(2, 'iPad Air', 2, 4399),
(3, 'iPhone 15', 1, 7999);要求:
- 查询每个订单的商品列表
- 查询每个商品的订单列表
- 计算每种商品的销售数量(合并时计算)
-- 参考答案(MySQL)
-- 1. 订单商品列表
SELECT
order_id,
GROUP_CONCAT(
CONCAT(product_name, '(x', quantity, ')')
ORDER BY product_name
SEPARATOR ', '
) AS products
FROM order_items
GROUP BY order_id;
-- 2. 商品订单列表
SELECT
product_name,
GROUP_CONCAT(
CONCAT('订单#', order_id, '(x', quantity, ')')
ORDER BY order_id
SEPARATOR ', '
) AS orders
FROM order_items
GROUP BY product_name;
-- 3. 带聚合的合并
SELECT
order_id,
GROUP_CONCAT(
CONCAT(product_name, ' x', quantity)
ORDER BY product_name
) AS products,
SUM(quantity) AS total_items,
SUM(quantity * price) AS total_amount
FROM order_items
GROUP BY order_id;8. SORT_ARRAY – 数组排序
Hive / MaxCompute 数组排序
-- 基本排序
SELECT SORT_ARRAY(ARRAY(3, 1, 2)); -- [1, 2, 3]
-- 带排序的合并
SELECT
dept_id,
SORT_ARRAY(COLLECT_SET(employee_name)) AS employees
FROM employees
GROUP BY dept_id;
-- 降序排序
SELECT
SORT_ARRAY(ARRAY(3, 1, 2), FALSE) AS employees_desc; -- [3, 2, 1]Oracle 排序
-- Oracle 12c+ 支持数组排序
SELECT
dept_id,
LISTAGG(employee_name, ', ') WITHIN GROUP (ORDER BY employee_name DESC) AS employees
FROM employees
GROUP BY dept_id;9. 列转行 – lateral view explode
Hive / MaxCompute 列转行
-- 原始数据:每个用户有多个标签
-- user_id | tags
-- 1 | ["vip","male"]
-- 2 | ["female","young"]
-- 使用 lateral view + explode
SELECT
user_id,
tag
FROM user_tags
LATERAL VIEW EXPLODE(tags) t AS tag;
-- 输出:
-- user_id | tag
-- 1 | vip
-- 1 | male
-- 2 | female
-- 2 | youngposexplode – 带位置展开
-- posexplode 返回位置索引和值
SELECT
user_id,
pos,
tag
FROM user_tags
LATERAL VIEW POSEXPLODE(tags) t AS pos, tag;
-- 输出:
-- user_id | pos | tag
-- 1 | 0 | vip
-- 1 | 1 | maleouter + explode – 保留空数组
-- 普通explode会过滤空数组
-- 使用outer保留
SELECT
user_id,
tag
FROM user_tags
LATERAL VIEW OUTER EXPLODE(tags) t AS tag;10. 复杂类型展开
展开 Map
-- 原始数据:属性map
-- user_id | attributes
-- 1 | {"city":"北京","age":25}
SELECT
user_id,
attr_key,
attr_value
FROM user_tags
LATERAL VIEW EXPLODE(attributes) t AS attr_key, attr_value;
-- 输出:
-- user_id | attr_key | attr_value
-- 1 | city | 北京
-- 1 | age | 25展开 Struct
-- 假设有STRUCT类型
SELECT
user_id,
name,
age
FROM users
LATERAL VIEW EXPLODE(address) t AS addr;
-- 或者使用 json_tuple
SELECT
user_id,
GET_JSON_OBJECT(info, '$.name') AS name,
GET_JSON_OBJECT(info, '$.age') AS age
FROM users;11. PostgreSQL / Hologres UNNEST
-- PostgreSQL/Hologres UNNEST展开数组
SELECT
user_id,
UNNEST(tags) AS tag
FROM user_tags;
-- 展开多列
SELECT
user_id,
UNNEST(tags) WITH ORDINALITY AS t(tag, idx);WITH ORDINALITY – 带序号
-- PostgreSQL/Hologres
SELECT
user_id,
tag,
idx
FROM user_tags
CROSS JOIN UNNEST(tags) WITH ORDINALITY AS t(tag, idx);
-- 输出:
-- user_id | tag | idx
-- 1 | vip | 1
-- 1 | male | 212. ClickHouse 数组展开
-- arrayJoin - 展开数组
SELECT
user_id,
arrayJoin(tags) AS tag
FROM user_tags;
-- arrayJoin带序号
SELECT
user_id,
arrayEnumerate(tags) AS idx,
tags[idx] AS tag
FROM user_tags;
-- 展开多维数组
SELECT
user_id,
arrayJoin(arrays) AS element
FROM multi_arrays;13. 组合使用:展开 + 聚合
场景:计算每个标签的用户数
-- Hive - 先展开再聚合
SELECT
tag,
COUNT(DISTINCT user_id) AS user_count
FROM user_tags
LATERAL VIEW EXPLODE(tags) t AS tag
GROUP BY tag;场景:带序号的展开
-- Hive - 按顺序处理
SELECT
user_id,
tag,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY pos) AS tag_order
FROM (
SELECT
user_id,
tag,
POSEXPLODE(tags) AS (pos, tag)
FROM user_tags
) t;场景:多列展开
-- Hive - 展开多个数组列
SELECT
user_id,
tag,
score
FROM user_tags
LATERAL VIEW EXPLODE(tags) t1 AS tag
LATERAL VIEW EXPLODE(scores) t2 AS score;14. WITHIN GROUP 详解
Oracle WITHIN GROUP (ORDER BY)
-- 聚合时排序
SELECT
dept_id,
LISTAGG(employee_name, ', ')
WITHIN GROUP (ORDER BY salary DESC, employee_name) AS employees_by_salary
FROM employees
GROUP BY dept_id;
-- STRING_AGG(PostgreSQL/Hologres)
SELECT
dept_id,
STRING_AGG(employee_name, ', ' ORDER BY salary DESC) AS employees
FROM employees
GROUP BY dept_id;带过滤条件的排序
-- 只合并特定条件的值
SELECT
dept_id,
LISTAGG(
CASE WHEN salary > 10000 THEN employee_name END, ', '
) WITHIN GROUP (ORDER BY employee_name) AS high_earners
FROM employees
GROUP BY dept_id;15. 实战综合案例
场景1:订单商品展开与汇总
-- Hive
SELECT
order_id,
product_name,
quantity
FROM orders
LATERAL VIEW OUTER EXPLODE(items) t AS item
LATERAL VIEW OUTER EXPLODE(item.products) p AS product_name
LATERAL VIEW OUTER EXPLODE(item.quantities) q AS quantity;场景2:用户行为路径分析
-- 按用户展开行为序列
SELECT
user_id,
event_order,
event_type,
event_time
FROM (
SELECT
user_id,
EVENTS,
POSEXPLODE(EVENTS) AS (event_order, event)
FROM user_sessions
) t
LATERAL VIEW OUTER EXPLODE(event.event_type) et AS event_type
LATERAL VIEW OUTER EXPLODE(event.event_time) et AS event_time;场景3:多维度标签统计
-- Hive - 展开多个标签列并统计
SELECT
tag_type,
tag_value,
COUNT(DISTINCT user_id) AS user_count
FROM (
SELECT
user_id,
'interest' AS tag_type,
tag AS tag_value
FROM users
LATERAL VIEW EXPLODE(interests) t AS tag
UNION ALL
SELECT
user_id,
'habit' AS tag_type,
habit AS tag_value
FROM users
LATERAL VIEW EXPLODE(habits) t AS habit
) all_tags
GROUP BY tag_type, tag_value;⚠️ 注意事项
1. 性能影响
-- ❌ 大数据量展开可能导致数据膨胀
SELECT COUNT(*) FROM (SELECT ... LATERAL VIEW EXPLODE ...) t;
-- ✅ 建议:先过滤再展开
SELECT
user_id,
tag
FROM (
SELECT * FROM user_tags WHERE dt = '2024-01-14'
) t
LATERAL VIEW EXPLODE(tags) t AS tag;2. 内存使用
-- 展开大数组可能内存溢出
-- 使用分布式处理
SET hive.map.aggr=true;
SET hive.groupby.mapaggr.checkinterval=100000;3. NULL和空数组
-- EXPLODE会过滤NULL和空数组
-- 使用LATERAL VIEW OUTER保留
SELECT
user_id,
tag
FROM users
LATERAL VIEW OUTER EXPLODE(tags) t AS tag;📝 练习题
-- 建表
CREATE TABLE user_tags (
user_id INT,
interests ARRAY,
habits ARRAY,
scores ARRAY
);
INSERT INTO user_tags VALUES
(1, ['reading', 'music', 'sports'], ['early_bird', 'coffee'], [85, 90, 78]),
(2, ['gaming', 'music'], ['night_owl'], [70, 65]),
(3, ['reading'], ['early_bird'], [92]); 要求:
- 展开用户兴趣列表
- 计算每个兴趣的用户数
- 展开所有标签(兴趣+习惯)
- 按用户展开分数并排序
-- 参考答案(Hive)
-- 1. 展开兴趣列表
SELECT user_id, interest
FROM user_tags
LATERAL VIEW EXPLODE(interests) t AS interest;
-- 2. 兴趣用户统计
SELECT
interest,
COUNT(DISTINCT user_id) AS user_count
FROM user_tags
LATERAL VIEW EXPLODE(interests) t AS interest
GROUP BY interest;
-- 3. 展开所有标签
SELECT user_id, tag FROM (
SELECT user_id, interest AS tag FROM user_tags LATERAL VIEW EXPLODE(interests) t AS interest
UNION ALL
SELECT user_id, habit AS tag FROM user_tags LATERAL VIEW EXPLODE(habits) t AS habit
) all_tags;
-- 4. 展开分数并排序
SELECT
user_id,
score,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY score DESC) AS score_rank
FROM (
SELECT user_id, score
FROM user_tags
LATERAL VIEW EXPLODE(scores) t AS score
) t;
順子の杂货铺


