去重计数
去重计数是数据分析中最常用的统计方法之一,本篇介绍精确去重和近似去重的实现。
📊 支持情况
| 数据库 | COUNT(DISTINCT) | APPROX_COUNT_DISTINCT | HyperLogLog | 备注 |
|---|---|---|---|---|
| MySQL | ✅ 8.0+ | ✅ | ✅ 8.0+ | 完整支持 |
| Oracle | ✅ | ✅ | – | APPROX_RANK |
| ClickHouse | ✅ | ✅ | ✅ | uniqExact |
| Hologres | ✅ | ✅ | – | 兼容PG |
| MaxCompute | ✅ | ✅ | ✅ | approx_count_distinct |
| Hive | ✅ | ✅ | ✅ | approx_count_distinct |
1. 精确去重
COUNT(DISTINCT)
-- 基本语法
SELECT COUNT(DISTINCT user_id) AS unique_users FROM orders;
-- 多列去重
SELECT COUNT(DISTINCT user_id, order_date) AS unique_orders FROM orders;
-- 按组去重
SELECT
DATE(order_date) AS order_date,
COUNT(DISTINCT user_id) AS daily_users,
COUNT(DISTINCT order_id) AS daily_orders
FROM orders
GROUP BY DATE(order_date);
DISTINCT + COUNT
-- 写法等价
SELECT COUNT(DISTINCT user_id) FROM orders;
SELECT COUNT(*) FROM (SELECT DISTINCT user_id FROM orders) t;
Oracle 特殊写法
SELECT COUNT(DISTINCT user_id) FROM orders;
SELECT COUNT(*) FROM (SELECT DISTINCT user_id FROM orders);
2. 分组后去重计数
按多维度去重
-- MySQL
SELECT
DATE(order_date) AS day,
region,
COUNT(DISTINCT user_id) AS users,
COUNT(DISTINCT product_id) AS products
FROM orders
WHERE order_date >= DATE_SUB(CURDATE(), INTERVAL 7 DAY)
GROUP BY day, region;
带过滤条件的去重
-- 只统计已完成订单的用户
SELECT
COUNT(DISTINCT CASE WHEN status = 'completed' THEN user_id END) AS completed_users,
COUNT(DISTINCT user_id) AS total_users
FROM orders;
3. 近似去重
MySQL APPROX_COUNT_DISTINCT
-- 大数据量时使用近似值(误差约2%)
SELECT APPROX_COUNT_DISTINCT(user_id) FROM large_table;
ClickHouse
-- 使用uniqHLL12(HyperLogLog)
SELECT uniqHLL12(user_id) FROM large_table;
-- 使用uniq(默认使用HLL)
SELECT uniq(user_id) FROM large_table;
Hive / MaxCompute
SELECT approx_count_distinct(user_id) FROM large_table;
Oracle
SELECT APPROX_COUNT_DISTINCT(user_id) FROM large_table;
4. HyperLogLog (HLL)
MySQL HLL
-- 创建HLL结构
SELECT COUNT(DISTINCT user_id) FROM orders;
-- MySQL 8.0+ 自动使用HLL优化
-- 手动使用
SELECT COUNT(DISTINCT user_id) FROM large_table;
-- 查看HLL状态
SELECT * FROM sys.schema_index_statistics;
ClickHouse HLL
-- uniqHLL12 返回近似值
SELECT uniqHLL12(user_id) FROM events;
-- uniqExact 返回精确值
SELECT uniqExact(user_id) FROM events;
5. 实战案例
场景1:日活/月活统计
-- 日活跃用户数(DAU)
SELECT DATE(active_time) AS day, COUNT(DISTINCT user_id) AS dau
FROM user_activity
GROUP BY DATE(active_time);
-- 月活跃用户数(MAU)
SELECT DATE_FORMAT(active_time, '%Y-%m') AS month, COUNT(DISTINCT user_id) AS mau
FROM user_activity
GROUP BY DATE_FORMAT(active_time, '%Y-%m');
-- 日活/月活比
SELECT
DATE_FORMAT(active_time, '%Y-%m') AS month,
COUNT(DISTINCT user_id) AS mau,
ROUND(COUNT(DISTINCT user_id) / NULLIF(
(SELECT COUNT(DISTINCT user_id) FROM user_activity
WHERE DATE_FORMAT(active_time, '%Y-%m') = DATE_FORMAT(CURDATE(), '%Y-%m'))
, 0), 4) AS dau_mau_ratio
FROM user_activity
GROUP BY month;
场景2:去重+聚合
-- 每个用户购买的不同商品数
SELECT
user_id,
COUNT(DISTINCT product_id) AS unique_products,
SUM(amount) AS total_spent
FROM orders
GROUP BY user_id;
-- 按类别统计去重用户数
SELECT
category,
COUNT(DISTINCT user_id) AS unique_buyers,
COUNT(DISTINCT order_id) AS orders,
AVG(amount) AS avg_order_value
FROM orders
GROUP BY category;
场景3:去重+JOIN
-- 去重后JOIN
SELECT
u.user_id,
u.name,
COUNT(DISTINCT o.order_id) AS order_count
FROM users u
LEFT JOIN orders o ON u.user_id = o.user_id
GROUP BY u.user_id, u.name;
场景4:多条件去重
-- 同一天同一用户的多次访问只算一次
SELECT
DATE(visit_time) AS visit_date,
COUNT(DISTINCT user_id, DATE(visit_time)) AS unique_visitors
FROM page_visits
GROUP BY visit_date;
-- 不同渠道去重
SELECT
DATE(order_date) AS day,
COUNT(DISTINCT user_id) AS total_users,
COUNT(DISTINCT CASE WHEN source = 'app' THEN user_id END) AS app_users,
COUNT(DISTINCT CASE WHEN source = 'web' THEN user_id END) AS web_users
FROM orders
GROUP BY day;
6. 性能优化
大数据量去重优化
-- ✅ 建议:先过滤再去重
SELECT COUNT(DISTINCT user_id)
FROM (
SELECT user_id FROM orders WHERE order_date >= '2024-01-01'
) t;
-- ✅ 建议:使用近似去重(可接受误差时)
SELECT APPROX_COUNT_DISTINCT(user_id) FROM large_log;
-- ✅ 建议:分批去重后合并
SELECT COUNT(*) FROM (
SELECT user_id FROM orders_2024 WHERE user_id < 100000
UNION
SELECT user_id FROM orders_2024 WHERE user_id >= 100000
) t;
索引优化
-- 在去重字段上建立索引
CREATE INDEX idx_orders_user_date ON orders(user_id, order_date);
-- 分区表利用分区裁剪
SELECT COUNT(DISTINCT user_id)
FROM orders
WHERE dt >= '2024-01-01' AND dt < '2024-02-01';
7. 特殊去重场景
去重后保留最新记录
-- MySQL
SELECT *
FROM (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY order_date DESC) AS rn
FROM orders
) t
WHERE rn = 1;
去重统计各状态
-- 统计各状态的去重用户
SELECT
COUNT(DISTINCT user_id) AS total_users,
COUNT(DISTINCT CASE WHEN status = 'active' THEN user_id END) AS active_users,
COUNT(DISTINCT CASE WHEN status = 'inactive' THEN user_id END) AS inactive_users
FROM users;
连续去重
-- 按顺序去重
SELECT
user_id,
COUNT(*) AS total_events,
COUNT(DISTINCT event_type) AS unique_event_types
FROM user_events
GROUP BY user_id;
⚠️ 注意事项
1. NULL值处理
-- COUNT(DISTINCT) 忽略NULL
SELECT COUNT(DISTINCT user_id) FROM orders; -- NULL不计入
-- 如果需要包含NULL,可以转换
SELECT COUNT(DISTINCT COALESCE(user_id, -1)) FROM orders;
2. 多列去重
-- 多列去重是组合唯一
SELECT COUNT(DISTINCT col1, col2) FROM t;
-- 等同于
SELECT COUNT(*) FROM (SELECT DISTINCT col1, col2 FROM t) t;
3. 性能差异
| 方法 | 精确度 | 性能 | 内存使用 |
|---|---|---|---|
| COUNT(DISTINCT) | 100% | 慢 | 高 |
| APPROX_COUNT_DISTINCT | ~98% | 快 | 低 |
| HyperLogLog | ~97% | 最快 | 最低 |
📝 练习题
-- 建表
CREATE TABLE user_orders (
order_id INT,
user_id INT,
product_id INT,
amount DECIMAL(10,2),
order_date DATE,
status STRING
);
INSERT INTO user_orders VALUES
(1, 101, 1, 500, '2024-01-14', 'completed'),
(2, 101, 2, 300, '2024-01-14', 'completed'),
(3, 102, 1, 800, '2024-01-14', 'pending'),
(4, 102, 3, 450, '2024-01-15', 'completed'),
(5, 103, 2, 600, '2024-01-15', 'cancelled'),
(6, 101, 4, 200, '2024-01-15', 'completed'),
(7, 104, 1, 900, '2024-01-15', 'completed');
要求:
- 统计每日去重用户数
- 统计每日去重商品数
- 统计每个用户购买的不同商品数
- 统计每个状态的去重用户数
- 使用近似去重计算用户数
-- 参考答案(MySQL)
-- 1. 每日去重用户数
SELECT
order_date,
COUNT(DISTINCT user_id) AS daily_users
FROM user_orders
GROUP BY order_date;
-- 2. 每日去重商品数
SELECT
order_date,
COUNT(DISTINCT product_id) AS daily_products
FROM user_orders
GROUP BY order_date;
-- 3. 每个用户购买的不同商品数
SELECT
user_id,
COUNT(DISTINCT product_id) AS unique_products,
SUM(amount) AS total_spent
FROM user_orders
GROUP BY user_id;
-- 4. 每个状态的去重用户数
SELECT
status,
COUNT(DISTINCT user_id) AS users,
COUNT(*) AS orders
FROM user_orders
GROUP BY status;
-- 5. 近似去重
SELECT
order_date,
APPROX_COUNT_DISTINCT(user_id) AS approx_users
FROM user_orders
GROUP BY order_date;
順子の杂货铺


