留存分析
留存分析是衡量用户粘性的核心指标,用于分析用户在首次使用后持续使用的情况。
📊 支持情况
| 数据库 | DATEDIFF | COUNT DISTINCT | WINDOW | JOIN | 备注 |
|---|---|---|---|---|---|
| MySQL | ✅ | ✅ | ✅ | ✅ | 完整支持 |
| Oracle | ✅ | ✅ | ✅ | ✅ | 完整支持 |
| ClickHouse | ✅ | ✅ | ✅ | ✅ | 完整支持 |
| Hologres | ✅ | ✅ | ✅ | ✅ | 兼容PG |
| MaxCompute | ✅ | ✅ | ✅ | ✅ | 完整支持 |
| Hive | ✅ | ✅ | ✅ | ✅ | 完整支持 |
1. 基础留存计算
日留存率
-- MySQL - 计算次日留存
SELECT
install_date,
COUNT(*) AS new_users,
SUM(CASE WHEN days_since_install = 1 THEN 1 ELSE 0 END) AS retained_day1,
ROUND(
SUM(CASE WHEN days_since_install = 1 THEN 1 ELSE 0 END) * 100.0 / COUNT(*),
2
) AS day1_retention
FROM (
SELECT
user_id,
MIN(DATE(install_time)) AS install_date,
DATEDIFF(DATE(event_time), MIN(DATE(install_time))) AS days_since_install
FROM user_actions
WHERE event_time >= '2024-01-01'
GROUP BY user_id, DATE(event_time)
) t
WHERE days_since_install BETWEEN 0 AND 1
GROUP BY install_date;多日留存
-- MySQL - 1-7日留存
WITH user_daily AS (
SELECT
user_id,
MIN(DATE(install_time)) AS install_date,
DATE(event_time) AS activity_date,
DATEDIFF(DATE(event_time), MIN(DATE(install_time))) AS days
FROM user_actions
GROUP BY user_id, DATE(event_time)
)
SELECT
install_date,
COUNT(DISTINCT user_id) AS new_users,
COUNT(DISTINCT CASE WHEN days = 0 THEN user_id END) AS day0,
COUNT(DISTINCT CASE WHEN days = 1 THEN user_id END) AS day1,
COUNT(DISTINCT CASE WHEN days = 2 THEN user_id END) AS day2,
COUNT(DISTINCT CASE WHEN days = 3 THEN user_id END) AS day3,
COUNT(DISTINCT CASE WHEN days = 4 THEN user_id END) AS day4,
COUNT(DISTINCT CASE WHEN days = 5 THEN user_id END) AS day5,
COUNT(DISTINCT CASE WHEN days = 6 THEN user_id END) AS day6,
COUNT(DISTINCT CASE WHEN days = 7 THEN user_id END) AS day7
FROM user_daily
GROUP BY install_date;2. 留存曲线
留存率曲线
-- MySQL - 计算留存曲线数据
SELECT
days_since_install,
COUNT(*) AS users,
ROUND(COUNT(*) * 100.0 / FIRST_VALUE(COUNT(*)) OVER (ORDER BY days_since_install), 2) AS retention_pct
FROM (
SELECT
user_id,
DATEDIFF(activity_date, install_date) AS days_since_install
FROM (
SELECT
user_id,
MIN(DATE(install_time)) AS install_date,
DATE(event_time) AS activity_date
FROM user_actions
WHERE event_time >= '2024-01-01'
GROUP BY user_id, DATE(event_time)
) t
WHERE DATEDIFF(activity_date, install_date) <= 30
) t
GROUP BY days_since_install
ORDER BY days_since_install;周留存/月留存
-- 周留存
SELECT
install_week,
COUNT(DISTINCT user_id) AS new_users,
COUNT(DISTINCT CASE WHEN weeks = 1 THEN user_id END) AS week1,
COUNT(DISTINCT CASE WHEN weeks = 2 THEN user_id END) AS week2,
COUNT(DISTINCT CASE WHEN weeks = 4 THEN user_id END) AS week4
FROM (
SELECT
user_id,
YEARWEEK(install_time, 1) AS install_week,
FLOOR(DATEDIFF(event_time, install_time) / 7) AS weeks
FROM user_actions
) t
GROUP BY install_week;3. 同期群分析(Cohort Analysis)
月度同期群
-- MySQL
WITH cohorts AS (
SELECT
user_id,
DATE_FORMAT(MIN(install_time), '%Y-%m') AS cohort,
MIN(DATE(install_time)) AS install_date
FROM users
GROUP BY user_id
),
activity AS (
SELECT
c.user_id,
c.cohort,
c.install_date,
DATEDIFF(DATE(a.action_time), c.install_date) AS days
FROM cohorts c
JOIN user_actions a ON c.user_id = a.user_id
)
SELECT
cohort,
COUNT(DISTINCT user_id) AS cohort_size,
ROUND(COUNT(DISTINCT CASE WHEN days < 7 THEN user_id END) * 100.0 / COUNT(DISTINCT user_id), 2) AS week1_ret,
ROUND(COUNT(DISTINCT CASE WHEN days < 14 THEN user_id END) * 100.0 / COUNT(DISTINCT user_id), 2) AS week2_ret,
ROUND(COUNT(DISTINCT CASE WHEN days < 30 THEN user_id END) * 100.0 / COUNT(DISTINCT user_id), 2) AS month1_ret
FROM activity
GROUP BY cohort
ORDER BY cohort;留存矩阵
-- MySQL - 同期群留存矩阵
WITH cohorts AS (
SELECT user_id, MIN(DATE(install_time)) AS install_date
FROM users
GROUP BY user_id
),
retention AS (
SELECT
c.install_date,
DATEDIFF(DATE(a.action_time), c.install_date) AS days,
c.user_id
FROM cohorts c
JOIN user_actions a ON c.user_id = a.user_id
)
SELECT
install_date AS cohort,
COUNT(DISTINCT user_id) AS total_users,
ROUND(SUM(CASE WHEN days = 0 THEN 1 ELSE 0 END) * 100.0 / COUNT(DISTINCT user_id), 1) AS day0,
ROUND(SUM(CASE WHEN days = 1 THEN 1 ELSE 0 END) * 100.0 / COUNT(DISTINCT user_id), 1) AS day1,
ROUND(SUM(CASE WHEN days = 2 THEN 1 ELSE 0 END) * 100.0 / COUNT(DISTINCT user_id), 1) AS day2,
ROUND(SUM(CASE WHEN days = 3 THEN 1 ELSE 0 END) * 100.0 / COUNT(DISTINCT user_id), 1) AS day3,
ROUND(SUM(CASE WHEN days = 6 THEN 1 ELSE 0 END) * 100.0 / COUNT(DISTINCT user_id), 1) AS day6
FROM retention
GROUP BY install_date
ORDER BY install_date;4. 实战案例
场景1:渠道留存对比
-- MySQL
SELECT
source,
install_date,
COUNT(*) AS new_users,
ROUND(SUM(CASE WHEN days = 1 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS day1_ret,
ROUND(SUM(CASE WHEN days = 7 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS day7_ret,
ROUND(SUM(CASE WHEN days = 30 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS day30_ret
FROM (
SELECT
user_id,
source,
MIN(DATE(install_time)) AS install_date,
DATEDIFF(DATE(event_time), MIN(DATE(install_time))) AS days
FROM user_actions
GROUP BY user_id, source, DATE(event_time)
) t
GROUP BY source, install_date;场景2:版本留存对比
-- Oracle
SELECT
app_version,
cohort_month,
COUNT(*) AS installs,
ROUND(AVG(CASE WHEN months = 1 THEN retention END), 2) AS m1_ret,
ROUND(AVG(CASE WHEN months = 3 THEN retention END), 2) AS m3_ret
FROM (
SELECT
user_id,
app_version,
TO_CHAR(MIN(install_date), 'YYYY-MM') AS cohort_month,
MONTHS_BETWEEN(activity_date, install_date) AS months,
1 AS retention
FROM user_activity
GROUP BY user_id, app_version, activity_date
)
GROUP BY app_version, cohort_month;场景3:付费用户留存
SELECT
user_type,
install_date,
COUNT(*) AS users,
ROUND(SUM(CASE WHEN days = 0 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS day0,
ROUND(SUM(CASE WHEN days = 7 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS day7
FROM (
SELECT
u.user_id,
CASE WHEN p.user_id IS NOT NULL THEN 'paid' ELSE 'free' END AS user_type,
MIN(DATE(u.install_time)) AS install_date,
DATEDIFF(DATE(a.action_time), MIN(DATE(u.install_time))) AS days
FROM users u
LEFT JOIN purchases p ON u.user_id = p.user_id
JOIN user_actions a ON u.user_id = a.user_id
GROUP BY u.user_id, CASE WHEN p.user_id IS NOT NULL THEN 'paid' ELSE 'free' END, DATE(a.action_time)
) t
GROUP BY user_type, install_date;场景4:回访分析
-- MySQL - 用户回访周期
SELECT
user_id,
install_date,
last_active_date,
DATEDIFF(last_active_date, install_date) AS days_active,
DATEDIFF(CURDATE(), last_active_date) AS days_since_last
FROM (
SELECT
user_id,
MIN(DATE(install_time)) AS install_date,
MAX(DATE(last_action_time)) AS last_active_date
FROM users
GROUP BY user_id
) t
ORDER BY days_since_last;5. 留存预测
预测次日留存
-- 基于历史数据的简单预测
SELECT
install_source,
predicted_day1_retention
FROM (
SELECT
install_source,
AVG(actual_day1_ret) AS predicted_day1_ret
FROM (
SELECT
install_date,
install_source,
ROUND(SUM(CASE WHEN days = 1 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS actual_day1_ret
FROM (
SELECT
user_id,
install_source,
DATE(install_time) AS install_date,
DATEDIFF(DATE(event_time), DATE(install_time)) AS days
FROM user_actions
GROUP BY user_id, install_source, DATE(event_time)
) t
GROUP BY install_date, install_source
) t
WHERE install_date < CURDATE() - INTERVAL 7 DAY
GROUP BY install_source
) t;⚠️ 注意事项
1. 时间边界
-- 确保安装日期和活动日期在同一时区
-- 建议统一使用DATE()转换
-- 避免时区问题
SET time_zone = '+08:00';2. 去重逻辑
-- 同一用户同一天多次活动只算一次
-- ✅ 正确
COUNT(DISTINCT user_id)
-- ❌ 错误
COUNT(*)3. 数据延迟
-- 注意数据延迟可能影响当天留存计算
-- 通常T+1查看T日数据更准确📝 练习题
-- 建表
CREATE TABLE user_retention (
user_id INT,
action_date DATE,
install_date DATE,
source STRING
);
INSERT INTO user_retention VALUES
(1, '2024-01-01', '2024-01-01', 'A'),
(1, '2024-01-02', '2024-01-01', 'A'),
(1, '2024-01-08', '2024-01-01', 'A'),
(2, '2024-01-01', '2024-01-01', 'B'),
(2, '2024-01-02', '2024-01-01', 'B'),
(3, '2024-01-02', '2024-01-02', 'A'),
(3, '2024-01-03', '2024-01-02', 'A'),
(4, '2024-01-02', '2024-01-02', 'B');要求:
- 计算每日新用户数
- 计算次日留存率
- 计算3日留存率
- 按来源渠道对比留存
-- 参考答案(MySQL)
-- 1. 每日新用户数
SELECT install_date, COUNT(DISTINCT user_id) AS new_users
FROM user_retention
GROUP BY install_date;
-- 2. 次日留存率
SELECT
install_date,
COUNT(DISTINCT user_id) AS new_users,
COUNT(DISTINCT CASE WHEN DATEDIFF(action_date, install_date) = 1 THEN user_id END) AS day1_users,
ROUND(
COUNT(DISTINCT CASE WHEN DATEDIFF(action_date, install_date) = 1 THEN user_id END) * 100.0 /
NULLIF(COUNT(DISTINCT user_id), 0),
2
) AS day1_retention
FROM user_retention
GROUP BY install_date;
-- 3. 3日留存率
SELECT
install_date,
ROUND(
COUNT(DISTINCT CASE WHEN DATEDIFF(action_date, install_date) = 2 THEN user_id END) * 100.0 /
NULLIF(COUNT(DISTINCT user_id), 0),
2
) AS day3_retention
FROM user_retention
GROUP BY install_date;
-- 4. 渠道留存对比
SELECT
source,
ROUND(
COUNT(DISTINCT CASE WHEN DATEDIFF(action_date, install_date) = 1 THEN user_id END) * 100.0 /
NULLIF(COUNT(DISTINCT user_id), 0),
2
) AS day1_retention
FROM user_retention
GROUP BY source;
順子の杂货铺


