顺子の杂货铺
生命不息,折腾不止,且行且珍惜~

25-留存分析

DMIT VPS

留存分析

留存分析是衡量用户粘性的核心指标,用于分析用户在首次使用后持续使用的情况。

📊 支持情况

数据库 DATEDIFF COUNT DISTINCT WINDOW JOIN 备注
MySQL 完整支持
Oracle 完整支持
ClickHouse 完整支持
Hologres 兼容PG
MaxCompute 完整支持
Hive 完整支持

1. 基础留存计算

日留存率

-- MySQL - 计算次日留存
SELECT
    install_date,
    COUNT(*) AS new_users,
    SUM(CASE WHEN days_since_install = 1 THEN 1 ELSE 0 END) AS retained_day1,
    ROUND(
        SUM(CASE WHEN days_since_install = 1 THEN 1 ELSE 0 END) * 100.0 / COUNT(*),
        2
    ) AS day1_retention
FROM (
    SELECT
        user_id,
        MIN(DATE(install_time)) AS install_date,
        DATEDIFF(DATE(event_time), MIN(DATE(install_time))) AS days_since_install
    FROM user_actions
    WHERE event_time >= '2024-01-01'
    GROUP BY user_id, DATE(event_time)
) t
WHERE days_since_install BETWEEN 0 AND 1
GROUP BY install_date;

多日留存

-- MySQL - 1-7日留存
WITH user_daily AS (
    SELECT
        user_id,
        MIN(DATE(install_time)) AS install_date,
        DATE(event_time) AS activity_date,
        DATEDIFF(DATE(event_time), MIN(DATE(install_time))) AS days
    FROM user_actions
    GROUP BY user_id, DATE(event_time)
)
SELECT
    install_date,
    COUNT(DISTINCT user_id) AS new_users,
    COUNT(DISTINCT CASE WHEN days = 0 THEN user_id END) AS day0,
    COUNT(DISTINCT CASE WHEN days = 1 THEN user_id END) AS day1,
    COUNT(DISTINCT CASE WHEN days = 2 THEN user_id END) AS day2,
    COUNT(DISTINCT CASE WHEN days = 3 THEN user_id END) AS day3,
    COUNT(DISTINCT CASE WHEN days = 4 THEN user_id END) AS day4,
    COUNT(DISTINCT CASE WHEN days = 5 THEN user_id END) AS day5,
    COUNT(DISTINCT CASE WHEN days = 6 THEN user_id END) AS day6,
    COUNT(DISTINCT CASE WHEN days = 7 THEN user_id END) AS day7
FROM user_daily
GROUP BY install_date;

2. 留存曲线

留存率曲线

-- MySQL - 计算留存曲线数据
SELECT
    days_since_install,
    COUNT(*) AS users,
    ROUND(COUNT(*) * 100.0 / FIRST_VALUE(COUNT(*)) OVER (ORDER BY days_since_install), 2) AS retention_pct
FROM (
    SELECT
        user_id,
        DATEDIFF(activity_date, install_date) AS days_since_install
    FROM (
        SELECT
            user_id,
            MIN(DATE(install_time)) AS install_date,
            DATE(event_time) AS activity_date
        FROM user_actions
        WHERE event_time >= '2024-01-01'
        GROUP BY user_id, DATE(event_time)
    ) t
    WHERE DATEDIFF(activity_date, install_date) <= 30
) t
GROUP BY days_since_install
ORDER BY days_since_install;

周留存/月留存

-- 周留存
SELECT
    install_week,
    COUNT(DISTINCT user_id) AS new_users,
    COUNT(DISTINCT CASE WHEN weeks = 1 THEN user_id END) AS week1,
    COUNT(DISTINCT CASE WHEN weeks = 2 THEN user_id END) AS week2,
    COUNT(DISTINCT CASE WHEN weeks = 4 THEN user_id END) AS week4
FROM (
    SELECT
        user_id,
        YEARWEEK(install_time, 1) AS install_week,
        FLOOR(DATEDIFF(event_time, install_time) / 7) AS weeks
    FROM user_actions
) t
GROUP BY install_week;

3. 同期群分析(Cohort Analysis)

月度同期群

-- MySQL
WITH cohorts AS (
    SELECT
        user_id,
        DATE_FORMAT(MIN(install_time), '%Y-%m') AS cohort,
        MIN(DATE(install_time)) AS install_date
    FROM users
    GROUP BY user_id
),
activity AS (
    SELECT
        c.user_id,
        c.cohort,
        c.install_date,
        DATEDIFF(DATE(a.action_time), c.install_date) AS days
    FROM cohorts c
    JOIN user_actions a ON c.user_id = a.user_id
)
SELECT
    cohort,
    COUNT(DISTINCT user_id) AS cohort_size,
    ROUND(COUNT(DISTINCT CASE WHEN days < 7 THEN user_id END) * 100.0 / COUNT(DISTINCT user_id), 2) AS week1_ret,
    ROUND(COUNT(DISTINCT CASE WHEN days < 14 THEN user_id END) * 100.0 / COUNT(DISTINCT user_id), 2) AS week2_ret,
    ROUND(COUNT(DISTINCT CASE WHEN days < 30 THEN user_id END) * 100.0 / COUNT(DISTINCT user_id), 2) AS month1_ret
FROM activity
GROUP BY cohort
ORDER BY cohort;

留存矩阵

-- MySQL - 同期群留存矩阵
WITH cohorts AS (
    SELECT user_id, MIN(DATE(install_time)) AS install_date
    FROM users
    GROUP BY user_id
),
retention AS (
    SELECT
        c.install_date,
        DATEDIFF(DATE(a.action_time), c.install_date) AS days,
        c.user_id
    FROM cohorts c
    JOIN user_actions a ON c.user_id = a.user_id
)
SELECT
    install_date AS cohort,
    COUNT(DISTINCT user_id) AS total_users,
    ROUND(SUM(CASE WHEN days = 0 THEN 1 ELSE 0 END) * 100.0 / COUNT(DISTINCT user_id), 1) AS day0,
    ROUND(SUM(CASE WHEN days = 1 THEN 1 ELSE 0 END) * 100.0 / COUNT(DISTINCT user_id), 1) AS day1,
    ROUND(SUM(CASE WHEN days = 2 THEN 1 ELSE 0 END) * 100.0 / COUNT(DISTINCT user_id), 1) AS day2,
    ROUND(SUM(CASE WHEN days = 3 THEN 1 ELSE 0 END) * 100.0 / COUNT(DISTINCT user_id), 1) AS day3,
    ROUND(SUM(CASE WHEN days = 6 THEN 1 ELSE 0 END) * 100.0 / COUNT(DISTINCT user_id), 1) AS day6
FROM retention
GROUP BY install_date
ORDER BY install_date;

4. 实战案例

场景1:渠道留存对比

-- MySQL
SELECT
    source,
    install_date,
    COUNT(*) AS new_users,
    ROUND(SUM(CASE WHEN days = 1 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS day1_ret,
    ROUND(SUM(CASE WHEN days = 7 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS day7_ret,
    ROUND(SUM(CASE WHEN days = 30 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS day30_ret
FROM (
    SELECT
        user_id,
        source,
        MIN(DATE(install_time)) AS install_date,
        DATEDIFF(DATE(event_time), MIN(DATE(install_time))) AS days
    FROM user_actions
    GROUP BY user_id, source, DATE(event_time)
) t
GROUP BY source, install_date;

场景2:版本留存对比

-- Oracle
SELECT
    app_version,
    cohort_month,
    COUNT(*) AS installs,
    ROUND(AVG(CASE WHEN months = 1 THEN retention END), 2) AS m1_ret,
    ROUND(AVG(CASE WHEN months = 3 THEN retention END), 2) AS m3_ret
FROM (
    SELECT
        user_id,
        app_version,
        TO_CHAR(MIN(install_date), 'YYYY-MM') AS cohort_month,
        MONTHS_BETWEEN(activity_date, install_date) AS months,
        1 AS retention
    FROM user_activity
    GROUP BY user_id, app_version, activity_date
)
GROUP BY app_version, cohort_month;

场景3:付费用户留存

SELECT
    user_type,
    install_date,
    COUNT(*) AS users,
    ROUND(SUM(CASE WHEN days = 0 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS day0,
    ROUND(SUM(CASE WHEN days = 7 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS day7
FROM (
    SELECT
        u.user_id,
        CASE WHEN p.user_id IS NOT NULL THEN 'paid' ELSE 'free' END AS user_type,
        MIN(DATE(u.install_time)) AS install_date,
        DATEDIFF(DATE(a.action_time), MIN(DATE(u.install_time))) AS days
    FROM users u
    LEFT JOIN purchases p ON u.user_id = p.user_id
    JOIN user_actions a ON u.user_id = a.user_id
    GROUP BY u.user_id, CASE WHEN p.user_id IS NOT NULL THEN 'paid' ELSE 'free' END, DATE(a.action_time)
) t
GROUP BY user_type, install_date;

场景4:回访分析

-- MySQL - 用户回访周期
SELECT
    user_id,
    install_date,
    last_active_date,
    DATEDIFF(last_active_date, install_date) AS days_active,
    DATEDIFF(CURDATE(), last_active_date) AS days_since_last
FROM (
    SELECT
        user_id,
        MIN(DATE(install_time)) AS install_date,
        MAX(DATE(last_action_time)) AS last_active_date
    FROM users
    GROUP BY user_id
) t
ORDER BY days_since_last;

5. 留存预测

预测次日留存

-- 基于历史数据的简单预测
SELECT
    install_source,
    predicted_day1_retention
FROM (
    SELECT
        install_source,
        AVG(actual_day1_ret) AS predicted_day1_ret
    FROM (
        SELECT
            install_date,
            install_source,
            ROUND(SUM(CASE WHEN days = 1 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS actual_day1_ret
        FROM (
            SELECT
                user_id,
                install_source,
                DATE(install_time) AS install_date,
                DATEDIFF(DATE(event_time), DATE(install_time)) AS days
            FROM user_actions
            GROUP BY user_id, install_source, DATE(event_time)
        ) t
        GROUP BY install_date, install_source
    ) t
    WHERE install_date < CURDATE() - INTERVAL 7 DAY
    GROUP BY install_source
) t;

⚠️ 注意事项

1. 时间边界

-- 确保安装日期和活动日期在同一时区
-- 建议统一使用DATE()转换

-- 避免时区问题
SET time_zone = '+08:00';

2. 去重逻辑

-- 同一用户同一天多次活动只算一次
-- ✅ 正确
COUNT(DISTINCT user_id)

-- ❌ 错误
COUNT(*)

3. 数据延迟

-- 注意数据延迟可能影响当天留存计算
-- 通常T+1查看T日数据更准确

📝 练习题

-- 建表
CREATE TABLE user_retention (
    user_id INT,
    action_date DATE,
    install_date DATE,
    source STRING
);

INSERT INTO user_retention VALUES
(1, '2024-01-01', '2024-01-01', 'A'),
(1, '2024-01-02', '2024-01-01', 'A'),
(1, '2024-01-08', '2024-01-01', 'A'),
(2, '2024-01-01', '2024-01-01', 'B'),
(2, '2024-01-02', '2024-01-01', 'B'),
(3, '2024-01-02', '2024-01-02', 'A'),
(3, '2024-01-03', '2024-01-02', 'A'),
(4, '2024-01-02', '2024-01-02', 'B');

要求

  1. 计算每日新用户数
  2. 计算次日留存率
  3. 计算3日留存率
  4. 按来源渠道对比留存
-- 参考答案(MySQL)
-- 1. 每日新用户数
SELECT install_date, COUNT(DISTINCT user_id) AS new_users
FROM user_retention
GROUP BY install_date;

-- 2. 次日留存率
SELECT
    install_date,
    COUNT(DISTINCT user_id) AS new_users,
    COUNT(DISTINCT CASE WHEN DATEDIFF(action_date, install_date) = 1 THEN user_id END) AS day1_users,
    ROUND(
        COUNT(DISTINCT CASE WHEN DATEDIFF(action_date, install_date) = 1 THEN user_id END) * 100.0 /
        NULLIF(COUNT(DISTINCT user_id), 0),
        2
    ) AS day1_retention
FROM user_retention
GROUP BY install_date;

-- 3. 3日留存率
SELECT
    install_date,
    ROUND(
        COUNT(DISTINCT CASE WHEN DATEDIFF(action_date, install_date) = 2 THEN user_id END) * 100.0 /
        NULLIF(COUNT(DISTINCT user_id), 0),
        2
    ) AS day3_retention
FROM user_retention
GROUP BY install_date;

-- 4. 渠道留存对比
SELECT
    source,
    ROUND(
        COUNT(DISTINCT CASE WHEN DATEDIFF(action_date, install_date) = 1 THEN user_id END) * 100.0 /
        NULLIF(COUNT(DISTINCT user_id), 0),
        2
    ) AS day1_retention
FROM user_retention
GROUP BY source;
赞(0)
未经允许不得转载:順子の杂货铺 » 25-留存分析
搬瓦工VPS

评论 抢沙发

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址

分享创造快乐

联系我们联系我们