顺子の杂货铺
生命不息,折腾不止,且行且珍惜~

29-抽样查询

DMIT VPS

抽样查询

抽样查询是处理大数据集的重要技术,用于获取代表性样本进行快速分析。

📊 支持情况

数据库TABLESAMPLEBERNOULLISYSTEMSAMPLE备注
MySQL需模拟
OracleSAMPLE
ClickHouse完整支持
Hologres兼容PG
MaxComputeTABLESAMPLE
Hive完整支持

1. 基础抽样

Hive 抽样

-- 按百分比抽样
SELECT * FROM large_table TABLESAMPLE (10 PERCENT);

-- 按行数抽样
SELECT * FROM large_table TABLESAMPLE (1000 ROWS);

-- 按数据大小抽样
SELECT * FROM large_table TABLESAMPLE (1G);

Hive 哈希抽样

-- 哈希分桶抽样
SELECT * FROM large_table
TABLESAMPLE (BUCKET 1 OUT OF 32 ON user_id);

-- 指定分桶
SELECT * FROM large_table
WHERE rand() < 0.1;  -- 简单随机抽样

ClickHouse 抽样

-- 按百分比抽样(系统抽样)
SELECT * FROM large_table SAMPLE 0.1;

-- 指定样本数
SELECT * FROM large_table SAMPLE 10000;

-- 协调抽样(JOIN时保持一致性)
SELECT *
FROM orders SAMPLE 0.1
JOIN users SAMPLE 0.1 ON orders.user_id = users.user_id;

2. 统计抽样

伯努利抽样

-- Hive - 伯努利抽样(每个数据块独立决定)
SELECT * FROM large_table
TABLESAMPLE (10 PERCENT) t;  -- 每个块有10%概率被包含

系统抽样

-- Hive - 系统抽样(均匀分布)
SELECT * FROM large_table
TABLESAMPLE (10 PERCENT) t
WHERE rand() < 0.1;  -- 结合随机过滤

精确行数抽样

-- Oracle - 精确百分比
SELECT * FROM large_table SAMPLE (10);

-- Oracle - 精确行数
SELECT * FROM large_table SAMPLE BLOCK (10);

-- Oracle - 种子抽样(可重复)
SELECT * FROM large_table SAMPLE (10) SEED (123);

3. 分层抽样

按组分层抽样

-- Hive - 分层抽样
SELECT *
FROM (
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY category ORDER BY RAND()) AS rn
    FROM large_table
) t
WHERE rn <= 100;  -- 每层取100条

权重抽样

-- MySQL - 按权重抽样
SELECT *
FROM products
ORDER BY RAND() * CASE
    WHEN category = 'hot' THEN 0.5   -- 热门商品权重高
    WHEN category = 'new' THEN 0.3
    ELSE 0.2
END
LIMIT 1000;

4. 实战案例

场景1:快速数据分析

-- Hive - 抽样后快速统计
SELECT
    AVG(amount),
    COUNT(DISTINCT user_id),
    MAX(amount)
FROM (
    SELECT * FROM large_orders
    TABLESAMPLE (1 PERCENT)
) sample;

场景2:数据质量检查

-- Hive - 抽样检查数据质量
SELECT
    COUNT(*) AS total_rows,
    COUNT(*) - COUNT(col1) AS col1_nulls,
    COUNT(DISTINCT col2) AS unique_col2
FROM (
    SELECT * FROM raw_data
    TABLESAMPLE (10000 ROWS)
) sample;

场景3:A/B测试样本

-- Hive - 获取测试组样本
SELECT *
FROM (
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY experiment_group ORDER BY RAND()) AS rn
    FROM users
    WHERE experiment_group IN ('A', 'B')
) t
WHERE rn <= 1000;  -- 每组1000人

场景4:数据导出

-- ClickHouse - 导出样本数据
SELECT *
FROM events
SAMPLE 10000
INTO OUTFILE '/tmp/sample.csv'
FORMAT CSVWithNames;

5. 抽样函数

随机数函数

-- MySQL
SELECT RAND(), RAND(123);  -- 种子随机数

-- Hive
SELECT rand(), rand(seed);

-- Oracle
SELECT DBMS_RANDOM.VALUE FROM dual;

哈希抽样函数

-- Hive - 哈希取模
SELECT *
FROM users
WHERE ABS(HASH(user_id)) % 100 < 10;  -- 10%样本

-- 改进的哈希
SELECT *
FROM users
WHERE pmod(hash(user_id), 100) < 10;

⚠️ 注意事项

1. 抽样代表性

-- 简单随机抽样可能不均匀
-- 分层抽样更适合不均匀数据

-- 检验抽样质量
SELECT category, COUNT(*) as cnt
FROM (SELECT * FROM data TABLESAMPLE (10 PERCENT)) t
GROUP BY category
ORDER BY cnt;

2. 性能影响

-- 系统抽样通常比伯努利抽样快
-- 但可能不均匀

-- 大数据量时优先使用TABLESAMPLE

3. JOIN时抽样

-- ClickHouse 协调抽样
SELECT *
FROM orders SAMPLE 0.1
JOIN items SAMPLE 0.1 ON orders.id = items.order_id;

-- Hive 需要手动处理
WITH orders_sample AS (
    SELECT * FROM orders TABLESAMPLE (10 PERCENT)
),
items_sample AS (
    SELECT * FROM items TABLESAMPLE (10 PERCENT)
)
SELECT * FROM orders_sample o
JOIN items_sample i ON o.id = i.order_id;

📝 练习题

-- 建表
CREATE TABLE large_logs (
    log_id BIGINT,
    user_id BIGINT,
    event_type STRING,
    event_time TIMESTAMP
);

要求

  1. 抽取10000条日志样本
  2. 按用户ID取10%的用户样本
  3. 抽取1%数据统计各事件类型分布
  4. 使用哈希抽样确保样本可重复
-- 参考答案(Hive)
-- 1. 抽取固定行数
SELECT * FROM large_logs TABLESAMPLE (10000 ROWS);

-- 2. 按用户抽样(哈希)
SELECT *
FROM large_logs
WHERE ABS(HASH(user_id)) % 100 < 10;

-- 3. 1%抽样统计
SELECT
    event_type,
    COUNT(*) AS event_count
FROM (SELECT * FROM large_logs TABLESAMPLE (1 PERCENT)) t
GROUP BY event_type
ORDER BY event_count DESC;

-- 4. 哈希抽样(可重复)
SELECT *
FROM large_logs
WHERE pmod(hash(concat(user_id, '2024')), 1000) < 100;  -- 10%样本
赞(0)
未经允许不得转载:順子の杂货铺 » 29-抽样查询
搬瓦工VPS

评论 抢沙发

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址

分享创造快乐

联系我们联系我们