抽样查询
抽样查询是处理大数据集的重要技术,用于获取代表性样本进行快速分析。
📊 支持情况
| 数据库 | TABLESAMPLE | BERNOULLI | SYSTEM | SAMPLE | 备注 |
|---|---|---|---|---|---|
| MySQL | ❌ | ❌ | ❌ | ✅ | 需模拟 |
| Oracle | ✅ | – | – | ✅ | SAMPLE |
| ClickHouse | ✅ | ✅ | ✅ | – | 完整支持 |
| Hologres | ✅ | – | – | – | 兼容PG |
| MaxCompute | ✅ | – | – | – | TABLESAMPLE |
| Hive | ✅ | ✅ | ✅ | – | 完整支持 |
1. 基础抽样
Hive 抽样
-- 按百分比抽样
SELECT * FROM large_table TABLESAMPLE (10 PERCENT);
-- 按行数抽样
SELECT * FROM large_table TABLESAMPLE (1000 ROWS);
-- 按数据大小抽样
SELECT * FROM large_table TABLESAMPLE (1G);Hive 哈希抽样
-- 哈希分桶抽样
SELECT * FROM large_table
TABLESAMPLE (BUCKET 1 OUT OF 32 ON user_id);
-- 指定分桶
SELECT * FROM large_table
WHERE rand() < 0.1; -- 简单随机抽样ClickHouse 抽样
-- 按百分比抽样(系统抽样)
SELECT * FROM large_table SAMPLE 0.1;
-- 指定样本数
SELECT * FROM large_table SAMPLE 10000;
-- 协调抽样(JOIN时保持一致性)
SELECT *
FROM orders SAMPLE 0.1
JOIN users SAMPLE 0.1 ON orders.user_id = users.user_id;2. 统计抽样
伯努利抽样
-- Hive - 伯努利抽样(每个数据块独立决定)
SELECT * FROM large_table
TABLESAMPLE (10 PERCENT) t; -- 每个块有10%概率被包含系统抽样
-- Hive - 系统抽样(均匀分布)
SELECT * FROM large_table
TABLESAMPLE (10 PERCENT) t
WHERE rand() < 0.1; -- 结合随机过滤精确行数抽样
-- Oracle - 精确百分比
SELECT * FROM large_table SAMPLE (10);
-- Oracle - 精确行数
SELECT * FROM large_table SAMPLE BLOCK (10);
-- Oracle - 种子抽样(可重复)
SELECT * FROM large_table SAMPLE (10) SEED (123);3. 分层抽样
按组分层抽样
-- Hive - 分层抽样
SELECT *
FROM (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY category ORDER BY RAND()) AS rn
FROM large_table
) t
WHERE rn <= 100; -- 每层取100条权重抽样
-- MySQL - 按权重抽样
SELECT *
FROM products
ORDER BY RAND() * CASE
WHEN category = 'hot' THEN 0.5 -- 热门商品权重高
WHEN category = 'new' THEN 0.3
ELSE 0.2
END
LIMIT 1000;4. 实战案例
场景1:快速数据分析
-- Hive - 抽样后快速统计
SELECT
AVG(amount),
COUNT(DISTINCT user_id),
MAX(amount)
FROM (
SELECT * FROM large_orders
TABLESAMPLE (1 PERCENT)
) sample;场景2:数据质量检查
-- Hive - 抽样检查数据质量
SELECT
COUNT(*) AS total_rows,
COUNT(*) - COUNT(col1) AS col1_nulls,
COUNT(DISTINCT col2) AS unique_col2
FROM (
SELECT * FROM raw_data
TABLESAMPLE (10000 ROWS)
) sample;场景3:A/B测试样本
-- Hive - 获取测试组样本
SELECT *
FROM (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY experiment_group ORDER BY RAND()) AS rn
FROM users
WHERE experiment_group IN ('A', 'B')
) t
WHERE rn <= 1000; -- 每组1000人场景4:数据导出
-- ClickHouse - 导出样本数据
SELECT *
FROM events
SAMPLE 10000
INTO OUTFILE '/tmp/sample.csv'
FORMAT CSVWithNames;5. 抽样函数
随机数函数
-- MySQL
SELECT RAND(), RAND(123); -- 种子随机数
-- Hive
SELECT rand(), rand(seed);
-- Oracle
SELECT DBMS_RANDOM.VALUE FROM dual;哈希抽样函数
-- Hive - 哈希取模
SELECT *
FROM users
WHERE ABS(HASH(user_id)) % 100 < 10; -- 10%样本
-- 改进的哈希
SELECT *
FROM users
WHERE pmod(hash(user_id), 100) < 10;⚠️ 注意事项
1. 抽样代表性
-- 简单随机抽样可能不均匀
-- 分层抽样更适合不均匀数据
-- 检验抽样质量
SELECT category, COUNT(*) as cnt
FROM (SELECT * FROM data TABLESAMPLE (10 PERCENT)) t
GROUP BY category
ORDER BY cnt;2. 性能影响
-- 系统抽样通常比伯努利抽样快
-- 但可能不均匀
-- 大数据量时优先使用TABLESAMPLE3. JOIN时抽样
-- ClickHouse 协调抽样
SELECT *
FROM orders SAMPLE 0.1
JOIN items SAMPLE 0.1 ON orders.id = items.order_id;
-- Hive 需要手动处理
WITH orders_sample AS (
SELECT * FROM orders TABLESAMPLE (10 PERCENT)
),
items_sample AS (
SELECT * FROM items TABLESAMPLE (10 PERCENT)
)
SELECT * FROM orders_sample o
JOIN items_sample i ON o.id = i.order_id;📝 练习题
-- 建表
CREATE TABLE large_logs (
log_id BIGINT,
user_id BIGINT,
event_type STRING,
event_time TIMESTAMP
);要求:
- 抽取10000条日志样本
- 按用户ID取10%的用户样本
- 抽取1%数据统计各事件类型分布
- 使用哈希抽样确保样本可重复
-- 参考答案(Hive)
-- 1. 抽取固定行数
SELECT * FROM large_logs TABLESAMPLE (10000 ROWS);
-- 2. 按用户抽样(哈希)
SELECT *
FROM large_logs
WHERE ABS(HASH(user_id)) % 100 < 10;
-- 3. 1%抽样统计
SELECT
event_type,
COUNT(*) AS event_count
FROM (SELECT * FROM large_logs TABLESAMPLE (1 PERCENT)) t
GROUP BY event_type
ORDER BY event_count DESC;
-- 4. 哈希抽样(可重复)
SELECT *
FROM large_logs
WHERE pmod(hash(concat(user_id, '2024')), 1000) < 100; -- 10%样本
順子の杂货铺


