分区表查询
分区表是大数据平台的核心优化技术,通过分区裁剪大幅提升查询性能。
📊 支持情况
| 数据库 | 分区类型 | 自动分区 | 静态分区 | 动态分区 | 备注 |
|---|---|---|---|---|---|
| MySQL | LIST/RANGE | ✅ | ✅ | ✅ | 分区表 |
| Oracle | LIST/RANGE/HASH | ✅ | ✅ | – | 自动管理 |
| ClickHouse | RANGE/LIST | ✅ | ✅ | ✅ | 按月/日 |
| Hologres | LIST | ✅ | – | ✅ | 自动分区 |
| MaxCompute | LIST | ✅ | – | ✅ | 按日期 |
| Hive | LIST/RANGE/HASH | ✅ | ✅ | ✅ | 完整支持 |
1. 分区表创建
Hive 分区表
-- 按日期分区
CREATE TABLE sales_partitioned (
order_id BIGINT,
user_id BIGINT,
amount DECIMAL(10,2),
status STRING
)
PARTITIONED BY (dt STRING)
STORED AS ORC;
-- 按多字段分区
CREATE TABLE user_behavior (
user_id BIGINT,
event_type STRING,
event_time TIMESTAMP
)
PARTITIONED BY (year STRING, month STRING, day STRING)
STORED AS PARQUET;
MaxCompute 分区表
-- MaxCompute分区表
CREATE TABLE sales_mc (
order_id BIGINT,
user_id BIGINT,
amount DECIMAL(10,2)
)
PARTITIONED BY (dt STRING)
LIFECYCLE 30;
Oracle 分区表
-- 按范围分区
CREATE TABLE sales_range (
order_id NUMBER,
order_date DATE,
amount NUMBER
)
PARTITION BY RANGE (order_date) (
PARTITION p1 VALUES LESS THAN (TO_DATE('2024-02-01', 'YYYY-MM-DD')),
PARTITION p2 VALUES LESS THAN (TO_DATE('2024-03-01', 'YYYY-MM-DD')),
PARTITION pmax VALUES LESS THAN (MAXVALUE)
);
-- 按列表分区
CREATE TABLE sales_list (
order_id NUMBER,
region VARCHAR2(20),
amount NUMBER
)
PARTITION BY LIST (region) (
PARTITION p_east VALUES ('华东', '华南'),
PARTITION p_west VALUES ('西北', '西南'),
PARTITION p_default VALUES (DEFAULT)
);
2. 分区裁剪
静态分区查询
-- Hive - 指定分区查询
SELECT * FROM sales_partitioned
WHERE dt = '2024-01-14';
-- 查询多个分区
SELECT * FROM sales_partitioned
WHERE dt IN ('2024-01-14', '2024-01-15', '2024-01-16');
-- 分区范围查询
SELECT * FROM sales_partitioned
WHERE dt >= '2024-01-01' AND dt < '2024-01-15';
动态分区
-- Hive - 启用动态分区
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
-- 动态插入分区
INSERT OVERWRITE TABLE sales_partitioned PARTITION (dt)
SELECT
order_id,
user_id,
amount,
status,
TO_DATE(order_time) AS dt
FROM source_orders;
查看分区信息
-- Hive
SHOW PARTITIONS sales_partitioned;
DESCRIBE FORMATTED sales_partitioned;
-- Oracle
SELECT partition_name, high_value
FROM user_tab_partitions
WHERE table_name = 'SALES_RANGE';
3. 实战案例
场景1:增量数据加载
-- Hive - 增量导入
INSERT OVERWRITE TABLE sales_partitioned PARTITION (dt='2024-01-14')
SELECT
order_id,
user_id,
amount,
status
FROM source_orders
WHERE TO_DATE(order_time) = '2024-01-14';
场景2:多级分区查询
-- 查询指定年月的所有数据
SELECT *
FROM user_behavior
WHERE year = '2024' AND month = '01';
-- 查询某季度数据
SELECT *
FROM user_behavior
WHERE year = '2024' AND month IN ('01', '02', '03');
场景3:分区元数据查询
-- MySQL - 查看分区信息
SELECT
PARTITION_NAME,
TABLE_ROWS,
DATA_LENGTH,
INDEX_LENGTH
FROM INFORMATION_SCHEMA.PARTITIONS
WHERE TABLE_NAME = 'orders';
场景4:分区统计分析
-- Hive - 各分区数据量
SELECT
dt,
COUNT(*) AS row_count,
SUM(amount) AS total_amount
FROM sales_partitioned
GROUP BY dt
ORDER BY dt;
-- 查看分区统计信息
DESCRIBE FORMATTED sales_partitioned PARTITION (dt='2024-01-14');
4. 分区维护
添加分区
-- Hive
ALTER TABLE sales_partitioned ADD PARTITION (dt='2024-01-20');
-- Oracle
ALTER TABLE sales_range ADD PARTITION p3
VALUES LESS THAN (TO_DATE('2024-04-01', 'YYYY-MM-DD'));
删除分区
-- Hive
ALTER TABLE sales_partitioned DROP PARTITION (dt='2024-01-14');
-- 删除分区并删除数据
ALTER TABLE sales_partitioned DROP PARTITION (dt='2024-01-14')
PURGE;
-- Oracle
ALTER TABLE sales_range DROP PARTITION p1;
修复分区
-- Hive - 修复分区(元数据与实际一致)
MSCK REPAIR TABLE sales_partitioned;
-- 添加丢失的分区
ALTER TABLE sales_partitioned ADD PARTITION (dt='2024-01-21');
5. 分区优化
合理设计分区
-- ✅ 按查询频率高的字段分区
-- 常用查询:按日期查询
CREATE TABLE sales (
...
)
PARTITIONED BY (dt STRING, hour STRING)
CLUSTERED BY (user_id) INTO 32 BUCKETS; -- 哈希分桶
-- ✅ 分区粒度适中
-- 粒度太细:元数据膨胀
-- 粒度太粗:扫描数据量大
分区裁剪验证
-- 查看执行计划,确认分区裁剪
EXPLAIN
SELECT * FROM sales_partitioned
WHERE dt = '2024-01-14';
-- 预期看到:partition predicates: dt=2024-01-14
⚠️ 注意事项
1. 全表扫描
-- ❌ 没有分区条件,会扫描所有分区
SELECT * FROM sales_partitioned;
-- ✅ 使用分区条件
SELECT * FROM sales_partitioned WHERE dt = '2024-01-14';
2. 分区字段类型
-- ✅ 使用字符串类型存储日期
PARTITIONED BY (dt STRING)
-- 日期函数转换可能导致分区裁剪失效
-- ❌ 错误
SELECT * FROM sales WHERE TO_DATE(dt) = '2024-01-14';
-- ✅ 正确
SELECT * FROM sales WHERE dt = '2024-01-14';
3. 分区数量限制
-- Hive 默认分区数限制
SET hive.exec.max.dynamic.partitions=1000;
SET hive.exec.max.dynamic.partitions.pernode=100;
📝 练习题
-- 建表
CREATE TABLE orders_partition (
order_id INT,
user_id INT,
amount DECIMAL(10,2),
status STRING,
order_time DATETIME
)
PARTITIONED BY (dt STRING);
要求:
- 向分区表中插入2024-01-14的数据
- 查询该分区的订单数
- 删除过期分区(7天前)
- 统计各分区的销售额
-- 参考答案(Hive)
-- 1. 插入分区数据
INSERT OVERWRITE TABLE orders_partition PARTITION (dt='2024-01-14')
SELECT
order_id,
user_id,
amount,
status,
order_time
FROM source_orders
WHERE TO_DATE(order_time) = '2024-01-14';
-- 2. 查询分区数据
SELECT COUNT(*) AS order_count
FROM orders_partition
WHERE dt = '2024-01-14';
-- 3. 删除过期分区
ALTER TABLE orders_partition DROP PARTITION (dt=DATE_SUB(CURDATE(), 7));
-- 4. 分区统计
SELECT
dt,
COUNT(*) AS order_count,
SUM(amount) AS total_sales
FROM orders_partition
GROUP BY dt
ORDER BY dt;
順子の杂货铺


