字符串处理函数详解
字符串处理是SQL中最常用的操作之一,本篇涵盖截取、拼接、替换、转换等常见场景。
📊 支持情况
| 数据库 | SUBSTR | CONCAT | TRIM | REPLACE | SPLIT |
|---|---|---|---|---|---|
| MySQL | ✅ | ✅ | ✅ | ✅ | ✅ |
| Oracle | ✅ | ✅ | ✅ | ✅ | ✅ |
| ClickHouse | ✅ | ✅ | ✅ | ✅ | ✅ |
| Hologres | ✅ | ✅ | ✅ | ✅ | ✅ |
| MaxCompute | ✅ | ✅ | ✅ | ✅ | ✅ |
| Hive | ✅ | ✅ | ✅ | ✅ | ✅ |
1. 字符串截取
SUBSTR / SUBSTRING
MySQL
-- 从位置开始截取(1开始)
SELECT SUBSTR('Hello World', 1, 5); -- Hello
SELECT SUBSTRING('Hello World', 7); -- World
-- 从右边开始
SELECT SUBSTR('Hello World', -5); -- WorldOracle
-- Oracle从1开始
SELECT SUBSTR('Hello World', 1, 5) FROM dual; -- Hello
SELECT SUBSTR('Hello World', 7) FROM dual; -- World
SELECT SUBSTR('Hello World', -5) FROM dual; -- WorldClickHouse
SELECT substr('Hello World', 1, 5); -- Hello
SELECT substring('Hello World', 7); -- WorldHive / MaxCompute / Hologres
SELECT substr('Hello World', 1, 5); -- Hello
SELECT substring('Hello World', 7); -- WorldLEFT / RIGHT
-- MySQL / Hive / MaxCompute / Hologres
SELECT LEFT('Hello World', 5); -- Hello
SELECT RIGHT('Hello World', 5); -- World
-- Oracle
SELECT SUBSTR('Hello World', 1, 5) FROM dual;
SELECT SUBSTR('Hello World', -5) FROM dual;MID / CHARINDEX(Oracle)
-- Oracle特定函数
SELECT INSTR('Hello World', 'World') FROM dual; -- 7
SELECT SUBSTR('Hello World', INSTR('Hello World', ' ')) FROM dual; -- World2. 字符串拼接
CONCAT
-- MySQL / Hive / MaxCompute / Hologres
SELECT CONCAT('Hello', ' ', 'World'); -- Hello World
-- 多列拼接
SELECT CONCAT(first_name, ' ', last_name) AS full_name
FROM users;CONCAT_WS
-- 带分隔符的拼接
SELECT CONCAT_WS('-', '2024', '01', '14'); -- 2024-01-14
-- 多列拼接
SELECT CONCAT_WS(',', name, email, phone) AS contact_info
FROM users;Oracle CONCAT ||
-- Oracle使用 || 拼接
SELECT 'Hello' || ' ' || 'World' FROM dual; -- Hello World
-- CONCAT函数(Oracle最多2个参数)
SELECT CONCAT(CONCAT('Hello', ' '), 'World') FROM dual;ClickHouse
SELECT concat('Hello', ' ', 'World'); -- Hello World
SELECT 'Hello' || ' ' || 'World'; -- Hello WorldGROUP_CONCAT(MySQL)
-- 将多行拼接成一行
SELECT
dept_id,
GROUP_CONCAT(employee_name SEPARATOR ', ') AS employees
FROM employees
GROUP BY dept_id;
-- 带排序
SELECT
dept_id,
GROUP_CONCAT(employee_name ORDER BY salary DESC SEPARATOR ', ') AS employees
FROM employees
GROUP BY dept_id;WM_CONCAT(Oracle / Hive)
-- Oracle
SELECT dept_id, WM_CONCAT(',', employee_name) FROM employees GROUP BY dept_id;
-- Hive
SELECT dept_id, concat_ws(',', collect_set(employee_name)) FROM employees GROUP BY dept_id;3. 去除空白
TRIM 系列
-- 去除两端空格
SELECT TRIM(' Hello '); -- Hello
-- 去除左侧空格
SELECT LTRIM(' Hello'); -- Hello
-- 去除右侧空格
SELECT RTRIM('Hello '); -- Hello自定义去除字符
-- MySQL / Hive
SELECT TRIM(BOTH ',' FROM ',Hello,'); -- Hello
-- Oracle
SELECT TRIM(',' FROM ',Hello,') FROM dual; -- Hello4. 大小写转换
-- 转大写
SELECT UPPER('Hello'); -- HELLO
-- 转小写
SELECT LOWER('HELLO'); -- hello
-- 首字母大写
SELECT CONCAT(UPPER(SUBSTR(name, 1, 1)), LOWER(SUBSTR(name, 2))) AS name
FROM users;5. 字符串替换
REPLACE
-- MySQL / Hive / MaxCompute / Hologres
SELECT REPLACE('Hello World', 'World', 'SQL'); -- Hello SQL
-- 去除特定字符
SELECT REPLACE(phone, '-', '') FROM users;
-- Oracle
SELECT REPLACE('Hello World', 'World', 'SQL') FROM dual;REGEXP_REPLACE(正则替换)
-- MySQL 8.0+
SELECT REGEXP_REPLACE('Hello123World', '[0-9]', ''); -- HelloWorld
-- Oracle
SELECT REGEXP_REPLACE('Hello123World', '[0-9]', '') FROM dual;
-- Hive
SELECT regexp_replace('Hello123World', '[0-9]', '');ClickHouse
SELECT replaceAll('Hello123World', '[0-9]', ''); -- 使用正则
SELECT replace('Hello123World', '123', ''); -- 普通替换6. 字符串查找
INSTR / CHARINDEX
-- MySQL
SELECT INSTR('Hello World', 'World'); -- 7
SELECT INSTR('Hello World', 'SQL'); -- 0(未找到)
-- Oracle
SELECT INSTR('Hello World', 'World') FROM dual; -- 7
-- Hive / MaxCompute
SELECT instr('Hello World', 'World'); -- 7POSITION
-- 标准SQL
SELECT POSITION('World' IN 'Hello World'); -- 7
-- ClickHouse
SELECT position('Hello World', 'World'); -- 7LIKE 模糊匹配
-- 包含
SELECT * FROM users WHERE name LIKE '%张%';
-- 开头
SELECT * FROM users WHERE email LIKE 'admin%';
-- 结尾
SELECT * FROM users WHERE phone LIKE '%1234';
-- 长度匹配
SELECT * FROM users WHERE LENGTH(name) = 3;7. 字符串分割
SPLIT
-- Hive / MaxCompute / Hologres
SELECT split('a,b,c,d', ','); -- 返回数组 ["a","b","c","d"]
-- 取分割后的第1个元素
SELECT split('a,b,c,d', ',')[0]; -- a
SELECT split('a,b,c,d', ',')[1]; -- b
-- ClickHouse
SELECT splitByString(',', 'a,b,c,d');
-- MySQL 8.0+
SELECT SUBSTRING_INDEX('a,b,c,d', ',', 1); -- a
SELECT SUBSTRING_INDEX(SUBSTRING_INDEX('a,b,c,d', ',', 2), ',', -1); -- bREGEXP_EXTRACT
-- Hive / MaxCompute
SELECT regexp_extract('user123@example.com', '([a-zA-Z]+)', 1); -- user
SELECT regexp_extract('price: $100', '\\$(\\d+)', 1); -- 100
-- Oracle
SELECT REGEXP_SUBSTR('user123@example.com', '[a-zA-Z]+') FROM dual;
-- MySQL 8.0+
SELECT REGEXP_SUBSTR('user123@example.com', '[a-zA-Z]+'); -- user8. 格式化函数
LPAD / RPAD
-- 左侧补齐
SELECT LPAD('123', 6, '0'); -- 000123
SELECT LPAD('AB', 5, '*'); -- ***AB
-- 右侧补齐
SELECT RPAD('123', 6, '0'); -- 123000
SELECT RPAD('AB', 5, '*'); -- AB***REPEAT
SELECT REPEAT('*', 10); -- **********REVERSE
SELECT REVERSE('Hello'); -- olleH9. 长度计算
-- 字符长度
SELECT LENGTH('Hello'); -- 5(MySQL/Hive/Oracle返回字符数)
SELECT CHAR_LENGTH('Hello'); -- 5(字符数)
SELECT CHARACTER_LENGTH('Hello'); -- 标准SQL
-- 字节长度
SELECT OCTET_LENGTH('Hello'); -- 5(字节数,中文不同)
-- 中文处理
SELECT LENGTH('你好'); -- MySQL: 6(UTF-8占3字节)
SELECT LENGTH('你好'); -- Oracle: 2(按字符)10. 实战案例
场景1:手机号脱敏
-- MySQL / Hive / MaxCompute
SELECT
phone,
CONCAT(
SUBSTR(phone, 1, 3),
'****',
SUBSTR(phone, 8, 4)
) AS phone_masked
FROM users;场景2:邮箱提取用户名
SELECT
email,
SUBSTRING_INDEX(email, '@', 1) AS username
FROM users;场景3:解析URL
-- Hive
SELECT
url,
parse_url(url, 'HOST') AS host,
parse_url(url, 'PATH') AS path,
parse_url(url, 'QUERY', 'id') AS query_id
FROM web_logs;场景4:数据清洗
SELECT
TRIM(UPPER(REPLACE(REPLACE(name, '\t', ''), '\n', ''))) AS clean_name
FROM raw_data;⚠️ 注意事项
| 数据库 | 字符串截取起始位置 | 特殊说明 |
|---|---|---|
| MySQL | 1 | SUBSTR和SUBSTRING等价 |
| Oracle | 1 | SUBSTR参数从1开始 |
| Hive | 1 | |
| MaxCompute | 1 | |
| ClickHouse | 1 | |
| PostgreSQL | 1 |
📝 练习题
-- 建表
CREATE TABLE user_profiles (
user_id INT,
name STRING,
email STRING,
phone STRING,
address STRING
);
-- 插入数据
INSERT INTO user_profiles VALUES
(1, '张三', 'zhangsan@email.com', '13812345678', '北京市朝阳区'),
(2, '李四', 'lisi@email.com', '13987654321', ' 上海市浦东新区 ');要求:
- 提取邮箱@前的用户名
- 手机号中间四位脱敏
- 去除地址两端空格
- 姓名转大写
-- 参考答案
SELECT
user_id,
UPPER(name) AS name_upper,
SUBSTRING_INDEX(email, '@', 1) AS username,
CONCAT(SUBSTR(phone, 1, 3), '****', SUBSTR(phone, 8, 4)) AS phone_masked,
TRIM(address) AS address_clean
FROM user_profiles;
順子の杂货铺


