32 KiB
32 KiB
MySQL GROUP BY 语法使用文档
目录
GROUP BY 基础语法
GROUP BY 用于将查询结果按指定字段进行分组,通常与聚合函数一起使用。
SELECT column1, aggregate_function(column2)
FROM table_name
[WHERE condition]
GROUP BY column1, column2, ...
[HAVING group_condition]
[ORDER BY column]
[LIMIT number];
执行顺序:
- FROM - 确定数据源
- WHERE - 过滤原始数据
- GROUP BY - 分组
- 聚合函数计算
- HAVING - 过滤分组结果
- SELECT - 选择输出列
- ORDER BY - 排序
- LIMIT - 限制结果数量
GROUP BY 规则和特性
-
SELECT 列限制:SELECT 子句中只能包含:
- GROUP BY 子句中的列
- 聚合函数
- 常量
-
NULL 值处理:NULL 值被视为一组
-
聚合函数:
- COUNT():计数
- SUM():求和
- AVG():平均值
- MAX():最大值
- MIN():最小值
- GROUP_CONCAT():字符串连接
-
HAVING vs WHERE:
- WHERE:过滤原始行
- HAVING:过滤分组结果
示例数据准备
创建示例表
-- 销售订单表
CREATE TABLE sales_orders (
order_id INT PRIMARY KEY AUTO_INCREMENT,
customer_id INT,
customer_name VARCHAR(50),
product_id INT,
product_name VARCHAR(100),
category VARCHAR(50),
order_date DATE,
quantity INT,
unit_price DECIMAL(10,2),
sales_rep VARCHAR(50),
region VARCHAR(30),
discount_rate DECIMAL(3,2) DEFAULT 0.00
);
-- 员工表
CREATE TABLE employees (
emp_id INT PRIMARY KEY,
name VARCHAR(50),
department VARCHAR(30),
position VARCHAR(50),
salary DECIMAL(10,2),
hire_date DATE,
manager_id INT,
age INT,
gender ENUM('M', 'F'),
city VARCHAR(30)
);
-- 学生成绩表
CREATE TABLE student_scores (
student_id INT,
student_name VARCHAR(50),
subject VARCHAR(30),
score INT,
exam_date DATE,
teacher VARCHAR(30),
class_id INT,
semester VARCHAR(20),
PRIMARY KEY (student_id, subject, exam_date)
);
-- 网站访问日志表
CREATE TABLE web_logs (
log_id INT PRIMARY KEY AUTO_INCREMENT,
user_id INT,
page_url VARCHAR(200),
visit_date DATE,
visit_time TIME,
session_duration INT, -- 会话时长(分钟)
device_type VARCHAR(20),
browser VARCHAR(30),
country VARCHAR(30),
page_views INT DEFAULT 1
);
-- 库存表
CREATE TABLE inventory (
product_id INT PRIMARY KEY,
product_name VARCHAR(100),
category VARCHAR(50),
supplier VARCHAR(50),
stock_quantity INT,
reorder_level INT,
unit_cost DECIMAL(8,2),
last_restock_date DATE,
warehouse_location VARCHAR(30)
);
插入示例数据
-- 插入销售订单数据
INSERT INTO sales_orders (customer_id, customer_name, product_id, product_name, category, order_date, quantity, unit_price, sales_rep, region, discount_rate) VALUES
(1, '阿里巴巴', 101, 'MacBook Pro', '笔记本', '2024-01-15', 10, 12999.00, '张三', '华东', 0.05),
(1, '阿里巴巴', 102, 'iPhone 15', '手机', '2024-01-16', 50, 5999.00, '张三', '华东', 0.03),
(2, '腾讯', 103, 'iPad Air', '平板', '2024-01-20', 20, 3999.00, '李四', '华南', 0.04),
(2, '腾讯', 101, 'MacBook Pro', '笔记本', '2024-01-22', 15, 12999.00, '李四', '华南', 0.06),
(3, '百度', 104, 'AirPods Pro', '耳机', '2024-01-25', 100, 1999.00, '王五', '华北', 0.08),
(3, '百度', 102, 'iPhone 15', '手机', '2024-01-28', 30, 5999.00, '王五', '华北', 0.05),
(4, '字节跳动', 105, 'Apple Watch', '智能手表', '2024-02-01', 25, 2999.00, '赵六', '华北', 0.10),
(4, '字节跳动', 103, 'iPad Air', '平板', '2024-02-03', 35, 3999.00, '赵六', '华北', 0.07),
(5, '美团', 106, 'MacBook Air', '笔记本', '2024-02-05', 12, 8999.00, '钱七', '华东', 0.04),
(5, '美团', 104, 'AirPods Pro', '耳机', '2024-02-08', 80, 1999.00, '钱七', '华东', 0.06),
(6, '滴滴', 107, 'iMac', '台式机', '2024-02-10', 8, 15999.00, '孙八', '华南', 0.03),
(6, '滴滴', 102, 'iPhone 15', '手机', '2024-02-12', 40, 5999.00, '孙八', '华南', 0.04),
(7, '小米', 108, 'iPad Pro', '平板', '2024-02-15', 18, 7999.00, '周九', '华北', 0.05),
(7, '小米', 105, 'Apple Watch', '智能手表', '2024-02-18', 30, 2999.00, '周九', '华北', 0.12),
(8, '华为', 109, 'Studio Display', '显示器', '2024-02-20', 6, 11999.00, '吴十', '华南', 0.02);
-- 插入员工数据
INSERT INTO employees (emp_id, name, department, position, salary, hire_date, manager_id, age, gender, city) VALUES
(1001, '张经理', '技术部', '部门经理', 25000.00, '2020-01-15', NULL, 35, 'M', '北京'),
(1002, '李架构师', '技术部', '高级架构师', 22000.00, '2020-05-20', 1001, 32, 'M', '北京'),
(1003, '王工程师', '技术部', '高级工程师', 18000.00, '2021-03-10', 1001, 28, 'F', '北京'),
(1004, '赵工程师', '技术部', '中级工程师', 15000.00, '2022-01-15', 1001, 26, 'M', '北京'),
(1005, '钱实习生', '技术部', '实习工程师', 8000.00, '2023-07-01', 1002, 23, 'F', '北京'),
(2001, '孙经理', '产品部', '产品经理', 20000.00, '2021-02-01', NULL, 30, 'F', '上海'),
(2002, '周产品', '产品部', '高级产品', 16000.00, '2021-08-15', 2001, 27, 'M', '上海'),
(2003, '吴助理', '产品部', '产品助理', 12000.00, '2022-06-20', 2001, 25, 'F', '上海'),
(3001, '郑经理', '销售部', '销售经理', 18000.00, '2020-11-10', NULL, 33, 'M', '深圳'),
(3002, '刘销售', '销售部', '高级销售', 14000.00, '2021-09-25', 3001, 29, 'F', '深圳'),
(3003, '陈销售', '销售部', '销售代表', 10000.00, '2022-12-01', 3001, 24, 'M', '深圳'),
(4001, '林经理', '人事部', 'HR经理', 16000.00, '2021-04-12', NULL, 31, 'F', '广州'),
(4002, '黄专员', '人事部', 'HR专员', 11000.00, '2022-08-30', 4001, 26, 'F', '广州'),
(5001, '何经理', '财务部', '财务经理', 19000.00, '2020-07-08', NULL, 34, 'M', '杭州'),
(5002, '魏会计', '财务部', '会计', 13000.00, '2021-11-20', 5001, 28, 'F', '杭州');
-- 插入学生成绩数据
INSERT INTO student_scores (student_id, student_name, subject, score, exam_date, teacher, class_id, semester) VALUES
(1, '张小明', '数学', 85, '2024-01-15', '王老师', 1, '2024春'),
(1, '张小明', '语文', 78, '2024-01-16', '李老师', 1, '2024春'),
(1, '张小明', '英语', 92, '2024-01-17', '赵老师', 1, '2024春'),
(2, '李小红', '数学', 92, '2024-01-15', '王老师', 1, '2024春'),
(2, '李小红', '语文', 88, '2024-01-16', '李老师', 1, '2024春'),
(2, '李小红', '英语', 95, '2024-01-17', '赵老师', 1, '2024春'),
(3, '王小刚', '数学', 76, '2024-01-15', '王老师', 2, '2024春'),
(3, '王小刚', '语文', 82, '2024-01-16', '李老师', 2, '2024春'),
(3, '王小刚', '英语', 79, '2024-01-17', '赵老师', 2, '2024春'),
(4, '赵小丽', '数学', 88, '2024-01-15', '王老师', 2, '2024春'),
(4, '赵小丽', '语文', 85, '2024-01-16', '李老师', 2, '2024春'),
(4, '赵小丽', '英语', 90, '2024-01-17', '赵老师', 2, '2024春'),
(5, '钱小伟', '数学', 90, '2024-01-15', '钱老师', 3, '2024春'),
(5, '钱小伟', '语文', 87, '2024-01-16', '孙老师', 3, '2024春'),
(5, '钱小伟', '英语', 93, '2024-01-17', '周老师', 3, '2024春'),
-- 添加期中考试成绩
(1, '张小明', '数学', 88, '2024-03-15', '王老师', 1, '2024春'),
(1, '张小明', '语文', 82, '2024-03-16', '李老师', 1, '2024春'),
(2, '李小红', '数学', 95, '2024-03-15', '王老师', 1, '2024春'),
(2, '李小红', '语文', 90, '2024-03-16', '李老师', 1, '2024春'),
(3, '王小刚', '数学', 80, '2024-03-15', '王老师', 2, '2024春');
-- 插入网站访问日志数据
INSERT INTO web_logs (user_id, page_url, visit_date, visit_time, session_duration, device_type, browser, country, page_views) VALUES
(1001, '/home', '2024-01-15', '09:30:00', 25, 'Desktop', 'Chrome', '中国', 5),
(1001, '/products', '2024-01-15', '10:15:00', 15, 'Desktop', 'Chrome', '中国', 8),
(1002, '/home', '2024-01-15', '14:20:00', 30, 'Mobile', 'Safari', '美国', 3),
(1003, '/about', '2024-01-16', '11:45:00', 20, 'Tablet', 'Chrome', '英国', 4),
(1001, '/checkout', '2024-01-16', '16:30:00', 45, 'Desktop', 'Chrome', '中国', 2),
(1004, '/home', '2024-01-17', '08:15:00', 35, 'Mobile', 'Firefox', '日本', 6),
(1002, '/products', '2024-01-17', '13:20:00', 28, 'Desktop', 'Edge', '美国', 7),
(1005, '/contact', '2024-01-18', '10:40:00', 12, 'Mobile', 'Safari', '加拿大', 2),
(1003, '/home', '2024-01-18', '15:25:00', 22, 'Desktop', 'Chrome', '英国', 4),
(1006, '/products', '2024-01-19', '09:10:00', 40, 'Tablet', 'Safari', '澳大利亚', 9),
(1001, '/home', '2024-01-19', '14:35:00', 18, 'Mobile', 'Chrome', '中国', 3),
(1007, '/login', '2024-01-20', '11:20:00', 8, 'Desktop', 'Firefox', '德国', 1),
(1002, '/dashboard', '2024-01-20', '16:45:00', 55, 'Desktop', 'Chrome', '美国', 12);
-- 插入库存数据
INSERT INTO inventory (product_id, product_name, category, supplier, stock_quantity, reorder_level, unit_cost, last_restock_date, warehouse_location) VALUES
(101, 'MacBook Pro', '笔记本', 'Apple', 45, 20, 10000.00, '2024-01-10', '北京仓库'),
(102, 'iPhone 15', '手机', 'Apple', 120, 50, 4500.00, '2024-01-12', '上海仓库'),
(103, 'iPad Air', '平板', 'Apple', 80, 30, 3200.00, '2024-01-08', '深圳仓库'),
(104, 'AirPods Pro', '耳机', 'Apple', 200, 80, 1500.00, '2024-01-15', '北京仓库'),
(105, 'Apple Watch', '智能手表', 'Apple', 60, 25, 2400.00, '2024-01-05', '广州仓库'),
(106, 'MacBook Air', '笔记本', 'Apple', 35, 15, 7200.00, '2024-01-20', '上海仓库'),
(107, 'iMac', '台式机', 'Apple', 15, 10, 12800.00, '2024-01-18', '深圳仓库'),
(108, 'iPad Pro', '平板', 'Apple', 25, 15, 6400.00, '2024-01-22', '北京仓库'),
(109, 'Studio Display', '显示器', 'Apple', 12, 8, 9600.00, '2024-01-25', '广州仓库'),
(110, 'Mac Mini', '台式机', 'Apple', 30, 12, 4800.00, '2024-01-28', '杭州仓库');
基础分组查询
1. 简单分组统计
-- 按产品类别统计销售数量
SELECT category, SUM(quantity) AS total_quantity
FROM sales_orders
GROUP BY category;
结果:
+----------+----------------+
| category | total_quantity |
+----------+----------------+
| 笔记本 | 37 |
| 手机 | 120 |
| 平板 | 73 |
| 耳机 | 180 |
| 智能手表 | 55 |
| 台式机 | 8 |
| 显示器 | 6 |
+----------+----------------+
2. 按部门统计员工信息
-- 按部门统计员工数量和平均薪资
SELECT department,
COUNT(*) AS emp_count,
ROUND(AVG(salary), 2) AS avg_salary,
MIN(salary) AS min_salary,
MAX(salary) AS max_salary
FROM employees
GROUP BY department;
结果:
+----------+-----------+------------+------------+------------+
| department | emp_count | avg_salary | min_salary | max_salary |
+----------+-----------+------------+------------+------------+
| 技术部 | 5 | 17600.00 | 8000.00 | 25000.00 |
| 产品部 | 3 | 16000.00 | 12000.00 | 20000.00 |
| 销售部 | 3 | 14000.00 | 10000.00 | 18000.00 |
| 人事部 | 2 | 13500.00 | 11000.00 | 16000.00 |
| 财务部 | 2 | 16000.00 | 13000.00 | 19000.00 |
+----------+-----------+------------+------------+------------+
3. 按日期分组统计
-- 按月份统计订单数量和销售额
SELECT DATE_FORMAT(order_date, '%Y-%m') AS order_month,
COUNT(*) AS order_count,
SUM(quantity * unit_price * (1 - discount_rate)) AS total_sales
FROM sales_orders
GROUP BY DATE_FORMAT(order_date, '%Y-%m')
ORDER BY order_month;
结果:
+-------------+-------------+-------------+
| order_month | order_count | total_sales |
+-------------+-------------+-------------+
| 2024-01 | 6 | 912393.00 |
| 2024-02 | 9 | 809328.40 |
+-------------+-------------+-------------+
聚合函数详解
1. COUNT() 函数详解
-- COUNT 的不同用法
SELECT
COUNT(*) AS total_rows, -- 总行数
COUNT(manager_id) AS has_manager, -- 非NULL值数量
COUNT(DISTINCT department) AS dept_count, -- 去重计数
COUNT(CASE WHEN salary > 15000 THEN 1 END) AS high_salary_count -- 条件计数
FROM employees;
结果:
+------------+-------------+------------+-------------------+
| total_rows | has_manager | dept_count | high_salary_count |
+------------+-------------+------------+-------------------+
| 15 | 10 | 5 | 8 |
+------------+-------------+------------+-------------------+
2. SUM() 和 AVG() 函数
-- 按销售代表统计业绩
SELECT sales_rep,
COUNT(*) AS order_count,
SUM(quantity) AS total_quantity,
SUM(quantity * unit_price * (1 - discount_rate)) AS total_sales,
ROUND(AVG(quantity * unit_price * (1 - discount_rate)), 2) AS avg_order_value
FROM sales_orders
GROUP BY sales_rep
ORDER BY total_sales DESC;
结果:
+----------+-------------+----------------+-------------+-----------------+
| sales_rep | order_count | total_quantity | total_sales | avg_order_value |
+----------+-------------+----------------+-------------+-----------------+
| 张三 | 2 | 60 | 423447.00 | 211723.50 |
| 李四 | 2 | 35 | 318969.00 | 159484.50 |
| 王五 | 2 | 130 | 344562.00 | 172281.00 |
| 赵六 | 2 | 60 | 206187.00 | 103093.50 |
| 钱七 | 2 | 92 | 258291.60 | 129145.80 |
| 孙八 | 2 | 48 | 355464.00 | 177732.00 |
| 周九 | 2 | 48 | 222777.60 | 111388.80 |
| 吴十 | 1 | 6 | 70622.80 | 70622.80 |
+----------+-------------+----------------+-------------+-----------------+
3. MAX() 和 MIN() 函数
-- 按班级统计成绩分布
SELECT class_id,
COUNT(*) AS student_count,
MAX(score) AS highest_score,
MIN(score) AS lowest_score,
ROUND(AVG(score), 2) AS avg_score,
MAX(score) - MIN(score) AS score_range
FROM student_scores
GROUP BY class_id
ORDER BY class_id;
结果:
+----------+---------------+---------------+--------------+-----------+-------------+
| class_id | student_count | highest_score | lowest_score | avg_score | score_range |
+----------+---------------+---------------+--------------+-----------+-------------+
| 1 | 8 | 95 | 78 | 86.75 | 17 |
| 2 | 7 | 90 | 76 | 82.71 | 14 |
| 3 | 3 | 93 | 87 | 90.00 | 6 |
+----------+---------------+---------------+--------------+-----------+-------------+
4. GROUP_CONCAT() 函数
-- 按部门列出所有员工姓名
SELECT department,
COUNT(*) AS emp_count,
GROUP_CONCAT(name ORDER BY salary DESC) AS employees,
GROUP_CONCAT(DISTINCT city) AS cities
FROM employees
GROUP BY department;
结果:
+----------+-----------+------------------------------------------+----------+
| department | emp_count | employees | cities |
+----------+-----------+------------------------------------------+----------+
| 技术部 | 5 | 张经理,李架构师,王工程师,赵工程师,钱实习生 | 北京 |
| 产品部 | 3 | 孙经理,周产品,吴助理 | 上海 |
| 销售部 | 3 | 郑经理,刘销售,陈销售 | 深圳 |
| 人事部 | 2 | 林经理,黄专员 | 广州 |
| 财务部 | 2 | 何经理,魏会计 | 杭州 |
+----------+-----------+------------------------------------------+----------+
HAVING 子句使用
1. HAVING 基础用法
-- 查找员工数量大于2的部门
SELECT department,
COUNT(*) AS emp_count,
ROUND(AVG(salary), 2) AS avg_salary
FROM employees
GROUP BY department
HAVING COUNT(*) > 2
ORDER BY emp_count DESC;
结果:
+----------+-----------+------------+
| department | emp_count | avg_salary |
+----------+-----------+------------+
| 技术部 | 5 | 17600.00 |
| 产品部 | 3 | 16000.00 |
| 销售部 | 3 | 14000.00 |
+----------+-----------+------------+
2. HAVING 与 WHERE 的区别
-- 错误用法:WHERE 不能使用聚合函数
-- SELECT department, COUNT(*) FROM employees WHERE COUNT(*) > 2 GROUP BY department;
-- 正确用法:先过滤再分组
SELECT department,
COUNT(*) AS emp_count,
ROUND(AVG(salary), 2) AS avg_salary
FROM employees
WHERE salary > 12000 -- 先过滤薪资大于12000的员工
GROUP BY department
HAVING COUNT(*) >= 2 -- 再过滤员工数量>=2的部门
ORDER BY avg_salary DESC;
结果:
+----------+-----------+------------+
| department | emp_count | avg_salary |
+----------+-----------+------------+
| 技术部 | 3 | 21666.67 |
| 产品部 | 2 | 18000.00 |
| 财务部 | 2 | 16000.00 |
+----------+-----------+------------+
3. 复杂 HAVING 条件
-- 查找高价值客户(订单总额>30万且订单数量>1)
SELECT customer_name,
COUNT(*) AS order_count,
SUM(quantity * unit_price * (1 - discount_rate)) AS total_sales,
ROUND(AVG(quantity * unit_price * (1 - discount_rate)), 2) AS avg_order_value
FROM sales_orders
GROUP BY customer_name
HAVING COUNT(*) > 1 AND SUM(quantity * unit_price * (1 - discount_rate)) > 300000
ORDER BY total_sales DESC;
结果:
+--------------+-------------+-------------+-----------------+
| customer_name | order_count | total_sales | avg_order_value |
+--------------+-------------+-------------+-----------------+
| 阿里巴巴 | 2 | 423447.00 | 211723.50 |
| 腾讯 | 2 | 318969.00 | 159484.50 |
| 百度 | 2 | 344562.00 | 172281.00 |
| 滴滴 | 2 | 355464.00 | 177732.00 |
+-------------+-------------+-------------+-----------------+
多列分组查询
1. 双列分组
-- 按地区和销售代表分组统计
SELECT region, sales_rep,
COUNT(*) AS order_count,
SUM(quantity) AS total_quantity,
SUM(quantity * unit_price * (1 - discount_rate)) AS total_sales
FROM sales_orders
GROUP BY region, sales_rep
ORDER BY region, total_sales DESC;
结果:
+--------+-----------+-------------+----------------+-------------+
| region | sales_rep | order_count | total_quantity | total_sales |
+--------+-----------+-------------+----------------+-------------+
| 华北 | 王五 | 2 | 130 | 344562.00 |
| 华北 | 赵六 | 2 | 60 | 206187.00 |
| 华北 | 周九 | 2 | 48 | 222777.60 |
| 华东 | 张三 | 2 | 60 | 423447.00 |
| 华东 | 钱七 | 2 | 92 | 258291.60 |
| 华南 | 李四 | 2 | 35 | 318969.00 |
| 华南 | 孙八 | 2 | 48 | 355464.00 |
| 华南 | 吴十 | 1 | 6 | 70622.80 |
+--------+-----------+-------------+----------------+-------------+
2. 三列分组
-- 按年龄段、性别、部门分组统计员工
SELECT
CASE
WHEN age < 25 THEN '24岁以下'
WHEN age <= 30 THEN '25-30岁'
WHEN age <= 35 THEN '31-35岁'
ELSE '35岁以上'
END AS age_group,
gender,
department,
COUNT(*) AS emp_count,
ROUND(AVG(salary), 2) AS avg_salary
FROM employees
GROUP BY
CASE
WHEN age < 25 THEN '24岁以下'
WHEN age <= 30 THEN '25-30岁'
WHEN age <= 35 THEN '31-35岁'
ELSE '35岁以上'
END,
gender,
department
HAVING COUNT(*) >= 1
ORDER BY age_group, gender, department;
3. 时间维度分组
-- 按年、月、产品类别分组统计
SELECT
YEAR(order_date) AS order_year,
MONTH(order_date) AS order_month,
category,
COUNT(*) AS order_count,
SUM(quantity) AS total_quantity,
ROUND(SUM(quantity * unit_price * (1 - discount_rate)), 2) AS total_sales
FROM sales_orders
GROUP BY YEAR(order_date), MONTH(order_date), category
ORDER BY order_year, order_month, total_sales DESC;
复杂分组统计
1. 嵌套分组查询
-- 查找每个部门薪资最高的员工信息
SELECT e1.*
FROM employees e1
WHERE e1.salary = (
SELECT MAX(e2.salary)
FROM employees e2
WHERE e2.department = e1.department
)
ORDER BY e1.department, e1.salary DESC;
2. 窗口函数与分组
-- 计算每个员工在部门内的薪资排名
SELECT name, department, salary,
ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS dept_rank,
ROUND(salary / AVG(salary) OVER (PARTITION BY department) * 100, 2) AS salary_ratio
FROM employees
ORDER BY department, dept_rank;
3. 数据透视表效果
-- 按设备类型和浏览器统计访问情况
SELECT device_type,
SUM(CASE WHEN browser = 'Chrome' THEN page_views ELSE 0 END) AS Chrome,
SUM(CASE WHEN browser = 'Safari' THEN page_views ELSE 0 END) AS Safari,
SUM(CASE WHEN browser = 'Firefox' THEN page_views ELSE 0 END) AS Firefox,
SUM(CASE WHEN browser = 'Edge' THEN page_views ELSE 0 END) AS Edge,
SUM(page_views) AS Total
FROM web_logs
GROUP BY device_type
ORDER BY Total DESC;
结果:
+-------------+--------+--------+---------+------+-------+
| device_type | Chrome | Safari | Firefox | Edge | Total |
+-------------+--------+--------+---------+------+-------+
| Desktop | 24 | 0 | 1 | 7 | 32 |
| Mobile | 6 | 5 | 6 | 0 | 17 |
| Tablet | 4 | 9 | 0 | 0 | 13 |
+-------------+--------+--------+---------+------+-------+
4. 动态分组条件
-- 按库存状态分组统计产品
SELECT
CASE
WHEN stock_quantity <= reorder_level THEN '需要补货'
WHEN stock_quantity <= reorder_level * 2 THEN '库存偏低'
ELSE '库存充足'
END AS stock_status,
COUNT(*) AS product_count,
SUM(stock_quantity) AS total_stock,
ROUND(AVG(unit_cost), 2) AS avg_cost
FROM inventory
GROUP BY
CASE
WHEN stock_quantity <= reorder_level THEN '需要补货'
WHEN stock_quantity <= reorder_level * 2 THEN '库存偏低'
ELSE '库存充足'
END
ORDER BY
CASE
WHEN stock_status = '需要补货' THEN 1
WHEN stock_status = '库存偏低' THEN 2
ELSE 3
END;
面试题和实际案例
面试题1:连续登录用户分析
题目:找出连续3天都有访问记录的用户。
-- 解答:使用窗口函数和分组
WITH daily_users AS (
SELECT DISTINCT user_id, visit_date
FROM web_logs
),
user_sequences AS (
SELECT user_id, visit_date,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY visit_date) as rn,
DATE_SUB(visit_date, INTERVAL ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY visit_date) DAY) as group_date
FROM daily_users
),
consecutive_groups AS (
SELECT user_id, group_date, COUNT(*) as consecutive_days
FROM user_sequences
GROUP BY user_id, group_date
HAVING COUNT(*) >= 3
)
SELECT DISTINCT cg.user_id
FROM consecutive_groups cg;
面试题2:同比增长率计算
题目:计算每个月的销售额同比增长率。
-- 解答:使用自连接和分组
WITH monthly_sales AS (
SELECT
DATE_FORMAT(order_date, '%Y-%m') as month,
SUM(quantity * unit_price * (1 - discount_rate)) as total_sales
FROM sales_orders
GROUP BY DATE_FORMAT(order_date, '%Y-%m')
)
SELECT
current_month.month,
current_month.total_sales as current_sales,
last_year.total_sales as last_year_sales,
ROUND(
(current_month.total_sales - IFNULL(last_year.total_sales, 0)) /
IFNULL(last_year.total_sales, current_month.total_sales) * 100, 2
) as growth_rate
FROM monthly_sales current_month
LEFT JOIN monthly_sales last_year
ON DATE_FORMAT(DATE_SUB(STR_TO_DATE(CONCAT(current_month.month, '-01'), '%Y-%m-%d'), INTERVAL 1 YEAR), '%Y-%m') = last_year.month
ORDER BY current_month.month;
面试题3:帕累托分析(80/20法则)
题目:找出贡献80%销售额的前20%客户。
-- 解答:使用窗口函数计算累计占比
WITH customer_sales AS (
SELECT customer_name,
SUM(quantity * unit_price * (1 - discount_rate)) as total_sales
FROM sales_orders
GROUP BY customer_name
),
customer_ranking AS (
SELECT customer_name, total_sales,
ROW_NUMBER() OVER (ORDER BY total_sales DESC) as rank_num,
COUNT(*) OVER () as total_customers,
SUM(total_sales) OVER () as grand_total,
SUM(total_sales) OVER (ORDER BY total_sales DESC) as cumulative_sales
FROM customer_sales
),
customer_contribution AS (
SELECT *,
ROUND(rank_num / total_customers * 100, 2) as customer_percentile,
ROUND(cumulative_sales / grand_total * 100, 2) as sales_percentile
FROM customer_ranking
)
SELECT customer_name, total_sales, customer_percentile, sales_percentile
FROM customer_contribution
WHERE sales_percentile <= 80
ORDER BY total_sales DESC;
实际案例1:用户行为分析
场景:电商网站需要分析用户访问行为,优化用户体验。
-- 综合用户行为分析
SELECT
country,
device_type,
COUNT(DISTINCT user_id) as unique_users,
COUNT(*) as total_sessions,
ROUND(COUNT(*) / COUNT(DISTINCT user_id), 2) as sessions_per_user,
ROUND(AVG(session_duration), 2) as avg_session_duration,
SUM(page_views) as total_page_views,
ROUND(SUM(page_views) / COUNT(*), 2) as pages_per_session,
ROUND(SUM(session_duration) / 60, 2) as total_hours
FROM web_logs
GROUP BY country, device_type
HAVING COUNT(DISTINCT user_id) >= 1
ORDER BY unique_users DESC, avg_session_duration DESC;
实际案例2:销售业绩分析
场景:销售部门需要全面分析销售业绩,制定奖励策略。
-- 销售代表综合业绩分析
WITH sales_performance AS (
SELECT sales_rep,
COUNT(*) as order_count,
COUNT(DISTINCT customer_name) as customer_count,
SUM(quantity) as total_quantity,
SUM(quantity * unit_price * (1 - discount_rate)) as total_sales,
ROUND(AVG(quantity * unit_price * (1 - discount_rate)), 2) as avg_order_value,
MAX(quantity * unit_price * (1 - discount_rate)) as max_order_value,
COUNT(DISTINCT category) as category_diversity
FROM sales_orders
GROUP BY sales_rep
),
performance_ranking AS (
SELECT *,
RANK() OVER (ORDER BY total_sales DESC) as sales_rank,
RANK() OVER (ORDER BY customer_count DESC) as customer_rank,
RANK() OVER (ORDER BY avg_order_value DESC) as avg_value_rank,
ROUND(total_sales / SUM(total_sales) OVER () * 100, 2) as sales_share
FROM sales_performance
)
SELECT sales_rep, total_sales, customer_count, avg_order_value,
sales_rank, customer_rank, sales_share,
CASE
WHEN sales_rank <= 2 THEN '金牌销售'
WHEN sales_rank <= 4 THEN '银牌销售'
ELSE '铜牌销售'
END as performance_level
FROM performance_ranking
ORDER BY sales_rank;
实际案例3:库存管理优化
场景:仓库管理需要优化库存配置,减少缺货和积压。
-- 库存分析和补货建议
SELECT
category,
warehouse_location,
COUNT(*) as product_count,
SUM(stock_quantity) as total_stock,
SUM(CASE WHEN stock_quantity <= reorder_level THEN 1 ELSE 0 END) as need_reorder,
ROUND(AVG(stock_quantity), 2) as avg_stock,
ROUND(AVG(unit_cost), 2) as avg_cost,
SUM(stock_quantity * unit_cost) as inventory_value,
ROUND(SUM(CASE WHEN stock_quantity <= reorder_level THEN unit_cost * reorder_level ELSE 0 END), 2) as reorder_investment
FROM inventory
GROUP BY category, warehouse_location
HAVING COUNT(*) >= 1
ORDER BY inventory_value DESC, need_reorder DESC;
面试题4:数据质量检查
题目:检查数据中的异常情况。
-- 多维度数据质量检查
SELECT
'sales_orders' as table_name,
'order_date' as check_field,
COUNT(*) as total_records,
SUM(CASE WHEN order_date IS NULL THEN 1 ELSE 0 END) as null_count,
SUM(CASE WHEN order_date > CURDATE() THEN 1 ELSE 0 END) as future_date_count,
MIN(order_date) as min_date,
MAX(order_date) as max_date
FROM sales_orders
UNION ALL
SELECT
'employees' as table_name,
'salary' as check_field,
COUNT(*) as total_records,
SUM(CASE WHEN salary IS NULL THEN 1 ELSE 0 END) as null_count,
SUM(CASE WHEN salary <= 0 THEN 1 ELSE 0 END) as invalid_salary_count,
MIN(salary) as min_salary,
MAX(salary) as max_salary
FROM employees;
性能优化和最佳实践
1. 索引优化
-- 为分组字段创建索引
CREATE INDEX idx_sales_orders_category ON sales_orders(category);
CREATE INDEX idx_sales_orders_sales_rep ON sales_orders(sales_rep);
CREATE INDEX idx_employees_department ON employees(department);
-- 复合索引优化分组查询
CREATE INDEX idx_sales_orders_date_region ON sales_orders(order_date, region);
CREATE INDEX idx_web_logs_user_date ON web_logs(user_id, visit_date);
2. 查询优化技巧
-- ✅ 推荐:使用WHERE过滤再分组
SELECT department, COUNT(*)
FROM employees
WHERE salary > 10000 -- 先过滤
GROUP BY department;
-- ❌ 避免:分组后再过滤
SELECT department, COUNT(*)
FROM employees
GROUP BY department
HAVING AVG(salary) > 10000; -- 这会计算所有组的平均值
3. 避免不必要的分组
-- ❌ 不必要的分组
SELECT customer_name, SUM(quantity)
FROM sales_orders
WHERE customer_name = '阿里巴巴'
GROUP BY customer_name;
-- ✅ 优化:直接使用WHERE
SELECT '阿里巴巴' as customer_name, SUM(quantity)
FROM sales_orders
WHERE customer_name = '阿里巴巴';
4. 大数据量分组优化
-- 使用分区表优化大数据量分组
-- CREATE TABLE sales_orders_partitioned (
-- ...
-- ) PARTITION BY RANGE (YEAR(order_date)) (
-- PARTITION p2023 VALUES LESS THAN (2024),
-- PARTITION p2024 VALUES LESS THAN (2025)
-- );
-- 优化GROUP BY的内存使用
SET SESSION group_concat_max_len = 1000000;
SET SESSION tmp_table_size = 256000000;
SET SESSION max_heap_table_size = 256000000;
5. 监控和调优
-- 查看分组查询的执行计划
EXPLAIN SELECT department, COUNT(*), AVG(salary)
FROM employees
GROUP BY department;
-- 监控分组查询性能
SELECT
sql_text,
exec_count,
avg_timer_wait/1000000000000 as avg_time_sec,
sum_rows_examined/exec_count as avg_rows_examined
FROM performance_schema.events_statements_summary_by_digest
WHERE sql_text LIKE '%GROUP BY%'
ORDER BY avg_timer_wait DESC
LIMIT 10;
6. 最佳实践总结
-- ✅ 好的GROUP BY实践
SELECT
category, -- 分组字段
COUNT(*) as order_count, -- 明确的聚合函数
SUM(quantity) as total_qty -- 有意义的别名
FROM sales_orders
WHERE order_date >= '2024-01-01' -- 先过滤数据
GROUP BY category -- 简洁的分组
HAVING COUNT(*) > 1 -- 分组后过滤
ORDER BY total_qty DESC -- 有序输出
LIMIT 10; -- 限制结果数量
-- ❌ 避免的问题
-- 1. 在SELECT中使用非分组字段(MySQL 5.7+会报错)
-- 2. GROUP BY使用复杂表达式而不创建索引
-- 3. 不必要的HAVING条件
-- 4. 大量数据不加WHERE条件直接分组
总结:
- GROUP BY 是数据分析的核心工具,配合聚合函数使用
- 理解执行顺序:WHERE → GROUP BY → HAVING → SELECT → ORDER BY
- 合理使用索引可以显著提升分组查询性能
- HAVING 用于过滤分组结果,WHERE 用于过滤原始数据
- 复杂分析可以结合窗口函数和子查询实现
- 注意数据类型和NULL值的处理