954 lines
32 KiB
Markdown
954 lines
32 KiB
Markdown
# MySQL GROUP BY 语法使用文档
|
||
|
||
## 目录
|
||
1. [GROUP BY 基础语法](#group-by-基础语法)
|
||
2. [GROUP BY 规则和特性](#group-by-规则和特性)
|
||
3. [示例数据准备](#示例数据准备)
|
||
4. [基础分组查询](#基础分组查询)
|
||
5. [聚合函数详解](#聚合函数详解)
|
||
6. [HAVING 子句使用](#having-子句使用)
|
||
7. [多列分组查询](#多列分组查询)
|
||
8. [复杂分组统计](#复杂分组统计)
|
||
9. [面试题和实际案例](#面试题和实际案例)
|
||
10. [性能优化和最佳实践](#性能优化和最佳实践)
|
||
|
||
## GROUP BY 基础语法
|
||
|
||
GROUP BY 用于将查询结果按指定字段进行分组,通常与聚合函数一起使用。
|
||
|
||
```sql
|
||
SELECT column1, aggregate_function(column2)
|
||
FROM table_name
|
||
[WHERE condition]
|
||
GROUP BY column1, column2, ...
|
||
[HAVING group_condition]
|
||
[ORDER BY column]
|
||
[LIMIT number];
|
||
```
|
||
|
||
**执行顺序:**
|
||
1. FROM - 确定数据源
|
||
2. WHERE - 过滤原始数据
|
||
3. GROUP BY - 分组
|
||
4. 聚合函数计算
|
||
5. HAVING - 过滤分组结果
|
||
6. SELECT - 选择输出列
|
||
7. ORDER BY - 排序
|
||
8. LIMIT - 限制结果数量
|
||
|
||
## GROUP BY 规则和特性
|
||
|
||
1. **SELECT 列限制**:SELECT 子句中只能包含:
|
||
- GROUP BY 子句中的列
|
||
- 聚合函数
|
||
- 常量
|
||
|
||
2. **NULL 值处理**:NULL 值被视为一组
|
||
|
||
3. **聚合函数**:
|
||
- COUNT():计数
|
||
- SUM():求和
|
||
- AVG():平均值
|
||
- MAX():最大值
|
||
- MIN():最小值
|
||
- GROUP_CONCAT():字符串连接
|
||
|
||
4. **HAVING vs WHERE**:
|
||
- WHERE:过滤原始行
|
||
- HAVING:过滤分组结果
|
||
|
||
## 示例数据准备
|
||
|
||
### 创建示例表
|
||
|
||
```sql
|
||
-- 销售订单表
|
||
CREATE TABLE sales_orders (
|
||
order_id INT PRIMARY KEY AUTO_INCREMENT,
|
||
customer_id INT,
|
||
customer_name VARCHAR(50),
|
||
product_id INT,
|
||
product_name VARCHAR(100),
|
||
category VARCHAR(50),
|
||
order_date DATE,
|
||
quantity INT,
|
||
unit_price DECIMAL(10,2),
|
||
sales_rep VARCHAR(50),
|
||
region VARCHAR(30),
|
||
discount_rate DECIMAL(3,2) DEFAULT 0.00
|
||
);
|
||
|
||
-- 员工表
|
||
CREATE TABLE employees (
|
||
emp_id INT PRIMARY KEY,
|
||
name VARCHAR(50),
|
||
department VARCHAR(30),
|
||
position VARCHAR(50),
|
||
salary DECIMAL(10,2),
|
||
hire_date DATE,
|
||
manager_id INT,
|
||
age INT,
|
||
gender ENUM('M', 'F'),
|
||
city VARCHAR(30)
|
||
);
|
||
|
||
-- 学生成绩表
|
||
CREATE TABLE student_scores (
|
||
student_id INT,
|
||
student_name VARCHAR(50),
|
||
subject VARCHAR(30),
|
||
score INT,
|
||
exam_date DATE,
|
||
teacher VARCHAR(30),
|
||
class_id INT,
|
||
semester VARCHAR(20),
|
||
PRIMARY KEY (student_id, subject, exam_date)
|
||
);
|
||
|
||
-- 网站访问日志表
|
||
CREATE TABLE web_logs (
|
||
log_id INT PRIMARY KEY AUTO_INCREMENT,
|
||
user_id INT,
|
||
page_url VARCHAR(200),
|
||
visit_date DATE,
|
||
visit_time TIME,
|
||
session_duration INT, -- 会话时长(分钟)
|
||
device_type VARCHAR(20),
|
||
browser VARCHAR(30),
|
||
country VARCHAR(30),
|
||
page_views INT DEFAULT 1
|
||
);
|
||
|
||
-- 库存表
|
||
CREATE TABLE inventory (
|
||
product_id INT PRIMARY KEY,
|
||
product_name VARCHAR(100),
|
||
category VARCHAR(50),
|
||
supplier VARCHAR(50),
|
||
stock_quantity INT,
|
||
reorder_level INT,
|
||
unit_cost DECIMAL(8,2),
|
||
last_restock_date DATE,
|
||
warehouse_location VARCHAR(30)
|
||
);
|
||
```
|
||
|
||
### 插入示例数据
|
||
|
||
```sql
|
||
-- 插入销售订单数据
|
||
INSERT INTO sales_orders (customer_id, customer_name, product_id, product_name, category, order_date, quantity, unit_price, sales_rep, region, discount_rate) VALUES
|
||
(1, '阿里巴巴', 101, 'MacBook Pro', '笔记本', '2024-01-15', 10, 12999.00, '张三', '华东', 0.05),
|
||
(1, '阿里巴巴', 102, 'iPhone 15', '手机', '2024-01-16', 50, 5999.00, '张三', '华东', 0.03),
|
||
(2, '腾讯', 103, 'iPad Air', '平板', '2024-01-20', 20, 3999.00, '李四', '华南', 0.04),
|
||
(2, '腾讯', 101, 'MacBook Pro', '笔记本', '2024-01-22', 15, 12999.00, '李四', '华南', 0.06),
|
||
(3, '百度', 104, 'AirPods Pro', '耳机', '2024-01-25', 100, 1999.00, '王五', '华北', 0.08),
|
||
(3, '百度', 102, 'iPhone 15', '手机', '2024-01-28', 30, 5999.00, '王五', '华北', 0.05),
|
||
(4, '字节跳动', 105, 'Apple Watch', '智能手表', '2024-02-01', 25, 2999.00, '赵六', '华北', 0.10),
|
||
(4, '字节跳动', 103, 'iPad Air', '平板', '2024-02-03', 35, 3999.00, '赵六', '华北', 0.07),
|
||
(5, '美团', 106, 'MacBook Air', '笔记本', '2024-02-05', 12, 8999.00, '钱七', '华东', 0.04),
|
||
(5, '美团', 104, 'AirPods Pro', '耳机', '2024-02-08', 80, 1999.00, '钱七', '华东', 0.06),
|
||
(6, '滴滴', 107, 'iMac', '台式机', '2024-02-10', 8, 15999.00, '孙八', '华南', 0.03),
|
||
(6, '滴滴', 102, 'iPhone 15', '手机', '2024-02-12', 40, 5999.00, '孙八', '华南', 0.04),
|
||
(7, '小米', 108, 'iPad Pro', '平板', '2024-02-15', 18, 7999.00, '周九', '华北', 0.05),
|
||
(7, '小米', 105, 'Apple Watch', '智能手表', '2024-02-18', 30, 2999.00, '周九', '华北', 0.12),
|
||
(8, '华为', 109, 'Studio Display', '显示器', '2024-02-20', 6, 11999.00, '吴十', '华南', 0.02);
|
||
|
||
-- 插入员工数据
|
||
INSERT INTO employees (emp_id, name, department, position, salary, hire_date, manager_id, age, gender, city) VALUES
|
||
(1001, '张经理', '技术部', '部门经理', 25000.00, '2020-01-15', NULL, 35, 'M', '北京'),
|
||
(1002, '李架构师', '技术部', '高级架构师', 22000.00, '2020-05-20', 1001, 32, 'M', '北京'),
|
||
(1003, '王工程师', '技术部', '高级工程师', 18000.00, '2021-03-10', 1001, 28, 'F', '北京'),
|
||
(1004, '赵工程师', '技术部', '中级工程师', 15000.00, '2022-01-15', 1001, 26, 'M', '北京'),
|
||
(1005, '钱实习生', '技术部', '实习工程师', 8000.00, '2023-07-01', 1002, 23, 'F', '北京'),
|
||
(2001, '孙经理', '产品部', '产品经理', 20000.00, '2021-02-01', NULL, 30, 'F', '上海'),
|
||
(2002, '周产品', '产品部', '高级产品', 16000.00, '2021-08-15', 2001, 27, 'M', '上海'),
|
||
(2003, '吴助理', '产品部', '产品助理', 12000.00, '2022-06-20', 2001, 25, 'F', '上海'),
|
||
(3001, '郑经理', '销售部', '销售经理', 18000.00, '2020-11-10', NULL, 33, 'M', '深圳'),
|
||
(3002, '刘销售', '销售部', '高级销售', 14000.00, '2021-09-25', 3001, 29, 'F', '深圳'),
|
||
(3003, '陈销售', '销售部', '销售代表', 10000.00, '2022-12-01', 3001, 24, 'M', '深圳'),
|
||
(4001, '林经理', '人事部', 'HR经理', 16000.00, '2021-04-12', NULL, 31, 'F', '广州'),
|
||
(4002, '黄专员', '人事部', 'HR专员', 11000.00, '2022-08-30', 4001, 26, 'F', '广州'),
|
||
(5001, '何经理', '财务部', '财务经理', 19000.00, '2020-07-08', NULL, 34, 'M', '杭州'),
|
||
(5002, '魏会计', '财务部', '会计', 13000.00, '2021-11-20', 5001, 28, 'F', '杭州');
|
||
|
||
-- 插入学生成绩数据
|
||
INSERT INTO student_scores (student_id, student_name, subject, score, exam_date, teacher, class_id, semester) VALUES
|
||
(1, '张小明', '数学', 85, '2024-01-15', '王老师', 1, '2024春'),
|
||
(1, '张小明', '语文', 78, '2024-01-16', '李老师', 1, '2024春'),
|
||
(1, '张小明', '英语', 92, '2024-01-17', '赵老师', 1, '2024春'),
|
||
(2, '李小红', '数学', 92, '2024-01-15', '王老师', 1, '2024春'),
|
||
(2, '李小红', '语文', 88, '2024-01-16', '李老师', 1, '2024春'),
|
||
(2, '李小红', '英语', 95, '2024-01-17', '赵老师', 1, '2024春'),
|
||
(3, '王小刚', '数学', 76, '2024-01-15', '王老师', 2, '2024春'),
|
||
(3, '王小刚', '语文', 82, '2024-01-16', '李老师', 2, '2024春'),
|
||
(3, '王小刚', '英语', 79, '2024-01-17', '赵老师', 2, '2024春'),
|
||
(4, '赵小丽', '数学', 88, '2024-01-15', '王老师', 2, '2024春'),
|
||
(4, '赵小丽', '语文', 85, '2024-01-16', '李老师', 2, '2024春'),
|
||
(4, '赵小丽', '英语', 90, '2024-01-17', '赵老师', 2, '2024春'),
|
||
(5, '钱小伟', '数学', 90, '2024-01-15', '钱老师', 3, '2024春'),
|
||
(5, '钱小伟', '语文', 87, '2024-01-16', '孙老师', 3, '2024春'),
|
||
(5, '钱小伟', '英语', 93, '2024-01-17', '周老师', 3, '2024春'),
|
||
-- 添加期中考试成绩
|
||
(1, '张小明', '数学', 88, '2024-03-15', '王老师', 1, '2024春'),
|
||
(1, '张小明', '语文', 82, '2024-03-16', '李老师', 1, '2024春'),
|
||
(2, '李小红', '数学', 95, '2024-03-15', '王老师', 1, '2024春'),
|
||
(2, '李小红', '语文', 90, '2024-03-16', '李老师', 1, '2024春'),
|
||
(3, '王小刚', '数学', 80, '2024-03-15', '王老师', 2, '2024春');
|
||
|
||
-- 插入网站访问日志数据
|
||
INSERT INTO web_logs (user_id, page_url, visit_date, visit_time, session_duration, device_type, browser, country, page_views) VALUES
|
||
(1001, '/home', '2024-01-15', '09:30:00', 25, 'Desktop', 'Chrome', '中国', 5),
|
||
(1001, '/products', '2024-01-15', '10:15:00', 15, 'Desktop', 'Chrome', '中国', 8),
|
||
(1002, '/home', '2024-01-15', '14:20:00', 30, 'Mobile', 'Safari', '美国', 3),
|
||
(1003, '/about', '2024-01-16', '11:45:00', 20, 'Tablet', 'Chrome', '英国', 4),
|
||
(1001, '/checkout', '2024-01-16', '16:30:00', 45, 'Desktop', 'Chrome', '中国', 2),
|
||
(1004, '/home', '2024-01-17', '08:15:00', 35, 'Mobile', 'Firefox', '日本', 6),
|
||
(1002, '/products', '2024-01-17', '13:20:00', 28, 'Desktop', 'Edge', '美国', 7),
|
||
(1005, '/contact', '2024-01-18', '10:40:00', 12, 'Mobile', 'Safari', '加拿大', 2),
|
||
(1003, '/home', '2024-01-18', '15:25:00', 22, 'Desktop', 'Chrome', '英国', 4),
|
||
(1006, '/products', '2024-01-19', '09:10:00', 40, 'Tablet', 'Safari', '澳大利亚', 9),
|
||
(1001, '/home', '2024-01-19', '14:35:00', 18, 'Mobile', 'Chrome', '中国', 3),
|
||
(1007, '/login', '2024-01-20', '11:20:00', 8, 'Desktop', 'Firefox', '德国', 1),
|
||
(1002, '/dashboard', '2024-01-20', '16:45:00', 55, 'Desktop', 'Chrome', '美国', 12);
|
||
|
||
-- 插入库存数据
|
||
INSERT INTO inventory (product_id, product_name, category, supplier, stock_quantity, reorder_level, unit_cost, last_restock_date, warehouse_location) VALUES
|
||
(101, 'MacBook Pro', '笔记本', 'Apple', 45, 20, 10000.00, '2024-01-10', '北京仓库'),
|
||
(102, 'iPhone 15', '手机', 'Apple', 120, 50, 4500.00, '2024-01-12', '上海仓库'),
|
||
(103, 'iPad Air', '平板', 'Apple', 80, 30, 3200.00, '2024-01-08', '深圳仓库'),
|
||
(104, 'AirPods Pro', '耳机', 'Apple', 200, 80, 1500.00, '2024-01-15', '北京仓库'),
|
||
(105, 'Apple Watch', '智能手表', 'Apple', 60, 25, 2400.00, '2024-01-05', '广州仓库'),
|
||
(106, 'MacBook Air', '笔记本', 'Apple', 35, 15, 7200.00, '2024-01-20', '上海仓库'),
|
||
(107, 'iMac', '台式机', 'Apple', 15, 10, 12800.00, '2024-01-18', '深圳仓库'),
|
||
(108, 'iPad Pro', '平板', 'Apple', 25, 15, 6400.00, '2024-01-22', '北京仓库'),
|
||
(109, 'Studio Display', '显示器', 'Apple', 12, 8, 9600.00, '2024-01-25', '广州仓库'),
|
||
(110, 'Mac Mini', '台式机', 'Apple', 30, 12, 4800.00, '2024-01-28', '杭州仓库');
|
||
```
|
||
|
||
## 基础分组查询
|
||
|
||
### 1. 简单分组统计
|
||
|
||
```sql
|
||
-- 按产品类别统计销售数量
|
||
SELECT category, SUM(quantity) AS total_quantity
|
||
FROM sales_orders
|
||
GROUP BY category;
|
||
```
|
||
|
||
**结果:**
|
||
```
|
||
+----------+----------------+
|
||
| category | total_quantity |
|
||
+----------+----------------+
|
||
| 笔记本 | 37 |
|
||
| 手机 | 120 |
|
||
| 平板 | 73 |
|
||
| 耳机 | 180 |
|
||
| 智能手表 | 55 |
|
||
| 台式机 | 8 |
|
||
| 显示器 | 6 |
|
||
+----------+----------------+
|
||
```
|
||
|
||
### 2. 按部门统计员工信息
|
||
|
||
```sql
|
||
-- 按部门统计员工数量和平均薪资
|
||
SELECT department,
|
||
COUNT(*) AS emp_count,
|
||
ROUND(AVG(salary), 2) AS avg_salary,
|
||
MIN(salary) AS min_salary,
|
||
MAX(salary) AS max_salary
|
||
FROM employees
|
||
GROUP BY department;
|
||
```
|
||
|
||
**结果:**
|
||
```
|
||
+----------+-----------+------------+------------+------------+
|
||
| department | emp_count | avg_salary | min_salary | max_salary |
|
||
+----------+-----------+------------+------------+------------+
|
||
| 技术部 | 5 | 17600.00 | 8000.00 | 25000.00 |
|
||
| 产品部 | 3 | 16000.00 | 12000.00 | 20000.00 |
|
||
| 销售部 | 3 | 14000.00 | 10000.00 | 18000.00 |
|
||
| 人事部 | 2 | 13500.00 | 11000.00 | 16000.00 |
|
||
| 财务部 | 2 | 16000.00 | 13000.00 | 19000.00 |
|
||
+----------+-----------+------------+------------+------------+
|
||
```
|
||
|
||
### 3. 按日期分组统计
|
||
|
||
```sql
|
||
-- 按月份统计订单数量和销售额
|
||
SELECT DATE_FORMAT(order_date, '%Y-%m') AS order_month,
|
||
COUNT(*) AS order_count,
|
||
SUM(quantity * unit_price * (1 - discount_rate)) AS total_sales
|
||
FROM sales_orders
|
||
GROUP BY DATE_FORMAT(order_date, '%Y-%m')
|
||
ORDER BY order_month;
|
||
```
|
||
|
||
**结果:**
|
||
```
|
||
+-------------+-------------+-------------+
|
||
| order_month | order_count | total_sales |
|
||
+-------------+-------------+-------------+
|
||
| 2024-01 | 6 | 912393.00 |
|
||
| 2024-02 | 9 | 809328.40 |
|
||
+-------------+-------------+-------------+
|
||
```
|
||
|
||
## 聚合函数详解
|
||
|
||
### 1. COUNT() 函数详解
|
||
|
||
```sql
|
||
-- COUNT 的不同用法
|
||
SELECT
|
||
COUNT(*) AS total_rows, -- 总行数
|
||
COUNT(manager_id) AS has_manager, -- 非NULL值数量
|
||
COUNT(DISTINCT department) AS dept_count, -- 去重计数
|
||
COUNT(CASE WHEN salary > 15000 THEN 1 END) AS high_salary_count -- 条件计数
|
||
FROM employees;
|
||
```
|
||
|
||
**结果:**
|
||
```
|
||
+------------+-------------+------------+-------------------+
|
||
| total_rows | has_manager | dept_count | high_salary_count |
|
||
+------------+-------------+------------+-------------------+
|
||
| 15 | 10 | 5 | 8 |
|
||
+------------+-------------+------------+-------------------+
|
||
```
|
||
|
||
### 2. SUM() 和 AVG() 函数
|
||
|
||
```sql
|
||
-- 按销售代表统计业绩
|
||
SELECT sales_rep,
|
||
COUNT(*) AS order_count,
|
||
SUM(quantity) AS total_quantity,
|
||
SUM(quantity * unit_price * (1 - discount_rate)) AS total_sales,
|
||
ROUND(AVG(quantity * unit_price * (1 - discount_rate)), 2) AS avg_order_value
|
||
FROM sales_orders
|
||
GROUP BY sales_rep
|
||
ORDER BY total_sales DESC;
|
||
```
|
||
|
||
**结果:**
|
||
```
|
||
+----------+-------------+----------------+-------------+-----------------+
|
||
| sales_rep | order_count | total_quantity | total_sales | avg_order_value |
|
||
+----------+-------------+----------------+-------------+-----------------+
|
||
| 张三 | 2 | 60 | 423447.00 | 211723.50 |
|
||
| 李四 | 2 | 35 | 318969.00 | 159484.50 |
|
||
| 王五 | 2 | 130 | 344562.00 | 172281.00 |
|
||
| 赵六 | 2 | 60 | 206187.00 | 103093.50 |
|
||
| 钱七 | 2 | 92 | 258291.60 | 129145.80 |
|
||
| 孙八 | 2 | 48 | 355464.00 | 177732.00 |
|
||
| 周九 | 2 | 48 | 222777.60 | 111388.80 |
|
||
| 吴十 | 1 | 6 | 70622.80 | 70622.80 |
|
||
+----------+-------------+----------------+-------------+-----------------+
|
||
```
|
||
|
||
### 3. MAX() 和 MIN() 函数
|
||
|
||
```sql
|
||
-- 按班级统计成绩分布
|
||
SELECT class_id,
|
||
COUNT(*) AS student_count,
|
||
MAX(score) AS highest_score,
|
||
MIN(score) AS lowest_score,
|
||
ROUND(AVG(score), 2) AS avg_score,
|
||
MAX(score) - MIN(score) AS score_range
|
||
FROM student_scores
|
||
GROUP BY class_id
|
||
ORDER BY class_id;
|
||
```
|
||
|
||
**结果:**
|
||
```
|
||
+----------+---------------+---------------+--------------+-----------+-------------+
|
||
| class_id | student_count | highest_score | lowest_score | avg_score | score_range |
|
||
+----------+---------------+---------------+--------------+-----------+-------------+
|
||
| 1 | 8 | 95 | 78 | 86.75 | 17 |
|
||
| 2 | 7 | 90 | 76 | 82.71 | 14 |
|
||
| 3 | 3 | 93 | 87 | 90.00 | 6 |
|
||
+----------+---------------+---------------+--------------+-----------+-------------+
|
||
```
|
||
|
||
### 4. GROUP_CONCAT() 函数
|
||
|
||
```sql
|
||
-- 按部门列出所有员工姓名
|
||
SELECT department,
|
||
COUNT(*) AS emp_count,
|
||
GROUP_CONCAT(name ORDER BY salary DESC) AS employees,
|
||
GROUP_CONCAT(DISTINCT city) AS cities
|
||
FROM employees
|
||
GROUP BY department;
|
||
```
|
||
|
||
**结果:**
|
||
```
|
||
+----------+-----------+------------------------------------------+----------+
|
||
| department | emp_count | employees | cities |
|
||
+----------+-----------+------------------------------------------+----------+
|
||
| 技术部 | 5 | 张经理,李架构师,王工程师,赵工程师,钱实习生 | 北京 |
|
||
| 产品部 | 3 | 孙经理,周产品,吴助理 | 上海 |
|
||
| 销售部 | 3 | 郑经理,刘销售,陈销售 | 深圳 |
|
||
| 人事部 | 2 | 林经理,黄专员 | 广州 |
|
||
| 财务部 | 2 | 何经理,魏会计 | 杭州 |
|
||
+----------+-----------+------------------------------------------+----------+
|
||
```
|
||
|
||
## HAVING 子句使用
|
||
|
||
### 1. HAVING 基础用法
|
||
|
||
```sql
|
||
-- 查找员工数量大于2的部门
|
||
SELECT department,
|
||
COUNT(*) AS emp_count,
|
||
ROUND(AVG(salary), 2) AS avg_salary
|
||
FROM employees
|
||
GROUP BY department
|
||
HAVING COUNT(*) > 2
|
||
ORDER BY emp_count DESC;
|
||
```
|
||
|
||
**结果:**
|
||
```
|
||
+----------+-----------+------------+
|
||
| department | emp_count | avg_salary |
|
||
+----------+-----------+------------+
|
||
| 技术部 | 5 | 17600.00 |
|
||
| 产品部 | 3 | 16000.00 |
|
||
| 销售部 | 3 | 14000.00 |
|
||
+----------+-----------+------------+
|
||
```
|
||
|
||
### 2. HAVING 与 WHERE 的区别
|
||
|
||
```sql
|
||
-- 错误用法:WHERE 不能使用聚合函数
|
||
-- SELECT department, COUNT(*) FROM employees WHERE COUNT(*) > 2 GROUP BY department;
|
||
|
||
-- 正确用法:先过滤再分组
|
||
SELECT department,
|
||
COUNT(*) AS emp_count,
|
||
ROUND(AVG(salary), 2) AS avg_salary
|
||
FROM employees
|
||
WHERE salary > 12000 -- 先过滤薪资大于12000的员工
|
||
GROUP BY department
|
||
HAVING COUNT(*) >= 2 -- 再过滤员工数量>=2的部门
|
||
ORDER BY avg_salary DESC;
|
||
```
|
||
|
||
**结果:**
|
||
```
|
||
+----------+-----------+------------+
|
||
| department | emp_count | avg_salary |
|
||
+----------+-----------+------------+
|
||
| 技术部 | 3 | 21666.67 |
|
||
| 产品部 | 2 | 18000.00 |
|
||
| 财务部 | 2 | 16000.00 |
|
||
+----------+-----------+------------+
|
||
```
|
||
|
||
### 3. 复杂 HAVING 条件
|
||
|
||
```sql
|
||
-- 查找高价值客户(订单总额>30万且订单数量>1)
|
||
SELECT customer_name,
|
||
COUNT(*) AS order_count,
|
||
SUM(quantity * unit_price * (1 - discount_rate)) AS total_sales,
|
||
ROUND(AVG(quantity * unit_price * (1 - discount_rate)), 2) AS avg_order_value
|
||
FROM sales_orders
|
||
GROUP BY customer_name
|
||
HAVING COUNT(*) > 1 AND SUM(quantity * unit_price * (1 - discount_rate)) > 300000
|
||
ORDER BY total_sales DESC;
|
||
```
|
||
|
||
**结果:**
|
||
```
|
||
+--------------+-------------+-------------+-----------------+
|
||
| customer_name | order_count | total_sales | avg_order_value |
|
||
+--------------+-------------+-------------+-----------------+
|
||
| 阿里巴巴 | 2 | 423447.00 | 211723.50 |
|
||
| 腾讯 | 2 | 318969.00 | 159484.50 |
|
||
| 百度 | 2 | 344562.00 | 172281.00 |
|
||
| 滴滴 | 2 | 355464.00 | 177732.00 |
|
||
+-------------+-------------+-------------+-----------------+
|
||
```
|
||
|
||
## 多列分组查询
|
||
|
||
### 1. 双列分组
|
||
|
||
```sql
|
||
-- 按地区和销售代表分组统计
|
||
SELECT region, sales_rep,
|
||
COUNT(*) AS order_count,
|
||
SUM(quantity) AS total_quantity,
|
||
SUM(quantity * unit_price * (1 - discount_rate)) AS total_sales
|
||
FROM sales_orders
|
||
GROUP BY region, sales_rep
|
||
ORDER BY region, total_sales DESC;
|
||
```
|
||
|
||
**结果:**
|
||
```
|
||
+--------+-----------+-------------+----------------+-------------+
|
||
| region | sales_rep | order_count | total_quantity | total_sales |
|
||
+--------+-----------+-------------+----------------+-------------+
|
||
| 华北 | 王五 | 2 | 130 | 344562.00 |
|
||
| 华北 | 赵六 | 2 | 60 | 206187.00 |
|
||
| 华北 | 周九 | 2 | 48 | 222777.60 |
|
||
| 华东 | 张三 | 2 | 60 | 423447.00 |
|
||
| 华东 | 钱七 | 2 | 92 | 258291.60 |
|
||
| 华南 | 李四 | 2 | 35 | 318969.00 |
|
||
| 华南 | 孙八 | 2 | 48 | 355464.00 |
|
||
| 华南 | 吴十 | 1 | 6 | 70622.80 |
|
||
+--------+-----------+-------------+----------------+-------------+
|
||
```
|
||
|
||
### 2. 三列分组
|
||
|
||
```sql
|
||
-- 按年龄段、性别、部门分组统计员工
|
||
SELECT
|
||
CASE
|
||
WHEN age < 25 THEN '24岁以下'
|
||
WHEN age <= 30 THEN '25-30岁'
|
||
WHEN age <= 35 THEN '31-35岁'
|
||
ELSE '35岁以上'
|
||
END AS age_group,
|
||
gender,
|
||
department,
|
||
COUNT(*) AS emp_count,
|
||
ROUND(AVG(salary), 2) AS avg_salary
|
||
FROM employees
|
||
GROUP BY
|
||
CASE
|
||
WHEN age < 25 THEN '24岁以下'
|
||
WHEN age <= 30 THEN '25-30岁'
|
||
WHEN age <= 35 THEN '31-35岁'
|
||
ELSE '35岁以上'
|
||
END,
|
||
gender,
|
||
department
|
||
HAVING COUNT(*) >= 1
|
||
ORDER BY age_group, gender, department;
|
||
```
|
||
|
||
### 3. 时间维度分组
|
||
|
||
```sql
|
||
-- 按年、月、产品类别分组统计
|
||
SELECT
|
||
YEAR(order_date) AS order_year,
|
||
MONTH(order_date) AS order_month,
|
||
category,
|
||
COUNT(*) AS order_count,
|
||
SUM(quantity) AS total_quantity,
|
||
ROUND(SUM(quantity * unit_price * (1 - discount_rate)), 2) AS total_sales
|
||
FROM sales_orders
|
||
GROUP BY YEAR(order_date), MONTH(order_date), category
|
||
ORDER BY order_year, order_month, total_sales DESC;
|
||
```
|
||
|
||
## 复杂分组统计
|
||
|
||
### 1. 嵌套分组查询
|
||
|
||
```sql
|
||
-- 查找每个部门薪资最高的员工信息
|
||
SELECT e1.*
|
||
FROM employees e1
|
||
WHERE e1.salary = (
|
||
SELECT MAX(e2.salary)
|
||
FROM employees e2
|
||
WHERE e2.department = e1.department
|
||
)
|
||
ORDER BY e1.department, e1.salary DESC;
|
||
```
|
||
|
||
### 2. 窗口函数与分组
|
||
|
||
```sql
|
||
-- 计算每个员工在部门内的薪资排名
|
||
SELECT name, department, salary,
|
||
ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS dept_rank,
|
||
ROUND(salary / AVG(salary) OVER (PARTITION BY department) * 100, 2) AS salary_ratio
|
||
FROM employees
|
||
ORDER BY department, dept_rank;
|
||
```
|
||
|
||
### 3. 数据透视表效果
|
||
|
||
```sql
|
||
-- 按设备类型和浏览器统计访问情况
|
||
SELECT device_type,
|
||
SUM(CASE WHEN browser = 'Chrome' THEN page_views ELSE 0 END) AS Chrome,
|
||
SUM(CASE WHEN browser = 'Safari' THEN page_views ELSE 0 END) AS Safari,
|
||
SUM(CASE WHEN browser = 'Firefox' THEN page_views ELSE 0 END) AS Firefox,
|
||
SUM(CASE WHEN browser = 'Edge' THEN page_views ELSE 0 END) AS Edge,
|
||
SUM(page_views) AS Total
|
||
FROM web_logs
|
||
GROUP BY device_type
|
||
ORDER BY Total DESC;
|
||
```
|
||
|
||
**结果:**
|
||
```
|
||
+-------------+--------+--------+---------+------+-------+
|
||
| device_type | Chrome | Safari | Firefox | Edge | Total |
|
||
+-------------+--------+--------+---------+------+-------+
|
||
| Desktop | 24 | 0 | 1 | 7 | 32 |
|
||
| Mobile | 6 | 5 | 6 | 0 | 17 |
|
||
| Tablet | 4 | 9 | 0 | 0 | 13 |
|
||
+-------------+--------+--------+---------+------+-------+
|
||
```
|
||
|
||
### 4. 动态分组条件
|
||
|
||
```sql
|
||
-- 按库存状态分组统计产品
|
||
SELECT
|
||
CASE
|
||
WHEN stock_quantity <= reorder_level THEN '需要补货'
|
||
WHEN stock_quantity <= reorder_level * 2 THEN '库存偏低'
|
||
ELSE '库存充足'
|
||
END AS stock_status,
|
||
COUNT(*) AS product_count,
|
||
SUM(stock_quantity) AS total_stock,
|
||
ROUND(AVG(unit_cost), 2) AS avg_cost
|
||
FROM inventory
|
||
GROUP BY
|
||
CASE
|
||
WHEN stock_quantity <= reorder_level THEN '需要补货'
|
||
WHEN stock_quantity <= reorder_level * 2 THEN '库存偏低'
|
||
ELSE '库存充足'
|
||
END
|
||
ORDER BY
|
||
CASE
|
||
WHEN stock_status = '需要补货' THEN 1
|
||
WHEN stock_status = '库存偏低' THEN 2
|
||
ELSE 3
|
||
END;
|
||
```
|
||
|
||
## 面试题和实际案例
|
||
|
||
### 面试题1:连续登录用户分析
|
||
|
||
**题目**:找出连续3天都有访问记录的用户。
|
||
|
||
```sql
|
||
-- 解答:使用窗口函数和分组
|
||
WITH daily_users AS (
|
||
SELECT DISTINCT user_id, visit_date
|
||
FROM web_logs
|
||
),
|
||
user_sequences AS (
|
||
SELECT user_id, visit_date,
|
||
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY visit_date) as rn,
|
||
DATE_SUB(visit_date, INTERVAL ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY visit_date) DAY) as group_date
|
||
FROM daily_users
|
||
),
|
||
consecutive_groups AS (
|
||
SELECT user_id, group_date, COUNT(*) as consecutive_days
|
||
FROM user_sequences
|
||
GROUP BY user_id, group_date
|
||
HAVING COUNT(*) >= 3
|
||
)
|
||
SELECT DISTINCT cg.user_id
|
||
FROM consecutive_groups cg;
|
||
```
|
||
|
||
### 面试题2:同比增长率计算
|
||
|
||
**题目**:计算每个月的销售额同比增长率。
|
||
|
||
```sql
|
||
-- 解答:使用自连接和分组
|
||
WITH monthly_sales AS (
|
||
SELECT
|
||
DATE_FORMAT(order_date, '%Y-%m') as month,
|
||
SUM(quantity * unit_price * (1 - discount_rate)) as total_sales
|
||
FROM sales_orders
|
||
GROUP BY DATE_FORMAT(order_date, '%Y-%m')
|
||
)
|
||
SELECT
|
||
current_month.month,
|
||
current_month.total_sales as current_sales,
|
||
last_year.total_sales as last_year_sales,
|
||
ROUND(
|
||
(current_month.total_sales - IFNULL(last_year.total_sales, 0)) /
|
||
IFNULL(last_year.total_sales, current_month.total_sales) * 100, 2
|
||
) as growth_rate
|
||
FROM monthly_sales current_month
|
||
LEFT JOIN monthly_sales last_year
|
||
ON DATE_FORMAT(DATE_SUB(STR_TO_DATE(CONCAT(current_month.month, '-01'), '%Y-%m-%d'), INTERVAL 1 YEAR), '%Y-%m') = last_year.month
|
||
ORDER BY current_month.month;
|
||
```
|
||
|
||
### 面试题3:帕累托分析(80/20法则)
|
||
|
||
**题目**:找出贡献80%销售额的前20%客户。
|
||
|
||
```sql
|
||
-- 解答:使用窗口函数计算累计占比
|
||
WITH customer_sales AS (
|
||
SELECT customer_name,
|
||
SUM(quantity * unit_price * (1 - discount_rate)) as total_sales
|
||
FROM sales_orders
|
||
GROUP BY customer_name
|
||
),
|
||
customer_ranking AS (
|
||
SELECT customer_name, total_sales,
|
||
ROW_NUMBER() OVER (ORDER BY total_sales DESC) as rank_num,
|
||
COUNT(*) OVER () as total_customers,
|
||
SUM(total_sales) OVER () as grand_total,
|
||
SUM(total_sales) OVER (ORDER BY total_sales DESC) as cumulative_sales
|
||
FROM customer_sales
|
||
),
|
||
customer_contribution AS (
|
||
SELECT *,
|
||
ROUND(rank_num / total_customers * 100, 2) as customer_percentile,
|
||
ROUND(cumulative_sales / grand_total * 100, 2) as sales_percentile
|
||
FROM customer_ranking
|
||
)
|
||
SELECT customer_name, total_sales, customer_percentile, sales_percentile
|
||
FROM customer_contribution
|
||
WHERE sales_percentile <= 80
|
||
ORDER BY total_sales DESC;
|
||
```
|
||
|
||
### 实际案例1:用户行为分析
|
||
|
||
**场景**:电商网站需要分析用户访问行为,优化用户体验。
|
||
|
||
```sql
|
||
-- 综合用户行为分析
|
||
SELECT
|
||
country,
|
||
device_type,
|
||
COUNT(DISTINCT user_id) as unique_users,
|
||
COUNT(*) as total_sessions,
|
||
ROUND(COUNT(*) / COUNT(DISTINCT user_id), 2) as sessions_per_user,
|
||
ROUND(AVG(session_duration), 2) as avg_session_duration,
|
||
SUM(page_views) as total_page_views,
|
||
ROUND(SUM(page_views) / COUNT(*), 2) as pages_per_session,
|
||
ROUND(SUM(session_duration) / 60, 2) as total_hours
|
||
FROM web_logs
|
||
GROUP BY country, device_type
|
||
HAVING COUNT(DISTINCT user_id) >= 1
|
||
ORDER BY unique_users DESC, avg_session_duration DESC;
|
||
```
|
||
|
||
### 实际案例2:销售业绩分析
|
||
|
||
**场景**:销售部门需要全面分析销售业绩,制定奖励策略。
|
||
|
||
```sql
|
||
-- 销售代表综合业绩分析
|
||
WITH sales_performance AS (
|
||
SELECT sales_rep,
|
||
COUNT(*) as order_count,
|
||
COUNT(DISTINCT customer_name) as customer_count,
|
||
SUM(quantity) as total_quantity,
|
||
SUM(quantity * unit_price * (1 - discount_rate)) as total_sales,
|
||
ROUND(AVG(quantity * unit_price * (1 - discount_rate)), 2) as avg_order_value,
|
||
MAX(quantity * unit_price * (1 - discount_rate)) as max_order_value,
|
||
COUNT(DISTINCT category) as category_diversity
|
||
FROM sales_orders
|
||
GROUP BY sales_rep
|
||
),
|
||
performance_ranking AS (
|
||
SELECT *,
|
||
RANK() OVER (ORDER BY total_sales DESC) as sales_rank,
|
||
RANK() OVER (ORDER BY customer_count DESC) as customer_rank,
|
||
RANK() OVER (ORDER BY avg_order_value DESC) as avg_value_rank,
|
||
ROUND(total_sales / SUM(total_sales) OVER () * 100, 2) as sales_share
|
||
FROM sales_performance
|
||
)
|
||
SELECT sales_rep, total_sales, customer_count, avg_order_value,
|
||
sales_rank, customer_rank, sales_share,
|
||
CASE
|
||
WHEN sales_rank <= 2 THEN '金牌销售'
|
||
WHEN sales_rank <= 4 THEN '银牌销售'
|
||
ELSE '铜牌销售'
|
||
END as performance_level
|
||
FROM performance_ranking
|
||
ORDER BY sales_rank;
|
||
```
|
||
|
||
### 实际案例3:库存管理优化
|
||
|
||
**场景**:仓库管理需要优化库存配置,减少缺货和积压。
|
||
|
||
```sql
|
||
-- 库存分析和补货建议
|
||
SELECT
|
||
category,
|
||
warehouse_location,
|
||
COUNT(*) as product_count,
|
||
SUM(stock_quantity) as total_stock,
|
||
SUM(CASE WHEN stock_quantity <= reorder_level THEN 1 ELSE 0 END) as need_reorder,
|
||
ROUND(AVG(stock_quantity), 2) as avg_stock,
|
||
ROUND(AVG(unit_cost), 2) as avg_cost,
|
||
SUM(stock_quantity * unit_cost) as inventory_value,
|
||
ROUND(SUM(CASE WHEN stock_quantity <= reorder_level THEN unit_cost * reorder_level ELSE 0 END), 2) as reorder_investment
|
||
FROM inventory
|
||
GROUP BY category, warehouse_location
|
||
HAVING COUNT(*) >= 1
|
||
ORDER BY inventory_value DESC, need_reorder DESC;
|
||
```
|
||
|
||
### 面试题4:数据质量检查
|
||
|
||
**题目**:检查数据中的异常情况。
|
||
|
||
```sql
|
||
-- 多维度数据质量检查
|
||
SELECT
|
||
'sales_orders' as table_name,
|
||
'order_date' as check_field,
|
||
COUNT(*) as total_records,
|
||
SUM(CASE WHEN order_date IS NULL THEN 1 ELSE 0 END) as null_count,
|
||
SUM(CASE WHEN order_date > CURDATE() THEN 1 ELSE 0 END) as future_date_count,
|
||
MIN(order_date) as min_date,
|
||
MAX(order_date) as max_date
|
||
FROM sales_orders
|
||
|
||
UNION ALL
|
||
|
||
SELECT
|
||
'employees' as table_name,
|
||
'salary' as check_field,
|
||
COUNT(*) as total_records,
|
||
SUM(CASE WHEN salary IS NULL THEN 1 ELSE 0 END) as null_count,
|
||
SUM(CASE WHEN salary <= 0 THEN 1 ELSE 0 END) as invalid_salary_count,
|
||
MIN(salary) as min_salary,
|
||
MAX(salary) as max_salary
|
||
FROM employees;
|
||
```
|
||
|
||
## 性能优化和最佳实践
|
||
|
||
### 1. 索引优化
|
||
|
||
```sql
|
||
-- 为分组字段创建索引
|
||
CREATE INDEX idx_sales_orders_category ON sales_orders(category);
|
||
CREATE INDEX idx_sales_orders_sales_rep ON sales_orders(sales_rep);
|
||
CREATE INDEX idx_employees_department ON employees(department);
|
||
|
||
-- 复合索引优化分组查询
|
||
CREATE INDEX idx_sales_orders_date_region ON sales_orders(order_date, region);
|
||
CREATE INDEX idx_web_logs_user_date ON web_logs(user_id, visit_date);
|
||
```
|
||
|
||
### 2. 查询优化技巧
|
||
|
||
```sql
|
||
-- ✅ 推荐:使用WHERE过滤再分组
|
||
SELECT department, COUNT(*)
|
||
FROM employees
|
||
WHERE salary > 10000 -- 先过滤
|
||
GROUP BY department;
|
||
|
||
-- ❌ 避免:分组后再过滤
|
||
SELECT department, COUNT(*)
|
||
FROM employees
|
||
GROUP BY department
|
||
HAVING AVG(salary) > 10000; -- 这会计算所有组的平均值
|
||
```
|
||
|
||
### 3. 避免不必要的分组
|
||
|
||
```sql
|
||
-- ❌ 不必要的分组
|
||
SELECT customer_name, SUM(quantity)
|
||
FROM sales_orders
|
||
WHERE customer_name = '阿里巴巴'
|
||
GROUP BY customer_name;
|
||
|
||
-- ✅ 优化:直接使用WHERE
|
||
SELECT '阿里巴巴' as customer_name, SUM(quantity)
|
||
FROM sales_orders
|
||
WHERE customer_name = '阿里巴巴';
|
||
```
|
||
|
||
### 4. 大数据量分组优化
|
||
|
||
```sql
|
||
-- 使用分区表优化大数据量分组
|
||
-- CREATE TABLE sales_orders_partitioned (
|
||
-- ...
|
||
-- ) PARTITION BY RANGE (YEAR(order_date)) (
|
||
-- PARTITION p2023 VALUES LESS THAN (2024),
|
||
-- PARTITION p2024 VALUES LESS THAN (2025)
|
||
-- );
|
||
|
||
-- 优化GROUP BY的内存使用
|
||
SET SESSION group_concat_max_len = 1000000;
|
||
SET SESSION tmp_table_size = 256000000;
|
||
SET SESSION max_heap_table_size = 256000000;
|
||
```
|
||
|
||
### 5. 监控和调优
|
||
|
||
```sql
|
||
-- 查看分组查询的执行计划
|
||
EXPLAIN SELECT department, COUNT(*), AVG(salary)
|
||
FROM employees
|
||
GROUP BY department;
|
||
|
||
-- 监控分组查询性能
|
||
SELECT
|
||
sql_text,
|
||
exec_count,
|
||
avg_timer_wait/1000000000000 as avg_time_sec,
|
||
sum_rows_examined/exec_count as avg_rows_examined
|
||
FROM performance_schema.events_statements_summary_by_digest
|
||
WHERE sql_text LIKE '%GROUP BY%'
|
||
ORDER BY avg_timer_wait DESC
|
||
LIMIT 10;
|
||
```
|
||
|
||
### 6. 最佳实践总结
|
||
|
||
```sql
|
||
-- ✅ 好的GROUP BY实践
|
||
SELECT
|
||
category, -- 分组字段
|
||
COUNT(*) as order_count, -- 明确的聚合函数
|
||
SUM(quantity) as total_qty -- 有意义的别名
|
||
FROM sales_orders
|
||
WHERE order_date >= '2024-01-01' -- 先过滤数据
|
||
GROUP BY category -- 简洁的分组
|
||
HAVING COUNT(*) > 1 -- 分组后过滤
|
||
ORDER BY total_qty DESC -- 有序输出
|
||
LIMIT 10; -- 限制结果数量
|
||
|
||
-- ❌ 避免的问题
|
||
-- 1. 在SELECT中使用非分组字段(MySQL 5.7+会报错)
|
||
-- 2. GROUP BY使用复杂表达式而不创建索引
|
||
-- 3. 不必要的HAVING条件
|
||
-- 4. 大量数据不加WHERE条件直接分组
|
||
```
|
||
|
||
---
|
||
|
||
**总结:**
|
||
- GROUP BY 是数据分析的核心工具,配合聚合函数使用
|
||
- 理解执行顺序:WHERE → GROUP BY → HAVING → SELECT → ORDER BY
|
||
- 合理使用索引可以显著提升分组查询性能
|
||
- HAVING 用于过滤分组结果,WHERE 用于过滤原始数据
|
||
- 复杂分析可以结合窗口函数和子查询实现
|
||
- 注意数据类型和NULL值的处理 |