# MySQL GROUP BY 语法使用文档 ## 目录 1. [GROUP BY 基础语法](#group-by-基础语法) 2. [GROUP BY 规则和特性](#group-by-规则和特性) 3. [示例数据准备](#示例数据准备) 4. [基础分组查询](#基础分组查询) 5. [聚合函数详解](#聚合函数详解) 6. [HAVING 子句使用](#having-子句使用) 7. [多列分组查询](#多列分组查询) 8. [复杂分组统计](#复杂分组统计) 9. [面试题和实际案例](#面试题和实际案例) 10. [性能优化和最佳实践](#性能优化和最佳实践) ## GROUP BY 基础语法 GROUP BY 用于将查询结果按指定字段进行分组,通常与聚合函数一起使用。 ```sql SELECT column1, aggregate_function(column2) FROM table_name [WHERE condition] GROUP BY column1, column2, ... [HAVING group_condition] [ORDER BY column] [LIMIT number]; ``` **执行顺序:** 1. FROM - 确定数据源 2. WHERE - 过滤原始数据 3. GROUP BY - 分组 4. 聚合函数计算 5. HAVING - 过滤分组结果 6. SELECT - 选择输出列 7. ORDER BY - 排序 8. LIMIT - 限制结果数量 ## GROUP BY 规则和特性 1. **SELECT 列限制**:SELECT 子句中只能包含: - GROUP BY 子句中的列 - 聚合函数 - 常量 2. **NULL 值处理**:NULL 值被视为一组 3. **聚合函数**: - COUNT():计数 - SUM():求和 - AVG():平均值 - MAX():最大值 - MIN():最小值 - GROUP_CONCAT():字符串连接 4. **HAVING vs WHERE**: - WHERE:过滤原始行 - HAVING:过滤分组结果 ## 示例数据准备 ### 创建示例表 ```sql -- 销售订单表 CREATE TABLE sales_orders ( order_id INT PRIMARY KEY AUTO_INCREMENT, customer_id INT, customer_name VARCHAR(50), product_id INT, product_name VARCHAR(100), category VARCHAR(50), order_date DATE, quantity INT, unit_price DECIMAL(10,2), sales_rep VARCHAR(50), region VARCHAR(30), discount_rate DECIMAL(3,2) DEFAULT 0.00 ); -- 员工表 CREATE TABLE employees ( emp_id INT PRIMARY KEY, name VARCHAR(50), department VARCHAR(30), position VARCHAR(50), salary DECIMAL(10,2), hire_date DATE, manager_id INT, age INT, gender ENUM('M', 'F'), city VARCHAR(30) ); -- 学生成绩表 CREATE TABLE student_scores ( student_id INT, student_name VARCHAR(50), subject VARCHAR(30), score INT, exam_date DATE, teacher VARCHAR(30), class_id INT, semester VARCHAR(20), PRIMARY KEY (student_id, subject, exam_date) ); -- 网站访问日志表 CREATE TABLE web_logs ( log_id INT PRIMARY KEY AUTO_INCREMENT, user_id INT, page_url VARCHAR(200), visit_date DATE, visit_time TIME, session_duration INT, -- 会话时长(分钟) device_type VARCHAR(20), browser VARCHAR(30), country VARCHAR(30), page_views INT DEFAULT 1 ); -- 库存表 CREATE TABLE inventory ( product_id INT PRIMARY KEY, product_name VARCHAR(100), category VARCHAR(50), supplier VARCHAR(50), stock_quantity INT, reorder_level INT, unit_cost DECIMAL(8,2), last_restock_date DATE, warehouse_location VARCHAR(30) ); ``` ### 插入示例数据 ```sql -- 插入销售订单数据 INSERT INTO sales_orders (customer_id, customer_name, product_id, product_name, category, order_date, quantity, unit_price, sales_rep, region, discount_rate) VALUES (1, '阿里巴巴', 101, 'MacBook Pro', '笔记本', '2024-01-15', 10, 12999.00, '张三', '华东', 0.05), (1, '阿里巴巴', 102, 'iPhone 15', '手机', '2024-01-16', 50, 5999.00, '张三', '华东', 0.03), (2, '腾讯', 103, 'iPad Air', '平板', '2024-01-20', 20, 3999.00, '李四', '华南', 0.04), (2, '腾讯', 101, 'MacBook Pro', '笔记本', '2024-01-22', 15, 12999.00, '李四', '华南', 0.06), (3, '百度', 104, 'AirPods Pro', '耳机', '2024-01-25', 100, 1999.00, '王五', '华北', 0.08), (3, '百度', 102, 'iPhone 15', '手机', '2024-01-28', 30, 5999.00, '王五', '华北', 0.05), (4, '字节跳动', 105, 'Apple Watch', '智能手表', '2024-02-01', 25, 2999.00, '赵六', '华北', 0.10), (4, '字节跳动', 103, 'iPad Air', '平板', '2024-02-03', 35, 3999.00, '赵六', '华北', 0.07), (5, '美团', 106, 'MacBook Air', '笔记本', '2024-02-05', 12, 8999.00, '钱七', '华东', 0.04), (5, '美团', 104, 'AirPods Pro', '耳机', '2024-02-08', 80, 1999.00, '钱七', '华东', 0.06), (6, '滴滴', 107, 'iMac', '台式机', '2024-02-10', 8, 15999.00, '孙八', '华南', 0.03), (6, '滴滴', 102, 'iPhone 15', '手机', '2024-02-12', 40, 5999.00, '孙八', '华南', 0.04), (7, '小米', 108, 'iPad Pro', '平板', '2024-02-15', 18, 7999.00, '周九', '华北', 0.05), (7, '小米', 105, 'Apple Watch', '智能手表', '2024-02-18', 30, 2999.00, '周九', '华北', 0.12), (8, '华为', 109, 'Studio Display', '显示器', '2024-02-20', 6, 11999.00, '吴十', '华南', 0.02); -- 插入员工数据 INSERT INTO employees (emp_id, name, department, position, salary, hire_date, manager_id, age, gender, city) VALUES (1001, '张经理', '技术部', '部门经理', 25000.00, '2020-01-15', NULL, 35, 'M', '北京'), (1002, '李架构师', '技术部', '高级架构师', 22000.00, '2020-05-20', 1001, 32, 'M', '北京'), (1003, '王工程师', '技术部', '高级工程师', 18000.00, '2021-03-10', 1001, 28, 'F', '北京'), (1004, '赵工程师', '技术部', '中级工程师', 15000.00, '2022-01-15', 1001, 26, 'M', '北京'), (1005, '钱实习生', '技术部', '实习工程师', 8000.00, '2023-07-01', 1002, 23, 'F', '北京'), (2001, '孙经理', '产品部', '产品经理', 20000.00, '2021-02-01', NULL, 30, 'F', '上海'), (2002, '周产品', '产品部', '高级产品', 16000.00, '2021-08-15', 2001, 27, 'M', '上海'), (2003, '吴助理', '产品部', '产品助理', 12000.00, '2022-06-20', 2001, 25, 'F', '上海'), (3001, '郑经理', '销售部', '销售经理', 18000.00, '2020-11-10', NULL, 33, 'M', '深圳'), (3002, '刘销售', '销售部', '高级销售', 14000.00, '2021-09-25', 3001, 29, 'F', '深圳'), (3003, '陈销售', '销售部', '销售代表', 10000.00, '2022-12-01', 3001, 24, 'M', '深圳'), (4001, '林经理', '人事部', 'HR经理', 16000.00, '2021-04-12', NULL, 31, 'F', '广州'), (4002, '黄专员', '人事部', 'HR专员', 11000.00, '2022-08-30', 4001, 26, 'F', '广州'), (5001, '何经理', '财务部', '财务经理', 19000.00, '2020-07-08', NULL, 34, 'M', '杭州'), (5002, '魏会计', '财务部', '会计', 13000.00, '2021-11-20', 5001, 28, 'F', '杭州'); -- 插入学生成绩数据 INSERT INTO student_scores (student_id, student_name, subject, score, exam_date, teacher, class_id, semester) VALUES (1, '张小明', '数学', 85, '2024-01-15', '王老师', 1, '2024春'), (1, '张小明', '语文', 78, '2024-01-16', '李老师', 1, '2024春'), (1, '张小明', '英语', 92, '2024-01-17', '赵老师', 1, '2024春'), (2, '李小红', '数学', 92, '2024-01-15', '王老师', 1, '2024春'), (2, '李小红', '语文', 88, '2024-01-16', '李老师', 1, '2024春'), (2, '李小红', '英语', 95, '2024-01-17', '赵老师', 1, '2024春'), (3, '王小刚', '数学', 76, '2024-01-15', '王老师', 2, '2024春'), (3, '王小刚', '语文', 82, '2024-01-16', '李老师', 2, '2024春'), (3, '王小刚', '英语', 79, '2024-01-17', '赵老师', 2, '2024春'), (4, '赵小丽', '数学', 88, '2024-01-15', '王老师', 2, '2024春'), (4, '赵小丽', '语文', 85, '2024-01-16', '李老师', 2, '2024春'), (4, '赵小丽', '英语', 90, '2024-01-17', '赵老师', 2, '2024春'), (5, '钱小伟', '数学', 90, '2024-01-15', '钱老师', 3, '2024春'), (5, '钱小伟', '语文', 87, '2024-01-16', '孙老师', 3, '2024春'), (5, '钱小伟', '英语', 93, '2024-01-17', '周老师', 3, '2024春'), -- 添加期中考试成绩 (1, '张小明', '数学', 88, '2024-03-15', '王老师', 1, '2024春'), (1, '张小明', '语文', 82, '2024-03-16', '李老师', 1, '2024春'), (2, '李小红', '数学', 95, '2024-03-15', '王老师', 1, '2024春'), (2, '李小红', '语文', 90, '2024-03-16', '李老师', 1, '2024春'), (3, '王小刚', '数学', 80, '2024-03-15', '王老师', 2, '2024春'); -- 插入网站访问日志数据 INSERT INTO web_logs (user_id, page_url, visit_date, visit_time, session_duration, device_type, browser, country, page_views) VALUES (1001, '/home', '2024-01-15', '09:30:00', 25, 'Desktop', 'Chrome', '中国', 5), (1001, '/products', '2024-01-15', '10:15:00', 15, 'Desktop', 'Chrome', '中国', 8), (1002, '/home', '2024-01-15', '14:20:00', 30, 'Mobile', 'Safari', '美国', 3), (1003, '/about', '2024-01-16', '11:45:00', 20, 'Tablet', 'Chrome', '英国', 4), (1001, '/checkout', '2024-01-16', '16:30:00', 45, 'Desktop', 'Chrome', '中国', 2), (1004, '/home', '2024-01-17', '08:15:00', 35, 'Mobile', 'Firefox', '日本', 6), (1002, '/products', '2024-01-17', '13:20:00', 28, 'Desktop', 'Edge', '美国', 7), (1005, '/contact', '2024-01-18', '10:40:00', 12, 'Mobile', 'Safari', '加拿大', 2), (1003, '/home', '2024-01-18', '15:25:00', 22, 'Desktop', 'Chrome', '英国', 4), (1006, '/products', '2024-01-19', '09:10:00', 40, 'Tablet', 'Safari', '澳大利亚', 9), (1001, '/home', '2024-01-19', '14:35:00', 18, 'Mobile', 'Chrome', '中国', 3), (1007, '/login', '2024-01-20', '11:20:00', 8, 'Desktop', 'Firefox', '德国', 1), (1002, '/dashboard', '2024-01-20', '16:45:00', 55, 'Desktop', 'Chrome', '美国', 12); -- 插入库存数据 INSERT INTO inventory (product_id, product_name, category, supplier, stock_quantity, reorder_level, unit_cost, last_restock_date, warehouse_location) VALUES (101, 'MacBook Pro', '笔记本', 'Apple', 45, 20, 10000.00, '2024-01-10', '北京仓库'), (102, 'iPhone 15', '手机', 'Apple', 120, 50, 4500.00, '2024-01-12', '上海仓库'), (103, 'iPad Air', '平板', 'Apple', 80, 30, 3200.00, '2024-01-08', '深圳仓库'), (104, 'AirPods Pro', '耳机', 'Apple', 200, 80, 1500.00, '2024-01-15', '北京仓库'), (105, 'Apple Watch', '智能手表', 'Apple', 60, 25, 2400.00, '2024-01-05', '广州仓库'), (106, 'MacBook Air', '笔记本', 'Apple', 35, 15, 7200.00, '2024-01-20', '上海仓库'), (107, 'iMac', '台式机', 'Apple', 15, 10, 12800.00, '2024-01-18', '深圳仓库'), (108, 'iPad Pro', '平板', 'Apple', 25, 15, 6400.00, '2024-01-22', '北京仓库'), (109, 'Studio Display', '显示器', 'Apple', 12, 8, 9600.00, '2024-01-25', '广州仓库'), (110, 'Mac Mini', '台式机', 'Apple', 30, 12, 4800.00, '2024-01-28', '杭州仓库'); ``` ## 基础分组查询 ### 1. 简单分组统计 ```sql -- 按产品类别统计销售数量 SELECT category, SUM(quantity) AS total_quantity FROM sales_orders GROUP BY category; ``` **结果:** ``` +----------+----------------+ | category | total_quantity | +----------+----------------+ | 笔记本 | 37 | | 手机 | 120 | | 平板 | 73 | | 耳机 | 180 | | 智能手表 | 55 | | 台式机 | 8 | | 显示器 | 6 | +----------+----------------+ ``` ### 2. 按部门统计员工信息 ```sql -- 按部门统计员工数量和平均薪资 SELECT department, COUNT(*) AS emp_count, ROUND(AVG(salary), 2) AS avg_salary, MIN(salary) AS min_salary, MAX(salary) AS max_salary FROM employees GROUP BY department; ``` **结果:** ``` +----------+-----------+------------+------------+------------+ | department | emp_count | avg_salary | min_salary | max_salary | +----------+-----------+------------+------------+------------+ | 技术部 | 5 | 17600.00 | 8000.00 | 25000.00 | | 产品部 | 3 | 16000.00 | 12000.00 | 20000.00 | | 销售部 | 3 | 14000.00 | 10000.00 | 18000.00 | | 人事部 | 2 | 13500.00 | 11000.00 | 16000.00 | | 财务部 | 2 | 16000.00 | 13000.00 | 19000.00 | +----------+-----------+------------+------------+------------+ ``` ### 3. 按日期分组统计 ```sql -- 按月份统计订单数量和销售额 SELECT DATE_FORMAT(order_date, '%Y-%m') AS order_month, COUNT(*) AS order_count, SUM(quantity * unit_price * (1 - discount_rate)) AS total_sales FROM sales_orders GROUP BY DATE_FORMAT(order_date, '%Y-%m') ORDER BY order_month; ``` **结果:** ``` +-------------+-------------+-------------+ | order_month | order_count | total_sales | +-------------+-------------+-------------+ | 2024-01 | 6 | 912393.00 | | 2024-02 | 9 | 809328.40 | +-------------+-------------+-------------+ ``` ## 聚合函数详解 ### 1. COUNT() 函数详解 ```sql -- COUNT 的不同用法 SELECT COUNT(*) AS total_rows, -- 总行数 COUNT(manager_id) AS has_manager, -- 非NULL值数量 COUNT(DISTINCT department) AS dept_count, -- 去重计数 COUNT(CASE WHEN salary > 15000 THEN 1 END) AS high_salary_count -- 条件计数 FROM employees; ``` **结果:** ``` +------------+-------------+------------+-------------------+ | total_rows | has_manager | dept_count | high_salary_count | +------------+-------------+------------+-------------------+ | 15 | 10 | 5 | 8 | +------------+-------------+------------+-------------------+ ``` ### 2. SUM() 和 AVG() 函数 ```sql -- 按销售代表统计业绩 SELECT sales_rep, COUNT(*) AS order_count, SUM(quantity) AS total_quantity, SUM(quantity * unit_price * (1 - discount_rate)) AS total_sales, ROUND(AVG(quantity * unit_price * (1 - discount_rate)), 2) AS avg_order_value FROM sales_orders GROUP BY sales_rep ORDER BY total_sales DESC; ``` **结果:** ``` +----------+-------------+----------------+-------------+-----------------+ | sales_rep | order_count | total_quantity | total_sales | avg_order_value | +----------+-------------+----------------+-------------+-----------------+ | 张三 | 2 | 60 | 423447.00 | 211723.50 | | 李四 | 2 | 35 | 318969.00 | 159484.50 | | 王五 | 2 | 130 | 344562.00 | 172281.00 | | 赵六 | 2 | 60 | 206187.00 | 103093.50 | | 钱七 | 2 | 92 | 258291.60 | 129145.80 | | 孙八 | 2 | 48 | 355464.00 | 177732.00 | | 周九 | 2 | 48 | 222777.60 | 111388.80 | | 吴十 | 1 | 6 | 70622.80 | 70622.80 | +----------+-------------+----------------+-------------+-----------------+ ``` ### 3. MAX() 和 MIN() 函数 ```sql -- 按班级统计成绩分布 SELECT class_id, COUNT(*) AS student_count, MAX(score) AS highest_score, MIN(score) AS lowest_score, ROUND(AVG(score), 2) AS avg_score, MAX(score) - MIN(score) AS score_range FROM student_scores GROUP BY class_id ORDER BY class_id; ``` **结果:** ``` +----------+---------------+---------------+--------------+-----------+-------------+ | class_id | student_count | highest_score | lowest_score | avg_score | score_range | +----------+---------------+---------------+--------------+-----------+-------------+ | 1 | 8 | 95 | 78 | 86.75 | 17 | | 2 | 7 | 90 | 76 | 82.71 | 14 | | 3 | 3 | 93 | 87 | 90.00 | 6 | +----------+---------------+---------------+--------------+-----------+-------------+ ``` ### 4. GROUP_CONCAT() 函数 ```sql -- 按部门列出所有员工姓名 SELECT department, COUNT(*) AS emp_count, GROUP_CONCAT(name ORDER BY salary DESC) AS employees, GROUP_CONCAT(DISTINCT city) AS cities FROM employees GROUP BY department; ``` **结果:** ``` +----------+-----------+------------------------------------------+----------+ | department | emp_count | employees | cities | +----------+-----------+------------------------------------------+----------+ | 技术部 | 5 | 张经理,李架构师,王工程师,赵工程师,钱实习生 | 北京 | | 产品部 | 3 | 孙经理,周产品,吴助理 | 上海 | | 销售部 | 3 | 郑经理,刘销售,陈销售 | 深圳 | | 人事部 | 2 | 林经理,黄专员 | 广州 | | 财务部 | 2 | 何经理,魏会计 | 杭州 | +----------+-----------+------------------------------------------+----------+ ``` ## HAVING 子句使用 ### 1. HAVING 基础用法 ```sql -- 查找员工数量大于2的部门 SELECT department, COUNT(*) AS emp_count, ROUND(AVG(salary), 2) AS avg_salary FROM employees GROUP BY department HAVING COUNT(*) > 2 ORDER BY emp_count DESC; ``` **结果:** ``` +----------+-----------+------------+ | department | emp_count | avg_salary | +----------+-----------+------------+ | 技术部 | 5 | 17600.00 | | 产品部 | 3 | 16000.00 | | 销售部 | 3 | 14000.00 | +----------+-----------+------------+ ``` ### 2. HAVING 与 WHERE 的区别 ```sql -- 错误用法:WHERE 不能使用聚合函数 -- SELECT department, COUNT(*) FROM employees WHERE COUNT(*) > 2 GROUP BY department; -- 正确用法:先过滤再分组 SELECT department, COUNT(*) AS emp_count, ROUND(AVG(salary), 2) AS avg_salary FROM employees WHERE salary > 12000 -- 先过滤薪资大于12000的员工 GROUP BY department HAVING COUNT(*) >= 2 -- 再过滤员工数量>=2的部门 ORDER BY avg_salary DESC; ``` **结果:** ``` +----------+-----------+------------+ | department | emp_count | avg_salary | +----------+-----------+------------+ | 技术部 | 3 | 21666.67 | | 产品部 | 2 | 18000.00 | | 财务部 | 2 | 16000.00 | +----------+-----------+------------+ ``` ### 3. 复杂 HAVING 条件 ```sql -- 查找高价值客户(订单总额>30万且订单数量>1) SELECT customer_name, COUNT(*) AS order_count, SUM(quantity * unit_price * (1 - discount_rate)) AS total_sales, ROUND(AVG(quantity * unit_price * (1 - discount_rate)), 2) AS avg_order_value FROM sales_orders GROUP BY customer_name HAVING COUNT(*) > 1 AND SUM(quantity * unit_price * (1 - discount_rate)) > 300000 ORDER BY total_sales DESC; ``` **结果:** ``` +--------------+-------------+-------------+-----------------+ | customer_name | order_count | total_sales | avg_order_value | +--------------+-------------+-------------+-----------------+ | 阿里巴巴 | 2 | 423447.00 | 211723.50 | | 腾讯 | 2 | 318969.00 | 159484.50 | | 百度 | 2 | 344562.00 | 172281.00 | | 滴滴 | 2 | 355464.00 | 177732.00 | +-------------+-------------+-------------+-----------------+ ``` ## 多列分组查询 ### 1. 双列分组 ```sql -- 按地区和销售代表分组统计 SELECT region, sales_rep, COUNT(*) AS order_count, SUM(quantity) AS total_quantity, SUM(quantity * unit_price * (1 - discount_rate)) AS total_sales FROM sales_orders GROUP BY region, sales_rep ORDER BY region, total_sales DESC; ``` **结果:** ``` +--------+-----------+-------------+----------------+-------------+ | region | sales_rep | order_count | total_quantity | total_sales | +--------+-----------+-------------+----------------+-------------+ | 华北 | 王五 | 2 | 130 | 344562.00 | | 华北 | 赵六 | 2 | 60 | 206187.00 | | 华北 | 周九 | 2 | 48 | 222777.60 | | 华东 | 张三 | 2 | 60 | 423447.00 | | 华东 | 钱七 | 2 | 92 | 258291.60 | | 华南 | 李四 | 2 | 35 | 318969.00 | | 华南 | 孙八 | 2 | 48 | 355464.00 | | 华南 | 吴十 | 1 | 6 | 70622.80 | +--------+-----------+-------------+----------------+-------------+ ``` ### 2. 三列分组 ```sql -- 按年龄段、性别、部门分组统计员工 SELECT CASE WHEN age < 25 THEN '24岁以下' WHEN age <= 30 THEN '25-30岁' WHEN age <= 35 THEN '31-35岁' ELSE '35岁以上' END AS age_group, gender, department, COUNT(*) AS emp_count, ROUND(AVG(salary), 2) AS avg_salary FROM employees GROUP BY CASE WHEN age < 25 THEN '24岁以下' WHEN age <= 30 THEN '25-30岁' WHEN age <= 35 THEN '31-35岁' ELSE '35岁以上' END, gender, department HAVING COUNT(*) >= 1 ORDER BY age_group, gender, department; ``` ### 3. 时间维度分组 ```sql -- 按年、月、产品类别分组统计 SELECT YEAR(order_date) AS order_year, MONTH(order_date) AS order_month, category, COUNT(*) AS order_count, SUM(quantity) AS total_quantity, ROUND(SUM(quantity * unit_price * (1 - discount_rate)), 2) AS total_sales FROM sales_orders GROUP BY YEAR(order_date), MONTH(order_date), category ORDER BY order_year, order_month, total_sales DESC; ``` ## 复杂分组统计 ### 1. 嵌套分组查询 ```sql -- 查找每个部门薪资最高的员工信息 SELECT e1.* FROM employees e1 WHERE e1.salary = ( SELECT MAX(e2.salary) FROM employees e2 WHERE e2.department = e1.department ) ORDER BY e1.department, e1.salary DESC; ``` ### 2. 窗口函数与分组 ```sql -- 计算每个员工在部门内的薪资排名 SELECT name, department, salary, ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS dept_rank, ROUND(salary / AVG(salary) OVER (PARTITION BY department) * 100, 2) AS salary_ratio FROM employees ORDER BY department, dept_rank; ``` ### 3. 数据透视表效果 ```sql -- 按设备类型和浏览器统计访问情况 SELECT device_type, SUM(CASE WHEN browser = 'Chrome' THEN page_views ELSE 0 END) AS Chrome, SUM(CASE WHEN browser = 'Safari' THEN page_views ELSE 0 END) AS Safari, SUM(CASE WHEN browser = 'Firefox' THEN page_views ELSE 0 END) AS Firefox, SUM(CASE WHEN browser = 'Edge' THEN page_views ELSE 0 END) AS Edge, SUM(page_views) AS Total FROM web_logs GROUP BY device_type ORDER BY Total DESC; ``` **结果:** ``` +-------------+--------+--------+---------+------+-------+ | device_type | Chrome | Safari | Firefox | Edge | Total | +-------------+--------+--------+---------+------+-------+ | Desktop | 24 | 0 | 1 | 7 | 32 | | Mobile | 6 | 5 | 6 | 0 | 17 | | Tablet | 4 | 9 | 0 | 0 | 13 | +-------------+--------+--------+---------+------+-------+ ``` ### 4. 动态分组条件 ```sql -- 按库存状态分组统计产品 SELECT CASE WHEN stock_quantity <= reorder_level THEN '需要补货' WHEN stock_quantity <= reorder_level * 2 THEN '库存偏低' ELSE '库存充足' END AS stock_status, COUNT(*) AS product_count, SUM(stock_quantity) AS total_stock, ROUND(AVG(unit_cost), 2) AS avg_cost FROM inventory GROUP BY CASE WHEN stock_quantity <= reorder_level THEN '需要补货' WHEN stock_quantity <= reorder_level * 2 THEN '库存偏低' ELSE '库存充足' END ORDER BY CASE WHEN stock_status = '需要补货' THEN 1 WHEN stock_status = '库存偏低' THEN 2 ELSE 3 END; ``` ## 面试题和实际案例 ### 面试题1:连续登录用户分析 **题目**:找出连续3天都有访问记录的用户。 ```sql -- 解答:使用窗口函数和分组 WITH daily_users AS ( SELECT DISTINCT user_id, visit_date FROM web_logs ), user_sequences AS ( SELECT user_id, visit_date, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY visit_date) as rn, DATE_SUB(visit_date, INTERVAL ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY visit_date) DAY) as group_date FROM daily_users ), consecutive_groups AS ( SELECT user_id, group_date, COUNT(*) as consecutive_days FROM user_sequences GROUP BY user_id, group_date HAVING COUNT(*) >= 3 ) SELECT DISTINCT cg.user_id FROM consecutive_groups cg; ``` ### 面试题2:同比增长率计算 **题目**:计算每个月的销售额同比增长率。 ```sql -- 解答:使用自连接和分组 WITH monthly_sales AS ( SELECT DATE_FORMAT(order_date, '%Y-%m') as month, SUM(quantity * unit_price * (1 - discount_rate)) as total_sales FROM sales_orders GROUP BY DATE_FORMAT(order_date, '%Y-%m') ) SELECT current_month.month, current_month.total_sales as current_sales, last_year.total_sales as last_year_sales, ROUND( (current_month.total_sales - IFNULL(last_year.total_sales, 0)) / IFNULL(last_year.total_sales, current_month.total_sales) * 100, 2 ) as growth_rate FROM monthly_sales current_month LEFT JOIN monthly_sales last_year ON DATE_FORMAT(DATE_SUB(STR_TO_DATE(CONCAT(current_month.month, '-01'), '%Y-%m-%d'), INTERVAL 1 YEAR), '%Y-%m') = last_year.month ORDER BY current_month.month; ``` ### 面试题3:帕累托分析(80/20法则) **题目**:找出贡献80%销售额的前20%客户。 ```sql -- 解答:使用窗口函数计算累计占比 WITH customer_sales AS ( SELECT customer_name, SUM(quantity * unit_price * (1 - discount_rate)) as total_sales FROM sales_orders GROUP BY customer_name ), customer_ranking AS ( SELECT customer_name, total_sales, ROW_NUMBER() OVER (ORDER BY total_sales DESC) as rank_num, COUNT(*) OVER () as total_customers, SUM(total_sales) OVER () as grand_total, SUM(total_sales) OVER (ORDER BY total_sales DESC) as cumulative_sales FROM customer_sales ), customer_contribution AS ( SELECT *, ROUND(rank_num / total_customers * 100, 2) as customer_percentile, ROUND(cumulative_sales / grand_total * 100, 2) as sales_percentile FROM customer_ranking ) SELECT customer_name, total_sales, customer_percentile, sales_percentile FROM customer_contribution WHERE sales_percentile <= 80 ORDER BY total_sales DESC; ``` ### 实际案例1:用户行为分析 **场景**:电商网站需要分析用户访问行为,优化用户体验。 ```sql -- 综合用户行为分析 SELECT country, device_type, COUNT(DISTINCT user_id) as unique_users, COUNT(*) as total_sessions, ROUND(COUNT(*) / COUNT(DISTINCT user_id), 2) as sessions_per_user, ROUND(AVG(session_duration), 2) as avg_session_duration, SUM(page_views) as total_page_views, ROUND(SUM(page_views) / COUNT(*), 2) as pages_per_session, ROUND(SUM(session_duration) / 60, 2) as total_hours FROM web_logs GROUP BY country, device_type HAVING COUNT(DISTINCT user_id) >= 1 ORDER BY unique_users DESC, avg_session_duration DESC; ``` ### 实际案例2:销售业绩分析 **场景**:销售部门需要全面分析销售业绩,制定奖励策略。 ```sql -- 销售代表综合业绩分析 WITH sales_performance AS ( SELECT sales_rep, COUNT(*) as order_count, COUNT(DISTINCT customer_name) as customer_count, SUM(quantity) as total_quantity, SUM(quantity * unit_price * (1 - discount_rate)) as total_sales, ROUND(AVG(quantity * unit_price * (1 - discount_rate)), 2) as avg_order_value, MAX(quantity * unit_price * (1 - discount_rate)) as max_order_value, COUNT(DISTINCT category) as category_diversity FROM sales_orders GROUP BY sales_rep ), performance_ranking AS ( SELECT *, RANK() OVER (ORDER BY total_sales DESC) as sales_rank, RANK() OVER (ORDER BY customer_count DESC) as customer_rank, RANK() OVER (ORDER BY avg_order_value DESC) as avg_value_rank, ROUND(total_sales / SUM(total_sales) OVER () * 100, 2) as sales_share FROM sales_performance ) SELECT sales_rep, total_sales, customer_count, avg_order_value, sales_rank, customer_rank, sales_share, CASE WHEN sales_rank <= 2 THEN '金牌销售' WHEN sales_rank <= 4 THEN '银牌销售' ELSE '铜牌销售' END as performance_level FROM performance_ranking ORDER BY sales_rank; ``` ### 实际案例3:库存管理优化 **场景**:仓库管理需要优化库存配置,减少缺货和积压。 ```sql -- 库存分析和补货建议 SELECT category, warehouse_location, COUNT(*) as product_count, SUM(stock_quantity) as total_stock, SUM(CASE WHEN stock_quantity <= reorder_level THEN 1 ELSE 0 END) as need_reorder, ROUND(AVG(stock_quantity), 2) as avg_stock, ROUND(AVG(unit_cost), 2) as avg_cost, SUM(stock_quantity * unit_cost) as inventory_value, ROUND(SUM(CASE WHEN stock_quantity <= reorder_level THEN unit_cost * reorder_level ELSE 0 END), 2) as reorder_investment FROM inventory GROUP BY category, warehouse_location HAVING COUNT(*) >= 1 ORDER BY inventory_value DESC, need_reorder DESC; ``` ### 面试题4:数据质量检查 **题目**:检查数据中的异常情况。 ```sql -- 多维度数据质量检查 SELECT 'sales_orders' as table_name, 'order_date' as check_field, COUNT(*) as total_records, SUM(CASE WHEN order_date IS NULL THEN 1 ELSE 0 END) as null_count, SUM(CASE WHEN order_date > CURDATE() THEN 1 ELSE 0 END) as future_date_count, MIN(order_date) as min_date, MAX(order_date) as max_date FROM sales_orders UNION ALL SELECT 'employees' as table_name, 'salary' as check_field, COUNT(*) as total_records, SUM(CASE WHEN salary IS NULL THEN 1 ELSE 0 END) as null_count, SUM(CASE WHEN salary <= 0 THEN 1 ELSE 0 END) as invalid_salary_count, MIN(salary) as min_salary, MAX(salary) as max_salary FROM employees; ``` ## 性能优化和最佳实践 ### 1. 索引优化 ```sql -- 为分组字段创建索引 CREATE INDEX idx_sales_orders_category ON sales_orders(category); CREATE INDEX idx_sales_orders_sales_rep ON sales_orders(sales_rep); CREATE INDEX idx_employees_department ON employees(department); -- 复合索引优化分组查询 CREATE INDEX idx_sales_orders_date_region ON sales_orders(order_date, region); CREATE INDEX idx_web_logs_user_date ON web_logs(user_id, visit_date); ``` ### 2. 查询优化技巧 ```sql -- ✅ 推荐:使用WHERE过滤再分组 SELECT department, COUNT(*) FROM employees WHERE salary > 10000 -- 先过滤 GROUP BY department; -- ❌ 避免:分组后再过滤 SELECT department, COUNT(*) FROM employees GROUP BY department HAVING AVG(salary) > 10000; -- 这会计算所有组的平均值 ``` ### 3. 避免不必要的分组 ```sql -- ❌ 不必要的分组 SELECT customer_name, SUM(quantity) FROM sales_orders WHERE customer_name = '阿里巴巴' GROUP BY customer_name; -- ✅ 优化:直接使用WHERE SELECT '阿里巴巴' as customer_name, SUM(quantity) FROM sales_orders WHERE customer_name = '阿里巴巴'; ``` ### 4. 大数据量分组优化 ```sql -- 使用分区表优化大数据量分组 -- CREATE TABLE sales_orders_partitioned ( -- ... -- ) PARTITION BY RANGE (YEAR(order_date)) ( -- PARTITION p2023 VALUES LESS THAN (2024), -- PARTITION p2024 VALUES LESS THAN (2025) -- ); -- 优化GROUP BY的内存使用 SET SESSION group_concat_max_len = 1000000; SET SESSION tmp_table_size = 256000000; SET SESSION max_heap_table_size = 256000000; ``` ### 5. 监控和调优 ```sql -- 查看分组查询的执行计划 EXPLAIN SELECT department, COUNT(*), AVG(salary) FROM employees GROUP BY department; -- 监控分组查询性能 SELECT sql_text, exec_count, avg_timer_wait/1000000000000 as avg_time_sec, sum_rows_examined/exec_count as avg_rows_examined FROM performance_schema.events_statements_summary_by_digest WHERE sql_text LIKE '%GROUP BY%' ORDER BY avg_timer_wait DESC LIMIT 10; ``` ### 6. 最佳实践总结 ```sql -- ✅ 好的GROUP BY实践 SELECT category, -- 分组字段 COUNT(*) as order_count, -- 明确的聚合函数 SUM(quantity) as total_qty -- 有意义的别名 FROM sales_orders WHERE order_date >= '2024-01-01' -- 先过滤数据 GROUP BY category -- 简洁的分组 HAVING COUNT(*) > 1 -- 分组后过滤 ORDER BY total_qty DESC -- 有序输出 LIMIT 10; -- 限制结果数量 -- ❌ 避免的问题 -- 1. 在SELECT中使用非分组字段(MySQL 5.7+会报错) -- 2. GROUP BY使用复杂表达式而不创建索引 -- 3. 不必要的HAVING条件 -- 4. 大量数据不加WHERE条件直接分组 ``` --- **总结:** - GROUP BY 是数据分析的核心工具,配合聚合函数使用 - 理解执行顺序:WHERE → GROUP BY → HAVING → SELECT → ORDER BY - 合理使用索引可以显著提升分组查询性能 - HAVING 用于过滤分组结果,WHERE 用于过滤原始数据 - 复杂分析可以结合窗口函数和子查询实现 - 注意数据类型和NULL值的处理