简化查询,增加SQL的可读性和可维护性。我的理解是,CTE的作用相当于编程语言里的临时变量,起到复用和增加可读性的作用。在SQL里,如果没有CTE,很多复杂查询就只能用子查询来完成,嵌套层次太多,不容易阅读和维护。有了CTE的支持,就可以把嵌套层次很深的查询展开。例如,Discourse里的一个报表查询:
-- Moderator activity report: one output row per moderator, carrying the
-- time read, handled flags, revisions, topics, posts and private-message
-- posts that fall inside the report window. The window bounds and the list
-- of flag type ids are spliced into the text by the host application
-- before the statement is sent to the database.
WITH mods AS (
    -- Users marked as moderators whose id is positive.
    SELECT
        u.id AS user_id,
        u.username_lower AS username,
        u.uploaded_avatar_id
    FROM users AS u
    WHERE u.moderator = 'true'
        AND u.id > 0
),
time_read AS (
    -- Sum of user_visits.time_read per moderator for visits in the window.
    SELECT
        SUM(uv.time_read) AS time_read,
        uv.user_id
    FROM mods AS m
    INNER JOIN user_visits AS uv
        ON uv.user_id = m.user_id
    WHERE uv.visited_at >= '#{report.start_date}'
        AND uv.visited_at <= '#{report.end_date}'
    GROUP BY uv.user_id
),
flag_count AS (
    -- Flags a moderator agreed or disagreed with, added together.
    WITH period_actions AS (
        -- Post actions of the selected flag types created in the window.
        SELECT
            act.agreed_by_id,
            act.disagreed_by_id
        FROM post_actions AS act
        WHERE act.post_action_type_id IN (#{PostActionType.flag_types_without_custom.values.join(',')})
            AND act.created_at >= '#{report.start_date}'
            AND act.created_at <= '#{report.end_date}'
    ),
    agreed_flags AS (
        SELECT
            pa.agreed_by_id AS user_id,
            COUNT(*) AS flag_count
        FROM mods AS m
        INNER JOIN period_actions AS pa
            ON pa.agreed_by_id = m.user_id
        GROUP BY pa.agreed_by_id
    ),
    disagreed_flags AS (
        SELECT
            pa.disagreed_by_id AS user_id,
            COUNT(*) AS flag_count
        FROM mods AS m
        INNER JOIN period_actions AS pa
            ON pa.disagreed_by_id = m.user_id
        GROUP BY pa.disagreed_by_id
    )
    -- FULL OUTER JOIN keeps moderators present on only one side; COALESCE
    -- turns the missing side into 0 before the two counts are summed.
    SELECT
        COALESCE(af.user_id, df.user_id) AS user_id,
        COALESCE(af.flag_count, 0) + COALESCE(df.flag_count, 0) AS flag_count
    FROM agreed_flags AS af
    FULL OUTER JOIN disagreed_flags AS df
        ON df.user_id = af.user_id
),
revision_count AS (
    -- Revisions by a moderator in the window, on posts whose user_id
    -- differs from the reviser's.
    SELECT
        pr.user_id,
        COUNT(*) AS revision_count
    FROM mods AS m
    INNER JOIN post_revisions AS pr
        ON pr.user_id = m.user_id
    INNER JOIN posts AS p
        ON p.id = pr.post_id
    WHERE pr.created_at >= '#{report.start_date}'
        AND pr.created_at <= '#{report.end_date}'
        AND p.user_id <> pr.user_id
    GROUP BY pr.user_id
),
topic_count AS (
    -- Topics with archetype 'regular' carrying the moderator's user_id,
    -- created in the window.
    SELECT
        t.user_id,
        COUNT(*) AS topic_count
    FROM mods AS m
    INNER JOIN topics AS t
        ON t.user_id = m.user_id
    WHERE t.archetype = 'regular'
        AND t.created_at >= '#{report.start_date}'
        AND t.created_at <= '#{report.end_date}'
    GROUP BY t.user_id
),
post_count AS (
    -- Posts carrying the moderator's user_id, created in the window, in
    -- topics with archetype 'regular'.
    SELECT
        p.user_id,
        COUNT(*) AS post_count
    FROM mods AS m
    INNER JOIN posts AS p
        ON p.user_id = m.user_id
    INNER JOIN topics AS t
        ON t.id = p.topic_id
    WHERE t.archetype = 'regular'
        AND p.created_at >= '#{report.start_date}'
        AND p.created_at <= '#{report.end_date}'
    GROUP BY p.user_id
),
pm_count AS (
    -- Same shape as post_count, restricted to topics with archetype
    -- 'private_message'.
    SELECT
        p.user_id,
        COUNT(*) AS pm_count
    FROM mods AS m
    INNER JOIN posts AS p
        ON p.user_id = m.user_id
    INNER JOIN topics AS t
        ON t.id = p.topic_id
    WHERE t.archetype = 'private_message'
        AND p.created_at >= '#{report.start_date}'
        AND p.created_at <= '#{report.end_date}'
    GROUP BY p.user_id
)
-- LEFT JOINs keep every moderator; a metric with no matching rows comes
-- back as NULL rather than 0.
SELECT
    m.user_id,
    m.username,
    m.uploaded_avatar_id,
    tr.time_read,
    fc.flag_count,
    rc.revision_count,
    tc.topic_count,
    pc.post_count,
    pmc.pm_count
FROM mods AS m
LEFT JOIN time_read AS tr
    ON tr.user_id = m.user_id
LEFT JOIN flag_count AS fc
    ON fc.user_id = m.user_id
LEFT JOIN revision_count AS rc
    ON rc.user_id = m.user_id
LEFT JOIN topic_count AS tc
    ON tc.user_id = m.user_id
LEFT JOIN post_count AS pc
    ON pc.user_id = m.user_id
LEFT JOIN pm_count AS pmc
    ON pmc.user_id = m.user_id
ORDER BY m.username
ref: https://github.com/discourse/discourse/blob/dc6b547ed89f652b5406489d76140b76cf8e0d1d/app/models/concerns/reports/moderators_activity.rb
期待的CTE是和子查询有同样能力的一个功能,所以聚合和窗口函数这些一定是需要的。CTE的使用场景也主要是分析类查询,会考虑和TiFlash一起用。顺便说一下之前在PG上使用CTE踩过的一些坑:PG12之前的CTE默认行为是物化结果,导致执行计划和普通子查询产生差异,比如外层的过滤条件不能下推(push down)到CTE内部:
-- Before PG12 a CTE is materialized by default, so the outer filter is not
-- pushed into it and this plan ends up as a full table scan.
WITH cte AS (
SELECT * FROM foo
)
SELECT * FROM cte WHERE id = 500000;
默认物化这个行为会导致PG12之前的CTE在一些场景相比子查询损失性能,不得不牺牲可读性继续用子查询。
PG12之后增加了MATERIALIZED / NOT MATERIALIZED标记,解决了这个问题(同时PG12也调整了默认行为:无副作用、非递归且只被引用一次的CTE会被自动内联):
-- From PG12 on, NOT MATERIALIZED lets the outer predicate be pushed down
-- into the CTE.
WITH cte AS NOT MATERIALIZED (
SELECT * FROM foo
)
SELECT * FROM cte WHERE id = 500000;
Oracle的默认行为和PG12之前的PG是相反的,默认不物化CTE,但可以加hint来强制物化:
-- Oracle: the MATERIALIZE hint forces the CTE to be materialized, so the
-- outer predicate is not pushed down.
WITH cte AS (
SELECT /*+ MATERIALIZE */ * FROM foo
)
SELECT * FROM cte WHERE id = 500000;