LAS SQL 语法标准以 ANSI SQL 2011 为基础,增加了 OLAP 相关语法,同时基于 Spark 3.0,支持了大部分的 Spark SQL build-in functions。
CREATE { DATABASE | SCHEMA } [ IF NOT EXISTS ] database_name [ WITH DBPROPERTIES ( <property, ...>) ] [ COMMENT comment ]
键=值
格式来表示的键值对,键和值的两端都需要有单引号,比如 'propKey'='propValue'
。CREATE SCHEMA test_olap COMMENT 'this is a database for test'; CREATE DATABASE IF NOT EXISTS test_olap; CREATE SCHEMA IF NOT EXISTS test_olap; WITH DBPROPERTIES ('scope' = 'inner', 'ownerName' = 'user1') COMMENT 'this is a database for test';
USE database_name
USE test_olap
DROP { DATABASE | SCHEMA } [ IF EXISTS ] database_identifier [ RESTRICT | CASCADE ]
DROP SCHMEA IF EXISTS test_hive_db;
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [database_name.]table_name [ ( <column_defination, ...> ) ] [ COMMENT 'table_comment' ] [ PARTITIONED BY ( <column_defination, ...> ) ] [ bucket_spec ] [ skew_spec ] [ row_format ] [ create_file_format ] [ TBLPROPERTIES ( <property, ...> ) ] column_defination: column_name column_type [ COMMENT comment ] bucket_spec: CLUSTERED BY ( <column_name, ...> ) [ SORTED BY ( <column_name [ ASC | DESC ], ...> ) ] INTO num_buckets BUCKETS skew_spec: SKEWED BY ( column_name_1, column_name_2, ..., column_name_n ) ON ( <( column_value_for_name_1, ..., column_value_for_name_n ), ...> ) [STORED AS DIRECTORIES] row_format: ROW FORMAT SERDE serde_class [ WITH SERDEPROPERTIES ( <property, ...> ) ] | ROW FORMAT DELIMITED [ FIELDS TERMINATED BY fields_termiated_char [ ESCAPED BY escaped_char ] ] [ COLLECTION ITEMS TERMINATED BY collection_items_termiated_char ] [ MAP KEYS TERMINATED BY map_key_termiated_char ] [ LINES TERMINATED BY row_termiated_char ] [ NULL DEFINED AS null_char ] create_file_format: STORED AS file_format | STORED BY storage_handler file_format: INPUTFORMAT 'input_format_class' OUTPUTFORMAT 'output_format_class' | {TEXTFILE | PARQUET | ORCFILE | RCFILE | SEQUENCEFILE} storage_handler: 'hander_class' [WITH SERDEPROPERTIES ( <property, ...> )]
SERDE
子句指定一个自定义 SerDe,或者使用DELIMITED
子句使用原生 SerDe 并指定分隔符、转义字符、空字符等。DELIMITED
子句可用于指定原生 SerDe 并声明分隔符、转义字符、空字符等。-- 创建一个内表(主键表) CREATE TABLE db.tb ( user_id BIGINT, product_id BIGINT, action_type STRING, gmv DOUBLE, ts BIGINT, PRIMARY KEY (user_id, product_id), -- 指定主键 PRECOMBINE KEY (ts) -- 指定排序字段,一般为业务时间戳字段,保证乱序情况下,数据符合预期。 -- 不指定时可能导致查询结果不符合预期。 -- 主键相同时,基于此字段排序选择最大的一条进行去重。 ) PARTITIONED BY (date STRING, hour STRING) -- 指定分区列,非分区表可以不指定 CLUSTERED BY (user_id, product_id) -- bucket 字段,需为主键列的子集(如果没有特殊需求,建议和主键列保持一致) INTO 32 BUCKETS -- bucket 数,决定数据读写并发度,基于实际数据量预估,单个 bucket 不超过 1G,建议为 2 次幂 STORED AS ByteLake; -- 创建一个内表(非主键表) CREATE TABLE db.tb ( user_id BIGINT, product_id BIGINT, action_type STRING, gmv DOUBLE, ts BIGINT ) PARTITIONED BY (date STRING, hour STRING) -- 指定分区列,非分区表可以不指定 STORED AS ByteLake; -- 创建一个hive分区表 CREATE TABLE IF NOT EXISTS test_olap.student ( id INT COMMENT 'id', name STRING COMMENT 'name', age INT COMMENT 'age' ) PARTITIONED BY (date string COMMENT 'date partition') COMMENT 'this is a comment' STORED AS PARQUET TBLPROPERTIES ('created.date' = 'xxxx-xx-xx'); -- 使用 Row Format 和 file format CREATE TABLE test_olap.student (id INT, name STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE; -- 使用复杂类型 (外表需要指定 Location) CREATE EXTERNAL TABLE test_olap.family( name STRING, friends ARRAY<STRING>, children MAP<STRING, INT>, address STRUCT<street: STRING, city: STRING> ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ESCAPED BY '\\' COLLECTION ITEMS TERMINATED BY '_' MAP KEYS TERMINATED BY ':' LINES TERMINATED BY '\n' NULL DEFINED AS 'foonull' STORED AS TEXTFILE LOCATION "tos://xxxxx/xxxx";
CREATE TABLE [IF NOT EXISTS] table_name LIKE source_table_name [ ROW FORMAT row_format ] [ STORED AS create_file_format ] [ TBLPROPERTIES ( <property, ...> ) ]
-- 根据一个已存在的表创建一张表 CREATE TABLE test_olap.student_copy like test_olap.student; -- 使用 row format CREATE TABLE test_olap.student_copy like test_olap.student ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE TBLPROPERTIES ('prop1'='xxxx');
Alter Table 大部分操作适用于 LAS 外表,对于LAS 内表的相关分区等属性通常由系统自动管理,不允许直接通过接口变更。
ALTER TABLE [database_name.]table_name ADD COLUMNS ( <column_defination, ...> )
增删列语法支持内表和外表,属于 Schema Evolution 的特性,需要说明的是,对于LAS内表而言,有如下限制:
对于 LAS 标准内表,alter table add column 执行后需要至少写入一条数据,否则 presto 执行查询时,将会报错不支持,这种情况建议增加数据确保 schema 演化功能生效。对于外表没有该限制。
-- 添加一个列 ALTER TABLE test_olap.student ADD COLUMNS (grade STRING COMMENT 'student grade')
ALTER TABLE [database_name.]table_name ADD [IF NOT EXISTS] PARTITION ( <partition_spec, ...> )
ALTER TABLE test_olap.student_part ADD IF NOT EXISTS PARTITION(date='20200101')
ALTER TABLE [database_name.]table_name RENAME TO [database_name.]table_name
ALTER TABLE test_olap.student RENAME TO test_olap.student_2
ALTER TABLE [database_name.]table_name partition_spec RENAME TO partition_spec
ALTER TABLE test_olap.student_part PARTITION (date = '20200101') RENAME TO PARTITION (date = '220200102')
ALTER TABLE [database_name.]table_name DROP [ IF EXISTS ] partition_spec [PURGE]
ALTER TABLE test_olap.student_part DROP IF EXISTS PARTITION(date='20200101') PURGE
ALTER TABLE [database_name.]table_name SET TBLPROPERTIES ( <property, ...> ) ALTER TABLE [database_name.]table_name UNSET TBLPROPERTIES [ IF EXISTS ] ( <property_key, ...> )
-- 添加 Table 属性 ALTER TABLE test_olap.student SET TBLPROPERTIES ('ownerName' = 'xxx', 'size' = '10M') -- 去掉 Table 属性 ALTER TABLE test_olap.student UNSET TBLPROPERTIES IF EXISTS ('ownerName', 'size')
-- Set SERDE Properties ALTER TABLE [database_name.]table_name [ partition_spec ] SET SERDEPROPERTIES ( <property, ...> ) ALTER TABLE [database_name.]table_name [ partition_spec ] SET SERDE serde_class_name [ WITH SERDEPROPERTIES ( <property, ...> ) ]
-- Set SERDE -- Before set SERDE DESC EXTENDED test_olap.student ... |Serde Library |org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe ... ALTER TABLE test_olap.student SET SERDE 'org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe'; -- After set SERDE DESC EXTENDED test_olap.student ... |Serde Library |org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerD ...
-- Set SERDE properties -- Before set SERDE properties DESC EXTENDED test_olap.student ... |Storage Properties |[serialization.format=1] ... ALTER TABLE test_olap.student SET SERDEPROPERTIES ('k' = 'v', 'kay' = 'vee') -- After set SERDE properties DESC EXTENDED test_olap.student ... |Storage Properties |[kay=vee, k=v, serialization.format=1] ...
ALTER TABLE RECOVER
分区仅适用于分区表。ALTER TABLE database_name.table_name RECOVER PARTITIONS
ALTER TABLE test_olap.student_part RECOVER PARTITIONS
ALTER TABLE [database_name.]table_name CLUSTERED BY ( <column_name, ...> ) [ SORTED BY ( <column_name [ ASC | DESC ], ...> ) ] INTO bucket_num BUCKETS
ALTER TABLE test_olap.student CLUSTERED BY (name ,age) SORTED BY (name, age) INTO 8 BUCKETS
ALTER TABLE database_name.table_name REPLACE COLUMNS ( <column_defination, ...> )
-- Before replace columns DESC test_olap.student +--------+---------+-------+ |col_name|data_type|comment| +--------+---------+-------+ |id |int |null | |name |string |null | |age |int |null | +--------+---------+-------+ ALTER TABLE test_olap.student REPLACE COLUMNS ( id INT COMMENT 'id comment', age STRING, idd INT ) -- After replace columns DESC test_olap.student +--------+---------+----------+ |col_name|data_type|comment | +--------+---------+----------+ |id |int |id comment| |age |string |null | |idd |int |null | +--------+---------+----------+
ALTER TABLE database_name.table_name CHANGE [COLUMN] column_name column_defination
ALTER TABLE test_olap.student CHANGE age age INT COMMENT 'new age comment'
DROP TABLE [ IF EXISTS ] [database_name.]table_name
示例
DROP TABLE test_olap.student
truncate TABLE [ IF EXISTS ] [database_name.]table_name
truncate table test_olap.student;
CREATE [OR REPLACE] [[GLOBAL] TEMPORARY] VIEW [IF NOT EXISTS] database_name.view_name [(<column_name [COMMENT 'column_comment'], ...>)] [COMMENT 'view_comment'] [TBLPROPERTIES (<property, ...>)] AS select_statement
- select_statement 查询语句,具体见后面[DQL语句]部分
-- 创建一个view CREATE OR REPLACE VIEW test_olap.older_student ( id COMMENT 'id', name COMMENT 'name' ) COMMENT 'view for older student' TBLPROPERTIES('created.date' = 'xxxx-xx-xx') AS SELECT id, name FROM test_olap.student WHERE age > 18;
ALTER VIEW database_name.view_name RENAME TO database_name.view_name
ALTER VIEW test_olap.older_student RENAME TO test_olap.older_student_copy
-- Set View Properties ALTER VIEW database_name.view_name { SET | UNSET } TBLPROPERTIES ( <property, ...> ) -- Unset View Properties ALTER VIEW database_name.view_name UNSET TBLPROPERTIES [ IF EXISTS ] ( <property_key, ...> )
-- Set View Properties ALTER VIEW test_olap.older_student SET TBLPROPERTIES ('owner' = 'xxx', 'size' = '10M') --Show View Properties after set using command SHOW TBLPROPERTIES test_olap.older_student +---------------------+----------+ |key |value | +---------------------+----------+ |size |10M | |created.date |2020-10-10| |transient_lastDdlTime|1603265772| |last_modified_time |1603264813| |owner |xxx | +---------------------+----------+ -- Unset View properties ALTER VIEW test_olap.older_student UNSET TBLPROPERTIES IF EXISTS ('owner', 'size') --Show View Properties after set using command SHOW TBLPROPERTIES test_olap.older_student +---------------------+----------+ |key |value | +---------------------+----------+ |created.date |2020-10-10| |transient_lastDdlTime|1603266451| |last_modified_time |1603264813| +---------------------+----------+
ALTER VIEW database_name.view_name AS query
ALTER VIEW test_olap.older_student AS SELECT id, name FROM test_olap.student WHERE age > 20;
DROP VIEW [ IF EXISTS ] database_name.view_name
DROP VIEW IF EXISTS test_olap.older_student;
INSERT INTO [ TABLE ] [database_name.]table_name [ partition_spec ] { VALUES <( <{ value | NULL }, ...> ), ...> | select_statement }
SELECT
子句TABLE
子句FROM
子句-- Single row insert using a VALUES clause INSERT INTO test_olap.student VALUES (1, 'Tom', 20); -- Single row insert using VALUES clause on partitioned table SET hive.exec.dynamic.partition.mode=nonstrict; INSERT INTO test_olap.student_part2 VALUES (1, 'Tom', 21, '20200101'); --Multi-row insert using a VALUES clause INSERT INTO test_olap.student VALUES (1, 'Tom', 20), (2, 'Jack', 22); -- Insert using a SELECT statement INSERT INTO test_olap.student_part2 PARTITION (date = '20200201') SELECT id, name, age FROM test_olap.student WHERE age > 10; -- Insert using a TABLE statement INSERT INTO test_olap.student TABLE test_olap.student2; -- Insert using a FROM statement INSERT INTO test_olap.student_part2 PARTITION (date = '20200201') FROM test_olap.student SELECT id, name, age WHERE age > 10;
INSERT OVERWRITE TABLE [database_name.]table_name [ partition_spec [ IF NOT EXISTS ] ] { VALUES <( <{ value | NULL }, ...> ), ...> | select_statement }
-- Insert using a VALUES clause INSERT OVERWRITE TABLE test_olap.student VALUES (1, 'Tom', 20), (2, 'Jack', 22); -- Insert using a SELECT statement INSERT OVERWRITE TABLE test_olap.student_part2 PARTITION (date = '20200101') SELECT id, name, age FROM test_olap.student WHERE age > 20; -- Insert using a TABLE statement INSERT OVERWRITE TABLE test_olap.student TABLE test_olap.student2; -- Insert using a FROM statement INSERT OVERWRITE TABLE test_olap.student_part2 PARTITION (date = '20200101') FROM test_olap.student SELECT id, name, age WHERE age > 20;
delete 操作仅支持 ACID 表,参考https://www.volcengine.com/docs/6492/101914。
非主键 LAS 内表/外表无法支持 delete。
已知限制:
需要说明的是,目前 DML 语句暂不支持和 DDL 语句放在同一个任务中使用。原因为目前 DML 语句会针对性的对 LAS 表提前进行优化判断。当在同一个任务中使用 DDL 建表并立刻运行 DML 插入时,LAS 内表目前暂时无法应用内表优化规则,导致查询失败。对于 LAS 外表在 DDL+DML 时,DML 则无法立刻获得 TOS 认证信息。因此,如果您原来的查询语句中有 DDL+DML 一起的情况下,需要先起一个任务运行 DDL,再进行 DML 修改。
语法概览
[ cte_defination ] select_statement [ <{ UNION | INTERSECT | EXCEPT } [ ALL | DISTINCT ] select_statement, ...> ] [ ORDER BY <expression [ ASC | DESC ] [ NULLS { FIRST | LAST } ], ...> ] [ SORT BY <expression [ ASC | DESC ] [ NULLS { FIRST | LAST } ], ...> ] [ CLUSTER BY <expression, ...> ] [ DISTRIBUTE BY <expression, ...> ] [ <WINDOW named_window, ...> ] [ LIMIT { ALL | expression } ]
参数概览
指定主查询块前的公共表表达式(Common Table Expression, CTE)。这些表表达式允许在稍后的从句中引用。这对于从句中抽象出重复的子查询块很有用,并提高查询的可读性。
指定查询逻辑
intersect
、intersect all
、intersect distinct
)、并集(union
、union all
、union distinct
)或补集(except
、except all
、except distinct
)操作ORDER BY
子句用于按用户指定的顺序以排序方式返回结果行。与SORT BY
子句不同,该子句保证输出中的总顺序。
指定每个分区内行的排序。此参数与ORDER BY
和CLUSTER BY
互斥,不能一起指定。
请注意,这与保证输出总顺序的ORDER BY
子句不同,SORT BY
子句用于返回按用户指定顺序在每个分区内排序的结果行。当有多个分区时,SORT BY 可能返回部分有序的结果。
指定一组表达式,用于对行进行重新分区和排序。使用此子句的效果与同时使用DISTRIBUTE BY
和SORT BY
相同。
指定将结果行根据分区 key 进行重分区的一组表达式。此参数与ORDER BY
和CLUSTER BY
互斥,不能一起指定。
窗口函数对一组行进行操作,并根据该组行计算每行的返回值。窗口函数对于处理任务很有用,例如计算移动平均值、计算累积统计数据或访问给定当前行相对位置的行的值。
指定语句或子查询可返回的最大行数。此子句主要与ORDER BY
一起使用,以生成确定性结果。
WITH <cte_name [ AS ] ( select_statement ), ...>
with
子句中的其他 CTE 的名称相同。-- CTE -- Single CTE WITH t AS ( SELECT id, name, age FROM test_olap.student WHERE age > 20 ) SELECT * FROM t; --Multi CTEs WITH t1 AS ( SELECT id, name, age FROM test_olap.student WHERE age > 20 ), t2 AS ( SELECT id, name FROM t1 WHERE age < 25 ) SELECT * FROM t2; -- CTE in CTE definition WITH t AS ( WITH t2 AS ( SELECT id, name, age FROM test_olap.student WHERE age > 20 ) SELECT id, name FROM t2 WHERE age < 25 ) SELECT * FROM t; -- CTE in subquery SELECT max(t2.id) FROM ( WITH t AS ( SELECT id, name, age FROM test_olap.student WHERE age > 20 ) SELECT * FROM t ) t2
语法概览
SELECT [ <hints, ...> ] [ ALL | DISTINCT ] <named_expression, ...> FROM <from_item, ...> [ LATERAL VIEW clause ] [ LATERAL VIEW clause ... ] [ WHERE boolean_expression ] [ GROUP BY <expression, ...> ] [ HAVING boolean_expression ] [ ORDER BY <expression [ ASC | DESC ] [ NULLS { FIRST | LAST } ], ...> ] [ SORT BY <expression [ ASC | DESC ] [ NULLS { FIRST | LAST } ], ...> ] [ CLUSTER BY <expression, ...> ] [ DISTRIBUTE BY <expression, ...> ] [ <WINDOW named_window, ...> ] [ LIMIT { ALL | expression } ]
参数概览
Hints
来帮助引擎做出更好的规划决策。例如,目前 Spark 支持影响 join 策略选择和数据 repartition 的Hints
。All
代表从关系中选择所有匹配的行,并默认启用。DISTINCT
代表删除结果中的重复项后,从关系中选择所有匹配行。LATERAL VIEW
子句需要与EXPLODE
等生成器函数一起使用,EXPLODE
将生成包含一行或多行的虚拟表,LATERAL VIEW
将把这些行应用到每个原始输出行。FROM
子句的结果。GROUP BY
生成的行的谓词。HAVING
子句用于在执行分组后筛选行。如果指定HAVING
而不指定GROUP BY
,则表示GROUP BY
不包含分组表达式(全局聚合)。语法
relation { [ join_type ] JOIN relation [ join_criteria ] | NATURAL join_type JOIN relation }
参数
示例
-- Join Hints SELECT /*+ BROADCASTJOIN (t1) */ * FROM test_olap.student t1 LEFT JOIN (SELECT * FROM test_olap.student_part2 WHERE date = '20200101') t2 ON t1.id = t2.id;
语法
VALUES ( <expression, ...> ) [ table_alias ]
参数
示例
-- inline table SELECT * FROM (VALUES ("one", 1));
LATERAL VIEW
子句需要与EXPLODE
等生成器函数一起使用,EXPLODE
将生成包含一行或多行的虚拟表,LATERAL VIEW
将把这些行应用到每个原始输出行。
语法
LATERAL VIEW [ OUTER ] generator_function ( <expression, ... ) [ table_alias ] AS <column_alias, ...>
参数
OUTER
,当输入数组/映射为空或 null 时,则返回 null。generator_function
的别名,这是可选的。generator_function
的列的别名。如果generator_function
有多个输出列,则可能有多个别名。FROM
子句的结果。示例
-- LATERAL VIEW SELECT t1.age, t2.c_age, t3.d_age FROM test_olap.student t1 LATERAL VIEW EXPLODE(ARRAY(30, 60)) t2 AS c_age LATERAL VIEW EXPLODE(ARRAY(40, 80)) t3 AS d_age;
指定用于对行进行分组的表达式。这将与聚合函数(MIN、MAX、COUNT、SUM、AVG 等)一起使用。根据分组表达式和每个组中的聚合值对行进行分组。当 FILTER 子句附加到聚合函数时,仅将匹配的行传递给该函数。
语法
{ GROUP BY <group_expression, ...> [ { WITH ROLLUP | WITH CUBE | GROUPING SETS ( <grouping_set, ...> ) } ] | GROUP BY GROUPING SETS ( <grouping_set, ...> ) }
其中 group_expression 定义如下:
aggregate_name ( [ DISTINCT ] <expression, ...> ] ) [ FILTER ( WHERE boolean_expression ) ]
参数
GROUP BY GROUPING SETS (warehouse, product)
在语义上等同于GROUP BY
warehouse
仓库和GROUP BY
product
的结果的联合。该子句是UNION ALL
的简写,其中UNION ALL
运算符的每个分支执行GROUPING SETS
子句中指定的列子集的聚合。grouping sets
,也可以和grouping sets
组合使用。grouping sets
,也可以和grouping sets
组合使用。WHERE
子句中boolean_expression
为 true 的输入行,将其传递给集合函数;其他行将被丢弃。示例
-- Group by -- Grouping SETS SELECT id, count(1) FROM test_olap.student GROUP BY id, name GROUPING SETS((id, name), (id), ()); -- CUBE SELECT id, count(1) FROM test_olap.student GROUP BY id, name WITH CUBE; -- ROLLUP SELECT id, count(1) FROM test_olap.student GROUP BY id, name WITH ROLLUP; -- Distinct in aggregate functions SELECT id, sum(DISTINCT age) FROM test_olap.student GROUP BY id; -- Aggregate filter SELECT id, sum(age) FILTER (WHERE name in ('Tom', 'Jack')) FROM test_olap.student GROUP BY id; -- Group by alias SELECT regexp_replace(name, 'T', 'S') as name_alias, count(1) FROM test_olap.student GROUP BY name_alias
语法
CASE [ expression ] { WHEN boolean_expression THEN then_expression } [ ... ] [ ELSE else_expression ] END
参数
boolean_expression
条件的然后表达式。then_expression
和else_expression
都应该是相同的类型或可强制为通用类型。示例
-- Case clause -- Case clause specifies expression SELECT id, case age when 18 then 'young' when 20 then 'old' else 'others' end as stu_tag FROM test_olap.student; -- Case clause specifes no expression SELECT id, case when age = 18 then 'young' when age = 20 then 'old' else 'others' end as stu_tag FROM test_olap.student; -- Case clause as WHERE condtion SELECT id, name FROM test_olap.student WHERE case when age = 18 then 'young' when age = 20 then 'old' else 'others' end = 'young';
语法
window_function OVER ( [ { PARTITION | DISTRIBUTE } BY <column_spec, ...> ] [ { ORDER | SORT } BY <expression [ ASC | DESC ][ NULLS { FIRST | LAST } ], ...> ] [ window_frame ] )
参数
RANK | DENSE_RANK | PERCENT_RANK | NTILE | ROW_NUMBER
RANK | DENSE_RANK | PERCENT_RANK | NTILE | ROW_NUMBER
CUME_DIST | LAG | LEAD
MAX | MIN | COUNT | SUM | AVG | ...
{ RANGE | ROWS } { frame_start | BETWEEN frame_start AND frame_end }
frame_start
和frame_end
具有以下语法:
UNBOUNDED PRECEDING | offset PRECEDING | CURRENT ROW | offset FOLLOWING | UNBOUNDED FOLLOWING
offset
指定frame_end
与当前行位置的偏移量。如果省略frame_end
,则默认为当前行。
示例
-- Window -- Simple in-line window SELECT id, name, RANK() OVER (PARTITION BY name ORDER BY age) AS rank FROM test_olap.student; -- Rows between SELECT id, name, DENSE_RANK() OVER (PARTITION BY name ORDER BY age ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS rank FROM test_olap.student; -- range between SELECT id, name, DENSE_RANK() OVER (PARTITION BY name ORDER BY age Range BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS rank FROM test_olap.student; -- Aggregate functions with window SELECT id, name, max(age) OVER (PARTITION BY name ORDER BY age) AS rank FROM test_olap.student; -- Window expression SELECT id, name, max(age) OVER w as rank FROM test_olap.student WINDOW w as (PARTITION BY name ORDER BY age)
-- Having -- `HAVING` clause referring to column in `GROUP BY`. SELECT id, sum(age) FILTER (WHERE name in ('Tom', 'Jack')) as sum FROM test_olap.student GROUP BY id Having sum > 50; -- `HAVING` clause referring to aggregate function. SELECT id, name FROM test_olap.student GROUP BY id, name Having sum(age) > 50; -- `HAVING` clause referring to aggregate function by its alias SELECT id, name, sum(age) as sum FROM test_olap.student GROUP BY id, name Having sum > 50;
-- Order by -- Sort rows by id. By default rows are sorted in ascending manner with NULL FIRST. SELECT * FROM test_olap.student ORDER BY id; -- Sort rows in ascending manner keeping null values to be last. SELECT * FROM test_olap.student ORDER BY id NULLS LAST; -- Sort rows by id in descending manner, which defaults to NULL LAST. SELECT * FROM test_olap.student ORDER BY id DESC; -- Sort rows in ascending manner keeping null values to be first. SELECT * FROM test_olap.student ORDER BY id DESC NULLS FIRST; -- Sort rows based on more than one column with each column having different-- sort direction. SELECT * FROM test_olap.student ORDER BY id DESC, name ASC;
-- Sort by -- Sort rows by `name` within each partition in ascending manner SELECT id, name, age FROM test_olap.student SORT BY name; -- Sort rows within each partition using column position. SELECT id, name ,age FROM test_olap.student SORT BY 1; -- Sort rows within partition in ascending manner keeping null values to be last. SELECT id, name, age FROM test_olap.student SORT BY id NULLS LAST; -- Sort rows by id within each partition in descending manner, which defaults to NULL LAST. SELECT id, name, age FROM test_olap.student SORT BY id DESC; -- Sort rows by id within each partition in descending manner keeping null values to be first. SELECT id, name, age FROM test_olap.student SORT BY id DESC NULLS FIRST; -- Sort rows within each partition based on more than one column with each column having different sort direction. SELECT id, name, age FROM test_olap.student SORT BY id DESC, name;
-- Cluster by & distribute by -- Cluster by SELECT * FROM test_olap.student CLUSTER BY id; -- The above CLUSTER BY statement equals to DISTRIBUTE BY with SORT BY like the following SQL SELECT * FROM test_olap.student DISTRIBUTE BY id SORT BY id; -- DISTRIBUTE by SELECT * FROM test_olap.student DISTRIBUTE BY id;
-- LIMIT -- LIMIT 1000 rows SELECT * FROM test_olap.student LIMIT 1000; -- LIMIT ALL SELECT * FROM test_olap.student LIMIT ALL;
EXPLAIN [ EXTENDED | CODEGEN | COST] statement
-- Default Output EXPLAIN SELECT id, sum(age) FILTER (WHERE name in ('Tom', 'Jack')) FROM test_olap.student GROUP BY id; -- Using Extended EXPLAIN EXTENDED SELECT id, sum(age) FILTER (WHERE name in ('Tom', 'Jack')) FROM test_olap.student GROUP BY id; -- Using Formatted EXPLAIN FORMATTED SELECT id, sum(age) FILTER (WHERE name in ('Tom', 'Jack')) FROM test_olap.student GROUP BY id;
CACHE [ LAZY ] TABLE [database_name.]table_name [ [ AS ] select_statement ]
CACHE LAZY TABLE cacheTable AS SELECT * FROM test_olap.student
UNCACHE TABLE [ IF EXISTS ] [database_name.]table_name
UNCACHE TABLE cacheTable
{ DESC | DESCRIBE } { DATABASE | SCHEMA } [ EXTENDED ] database_identifier
DESC DATABASE EXTENDED test_olap
{ DESC | DESCRIBE } [ TABLE ] [ EXTENDED | FORMATTED ] [database_name.]table_name [ partition_spec ] [ col_name ]
EXTENDED
,则返回其他元数据信息(如父数据库、所有者和访问时间)。-- Describe table DESC FORMATTED test_olap.student -- Describe partition DESC EXTENDED test_olap.student_part PARTITION (date = '20200101') -- Describe column DESC test_olap.student name
{ DESC | DESCRIBE } FUNCTION function_name
DESC FUNCTION from_json
SHOW COLUMNS { IN | FROM } [database_name.]table_name [ { IN | FROM } database_identifier ]
-- Specifies table qualified with a database name SHOW COLUMNS IN test_olap.student; -- Specifies database SHOW COLUMNS in student in test_olap; -- Specifies table qualified with a database name also specifies database SHOW COLUMNS in test_olap.student in test_olap;
SHOW CREATE TABLE [database_name.]table_name
SHOW CREATE TABLE test_olap.student
SHOW { DATABASES | SCHEMAS } [ LIKE regex_pattern ]
*
和|
字符之外,该模式的工作方式类似于正则表达式。*
alone 匹配 0 个或更多字符,|
用于分隔多个不同的正则表达式,其中任何一个都可以匹配。-- Show all databases SHOW DATABASES -- Filter with regex pattern SHOW SCHEMAS LIKE 'test_olap'
SHOW [ function_kind ] FUNCTIONS [ [ LIKE ] { 'function_name' | 'regex_pattern' } ]
-- Specify function kind SHOW SYSTEM FUNCTIONS -- Specify function name SHOW FUNCTIONS 'from_json' -- Specify regex_pattern SHOW FUNCTIONS 'a*'
SHOW PARTITIONS [database_name.]table_name
-- Show all partitions SHOW PARTITIONS test_olap.student_part2
SHOW TABLES [ { FROM | IN } database_identifier ] [ LIKE 'regex_pattern' ]
-- Show all of the tables in the database SHOW TABLES IN test_olap; -- Show tables which names match the regex pattern SHOW TABLES LIKE 'a*' -- Show tables in the database which names match the regex pattern SHOW TABLES FROM test_olap LIKE 'a*'
SET [ -v ] SET property_key [ = property_value ]
-- Set a property. SET spark.sql.variable.substitute=false; -- List all SQLConf properties with value and meaning. SET -v; -- List all SQLConf properties with value for current session. SET; -- List the value of specified property key. SET spark.sql.variable.substitute;