BigQuery 参考

bq CLI 命令

# Run a query
bq query --use_legacy_sql=false \
  'SELECT name, COUNT(*) FROM `my-project.dataset.table` GROUP BY name'

# Create a dataset
bq mk --dataset --location=US my-project:my_dataset

# Load data from GCS (schema auto-detected from the CSV)
bq load --autodetect --source_format=CSV \
  my-project:my_dataset.my_table \
  gs://my-bucket/data/*.csv

# Export a table to GCS (wildcard shards large exports)
bq extract --destination_format=CSV \
  my-project:my_dataset.my_table \
  gs://my-bucket/export/file_*.csv

# Show table schema and metadata
bq show --format=prettyjson my-project:my_dataset.my_table

标准 SQL 模式

-- Window function: running total.
-- Use an explicit ROWS frame: the default RANGE frame treats rows with
-- equal `date` as peers and sums all of them at once, giving wrong
-- running totals on ties.
SELECT
  date,
  revenue,
  SUM(revenue) OVER (
    ORDER BY date
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
  ) AS running_total
FROM `my-project.sales.daily_revenue`
ORDER BY date;

-- CTE (common table expression): one named logical step per CTE.
WITH monthly AS (
  SELECT
    DATE_TRUNC(created_at, MONTH) AS month,
    COUNT(*) AS orders
  FROM `my-project.ecom.orders`
  GROUP BY month  -- BigQuery allows grouping by alias; avoid positional GROUP BY 1
)
SELECT month, orders FROM monthly;

-- MERGE (upsert): update matching rows, insert the rest.
MERGE `my-project.dataset.target` T
USING `my-project.dataset.source` S
  ON T.id = S.id
WHEN MATCHED THEN
  UPDATE SET T.value = S.value
WHEN NOT MATCHED THEN
  INSERT (id, value) VALUES (S.id, S.value);

分区表

-- Partition by a DATE column; partitions expire after 90 days.
CREATE TABLE `my-project.dataset.logs` (
  log_date DATE,
  message STRING,
  level STRING
)
PARTITION BY log_date
OPTIONS (partition_expiration_days = 90);

-- Query a specific partition range (partition pruning saves cost).
-- Also list only the columns you need: BigQuery bills by columns
-- scanned, so SELECT * would forfeit much of the savings.
-- BETWEEN is fine here: log_date is a DATE, and the closed range
-- covers exactly the calendar month of January 2024.
SELECT log_date, level, message
FROM `my-project.dataset.logs`
WHERE log_date BETWEEN '2024-01-01' AND '2024-01-31';

聚簇(Clustering)

-- Create a partitioned + clustered table (at most 4 cluster columns).
CREATE TABLE `my-project.dataset.events_clustered`
PARTITION BY DATE(event_date)
CLUSTER BY user_id, event_type
AS
SELECT * FROM `my-project.dataset.events`;

-- Clustering reduces bytes scanned when filtering on the cluster columns.
-- It is free and automatically maintained by BigQuery.

成本控制

# 试运行:估算查询费用 bq query --use_legacy_sql=false --dry_run \ 'SELECT * FROM `my-project.dataset.big_table`' # 设置最大计费字节数 bq query --use_legacy_sql=false \ --maximum_bytes_billed=1073741824 \ 'SELECT ...' -- SQL 节省成本技巧: -- 只 SELECT 需要的列(避免 SELECT *) -- 先过滤分区再 JOIN -- 用 APPROX_COUNT_DISTINCT 替代 COUNT(DISTINCT)