BigQuery Reference
bq CLI Commands
# Run a query
bq query --use_legacy_sql=false \
'SELECT name, COUNT(*) as cnt FROM `my-project.dataset.table` GROUP BY name'
# Create dataset
bq mk --dataset --location=US my-project:my_dataset
# Create table from schema file
bq mk --table my-project:my_dataset.my_table schema.json
# Load data from GCS
bq load --autodetect --source_format=CSV \
my-project:my_dataset.my_table \
gs://my-bucket/data/*.csv
# Load JSON (newline-delimited)
bq load --source_format=NEWLINE_DELIMITED_JSON \
my-project:my_dataset.my_table \
gs://my-bucket/data.jsonl \
schema.json
# Export table to GCS
bq extract --destination_format=CSV \
my-project:my_dataset.my_table \
gs://my-bucket/export/file_*.csv
# Show table schema
bq show --format=prettyjson my-project:my_dataset.my_table
Standard SQL Patterns
-- Window function: running total
SELECT
date,
revenue,
SUM(revenue) OVER (ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_total
FROM `my-project.sales.daily_revenue`
ORDER BY date;
-- ARRAY_AGG and UNNEST
SELECT user_id, item
FROM `my-project.events.purchases`,
UNNEST(items) AS item;
-- WITH clause (CTE)
WITH monthly AS (
SELECT DATE_TRUNC(created_at, MONTH) AS month, COUNT(*) AS orders
FROM `my-project.ecom.orders`
GROUP BY 1
)
SELECT month, orders,
LAG(orders) OVER (ORDER BY month) AS prev_month_orders
FROM monthly;
-- MERGE (upsert)
MERGE `my-project.dataset.target` T
USING `my-project.dataset.source` S ON T.id = S.id
WHEN MATCHED THEN UPDATE SET T.value = S.value
WHEN NOT MATCHED THEN INSERT (id, value) VALUES(S.id, S.value);
Partitioned Tables
-- Create partitioned table (by ingestion time)
-- Note: CREATE TABLE AS SELECT cannot create an ingestion-time partitioned
-- table; copy the schema with LIKE and partition on _PARTITIONDATE instead.
CREATE TABLE `my-project.dataset.events_partitioned`
LIKE `my-project.dataset.events`
PARTITION BY _PARTITIONDATE
OPTIONS (require_partition_filter = true);
-- Create partitioned table (by column)
CREATE TABLE `my-project.dataset.logs`
(
log_date DATE,
message STRING,
level STRING
)
PARTITION BY log_date
OPTIONS (partition_expiration_days = 90);
-- Query specific partition (cost savings)
SELECT * FROM `my-project.dataset.logs`
WHERE log_date BETWEEN '2024-01-01' AND '2024-01-31';
-- Check partition info
SELECT * FROM `my-project.dataset.INFORMATION_SCHEMA.PARTITIONS`
WHERE table_name = 'logs'
ORDER BY partition_id DESC LIMIT 10;
Clustering
-- Create clustered table (up to 4 columns)
CREATE TABLE `my-project.dataset.events_clustered`
PARTITION BY DATE(event_date)
CLUSTER BY user_id, event_type
AS SELECT * FROM `my-project.dataset.events`;
-- Clustering benefits:
-- Reduces bytes scanned when filtering on cluster columns
-- Free to cluster (no extra charge)
-- Automatically re-clustered by BigQuery
-- Check clustering info (INFORMATION_SCHEMA.TABLES has no cluster_columns;
-- clustering columns are flagged in INFORMATION_SCHEMA.COLUMNS)
SELECT column_name, clustering_ordinal_position
FROM `my-project.dataset.INFORMATION_SCHEMA.COLUMNS`
WHERE table_name = 'events_clustered'
  AND clustering_ordinal_position IS NOT NULL
ORDER BY clustering_ordinal_position;
Data Loading Options
| Method | Use Case | Notes |
|---|---|---|
| bq load | Batch load from GCS/local | Formats: CSV, JSON, Avro, Parquet, ORC |
| Streaming insert | Real-time row inserts | ~$0.01/200MB; rows available immediately |
| Storage Write API | High-throughput streaming | Exactly-once delivery support |
| BigQuery Data Transfer | Scheduled imports from SaaS | Ads, YouTube, S3, Teradata, etc. |
| Dataflow | ETL pipelines | Apache Beam based |
| External tables | Query without loading | GCS, Cloud SQL, Drive |
Cost Control
# Dry run: estimate bytes before query
bq query --use_legacy_sql=false --dry_run \
'SELECT * FROM `my-project.dataset.big_table`'
# Set maximum bytes billed (query fails if exceeded)
# 1073741824 bytes = 1 GB — a comment cannot follow the line-continuation
# backslash, so the value is annotated here instead
bq query --use_legacy_sql=false \
--maximum_bytes_billed=1073741824 \
'SELECT ...'
# Use reservations (flat-rate pricing)
bq mk --reservation --project_id=my-project \
--location=US --slots=500 my-reservation
-- SQL cost tips:
-- SELECT only needed columns (avoid SELECT *)
-- Filter partitions before joining
-- Use APPROX_COUNT_DISTINCT instead of COUNT(DISTINCT)
-- Materialize intermediate results with scheduled queries