BigQuery Reference

bq CLI Commands

# Run a query (GoogleSQL, a.k.a. standard SQL)
bq query --use_legacy_sql=false \
  'SELECT name, COUNT(*) as cnt FROM `my-project.dataset.table` GROUP BY name'

# Create dataset
bq mk --dataset --location=US my-project:my_dataset

# Create table from schema file
bq mk --table my-project:my_dataset.my_table schema.json

# Load data from GCS
bq load --autodetect --source_format=CSV \
  my-project:my_dataset.my_table \
  gs://my-bucket/data/*.csv

# Load JSON (newline-delimited)
bq load --source_format=NEWLINE_DELIMITED_JSON \
  my-project:my_dataset.my_table \
  gs://my-bucket/data.jsonl \
  schema.json

# Export table to GCS
bq extract --destination_format=CSV \
  my-project:my_dataset.my_table \
  gs://my-bucket/export/file_*.csv

# Show table schema
bq show --format=prettyjson my-project:my_dataset.my_table

Standard SQL Patterns

-- Window function: running total.
-- Explicit ROWS frame: the default RANGE frame would lump together
-- peer rows that tie on the ORDER BY key.
SELECT
  date,
  revenue,
  SUM(revenue) OVER (
    ORDER BY date
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
  ) AS running_total
FROM `my-project.sales.daily_revenue`
ORDER BY date;

-- UNNEST: flatten an ARRAY column into one row per element.
-- Explicit CROSS JOIN instead of the implicit comma join — same result,
-- but the join is visible to the reader.
SELECT
  user_id,
  item
FROM `my-project.events.purchases`
CROSS JOIN UNNEST(items) AS item;

-- WITH clause (CTE): one logical step per CTE.
WITH monthly AS (
  -- Order counts bucketed by calendar month.
  SELECT
    DATE_TRUNC(created_at, MONTH) AS month,
    COUNT(*) AS orders
  FROM `my-project.ecom.orders`
  GROUP BY month  -- BigQuery allows grouping by the SELECT alias; avoid positional GROUP BY 1
)
SELECT
  month,
  orders,
  LAG(orders) OVER (ORDER BY month) AS prev_month_orders
FROM monthly;

-- MERGE (upsert): update matching rows, insert the rest, in one atomic statement.
MERGE `my-project.dataset.target` T
USING `my-project.dataset.source` S
  ON T.id = S.id
WHEN MATCHED THEN
  UPDATE SET T.value = S.value
WHEN NOT MATCHED THEN
  INSERT (id, value) VALUES (S.id, S.value);

Partitioned Tables

-- Create a partitioned table (by ingestion time).
-- `WHERE FALSE` copies only the schema — the new table starts empty.
-- require_partition_filter forces every query to prune partitions.
CREATE TABLE `my-project.dataset.events_partitioned`
PARTITION BY DATE(_PARTITIONTIME)
OPTIONS (require_partition_filter = true)
AS
SELECT *
FROM `my-project.dataset.events`
WHERE FALSE;

-- Create a partitioned table (by column).
-- Partitions older than 90 days are dropped automatically.
CREATE TABLE `my-project.dataset.logs` (
  log_date DATE,
  message  STRING,
  level    STRING
)
PARTITION BY log_date
OPTIONS (partition_expiration_days = 90);

-- Query a specific partition range (cost savings: only those partitions are scanned).
-- BETWEEN is safe here because log_date is a DATE — Jan 1–31 is exactly the month.
SELECT *
FROM `my-project.dataset.logs`
WHERE log_date BETWEEN '2024-01-01' AND '2024-01-31';

-- Inspect partition metadata for a table.
SELECT *
FROM `my-project.dataset.INFORMATION_SCHEMA.PARTITIONS`
WHERE table_name = 'logs'
ORDER BY partition_id DESC
LIMIT 10;

Clustering

-- Create a clustered table (up to 4 cluster columns).
-- NOTE(review): DATE(event_date) implies event_date is a TIMESTAMP/DATETIME —
-- if it is already a DATE, partition by the column directly.
CREATE TABLE `my-project.dataset.events_clustered`
PARTITION BY DATE(event_date)
CLUSTER BY user_id, event_type
AS
SELECT *
FROM `my-project.dataset.events`;

-- Clustering benefits:
--   * Reduces bytes scanned when filtering on cluster columns
--   * Free to cluster (no extra charge)
--   * Automatically re-clustered by BigQuery

-- Check clustering info.
-- Fixed: INFORMATION_SCHEMA.TABLES has no `cluster_columns` column; clustering
-- membership and order are exposed via COLUMNS.clustering_ordinal_position
-- (NULL for non-cluster columns).
SELECT
  column_name,
  clustering_ordinal_position
FROM `my-project.dataset.INFORMATION_SCHEMA.COLUMNS`
WHERE table_name = 'events_clustered'
  AND clustering_ordinal_position IS NOT NULL
ORDER BY clustering_ordinal_position;

Data Loading Options

| Method | Use Case | Notes |
|---|---|---|
| `bq load` | Batch load from GCS/local | Formats: CSV, JSON, Avro, Parquet, ORC |
| Streaming insert | Real-time row inserts | ~$0.01/200MB; rows available immediately |
| Storage Write API | High-throughput streaming | Exactly-once delivery support |
| BigQuery Data Transfer | Scheduled imports from SaaS | Ads, YouTube, S3, Teradata, etc. |
| Dataflow | ETL pipelines | Apache Beam based |
| External tables | Query without loading | GCS, Cloud SQL, Drive |

Cost Control

# Dry run: estimate bytes scanned before running the query (no charge).
bq query --use_legacy_sql=false --dry_run \
  'SELECT * FROM `my-project.dataset.big_table`'

# Set maximum bytes billed — the query fails instead of billing more.
# 1073741824 bytes = 1 GiB.
# Fixed: the comment must NOT follow the backslash continuation — in bash,
# `... \ # 1 GB` turns the next line (the query itself) into a comment.
bq query --use_legacy_sql=false \
  --maximum_bytes_billed=1073741824 \
  'SELECT ...'

# Use reservations (flat-rate pricing).
bq mk --reservation --project_id=my-project \
  --location=US --slots=500 my-reservation

# SQL cost tips (shell comments here, so this block is safe to paste as-is):
#   * SELECT only needed columns (avoid SELECT *)
#   * Filter partitions before joining
#   * Use APPROX_COUNT_DISTINCT instead of COUNT(DISTINCT)
#   * Materialize intermediate results with scheduled queries