BigQuery Reference
bq CLI Commands
# Run a query
bq query --use_legacy_sql=false \
'SELECT name, COUNT(*) as cnt FROM `my-project.dataset.table` GROUP BY name'
# Create dataset
bq mk --dataset --location=US my-project:my_dataset
# Create table from schema file
bq mk --table my-project:my_dataset.my_table schema.json
# Load data from GCS
bq load --autodetect --source_format=CSV \
my-project:my_dataset.my_table \
gs://my-bucket/data/*.csv
# Load JSON (newline-delimited)
bq load --source_format=NEWLINE_DELIMITED_JSON \
my-project:my_dataset.my_table \
gs://my-bucket/data.jsonl \
schema.json
# Export table to GCS
bq extract --destination_format=CSV \
my-project:my_dataset.my_table \
gs://my-bucket/export/file_*.csv
# Show table schema
bq show --format=prettyjson my-project:my_dataset.my_table
Standard SQL Patterns
-- Window function: running total
SELECT
date,
revenue,
SUM(revenue) OVER (ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_total
FROM `my-project.sales.daily_revenue`
ORDER BY date;
-- ARRAY_AGG and UNNEST
SELECT user_id, item
FROM `my-project.events.purchases`,
UNNEST(items) AS item;
-- WITH clause (CTE)
WITH monthly AS (
SELECT DATE_TRUNC(created_at, MONTH) AS month, COUNT(*) AS orders
FROM `my-project.ecom.orders`
GROUP BY 1
)
SELECT month, orders,
LAG(orders) OVER (ORDER BY month) AS prev_month_orders
FROM monthly;
-- MERGE (upsert)
MERGE `my-project.dataset.target` T
USING `my-project.dataset.source` S ON T.id = S.id
WHEN MATCHED THEN UPDATE SET T.value = S.value
WHEN NOT MATCHED THEN INSERT (id, value) VALUES(S.id, S.value);
Partitioned Tables
-- Create partitioned table (by ingestion time)
-- Note: CREATE TABLE AS SELECT cannot create an ingestion-time partitioned
-- table; copy the schema with LIKE and partition on _PARTITIONDATE instead.
CREATE TABLE `my-project.dataset.events_partitioned`
LIKE `my-project.dataset.events`
PARTITION BY _PARTITIONDATE
OPTIONS (require_partition_filter = true);
-- Create partitioned table (by column)
CREATE TABLE `my-project.dataset.logs`
(
log_date DATE,
message STRING,
level STRING
)
PARTITION BY log_date
OPTIONS (partition_expiration_days = 90);
-- Query specific partition (cost savings)
SELECT * FROM `my-project.dataset.logs`
WHERE log_date BETWEEN '2024-01-01' AND '2024-01-31';
-- Check partition info
SELECT * FROM `my-project.dataset.INFORMATION_SCHEMA.PARTITIONS`
WHERE table_name = 'logs'
ORDER BY partition_id DESC LIMIT 10;
Clustering
-- Create clustered table (up to 4 columns)
CREATE TABLE `my-project.dataset.events_clustered`
PARTITION BY DATE(event_date)
CLUSTER BY user_id, event_type
AS SELECT * FROM `my-project.dataset.events`;
-- Clustering benefits:
-- Reduces bytes scanned when filtering on cluster columns
-- Free to cluster (no extra charge)
-- Automatically re-clustered by BigQuery
-- Check clustering info (INFORMATION_SCHEMA.TABLES has no cluster_columns;
-- clustering columns are flagged in INFORMATION_SCHEMA.COLUMNS)
SELECT column_name, clustering_ordinal_position
FROM `my-project.dataset.INFORMATION_SCHEMA.COLUMNS`
WHERE table_name = 'events_clustered'
  AND clustering_ordinal_position IS NOT NULL
ORDER BY clustering_ordinal_position;
Data Loading Options
| Method | Use Case | Notes |
|---|---|---|
| bq load | Batch load from GCS/local | Formats: CSV, JSON, Avro, Parquet, ORC |
| Streaming insert | Real-time row inserts | ~$0.01/200MB; rows available immediately |
| Storage Write API | High-throughput streaming | Exactly-once delivery support |
| BigQuery Data Transfer | Scheduled imports from SaaS | Ads, YouTube, S3, Teradata, etc. |
| Dataflow | ETL pipelines | Apache Beam based |
| External tables | Query without loading | GCS, Cloud SQL, Drive |
Cost Control
# Dry run: estimate bytes before query
bq query --use_legacy_sql=false --dry_run \
'SELECT * FROM `my-project.dataset.big_table`'
# Set maximum bytes billed (query fails if exceeded)
# 1073741824 bytes = 1 GB — a comment cannot follow the line-continuation
# backslash, so the value is annotated here instead
bq query --use_legacy_sql=false \
--maximum_bytes_billed=1073741824 \
'SELECT ...'
# Use reservations (flat-rate pricing)
bq mk --reservation --project_id=my-project \
--location=US --slots=500 my-reservation
-- SQL cost tips:
-- SELECT only needed columns (avoid SELECT *)
-- Filter partitions before joining
-- Use APPROX_COUNT_DISTINCT instead of COUNT(DISTINCT)
-- Materialize intermediate results with scheduled queries