Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

Full Climate Data Extraction: Joshua Tree & Mojave

This notebook extracts T_Max and T_Min across all scenarios and all available years for both Joshua Tree and Mojave at monthly 3km resolution, using a 32-worker Coiled cluster on GCP.

The output CSVs can be used to compute T_avg = (T_Max + T_Min) / 2.

Estimated time: depends on cluster spin-up + data volume. All is timed below. Note:

  • It's helpful to activate your environment first (`conda activate py-env`, or whatever your Python environment for Cal-Adapt is) and then, in the terminal, run:

    • coiled config set coiled.wheel-creation-timeout 4m


import sys
import os
import time
import pandas as pd
import coiled

# The notebook lives two directories below the project root; make the
# project's lib/ importable before pulling in the local library.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
sys.path.insert(0, os.path.join(PROJECT_ROOT, "lib"))

from andrewAdaptLibrary import (
    ParkCatalog,
    get_lat_lon_bounds,
    get_climate_data,
    VARIABLE_MAP,
    SCENARIO_MAP,
)

# All extracted CSVs are written here.
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "data", "csv", "full_extraction")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Mega NPS shapefile used for park-boundary lookups.
NPS_SHP = os.path.join(
    PROJECT_ROOT,
    "USA_National_Park_Service_Lands_20170930_4993375350946852027",
    "USA_Federal_Lands.shp",
)
catalog = ParkCatalog(NPS_SHP)

print(f"Variables: {list(VARIABLE_MAP.keys())}")
print(f"Scenarios: {list(SCENARIO_MAP.keys())}")
print(f"Output dir: {OUTPUT_DIR}")
/opt/conda/envs/py-env/lib/python3.12/site-packages/pyogrio/raw.py:200: RuntimeWarning: /workspaces/DSEBrandNew/USA_National_Park_Service_Lands_20170930_4993375350946852027/USA_Federal_Lands.shp contains polygon(s) with rings with invalid winding order. Autocorrecting them, but that shapefile should be corrected using ogr2ogr for example.
  return ogr_read(
Variables: ['T_Max', 'T_Min', 'Precip']
Scenarios: ['Historical Climate', 'SSP 2-4.5', 'SSP 3-7.0', 'SSP 5-8.5']
Output dir: /workspaces/DSEBrandNew/data/csv/full_extraction

Load Park Boundaries

parks = ["Joshua Tree National Park", "Mojave National Preserve"]

# Look up each park's boundary polygon and print its bounding box as we go.
boundaries = {}
for name in parks:
    geom = catalog.get_boundary(name)
    boundaries[name] = geom
    lat, lon = get_lat_lon_bounds(geom)
    print(f"{name}: {lat[0]:.2f}-{lat[1]:.2f}°N, {abs(lon[0]):.2f}-{abs(lon[1]):.2f}°W")
Joshua Tree National Park: 33.67-34.13°N, 116.46-115.26°W
Mojave National Preserve: 34.72-35.59°N, 116.17-114.95°W

Start Coiled Cluster (32 workers, 24 GiB each)

print("Starting cluster on GCP...")
t_cluster_start = time.perf_counter()

# `package_sync=True` was removed: the kwarg is deprecated (this cell
# previously emitted a FutureWarning for it), and environment syncing is
# already the default when neither `container` nor `software` is passed.
# NOTE(review): a prior run failed with ZONE_RESOURCE_POOL_EXHAUSTED for
# n1-highmem-4 in us-west1; if that recurs, try another region or VM type.
cluster = coiled.Cluster(
    name="full-extraction",
    region="us-west1",
    n_workers=32,
    worker_vm_types=["n1-highmem-4"],  # 4 vCPUs, 26 GiB RAM per worker
    spot_policy="spot_with_fallback",  # cheap spot VMs, on-demand if unavailable
    idle_timeout="70 minutes",         # longer timeout so cluster survives between cells
)

client = cluster.get_client()
t_cluster_ready = time.perf_counter()

print(f"\n✓ Cluster ready in {t_cluster_ready - t_cluster_start:.0f}s")
print(f"  Workers: {len(client.scheduler_info()['workers'])}")
print(f"  Dashboard: {client.dashboard_link}")
Starting cluster on GCP...
Loading...
/tmp/ipykernel_1017/3972667268.py:4: FutureWarning: `package_sync` is a deprecated kwarg for `Cluster` and will be removed in a future release. To only sync certain packages, use `package_sync_only`, and to disable package sync, pass the `container` or `software` kwargs instead.
  cluster = coiled.Cluster(
Loading...
Loading...
Loading...
Loading...
Loading...
---------------------------------------------------------------------------
ClusterCreationError                      Traceback (most recent call last)
Cell In[3], line 4
      1 print("Starting cluster on GCP...")
      2 t_cluster_start = time.perf_counter()
      3 
----> 4 cluster = coiled.Cluster(
      5     name="full-extraction",
      6     region="us-west1",
      7     n_workers=32,

File /opt/conda/envs/py-env/lib/python3.12/site-packages/coiled/v2/cluster.py:1048, in Cluster.__init__(self, name, software, container, ignore_container_entrypoint, n_workers, worker_class, worker_options, worker_vm_types, worker_cpu, worker_memory, worker_disk_size, worker_disk_throughput, worker_disk_config, worker_gpu, worker_gpu_type, scheduler_options, scheduler_vm_types, scheduler_cpu, scheduler_memory, scheduler_disk_size, scheduler_disk_config, scheduler_gpu, asynchronous, cloud, account, workspace, shutdown_on_close, idle_timeout, cluster_timeout, no_client_timeout, use_scheduler_public_ip, use_dashboard_https, dashboard_custom_subdomain, credentials, credentials_duration_seconds, timeout, environ, tags, send_dask_config, unset_single_threading_variables, backend_options, show_widget, custom_widget, configure_logging, wait_for_workers, package_sync, package_sync_strict, package_sync_conda_extras, package_sync_ignore, package_sync_only, package_sync_fail_on, package_sync_use_uv_installer, private_to_creator, use_best_zone, allow_cross_zone, compute_purchase_option, spot_policy, extra_worker_on_scheduler, _n_worker_specs_per_host, scheduler_port, allow_ingress_from, allow_ssh_from, allow_ssh, allow_spark, open_extra_ports, jupyter, mount_bucket, host_setup_script, region, arm, batch_job_ids, batch_job_container, scheduler_sidecars, worker_sidecars, pause_on_exit, skip_ssd_mount, filestores_to_attach)
   1046     if self.cluster_id:
   1047         log_cluster_debug_info(self.cluster_id, self.workspace)
-> 1048     raise e.with_traceback(None)  # noqa: B904
   1049 except KeyboardInterrupt as e:
   1050     error = e

ClusterCreationError: Cluster was waiting for 10 workers but 32 (of 32) workers have already failed. You could try requesting fewer workers or adjust `wait_for_workers` if fewer workers would be acceptable.

Failure Reasons
---------------

Instance Stopped: 
Coiled was unable to provision this VM Instance because of your current Google Cloud quotas.

We were unable to provision any VMs for this cluster with your existing quotas and current usage.

The error from Google Cloud is:
ZONE_RESOURCE_POOL_EXHAUSTED - The zone 'projects/dseproject-487619/zones/us-west1-b' does not have enough resources available to fulfill the request.  Try a different zone, or try again later. A n1-highmem-4 VM instance is currently unavailable in the us-west1-b zone. Alternatively, you can try your request again with a different VM hardware configuration or at a later time. For more information, see the troubleshooting documentation.
We recommend you submit a request for Google Cloud to increase your quotas.
You can submit requests in Google Cloud for your project and region:
https://console.cloud.google.com/iam-admin/quotas?project=dseproject-487619&pageState=(%22allQuotasTable%22:(%22f%22:%22%255B%257B_22k_22_3A_22Quota_22_2C_22t_22_3A10_2C_22v_22_3A_22_5C_22_5C_5C_5C_22CPUs_5C_5C_5C_22_5C_22_22_2C_22i_22_3A_22displayName_22%257D_2C%257B_22k_22_3A_22_22_2C_22t_22_3A10_2C_22v_22_3A_22_5C_22region_3Aus-west1_5C_22_22_2C_22s_22_3Atrue%257D_2C%257B_22k_22_3A_22_22_2C_22t_22_3A10_2C_22v_22_3A_22_5C_22OR_5C_22_22_2C_22o_22_3Atrue%257D_2C%257B_22k_22_3A_22Quota_22_2C_22t_22_3A10_2C_22v_22_3A_22_5C_22Persistent%2520Disk%2520SSD%2520%2528GB%2529_5C_22_22_2C_22s_22_3Atrue_2C_22i_22_3A_22displayName_22%257D_2C%257B_22k_22_3A_22_22_2C_22t_22_3A10_2C_22v_22_3A_22_5C_22region_3Aus-west1_5C_22_22_2C_22s_22_3Atrue%257D_2C%257B_22k_22_3A_22_22_2C_22t_22_3A10_2C_22v_22_3A_22_5C_22OR_5C_22_22_2C_22o_22_3Atrue%257D_2C%257B_22k_22_3A_22Quota_22_2C_22t_22_3A10_2C_22v_22_3A_22_5C_22In-use%2520regional%2520external%2520IPv4%2520addresses_5C_22_22_2C_22s_22_3Atrue_2C_22i_22_3A_22displayName_22%257D_2C%257B_22k_22_3A_22_22_2C_22t_22_3A10_2C_22v_22_3A_22_5C_22region_3Aus-west1_5C_22_22_2C_22s_22_3Atrue%257D%255D%22,%22s%22:%5B(%22i%22:%22displayName%22,%22s%22:%220%22),(%22i%22:%22effectiveLimit%22,%22s%22:%221%22),(%22i%22:%22currentPercent%22,%22s%22:%221%22),(%22i%22:%22currentUsage%22,%22s%22:%221%22),(%22i%22:%22serviceTitle%22,%22s%22:%220%22),(%22i%22:%22displayDimensions%22,%22s%22:%220%22)%5D)%29

	(error affected 32 workers) (cluster_id: 1565232)
  Cell In[4], line 1
    coiled cluster list --limit 20
           ^
SyntaxError: invalid syntax

Fetch All Data

For each park, fetch T_Max and T_Min across all 4 scenarios for the full time range:

  • Historical: 1950–2014

  • SSP 2-4.5, SSP 3-7.0, SSP 5-8.5: 2015–2100

We split into historical and future since the year ranges are different.

variables = ["T_Max", "T_Min"]
historical_scenarios = ["Historical Climate"]
future_scenarios = ["SSP 2-4.5", "SSP 3-7.0", "SSP 5-8.5"]

all_timings = {}  # "{park}_{historical|future}" -> elapsed seconds
t_total_start = time.perf_counter()

for park_name, boundary in boundaries.items():
    # Clean name for filenames: "Joshua Tree National Park" -> "JoshuaTree"
    short_name = park_name.replace("National Park", "").replace("National Preserve", "").strip().replace(" ", "")

    print(f"\n{'='*60}")
    print(f"  {park_name}")
    print(f"{'='*60}")

    # One list of partial DataFrames per variable; concatenated at save time.
    park_dfs = {var: [] for var in variables}

    # --- Historical: 1950-2014 ---
    print("\n  Fetching historical (1950-2014)...")
    t0 = time.perf_counter()

    hist_data = get_climate_data(
        variables=variables,
        scenarios=historical_scenarios,
        boundary=boundary,
        time_slice=(1950, 2014),
        timescale="monthly",
        backend="coiled",
        coiled_cluster=cluster,
    )

    t_hist = time.perf_counter() - t0
    all_timings[f"{short_name}_historical"] = t_hist

    for var in variables:
        df = hist_data[var]
        print(f"    {var}: {len(df):,} rows")
        park_dfs[var].append(df)
    print(f"  Historical done in {t_hist:.1f}s")

    # --- Future SSPs: 2015-2100 ---
    print("\n  Fetching all SSP scenarios (2015-2100)...")
    t0 = time.perf_counter()

    future_data = get_climate_data(
        variables=variables,
        scenarios=future_scenarios,
        boundary=boundary,
        time_slice=(2015, 2100),
        timescale="monthly",
        backend="coiled",
        coiled_cluster=cluster,
    )

    t_future = time.perf_counter() - t0
    all_timings[f"{short_name}_future"] = t_future

    for var in variables:
        df = future_data[var]
        print(f"    {var}: {len(df):,} rows")
        park_dfs[var].append(df)
    print(f"  Future scenarios done in {t_future:.1f}s")

    # --- Join & save: one CSV per (park, variable) spanning 1950-2100 ---
    print("\n  Saving CSVs...")
    for var in variables:
        combined = pd.concat(park_dfs[var], ignore_index=True)
        combined["park"] = short_name
        filename = f"{short_name}_{var}.csv"
        filepath = os.path.join(OUTPUT_DIR, filename)
        combined.to_csv(filepath, index=False)
        # Fixed: this line previously printed a literal placeholder
        # instead of the saved filename.
        print(f"    Saved: {filename} ({len(combined):,} rows)")

t_total = time.perf_counter() - t_total_start
print(f"\n{'='*60}")
print(f"  ALL DATA FETCHED")
print(f"{'='*60}")
print(f"Total fetch time: {t_total:.1f}s ({t_total/60:.1f} min)")

Shut Down Cluster

# Release the Coiled cluster now that all fetches are done, so idle
# workers stop accruing cloud cost (don't rely on the idle_timeout).
cluster.close()
print("Cluster shut down.")

Timing Summary

print("=== Timing Summary ===")
# Hoist the two derived durations once instead of recomputing them inline.
spin_up = t_cluster_ready - t_cluster_start
print(f"Cluster spin-up: {spin_up:.0f}s")
print()
for label, seconds in all_timings.items():
    print(f"{label}: {seconds:.1f}s ({seconds/60:.1f} min)")
print()
print(f"Total data fetch: {t_total:.1f}s ({t_total/60:.1f} min)")
grand_total = spin_up + t_total
print(f"Total including cluster: {grand_total:.0f}s ({grand_total/60:.1f} min)")

Verify Output

print("=== Output Files ===")
for fname in sorted(os.listdir(OUTPUT_DIR)):
    if fname.endswith(".csv"):
        path = os.path.join(OUTPUT_DIR, fname)
        size_mb = os.path.getsize(path) / 1e6
        df = pd.read_csv(path, nrows=5)  # header + a few rows for column names
        # Count data rows without loading the file into memory; the `with`
        # fixes the previous unclosed file handle.
        with open(path) as fh:
            row_count = sum(1 for _ in fh) - 1  # minus header
        print(f"\n{fname}")
        print(f"  Size: {size_mb:.1f} MB")
        print(f"  Rows: {row_count:,}")
        print(f"  Columns: {list(df.columns)}")
        print(f"  Scenarios: {pd.read_csv(path, usecols=['scenario'])['scenario'].unique().tolist()}")
# Quick peek at one file
sample = pd.read_csv(os.path.join(OUTPUT_DIR, "JoshuaTree_T_Max.csv"))
# Parse the time column once instead of twice.
years = pd.to_datetime(sample["time"]).dt.year
print(f"Shape: {sample.shape}")
print(f"Years: {years.min()} to {years.max()}")
print(f"Scenarios: {sample['scenario'].unique().tolist()}")
print(f"Simulations: {sample['simulation'].nunique()}")
print()
sample.head()

Computing T_avg

Now that we have T_Max and T_Min, computing average temperature is straightforward:

# Derive T_Avg = (T_Max + T_Min) / 2 for each park from the saved CSVs.
for park_name in boundaries:
    short_name = (
        park_name.replace("National Park", "")
        .replace("National Preserve", "")
        .strip()
        .replace(" ", "")
    )
    tmax_df = pd.read_csv(os.path.join(OUTPUT_DIR, f"{short_name}_T_Max.csv"))
    tmin_df = pd.read_csv(os.path.join(OUTPUT_DIR, f"{short_name}_T_Min.csv"))

    # Align T_Max and T_Min rows on their shared identifying columns.
    keys = ["simulation", "time", "scenario", "timescale", "park"]
    merged = tmax_df.merge(tmin_df[keys + ["T_Min"]], on=keys)
    merged["T_Avg"] = (merged["T_Max"] + merged["T_Min"]) / 2

    out_path = os.path.join(OUTPUT_DIR, f"{short_name}_T_Avg.csv")
    merged.to_csv(out_path, index=False)
    print(f"{short_name}_T_Avg.csv: {len(merged):,} rows, T_Avg range: {merged['T_Avg'].min():.1f}°C to {merged['T_Avg'].max():.1f}°C")

Output Summary

Files saved to data/csv/full_extraction/:

FileContents
JoshuaTree_T_Max.csvMax temp, all scenarios, 1950-2100
JoshuaTree_T_Min.csvMin temp, all scenarios, 1950-2100
JoshuaTree_T_Avg.csvAvg temp (computed), all scenarios, 1950-2100
Mojave_T_Max.csvMax temp, all scenarios, 1950-2100
Mojave_T_Min.csvMin temp, all scenarios, 1950-2100
Mojave_T_Avg.csvAvg temp (computed), all scenarios, 1950-2100

Each CSV has columns: simulation, time, spatial_ref, T_Max/T_Min/T_Avg, scenario, timescale, park