Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

Full Climate Data Extraction: Joshua Tree & Mojave

This notebook extracts T_Max and T_Min across all scenarios and all available years for both Joshua Tree and Mojave at monthly 3km resolution, using a 32-worker Coiled cluster on GCP.

The output CSVs can be used to compute T_avg = (T_Max + T_Min) / 2.

Estimated time: depends on cluster spin-up + data volume. All is timed below. Note:

  • It's helpful to activate your environment first (`conda activate py-env`, or whatever your Python environment for Cal-Adapt is) and then, in the terminal, run:

    • coiled config set coiled.wheel-creation-timeout 4m


import sys
import os
import time
import pandas as pd
import coiled

# The notebook lives two directories below the project root; make the
# project's lib/ importable before pulling in the local library.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
sys.path.insert(0, os.path.join(PROJECT_ROOT, "lib"))

from andrewAdaptLibrary import (
    ParkCatalog,
    get_lat_lon_bounds,
    get_climate_data,
    VARIABLE_MAP,
    SCENARIO_MAP,
)

# All extracted CSVs are written here.
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "data", "csv", "full_extraction")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Mega NPS shapefile used for park-boundary lookups.
NPS_SHP = os.path.join(
    PROJECT_ROOT,
    "USA_National_Park_Service_Lands_20170930_4993375350946852027",
    "USA_Federal_Lands.shp",
)
catalog = ParkCatalog(NPS_SHP)

print(f"Variables: {list(VARIABLE_MAP.keys())}")
print(f"Scenarios: {list(SCENARIO_MAP.keys())}")
print(f"Output dir: {OUTPUT_DIR}")
/opt/conda/envs/py-env/lib/python3.12/site-packages/pyogrio/raw.py:200: RuntimeWarning: /workspaces/DSEBrandNew/USA_National_Park_Service_Lands_20170930_4993375350946852027/USA_Federal_Lands.shp contains polygon(s) with rings with invalid winding order. Autocorrecting them, but that shapefile should be corrected using ogr2ogr for example.
  return ogr_read(
Variables: ['T_Max', 'T_Min', 'Precip']
Scenarios: ['Historical Climate', 'SSP 2-4.5', 'SSP 3-7.0', 'SSP 5-8.5']
Output dir: /workspaces/DSEBrandNew/data/csv/full_extraction

Load Park Boundaries

parks = ["Joshua Tree National Park", "Mojave National Preserve"]

# Look up each park's boundary polygon and print its bounding box as we go.
boundaries = {}
for name in parks:
    geom = catalog.get_boundary(name)
    boundaries[name] = geom
    lat, lon = get_lat_lon_bounds(geom)
    print(f"{name}: {lat[0]:.2f}-{lat[1]:.2f}°N, {abs(lon[0]):.2f}-{abs(lon[1]):.2f}°W")
Joshua Tree National Park: 33.67-34.13°N, 116.46-115.26°W
Mojave National Preserve: 34.72-35.59°N, 116.17-114.95°W

Start Coiled Cluster (32 workers, 24 GiB each)

print("Starting cluster on GCP...")
t_cluster_start = time.perf_counter()

# `package_sync=True` was removed: the kwarg is deprecated (this cell
# previously emitted a FutureWarning for it), and environment syncing is
# already the default when neither `container` nor `software` is passed.
# NOTE(review): a prior run failed with ZONE_RESOURCE_POOL_EXHAUSTED for
# n1-highmem-4 in us-west1; if that recurs, try another region or VM type.
cluster = coiled.Cluster(
    name="full-extraction",
    region="us-west1",
    n_workers=32,
    worker_vm_types=["n1-highmem-4"],  # 4 vCPUs, 26 GiB RAM per worker
    spot_policy="spot_with_fallback",  # cheap spot VMs, on-demand if unavailable
    idle_timeout="70 minutes",         # longer timeout so cluster survives between cells
)

client = cluster.get_client()
t_cluster_ready = time.perf_counter()

print(f"\n✓ Cluster ready in {t_cluster_ready - t_cluster_start:.0f}s")
print(f"  Workers: {len(client.scheduler_info()['workers'])}")
print(f"  Dashboard: {client.dashboard_link}")
Starting cluster on GCP...
Loading...
/tmp/ipykernel_1017/3972667268.py:4: FutureWarning: `package_sync` is a deprecated kwarg for `Cluster` and will be removed in a future release. To only sync certain packages, use `package_sync_only`, and to disable package sync, pass the `container` or `software` kwargs instead.
  cluster = coiled.Cluster(
Loading...
Loading...
Loading...
Loading...
Loading...
---------------------------------------------------------------------------
ClusterCreationError                      Traceback (most recent call last)
Cell In[3], line 4
      1 print("Starting cluster on GCP...")
      2 t_cluster_start = time.perf_counter()
      3 
----> 4 cluster = coiled.Cluster(
      5     name="full-extraction",
      6     region="us-west1",
      7     n_workers=32,

File /opt/conda/envs/py-env/lib/python3.12/site-packages/coiled/v2/cluster.py:1048, in Cluster.__init__(self, name, software, container, ignore_container_entrypoint, n_workers, worker_class, worker_options, worker_vm_types, worker_cpu, worker_memory, worker_disk_size, worker_disk_throughput, worker_disk_config, worker_gpu, worker_gpu_type, scheduler_options, scheduler_vm_types, scheduler_cpu, scheduler_memory, scheduler_disk_size, scheduler_disk_config, scheduler_gpu, asynchronous, cloud, account, workspace, shutdown_on_close, idle_timeout, cluster_timeout, no_client_timeout, use_scheduler_public_ip, use_dashboard_https, dashboard_custom_subdomain, credentials, credentials_duration_seconds, timeout, environ, tags, send_dask_config, unset_single_threading_variables, backend_options, show_widget, custom_widget, configure_logging, wait_for_workers, package_sync, package_sync_strict, package_sync_conda_extras, package_sync_ignore, package_sync_only, package_sync_fail_on, package_sync_use_uv_installer, private_to_creator, use_best_zone, allow_cross_zone, compute_purchase_option, spot_policy, extra_worker_on_scheduler, _n_worker_specs_per_host, scheduler_port, allow_ingress_from, allow_ssh_from, allow_ssh, allow_spark, open_extra_ports, jupyter, mount_bucket, host_setup_script, region, arm, batch_job_ids, batch_job_container, scheduler_sidecars, worker_sidecars, pause_on_exit, skip_ssd_mount, filestores_to_attach)
   1046     if self.cluster_id:
   1047         log_cluster_debug_info(self.cluster_id, self.workspace)
-> 1048     raise e.with_traceback(None)  # noqa: B904
   1049 except KeyboardInterrupt as e:
   1050     error = e

ClusterCreationError: Cluster was waiting for 10 workers but 32 (of 32) workers have already failed. You could try requesting fewer workers or adjust `wait_for_workers` if fewer workers would be acceptable.

Failure Reasons
---------------

Instance Stopped: 
Coiled was unable to provision this VM Instance because of your current Google Cloud quotas.

We were unable to provision any VMs for this cluster with your existing quotas and current usage.

The error from Google Cloud is:
ZONE_RESOURCE_POOL_EXHAUSTED - The zone 'projects/dseproject-487619/zones/us-west1-b' does not have enough resources available to fulfill the request.  Try a different zone, or try again later. A n1-highmem-4 VM instance is currently unavailable in the us-west1-b zone. Alternatively, you can try your request again with a different VM hardware configuration or at a later time. For more information, see the troubleshooting documentation.
We recommend you submit a request for Google Cloud to increase your quotas.
You can submit requests in Google Cloud for your project and region:
https://console.cloud.google.com/iam-admin/quotas?project=dseproject-487619&pageState=(%22allQuotasTable%22:(%22f%22:%22%255B%257B_22k_22_3A_22Quota_22_2C_22t_22_3A10_2C_22v_22_3A_22_5C_22_5C_5C_5C_22CPUs_5C_5C_5C_22_5C_22_22_2C_22i_22_3A_22displayName_22%257D_2C%257B_22k_22_3A_22_22_2C_22t_22_3A10_2C_22v_22_3A_22_5C_22region_3Aus-west1_5C_22_22_2C_22s_22_3Atrue%257D_2C%257B_22k_22_3A_22_22_2C_22t_22_3A10_2C_22v_22_3A_22_5C_22OR_5C_22_22_2C_22o_22_3Atrue%257D_2C%257B_22k_22_3A_22Quota_22_2C_22t_22_3A10_2C_22v_22_3A_22_5C_22Persistent%2520Disk%2520SSD%2520%2528GB%2529_5C_22_22_2C_22s_22_3Atrue_2C_22i_22_3A_22displayName_22%257D_2C%257B_22k_22_3A_22_22_2C_22t_22_3A10_2C_22v_22_3A_22_5C_22region_3Aus-west1_5C_22_22_2C_22s_22_3Atrue%257D_2C%257B_22k_22_3A_22_22_2C_22t_22_3A10_2C_22v_22_3A_22_5C_22OR_5C_22_22_2C_22o_22_3Atrue%257D_2C%257B_22k_22_3A_22Quota_22_2C_22t_22_3A10_2C_22v_22_3A_22_5C_22In-use%2520regional%2520external%2520IPv4%2520addresses_5C_22_22_2C_22s_22_3Atrue_2C_22i_22_3A_22displayName_22%257D_2C%257B_22k_22_3A_22_22_2C_22t_22_3A10_2C_22v_22_3A_22_5C_22region_3Aus-west1_5C_22_22_2C_22s_22_3Atrue%257D%255D%22,%22s%22:%5B(%22i%22:%22displayName%22,%22s%22:%220%22),(%22i%22:%22effectiveLimit%22,%22s%22:%221%22),(%22i%22:%22currentPercent%22,%22s%22:%221%22),(%22i%22:%22currentUsage%22,%22s%22:%221%22),(%22i%22:%22serviceTitle%22,%22s%22:%220%22),(%22i%22:%22displayDimensions%22,%22s%22:%220%22)%5D)%29

	(error affected 32 workers) (cluster_id: 1565232)
  Cell In[4], line 1
    coiled cluster list --limit 20
           ^
SyntaxError: invalid syntax

Fetch All Data

For each park, fetch T_Max and T_Min across all 4 scenarios for the full time range:

  • Historical: 1950–2014

  • SSP 2-4.5, SSP 3-7.0, SSP 5-8.5: 2015–2100

We split into historical and future since the year ranges are different.

variables = ["T_Max", "T_Min"]
historical_scenarios = ["Historical Climate"]
future_scenarios = ["SSP 2-4.5", "SSP 3-7.0", "SSP 5-8.5"]

all_timings = {}  # "{park}_{historical|future}" -> elapsed seconds
t_total_start = time.perf_counter()

for park_name, boundary in boundaries.items():
    # Clean name for filenames: "Joshua Tree National Park" -> "JoshuaTree"
    short_name = park_name.replace("National Park", "").replace("National Preserve", "").strip().replace(" ", "")

    print(f"\n{'='*60}")
    print(f"  {park_name}")
    print(f"{'='*60}")

    # One list of partial DataFrames per variable; concatenated at save time.
    park_dfs = {var: [] for var in variables}

    # --- Historical: 1950-2014 ---
    print("\n  Fetching historical (1950-2014)...")
    t0 = time.perf_counter()

    hist_data = get_climate_data(
        variables=variables,
        scenarios=historical_scenarios,
        boundary=boundary,
        time_slice=(1950, 2014),
        timescale="monthly",
        backend="coiled",
        coiled_cluster=cluster,
    )

    t_hist = time.perf_counter() - t0
    all_timings[f"{short_name}_historical"] = t_hist

    for var in variables:
        df = hist_data[var]
        print(f"    {var}: {len(df):,} rows")
        park_dfs[var].append(df)
    print(f"  Historical done in {t_hist:.1f}s")

    # --- Future SSPs: 2015-2100 ---
    print("\n  Fetching all SSP scenarios (2015-2100)...")
    t0 = time.perf_counter()

    future_data = get_climate_data(
        variables=variables,
        scenarios=future_scenarios,
        boundary=boundary,
        time_slice=(2015, 2100),
        timescale="monthly",
        backend="coiled",
        coiled_cluster=cluster,
    )

    t_future = time.perf_counter() - t0
    all_timings[f"{short_name}_future"] = t_future

    for var in variables:
        df = future_data[var]
        print(f"    {var}: {len(df):,} rows")
        park_dfs[var].append(df)
    print(f"  Future scenarios done in {t_future:.1f}s")

    # --- Join & save: one CSV per (park, variable) spanning 1950-2100 ---
    print("\n  Saving CSVs...")
    for var in variables:
        combined = pd.concat(park_dfs[var], ignore_index=True)
        combined["park"] = short_name
        filename = f"{short_name}_{var}.csv"
        filepath = os.path.join(OUTPUT_DIR, filename)
        combined.to_csv(filepath, index=False)
        # Fixed: this line previously printed a literal placeholder
        # instead of the saved filename.
        print(f"    Saved: {filename} ({len(combined):,} rows)")

t_total = time.perf_counter() - t_total_start
print(f"\n{'='*60}")
print(f"  ALL DATA FETCHED")
print(f"{'='*60}")
print(f"Total fetch time: {t_total:.1f}s ({t_total/60:.1f} min)")

Shut Down Cluster

# Release the Coiled cluster now that all fetches are done, so idle
# workers stop accruing cloud cost (don't rely on the idle_timeout).
cluster.close()
print("Cluster shut down.")

Timing Summary

print("=== Timing Summary ===")
# Hoist the two derived durations once instead of recomputing them inline.
spin_up = t_cluster_ready - t_cluster_start
print(f"Cluster spin-up: {spin_up:.0f}s")
print()
for label, seconds in all_timings.items():
    print(f"{label}: {seconds:.1f}s ({seconds/60:.1f} min)")
print()
print(f"Total data fetch: {t_total:.1f}s ({t_total/60:.1f} min)")
grand_total = spin_up + t_total
print(f"Total including cluster: {grand_total:.0f}s ({grand_total/60:.1f} min)")

Verify Output

print("=== Output Files ===")
for fname in sorted(os.listdir(OUTPUT_DIR)):
    if fname.endswith(".csv"):
        path = os.path.join(OUTPUT_DIR, fname)
        size_mb = os.path.getsize(path) / 1e6
        df = pd.read_csv(path, nrows=5)  # header + a few rows for column names
        # Count data rows without loading the file into memory; the `with`
        # fixes the previous unclosed file handle.
        with open(path) as fh:
            row_count = sum(1 for _ in fh) - 1  # minus header
        print(f"\n{fname}")
        print(f"  Size: {size_mb:.1f} MB")
        print(f"  Rows: {row_count:,}")
        print(f"  Columns: {list(df.columns)}")
        print(f"  Scenarios: {pd.read_csv(path, usecols=['scenario'])['scenario'].unique().tolist()}")
# Quick peek at one file
sample = pd.read_csv(os.path.join(OUTPUT_DIR, "JoshuaTree_T_Max.csv"))
# Parse the time column once instead of twice.
years = pd.to_datetime(sample["time"]).dt.year
print(f"Shape: {sample.shape}")
print(f"Years: {years.min()} to {years.max()}")
print(f"Scenarios: {sample['scenario'].unique().tolist()}")
print(f"Simulations: {sample['simulation'].nunique()}")
print()
sample.head()

Computing T_avg

Now that we have T_Max and T_Min, computing average temperature is straightforward:

# Derive T_Avg = (T_Max + T_Min) / 2 for each park from the saved CSVs.
for park_name in boundaries:
    short_name = (
        park_name.replace("National Park", "")
        .replace("National Preserve", "")
        .strip()
        .replace(" ", "")
    )
    tmax_df = pd.read_csv(os.path.join(OUTPUT_DIR, f"{short_name}_T_Max.csv"))
    tmin_df = pd.read_csv(os.path.join(OUTPUT_DIR, f"{short_name}_T_Min.csv"))

    # Align T_Max and T_Min rows on their shared identifying columns.
    keys = ["simulation", "time", "scenario", "timescale", "park"]
    merged = tmax_df.merge(tmin_df[keys + ["T_Min"]], on=keys)
    merged["T_Avg"] = (merged["T_Max"] + merged["T_Min"]) / 2

    out_path = os.path.join(OUTPUT_DIR, f"{short_name}_T_Avg.csv")
    merged.to_csv(out_path, index=False)
    print(f"{short_name}_T_Avg.csv: {len(merged):,} rows, T_Avg range: {merged['T_Avg'].min():.1f}°C to {merged['T_Avg'].max():.1f}°C")

Output Summary

Files saved to data/csv/full_extraction/:

FileContents
JoshuaTree_T_Max.csvMax temp, all scenarios, 1950-2100
JoshuaTree_T_Min.csvMin temp, all scenarios, 1950-2100
JoshuaTree_T_Avg.csvAvg temp (computed), all scenarios, 1950-2100
Mojave_T_Max.csvMax temp, all scenarios, 1950-2100
Mojave_T_Min.csvMin temp, all scenarios, 1950-2100
Mojave_T_Avg.csvAvg temp (computed), all scenarios, 1950-2100

Each CSV has columns: simulation, time, spatial_ref, T_Max/T_Min/T_Avg, scenario, timescale, park