#+++Libraries for caching decorator and debugging+++#
import os
import pickle
import re
import warnings
from pathlib import Path

#+++Libraries for geospatial work and clustering+++#
#++Note++#
#You will need to install earthaccess in terminal. Below is how to do this#
# conda activate [your environment]
# conda install -c conda-forge earthaccess
# python -c "import earthaccess; print(earthaccess.__version__)"
# my kernel also is missing earthpy so that also was installed in terminal
# do the same steps as above but replace with earthpy

import cartopy.crs as ccrs
import earthaccess
import earthpy as et
import geopandas as gpd
import geoviews as gv
import hvplot.pandas
import hvplot.xarray
import numpy as np
import pandas as pd
import rioxarray as rxr
import rioxarray.merge as rxrmerge
#Note#
# This library specifically gives us spiffy progress bars that you will see later
from tqdm.notebook import tqdm
import xarray as xr
from shapely.geometry import Polygon
from sklearn.cluster import KMeans

### set GDAL parameters
os.environ["GDAL_HTTP_MAX_RETRY"] = "5"
os.environ["GDAL_HTTP_RETRY_DELAY"] = "1"

# Half way through I got tired of debugging warnings. The following will keep warning messages 
# from making our output messy or harder to read
### don't show non-critical warnings
warnings.simplefilter('ignore')

c:\Users\kayle\miniconda3\envs\geog\Lib\site-packages\earthpy\__init__.py:7: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
  from pkg_resources import resource_string

### make the caching decorator
## some of the items here are to insure when we rerun cache cells we don't redo our work
def cached(func_key, override=False):
    """
    A decorator to cache function results
    
    Parameters
    ==========
    key: str
      File basename used to save pickled results
    override: bool
      When True, re-compute even if the results are already stored
    """
    def compute_and_cache_decorator(compute_function):
        """
        Wrap the caching function
        
        Parameters
        ==========
        compute_function: function
          The function to run and cache results
        """
        def compute_and_cache(*args, **kwargs):
            """
            Perform a computation and cache, or load cached result.
            
            Parameters
            ==========
            args
              Positional arguments for the compute function
            kwargs
              Keyword arguments for the compute function
            """
            ### Add an identifier from the particular function call
            if 'cache_key' in kwargs:
                key = '_'.join((func_key, kwargs['cache_key']))
            else:
                key = func_key

            ### define a file path based on the directory structure in earthpy
            path = os.path.join(
                
                ### earthpy directory
                et.io.HOME, 
                
                ### earthpy dataset
                et.io.DATA_NAME, 
                
                ### make a subdirectory called "jars"
                'jars', 
                
                ### use f-string (formatted string) to create a string by embedding the value
                ### of the variable "key" into the string 
                ### use .pickle file extension (a pickle file is a serialized python objecT)
                f'{key}.pickle')
            
            ### Check if the cache exists already or if we should override caching
            if not os.path.exists(path) or override:
                
                ### Make jars directory if needed
                os.makedirs(os.path.dirname(path), exist_ok=True)
                
                ### Run the compute function as the user did
                result = compute_function(*args, **kwargs)
                
                ### Pickle the object (save to file)
                ### open the file at filename
                with open(path, 'wb') as file:
                    
                    ### save the result without needing to recompute when loading
                    ### it back into Python
                    pickle.dump(result, file)
            
            ### if the file already exists/we are not overriding the cache
            else:
               
                ### Unpickle the object (load the cached result)
                with open(path, 'rb') as file:
                    
                    ### use pickle.load to unserialize the file back into a python object
                    result = pickle.load(file)
                    
            return result
        
        return compute_and_cache
    
    return compute_and_cache_decorator

### assign the hydrologic unit code
huc12 = "180703040203"

### download, unzip, and read the shapefile, and use the caching decorator to store it:

### use the cached decorator to wrap the function we'll make (read_wbd_file) 
@cached("wbd_region18_hu12")

### define a function (read_wbd_file()) to download the watershed boundary data
def read_wbd_file(region="18"):

    ### define the URL to download data
    wbd_url = (
        "https://prd-tnm.s3.amazonaws.com/StagedProducts/Hydrography/WBD/HU2/Shape/"

        ### insert the name of the specific file we want using an f-string
        f"WBD_{region}_HU2_Shape.zip"
    )

    ### download the data and unzip it into the directory we defined earlier
    wbd_dir = Path(et.data.get_data(url=wbd_url, verbose=True))

    ### make path to the shapefile in the directory
    shp_path = next(
        wbd_dir.rglob(

            ### make the shapefile name in any subfolder
            "WBDHU12.shp"
        )
    )

    ### read the shapefile as a gdf
    try:
        wbd_gdf = gpd.read_file(
            shp_path,

            ### use pyogrio library to read the shapefile 
            ### (better performance with large data)
            engine="pyogrio",
        )
    except Exception:
        wbd_gdf = gpd.read_file(shp_path)

    ### return the gdf of the watershed boundaries
    return wbd_gdf

### open the shapefile using the read_wbd_file function that we created
wbd_gdf = read_wbd_file()

Downloading from https://prd-tnm.s3.amazonaws.com/StagedProducts/Hydrography/WBD/HU2/Shape/WBD_18_HU2_Shape.zip
Extracted output to C:\Users\kayle\earth-analytics\data\earthpy-downloads\WBD_18_HU2_Shape

### filter the shapefile to the specific watershed we're using

### define the gdf for the watershed by subsetting the gdf of the whole watershed dataset
huc_col = "huc12" if "huc12" in wbd_gdf.columns else "HUC12"
gdf = (

    ### filter the gdf to the row(s) with the watershed we want
    ### use "dissolve" to merge the geometries of all the rows matching the target watershed
    wbd_gdf.loc[wbd_gdf[huc_col] == huc12]
    .dissolve()
    .reset_index(drop=True)
)
gdf[huc_col] = huc12

### check it out
gdf

### Make a site map with satellite imagery in the background
gv.extension("bokeh")
site_map = (
    gv.tile_sources.EsriImagery.opts(width=700, height=450)
    * gdf.to_crs(epsg=4326).hvplot(
        geo=True,
        crs=ccrs.PlateCarree(),
        fill_alpha=0,
        line_color="yellow",
        line_width=2,
        title=f"HU12 Watershed {huc12}",
    )
)
site_map

import holoviews as hv

upper_santa_maria_creek_plot = site_map

hv.save(upper_santa_maria_creek_plot, "upper_santa_maria_creek_watershed.html")

### Log in to earthaccess
auth = earthaccess.login(persist=True)

### Search for HLS granules we want
results = earthaccess.search_data(

    ### specify which dataset and spatial resolution we want 
    short_name="HLSL30",

    ### specify that we're using cloud data
    cloud_hosted=True,

    ### use the bounding box from our watershed boundary
    bounding_box=tuple(gdf.to_crs(epsg=4326).total_bounds),

    ### set the temporal range of the data
    temporal=("2023-05-01", "2023-10-31"),
)
len(results)

46

### make a function to process all the granules from the earthaccess search
### and extract information for each granule

def process_granules(search_results):

    ### make and display a progress bar
    progress_bar = tqdm(total=len(search_results), desc="Compiling granule metadata")

    ### regex to extract tile id and band from HLS filenames
    tif_regex = re.compile(
        r"\.(?P<tile_id>\w+)\.\d+T\d+\.v\d\.\d\.(?P<band>[A-Za-z0-9]+)\.tif$",
        flags=re.IGNORECASE,
    )

    ### accumulate rows
    link_rows = []

    ### loop over granules
    for result in search_results:

        ### locate metadata
        umm = result["umm"]

        ### pull out granule id
        granule_id = umm.get("GranuleUR")

        ### extract datetime
        granule_datetime = pd.to_datetime(
            umm.get("TemporalExtent", {})
            .get("RangeDateTime", {})
            .get("BeginningDateTime")
        )

        ### extract boundary coordinates
        points = []
        try:
            gpolygons = (
                umm.get("SpatialExtent", {})
                .get("HorizontalSpatialDomain", {})
                .get("Geometry", {})
                .get("GPolygons", [])
            )

            if isinstance(gpolygons, dict):
                gpolygons = [gpolygons]

            if gpolygons:
                boundary = gpolygons[0].get("Boundary", [])
                if isinstance(boundary, dict):
                    boundary = [boundary]
                if boundary:
                    points = boundary[0].get("Points", [])

        except Exception as e:
            print(f"Could not extract polygon for {granule_id}: {e}")
            points = []

        ### make polygon
        if len(points) >= 3:
            geometry = Polygon([(pt["Longitude"], pt["Latitude"]) for pt in points])
        else:
            geometry = None

        ### THIS is the important part:
        ### open through earthaccess and keep the returned file-like objects
        try:
            granule_files = earthaccess.open([result])
        except Exception as e:
            print(f"Could not open granule {granule_id}: {e}")
            progress_bar.update(1)
            continue

        ### loop through each file object in the granule
        for file_obj in granule_files:
            file_match = tif_regex.search(file_obj.full_name)

            if file_match:
                link_rows.append(
                    {
                        "granule_id": granule_id,
                        "datetime": granule_datetime,
                        "geometry": geometry,
                        "url": file_obj,   # keep file-like object, not plain string
                        "tile_id": file_match.group("tile_id"),
                        "band": file_match.group("band"),
                    }
                )

        progress_bar.update(1)

    progress_bar.close()

    granule_gdf = gpd.GeoDataFrame(link_rows, geometry="geometry", crs="EPSG:4326")
    return granule_gdf

from rasterio.errors import RasterioIOError

# @cached("masked_reflectance")   # leave off to avoid odd cache bug
def compute_reflectance_data(search_results, watershed_gdf):

    ### rebuild metadata inside function using earthaccess.open() file objects
    granule_gdf = process_granules(search_results)

    def open_crop_raster(file_obj, watershed_gdf, scale_factor=1.0, masked=True):
        try:
            da = rxr.open_rasterio(file_obj, masked=masked).squeeze(drop=True)
        except RasterioIOError as e:
            print(f"Open failed:\n{file_obj}\n{e}")
            return None
        except Exception as e:
            print(f"Unexpected open failure:\n{file_obj}\n{e}")
            return None

        try:
            watershed_on_da = watershed_gdf.to_crs(da.rio.crs)
            da = da.rio.clip_box(*watershed_on_da.total_bounds)
        except Exception as e:
            print(f"Clip failed:\n{file_obj}\n{e}")
            return None

        return da.astype("float32") * scale_factor

    def get_cloud_mask(fmask_da, mask_bits=(1, 2, 3)):
        bits = np.unpackbits(
            fmask_da.fillna(0).astype(np.uint8).values[..., np.newaxis],
            axis=-1,
            bitorder="little",
        )
        valid = np.prod(bits[..., list(mask_bits)] == 0, axis=-1).astype(bool)
        return xr.DataArray(valid, coords=fmask_da.coords, dims=fmask_da.dims)

    granule_da_rows = []

    total_groups = 0
    missing_fmask = 0
    failed_fmask_open = 0
    attempted_bands = 0
    failed_band_open = 0
    successful_bands = 0

    group_iter = granule_gdf.groupby(["datetime", "tile_id"])

    for (image_datetime, tile_id), tile_df in tqdm(group_iter, desc="Processing granules"):
        total_groups += 1

        fmask_df = tile_df.loc[tile_df.band.str.lower() == "fmask"]
        if fmask_df.empty:
            missing_fmask += 1
            continue

        fmask_da = open_crop_raster(
            fmask_df.iloc[0].url,
            watershed_gdf,
            scale_factor=1.0,
            masked=False,
        )

        if fmask_da is None:
            failed_fmask_open += 1
            continue

        clear_mask = get_cloud_mask(fmask_da)

        spectral_df = tile_df.loc[tile_df.band.str.upper().str.startswith("B")]

        for _, band_row in spectral_df.iterrows():
            attempted_bands += 1

            band_da = open_crop_raster(
                band_row.url,
                watershed_gdf,
                scale_factor=0.0001,
                masked=True,
            )

            if band_da is None:
                failed_band_open += 1
                continue

            band_da.name = band_row.band
            band_da = band_da.where(clear_mask)

            row_data = band_row.to_dict()
            row_data["da"] = band_da
            granule_da_rows.append(row_data)
            successful_bands += 1

    print("\nSummary")
    print("-------")
    print("Total datetime/tile groups:", total_groups)
    print("Groups missing Fmask row:", missing_fmask)
    print("Groups where Fmask failed to open:", failed_fmask_open)
    print("Band rasters attempted:", attempted_bands)
    print("Band rasters failed to open:", failed_band_open)
    print("Successful band rasters:", successful_bands)
    print("Rows appended:", len(granule_da_rows))

    if len(granule_da_rows) == 0:
        print("No valid rasters were processed; returning an empty GeoDataFrame.")
        empty_cols = list(granule_gdf.columns) + ["da"]
        return gpd.GeoDataFrame(
            {col: [] for col in empty_cols},
            geometry="geometry",
            crs=granule_gdf.crs
        )

    granule_da_df = pd.DataFrame(granule_da_rows)

    granule_da_gdf = gpd.GeoDataFrame(
        granule_da_df,
        geometry="geometry",
        crs=granule_gdf.crs
    )

    return granule_da_gdf

### apply the function
granule_da_gdf = compute_reflectance_data(results, gdf)

Compiling granule metadata:   0%|          | 0/46 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

QUEUEING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/15 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/15 [00:00<?, ?it/s]

# Here I am just checking the granules
print("Shape:", granule_da_gdf.shape)
print("Unique tiles:", granule_da_gdf["tile_id"].nunique())
print("Unique dates:", granule_da_gdf["datetime"].nunique())

Shape: (460, 7)
Unique tiles: 2
Unique dates: 24

# Call the gdf to make sure we have all 46 granules across all bands
granule_da_gdf["band"].value_counts()

band
B06    46
B03    46
B11    46
B05    46
B02    46
B09    46
B01    46
B10    46
B07    46
B04    46
Name: count, dtype: int64

### check out the dataframe for time band and our array too
granule_da_gdf[["datetime", "tile_id", "band", "da"]].head()

### apply cache decorator
@cached(f"composite_reflectance_{huc12}", override=True)

### create a function to merge and composite reflectance data from multiple granules
### end result: single, composite reflectance image for each spectral band
def merge_and_composite(granule_da_gdf):

    ### initialize a list to store dfs
    band_composites = []

    ### initialize a list to store composites after procesing
    spectral_bands = sorted(granule_da_gdf.band.unique())

    ### loop over each spectral band
    for band in spectral_bands:
        merged_das = []
        merged_dates = []
        band_df = granule_da_gdf.loc[granule_da_gdf.band == band]

        ### loop over date/time of image acquisition and merge granules for each data
        for image_datetime, date_df in band_df.groupby("datetime"):
            merged_da = rxrmerge.merge_arrays(list(date_df.da))

            ### mask negative values (could be no data or invalid data)
            merged_da = merged_da.where(merged_da >= 0)

            ### append to merged_das list we initialized
            merged_das.append(merged_da)
            merged_dates.append(pd.to_datetime(image_datetime))

        if not merged_das:
            continue

        ### composite images across dates
        merged_stack = xr.concat(merged_das, dim=pd.Index(merged_dates, name="datetime"))
        composite_da = merged_stack.mean(dim="datetime", skipna=True)
        composite_da = composite_da.expand_dims(band=[band])

        ### add processed and composite data array to lsit
        band_composites.append(composite_da)

    ### concatenates composite data arrays for each band along band dimension
    spectral_da = xr.concat(band_composites, dim="band").sortby("band")
    spectral_da.name = "reflectance"
    return spectral_da

### call function to get final composite reflectance data 
spectral_da = merge_and_composite(granule_da_gdf)
spectral_da

<xarray.DataArray 'reflectance' (band: 10, y: 411, x: 723)> Size: 12MB
array([[[0.02323   , 0.02051   , 0.02068182, ..., 0.02800667,
         0.02893334, 0.02663333],
        [0.0272    , 0.02226363, 0.02182727, ..., 0.03052667,
         0.02136   , 0.02165333],
        [0.02383   , 0.02519091, 0.02255454, ..., 0.02534667,
         0.02004   , 0.02010667],
        ...,
        [0.0182    , 0.024575  , 0.022325  , ..., 0.0297375 ,
         0.0295125 , 0.03119375],
        [0.02154167, 0.02566666, 0.0228    , ..., 0.02696875,
         0.02768125, 0.03118125],
        [0.02564167, 0.021125  , 0.01755   , ..., 0.0282875 ,
         0.029525  , 0.030075  ]],

       [[0.02739091, 0.0237    , 0.02630909, ..., 0.03753333,
         0.03829334, 0.03575333],
        [0.03229091, 0.02842727, 0.02795455, ..., 0.04112667,
         0.02968   , 0.02982667],
        [0.02806364, 0.03233636, 0.02918182, ..., 0.03398   ,
         0.02794667, 0.02822   ],
...
        [0.29019165, 0.2897    , 0.28825834, ..., 0.34480622,
         0.34674996, 0.34965625],
        [0.2914    , 0.29025832, 0.28791663, ..., 0.3422937 ,
         0.344625  , 0.3476125 ],
        [0.2916917 , 0.289775  , 0.28661668, ..., 0.33952498,
         0.34243748, 0.34556875]],

       [[0.2868091 , 0.28707272, 0.28841817, ..., 0.31446669,
         0.31547335, 0.31639332],
        [0.28775454, 0.28784546, 0.28925455, ..., 0.31435332,
         0.31426668, 0.31457335],
        [0.28948182, 0.2891909 , 0.29031816, ..., 0.3147733 ,
         0.31355998, 0.3128733 ],
        ...,
        [0.27674165, 0.27655   , 0.2756    , ..., 0.32446876,
         0.326175  , 0.32840624],
        [0.27785   , 0.27693334, 0.27515   , ..., 0.32219994,
         0.324125  , 0.3264625 ],
        [0.27839997, 0.27651668, 0.27376667, ..., 0.3194438 ,
         0.32189375, 0.32453123]]], shape=(10, 411, 723), dtype=float32)
Coordinates:
  * band         (band) object 80B 'B01' 'B02' 'B03' 'B04' ... 'B09' 'B10' 'B11'
  * y            (y) float64 3kB 3.663e+06 3.663e+06 ... 3.651e+06 3.651e+06
  * x            (x) float64 6kB 5.044e+05 5.044e+05 ... 5.26e+05 5.26e+05
    spatial_ref  int64 8B 0
Attributes: (12/35)
    ACCODE:                    Lasrc; Lasrc
    add_offset:                0.0
    arop_ave_xshift(meters):   0, 0
    arop_ave_yshift(meters):   0, 0
    arop_ncp:                  0, 0
    arop_rmse(meters):         0, 0
    ...                        ...
    TIRS_SSM_MODEL:            PRELIMINARY; PRELIMINARY
    TIRS_SSM_POSITION_STATUS:  ESTIMATED; ESTIMATED
    ULX:                       399960
    ULY:                       3700020
    USGS_SOFTWARE:             LPGS_16.2.0
    AREA_OR_POINT:             Area

array([[[0.02323   , 0.02051   , 0.02068182, ..., 0.02800667,
         0.02893334, 0.02663333],
        [0.0272    , 0.02226363, 0.02182727, ..., 0.03052667,
         0.02136   , 0.02165333],
        [0.02383   , 0.02519091, 0.02255454, ..., 0.02534667,
         0.02004   , 0.02010667],
        ...,
        [0.0182    , 0.024575  , 0.022325  , ..., 0.0297375 ,
         0.0295125 , 0.03119375],
        [0.02154167, 0.02566666, 0.0228    , ..., 0.02696875,
         0.02768125, 0.03118125],
        [0.02564167, 0.021125  , 0.01755   , ..., 0.0282875 ,
         0.029525  , 0.030075  ]],

       [[0.02739091, 0.0237    , 0.02630909, ..., 0.03753333,
         0.03829334, 0.03575333],
        [0.03229091, 0.02842727, 0.02795455, ..., 0.04112667,
         0.02968   , 0.02982667],
        [0.02806364, 0.03233636, 0.02918182, ..., 0.03398   ,
         0.02794667, 0.02822   ],
...
        [0.29019165, 0.2897    , 0.28825834, ..., 0.34480622,
         0.34674996, 0.34965625],
        [0.2914    , 0.29025832, 0.28791663, ..., 0.3422937 ,
         0.344625  , 0.3476125 ],
        [0.2916917 , 0.289775  , 0.28661668, ..., 0.33952498,
         0.34243748, 0.34556875]],

       [[0.2868091 , 0.28707272, 0.28841817, ..., 0.31446669,
         0.31547335, 0.31639332],
        [0.28775454, 0.28784546, 0.28925455, ..., 0.31435332,
         0.31426668, 0.31457335],
        [0.28948182, 0.2891909 , 0.29031816, ..., 0.3147733 ,
         0.31355998, 0.3128733 ],
        ...,
        [0.27674165, 0.27655   , 0.2756    , ..., 0.32446876,
         0.326175  , 0.32840624],
        [0.27785   , 0.27693334, 0.27515   , ..., 0.32219994,
         0.324125  , 0.3264625 ],
        [0.27839997, 0.27651668, 0.27376667, ..., 0.3194438 ,
         0.32189375, 0.32453123]]], shape=(10, 411, 723), dtype=float32)

array(['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B09', 'B10', 'B11'],
      dtype=object)

array([3663195., 3663165., 3663135., ..., 3650955., 3650925., 3650895.],
      shape=(411,))

array([504375., 504405., 504435., ..., 525975., 526005., 526035.], shape=(723,))

array(0)

# Just like before we print here to make sure everything looks okay. We have some null values here but we will handles those later 
# before plotting
print(spectral_da.shape)
print("Any nonzero?", bool((spectral_da != 0).any()))
print("Min:", float(spectral_da.min()))
print("Max:", float(spectral_da.max()))
print("Null count:", int(spectral_da.isnull().sum()))

(10, 411, 723)
Any nonzero? True
Min: 0.0
Max: 0.9353333115577698
Null count: 16

# Here I want to the see the min and max of the values for each band, we will explore this further with clustering
# I just want to make sure there is nothing really odd here either
for b in spectral_da.band.values:
    band = spectral_da.sel(band=b)
    print(
        b,
        "min=", float(band.min()),
        "max=", float(band.max()),
        "nonzero=", int((band != 0).sum()),
        "null=", int(band.isnull().sum()),
    )

B01 min= 0.0 max= 0.49246424436569214 nonzero= 297152 null= 0
B02 min= 0.0033400002866983414 max= 0.5289143323898315 nonzero= 297153 null= 0
B03 min= 0.0080933328717947 max= 0.6199153065681458 nonzero= 297153 null= 0
B04 min= 0.004414285533130169 max= 0.682315468788147 nonzero= 297153 null= 0
B05 min= 0.00029999998514540493 max= 0.7021153569221497 nonzero= 297153 null= 4
B06 min= 0.0 max= 0.8451332449913025 nonzero= 297152 null= 7
B07 min= 0.00044999999227002263 max= 0.9353333115577698 nonzero= 297153 null= 5
B09 min= 0.0008166665793396533 max= 0.004364286083728075 nonzero= 297153 null= 0
B10 min= 0.24478568136692047 max= 0.42134544253349304 nonzero= 297153 null= 0
B11 min= 0.23344282805919647 max= 0.3924363851547241 nonzero= 297153 null= 0

### Convert spectral DataArray to a tidy DataFrame
model_df = (
    spectral_da.to_dataframe(name="reflectance")
    .reset_index()
    .pivot(index=["x", "y"], columns="band", values="reflectance")
)

### remove missing values
model_df = model_df.dropna()

### remove pixels where all bands are zero
model_df = model_df.loc[(model_df.sum(axis=1) > 0)]

model_df.head()

### features only (ignore the clusters column)
X = model_df.copy()

### initialize k-means model 
kmeans_model = KMeans(n_clusters=6, random_state=42, n_init="auto")

### fit model and predict
cluster_labels = kmeans_model.fit_predict(X)

### add clusters back
model_df["clusters"] = cluster_labels

model_df.head()

# In the above cell I specified 6 clusters. I want to actually inspect them furhter to see what potential land cover features
# we can infer from them (e.g., water, vegetation, built environment) before we get to the RBG and Kmeans maps
cluster_summary = (
    model_df.groupby("clusters")
    .mean()
    .reset_index()
)

cluster_summary

# Here instead of a table, I plot the values we got above for each the clusters so there are 6 lines plotted over the bands
# You will immediately notice nearly no reflectance for band 9 and band 10 is the highest of the bunch
import matplotlib.pyplot as plt

cluster_summary.set_index("clusters").T.plot(figsize=(8, 5))
plt.title("Spectral Profiles by Cluster")
plt.ylabel("Reflectance")
plt.xlabel("Band")
plt.legend(title="Cluster")
plt.show()

### make data array with bands to use for rgb: red, green, and blue
rgb_da = spectral_da.sel(band=["B04", "B03", "B02"]).sortby(["x", "y"]) * 10
rgb_plot = rgb_da.clip(min=0, max=1).hvplot.rgb(
    x="x",
    y="y",
    xlabel="Easting",
    ylabel="Northing",
    bands="band",
    aspect="equal",
    width=500,
    height=350,
)
rgb_plot

cluster_xr = (
    model_df["clusters"]
    .to_xarray()
    .transpose("y", "x")
    .sortby("x")
    .sortby("y")
)

kmeans_plot = cluster_xr.hvplot(
    x="x",
    y="y",
    cmap="Colorblind",
    aspect="equal",
    xlabel="Easting",
    ylabel="Northing",
    width=500,
    height=350,
).opts(
    xrotation=45
)

(rgb_plot + kmeans_plot)

import holoviews as hv

final_plot = rgb_plot + kmeans_plot

hv.save(final_plot, "kmeans_rgb_comparison.html")

	datetime	tile_id	band	da
0	2023-05-02 18:22:00.594000+00:00	T11SMS	B06	[[<xarray.DataArray 'B06' ()> Size: 4B\narray(...
1	2023-05-02 18:22:00.594000+00:00	T11SMS	B03	[[<xarray.DataArray 'B03' ()> Size: 4B\narray(...
2	2023-05-02 18:22:00.594000+00:00	T11SMS	B11	[[<xarray.DataArray 'B11' ()> Size: 4B\narray(...
3	2023-05-02 18:22:00.594000+00:00	T11SMS	B05	[[<xarray.DataArray 'B05' ()> Size: 4B\narray(...
4	2023-05-02 18:22:00.594000+00:00	T11SMS	B02	[[<xarray.DataArray 'B02' ()> Size: 4B\narray(...

	band	B01	B02	B03	B04	B05	B06	B07	B09	B10	B11
x	y
504375.0	3650895.0	0.025642	0.033417	0.057108	0.059558	0.213975	0.186283	0.119000	0.001158	0.291692	0.278400
	3650925.0	0.021542	0.028300	0.049100	0.050400	0.198392	0.172817	0.108717	0.001150	0.291400	0.277850
	3650955.0	0.018200	0.023858	0.042358	0.040517	0.195050	0.145400	0.085942	0.001192	0.290192	0.276742
	3650985.0	0.016983	0.022633	0.041750	0.038917	0.199042	0.139700	0.079883	0.001042	0.289808	0.276750
	3651015.0	0.012230	0.014967	0.033875	0.030000	0.200625	0.138375	0.074400	0.001008	0.291342	0.278542

	band	B01	B02	B03	B04	B05	B06	B07	B09	B10	B11	clusters
x	y
504375.0	3650895.0	0.025642	0.033417	0.057108	0.059558	0.213975	0.186283	0.119000	0.001158	0.291692	0.278400	1
	3650925.0	0.021542	0.028300	0.049100	0.050400	0.198392	0.172817	0.108717	0.001150	0.291400	0.277850	1
	3650955.0	0.018200	0.023858	0.042358	0.040517	0.195050	0.145400	0.085942	0.001192	0.290192	0.276742	1
	3650985.0	0.016983	0.022633	0.041750	0.038917	0.199042	0.139700	0.079883	0.001042	0.289808	0.276750	1
	3651015.0	0.012230	0.014967	0.033875	0.030000	0.200625	0.138375	0.074400	0.001008	0.291342	0.278542	1

band	clusters	B01	B02	B03	B04	B05	B06	B07	B09	B10	B11
0	0	0.032548	0.043107	0.069339	0.090922	0.189569	0.265964	0.188226	0.001647	0.355648	0.334234
1	1	0.016472	0.022893	0.042723	0.045080	0.176095	0.152530	0.094621	0.001419	0.306055	0.290713
2	2	0.024953	0.033707	0.057382	0.068760	0.184276	0.208346	0.142976	0.001534	0.333516	0.314948
3	3	0.053037	0.070494	0.111710	0.141235	0.279038	0.321384	0.239069	0.001618	0.370657	0.347376
4	4	0.043205	0.057961	0.092269	0.121504	0.236178	0.311877	0.224460	0.001666	0.370873	0.347318
5	5	0.035028	0.049063	0.082814	0.100561	0.236223	0.237609	0.170229	0.001519	0.355898	0.334728

Land cover classification at the Upper Santa Maria Creek, CA¶

About the Upper Santa Maria Creek Subwatershed¶

Using this notebook¶

Step 1a: Libraries and GDAL¶

Step 1b: Run the caching decorator¶

Step 2: Plotting the study site¶

Step 3: Multispectral data array¶

Step 4: Kmeans clustering¶

Step 5: Plot¶

Analysis¶

Step 1b: Run the caching decorator¶

What my code is doing¶

STEP 2: Study site map¶

What my code is doing¶

STEP 3: Multispectral data¶

Step 3a: Search for data¶

Why we retrieve multiple image granules¶

Step 3b: Compile information about each granule¶

Step 3c: Open, crop, and mask data¶

Converting raw imagery to reflectance values¶

Step 3d: Merge and Composite Data¶

Creating a composite image¶

Note on the cells below¶

STEP 4: K-means clustering¶

Applying KMeans clustering¶

What my code is doing¶

Clarifying Bands vs Clusters¶

Basic insights from kmeans clustering before plotting¶

STEP 5: Plot¶

Mapping cluster results¶

What do our RBG and KMeans plots tell us?¶

Cluster 0/1 – Likely Vegetation (High NIR, Low Red)¶

Cluster 1/2 – Likely Moderate Vegetation / Mixed Cover¶

Cluster 2/3 – Likely Sparse Vegetation / Disturbed Areas¶

Cluster 3/4 – Built environments¶

Cluster 4/5 – Dry Soil / Bare Ground (High SWIR, Higher Red)¶

Cluster 5 – Mixed or Transitional¶

Takeaway¶