Source code for calliope.core.time.clustering

Copyright (C) 2013-2018 Calliope contributors listed in AUTHORS.
Licensed under the Apache 2.0 License (see LICENSE file).

Functions to cluster data along the time dimension.

import numpy as np
import pandas as pd
import xarray as xr

from sklearn.metrics import mean_squared_error
from sklearn import cluster as sk_cluster

from calliope import exceptions

def _stack_data(data, dates, times):
    Stack all non-time dimensions of an xarray DataArray
    data_to_stack = data.assign_coords(
        timesteps=pd.MultiIndex.from_product([dates, times], names=['dates', 'times'])
    non_date_dims = list(set(data_to_stack.dims).difference(['dates', 'times'])) + ['times']
    if len(non_date_dims) >= 2:
        stacked_var = data_to_stack.stack(stacked=non_date_dims)
        raise exceptions.ModelError(
            "Cannot conduct time clustering with variable {} as it has no "
            "non-time dimensions.".format(
    return stacked_var

def reshape_for_clustering(data, loc_techs=None, variables=None):
    Create an array of timeseries values, where each day has a row of all
    hourly timeseries data from all relevant variables

    data : xarray Dataset
        Dataset with all non-time dependent variables removed
    loc_techs : string or list-like, default = None
        If clustering over a subset of loc_techs, they are listed here
    variables : string or list-like, default = None
        If clustering over a subset of timeseries variables, they are listed here

    X : numpy 2darray
        Array with all rows as days and all other data in columns, with NaNs
        converted to zeros
    # to create an array with days as rows, we need to get timesteps per day
    timesteps = data.timesteps.to_index()
    if timesteps.dtype.kind == 'M': # 'M' = datetime format
        dates = np.unique(
        times = np.unique(timesteps.time)
    else: # mean_data from get_closest_days_from_clusters is in the format cluster-timestep
        dates = np.unique([i.split('-')[0] for i in timesteps])
        times = np.unique([i.split('-')[1] for i in timesteps])

    reshaped_data = np.array([[] for i in dates])
    # if 'variables' is given then we will loop over that, otherwise we loop over
    # all timeseries variables
    relevent_vars = variables if variables else data.data_vars

    # loop over variables to get our arrays
    for var in relevent_vars:
        temp_data = data[var].copy()
        # if there is a loc_tech subset, index over that
        if loc_techs:
            loc_tech_dim = [i for i in data[var].dims if 'loc_techs' in i][0]
            relevent_loc_techs = list(
            temp_data = temp_data.loc[{loc_tech_dim: relevent_loc_techs}]

        # stack all non-time dimensions to get one row of data for each timestep
        stacked_var = _stack_data(temp_data, dates, times)
        # reshape the array to split days and timesteps within days, now we have
        # one row of data per day
        reshaped_data = np.concatenate([reshaped_data, stacked_var.values], axis=1)
    # put all the columns of data together, keeping rows as days. Also convert
    # all nans to zeros
    X = np.nan_to_num(reshaped_data)

    return X

def get_mean_from_clusters(data, clusters, timesteps_per_day):
    Clusters are days which are considered similar to each other. Here we find
    the mean value (in each tech dimension) for each cluster and return it

    data : xarray Dataset
        Dataset with all non-time dependent variables removed
    clusters : pandas DataFrame
        index as days, columns as cluster group to which that day is associated
    timesteps_per_day : int
        Number of timesteps in each day

    ds : xarray Dataset
        Dataset of timeseries DataArrays. Time dimension is only equal to the
        number of clusters * timesteps_per_day.

    cluster_map = clusters.groupby(clusters).groups

    ds = {}
    t_coords = [
        '{}-{}'.format(cid, t) for cid in cluster_map for t in range(timesteps_per_day)
    for var in data.data_vars:
        clustered_array = data[var].loc[{'timesteps': data.timesteps[:len(t_coords)]}].copy()
        clustered_array['timesteps'] = t_coords
        for cluster_id, cluster_members in cluster_map.items():
            current_cluster = [
                i for i in t_coords if i.startswith('{}-'.format(cluster_id))
            clustered_array.loc[{'timesteps': current_cluster}] = (
                data[var].loc[{'timesteps': cluster_members}]
        ds[var] = clustered_array
    ds = xr.Dataset(ds)
    return ds

def find_nearest_vector_index(array, value, metric='rmse'):
    compares the data for one cluster to every day in the timeseries, to find the
    day which most closely matches.

    array : np.ndarray
        full timeseries array,
        shape = (num_dates, num_loc_techs * num_vars * num_timesteps_per_day)
        The shape is acheived by running a dataset through clustering.reshape_for_clustering
    value : np.ndarray
        one cluster of data,
        shape = (1, num_loc_techs * num_vars * num_timesteps_per_day)
        The shape is acheived by running the mean clustered dataset,
        subset over one cluster, through clustering.reshape_for_clustering
    metric : str, default = 'rmse'
        Error metric with which to compare the array and the values.
        If 'rmse', will compare the root mean square error between `value` and
        each date in `array`.
        If 'abs' will compare the absolute difference between `value` and
        each date in `array`.
    if metric == 'rmse':
        error = np.array([mean_squared_error(i, value[0]) for i in array])
    elif metric == 'abs':
        error = np.array([np.linalg.norm(sum(y)) for y in array - value])
        raise ValueError(
            'Error metric can only be `rmse` or `abs`, {} given'.format(metric)

    return error.argmin()

def get_closest_days_from_clusters(data, mean_data, clusters, daily_timesteps):
    Given a set of mean cluster timeseries profiles, find the day in the full
    timeseries that matches each cluster profile most closely.

    data : xarray Dataset
        Input model data, with all non-timeseries data vars dropped
    mean_data : xarray Dataset
        Mean per cluster of input model data, resulting timeseries is of the form
        X-Y where X is the cluster number and Y is the timestep (both integers)
    clusters : pd.Series
        Cluster to which each date is aligned.
        Index = dates of the full timeseries, values = cluster number
    daily_timesteps : list
        hours assigned to each timestep in a day (e.g. an entry of 0.25 = 15 minutes).
        We expect uniform timesteps between days, which is checked prior to
        reaching this function.

        new_data : xarray Dataset
            data, indexed over only the timesteps within the days that most
            closely match the cluster means.
            Number of timesteps = timesteps_per_day * num_clusters. May be lower
            than this if many cluster means match with the same day
        chosen_day_timestepas : dict
            The day assigned to each cluster. key = cluster number, value = date.
    dtindex = data['timesteps'].to_index()
    timesteps_per_day = len(daily_timesteps)
    chosen_days = {}

    for cluster in sorted(clusters.unique()):

        subset_t = [t for t in mean_data.timesteps.values
                    if t.startswith('{}-'.format(cluster))]

        target = reshape_for_clustering(mean_data.loc[dict(timesteps=subset_t)])

        lookup_array = reshape_for_clustering(data)

        chosen_days[cluster] = find_nearest_vector_index(lookup_array, target)

    days_list = sorted(list(set(chosen_days.values())))
    new_t_coord = _timesteps_from_daily_index(
        dtindex[::timesteps_per_day][days_list], daily_timesteps

    chosen_day_timestamps = {k: dtindex[::timesteps_per_day][v]
                             for k, v in chosen_days.items()}

    new_data = data.loc[dict(timesteps=new_t_coord)]

    return new_data, chosen_day_timestamps

def _timesteps_from_daily_index(idx, daily_timesteps):
    new_idx = pd.Index([
        date + pd.DateOffset(hours=sum(daily_timesteps[:timestep]))
        for date in idx for timestep in range(len(daily_timesteps))
    return new_idx

def map_clusters_to_data(data, clusters, how, daily_timesteps, storage_inter_cluster=True):
    Returns a copy of data that has been clustered.

    how : str
        How to select data from clusters. Can be mean (centroid) or closest real
        day to the mean (by root mean square error).
    storage_inter_cluster : bool, default=True
        If True, add `datesteps` to model_data, for use in the backend to build
        inter_cluster storage decision variables and constraints
    # FIXME hardcoded time intervals ('1H', '1D')

    # Get all timesteps, not just the first per day
    timesteps_per_day = len(daily_timesteps)
    idx = clusters.index
    new_idx = _timesteps_from_daily_index(idx, daily_timesteps)
    clusters_timeseries = (clusters.reindex(new_idx)

    new_data = get_mean_from_clusters(data, clusters_timeseries, timesteps_per_day)
    new_data.attrs = data.attrs

    if how == 'mean':
        # Add timestep names by taking the median timestamp from daily clusters...
        # (a random way of doing it, but we want some label to apply)
        timestamps = clusters.groupby(clusters).apply(
            lambda x: x.index[int(len(x.index) / 2)]
        new_data.coords['timesteps'] = _timesteps_from_daily_index(
            pd.Index(timestamps.values), daily_timesteps
        # Generate weights
        # weight of each timestep = number of timesteps in this timestep's cluster
        # divided by timesteps per day (since we're grouping days together and
        # a cluster consisting of 1 day = 24 hours should have weight of 1)
        value_counts = clusters_timeseries.value_counts() / timesteps_per_day
        # And turn the index into dates (days)
        value_counts = pd.DataFrame({
            'dates': timestamps,
            'counts': value_counts}).set_index('dates')['counts']

    elif how == 'closest':
        new_data, chosen_ts = get_closest_days_from_clusters(data, new_data, clusters, daily_timesteps)
        # Deal with the case where more than one cluster has the same closest day
        # An easy way is to rename the original clusters with the chosen days
        # So at this point, clusterdays_timeseries maps all timesteps to the day
        # of year of the cluster the timestep belongs to
        clusterdays_timeseries = x: chosen_ts[x])
        value_counts = clusterdays_timeseries.value_counts() / timesteps_per_day
        timestamps = pd.DataFrame.from_dict(chosen_ts, orient='index')[0]
        cluster_diff = len(clusters.unique()) - len(timestamps.unique())
        if cluster_diff > 0:
                'Creating {} fewer clusters as some clusters share the same '
                'closest day'.format(cluster_diff)
            timestamps = timestamps.drop_duplicates()
            for cluster, date in timestamps.items():
                clusterdays_timeseries.loc[clusterdays_timeseries == date] = cluster
            clusters = clusterdays_timeseries.resample('1D').mean()

    _clusters = xr.DataArray(
        data=np.full(len(new_data.timesteps.values), np.nan),
        coords={'timesteps': new_data.timesteps.values}

    for cluster, date in timestamps.items():
        _clusters.loc[date.strftime('%Y-%m-%d')] = cluster

    new_data['timestep_cluster'] = _clusters.astype(int)
    weights = (value_counts.reindex(_timesteps_from_daily_index(value_counts.index, daily_timesteps))
    new_data['timestep_weights'] = xr.DataArray(weights, dims=['timesteps'])
    days = np.unique(new_data.timesteps.to_index().date)
    new_data['timestep_resolution'] = (
        xr.DataArray(np.tile(daily_timesteps, len(days)),
                     coords={'timesteps': new_data['timesteps']})

    if storage_inter_cluster: = 'datesteps'
        new_data['lookup_datestep_cluster'] = xr.DataArray.from_series(clusters) = 'clusters'
    new_data.coords['clusters'] = timestamps.index

    return new_data

[docs]def get_clusters( data, func, timesteps_per_day, tech=None, timesteps=None, k=None, variables=None, **kwargs ): """ Run a clustering algorithm on the timeseries data supplied. All timeseries data is reshaped into one row per day before clustering into similar days. Parameters ---------- data : xarray.Dataset Should be normalized func : str 'kmeans' or 'hierarchical' for KMeans or Agglomerative clustering, respectively timesteps_per_day : int Total number of timesteps in a day tech : list, optional list of strings referring to technologies by which clustering is undertaken. If none (default), all technologies within timeseries variables will be used. timesteps : list or str, optional Subset of the time domain within which to apply clustering. k : int, optional Number of clusters to create. If none (default), will use Hartigan's rule to infer a reasonable number of clusters. variables : list, optional data variables (e.g. `resource`, `energy_eff`) by whose values the data will be clustered. If none (default), all timeseries variables will be used. kwargs : dict Additional keyword arguments available depend on the `func`. For available KMeans kwargs see: For available hierarchical kwargs see: Returns ------- clusters : dataframe Indexed by timesteps and with locations as columns, giving cluster membership for first timestep of each day. clustered_data : sklearn.cluster object Result of clustering using sklearn.KMeans(k).fit(X) or sklearn.KMeans(k).AgglomerativeClustering(X). Allows user to access specific attributes, for detailed statistical analysis. """ if timesteps is not None: data = data.loc[{'timesteps': timesteps}] else: timesteps = data.timesteps.values X = reshape_for_clustering(data, tech, variables) if func == 'kmeans': if not k: k = hartigan_n_clusters(X) exceptions.warn( 'Used Hartigan\'s rule to determine that' 'a good number of clusters is {}.'.format(k) ) clustered_data = sk_cluster.KMeans(k).fit(X) elif func == 'hierarchical': if not k: raise exceptions.ModelError( 'Cannot undertake hierarchical clustering without a predefined ' 'number of clusters (k)' ) clustered_data = sk_cluster.AgglomerativeClustering(k).fit(X) # Determine the cluster membership of each day day_clusters = clustered_data.labels_ # Create mapping of timesteps to clusters clusters = pd.Series(day_clusters, index=timesteps[::timesteps_per_day]) return clusters, clustered_data
def hartigan_n_clusters(X, threshold=10): """ Try clustering using sklearn.cluster.kmeans, for several cluster sizes. Using Hartigan's rule, we will return the number of clusters after which the benefit of clustering is low. """ def _H_rule(inertia, inertia_plus_one, n_clusters, len_input): # see return ((inertia / inertia_plus_one) - 1) * (len_input - n_clusters - 1) len_input = len(X) n_clusters = 1 HK = threshold + 1 while n_clusters <= len_input and HK > threshold: kmeans = sk_cluster.KMeans(n_clusters=n_clusters).fit(X) kmeans_plus_one = sk_cluster.KMeans(n_clusters=n_clusters + 1).fit(X) inertia = kmeans.inertia_ inertia_plus_one = kmeans_plus_one.inertia_ HK = _H_rule(inertia, inertia_plus_one, n_clusters, len_input) n_clusters += 1 if HK > threshold: # i.e. we went to the limit where n_clusters = len_input exceptions.warn("Based on threshold, number of clusters = number of dates") return len_input else: return n_clusters - 1