# Source code for calliope.core.time.clustering
"""
Copyright (C) 2013-2018 Calliope contributors listed in AUTHORS.
Licensed under the Apache 2.0 License (see LICENSE file).
clustering.py
~~~~~~~~~~~~~
Functions to cluster data along the time dimension.
"""
import numpy as np
import pandas as pd
import xarray as xr
from sklearn.metrics import mean_squared_error
from sklearn import cluster as sk_cluster
from calliope import exceptions
def _stack_data(data, dates, times):
    """
    Collapse every non-date dimension of an xarray DataArray into a single
    'stacked' dimension, after splitting 'timesteps' into (dates, times).

    Raises a ModelError if the array has nothing but time dimensions, since
    there would then be nothing to stack.
    """
    day_time_index = pd.MultiIndex.from_product(
        [dates, times], names=['dates', 'times']
    )
    unstacked = data.assign_coords(timesteps=day_time_index).unstack('timesteps')

    # NOTE(review): set-difference ordering is arbitrary but is kept as-is so
    # column ordering matches the original implementation within a run
    dims_to_stack = list(set(unstacked.dims).difference(['dates', 'times'])) + ['times']

    if len(dims_to_stack) < 2:
        raise exceptions.ModelError(
            "Cannot conduct time clustering with variable {} as it has no "
            "non-time dimensions.".format(data.name)
        )

    return unstacked.stack(stacked=dims_to_stack)
def reshape_for_clustering(data, loc_techs=None, variables=None):
    """
    Flatten a timeseries Dataset so that each day is one row and each
    (variable, loc_tech, intra-day timestep) combination is one column.

    Parameters
    ----------
    data : xarray Dataset
        Dataset with all non-time dependent variables removed
    loc_techs : string or list-like, default = None
        If clustering over a subset of loc_techs, they are listed here
    variables : string or list-like, default = None
        If clustering over a subset of timeseries variables, they are listed here

    Returns
    -------
    X : numpy 2darray
        Array with all rows as days and all other data in columns, with NaNs
        converted to zeros
    """
    timesteps = data.timesteps.to_index()
    if timesteps.dtype.kind == 'M':  # 'M' = datetime format
        dates = np.unique(timesteps.date)
        times = np.unique(timesteps.time)
    else:
        # mean_data from get_closest_days_from_clusters is in the format
        # cluster-timestep
        dates = np.unique([t.split('-')[0] for t in timesteps])
        times = np.unique([t.split('-')[1] for t in timesteps])

    # Default to every timeseries variable unless a subset is requested
    requested_vars = variables if variables else data.data_vars

    # Collect one (days x columns) array per variable, then join column-wise
    per_var_arrays = []
    for var in requested_vars:
        var_data = data[var].copy()
        if loc_techs:
            # Restrict the loc_tech dimension to the requested subset
            loc_tech_dim = [d for d in data[var].dims if 'loc_techs' in d][0]
            keep = list(
                set(var_data[loc_tech_dim].values).intersection(loc_techs)
            )
            var_data = var_data.loc[{loc_tech_dim: keep}]
        # Stack non-time dimensions so each date becomes a single row
        per_var_arrays.append(_stack_data(var_data, dates, times).values)

    if per_var_arrays:
        reshaped = np.concatenate(per_var_arrays, axis=1)
    else:
        reshaped = np.array([[] for _ in dates])

    # Convert all NaNs to zeros before handing the array to clustering
    return np.nan_to_num(reshaped)
def get_mean_from_clusters(data, clusters, timesteps_per_day):
    """
    Compute, per cluster of similar days, the mean timeseries profile of every
    data variable and return them in a new Dataset.

    Parameters
    ----------
    data : xarray Dataset
        Dataset with all non-time dependent variables removed
    clusters : pandas DataFrame
        index as days, columns as cluster group to which that day is associated
    timesteps_per_day : int
        Number of timesteps in each day

    Returns
    -------
    ds : xarray Dataset
        Dataset of timeseries DataArrays. Time dimension is only equal to the
        number of clusters * timesteps_per_day.
    """
    cluster_map = clusters.groupby(clusters).groups

    # Synthetic timestep labels of the form '<cluster>-<intra-day step>'
    t_coords = [
        '{}-{}'.format(cid, t)
        for cid in cluster_map
        for t in range(timesteps_per_day)
    ]

    mean_arrays = {}
    for var in data.data_vars:
        # Reuse a slice of the original array as a template of the right shape
        new_array = data[var].loc[
            {'timesteps': data.timesteps[:len(t_coords)]}
        ].copy()
        new_array['timesteps'] = t_coords

        for cluster_id, member_days in cluster_map.items():
            prefix = '{}-'.format(cluster_id)
            cluster_coords = [t for t in t_coords if t.startswith(prefix)]
            # Mean over the member days, grouped by time-of-day
            new_array.loc[{'timesteps': cluster_coords}] = (
                data[var].loc[{'timesteps': member_days}]
                .groupby('timesteps.time').mean(dim='timesteps').values
            )

        mean_arrays[var] = new_array

    return xr.Dataset(mean_arrays)
def find_nearest_vector_index(array, value, metric='rmse'):
    """
    Compare the data for one cluster to every day in the timeseries, to find
    the day which most closely matches.

    Parameters
    ----------
    array : np.ndarray
        full timeseries array,
        shape = (num_dates, num_loc_techs * num_vars * num_timesteps_per_day)
        The shape is achieved by running a dataset through clustering.reshape_for_clustering
    value : np.ndarray
        one cluster of data,
        shape = (1, num_loc_techs * num_vars * num_timesteps_per_day)
        The shape is achieved by running the mean clustered dataset,
        subset over one cluster, through clustering.reshape_for_clustering
    metric : str, default = 'rmse'
        Error metric with which to compare the array and the values.
        If 'rmse', will compare the root mean square error between `value` and
        each date in `array`.
        If 'abs' will compare the absolute difference between `value` and
        each date in `array`.

    Returns
    -------
    int
        Row index of `array` minimising the chosen error metric.

    Raises
    ------
    ValueError
        If `metric` is neither 'rmse' nor 'abs'.
    """
    if metric == 'rmse':
        # Vectorized replacement for a per-row sklearn mean_squared_error
        # loop. argmin of MSE equals argmin of RMSE, so no sqrt is needed.
        error = np.mean((array - value[0]) ** 2, axis=1)
    elif metric == 'abs':
        # Absolute value of the summed (signed) differences per day; this is
        # exactly what np.linalg.norm(sum(row)) computed row-by-row before.
        error = np.abs((array - value).sum(axis=1))
    else:
        raise ValueError(
            'Error metric can only be `rmse` or `abs`, {} given'.format(metric)
        )
    return error.argmin()
def get_closest_days_from_clusters(data, mean_data, clusters, daily_timesteps):
    """
    Given a set of mean cluster timeseries profiles, find the day in the full
    timeseries that matches each cluster profile most closely.

    Parameters
    ----------
    data : xarray Dataset
        Input model data, with all non-timeseries data vars dropped
    mean_data : xarray Dataset
        Mean per cluster of input model data, resulting timeseries is of the form
        X-Y where X is the cluster number and Y is the timestep (both integers)
    clusters : pd.Series
        Cluster to which each date is aligned.
        Index = dates of the full timeseries, values = cluster number
    daily_timesteps : list
        hours assigned to each timestep in a day (e.g. an entry of 0.25 = 15 minutes).
        We expect uniform timesteps between days, which is checked prior to
        reaching this function.

    Returns
    -------
    new_data : xarray Dataset
        data, indexed over only the timesteps within the days that most
        closely match the cluster means.
        Number of timesteps = timesteps_per_day * num_clusters. May be lower
        than this if many cluster means match with the same day
    chosen_day_timestamps : dict
        The day assigned to each cluster. key = cluster number, value = date.
    """
    dtindex = data['timesteps'].to_index()
    timesteps_per_day = len(daily_timesteps)

    # The full-timeseries lookup array is identical for every cluster, so
    # build it once instead of on every loop iteration (it was previously
    # recomputed per cluster).
    lookup_array = reshape_for_clustering(data)

    chosen_days = {}
    for cluster in sorted(clusters.unique()):
        # Timesteps labelled '<cluster>-<step>' belonging to this cluster
        subset_t = [
            t for t in mean_data.timesteps.values
            if t.startswith('{}-'.format(cluster))
        ]
        target = reshape_for_clustering(mean_data.loc[dict(timesteps=subset_t)])
        chosen_days[cluster] = find_nearest_vector_index(lookup_array, target)

    # Deduplicate: several clusters may match the same day
    days_list = sorted(set(chosen_days.values()))

    new_t_coord = _timesteps_from_daily_index(
        dtindex[::timesteps_per_day][days_list], daily_timesteps
    )
    chosen_day_timestamps = {
        k: dtindex[::timesteps_per_day][v] for k, v in chosen_days.items()
    }

    new_data = data.loc[dict(timesteps=new_t_coord)]

    return new_data, chosen_day_timestamps
def _timesteps_from_daily_index(idx, daily_timesteps):
new_idx = pd.Index([
date + pd.DateOffset(hours=sum(daily_timesteps[:timestep]))
for date in idx for timestep in range(len(daily_timesteps))
])
return new_idx
def map_clusters_to_data(data, clusters, how, daily_timesteps, storage_inter_cluster=True):
    """
    Returns a copy of data that has been clustered.

    Parameters
    ----------
    data : xarray Dataset
        Input model data, with all non-timeseries data vars dropped
    clusters : pd.Series
        Cluster to which each date is assigned.
        Index = dates of the full timeseries, values = cluster number
    how : str
        How to select data from clusters. Can be mean (centroid) or closest real
        day to the mean (by root mean square error).
    daily_timesteps : list
        hours assigned to each timestep in a day (e.g. an entry of 0.25 = 15 minutes)
    storage_inter_cluster : bool, default=True
        If True, add `datesteps` to model_data, for use in the backend to build
        inter_cluster storage decision variables and constraints

    Returns
    -------
    new_data : xarray Dataset
        Clustered data, including `timestep_cluster`, `timestep_weights` and
        `timestep_resolution` data variables.
    """
    # FIXME hardcoded time intervals ('1H', '1D')

    # Get all timesteps, not just the first per day
    timesteps_per_day = len(daily_timesteps)
    idx = clusters.index
    new_idx = _timesteps_from_daily_index(idx, daily_timesteps)
    clusters_timeseries = (clusters.reindex(new_idx)
                           .fillna(method='ffill').astype(int))

    new_data = get_mean_from_clusters(data, clusters_timeseries, timesteps_per_day)
    new_data.attrs = data.attrs

    if how == 'mean':
        # Add timestep names by taking the median timestamp from daily clusters...
        # (a random way of doing it, but we want some label to apply)
        timestamps = clusters.groupby(clusters).apply(
            lambda x: x.index[int(len(x.index) / 2)]
        )
        new_data.coords['timesteps'] = _timesteps_from_daily_index(
            pd.Index(timestamps.values), daily_timesteps
        )
        # Generate weights
        # weight of each timestep = number of timesteps in this timestep's cluster
        # divided by timesteps per day (since we're grouping days together and
        # a cluster consisting of 1 day = 24 hours should have weight of 1)
        value_counts = clusters_timeseries.value_counts() / timesteps_per_day
        # And turn the index into dates (days)
        value_counts = pd.DataFrame({
            'dates': timestamps,
            'counts': value_counts}).set_index('dates')['counts']

    elif how == 'closest':
        new_data, chosen_ts = get_closest_days_from_clusters(
            data, new_data, clusters, daily_timesteps
        )
        # Deal with the case where more than one cluster has the same closest day
        # An easy way is to rename the original clusters with the chosen days
        # So at this point, clusterdays_timeseries maps all timesteps to the day
        # of year of the cluster the timestep belongs to
        clusterdays_timeseries = clusters_timeseries.map(lambda x: chosen_ts[x])
        value_counts = clusterdays_timeseries.value_counts() / timesteps_per_day
        timestamps = pd.DataFrame.from_dict(chosen_ts, orient='index')[0]
        cluster_diff = len(clusters.unique()) - len(timestamps.unique())
        if cluster_diff > 0:
            # BUGFIX: the warning object was previously only instantiated
            # (`exceptions.ModelWarning(...)`) and never issued; use
            # exceptions.warn, as done elsewhere in this module, so the user
            # actually sees the message
            exceptions.warn(
                'Creating {} fewer clusters as some clusters share the same '
                'closest day'.format(cluster_diff)
            )
            timestamps = timestamps.drop_duplicates()
        # Map the chosen dates back to cluster numbers; this must run even
        # when no clusters were merged, so that clusterdays_timeseries holds
        # numeric cluster ids before resampling below
        for cluster, date in timestamps.items():
            clusterdays_timeseries.loc[clusterdays_timeseries == date] = cluster
        clusters = clusterdays_timeseries.resample('1D').mean()

    # Cluster membership of each remaining timestep, labelled by day
    _clusters = xr.DataArray(
        data=np.full(len(new_data.timesteps.values), np.nan),
        dims='timesteps',
        coords={'timesteps': new_data.timesteps.values}
    )
    for cluster, date in timestamps.items():
        _clusters.loc[date.strftime('%Y-%m-%d')] = cluster
    new_data['timestep_cluster'] = _clusters.astype(int)

    # Broadcast per-day weights to every timestep in the day
    weights = (
        value_counts.reindex(
            _timesteps_from_daily_index(value_counts.index, daily_timesteps)
        ).fillna(method='ffill')
    )
    new_data['timestep_weights'] = xr.DataArray(weights, dims=['timesteps'])

    days = np.unique(new_data.timesteps.to_index().date)
    new_data['timestep_resolution'] = (
        xr.DataArray(np.tile(daily_timesteps, len(days)),
                     dims=['timesteps'],
                     coords={'timesteps': new_data['timesteps']})
    )

    if storage_inter_cluster:
        clusters.index.name = 'datesteps'
        new_data['lookup_datestep_cluster'] = xr.DataArray.from_series(clusters)
        timestamps.index.name = 'clusters'
        new_data.coords['clusters'] = timestamps.index

    return new_data
def get_clusters(
        data, func, timesteps_per_day,
        tech=None, timesteps=None, k=None, variables=None, **kwargs
):
    """
    Run a clustering algorithm on the timeseries data supplied. All timeseries
    data is reshaped into one row per day before clustering into similar days.

    Parameters
    ----------
    data : xarray.Dataset
        Should be normalized
    func : str
        'kmeans' or 'hierarchical' for KMeans or Agglomerative clustering, respectively
    timesteps_per_day : int
        Total number of timesteps in a day
    tech : list, optional
        list of strings referring to technologies by which clustering is undertaken.
        If none (default), all technologies within timeseries variables will be used.
    timesteps : list or str, optional
        Subset of the time domain within which to apply clustering.
    k : int, optional
        Number of clusters to create. If none (default), will use Hartigan's rule
        to infer a reasonable number of clusters.
    variables : list, optional
        data variables (e.g. `resource`, `energy_eff`) by whose values the data
        will be clustered. If none (default), all timeseries variables will be used.
    kwargs : dict
        Additional keyword arguments available depend on the `func`.
        For available KMeans kwargs see:
        http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
        For available hierarchical kwargs see:
        http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html

    Returns
    -------
    clusters : dataframe
        Indexed by timesteps and with locations as columns, giving cluster
        membership for first timestep of each day.
    clustered_data : sklearn.cluster object
        Result of clustering using sklearn.KMeans(k).fit(X) or
        sklearn.AgglomerativeClustering(k).fit(X). Allows user to access
        specific attributes, for detailed statistical analysis.
    """
    if timesteps is not None:
        data = data.loc[{'timesteps': timesteps}]
    else:
        timesteps = data.timesteps.values

    X = reshape_for_clustering(data, tech, variables)

    if func == 'kmeans':
        if not k:
            k = hartigan_n_clusters(X)
            # BUGFIX: the two implicitly concatenated literals previously had
            # no separating space, producing '...determine thata good...'
            exceptions.warn(
                'Used Hartigan\'s rule to determine that '
                'a good number of clusters is {}.'.format(k)
            )
        # BUGFIX: kwargs were documented as forwarded to sklearn but were
        # silently ignored; pass them through
        clustered_data = sk_cluster.KMeans(k, **kwargs).fit(X)

    elif func == 'hierarchical':
        if not k:
            raise exceptions.ModelError(
                'Cannot undertake hierarchical clustering without a predefined '
                'number of clusters (k)'
            )
        clustered_data = sk_cluster.AgglomerativeClustering(k, **kwargs).fit(X)

    # Determine the cluster membership of each day
    day_clusters = clustered_data.labels_

    # Create mapping of timesteps to clusters: one label per day, attached to
    # the first timestep of that day
    clusters = pd.Series(day_clusters, index=timesteps[::timesteps_per_day])

    return clusters, clustered_data
def hartigan_n_clusters(X, threshold=10):
    """
    Infer a reasonable number of clusters for `X` using Hartigan's rule.

    KMeans is fitted for successively larger cluster counts; we stop once the
    Hartigan statistic (the marginal benefit of adding one more cluster)
    drops to `threshold` or below, and return that cluster count.
    """
    def _hartigan_statistic(inertia_k, inertia_k_plus_one, k, n_samples):
        # see http://www.dcs.bbk.ac.uk/~mirkin/papers/00357_07-216RR_mirkin.pdf
        return ((inertia_k / inertia_k_plus_one) - 1) * (n_samples - k - 1)

    n_samples = len(X)
    statistic = threshold + 1
    k = 1

    while k <= n_samples and statistic > threshold:
        fit_k = sk_cluster.KMeans(n_clusters=k).fit(X)
        fit_k_plus_one = sk_cluster.KMeans(n_clusters=k + 1).fit(X)
        statistic = _hartigan_statistic(
            fit_k.inertia_, fit_k_plus_one.inertia_, k, n_samples
        )
        k += 1

    if statistic > threshold:
        # The statistic never dropped below the threshold, i.e. we went all
        # the way to one cluster per sample
        exceptions.warn("Based on threshold, number of clusters = number of dates")
        return n_samples
    else:
        # k was incremented once past the accepted cluster count
        return k - 1