Source code for calliope.time_clustering

"""
Copyright (C) 2013-2017 Calliope contributors listed in AUTHORS.
Licensed under the Apache 2.0 License (see LICENSE file).

time_clustering.py
~~~~~~~~~~~~~~~~~~

Functions to cluster data along the time dimension.

"""

import numpy as np
import pandas as pd
import xarray as xr

try:
    import matplotlib.pyplot as plt
    import matplotlib.patches
    from matplotlib.colors import rgb2hex
    import seaborn as sns
except ImportError:
    pass  # This is logged in analysis.py

import scipy.cluster.vq as vq
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist


def _get_y_coord(array):
    if 'y' in array.coords:
        y = 'y'
    else:
        try:  # assumes a single y_ coord in array
            y = [k for k in array.coords if 'y_' in k][0]
        except IndexError:  # empty list
            y = None
    return y


def _get_datavars(data):
    return [var for var in data.data_vars if not var.startswith('_')]


def _get_timesteps_per_day(data):
    timesteps_per_day = data.attrs['time_res'] * 24
    if isinstance(timesteps_per_day, float):
        assert timesteps_per_day.is_integer(), 'Timesteps/day must be integer.'
        timesteps_per_day = int(timesteps_per_day)
    return timesteps_per_day


def reshape_for_clustering(data, tech=None):
    y_coord = 'y_def_r'
    y_values = list(data.attrs['_sets'][y_coord])
    timesteps_per_day = _get_timesteps_per_day(data)
    days = int(len(data.t) / timesteps_per_day)
    regions = len(data['x'])
    techs = len(y_values)

    if tech is None:
        X = (data.r.loc[{'y': y_values}]
                   .transpose('t', 'x', 'y')
                   .values.reshape(days, timesteps_per_day * regions * techs))
    else:
        X = (data.r.loc[{'y': tech}]
                   .values
                   .reshape((days, timesteps_per_day * regions)))

    return np.nan_to_num(X)  # replace any NaN with 0
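

# Illustrative sketch (not part of the original module): reshape_for_clustering
# turns the (t, x, y) resource data into one row per day, so each day becomes a
# single sample for k-means or hierarchical clustering. ``model_data`` and the
# tech name 'pv' are assumptions used only to show the expected shapes.
def _example_reshape_shapes(model_data):
    X_all = reshape_for_clustering(model_data)
    # X_all.shape == (days, timesteps_per_day * regions * techs)
    X_pv = reshape_for_clustering(model_data, tech='pv')
    # X_pv.shape == (days, timesteps_per_day * regions)
    return X_all.shape, X_pv.shape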


def reshape_clustered(clustered, data, tech=None):
    y_coord = 'y_def_r'
    y_values = list(data.attrs['_sets'][y_coord])
    timesteps_per_day = _get_timesteps_per_day(data)
    days = clustered.shape[0]
    regions = len(data['x'])
    techs = len(y_values)

    if tech is None:
        reshaped = clustered.reshape(techs, days * timesteps_per_day, regions)
    else:
        reshaped = clustered.reshape(1, days * timesteps_per_day, regions)

    arr = xr.DataArray(reshaped, dims=('y', 't', 'x'))

    arr['x'] = data['x']
    if tech is None:
        arr['y'] = y_values
    else:
        arr['y'] = tech

    return arr


def get_mean_from_clusters(data, clusters, timesteps_per_day):

    cluster_map = clusters.groupby(clusters).groups

    hour_of_day = pd.Series([pd.Timestamp(i).hour
                             for i in data.coords['t'].values])
    # hour_of_day = pd.Series([i - (i // timesteps_per_day) * timesteps_per_day
    #                          for i in data.coords['t'].values])

    ds = {}
    data_vars_in_t = [v for v in _get_datavars(data)
                      if 't' in data[v].dims]
    for var in data_vars_in_t:
        data_arrays = []
        array = data[var]
        y_coord = _get_y_coord(array)

        t_coords = ['{}-{}'.format(cid, t)
                    for cid in cluster_map
                    for t in range(timesteps_per_day)]

        # Check if this variable has any data
        if len(data.coords[y_coord].values) == 0:
            e = np.empty((0, len(t_coords), len(data.coords['x'].values)))
            ds[var] = xr.DataArray(e, dims=[y_coord, 't', 'x'],
                                   coords={'x': data.coords['x'].values,
                                           't': t_coords})
        else:
            for cluster_id, cluster_members in cluster_map.items():
                y_arrays = []
                for y in data.coords[y_coord].values:
                    d = (array.loc[{'t': cluster_members, y_coord: y}]
                              .to_pandas().groupby(lambda x: x.hour).mean()
                              .to_xarray().to_array(dim='x').T
                         )
                    d.coords[y_coord] = y
                    d = d.rename({'index': 't'})
                    d.coords['t'] = [i for i in t_coords
                                     if i.startswith('{}-'.format(cluster_id))]
                    y_arrays.append(d)
                data_arrays.append(xr.concat(y_arrays, dim=y_coord))
                # data_arrays[-1].coords['cluster'] = cluster_id

            ds[var] = xr.concat(data_arrays, dim='t')
    ds = xr.Dataset(ds)
    return ds
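

# Illustrative sketch (not part of the original module): get_mean_from_clusters
# expects ``clusters`` to be indexed by every timestep, not just the first
# timestep of each day, which is how map_clusters_to_data() calls it. The
# names ``model_data`` and ``daily_clusters`` are assumptions showing the
# expected call pattern.
def _example_mean_from_clusters(model_data, daily_clusters):
    ts_per_day = _get_timesteps_per_day(model_data)
    hourly_clusters = (daily_clusters
                       .reindex(_hourly_from_daily_index(daily_clusters.index))
                       .fillna(method='ffill').astype(int))
    return get_mean_from_clusters(model_data, hourly_clusters, ts_per_day)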


def find_nearest_vector_index(array, value):
    # For each entry along the first axis of ``array``, sum its difference to
    # ``value`` over the second (tech) axis, take the Euclidean norm of the
    # result, and return the index of the smallest such distance
    return np.array([np.linalg.norm(d.sum(axis=0))
                     for d in array - value]).argmin()


def get_closest_days_from_clusters(data, mean_data, clusters):
    subset_y = list(data.attrs['_sets']['y_def_r'])
    dtindex = data['t'].to_index()
    ts_per_day = _get_timesteps_per_day(data)
    days = int(len(data['t']) / ts_per_day)
    n_x = len(data['x'])
    n_y = len(subset_y)

    chosen_days = {}

    for cluster in sorted(clusters.unique()):

        subset_t = [t for t in mean_data.t.values if t.startswith('{}-'.format(cluster))]

        target = mean_data['r'].loc[dict(t=subset_t, y=subset_y)].values

        lookup_array = data['r'].loc[dict(y=subset_y)].values
        lookup_array = lookup_array.reshape((n_y, days, ts_per_day, n_x)).transpose(1, 0, 2, 3)

        chosen_days[cluster] = find_nearest_vector_index(lookup_array, target)

    days_list = sorted(list(set(chosen_days.values())))
    new_t_coord = _hourly_from_daily_index(dtindex[::ts_per_day][days_list])

    chosen_day_timestamps = {k: dtindex[::ts_per_day][v]
                             for k, v in chosen_days.items()}

    new_data = data.loc[dict(t=new_t_coord)]

    return new_data, chosen_day_timestamps


def _hourly_from_daily_index(idx):
    dtrange = lambda i: pd.date_range(i, i + pd.Timedelta('1D'),
                                      freq='1H')[:-1]
    new_idx = pd.concat([pd.Series(1, dtrange(i))
                         for i in idx]).index
    return new_idx
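

# Minimal sketch (not part of the original module) of what _hourly_from_daily_index
# does: each daily timestamp is expanded into its 24 hourly timestamps.
def _example_hourly_from_daily():
    daily = pd.DatetimeIndex(['2005-01-01', '2005-01-03'])
    hourly = _hourly_from_daily_index(daily)
    # hourly runs 2005-01-01 00:00 ... 23:00, then 2005-01-03 00:00 ... 23:00,
    # i.e. 48 hourly timestamps in total
    return hourly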


def map_clusters_to_data(data, clusters, how):
    """
    Returns a copy of data that has been clustered.

    Parameters
    ----------
    how : str
        How to select data from clusters. Can be mean (centroid) or closest.

    """
    # FIXME hardcoded time intervals ('1H', '1D')

    # Get all timesteps, not just the first per day
    ts_per_day = _get_timesteps_per_day(data)
    idx = clusters.index
    new_idx = _hourly_from_daily_index(idx)
    clusters_timeseries = (clusters.reindex(new_idx)
                           .fillna(method='ffill').astype(int))

    new_data = get_mean_from_clusters(data, clusters_timeseries, ts_per_day)

    if how == 'mean':
        # Add timestep names by taking the median timestamp from daily clusters...
        # (a random way of doing it, but we want some label to apply)
        timestamps = clusters.groupby(clusters).apply(
            lambda x: x.index[int(len(x.index) / 2)])
        new_t_coord = pd.concat([
            pd.Series(
                pd.date_range(ts, ts + pd.Timedelta('1D'), freq='1H')[:-1])
            for ts in timestamps], ignore_index=True)
        new_data.coords['t'] = new_t_coord.values

        # Generate weights
        # weight of each timestep = number of timesteps in this timestep's cluster
        # divided by timesteps per day (since we're grouping days together and
        # a cluster consisting of 1 day = 24 hours should have weight of 1)
        value_counts = clusters_timeseries.value_counts() / ts_per_day
        # And turn the index into dates (days)
        value_counts = pd.DataFrame({
            'dates': timestamps,
            'counts': value_counts}).set_index('dates')['counts']

    elif how == 'closest':
        new_data, chosen_ts = get_closest_days_from_clusters(data, new_data,
                                                             clusters)
        # Deal with the case where more than one cluster has the same closest day
        # An easy way is to rename the original clusters with the chosen days
        # So at this point, clusterdays_timeseries maps all timesteps to the day
        # of year of the cluster the timestep belongs to
        clusterdays_timeseries = clusters_timeseries.map(lambda x: chosen_ts[x])
        value_counts = clusterdays_timeseries.value_counts() / ts_per_day

    weights = (value_counts.reindex(_hourly_from_daily_index(value_counts.index))
               .fillna(method='ffill'))
    new_data['_weights'] = xr.DataArray(weights, dims=['t'])
    new_data['_time_res'] = xr.DataArray(
        np.ones(len(new_data['t'])) * (24 / ts_per_day),
        dims=['t'], coords={'t': new_data['t']})

    return new_data
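

# Usage sketch (assumption, not part of the original module): a typical workflow
# clusters a normalized model dataset into representative days and then maps
# those clusters back onto the data. ``model_data`` and k=5 are assumptions;
# how='closest' keeps the real day nearest each cluster mean, while how='mean'
# uses the cluster centroids directly.
def _example_map_clusters(model_data):
    clusters, _ = get_clusters_kmeans(model_data, k=5)
    return map_clusters_to_data(model_data, clusters, how='closest')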


def get_clusters_kmeans(data, tech=None, timesteps=None, k=5):
    """
    Parameters
    ----------
    data : xarray.Dataset
        Should be normalized

    Returns
    -------
    clusters : pandas.Series
        Cluster membership for the first timestep of each day,
        indexed by those timesteps.
    centroids

    """
    timesteps_per_day = _get_timesteps_per_day(data)

    if timesteps is not None:
        data = data.loc[{'t': timesteps}]
    else:
        timesteps = data.t.values

    X = reshape_for_clustering(data, tech)

    centroids, distortion = vq.kmeans(X, k)

    # Determine the cluster membership of each day
    day_clusters = vq.vq(X, centroids)[0]

    # Create mapping of timesteps to clusters
    clusters = pd.Series(day_clusters, index=timesteps[::timesteps_per_day])

    # Reshape centroids
    centroids = reshape_clustered(centroids, data, tech)

    return clusters, centroids
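

# Illustrative sketch (not part of the original module): the returned ``clusters``
# series maps the first timestep of each day to a cluster id, and ``centroids``
# is a (y, t, x) DataArray of cluster-mean profiles. ``model_data`` is an
# assumed, already-normalized dataset.
def _example_kmeans(model_data):
    clusters, centroids = get_clusters_kmeans(model_data, k=5)
    days_per_cluster = clusters.value_counts()  # how many days fall in each cluster
    return days_per_cluster, centroids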


def get_clusters_hierarchical(data, tech=None, max_d=None, k=None):
    """
    Parameters
    ----------
    data : xarray.Dataset
        Should be normalized
    max_d : float or int, optional
        Max distance for returning clusters.
    k : int, optional
        Number of desired clusters.

    Returns
    -------
    clusters
    X
    Z

    """
    X = reshape_for_clustering(data, tech)

    # Generate the linkage matrix
    Z = hierarchy.linkage(X, 'ward')

    if max_d:
        # Get clusters based on maximum distance
        clusters = hierarchy.fcluster(Z, max_d, criterion='distance')
    elif k:
        # Get clusters based on number of desired clusters
        clusters = hierarchy.fcluster(Z, k, criterion='maxclust')
    else:
        clusters = None

    # Make sure clusters are a pd.Series with a datetime index
    if clusters is not None:
        timesteps_per_day = _get_timesteps_per_day(data)
        timesteps = data.coords['t'].values  # All timesteps
        clusters = pd.Series(clusters, index=timesteps[::timesteps_per_day])

    return (clusters, X, Z)
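

# Illustrative sketch (not part of the original module): hierarchical clustering
# can either cut the tree at a maximum distance (max_d) or at a fixed number of
# clusters (k); passing neither returns clusters=None so that the linkage matrix
# Z can be inspected first (e.g. with draw_dendrogram or cophenetic_corr).
# ``model_data`` is an assumed, already-normalized dataset.
def _example_hierarchical(model_data):
    clusters, X, Z = get_clusters_hierarchical(model_data, k=5)
    return clusters, X, Z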


def fancy_dendrogram(*args, **kwargs):
    """
    Code adapted from:
    https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/

    """
    max_d = kwargs.pop('max_d', None)
    if max_d and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = max_d
    annotate_above = kwargs.pop('annotate_above', 0)

    ddata = hierarchy.dendrogram(*args, **kwargs)

    if not kwargs.get('no_plot', False):
        plt.xlabel('Sample index or (cluster size)')
        plt.ylabel('Distance')
        for i, d, c in zip(ddata['icoord'], ddata['dcoord'],
                           ddata['color_list']):
            x = 0.5 * sum(i[1:3])
            y = d[1]
            if y > annotate_above:
                plt.plot(x, y, 'o', c=c)
                plt.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                             textcoords='offset points',
                             va='top', ha='center', fontsize=10)
        if max_d:
            plt.axhline(y=max_d, c='grey', lw=1.0, alpha=0.5)
    return ddata


def draw_dendrogram(Z, max_d):
    fig = plt.figure(figsize=(12, 6))
    ax = plt.subplot(111)

    # link_color_pal = sns.color_palette("hls", 8)
    link_color_pal = sns.color_palette("Set2", 10)
    hierarchy.set_link_color_palette([rgb2hex(i) for i in link_color_pal])

    color_threshold = max_d

    fancy_dendrogram(
        Z,
        ax=ax,
        color_threshold=color_threshold,
        truncate_mode='lastp',
        p=20,
        leaf_rotation=90,
        leaf_font_size=10,
        show_contracted=True,
        annotate_above=5,  # useful in small plots so annotations don't overlap
        max_d=color_threshold,
    )

    sns.despine()

    # Modify the contracted markers
    for child in ax.get_children():
        if isinstance(child, matplotlib.patches.Ellipse):
            child.set_zorder(1000)
            child.set_alpha(0.3)

    return fig
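

# Usage sketch (assumption, not part of the original module): inspect the
# linkage matrix before committing to a number of clusters. The threshold
# max_d=15 is an arbitrary example value.
def _example_dendrogram(model_data):
    _, X, Z = get_clusters_hierarchical(model_data)
    fig = draw_dendrogram(Z, max_d=15)
    return fig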


def cophenetic_corr(X, Z):
    """
    Get the Cophenetic Correlation Coefficient of a clustering with help
    of the cophenet() function. This (very briefly) compares (correlates)
    the actual pairwise distances of all your samples to those implied by
    the hierarchical clustering. The closer the value is to 1, the better
    the clustering preserves the original distances.

    Source:
    https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/

    """
    c, coph_dists = hierarchy.cophenet(Z, pdist(X))
    return c, coph_dists
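

# Usage sketch (assumption, not part of the original module): the cophenetic
# correlation coefficient gives a quick check on how well the 'ward' linkage
# from get_clusters_hierarchical() preserves the original pairwise distances
# between days; values close to 1 are better. ``model_data`` is assumed.
def _example_cophenetic(model_data):
    _, X, Z = get_clusters_hierarchical(model_data)
    c, _ = cophenetic_corr(X, Z)
    return c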