"""
Source code for calliope.core.time.funcs

Copyright (C) 2013-2018 Calliope contributors listed in AUTHORS.
Licensed under the Apache 2.0 License (see LICENSE file).

Functions to process time series data.
"""


import numpy as np
import pandas as pd
import xarray as xr

from calliope import exceptions
from calliope.core.util.dataset import get_loc_techs
from calliope.core.time import clustering
from calliope.core.util.logging import logger
from calliope.core.preprocess.lookup import lookup_clusters

def get_daily_timesteps(data, check_uniformity=False):
    """
    Return the timestep resolution values of the first day in ``data``.

    Parameters
    ----------
    data : xarray.Dataset
        Must provide a ``timesteps`` coordinate and a ``timestep_resolution``
        variable indexed over it.
    check_uniformity : bool, optional
        If True, raise an error unless every day holds the same sequence of
        timestep resolutions as the first day. Default is False.

    Returns
    -------
    numpy.ndarray
        ``timestep_resolution`` values of the first calendar day.

    Raises
    ------
    calliope.exceptions.ModelError
        If ``check_uniformity`` is True and the days are not all identical.

    """
    days = np.unique(data.timesteps.to_index().strftime('%Y-%m-%d'))
    # NOTE(review): the element expression of this comprehension was missing
    # from the extracted source and has been reconstructed -- confirm against
    # upstream. Selecting with a 'YYYY-MM-DD' label picks all timesteps of a day.
    daily_timesteps = [
        data.timestep_resolution.loc[day].values for day in days
    ]

    if check_uniformity:
        first_day = daily_timesteps[0]
        # Compare day by day: days with differing numbers of timesteps form a
        # ragged list, which cannot be compared reliably as a single array.
        if not all(np.array_equal(day, first_day) for day in daily_timesteps):
            raise exceptions.ModelError(
                'For clustering, timestep resolution must be uniform.'
            )

    return daily_timesteps[0]

def normalized_copy(data):
    """
    Normalize timeseries data, using the maximum across all regions and timesteps.

    Parameters
    ----------
    data : xarray Dataset
        Dataset with all non-time dependent variables removed

    Returns
    -------
    ds : xarray Dataset
        Copy of `data`, with the absolute taken and normalized to 0-1

    """
    ds = data.copy(deep=True)  # Work off a copy

    for var in ds.data_vars:
        # Each DataArray is indexed over a different subset of loc_techs,
        # so we find it in the list of dimensions
        loc_tech_dim = [i for i in ds[var].dims if 'loc_techs' in i][0]
        loc_techs = ds[loc_tech_dim].values

        # For each technology, get the loc_techs which are relevant
        techs = set(i.split('::')[1] for i in loc_techs)
        loc_tech_subsets = [get_loc_techs(loc_techs, tech) for tech in techs]
        # remove empty lists within the _techs list
        loc_tech_subsets = [i for i in loc_tech_subsets if i]

        # For each technology, divide all values by the maximum absolute value
        for loc_tech in loc_tech_subsets:
            selector = {loc_tech_dim: loc_tech}
            subset = ds[var].loc[selector]
            ds[var].loc[selector] = abs(subset / abs(subset).max())
    return ds

def _copy_non_t_vars(data0, data1):
    """Assign every variable of data0 that has no `timesteps` dimension
    onto data1, and return data1"""
    # Collect the names first, then assign. Copying these by hand keeps them
    # from being polluted with a superfluous `timesteps` dimension.
    static_names = []
    for name, variable in data0.data_vars.items():
        if 'timesteps' not in variable.dims:
            static_names.append(name)

    for name in static_names:
        data1[name] = data0[name]
    return data1

def _combine_datasets(data0, data1):
    """Concatenates data0 and data1 along the time dimension and returns
    the result with its `timesteps` in sorted order"""
    data_new = xr.concat([data0, data1], dim='timesteps')
    # Ensure time dimension is ordered
    data_new = data_new.loc[
        {'timesteps': data_new.timesteps.to_index().sort_values()}
    ]

    return data_new

def _drop_timestep_vars(data, timesteps):
    """
    Return a copy of `data` holding only its timeseries variables, together
    with the non-time coordinates of `data`.

    Parameters
    ----------
    data : xarray.Dataset
    timesteps : indexer over `timesteps` or None
        If not None, the returned timeseries data is restricted to these
        timesteps.

    Returns
    -------
    timeseries_data : xarray.Dataset
        Deep copy of `data` without the variables that lack a `timesteps`
        dimension.
    data_coords : coordinates of a copy of `data`, with `timesteps` removed

    """
    timeseries_data = data.copy(deep=True)
    # Save all coordinates, to ensure they can be added back in after clustering
    data_coords = data.copy().coords
    del data_coords['timesteps']

    if timesteps is not None:
        timeseries_data = timeseries_data.loc[{'timesteps': timesteps}]

    timeseries_data = timeseries_data.drop([
        varname for varname, vardata in data.data_vars.items()
        if 'timesteps' not in vardata.dims
    ])

    return timeseries_data, data_coords

def _load_custom_clusters(clustering_func, model_run):
    """Return the day -> cluster Series described by a `file=...` clustering_func."""
    import warnings  # local import: only this code path needs it

    # Split on the first '=' only, so the file name itself may contain '='
    filename = clustering_func.split('=', 1)[1]
    if ':' in filename:
        filename, column = filename.rsplit(':', 1)
    else:
        column = None

    df = model_run.timeseries_data[filename]
    if isinstance(df, pd.Series):
        if column is not None:
            # Warn and carry on; raising here would make the single-column
            # case unusable whenever a column name is given.
            # NOTE(review): assumes exceptions.ModelWarning subclasses Warning.
            warnings.warn(
                '{} given as time clustering column, but only one column to '
                'choose from in {}.'.format(column, filename),
                exceptions.ModelWarning
            )
        series = df
    elif isinstance(df, pd.DataFrame):
        if column is None:
            raise exceptions.ModelError(
                'No time clustering column given, but multiple columns found in '
                '{0}. Choose one column and add it to {1} as {1}:name_of_column.'
                .format(filename, clustering_func)
            )
        if column not in df.columns:
            raise KeyError(
                'time clustering column {} not found in {}.'.format(column, filename)
            )
        series = df.loc[:, column]
    else:
        raise TypeError(
            'Unexpected type {} for time clustering data in {}.'
            .format(type(df).__name__, filename)
        )

    # One array of distinct cluster values per day, so that the checks below
    # can tell apart empty days and days with conflicting values
    clusters = series.groupby(pd.Grouper(freq='1D')).unique()

    # Check there weren't instances of more than one cluster assigned to a day
    # or days with no information assigned
    if any(len(i) == 0 for i in clusters.values):
        raise exceptions.ModelError(
            'Missing cluster days in `{}:{}`.'.format(filename, column)
        )
    if any(len(i) > 1 for i in clusters.values):
        raise exceptions.ModelError(
            'More than one cluster value assigned to a day in `{}:{}`. '
            'Unique clusters per day: {}'.format(filename, column, clusters)
        )
    clusters.loc[:] = [i[0] for i in clusters.values]
    return clusters


def apply_clustering(data, timesteps, clustering_func, how, normalize=True,
                     scale_clusters='mean', storage_inter_cluster=True,
                     model_run=None, **kwargs):
    """
    Apply the given clustering function to the given data.

    Parameters
    ----------
    data : xarray.Dataset
    timesteps : pandas.DatetimeIndex or list of timesteps or None
    clustering_func : str
        Name of clustering function. Can be `file=....csv:column_name`
        if loading custom clustering. Custom clustering index = timeseries days.
        If no column_name, the CSV file must have only one column of data.
    how : str
        How to map clusters to data. 'mean' or 'closest'.
    normalize : bool, optional
        If True (default), data is normalized before clustering is applied,
        using :func:`~calliope.core.time.funcs.normalized_copy`.
    scale_clusters : str or None, default = 'mean'
        Scale the results of clustering such that the clusters match the metric
        given by scale_clusters. For example, 'mean' scales along each loc_tech
        and variable to match inputs and outputs. Other options for matching
        include 'sum', 'max', and 'min'. If None, no scaling occurs.
    storage_inter_cluster : bool, optional
        Accepted as part of the signature; not read inside this function.
    model_run : optional
        Object whose `timeseries_data` mapping holds the custom clustering
        data. Only used when `clustering_func` contains `file=`.
    **kwargs : optional
        Arguments passed to clustering_func.

    Returns
    -------
    data_new_scaled : xarray.Dataset

    """
    assert how in ['mean', 'closest']

    daily_timesteps = get_daily_timesteps(data, check_uniformity=True)
    timesteps_per_day = len(daily_timesteps)

    # get a copy of the dataset with only timeseries variables,
    # and get all coordinates of the original dataset, to reinstate later
    data_to_cluster, data_coords = _drop_timestep_vars(data, timesteps)

    data_to_cluster = data_to_cluster.drop(['timestep_weights', 'timestep_resolution'])

    for dim in data_to_cluster.dims:
        data_to_cluster[dim] = data[dim]

    if 'file=' in clustering_func:
        clusters = _load_custom_clusters(clustering_func, model_run)
    else:
        # Normalized data is only an input to the clustering algorithm, so it
        # is not computed when clusters are loaded from file
        if normalize:
            data_normalized = normalized_copy(data_to_cluster)
        else:
            data_normalized = data_to_cluster

        result = clustering.get_clusters(
            data_normalized, clustering_func, timesteps_per_day=timesteps_per_day,
            **kwargs
        )
        clusters = result[0]  # Ignore other stuff returned

    data_new = clustering.map_clusters_to_data(
        data_to_cluster, clusters,
        how=how, daily_timesteps=daily_timesteps
    )

    # It's now safe to add the original coordinates back in (preserving all the
    # loc_tech sets that aren't used to index a variable in the DataArray)
    # NOTE(review): this statement was missing from the extracted source and is
    # reconstructed by analogy with `resample` -- confirm against upstream.
    data_new.update(data_coords)

    data_new = _copy_non_t_vars(data, data_new)

    if timesteps is not None:
        # Combine leftover parts of passed in data with new data
        data_new = _combine_datasets(data.drop(timesteps, dim='timesteps'), data_new)
        data_new = _copy_non_t_vars(data, data_new)

    # Scale the new/combined data so that the mean for each (loc_tech, variable)
    # combination matches that from the original data
    data_new_scaled = data_new.copy(deep=True)
    if scale_clusters:
        data_vars_in_t = [
            v for v in data_new.data_vars
            if 'timesteps' in data_new[v].dims and
            'timestep_' not in v and v != 'clusters'
        ]
        for var in data_vars_in_t:
            scale = (
                getattr(data[var], scale_clusters)(dim='timesteps') /
                getattr(data_new[var], scale_clusters)(dim='timesteps')
            )
            data_new_scaled[var] = data_new[var] * scale.fillna(0)

    # NOTE(review): reconstructed call (the imported `lookup_clusters` is
    # otherwise unused); presumably it adds its lookup arrays to the dataset
    # in place -- confirm against upstream.
    lookup_clusters(data_new_scaled)

    return data_new_scaled

def resample(data, timesteps, resolution):
    """
    Function to resample timeseries data from the input resolution (e.g. 1H), to
    the given resolution (e.g. 2H)

    Parameters
    ----------
    data : xarray.Dataset
        calliope model data, containing only timeseries data variables
    timesteps : str or list; optional
        If given, apply resampling to a subset of the timeseries data
    resolution : str
        time resolution of the output data, given in Pandas time frequency format.
        E.g. 1H = 1 hour, 1W = 1 week, 1M = 1 month, 1T = 1 minute.
        Multiples allowed.

    Returns
    -------
    data_rs : xarray.Dataset
        Resampled data. Its `allow_operate_mode` attribute is 1 when all of
        `data` was resampled and 0 when only a subset of timesteps was.

    """
    def _resample(var, how):
        return getattr(var.resample(timesteps=resolution, keep_attrs=True), how)('timesteps')

    # get a copy of the dataset with only timeseries variables,
    # and get all coordinates of the original dataset, to reinstate later
    data_new, data_coords = _drop_timestep_vars(data, timesteps)

    # First create a new resampled dataset of the correct size by
    # using first-resample, which should be a quick way to achieve this
    data_rs = _resample(data_new, how='first')

    # Iterate over a snapshot of the names, since data_rs is modified and
    # reassigned inside the loop
    for var in list(data_rs.data_vars):
        if var in ['timestep_resolution', 'resource']:
            data_rs[var] = _resample(data_new[var], how='sum')
        else:
            try:
                data_rs[var] = _resample(data_new[var], how='mean')
            except TypeError:
                # If the var has a datatype of strings, it can't be resampled
                logger.error(
                    'Dropping {} because it has a {} data type when integer or '
                    'float is expected for timeseries resampling.'
                    .format(var, data_rs[var].dtype)
                )
                data_rs = data_rs.drop(var)

    # Get rid of the filled-in NaN timestamps
    data_rs = data_rs.dropna(dim='timesteps', how='all')

    data_rs.attrs['allow_operate_mode'] = 1  # Resampling still permits operational mode

    # It's now safe to add the original coordinates back in (preserving all the
    # loc_tech sets that aren't used to index a variable in the DataArray)
    data_rs.update(data_coords)

    data_rs = _copy_non_t_vars(data, data_rs)  # add back in non timeseries data

    if timesteps is not None:
        # Combine leftover parts of passed in data with new data
        data_rs = _combine_datasets(data.drop(timesteps, dim='timesteps'), data_rs)
        data_rs = _copy_non_t_vars(data, data_rs)
        # Having timesteps with different lengths does not permit operational mode
        data_rs.attrs['allow_operate_mode'] = 0

    return data_rs
def drop(data, timesteps):
    """
    Drop timesteps from data, adjusting the timestep weight of remaining
    timesteps accordingly. Returns updated dataset.

    Parameters
    ----------
    data : xarray.Dataset
        Calliope model data.
    timesteps : str or list or other iterable
        Pandas-compatible timestep strings.

    Raises
    ------
    calliope.exceptions.ModelError
        If `timesteps` cannot be converted by `pandas.to_datetime`.

    """
    # Turn timesteps into a pandas datetime index for subsetting, which also
    # checks whether they are actually valid
    try:
        timesteps_pd = pd.to_datetime(timesteps)
    except Exception as e:
        # Chain the original exception so the parsing failure stays visible
        raise exceptions.ModelError(
            'Invalid timesteps: {}'.format(timesteps)
        ) from e

    # 'Distribute weight' of the dropped timesteps onto the remaining ones
    dropped_weight = data.timestep_weights.loc[{'timesteps': timesteps_pd}].sum()

    data = data.drop(timesteps_pd, dim='timesteps')

    data['timestep_weights'] = data['timestep_weights'] + (
        dropped_weight / len(data['timestep_weights'])
    )

    return data