# Source code for wwdata.Class_OnlineSensorBased

"""
Class_OnlineSensorBased provides functionalities for data handling of data obtained with online sensors in the field of (waste)water treatment.
Copyright (C) 2016 Chaim De Mulder

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see http://www.gnu.org/licenses/.
"""

#import sys
#import os
#from os import listdir
import pandas as pd
#import scipy as sp
import numpy as np
import matplotlib.pyplot as plt   #plotten in python
import datetime as dt
import warnings as wn
import random as rn

from wwdata.Class_HydroData import HydroData
#from data_reading_functions import _print_removed_output,_log_removed_output
#from time_conversion_functions import *

class OnlineSensorBased(HydroData):
    """
    Subclass of HydroData, expanding its functionalities with specific
    functions for data gathered at full scale by continuous measurements.

    Attributes
    ----------
    timedata_column : str
        name of the column containing the time data
    data_type : str
        type of data provided
    experiment_tag : str
        A tag identifying the experiment; can be a date or a code used by
        the producer/owner of the data.
    time_unit : str
        The time unit in which the time data is given
    units : array
        The units of the variables in the columns

    """
    def __init__(self,data,timedata_column='index',data_type='WWTP',
                 experiment_tag='No tag given',time_unit=None):
        """
        Initialise an OnlineSensorBased object.

        All arguments are forwarded unchanged to ``HydroData.__init__``.
        Afterwards the containers used by the filling functions are created:

        * ``self.filled``: empty dataframe on the index given by self.index()
        * ``self.meta_filled``: copy of ``self.meta_valid`` laid out on the
          index of ``self.data``
        * ``self.filling_error``: one row per column of ``self.data``, with a
          single (empty) column 'imputation error [%]'

        Parameters
        ----------
        data : object accepted by HydroData
            the data to wrap
        timedata_column : str
            name of the column containing the time data
        data_type : str
            type of data provided
        experiment_tag : str
            tag identifying the experiment
        time_unit : str
            the time unit in which the time data is given
        """
        parent_arguments = {
            'data': data,
            'timedata_column': timedata_column,
            'data_type': data_type,
            'experiment_tag': experiment_tag,
            'time_unit': time_unit,
        }
        HydroData.__init__(self, **parent_arguments)

        # Containers filled in later by the fill_missing_* functions
        self.filled = pd.DataFrame(index=self.index())
        valid_labels = self.meta_valid.copy()
        self.meta_filled = pd.DataFrame(valid_labels, index=self.data.index)
        error_columns = ['imputation error [%]']
        self.filling_error = pd.DataFrame(index=self.data.columns,
                                          columns=error_columns)

    #def time_to_index(self,drop=True,inplace=True,verify_integrity=False):
    #    """CONFIRMED
    #    using pandas set_index function to set the columns with timevalues
    #    as index"""
    #    # Drop second layer of indexing to make dataframe handlable
    #    # self.data.columns = self.data.columns.get_level_values(0)
    #
    #    if self.timename == 'index':
    #        raise IndexError('There already is a timeseries in the dataframe index!')
    #    if isinstance(self.time[0],str):
    #        raise ValueError('Time values of type "str" can not be used as index')
    #
    #    if inplace == False:
    #        new_data = self.set_index(self.timename,drop=drop,inplace=False,
    #                                  verify_integrity=verify_integrity)
    #        #self.columns = np.array(new_data.columns)
    #        return self.__class__(new_data,timedata_column='index',
    #                              data_type=self.data_type,experiment_tag=self.tag,
    #                              time_unit=self.time_unit)
    #    elif inplace == True:
    #        self.set_index(self.timename,drop=drop,inplace=True,
    #                       verify_integrity=verify_integrity)
    #        #self.columns = np.array(self.data.columns)
    #        #self.timename = 'index'
    #        #self.time = self.index()

[docs]    def drop_index_duplicates(self):
        """
        drop rows with a duplicate index. Also updates the meta_valid,
        meta_filled and filled dataframes

        Note
        ----
        This operation assumes the dropped rows have the same data in them and
        therefor no data is lost.
        """
        #self.data = self.data.groupby(self.index()).first()
        #self.meta_valid = self.meta_valid.groupby(self.meta_valid.index).first()
        #self.meta_filled = self.meta_filled.groupby(self.meta_filled.index).first()
        #self.filled = self.filled.groupby(self.filled.index).first()

        self.data = self.data[~self.data.index.duplicated(keep='first')]
        self.meta_valid = self.meta_valid[~self.meta_valid.index.duplicated(keep='first')]
        self.meta_filled = self.meta_filled[~self.meta_filled.index.duplicated(keep='first')]
        self.filled= self.filled[~self.filled.index.duplicated(keep='first')]

        self._update_time()
        if isinstance(self.index()[1],str):
            wn.warn('Rows may change order using this function based on '+ \
            'string values. Convert to datetime, int or float and use '+ \
            '.sort_index() or .sort_value() to avoid. (see also hp.to_datetime())')

[docs]    def calc_total_proportional(self,Q_tot,Q,conc,new_name='new',unit='mg/l',
                                filled=False):
        """
        Calculates the total concentration of an incoming flow, based on the
        given total flow and the separate incoming flows and concentrations

        Parameters
        ----------
        Q_tot : str
            name of the column containing the total flow
        Q : array of str
            names of the columns containing the separate flows
        conc : array of str
            names of the columns containing the separate concentration values
        new_name : str
            name of the column to be added
        filled : bool
            if true, use self.filled to calculate proportions from

        Note
        ------
        !!Order of columns in Q and conc must match!!

        Returns
        -------
        None;
        creates a hydropy object with added column for the proportional concentration
        """
        if filled:
            index = self.filled.index
            sum_ = pd.Series(0, index=index)
            for i in range(0,len(Q)):
                sum_ = sum_ + self.filled[Q[i]] * self.filled[conc[i]]
            self.filled[new_name] = sum_ / self.filled[Q_tot]

        else:
            index = self.index()
            sum_ = pd.Series(0, index=index)
            for i in range(0,len(Q)):
                sum_ = sum_ + self.data[Q[i]] * self.data[conc[i]]

            self.data[new_name] = sum_ / self.data[Q_tot]
            self.columns = np.array(self.data.columns)

        try:
            self.units = pd.concat([self.units,
                                    pd.DataFrame([[new_name,unit]],columns=self.units.columns)],
                                    ignore_index=True)
        except:
            wn.warn('Something might have gone wrong with the updating of the units. '+ \
                    'Check self.units to make sure everything is still okay.')
        return None

[docs]    def calc_daily_average(self,column_name,arange,plot=False):
        """
        calculates the daily average of values in the given column and returns them as a 2D-array,
        containing the days and the average values on the respective days. Plotting is possible.

        Parameters
        ----------
        column_name : str
            name of the column containing the data to calculate the average values for
        arange : array of two values
            the range within which daily averages need to be calculated
        plot : bool
            plot or not

        Returns
        -------
        pd.Dataframe :
            pandas dataframe, containing the daily means with standard deviations
            for the selected column
        """
        self.daily_average = {}
        try:
            series = self.data[column_name][arange[0]:arange[1]].copy()
        except TypeError:
            raise TypeError("Slicing not possible for index type " + \
            str(type(self.data.index[0])) + " and arange argument type " + \
            str(type(arange[0])) + ". Try changing the type of the arange " + \
            "values to one compatible with " + str(type(self.data.index[0])) + \
            " slicing.")

        if isinstance(series.index[0],float):
            days = np.arange(series.index[0],series.index[-1],1)
            means = [series[x:x+1].mean() for x in days]
            stds = [series[x:x+1].std() for x in days]

            to_return = pd.DataFrame([days,means,stds]).transpose()
            to_return.columns = ['day','mean','std']

        elif isinstance(self.data.index[0],pd.tslib.Timestamp):
            means = series.resample('d').mean().dropna()
            stds = series.resample('d').std().dropna()

            to_return = pd.DataFrame([means.index,means.values,stds.values]).transpose()
            to_return.columns = ['day','mean','std']

        if plot==True:
            fig = plt.figure(figsize=(16,6))
            ax = fig.add_subplot(111)
            if isinstance(self.data.index[0],pd.tslib.Timestamp):
                ax.errorbar([pd.to_datetime(x) for x in to_return['day']],to_return['mean'],
                            yerr=to_return['std'],fmt='o')
            else:
                ax.errorbar(to_return['day'],to_return['mean'],
                            yerr=to_return['std'],fmt='o')
            #ax.plot(to_return['day'],(to_return['mean']+to_return['std']),'b',alpha=0.5)
            #ax.plot(to_return['day'],(to_return['mean']-to_return['std']),'b',alpha=0.5)
            #ax.fill_between(to_return['day'],to_return['mean'],(to_return['mean']+to_return['std']),
            #                color='grey', alpha='0.3')
            #ax.fill_between(to_return['day'],to_return['mean'],(to_return['mean']-to_return['std']),
            #                color='grey', alpha='0.3')
            ax.tick_params(labelsize=15)
            ax.set_ylabel(column_name,size=20)
            ax.set_xlabel('Time',size=20)

        self.daily_average[column_name] = to_return

#==============================================================================
# FILLING FUNCTIONS
#==============================================================================
    def _reset_meta_filled(self,data_name=None):
        """
        reset the meta dataframe, possibly for only a certain data series,
        should wrong labels have been assigned at some point
        """
        if data_name == None:
            self.meta_filled = pd.DataFrame(self.meta_valid.copy(),index=self.data.index)
        else:
            try:
                self.meta_filled[data_name] = self.meta_valid[data_name].copy()
            except:
                pass
                #wn.warn(data_name + ' is not contained in self.meta_valid yet, so cannot\
                #be removed from it!')

[docs]    def add_to_filled(self,column_names):
        """
        column_names : array
        """
        self._plot = 'filled'
        # Create/adjust self.filled
        self.filled = self.filled.reindex(self.index())
        for column in column_names:
            if not column in self.filled.columns:
                # Only take the validated values to be in the self.filled dataframe in the
                # first place. The reindexing creates nan values where no validated
                # values are present
                self.filled[column] = self.data[column][self.meta_valid[column] == 'original'].copy()
                self.filled = self.filled.reindex(self.index())
            else:
                pass
                #wn.warn('self.filled already contains a column named ' +
                #    column + '. The original columns was kept.')

    #####################
    ###   FILLING
    #####################

[docs]    def fill_missing_interpolation(self,to_fill,range_,arange,method='index',plot=False,
                                   clear=False):
        """
        Fills the missing values in a dataset (to_fill), based specified
        interpolation algorithm (method). This happens only if the number of
        consecutive missing values is smaller than range_.

        Parameters
        ----------
        to_fill : str
            name of the column containing the data to be filled
        range_ : int
            the maximum range that the absence of values can be to still
            allow interpolation to fill in values
        arange : array of two values
            the range within which missing/filtered values need to be replaced
        method : str
            interpolation method to be used by the .interpolate function. See
            pandas docstrings for more info
        plot : bool
            whether or not to plot the new dataset
        clear : bool
            whether or not to clear the previoulsy filled values and start from
            the self.meta_valid dataset again for this particular dataseries.

        Returns
        -------
        None;
        creates/updates self.filled, containing the adjusted dataset and updates
        meta_filled with the correct labels.
        """
        ###
        # CHECKS
        ###
        self._plot = 'filled'
        wn.warn('When making use of filling functions, please make sure to '+ \
        'start filling small gaps and progressively move to larger gaps. This '+ \
        'ensures the proper working of the package algorithms.')
        if clear:
            self._reset_meta_filled(to_fill)
        self.meta_filled = self.meta_filled.reindex(self.index(),fill_value='!!')

        if not to_fill in self.meta_filled.columns:
            # if the to_fill column doesn't exist yet in the meta_filled dataset,
            # add it, and fill it with the meta_valid values; if this last one
            # doesn't exist yet, create it with 'original' tags.
            try:
                self.meta_filled[to_fill] = self.meta_valid[to_fill]
            except:
                self.add_to_meta_valid([to_fill])
                self.meta_filled[to_fill] = self.meta_valid[to_fill]
        else:
            # where the meta_filled dataset contains original values, update with
            # the values from meta_valid; in case a filling round was done before
            # any filtering; not supposed to happen, but cases exist.
            try:
                self.meta_filled[to_fill].loc[self.meta_filled[to_fill]=='original'] = \
                self.meta_valid[to_fill].loc[self.meta_filled[to_fill]=='original']
            except:
                self.add_to_meta_valid([to_fill])
                self.meta_filled[to_fill].loc[self.meta_filled[to_fill]=='original'] = \
                self.meta_valid[to_fill].loc[self.meta_filled[to_fill]=='original']
        if not to_fill in self.filled:
            self.add_to_filled([to_fill])

        # Give warning when replacing data from rain events and at the same time
        # check if arange has the right type
        try:
            rain = (self.data_type == 'WWTP') and \
                   (self.highs['highs'].loc[arange[0]:arange[1]].sum() > 1)
        except TypeError:
            raise TypeError("Slicing not possible for index type " + \
            str(type(self.data.index[0])) + " and arange argument type " + \
            str(type(arange[0])) + ". Try changing the type of the arange " + \
            "values to one compatible with " + str(type(self.data.index[0])) + \
            " slicing.")

        if rain :
            wn.warn('Data points obtained during a rain event will be replaced. '+\
            'Make sure you are confident in this replacement method for the '+\
            'filling of gaps in the data during rain events.')

        ###
        # CALCULATIONS
        ###
        # Create a mask to replace the filtered datapoints with nan-values,
        # if consecutive occurence lower than range_
        mask_df = pd.DataFrame(index = self.meta_valid[arange[0]:arange[1]].index)
        mask_df['count'] = (self.meta_valid[to_fill][arange[0]:arange[1]] != self.meta_valid[to_fill][arange[0]:arange[1]].\
                            shift()).astype(int).cumsum().astype(str)
        group = mask_df.groupby('count').size()
        group.index = mask_df.groupby('count').size().index.astype(str)

        # Compare the values in 'count' with the ones in the group-by object.
        # mask_df now contains the amount of consecutive true or false datapoints,
        # for every datapoint
        replace_dict = {'count':dict(group)}
        mask_df = mask_df.replace(replace_dict)

        # Based on the mask and whether a datapoint is filtered, replace with
        # nan values
        filtered_based = pd.DataFrame(self.meta_filled.loc[self.meta_filled[to_fill] == 'filtered'].index.values)
        mask_based = pd.DataFrame(mask_df.loc[mask_df['count'] < range_].index.values)
        indexes_to_replace = pd.merge(filtered_based,mask_based,how='inner')
        self.filled[to_fill] = self.filled[to_fill].drop(indexes_to_replace[0])

        ###
        # FILLING
        ###
        # Use the .interpolate() method to interpolate for the nan values just created
        # the limit argument makes sure that only the values than can be filled by
        # interpolation are filled; needed to prevent other, already present NaN values
        # from also getting filled!!
        self.filled[to_fill] = self.filled[to_fill].interpolate(method=method,limit=range_)

        # Adjust in the self.meta_filled dataframe
        self.meta_filled.loc[indexes_to_replace[0],to_fill] = 'filled_interpol'

        # Set all points still tagged filtered in the self.filled dataset to NaN
        self.filled.loc[self.meta_filled[to_fill] == 'filtered'] = np.nan

        if plot:
            self.plot_analysed(to_fill)

        return None

[docs]    def fill_missing_ratio(self,to_fill,to_use,ratio,arange,
                             filtered_only=True,plot=False,clear=False):#,use_smoothing=True):
        """
        Fills the missing values in a dataset (to_fill), based on the ratio this
        data shows when comparing to other data (to_use). This happens within
        the range given by arange.

        Parameters
        ----------
        to_fill : str
            name of the column with data to fill
        to_use : str
            name of the column to use, in combination with the given ratio, to
            fill in some of the missing data
        ratio : float
            ratio to multiply the to_use data with to obtain data for filling in
            in the to_fill data column
        arange : array of two values
            the range within which missing/filtered values need to be replaced
        filtered_only : boolean
            if True, fills only the datapoints labeled as filtered. If False,
            fills/replaces all datapoints in the given range
        plot : bool
            whether or not to plot the new dataset
        clear : bool
            whether or not to clear the previoulsy filled values and start from
            the self.meta_valid dataset again for this particular dataseries.

        Returns
        -------
        None;
        creates/updates self.filled, containing the adjusted dataset and updates
        meta_filled with the correct labels.
        """
        ###
        # CHECKS
        ###
        self._plot = 'filled'
        wn.warn('When making use of filling functions, please make sure to '+ \
        'start filling small gaps and progressively move to larger gaps. This '+ \
        'ensures the proper working of the package algorithms.')
        if clear:
            self._reset_meta_filled(to_fill)
        self.meta_filled = self.meta_filled.reindex(self.index(),fill_value='!!')

        if not to_fill in self.meta_filled.columns:
            # if the to_fill column doesn't exist yet in the meta_filled dataset,
            # add it, and fill it with the meta_valid values; if this last one
            # doesn't exist yet, create it with 'original' tags.
            try:
                self.meta_filled[to_fill] = self.meta_valid[to_fill]
            except:
                self.add_to_meta_valid([to_fill])
                self.meta_filled[to_fill] = self.meta_valid[to_fill]
        else:
            # where the meta_filled dataset contains original values, update with
            # the values from meta_valid; in case a filling round was done before
            # any filtering; not supposed to happen, but cases exist.
            try:
                self.meta_filled[to_fill].loc[self.meta_filled[to_fill]=='original'] = \
                self.meta_valid[to_fill].loc[self.meta_filled[to_fill]=='original']
            except:
                self.add_to_meta_valid([to_fill])
                self.meta_filled[to_fill].loc[self.meta_filled[to_fill]=='original'] = \
                self.meta_valid[to_fill].loc[self.meta_filled[to_fill]=='original']

        if not to_fill in self.filled:
            self.add_to_filled([to_fill])

        # Give warning when replacing data from rain events and at the same time
        # check if arange has the right type
        try:
            rain = (self.data_type == 'WWTP') and \
                   (self.highs['highs'].loc[arange[0]:arange[1]].sum() > 1)
        except TypeError:
            raise TypeError("Slicing not possible for index type " + \
            str(type(self.data.index[0])) + " and arange argument type " + \
            str(type(arange[0])) + ". Try changing the type of the arange " + \
            "values to one compatible with " + str(type(self.data.index[0])) + \
            " slicing.")

        if rain :
            wn.warn('Data points obtained during a rain event will be replaced. '+ \
            'Make sure you are confident in this replacement method for the '+ \
            'filling of gaps in the data during rain events.')

        ###
        # FILLING
        ###
        if filtered_only:
            indexes_to_replace = pd.DataFrame(self.meta_valid.\
                                    loc[arange[0]:arange[1]].\
                                    loc[self.meta_filled[to_fill] == 'filtered'].index.values)
            self.filled.loc[indexes_to_replace[0],to_fill] = self.data.loc[indexes_to_replace[0],to_use]*ratio
            # Adjust in the self.meta_filled dataframe
            self.meta_filled.loc[indexes_to_replace[0],to_fill] = 'filled_ratio'

        if not filtered_only:
            self.filled.loc[arange[0]:arange[1],to_fill] = self.data.loc[arange[0]:arange[1],to_use]*ratio
            # Adjust in the self.meta_valid dataframe
            self.meta_filled[to_fill].loc[arange[0]:arange[1]] = 'filled_ratio'

        if plot:
            self.plot_analysed(to_fill)

        return None

[docs]    def fill_missing_correlation(self,to_fill,to_use,arange,corr_range,
                                 zero_intercept=False,filtered_only=True,
                                 plot=False,clear=False):
        """
        Fills the missing values in a dataset (to_fill), based on the correlation
        this data shows when comparing to other data (to_use). This happens within
        the range given by arange.

        Parameters
        ----------
        to_fill : str
            name of the column with data to fill
        to_use : str
            name of the column to use, in combination with the given ratio, to
            fill in some of the missing data
        arange : array of two values
            the range within which missing/filtered values need to be replaced
        corr_range : array of two values
            the range to use for the calculation of the correlation
        filtered_only : boolean
            if True, fills only the datapoints labeled as filtered. If False,
            fills/replaces all datapoints in the given range
        plot : bool
            whether or not to plot the new dataset
        clear : bool
            whether or not to clear the previoulsy filled values and start from
            the self.meta_valid dataset again for this particular dataseries.

        Returns
        -------
        None;
        creates/updates self.filled, containing the adjusted dataset and updates
        meta_filled with the correct labels.
        """
        ###
        # CHECKS
        ###
        self._plot = 'filled'
        wn.warn('When making use of filling functions, please make sure to '+ \
        'start filling small gaps and progressively move to larger gaps. This '+ \
        'ensures the proper working of the package algorithms.')
        if clear:
            self._reset_meta_filled(to_fill)
        self.meta_filled = self.meta_filled.reindex(self.index(),fill_value='!!')

        if not to_fill in self.meta_filled.columns:
            # if the to_fill column doesn't exist yet in the meta_filled dataset,
            # add it, and fill it with the meta_valid values; if this last one
            # doesn't exist yet, create it with 'original' tags.
            try:
                self.meta_filled[to_fill] = self.meta_valid[to_fill]
            except:
                self.add_to_meta_valid([to_fill])
                self.meta_filled[to_fill] = self.meta_valid[to_fill]
        else:
            # where the meta_filled dataset contains original values, update with
            # the values from meta_valid; in case a filling round was done before
            # any filtering; not supposed to happen, but cases exist.
            try:
                self.meta_filled[to_fill].loc[self.meta_filled[to_fill]=='original'] = \
                self.meta_valid[to_fill].loc[self.meta_filled[to_fill]=='original']
            except:
                self.add_to_meta_valid([to_fill])
                self.meta_filled[to_fill].loc[self.meta_filled[to_fill]=='original'] = \
                self.meta_valid[to_fill].loc[self.meta_filled[to_fill]=='original']

        if not to_fill in self.filled:
            self.add_to_filled([to_fill])

        # Give warning when replacing data from rain events and at the same time
        # check if arange has the right type
        try:
            rain = (self.data_type == 'WWTP') and \
                   (self.highs['highs'].loc[arange[0]:arange[1]].sum() > 1)
        except TypeError:
            raise TypeError("Slicing not possible for index type " + \
            str(type(self.data.index[0])) + " and arange argument type " + \
            str(type(arange[0])) + ". Try changing the type of the arange " + \
            "values to one compatible with " + str(type(self.data.index[0])) + \
            " slicing.")

        if rain :
            wn.warn('Data points obtained during a rain event will be replaced.' + \
            ' Make sure you are confident in this replacement method for the' + \
            ' filling of gaps in the data during rain events.')

        ###
        # CALCULATIONS
        ###
        slope,intercept,r_sq = self.get_correlation(to_use,to_fill,corr_range,
                                                    zero_intercept=zero_intercept)

        if intercept < 0:
                wn.warn('The intercept was calculated to be lower than '+ \
                '0, which might lead to negative data values when data is replaced '+ \
                'based on this correlation. Try setting "zero_intercept" to True '+ \
                'to avoid.')
        ###
        # FILLING
        ###
        if filtered_only:
            indexes_to_replace = pd.DataFrame(self.meta_valid.\
                                            loc[arange[0]:arange[1]].\
                                            loc[self.meta_valid[to_fill] == 'filtered'].index.values)
            self.filled.loc[indexes_to_replace[0],to_fill] = \
                            self.data.loc[indexes_to_replace[0],to_use]*slope + intercept
            # Adjust in the self.meta_filled dataframe
            self.meta_filled.loc[indexes_to_replace[0],to_fill] = 'filled_correlation'

        if not filtered_only:
            self.filled.loc[arange[0]:arange[1],to_fill] = \
                            self.data.loc[arange[0]:arange[1],to_use]*slope + intercept
            # Adjust in the self.meta_filled dataframe
            self.meta_filled[to_fill].loc[arange[0]:arange[1]] = 'filled_correlation'

        if plot:
            self.plot_analysed(to_fill)

        return None

    def fill_missing_standard(self,to_fill,arange,filtered_only=True,plot=False,
                              clear=False):
        """
        Fills the missing values in a dataset (to_fill), based on the average
        daily profile calculated by calc_daily_profile(). This happens within
        the range given by arange.

        Parameters
        ----------
        to_fill : str
            name of the column with data to fill
        arange : array of two values
            the range within which missing/filtered values need to be replaced
        filtered_only : boolean
            if True, fills only the datapoints labeled as filtered. If False,
            fills/replaces all datapoints in the given range
        plot : bool
            whether or not to plot the new dataset
        clear : bool
            whether or not to clear the previoulsy filled values and start from
            the self.meta_valid dataset again for this particular dataseries.

        Returns
        -------
        None;
        creates/updates self.filled, containing the adjusted dataset and updates
        meta_filled with the correct labels.
        """
        ###
        # CHECKS
        ###
        self._plot = 'filled'
        wn.warn('When making use of filling functions, please make sure to '+ \
        'start filling small gaps and progressively move to larger gaps. This '+ \
        'ensures the proper working of the package algorithms.')

        # several checks on availability of the right columns in the necessary
        # dataframes/dictionaries
        if clear:
            self._reset_meta_filled(to_fill)
        self.meta_filled = self.meta_filled.reindex(self.index(),fill_value='!!')

        if not to_fill in self.meta_filled.columns:
            # if the to_fill column doesn't exist yet in the meta_filled dataset,
            # add it, and fill it with the meta_valid values; if this last one
            # doesn't exist yet, create it with 'original' tags.
            try:
                self.meta_filled[to_fill] = self.meta_valid[to_fill]
            except:
                self.add_to_meta_valid([to_fill])
                self.meta_filled[to_fill] = self.meta_valid[to_fill]
        else:
            # where the meta_filled dataset contains original values, update with
            # the values from meta_valid; in case a filling round was done before
            # any filtering; not supposed to happen, but cases exist.
            try:
                self.meta_filled[to_fill].loc[self.meta_filled[to_fill]=='original'] = \
                self.meta_valid[to_fill].loc[self.meta_filled[to_fill]=='original']
            except:
                self.add_to_meta_valid([to_fill])
                self.meta_filled[to_fill].loc[self.meta_filled[to_fill]=='original'] = \
                self.meta_valid[to_fill].loc[self.meta_filled[to_fill]=='original']

        if not to_fill in self.filled:
            self.add_to_filled([to_fill])

        try:
            if not isinstance(self.daily_profile,dict):
                raise TypeError("self.daily_profile should be a dictionary Type. \
                Run calc_daily_profile() to get an average daily profile for " + to_fill)
        except AttributeError:
            raise AttributeError("self.daily_profile doesn't exist yet, meaning "+
            "there is no data available to replace other data with. Run "+
            "calc_daily_profile() to get an average daily profile for " + to_fill)

        # Give warning when replacing data from rain events and at the same time
        # check if arange has the right type
        try:
            rain = (self.data_type == 'WWTP') and \
                   (self.highs['highs'].loc[arange[0]:arange[1]].sum() > 1)
        except TypeError:
            raise TypeError("Slicing not possible for index type " + \
            str(type(self.data.index[0])) + " and arange argument type " + \
            str(type(arange[0])) + ". Try changing the type of the arange " + \
            "values to one compatible with " + str(type(self.data.index[0])) + \
            " slicing.")

        if rain :
            wn.warn('Data points obtained during a rain event will be replaced. '+ \
            'Make sure you are confident in this replacement method for the '+ \
            'filling of gaps in the data during rain events.')

        ###
        # CALCULATIONS
        ###
        daily_profile = pd.DataFrame([self.daily_profile[to_fill].index.values,
                                      self.daily_profile[to_fill]['avg'].values])
        daily_profile = daily_profile.transpose()
        daily_profile.index = self.daily_profile[to_fill].index
        daily_profile.columns = ['time','data']

        ###
        # FILLING
        ###
        if filtered_only:
            indexes_to_replace = pd.DataFrame(self.meta_filled.\
                                            loc[arange[0]:arange[1]].\
                                            loc[self.meta_filled[to_fill] == 'filtered'].index.values,
                                            columns=['indexes'])
        elif not filtered_only:
            indexes_to_replace = pd.DataFrame(self.meta_filled.loc[arange[0]:arange[1]].index.values,
                                              columns=['indexes'])

        if isinstance(self.data.index[0],dt.datetime):
            indexes_to_replace['day'] = pd.Index(indexes_to_replace['indexes']).time
            indexes_to_replace['values'] = [daily_profile['data'][index_value] for index_value in indexes_to_replace['day']]
        elif isinstance(self.data.index[0],float):
            indexes_to_replace['day'] = indexes_to_replace['indexes'].apply(lambda x: x-int(x))
            indexes_to_replace['time_index'] = indexes_to_replace['day'].apply(find_nearest_time,args=(daily_profile,'time'))
            indexes_to_replace['values'] = indexes_to_replace['time_index'].apply(vlookup_day,args=(daily_profile,'data'))

        self.filled[to_fill][indexes_to_replace['indexes']] = indexes_to_replace['values'].values

        # Adjust in the self.meta_valid dataframe
        self.meta_filled[to_fill][indexes_to_replace['indexes']] = 'filled_average_profile'

        if plot:
            self.plot_analysed(to_fill)

        return None

[docs]    def fill_missing_model(self,to_fill,to_use,arange,filtered_only=True,
                           unit='d',plot=False,clear=False):
        """
        Fills the missing values in a dataset (to_fill), based on the modeled
        values given in to_use. This happens within the range given by arange.

        Parameters
        ----------
        to_fill : str
            name of the column with data to fill
        to_use : pd.Series
            pandas series containing the modeled data with which the filtered
            data can be replaced
        arange : array of two values
            the range within which missing/filtered values need to be replaced
        filtered_only : boolean
            if True, fills only the datapoints labeled as filtered. If False,
            fills/replaces all datapoints in the given range
        unit : str
            the unit in which the modeled values are given; datetime values will
            be converted to values with that unit. Possible: sec, min, hr, d
        plot : bool
            whether or not to plot the new dataset
        clear : bool
            whether or not to clear the previoulsy filled values and start from
            the self.meta_valid dataset again for this particular dataseries.

        Returns
        -------
        None;
        creates/updates self.filled, containing the adjusted dataset and updates
        meta_filled with the correct labels.
        """
        ###
        # CHECKS
        ###
        self._plot = 'filled'
        wn.warn('When making use of filling functions, please make sure to '+ \
        'start filling small gaps and progressively move to larger gaps. This '+ \
        'ensures the proper working of the package algorithms.')

        # several checks on availability of the right columns in the necessary
        # dataframes/dictionaries
        if clear:
            self._reset_meta_filled(to_fill)
        self.meta_filled = self.meta_filled.reindex(self.index(),fill_value='!!')

        if not to_fill in self.meta_filled.columns:
            # if the to_fill column doesn't exist yet in the meta_filled dataset,
            # add it, and fill it with the meta_valid values; if this last one
            # doesn't exist yet, create it with 'original' tags.
            try:
                self.meta_filled[to_fill] = self.meta_valid[to_fill]
            except:
                self.add_to_meta_valid([to_fill])
                self.meta_filled[to_fill] = self.meta_valid[to_fill]
        else:
            # where the meta_filled dataset contains original values, update with
            # the values from meta_valid; in case a filling round was done before
            # any filtering; not supposed to happen, but cases exist.
            try:
                self.meta_filled[to_fill].loc[self.meta_filled[to_fill]=='original'] = \
                self.meta_valid[to_fill].loc[self.meta_filled[to_fill]=='original']
            except:
                self.add_to_meta_valid([to_fill])
                self.meta_filled[to_fill].loc[self.meta_filled[to_fill]=='original'] = \
                self.meta_valid[to_fill].loc[self.meta_filled[to_fill]=='original']

        if not to_fill in self.filled:
            self.add_to_filled([to_fill])

        # Give warning when replacing data from rain events and at the same time
        # check if arange has the right type
        try:
            rain = (self.data_type == 'WWTP') and \
                   (self.highs['highs'].loc[arange[0]:arange[1]].sum() > 1)
        except TypeError:
            raise TypeError("Slicing not possible for index type " + \
            str(type(self.data.index[0])) + " and arange argument type " + \
            str(type(arange[0])) + ". Try changing the type of the arange " + \
            "values to one compatible with " + str(type(self.data.index[0])) + \
            " slicing.")

        if rain :
            wn.warn('Data points obtained during a rain event will be replaced. '+ \
            'Make sure you are confident in this replacement method for the '+ \
            'filling of gaps in the data during rain events.')

        ###
        # CALCULATIONS
        ###
        #model_values = to_use.name
        model_values = pd.DataFrame(index = to_use.index)
        model_values['time'] = to_use.index
        model_values['data'] = to_use.values

        ###
        # FILLING
        ###
        if filtered_only:
            indexes_to_replace = pd.DataFrame(self.meta_filled.\
                                            loc[arange[0]:arange[1]].\
                                            loc[self.meta_filled[to_fill] == 'filtered'].index.values,
                                            columns=['indexes'])
        if not filtered_only:
            indexes_to_replace = pd.DataFrame(self.meta_filled.\
                                            loc[arange[0]:arange[1]].index.values,
                                            columns=['indexes'])

        if not isinstance(model_values['time'][0],type(self.data.index[0])):
            # if datatype of time of modeled vs data values doesn't match, convert to absolute values
            # (floats)
            try:
                indexes_to_replace['abs_indexes'] = absolute_to_relative(indexes_to_replace['indexes'],
                                                    start_date=self.data.index[0],unit=unit)
                indexes_to_replace['time_index'] = indexes_to_replace['abs_indexes'].\
                                                    apply(find_nearest_time,args=(model_values,'time'))
            except(IndexError):
                raise IndexError('No indexes were found to replace. Check the '+ \
                'range in which you want to replace values, or check if filtered '+ \
                'values actually exist in the meta_filled dataset.')

        else:
            indexes_to_replace['time_index'] = indexes_to_replace['indexes'].\
                                               apply(find_nearest_time,args=(model_values,'time'))

        indexes_to_replace['values'] = indexes_to_replace['time_index'].apply(vlookup_day,args=(model_values,'data'))

        self.filled[to_fill][indexes_to_replace['indexes']] = indexes_to_replace['values'].values
        # Adjust in the self.meta_valid dataframe
        self.meta_filled[to_fill][indexes_to_replace['indexes']] = 'filled_infl_model'

            #self.filled.loc[arange[0]:arange[1],to_fill] = to_use.values
            # Adjust in the self.meta_valid dataframe
            #self.meta_filled.loc[arange[0]:arange[1],to_fill] = 'filled_model'

        if plot:
            self.plot_analysed(to_fill)

        return None

[docs]    def fill_missing_daybefore(self,to_fill,arange,range_to_replace=[1,4],
                               filtered_only=True,plot=False,clear=False):
        """
        Fills the missing values in a dataset (to_fill), based on the data values
        from the day before the range starts. These data values are based on
        the self.filled dataset and therefor can contain filled datapoints as well.
        This happens within the range given by arange.
        !! IMPORTANT !!
        This function will not work on datasets with non-equidistant data points!

        Parameters
        ----------
        to_fill : str
            name of the column with data to fill
        arange : array of two values
            the range within which missing/filtered values need to be replaced
        range_to_replace : array of two int/float values
            the minimum and maximum amount of time (i.e. min and max size of
            gaps in data) where missing datapoints can be replaced using this
            function, i.e. using values of the last day before measurements
            went bad.
        filtered_only : boolean
            if True, fills only the datapoints labeled as filtered. If False,
            fills/replaces all datapoints in the given range
        plot : bool
            whether or not to plot the new dataset
        clear : bool
            whether or not to clear the previoulsy filled values and start from
            the self.meta_valid dataset again for this particular dataseries.

        Returns
        -------
        None;
        creates/updates self.filled, containing the adjusted dataset and updates
        meta_filled with the correct labels.
        """
        ###
        # CHECKS
        ###
        self._plot = 'filled'
        wn.warn('When making use of filling functions, please make sure to '+ \
        'start filling small gaps and progressively move to larger gaps. This '+ \
        'ensures the proper working of the package algorithms.')
        # index checks
        #if arange[0] < 1 or arange[1] > self.index()[-1]:
        #    raise IndexError('Index out of bounds. Check whether the values of \
        #    "arange" are within the index range of the data. Mind that the first \
        #    day of data cannot be replaced with this algorithm!')

        # several checks on availability of the right columns in the necessary
        # dataframes/dictionaries
        if clear:
            self._reset_meta_filled(to_fill)
        self.meta_filled = self.meta_filled.reindex(self.index(),fill_value='!!')

        if not to_fill in self.meta_filled.columns:
            # if the to_fill column doesn't exist yet in the meta_filled dataset,
            # add it, and fill it with the meta_valid values; if this last one
            # doesn't exist yet, create it with 'original' tags.
            try:
                self.meta_filled[to_fill] = self.meta_valid[to_fill]
            except:
                self.add_to_meta_valid([to_fill])
                self.meta_filled[to_fill] = self.meta_valid[to_fill]
        else:
            # where the meta_filled dataset contains original values, update with
            # the values from meta_valid; in case a filling round was done before
            # any filtering; not supposed to happen, but cases exist.
            try:
                self.meta_filled[to_fill].loc[self.meta_filled[to_fill]=='original'] = \
                self.meta_valid[to_fill].loc[self.meta_filled[to_fill]=='original']
            except:
                self.add_to_meta_valid([to_fill])
                self.meta_filled[to_fill].loc[self.meta_filled[to_fill]=='original'] = \
                self.meta_valid[to_fill].loc[self.meta_filled[to_fill]=='original']

        if not to_fill in self.filled:
            self.add_to_filled([to_fill])

        # Give warning when replacing data from rain events and at the same time
        # check if arange has the right type
        try:
            rain = (self.data_type == 'WWTP') and \
                   (self.highs['highs'].loc[arange[0]:arange[1]].sum() > 1)
        except TypeError:
            raise TypeError("Slicing not possible for index type " + \
            str(type(self.data.index[0])) + " and arange argument type " + \
            str(type(arange[0])) + ". Try changing the type of the arange " + \
            "values to one compatible with " + str(type(self.data.index[0])) + \
            " slicing.")

        if rain :
            wn.warn('Data points obtained during a rain event will be replaced. '+ \
            'Make sure you are confident in this replacement method for the '+ \
            'filling of gaps in the data during rain events.')

        ###
        # CALCULATIONS
        ###
        # Get data to fill the missing data with, i.e. data from the day before,
        # and convert indices to relative ones per day; parallel for
        # self.meta_filled

        # check if arange[0] is equal to beginning of the dataset; if this is
        # the case, change it to one day further for the coming code to work


        if isinstance(self.data.index[0],dt.datetime):
            oneday = dt.timedelta(1)
            if arange[0] < self.time[0]+oneday:
                raise IndexError("No data from the day before available, "+\
                                 "adjust the range for replacement.")
                #arange[0] = arange[0] + oneday
                #wn.warn("The range for replacement given in the arange argument "+\
                #        "included the first day of data. The range was adjusted to"+\
                #        "start one day later.")
            time = pd.Series((self.filled[to_fill][arange[0]-oneday:arange[0]].index).time)
        elif isinstance(self.data.index[0],float):
            oneday = 1
            if arange[0] < self.time[0]+oneday:
                raise IndexError("No data from the day before available, "+\
                                 "adjust the range for replacement.")
                #arange[0] = arange[0] + oneday
                #wn.warn("The range for replacement given in the arange argument "+\
                #        "included the first day of data. The range was adjusted to"+\
                #        "start one day later.")
            time = pd.Series(self.filled[to_fill][arange[0]-oneday:arange[0]].index).apply(lambda x: x-int(x))

        day_before = pd.DataFrame(self.filled[to_fill][arange[0]-oneday:arange[0]].values,
                                  index=time)
        day_before.columns = ['data']
        day_before = day_before.reset_index().drop_duplicates('index',keep='first').\
                     set_index('index')

        range_to_replace[0] = range_to_replace[0] * len(day_before)
        range_to_replace[1] = range_to_replace[1] * len(day_before)

        # Create a mask to replace the filtered datapoints with nan-values,
        # if consecutive occurence lower than range_
        mask_df = pd.DataFrame(index = self.meta_valid[arange[0]:arange[1]].index)
        mask_df['count'] = (self.meta_valid[to_fill][arange[0]:arange[1]] != self.meta_valid[to_fill][arange[0]:arange[1]].\
                            shift()).astype(int).cumsum().astype(str)
        group = mask_df.groupby('count').size()
        group.index = mask_df.groupby('count').size().index.astype(str)

        # Compare the values in 'count' with the ones in the group-by object.
        # mask_df now contains the amount of consecutive true or false datapoints,
        # for every datapointday
        replace_dict = {'count':dict(group)}
        mask_df = mask_df.replace(replace_dict)

        ###
        # FILLING
        ###
        # Based on the mask and whether a datapoint is filtered, replace with
        # nan values
        if filtered_only:
            filtered_based = pd.DataFrame(self.meta_valid.loc[arange[0]:arange[1]].\
                                           loc[self.meta_filled[to_fill] == 'filtered'].index.values,
                                           columns = ['indexes'])
        if not filtered_only:
            filtered_based = pd.DataFrame(self.meta_filled.loc[arange[0]:arange[1]].index.values,
                                           columns=['indexes'])

        mask_based = pd.DataFrame(mask_df.loc[mask_df['count'] < range_to_replace[1]].\
                                          loc[mask_df['count'] > range_to_replace[0]].\
                                          index.values,columns=['indexes'])
        #mask_based.columns = ['indexes']
        # if all values are still original in meta_valid, don't use mask_based, because this
        # can contain no values and make that nothing is filled
        if len(self.meta_valid) == len(self.meta_valid[self.meta_valid[to_fill]=='original']):
            indexes_to_replace = filtered_based
        else:
            indexes_to_replace = pd.merge(filtered_based,mask_based,how='inner')

        # look up the values to replace with in the day_before dataset
        if isinstance(self.data.index[0],dt.datetime):
            indexes_to_replace['day'] = pd.Index(indexes_to_replace['indexes']).time
            indexes_to_replace['values'] = [day_before['data'][index_value] for index_value in indexes_to_replace['day']]
        elif isinstance(self.data.index[0],float):
            indexes_to_replace['day'] = indexes_to_replace['indexes'].apply(lambda x: x-int(x))
            indexes_to_replace['time_index'] = indexes_to_replace['day'].apply(find_nearest_time,args=(day_before,'time'))
            indexes_to_replace['values'] = indexes_to_replace['time_index'].apply(vlookup_day,args=(day_before,'data'))

        self.filled[to_fill][indexes_to_replace['indexes']] = indexes_to_replace['values'].values

        # Adjust in the self.meta_valid dataframe
        self.meta_filled[to_fill][indexes_to_replace['indexes']] = 'filled_profile_day_before'

        if plot:
            self.plot_analysed(to_fill)

        return None


    #####################
    ###   CHECKING
    #####################

    def _create_gaps(self,data_name,range_,number,max_size,reset=False,user_output=False):
        """
        Randomly creates gaps in the data by introducing fake 'filtered' tags in
        meta_valid and setting the corresponding data values to 0. This
        artificial creation of gaps can be filled later to test the reliability
        of the filling algorithms.

        Parameters
        ----------
        data_name : string
            name of the column containing the data to create gaps in
        range_ : 2-element array
            the range within which gaps need to be created; both values need
            to be present in the index of meta_valid
        number : int
            number of gaps to create; 0 leaves data and tags untouched
        max_size : int
            upper bound for the size of the gaps, expressed in data points;
            sizes are drawn with random.randrange(0,max_size), i.e. from 0 up
            to max_size-1
        reset : boolean
            if True, self._reset_meta_valid(data_name) is called first
        user_output : boolean
            if True, prints the percentage of datapoints still tagged
            'original' after creating the gaps

        Returns
        -------
        None; updates self.meta_valid with 'fake' tags creating artificial
        gaps in the data and sets self.data[data_name] to 0 at those positions.

        !!!
        Watch out when using this on the original dataset, as tags might be
        changed or removed when using this function: the complete rows of
        meta_valid (all columns) are set to 'filtered' at the gap positions.
        !!!
        """
        # create a new meta_valid dataframe with original values
        if reset:
            self._reset_meta_valid(data_name)

        # get index locations of range_
        try:
            list_ = list(self.meta_valid.index)
            ilocs = [list_.index(range_[0]),
                     list_.index(range_[1])]

        except TypeError:
            raise TypeError("Slicing not possible for index type " + \
            str(type(self.meta_valid.index[0])) + " and range_ argument type " + \
            str(type(range_[0])) + ". Try changing the type of the range_ " + \
            "values to one compatible with " + str(type(self.meta_valid.index[0])) + \
            " slicing.")

        # create random positions where to create gaps
        positions = [rn.randrange(ilocs[0],ilocs[1]) for _ in range(number)]

        # create random sizes, one per position
        sizes = [rn.randrange(0,max_size) for _ in range(len(positions))]

        # define integer indexes where gaps need to be created (i.e. 'filtered'
        # in meta_valid)
        locs = [np.arange(x,x+y) for x,y in zip(positions,sizes)]
        if locs:
            # positions running past range_ are moved onto the range limits
            locations = np.clip(np.concatenate(locs),ilocs[0],ilocs[1])
        else:
            # no gaps requested; np.concatenate cannot handle an empty list
            locations = np.array([],dtype=int)

        # create gaps by replacing data with 0; not nan, because this will
        # complicate comparison with filled values when using check_filling_error.
        # One positional write on the frame itself (no chained indexing), so
        # the zeros always end up in self.data.
        self.data.iloc[locations,self.data.columns.get_loc(data_name)] = 0
        # create gaps in meta_valid
        # NOTE(review): this tags every column of meta_valid, not only
        # data_name - confirm this is intended.
        self.meta_valid.iloc[locations] = 'filtered'

        if user_output:
            left = self.meta_valid.groupby(data_name).size()['original']*100/len(self.meta_valid)
            print(str(left)+" % of datapoints left after creating gaps")

    def _calculate_filling_error(self,data_name,filling_function,test_data_range,
                                 nr_small_gaps=0,max_size_small_gaps=0,
                                 nr_large_gaps=0,max_size_large_gaps=0,
                                 **options):
        """
        Calculates a filling error based on the artificial and random creation of
        gaps in a dataset, subsequent filling of those gaps with a defined
        algorithm and comparison of the filling results with the original data.
        Because this happens randomly, results differ every time this function
        is used. To get an average of the errors, run check_filling_error.

        Parameters
        ----------
        please refer to the check_filling_error docstring for the parameter
        definitions.

        Returns
        -------
        Average relative deviation (in %) between the filled and the original
        values on the artificially filtered datapoints, or None when that
        average is exactly 100 (i.e. no data were filled).

        Raises
        ------
        TypeError
            if the filling function is unknown, misses arguments in options or
            fails for any other reason; the original exception is attached as
            __cause__.
        """

        orig = self.__class__(self.data[test_data_range[0]:test_data_range[1]].copy())
        gaps = self.__class__(self.data[test_data_range[0]:test_data_range[1]].copy())
        gaps.get_highs(data_name,0.9,[test_data_range[0],test_data_range[1]])

        # create gaps; the first call resets meta_valid, a second call (when
        # both small and large gaps are requested) adds to the first set
        if nr_small_gaps == 0:
            gaps._create_gaps(data_name,options['arange'],nr_large_gaps,max_size_large_gaps,reset=True)
        elif nr_large_gaps == 0:
            gaps._create_gaps(data_name,options['arange'],nr_small_gaps,max_size_small_gaps,reset=True)
        else:
            gaps._create_gaps(data_name,options['arange'],nr_small_gaps,max_size_small_gaps,reset=True)
            gaps._create_gaps(data_name,options['arange'],nr_large_gaps,max_size_large_gaps,reset=False)

        # create a column in gaps.filled containing the artificial gaps; this
        # avoids calling of the add_to_filled function in the filling functions
        # which would reset gaps.filled to the original dataset and make
        # comparing after data imputation impossible
        gaps.filled = pd.DataFrame(gaps.data[data_name].copy(),columns = [data_name],
                                   index = gaps.data.index)

        # fill gaps
        try:
            if filling_function == 'fill_missing_interpolation':
                gaps.fill_missing_interpolation(options['to_fill'],options['range_'],
                                                options['arange'])

            elif filling_function == 'fill_missing_ratio':
                gaps.fill_missing_ratio(options['to_fill'],options['to_use'],
                                        options['ratio'],options['arange'])

            elif filling_function == 'fill_missing_correlation':
                gaps.fill_missing_correlation(options['to_fill'],options['to_use'],
                                              options['arange'],options['corr_range'],
                                              options['zero_intercept'])

            elif filling_function == 'fill_missing_standard':
                gaps.calc_daily_profile(options['to_fill'],options['arange'])
                gaps.fill_missing_standard(options['to_fill'],options['arange'])

            elif filling_function == 'fill_missing_model':
                gaps.fill_missing_model(options['to_fill'],options['to_use'],
                                        options['arange'])

            elif filling_function == 'fill_missing_daybefore':
                # work on a copy of the range, because otherwise the object in
                # options keeps on changing in every for-iteration of the
                # check_filling_error function
                arange = [options['arange'][0],options['arange'][1]]
                # check if there is a 'day before' to do filling; this will not be
                # the case, because length of the dataset and to_fill range are the
                # same, but checking in this way still needs to happen because of
                # the for-loop in the check_filling_error function
                if isinstance(gaps.time[0],dt.datetime):
                    oneday = dt.timedelta(1)
                    if options['arange'][0] < gaps.time[0]+oneday:
                        arange[0] = options['arange'][0] + oneday
                elif isinstance(gaps.time[0],float):
                    oneday = 1
                    if options['arange'][0] < gaps.time[0]+oneday:
                        arange[0] = options['arange'][0] + oneday

                gaps.fill_missing_daybefore(options['to_fill'],arange,
                                            list(options['range_to_replace']))

            else:
                raise ValueError("Entered filling function is not available for testing.")

        except Exception as err:
            # keep raising TypeError for callers, but chain the real cause so
            # it shows up in the traceback instead of being swallowed
            raise TypeError("Filling function could not be executed. Check "+\
                            "docstring of the filling function to provide "+\
                            "appropriate arguments.") from err

        indexes_to_compare = gaps.meta_valid[gaps.meta_valid[data_name]=='filtered'].index
        deviations = (abs(orig.data[data_name][indexes_to_compare] -
                          gaps.filled[data_name][indexes_to_compare])/ \
                      orig.data[data_name][indexes_to_compare])
        # drop inf values and calculate average
        avg_deviation = deviations.drop(deviations[deviations.values == np.inf].index).mean()*100

        if avg_deviation == 100.000000:
            # if avg deviation is 100, this means that gaps.filled was 0 on all
            # indexes to compare, which is exactly the same as was defined
            # before the filling, i.e. no data were filled.
            return None
        else:
            return avg_deviation

[docs]    def check_filling_error(self,nr_iterations,data_name,filling_function,
                            test_data_range,
                            nr_small_gaps=0,max_size_small_gaps=0,
                            nr_large_gaps=0,max_size_large_gaps=0,
                            **options):
        """
        Uses the _calculate_filling_error function (refer to that docstring for
        more specific info) to calculate the error on the data points that are
        filled with a certain algorithm.
        Because _calculate_filling_error inserts random gaps, results differ
        every time it is used. Check_filling_error averages this out.

        Parameters
        ----------
        nr_iterations : int
            The number of iterations to run for the calculation of the imputation
            error
        data_name : string
            name of the column containing the data the filling reliability needs
            to be checked for.
        filling function : str, wdata filling function
            the name of the filling function to be tested for reliability
        test_data_range : array of two values
            an array containing the start and end point of the test data to be used.
            IMPORTANT: for testing filling with correlation, this range needs to
            include the range for correlation calculation and the filling range.
        nr_small_gaps / nr_large_gaps: int
            the number of small/large gaps to create in the dataset for testing
        max_size_small_gaps / max_size_large_gaps: int
            the maximum size of the gaps inserted in the data, expressed in data
            points
        **options:
            Arguments for the filling function; refer to the relevant filling
            function to know what arguments to give

        Note
        ------
        When checking for the error on data filling, a period (arange argument)
        with mostly reliable data should be used. If for example large gaps are
        already present in the given data, this will heavily influence the
        returned error, as filled values will be compared with the values from
        the data gap.

        Returns
        -------
        None
            adds the average filling error the self.filling_error dataframe

        """
        # shut off warnings, to avoid e.g. warning about replacing datapoints
        # in wet weather
        wn.filterwarnings("ignore")

        if nr_small_gaps == 0 and nr_large_gaps == 0 :
                raise ValueError("No information was provided to make the gaps "+\
                                 "with. Please specify the number of small or "+\
                                 "large gaps you want to create for testing")

        filling_errors = pd.Series([])
        for iteration in range(0,nr_iterations):
            iter_error = self._calculate_filling_error(data_name,filling_function,test_data_range,
                                                       nr_small_gaps=nr_small_gaps,
                                                       max_size_small_gaps=max_size_small_gaps,
                                                       nr_large_gaps=nr_large_gaps,
                                                       max_size_large_gaps=max_size_large_gaps,
                                                       **options)
            #print(options_filling_function)
            if iter_error == None:
                # turn warnings on again
                wn.filterwarnings("always")
                raise ValueError("Checking of the filling function could not "+\
                                 "be executed. Check docstring of the filling "+\
                                 "function to provide appropriate arguments.")

            filling_errors = filling_errors.append(pd.Series([iter_error]))

        avg = filling_errors.dropna().mean()

        self.filling_error.ix[data_name] = avg
        print('Average deviation of imputed points from the original ones is '+\
              str(avg)+"%. This value is also saved in self.filling_error.")

        # turn warnings on again
        wn.filterwarnings("always")


#==============================================================================
# LOOKUP FUNCTIONS
#==============================================================================

def find_nearest_time(value,df,column):
    """
    Returns the index label of the row in a dataframe for which the value in
    a given column is nearest to a given value

    Parameters
    ----------
    value : float
        time value to find the closest value for in 'df'
    df : pd.Dataframe
        dataframe to use
    column : str
        column to check 'value' against

    Returns
    -------
    the index label (not the integer position) of the row where
    abs(df[column] - value) is smallest, so the result can be used directly
    for label-based lookups with .loc
    """
    # idxmin returns the label on every pandas version; argmin returns the
    # integer position on recent versions
    return (df[column]-value).abs().idxmin()

def vlookup_day(value,df,column):
    """
    Looks up the entry of a dataframe column at a given index label

    Parameters
    ----------
    value : index label
        label of the row to read
    df : pd.Dataframe
        dataframe to use
    column : str
        name of the column to read from

    Returns
    -------
    the content of df[column] at index label 'value'
    """
    column_values = df[column]
    return column_values.loc[value]


####START ADJUSTING HERE NEXT TIME!

def drop_peaks(self,data_name,cutoff,inplace=True,log_file=None):
    """
    Filters out the peaks larger than a cut-off value in a dataseries

    Parameters
    ----------
    data_name : str
        the name of the column to use for the removal of peak values
    cutoff : int
        cut off value to use for the removing of peaks; values with an
        absolute value larger than this cut off will be removed from the data
    inplace : bool
        indicates whether a new object is created and returned or whether
        the operations are executed on the existing dataframe (nothing is
        returned)
    log_file : str
        string containing the directory to a log file to be written out
        when using this function; if None, the number of removed datapoints
        is reported with _print_removed_output instead

    Returns
    -------
    object of the same class as self (if inplace=False)
        built from a copy of the data from which the peak values are removed
    None (if inplace=True)

    Raises
    ------
    TypeError
        if log_file is neither None nor a string; raised before any data is
        removed
    """
    if inplace not in (True,False):
        # neither an in-place nor a copy operation was requested
        return None

    if log_file is not None and not isinstance(log_file,str):
        raise TypeError('Please provide the location of the log file as '+ \
                        'a string type, or leave the argument if no log '+ \
                        'file is needed.')

    original = len(self.data)
    # work on self.data itself or on a copy, depending on inplace
    data = self.data if inplace else self.data.copy()
    data.drop(data[abs(data[data_name]) > cutoff].index,inplace=True)
    data.reset_index(drop=True,inplace=True)
    new = len(data)

    # NOTE(review): _print_removed_output/_log_removed_output are not imported
    # in the visible module header (the data_reading_functions import is
    # commented out) - confirm they are in scope.
    if log_file is None:
        _print_removed_output(original,new)
    else:
        _log_removed_output(log_file,original,new)

    if inplace:
        return None
    # NOTE(review): assumes self has a 'timename' attribute - confirm.
    return self.__class__(data,self.timename)


def _select_slope(self,ydata,down=True,limit=0):#,based_on_max=True):#,bounds=[1,1]):
    """
    Selects down- or upward sloping data from a given dataseries, based on
    the maximum in the dataseries. This requires only one maximum to be
    present in the dataset.

    Note that self.data is overwritten with the selection, in addition to
    the selection being returned wrapped in a new object.

    Parameters
    ----------
    ydata : str
        name of the column containing the data for which slopes, either up
        or down, need to be selected
    down : bool
        if True, the downward slope is selected (the maximum and everything
        after it); if False, the upward slope (everything before the
        maximum, the maximum itself excluded)
    limit : int
        currently not used by the function

    Returns
    -------
    object of the same class as self
        wraps the dataframe from which the non-down or -upward sloping data
        are dropped (index reset to 0..n-1)
    None
        if the selection failed; a message is printed in that case
    """
    #TO BE ADJUSTED BASED ON ALL FUNCTIONS FILE!

    # idxmax() returns an index *label*, whereas the slicing below is
    # positional. Looking the maximum up on a 0..n-1 index yields its
    # position, so dataframes without a default index are handled as well.
    peak_position = self.data[ydata].reset_index(drop=True).idxmax()

    try:
        n_total = len(self.data)
        if down:
            print('Selecting downward slope:',peak_position,
                  'datapoints dropped,',n_total-peak_position,
                  'datapoints left.')
            selection = self.data.iloc[peak_position:]
        else:
            print('Selecting upward slope:',n_total-peak_position,
                  'datapoints dropped,',peak_position,'datapoints left.')
            selection = self.data.iloc[:peak_position]

        self.data = selection.reset_index(drop=True)
        return self.__class__(self.data,self.timename)
    except Exception:
        # Best-effort behaviour is kept (message instead of a crash), but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print('Not enough datapoints left for selection')

    #    elif based_on_max == False:
    #        drop_index = dataframe[ydata].idxmin()
    #        if down == True:
    #            try:
    #                print 'Selecting downward slope:',drop_index+sum(bounds),\
    #                'datapoints dropped,',len(dataframe)-drop_index-sum(bounds),\
    #                'datapoints left.'
    #
    #                dataframe = dataframe[bounds[0]:drop_index-bounds[1]]
    #                dataframe.reset_index(drop=True,inplace=True)
    #                return dataframe
    #            except IndexError:
    #                print 'Not enough datapoints left for selection'
    #
    #        elif down == False:
    #            try:
    #                print 'Selecting upward slope:',len(dataframe)-drop_index+sum(bounds),\
    #                'datapoints dropped,',drop_index-sum(bounds),'datapoints left.'
    #
    #                dataframe = dataframe[drop_index+bounds[0]:-bounds[1]]
    #                dataframe.reset_index(drop=True,inplace=True)
    #                return dataframe
    #            except IndexError:
    #                print 'Not enough datapoints left for selection'
    #

def go_WEST(raw_data,time_data,WEST_name_conversion):
    """
    Builds a WEST compatible dataframe (influent or other inputs): the
    selected raw data columns renamed to their WEST names, preceded by a
    '#t' time column and by a first row (labelled 'units') holding the units.

    Parameters
    ----------
    raw_data : pd.DataFrame
        dataframe containing the raw data columns
    time_data : pd.Series
        time values to be placed in the '#t' column; they are inserted with
        DataFrame.insert, so a Series is aligned on the index of raw_data
    WEST_name_conversion : pd.DataFrame with column names: WEST, units and RAW
        dataframe containing three columns: the column names for the
        WEST-compatible file, the units to appear in the WEST-compatible
        file and the column names of the raw data file.

    Returns
    -------
    pd.DataFrame
        the WEST compatible dataframe; nothing is written to disk by this
        function
    """
    # TODO: reading raw_data from a file name and validating its columns
    # against WEST_name_conversion are not implemented.
    data = raw_data

    west_names = list(WEST_name_conversion['WEST'])
    raw_names = list(WEST_name_conversion['RAW'])
    units = list(WEST_name_conversion['units'])

    WEST_compatible = pd.DataFrame()
    for west_name, raw_name in zip(west_names, raw_names):
        WEST_compatible[west_name] = data[raw_name]

    # One-row frame with the units, using the same (flat) column index as
    # the data so that both line up when stacked.
    units_row = pd.DataFrame([units],index=['units'],
                             columns=WEST_compatible.columns)

    # DataFrame.append no longer exists in pandas >= 2.0; concat is the
    # equivalent operation.
    WEST_compatible = pd.concat([units_row,WEST_compatible])

    WEST_compatible.insert(0,'#t',time_data)
    # The time column must be able to hold the '#d' unit string next to the
    # numeric time values; set it with a single .loc call instead of chained
    # indexing, which may act on a temporary copy.
    WEST_compatible['#t'] = WEST_compatible['#t'].astype(object)
    WEST_compatible.loc['units','#t'] = '#d'
    return WEST_compatible



###############################################################################
##                          HELP FUNCTIONS                                   ##
###############################################################################

def _print_removed_output(original,new,type_='removed'):
    """
    function printing the output of functions that remove datapoints.

    Parameters
    ----------
    original : int
        original length of the dataset
    new : int
        length of the new dataset
    type_ : str
        'removed' or 'dropped'; defaults to 'removed' so that callers
        passing only the two lengths keep working

    Returns
    -------
    None
    """
    print('Original dataset:',original,'datapoints')
    print('New dataset:',new,'datapoints')
    print(original-new,'datapoints ',type_)

def _log_removed_output(log_file,original,new,type_='removed'):
    """
    function writing the output of functions that remove datapoints to a log file.

    The message is appended to the file, preceded by a newline.

    Parameters
    ----------
    log_file : str
        string containing the directory to the log file to be written out
    original : int
        original length of the dataset
    new : int
        length of the new dataset
    type_ : str
        'removed' or 'dropped'; defaults to 'removed' so that callers
        passing only the file and the two lengths keep working

    Returns
    -------
    None
    """
    message = ('\nOriginal dataset: {} datapoints; new dataset: {} datapoints; '
               '{} datapoints {}').format(original,new,original-new,type_)
    # The with-block closes the file even when writing fails.
    with open(log_file,'a') as log:
        log.write(message)

def total_seconds(timedelta_value):
    """
    Returns the result of calling total_seconds() on the given value; helper
    meant to be passed to functions such as Series.map.

    Parameters
    ----------
    timedelta_value : datetime.timedelta or comparable
        any object providing a total_seconds() method

    Returns
    -------
    whatever timedelta_value.total_seconds() returns (a float number of
    seconds for datetime.timedelta)
    """
    return timedelta_value.total_seconds()

def absolute_to_relative(series,start_date,unit='d',decimals=5):
    """
    converts a pandas series with datetime timevalues to relative timevalues
    in the given unit, starting from start_date

    Parameters
    ----------
    series : pd.Series
        series of datetime or comparable values; an index of datetime values
        is accepted as well
    start_date : datetime or comparable
        the moment that corresponds to a relative time of zero
    unit : str
        unit to which to convert the time values (sec, min, hr or d)
    decimals : int
        number of decimals to which the relative time values are rounded

    Returns
    -------
    np.ndarray
        the time values relative to start_date, expressed in the given unit

    Raises
    ------
    ValueError
        if unit is not one of 'sec', 'min', 'hr' or 'd'
    IndexError
        if the passed series is empty
    """
    seconds_per_unit = {'sec': 1, 'min': 60, 'hr': 60*60, 'd': 60*60*24}
    if unit not in seconds_per_unit:
        raise ValueError("Unknown unit '" + str(unit) + "'; please use one " + \
        "of 'sec', 'min', 'hr' or 'd'.")

    if len(series) == 0:
        raise IndexError('The passed series appears to be empty. To calculate ' + \
        'a relative timeseries, an absolute timeseries is necessary.')

    # Take the first value by position: series[0] on a pd.Series is a label
    # lookup, which fails when the index does not contain the label 0.
    first = series.iloc[0] if isinstance(series,pd.Series) else series[0]

    time_delta = series - first
    start = (first - start_date).total_seconds()

    relative = np.array(time_delta.map(lambda delta: delta.total_seconds()))
    relative = (relative + start) / seconds_per_unit[unit]

    return relative.round(decimals)