Source code for wwdata.data_reading_functions

# -*- coding: utf-8 -*-
"""
data_reading_functions provides functionalities for data reading in the context of the wwdata package.
Copyright (C) 2016 Chaim De Mulder

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see http://www.gnu.org/licenses/.
"""

import sys
import os
from os import listdir
import pandas as pd
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt   #plotten in python
import xlrd

[docs]def list_files(path,ext): """ Returns a list of files in a certain folder ('path') with a certain extension ('ext') Parameters ---------- path : str path to the folder containing the files to be listed ext : str extension of the files to be listed; current options are 'excel','text' or 'csv' """ if ext == 'excel': files = [f for f in listdir(path) if '.xls' in f] elif ext == 'text': files = [f for f in listdir(path) if f.endswith('.txt')] elif ext == 'csv': files = [f for f in listdir(path) if f.endswith('.csv')] else: print('No files with',ext,'extension found in directory',path,'Please \ choose one of the following: text, excel, csv') return None return files
[docs]def remove_empty_lines(path,ext): """ Removes the empty lines from files in a certain folder ('path') and with a certain extension ('ext') Parameters ---------- path : str path to the folder containing the files in which empty lines need to be removed ext : str extension of the files in which empty lines need to be removed; current options are 'excel','text' or 'csv' """ files = list_files(path,ext) if not files: print('Please provide a directory that contains '+ext+' files.') return None for filename in files: filepath = os.path.join(path,filename) data = pd.read_csv(filepath,sep='\t') data.dropna(axis=0,inplace=True) #filepath_new = os.path.join(path,filename+'_') data.to_csv(filepath,sep='\t',index=False,index_label=False) return None
[docs]def find_and_replace(path,ext,replace): """ Finds the files with a certain extension in a directory and applies a find- replace action to those files. Removes the old files and produces files with a prefix stating the replacing value. Parameters ---------- path : str the path name of the directory to apply the function to ext : str the extension of the files to be searched (excel, text or csv) replace : array of str the first value of replace is the string to be replaced by the second value of replace. """ files = list_files(path,ext) if not files: print('Please provide a directory that contains '+ext+' files.') return None for filename in files: filepath = os.path.join(path,filename) filedata = None with open(filepath, 'r') as file : filedata = file.read() # Replace the target string filedata = filedata.replace(replace[0], replace[1]) # Write the file out again with open(filepath, 'w') as file: file.write(filedata) #data = pd.read_csv(filepath,sep='\t') #data.replace(to_replace=replace[0],value=replace[1],inplace=True) #data.to_csv(filepath,sep='\t',index=False,index_label=False) return None
[docs]def sort_data(data,based_on,reset_index=[False,'new_index_name'], convert_to_timestamp=[True,'time_name','%d.%m.%Y %H:%M:%S']): """ Sorts a dataset based on values in one of the columns and splits them in different dataframes, returned in the form of one dictionary Parameters ---------- data : pd.dataframe the dataframe containing the data that needs to be sorted based_on : str the name of the column that contains the names or values the sorting should be based on reset_index : [bool,str] array indicating if the index of the sorted datasets should be reset to a new one; if first element is true, the second element is the title of the column to use as new index; default: False Returns ------- dict : A dictionary of pandas dataframes with as labels those acquired from the based_on column """ dictionary = {} measurement_codes = pd.Series(data[based_on].ravel()).unique() for i in measurement_codes: dictionary[i] = data[data[based_on]==i].drop(based_on,axis=1) if convert_to_timestamp[0] == True & reset_index[0] == True: dictionary[i][convert_to_timestamp[1]] = \ pd.to_datetime(dictionary[i][convert_to_timestamp[1]], format=convert_to_timestamp[2]) dictionary[i].set_index(reset_index[1],inplace=True) elif convert_to_timestamp[0] == True & reset_index[0] == False: dictionary[i][convert_to_timestamp[1]] = \ pd.to_datetime(dictionary[i][convert_to_timestamp[1]], format=convert_to_timestamp[2]) elif reset_index[0] == True & convert_to_timestamp[0] == False: dictionary[i].set_index(reset_index[1],inplace=True) print('Sorting',i,'...') return dictionary
def _get_header_length(read_file,ext='text',comment='#'): """ Determines the amount of rows that are part of the header in a file that is already opened and readable Parameters ---------- read_file : opened file an opened file object that is readable ext : str the extension (in words) of the file the headerlength needs to be found for comment : str comment symbol used in the files Returns ------- headerlength : int the amount of rows that are part of the header in the read file """ headerlength = 0 header_test = comment counter = 0 if ext == 'excel' or ext == 'zrx': while header_test == comment: header_test = str(read_file.sheet_by_index(0).cell_value(counter,0))[0] headerlength += 1 counter +=1 elif ext == 'text' or ext == 'csv': while header_test == comment: header_test = read_file.readline()[0] headerlength += 1 return headerlength-1
[docs]def read_mat(path): """ TO DO Reads in .mat datafiles and returns them as pd.DataFrame http://stackoverflow.com/questions/24762122/read-matlab-data-file-into-python-need-to-export-to-csv """
#Also write separate script for converting all .mat files in one dir to .csv files def _get_header_length(read_file,ext='text',comment='#'): """ Determines the amount of rows that are part of the header in a file that is already opened and readable Parameters ---------- read_file : opened file an opened file object that is readable ext : str the extension (in words) of the file the headerlength needs to be found for comment : str comment symbol used in the files Returns ------- headerlength : int the amount of rows that are part of the header in the read file """ headerlength = 0 header_test = comment counter = 0 if ext == 'excel' or ext == 'zrx': while header_test == comment: header_test = str(read_file.sheet_by_index(0).cell_value(counter,0))[0] headerlength += 1 counter +=1 elif ext == 'text': while header_test == comment: header_test = read_file.readline()[0] headerlength += 1 return headerlength-1 def _open_file(filepath,ext='text'): """ Opens file of a given extension in readable mode Parameters ---------- filepath : str the complete path to the file to be opened in read mode ext : str the extension (in words) of the file that needs to be opened in read mode Returns ------- The opened file in read mode """ if ext == 'text' or ext == 'zrx' or ext == 'csv': return open(filepath, 'r') elif ext == 'excel': return xlrd.open_workbook(filepath) def _read_file(filepath,ext='text',skiprows=0,sep='\t',encoding='utf8',decimal='.'): """ Read a file of given extension and save it as a pandas dataframe Parameters ---------- filepath : str the complete path to the file to be read and saved as dataframe ext : str the extension (in words) of the file that needs to be read and saved skiprows : int number of rows to skip when reading a file Returns ------- A pandas dataframe containing the data from the given file """ if ext == 'text': return pd.read_table(filepath,skiprows=skiprows,decimal='.',low_memory=False,index_col=None) elif ext == 'excel': return pd.read_excel(filepath,skiprows=skiprows,low_memory=False,index_col=None) elif ext == 'csv': return pd.read_csv(filepath,sep=sep,skiprows=skiprows,encoding=encoding, error_bad_lines=False,low_memory=False,index_col=None)
[docs]def join_files(path,files,ext='text',sep=',',comment='#',encoding='utf8',decimal='.'): """ Reads all files in a given directory, joins them and returns one pd.dataframe Parameters ---------- path : str path to the folder that contains the files to be joined files : list list of files to be joined, must be the same extension ext : str extention of the files to read; possible: excel, text, csv sep : str the separating element (e.g. , or \t) necessary when reading csv-files comment : str comment symbol used in the files sort : array of bool and str if first element is true, apply the sort function to sort the data based on the tags in the column mentioned in the second element of the sort array Returns ------- pd.dataframe: pandas dataframe containin concatenated files in the given directory """ #Initialisations data = pd.DataFrame() #Select files based on extension and sort files alphabetically to make sure #they are added to each other in the correct order #files = list_files(path,ext) files.sort() print('joining',len(files),'files...') #Read files for file_name in files: dir_file_path = os.path.join(path,file_name) with _open_file(dir_file_path,ext) as read_file: headerlength = _get_header_length(read_file,ext,comment) data = data.append(_read_file(dir_file_path,ext=ext,sep=sep, skiprows=headerlength, decimal=decimal,encoding=encoding), ignore_index=True) print('Adding file',file_name,'to dataframe') data.to_csv('joined_files',sep=sep) return data
[docs]def write_to_WEST(df,file_normal,file_west,units,filepath=os.getcwd(),fillna=True): """ writes a text-file that is compatible with WEST. Adds the units as they are given in the 'units' argument. Parameters ---------- df : pd.DataFrame the dataframe to write to WEST file_normal : str name of the original file to write, not yet compatible with WEST file_west : str name of the file that needs to be WEST compatible units : array of strings array containing the units for the respective columns in df filepath : str directory to save the files in; defaults to the current one fillna : bool when True, replaces nan values with 0 values (this might avoid WEST problems later one). Returns ------- None; writes files """ if fillna: df = df.fillna(0) df.to_csv(os.path.join(filepath,file_normal),sep='\t') f = open(os.path.join(filepath,file_normal),'r') columns = f.readline() temp = f.read() f.close() f = open(os.path.join(filepath,file_west), 'w') f.write('#.t' + columns) unit_line = '#d\t' for i in range(0,len(units)-1): unit_line = unit_line + '{}\t'.format(units[i]) unit_line = unit_line + '{}\n'.format(units[-1]) f.write(unit_line) f.write(temp) f.close()