# Pandas DataFrame tools for acquisition, parsing, plotting, etc.
# Generate new categorical data types, with an option for ordered values.
def categorical_datatype(levels, ordered):
    """
    Create a new categorical datatype.

    :param levels: Category levels, list/tuple form (any iterable that
        ``list()`` accepts is converted to a list).
    :param ordered: Boolean choice, determines if the input list is in
        hierarchical order of entry.
    :return: New ``CategoricalDtype`` with the given levels and ordering
        choice, or ``None`` if converting ``levels`` raised ``ValueError``.
    :raises TypeError: If ``levels`` is not iterable.
    """
    # Check the value itself; ``type(levels)`` is a class and is never a list.
    if not isinstance(levels, list):
        try:
            levels = list(levels)
        except TypeError as err:
            # list() signals a non-iterable argument with TypeError; keep that
            # exception type but attach the explanatory message.
            raise TypeError(
                "Please enter a valid, non-empty data structure, "
                "list/tuple of levels"
            ) from err
        except ValueError:
            # Best-effort path kept as originally written: report the problem
            # and fall through to an implicit ``None`` return.
            print("Please enter a valid, non-empty data structure, list/tuple of levels")
            return None
    return CategoricalDtype(levels, ordered=ordered)


# Sort indexes of multiindex array. Native solution sort_index(by='attribute') currently deprecated.
import pandas as pd
from pandas.api.types import CategoricalDtype


def sort_mixed_dtypes(df, indexes):
    """
    Sort indexes in a multiindexed dataframe, helpful for multiple custom
    CategoricalDatatypes.

    The current index is reset to ordinary columns, ``indexes`` is installed
    as the new index, and the result is sorted by that index.

    :DataFrame df: Target dataframe
    :List indexes: Index positions to sort
    :return: DataFrame with sorted index
    """
    return df.reset_index().set_index(indexes).sort_index()


def match_case_dataframe(df, target_labels, target_column):
    """
    Search a dataframe column for specific values/strings, if match return
    them in a new dataframe.
    Ex. Search Column A for all rows containing 'Word'

    Also prints a 'Results:' header, the match count (via ``find_length``)
    and the head of the filtered dataframe.

    :param df: Pandas DataFrame containing target dataset
    :param target_labels: Single label (str) or list of labels to be matched
        to observations
    :param target_column: Name of column to be assayed
    :return filtered_df : DataFrame containing observations matching target labels
    """
    # ``isin`` needs a list-like, so wrap a lone string rather than letting
    # it be treated as a sequence of characters.
    labels = [target_labels] if isinstance(target_labels, str) else target_labels
    filtered_df = df[df[target_column].isin(labels)]
    print('Results:')
    find_length(filtered_df, target_labels)
    print(filtered_df.head())
    return filtered_df


def find_length(processed_df, target):
    """
    Report how many rows matched target keyword.

    :param processed_df: Filtered Pandas DataFrame containing target dataset
    :param target: Target label (str) or iterable of labels previously passed
        to 'match_case_dataframe'
    :return: None; the report is printed.
    """
    matches = len(processed_df)
    # A plain string is shown as-is; joining it would split it into single
    # characters. Other iterables are joined, with str() so non-string
    # labels do not break the join.
    if isinstance(target, str):
        shown = target
    else:
        shown = ", ".join(str(label) for label in target)
    noun = 'row' if matches == 1 else 'rows'
    print('{} {} returned containing \'{}\''.format(matches, noun, shown))


# Combine a list of dataframes into single unit, display head and shape.
def concat(df_list):
    """
    Combine a list of dataframes into a single dataframe.

    Prints the shape and then the head of the combined dataframe.

    :param df_list: List of Pandas DataFrames to concatenate
    :return: The concatenated DataFrame
    """
    combined = pd.concat(df_list)
    # Show the shape first, then a preview of the leading rows.
    for summary in (combined.shape, combined.head()):
        print(summary)
    return combined


# Replace NaN observations in specified column with mean of populated column
def replace_with_mean(df, colname):
    """
    Replace NaN observations in the specified column with the mean of the
    populated values in that column.

    The column is overwritten on the dataframe that was passed in, and that
    same dataframe is returned. ``DataFrame.info()`` is printed afterwards.

    :param df: Pandas DataFrame containing the target dataset (modified in place)
    :param colname: Name of the column whose NaN values are filled
    :return: ``df`` with NaN values in ``colname`` replaced by the column mean
    """
    # Use the Series' own mean so the function does not depend on a numpy
    # alias (``np``) being imported.
    col_mean = df[colname].mean()
    # Write the filled column back onto the caller's dataframe
    df[colname] = df[colname].fillna(col_mean)
    # info() prints its own report and returns None; wrapping it in print()
    # would emit a stray "None" line.
    df.info()
    return df


# Test if dataframe has missing data via Assert statement
def check_missing_data(df):
    """
    Test that a dataframe has no missing data.

    :param df: Pandas DataFrame to check
    :return: None when every value is non-null
    :raises AssertionError: If any value in ``df`` is null
    """
    # Raise explicitly rather than via ``assert`` so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if not pd.notnull(df).all().all():
        raise AssertionError("DataFrame contains missing (null/NaN) values")


# Drop duplicates
def drop_dups(df, cols):
    """
    Return the specified column(s) of ``df`` with duplicate rows dropped.

    Prints ``DataFrame.info()`` for the selection before and after the
    duplicates are removed.

    :param df: Pandas DataFrame containing the target dataset
    :param cols: Column name (or list of column names) to select from ``df``
    :return: DataFrame holding only ``cols`` with duplicate rows removed
    """
    # Generate a pre-processed representation of the specified column(s)
    pre_col = pd.DataFrame(df[cols])
    # Check column info. info() prints its own report and returns None, so
    # it is called directly instead of being wrapped in print().
    pre_col.info()
    # Drop any duplicate rows within pre_col
    post_col = pre_col.drop_duplicates()
    # Check if successful
    post_col.info()
    # Return the processed DataFrame
    return post_col


def merge_many(t1, t2, t3, t1_left, t2_right, t3_left, tkey_right):
    """
    Merge three tables into a single table with two successive ``pd.merge``
    calls (using ``pd.merge``'s default join type).

    :param t1: Left table of the first merge
    :param t2: Right table of the first merge
    :param t3: Right table of the second merge
    :param t1_left: Key column(s) in ``t1`` for the first merge
    :param t2_right: Key column(s) in ``t2`` for the first merge
    :param t3_left: Key column(s) in the first merge result for the second merge
    :param tkey_right: Key column(s) in ``t3`` for the second merge
    :return: DataFrame combining ``t1``, ``t2`` and ``t3``
    """
    # Merge T1 and T2 into first_merge table
    first_merge = pd.merge(left=t1, right=t2, left_on=t1_left, right_on=t2_right)
    # Merge first_merge and T3, returning the three tables as a single table
    return pd.merge(left=first_merge, right=t3, left_on=t3_left, right_on=tkey_right)


import pandas as pd
import matplotlib.pyplot as plt


#--------------------
#Function: hist_columns()
#Purpose: Easily generate a Histogram pyplot from a DataFrame and list of columns
#Inputs: Pandas DataFrame, Log condition(T/F), Rotation value, single or variable column names
#Outputs: Histogram plots of given column in Pandas DataFrame
#--------------------
def hist_columns(df, log, rot, *args):
    """
    Plot and show one histogram per requested column.

    :param df: Pandas DataFrame containing the columns to plot
    :param log: Log-scale choice; the boolean ``True`` or the string
        ``"True"`` (any casing) puts both axes on a logarithmic scale,
        anything else leaves them linear
    :param rot: Rotation value passed through to the plot call
    :param args: One or more column names to plot
    :return: None; each plot is displayed with ``plt.show()``
    """
    # Accept a real boolean as well as the historical "True" string; the old
    # ``log == "True"`` test silently ignored the boolean True.
    if isinstance(log, bool):
        use_log = log
    else:
        use_log = str(log).strip().lower() == "true"

    # For each column name given, plot its histogram
    for arg in args:
        # Create Histogram plot
        df[arg].plot(kind='hist', logx=use_log, logy=use_log, rot=rot)
        # Add x-axis label
        plt.xlabel(str(arg))
        # Show plot
        plt.show()


#--------------------
#Purpose: Generate a scatter plot from given datasets
#Inputs: X-axis data, Y-axis data, X-Axis Label, Y-Axis Label, Graph Title
#Outputs: Populated Scatter Plot with Labels/Title and Legend
#--------------------
def make_scatter(x, y, xlab, ylab, gtitle):
    """
    Draw and show a scatter plot with axis labels, title, legend and grid.

    :param x: X-axis data
    :param y: Y-axis data
    :param xlab: X-axis label
    :param ylab: Y-axis label
    :param gtitle: Graph title
    :return: None; the plot is displayed with ``plt.show()``
    """
    plt.scatter(x=x, y=y)
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.title(gtitle)
    # NOTE(review): the scatter call sets no ``label``, so this legend may
    # have nothing to show -- confirm whether a label was intended.
    plt.legend(loc=1)
    plt.grid()
    plt.show()


#--------------------
#Purpose: Generates a TimeSeries Line Plot with labels
#Inputs: Pandas DataFrame, Independent Variable, Dependent Variable, X-Label, Y-Label, Graph Title
#Outputs: TimeSeries LinePlot
#--------------------
def make_timeseries(df, x, y, xlab, ylab, gtitle):
    """
    Draw and show a line plot of ``y`` against ``x`` from ``df``.

    :param df: Pandas DataFrame containing the data
    :param x: Column used as the independent variable
    :param y: Column used as the dependent variable
    :param xlab: X-axis label
    :param ylab: Y-axis label
    :param gtitle: Graph title
    :return: None; the plot is displayed with ``plt.show()``
    """
    # Generate a line plot
    df.plot(kind='line', x=x, y=y)
    # Add X&Y axis labels
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    # Add graph title
    plt.title(gtitle)
    # Add Legend in position 1
    plt.legend(loc=1)
    # Add gridlines to graph
    plt.grid()
    # Show plot
    plt.show()