Source code for sekupy.results.dataframe

import pandas as pd
import numpy as np

import logging
logger = logging.getLogger(__name__)

def array2df(dataframe, key):
    """Expand an array-valued column into one scalar column per element.

    Every cell of ``dataframe[key]`` is expected to hold a 1-D sequence
    (e.g. a numpy array). The sequences are unpacked into columns named
    ``'<key>_0'``, ``'<key>_1'``, ... and placed side by side with the
    remaining columns of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The table containing the array-valued column.
    key : str
        Name of the column whose cells are unpacked. It may sit at any
        position in the table.

    Returns
    -------
    pandas.DataFrame
        A new table with one row per input row, indexed ``0..n-1``: the
        expanded ``<key>_<i>`` columns first, followed by every other
        column of ``dataframe``. The input is not modified.
    """
    # Let pandas infer the number of elements from the data itself so an
    # empty table does not fail on ``values[0]``.
    expanded = pd.DataFrame(dataframe[key].values.tolist())
    expanded.columns = ['%s_%d' % (key, i) for i in range(expanded.shape[1])]

    # Drop the array column by name rather than assuming it is the last one;
    # resetting the index keeps both halves aligned row by row.
    others = dataframe.drop(key, axis=1).reset_index(drop=True)

    # axis=1 joins the two halves column-wise; the default axis=0 would stack
    # them and leave a table twice as long filled with NaN.
    return pd.concat([expanded, others], axis=1)
def query_rows(dataframe, keys, attr, fx=np.max):
    """Select the rows whose ``attr`` value equals a per-group summary.

    The table is grouped by ``keys`` and ``fx`` is applied to the ``attr``
    values of each group (through ``apply_function``). For every group the
    rows of ``dataframe`` matching both the group keys and the summarised
    ``attr`` value are collected. With the default ``fx`` this returns, for
    each group, the rows that attain the group's maximum of ``attr``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The table to query.
    keys : str or list of str
        Column(s) used to group the table.
    attr : str
        Column holding the values that ``fx`` summarises.
    fx : callable, optional
        Function applied to the ``attr`` values of each group,
        by default ``np.max``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, concatenated group after group.
    """
    summary = apply_function(dataframe, keys, attr=attr, fx=fx)
    columns = summary.keys()
    n_rows = dataframe.shape[0]

    def _matching_rows(reference):
        # A row is kept only when it agrees with the summary on every
        # column, i.e. on the grouping keys and on the summarised value.
        selector = np.ones(n_rows, dtype=bool)
        for column in columns:
            agrees = dataframe[column].values == reference[column]
            selector = np.logical_and(selector, agrees)
        return dataframe.loc[selector]

    selections = [_matching_rows(reference) for _, reference in summary.iterrows()]
    return pd.concat(selections)
def apply_function(dataframe, keys, attr='features',
                   fx=lambda x: np.vstack(x).sum(0), **fx_kwargs):
    """Group a dataframe and reduce one column of each group with ``fx``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The table to process.
    keys : str or list of str
        Column(s) used to group the table. They are kept as columns of the
        output.
    attr : str, optional
        Column whose values are handed to ``fx``, by default ``'features'``.
    fx : callable, optional
        Function applied to the ``attr`` values of each group. The default
        stacks the values vertically and sums them along the first axis.
    **fx_kwargs
        Extra keyword arguments forwarded to ``fx``.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the grouping keys restored as regular
        columns next to the reduced ``attr`` column.
    """
    values_by_group = dataframe.groupby(keys)[attr]
    reduced = values_by_group.apply(fx, **fx_kwargs)
    # The grouping keys end up in the index; turn them back into columns.
    return reduced.reset_index()
def get_weights(dataframe):
    """Scatter z-scored weights back onto the layout of the feature mask.

    For every row, a ``float16`` array of zeros shaped like
    ``row['features']`` is created and the entries where ``features`` equals
    1 are filled with the z-score of ``row['weights']``. The result replaces
    the row's ``'weights'`` value.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table with a ``'features'`` and a ``'weights'`` column holding
        arrays. Presumably ``weights`` has one entry per selected feature
        (``features == 1``) -- confirm against the caller.

    Returns
    -------
    pandas.DataFrame
        A new table built from the updated rows.
    """
    from scipy.stats import zscore

    def _scatter(record):
        features = record['features']
        dense = np.zeros_like(features, dtype=np.float16)
        # Only the selected features receive a weight; the rest stay zero.
        dense[np.equal(features, 1)] = zscore(record['weights'])
        record['weights'] = dense
        return record

    return pd.DataFrame([_scatter(record) for _, record in dataframe.iterrows()])
def clean_dataframe(dataframe, keys=None):
    """Drop the columns that hold a single distinct value.

    Columns whose label contains ``'score'`` are always kept. Columns whose
    cells cannot be hashed (e.g. arrays) cannot be checked for uniqueness
    and are kept as well.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The table to clean. It is not modified.
    keys : iterable of column labels, optional
        The columns to inspect. ``None`` or an empty collection (the
        default) means every column of ``dataframe``.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` itself when nothing is dropped, otherwise a new table
        without the constant columns.
    """
    # ``keys`` may be a list, a pandas Index or an array; materialising it
    # avoids the ambiguous element-wise ``keys == []`` comparison.
    candidates = [] if keys is None else list(keys)
    if not candidates:
        candidates = list(dataframe.keys())

    constant = []
    for k in candidates:
        try:
            is_score = 'score' in k
        except TypeError:
            # Non-iterable labels (e.g. integers) cannot contain 'score'.
            is_score = False
        if is_score:
            continue

        logger.debug(k)

        try:
            n_unique = len(set(dataframe[k].values))
        except TypeError:
            # Unhashable cells: uniqueness cannot be established, keep it.
            continue

        if n_unique == 1 and k not in constant:
            constant.append(k)

    if not constant:
        return dataframe
    return dataframe.drop(constant, axis=1)
def _pandas_categorical_order(vector):
    """Return the distinct non-null levels of ``vector`` using pandas only."""
    try:
        levels = vector.cat.categories
    except (TypeError, AttributeError):
        # Not a categorical: keep the order of appearance, except for
        # numeric data which is sorted.
        levels = pd.unique(vector)
        if pd.api.types.is_numeric_dtype(vector):
            levels = np.sort(levels)
    return [level for level in levels if pd.notnull(level)]


def _resolve_categorical_order():
    """Find seaborn's ``categorical_order`` helper, else use the pandas one."""
    import importlib

    # ``categorical_order`` is a private seaborn helper, so its module path
    # is not stable. The originally used location is tried first; the other
    # paths are presumably where it lives in other releases -- TODO confirm.
    candidates = ('seaborn._core', 'seaborn._oldcore',
                  'seaborn._base', 'seaborn.utils')
    for module_name in candidates:
        try:
            module = importlib.import_module(module_name)
        except ImportError:
            continue
        order_fx = getattr(module, 'categorical_order', None)
        if callable(order_fx):
            return order_fx
    return _pandas_categorical_order


def dataframe_slicer(data, row=None, col=None, hue=None):
    """Generate the data subsets for each combination of row, col and hue.

    Adapted from `seaborn`. For every facet variable that is given, one
    boolean mask per level is built; a variable left to ``None`` contributes
    a single all-True mask. The generator then walks the cartesian product
    of the three mask lists.

    Parameters
    ----------
    data : pandas.DataFrame
        The table to slice.
    row, col, hue : str, optional
        Names of the columns whose levels define the facets.

    Yields
    ------
    (i, j, k), data_ijk : tuple of ints, DataFrame
        ``i``, ``j`` and ``k`` index the levels of ``row``, ``col`` and
        ``hue`` (0 for a variable that was not given); ``data_ijk`` is the
        subset of ``data`` belonging to that combination. Empty subsets are
        yielded as they are.
    """
    from itertools import product

    # seaborn is only needed when at least one facet variable is requested.
    order_fx = None
    if any(variable is not None for variable in (row, col, hue)):
        order_fx = _resolve_categorical_order()

    def _masks(variable):
        if variable is None:
            return [np.repeat(True, len(data))]
        return [data[variable] == level for level in order_fx(data[variable])]

    facets = product(enumerate(_masks(row)),
                     enumerate(_masks(col)),
                     enumerate(_masks(hue)))

    for (i, row_mask), (j, col_mask), (k, hue_mask) in facets:
        yield (i, j, k), data[row_mask & col_mask & hue_mask]
def dataframe_dummy_columns(dataframe, keyword, mapping):
    """Add 0/1 indicator columns for a comma-separated category column.

    Each cell of ``dataframe[keyword]`` is read as a comma-separated list of
    categories. Every category is translated through ``mapping`` into the
    name of an indicator column, which is set to 1 on that row.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The source table. It is left untouched; the work is done on a copy.
    keyword : str
        Name of the column holding the comma-separated categories.
    mapping : dict
        Maps a category (surrounding spaces stripped) to the name of its
        indicator column. Several categories may share one column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` in which ``keyword`` is converted to strings
        (missing values and the literal ``'nan'`` become NaN), followed by
        one integer indicator column per distinct value of ``mapping``.

    Raises
    ------
    KeyError
        If a category found in the data has no entry in ``mapping``.
    """
    frame = dataframe.copy()
    raw = frame[keyword]

    # Convert everything to strings for consistency, then restore missing
    # values: both the 'nan' strings and whatever was null before the
    # conversion (e.g. None, which would otherwise become 'None').
    cleaned = raw.astype(str).replace('nan', np.nan)
    cleaned = cleaned.where(raw.notna(), np.nan)
    frame[keyword] = cleaned

    # One indicator column per distinct target name, in mapping order.
    columns = list(dict.fromkeys(mapping.values()))
    column_position = {name: position for position, name in enumerate(columns)}

    # Split every cell once and collect the categories the mapping ignores,
    # so the error can name all of them at the same time.
    labels_per_row = []
    unknown = set()
    for entry in cleaned:
        if pd.isna(entry):
            labels = []
        else:
            labels = [token.strip() for token in entry.split(',')]
        unknown.update(label for label in labels if label not in mapping)
        labels_per_row.append(labels)

    if unknown:
        raise KeyError('categories missing from mapping: %s' % sorted(unknown))

    # Rows are addressed by position, so any index (non-default, unsorted or
    # duplicated) is handled correctly.
    flags = np.zeros((len(frame), len(columns)), dtype=np.int64)
    for position, labels in enumerate(labels_per_row):
        for label in labels:
            flags[position, column_position[mapping[label]]] = 1

    dummies = pd.DataFrame(flags, index=frame.index, columns=columns)
    return pd.concat([frame, dummies], axis=1)