Source code for sekupy.results.dataframe

import pandas as pd
import numpy as np

import logging
logger = logging.getLogger(__name__)


[docs]
def array2df(dataframe, key):
    """Expand a column of 1-D arrays into one scalar column per element.

    Each row of ``dataframe[key]`` is expected to hold a 1-D array; element
    ``i`` of that array ends up in a new column named ``'<key>_<i>'``. The
    remaining columns of `dataframe` are appended to the right of the
    expanded ones, so the result has exactly one row per input row.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input table. It must contain at least one row, because the number of
        generated columns is read from the first array in ``dataframe[key]``.
    key : str
        Name of the column holding the arrays.

    Returns
    -------
    pandas.DataFrame
        Table with the ``'<key>_<i>'`` columns first, followed by every
        column of `dataframe` except `key`. The index is a fresh
        ``RangeIndex``.
    """
    arrays = dataframe[key].values
    # The width of the expansion is taken from the first array.
    n_elements = arrays[0].shape[0]

    expanded = pd.DataFrame(
        arrays.tolist(),
        columns=['%s_%d' % (key, i) for i in range(n_elements)],
    )

    # Drop the array column by name (it need not be the last one) and reset
    # the index so that both frames line up positionally.
    others = dataframe.drop(columns=[key]).reset_index(drop=True)

    # axis=1 puts the two frames side by side; the default axis=0 would
    # stack them and pad the result with NaN.
    return pd.concat([expanded, others], axis=1)




[docs]
def query_rows(dataframe, keys, attr, fx=np.max):
    """Select the rows whose `attr` value equals the per-group result of `fx`.

    The dataframe is first summarised with `apply_function` (grouping by
    `keys` and applying `fx` to the `attr` column). For every row of that
    summary, the rows of `dataframe` that carry the same values in all the
    summary columns (the grouping keys and `attr`) are collected.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The table to query.
    keys : list of str
        Columns used to group the dataframe.
    attr : str
        Column the function `fx` is applied to.
    fx : callable, optional
        Function applied to the `attr` values of each group,
        by default np.max

    Returns
    -------
    pandas.DataFrame
        Concatenation of the matching rows, one chunk per summary row.
    """
    summary = apply_function(dataframe, keys, attr=attr, fx=fx)
    summary_columns = summary.keys()
    n_rows = dataframe.shape[0]

    def _matching_rows(record):
        # Start from "everything selected" and narrow down column by column.
        selected = np.ones(n_rows, dtype=bool)
        for column in summary_columns:
            same_value = dataframe[column].values == record[column]
            selected = np.logical_and(selected, same_value)
        return dataframe.loc[selected]

    chunks = [_matching_rows(record) for _, record in summary.iterrows()]

    return pd.concat(chunks)






[docs]
def apply_function(dataframe, keys, attr='features', fx=lambda x:np.vstack(x).sum(0), **fx_kwargs):
    """Group the dataframe by `keys` and apply `fx` to the `attr` values of each group.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The dataframe to be processed.
    keys : list of str
        Columns used to group the dataframe. They are kept as regular
        columns in the output.
    attr : str, optional
        Column whose values are passed to `fx` (the default is 'features').
    fx : callable, optional
        Function applied to the `attr` values of every group
        (the default is ``lambda x: np.vstack(x).sum(0)``).
    **fx_kwargs
        Extra keyword arguments forwarded to `fx`.

    Returns
    -------
    pandas.DataFrame
        The grouped result with the grouping keys moved back from the index
        into columns.
    """
    grouped_values = dataframe.groupby(keys)[attr]
    reduced = grouped_values.apply(fx, **fx_kwargs)

    # Turn the group labels back into ordinary columns.
    return reduced.reset_index()




[docs]
def get_weights(dataframe):
    """Replace each row's ``'weights'`` by a z-scored array shaped like ``'features'``.

    For every row, an array of zeros with the shape of ``row['features']``
    (dtype ``float16``) is created; the positions where ``row['features']``
    equals 1 are filled with the z-score of ``row['weights']``, and the
    result is stored back under ``'weights'``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table with a ``'features'`` and a ``'weights'`` column holding arrays.
        Presumably ``row['weights']`` has one entry per feature equal to 1 --
        the assignment below requires the sizes to be compatible.

    Returns
    -------
    pandas.DataFrame
        A new dataframe built from the updated rows.
    """
    from scipy.stats import zscore

    def _expand(record):
        dense = np.zeros_like(record['features'], dtype=np.float16)
        selected = np.equal(record['features'], 1)
        # Only the selected positions receive a (standardised) weight.
        dense[selected] = zscore(record['weights'])
        record['weights'] = dense
        return record

    updated_rows = [_expand(record) for _, record in dataframe.iterrows()]

    return pd.DataFrame(updated_rows)




[docs]
def clean_dataframe(dataframe, keys=None):
    """Drop the columns that are not informative.

    A column is considered uninformative when it holds a single distinct
    value. Columns whose label contains ``'score'`` are never dropped.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The table to clean. It is not modified; a reduced copy is returned
        when at least one column is dropped.
    keys : sequence, optional
        Labels of the columns to inspect. When omitted (or given as an
        empty list) every column of `dataframe` is inspected. Any iterable
        of labels is accepted, including a pandas ``Index``.

    Returns
    -------
    pandas.DataFrame
        The dataframe without the constant columns.

    Notes
    -----
    Distinct values are counted with a ``set``, so the inspected columns
    must contain hashable values.
    """
    # `None` replaces the former mutable `[]` default; an explicitly passed
    # empty list keeps its historical meaning of "inspect every column".
    if keys is None or (isinstance(keys, list) and not keys):
        keys = dataframe.keys()

    for k in keys:

        # Labels that do not support `in` (e.g. integers) cannot contain
        # 'score', so they are simply treated as regular columns.
        try:
            is_score = 'score' in k
        except TypeError:
            is_score = False

        if is_score:
            continue

        logger.debug(k)

        n_distinct = len(set(dataframe[k].values))

        if n_distinct == 1:
            dataframe = dataframe.drop(k, axis=1)

    return dataframe





[docs]
def _seaborn_categorical_order():
    """Return seaborn's ``categorical_order`` helper, wherever it is defined."""
    from importlib import import_module

    # NOTE(review): these are private seaborn modules. The first entry is the
    # location this code has always imported from; the others are the places
    # the helper is assumed to live in other seaborn releases -- confirm
    # against the seaborn versions that must be supported.
    candidates = (
        'seaborn._core',
        'seaborn._base',
        'seaborn._oldcore',
        'seaborn._core.rules',
        'seaborn.utils',
    )

    for module_name in candidates:
        try:
            module = import_module(module_name)
        except ImportError:
            continue

        helper = getattr(module, 'categorical_order', None)
        if helper is not None:
            return helper

    raise ImportError("cannot import `categorical_order` from seaborn")


def dataframe_slicer(data, row=None, col=None, hue=None):
    """Generator for name indices and data subsets for each unique value
    of row, col, hue.

    Adapted from `seaborn`.

    Parameters
    ----------
    data : pandas.DataFrame
        The table to slice.
    row, col, hue : str, optional
        Columns whose levels define the facets. A variable left to ``None``
        contributes a single facet that selects every row of `data`.
        seaborn is only required when at least one variable is given.

    Yields
    ------
    (i, j, k), data_ijk : tuple of ints, DataFrame
        The ints are the positions of the current level in the ordered
        levels of `row`, `col` and `hue` respectively (0 for a variable
        that is ``None``), and the dataframe contains the subset of the
        full data corresponding to that combination of levels. Combinations
        without any matching row yield an empty dataframe.

    Raises
    ------
    ImportError
        If a facet variable is given and seaborn's ``categorical_order``
        helper cannot be imported.
    """
    from itertools import product

    facets = (row, col, hue)

    # Only reach for seaborn when there is something to order.
    if all(name is None for name in facets):
        categorical_order = None
    else:
        categorical_order = _seaborn_categorical_order()

    def _facet_masks(name):
        # No variable: a single all-True mask that keeps every row.
        if name is None:
            return [np.repeat(True, len(data))]
        return [data[name] == level for level in categorical_order(data[name])]

    row_masks, col_masks, hue_masks = [_facet_masks(name) for name in facets]

    # Here is the main generator loop
    for (i, row_mask), (j, col_mask), (k, hue_mask) in product(
            enumerate(row_masks),
            enumerate(col_masks),
            enumerate(hue_masks)):
        data_ijk = data[row_mask & col_mask & hue_mask]
        yield (i, j, k), data_ijk

        
        

[docs]
def dataframe_dummy_columns(dataframe, keyword, mapping):
    """Add 0/1 indicator columns for a column of comma-separated categories.

    The `keyword` column is converted to strings (``'nan'`` strings become
    missing values again). Every non-missing entry is split on commas, each
    piece is stripped of surrounding whitespace and looked up in `mapping`
    to find the indicator column that must be set to 1 for that row.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input table. It is left untouched; the function works on a copy.
    keyword : str
        Name of the column holding the (possibly combined) categories.
    mapping : dict
        Maps each category name to the name of its indicator column.
        Several categories may share the same indicator column.

    Returns
    -------
    pandas.DataFrame
        A copy of `dataframe` (with the `keyword` column converted to
        strings) followed by one integer indicator column per distinct
        value of `mapping`, in order of first appearance.

    Raises
    ------
    KeyError
        If a category found in the data is not a key of `mapping`.
    """
    # Work on a copy so the caller's dataframe is not modified.
    dataframe = dataframe.copy()

    # Convert everything to strings for consistency, then restore the
    # missing values that the conversion turned into 'nan' strings.
    labels = dataframe[keyword].astype(str).replace('nan', np.nan)
    dataframe[keyword] = labels

    # One indicator column per distinct target name, first occurrence first.
    columns = list(dict.fromkeys(mapping.values()))
    column_position = {name: j for j, name in enumerate(columns)}

    # Fill the indicators by row *position*, so the result is correct for
    # any index (non-default, unordered or with repeated labels).
    flags = np.zeros((len(dataframe), len(columns)), dtype=np.int64)
    for i, categories in enumerate(labels):
        if pd.isna(categories):
            continue
        for category in categories.split(','):
            flags[i, column_position[mapping[category.strip()]]] = 1

    dataframe_dummies = pd.DataFrame(flags, index=dataframe.index,
                                     columns=columns)

    # Combine the original DataFrame with the dummy variables
    dataframe_final = pd.concat([dataframe, dataframe_dummies], axis=1)

    return dataframe_final