scmkl.dataframes

  1import os
  2import re
  3import numpy as np
  4import pandas as pd
  5
  6
  7def _parse_result_type(results : dict | None, rfiles : dict | None) -> bool:
  8    '''
  9    This function simply returns a bool for whether or not there are 
 10    multiple runs present while checking that There is one dict and 
 11    one Nonetype between `results` and `rfiles`.
 12    '''
 13    dtypes = (type(results), type(rfiles))
 14    none_in_dtypes = type(None) in dtypes
 15    dict_in_dtypes = dict in dtypes
 16    both_in_dtypes = none_in_dtypes and dict_in_dtypes
 17
 18    # Ensuring that at least one of dtypes is None
 19    assert both_in_dtypes, "Only `rfiles` or `results` can be provided"
 20
 21    if type(rfiles) is dict:
 22        mult_files = True
 23    else:
 24        mult_files = False
 25
 26    return mult_files
 27
 28
 29def _parse_metrics(results, key : str | None = None, 
 30                   include_as = False) -> pd.DataFrame:
 31    '''
 32    This function returns a pd.DataFrame for a single scMKL result.
 33    '''
 34    alpha_vals = []
 35    met_names = []
 36    met_vals = []
 37
 38    # If statement ensuring results is a scMKL results with metrics
 39    if 'Metrics' in results.keys():
 40        for alpha in results['Metrics'].keys():
 41            for metric, value in results['Metrics'][alpha].items():
 42                alpha_vals.append(alpha)
 43                met_names.append(metric)
 44                met_vals.append(value)
 45
 46    # Fix this for include_as parameter
 47    else:
 48        print(f"{key} is not a scMKL result and will be ignored.")
 49            
 50    df = pd.DataFrame({'Alpha' : alpha_vals,
 51                       'Metric' : met_names,
 52                       'Value' : met_vals})
 53    
 54    if include_as:
 55        assert 'Alpha_star' in results.keys(), "'Alpha_star' not in results"
 56        df['Alpha Star'] = df['Alpha'] == results['Alpha_star']
 57
 58    if key is not None:
 59        df['Key'] = [key] * df.shape[0]
 60
 61    return df        
 62
 63
 64def _parse_weights(results : dict, include_as : bool = False, 
 65                   key : None | str = None) -> pd.DataFrame:
 66    '''
 67    '''
 68    alpha_vals = []
 69    group_names = []
 70    kernel_weights = []
 71
 72    for alpha in results['Norms'].keys():
 73        alpha_vals.extend([alpha] * len(results['Norms'][alpha]))
 74        group_names.extend(results['Group_names'])
 75        kernel_weights.extend(results['Norms'][alpha])
 76
 77    df = pd.DataFrame({'Alpha' : alpha_vals, 
 78                       'Group' : group_names, 
 79                       'Kernel Weight' : kernel_weights})
 80    
 81    if include_as:
 82        df['Alpha Star'] = df['Alpha'] == results['Alpha_star'] 
 83
 84    if key is not None:
 85        df['Key'] = [key] * df.shape[0]
 86
 87    return df
 88
 89
 90def get_summary(results : dict, metric = 'AUROC'):
 91    '''
 92    Takes the results from either `scmkl.run()` and generates a 
 93    dataframe for each model containing columns for alpha, area under 
 94    the ROC, number of groups with nonzero weights, and highest 
 95    weighted group.
 96
 97    Parameters
 98    ----------
 99    **results** : *dict*
100        > A dictionary of results from scMKL generated from either 
101        `scmkl.run()`.
102
103    **metric** : *str*
104        > Which metric to include in the summary. Default is AUROC. 
105        Options include `'AUROC'`, `'Recall'`, `'Precision'`, 
106        `'Accuracy'`, and `'F1-Score'`.
107
108    Returns
109    -------
110    **summary_df** : *pd.DataFrame*
111        > A table with columns:
112        `['Alpha', 'AUROC', 'Number of Selected Groups', 'Top Group']`.
113    
114    Examples
115    --------
116    >>> results = scmkl.run(adata, alpha_list)
117    >>> summary_df = scmkl.get_summary(results)
118    ...
119    >>> summary_df.head()
120        Alpha   AUROC  Number of Selected Groups 
121    0   2.20  0.8600                          3   
122    1   1.96  0.9123                          4   
123    2   1.72  0.9357                          5   
124    3   1.48  0.9524                          7   
125    4   1.24  0.9666                          9   
126        Top Group
127    0   RNA-HALLMARK_E2F_TARGETS
128    1   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
129    2   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
130    3   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
131    4   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
132    '''
133    summary = {'Alpha' : [],
134                'AUROC' : [],
135                'Number of Selected Groups' : [],
136                'Top Group' : []}
137    
138    alpha_list = list(results['Metrics'].keys())
139
140    # Creating summary DataFrame for each model
141    for alpha in alpha_list:
142        cur_alpha_rows = results['Norms'][alpha]
143        top_weight_rows = np.max(results['Norms'][alpha])
144        top_group_index = np.where(cur_alpha_rows == top_weight_rows)
145        num_selected = len(results['Selected_groups'][alpha])
146        top_group_names = np.array(results['Group_names'])[top_group_index]
147
148        summary['Alpha'].append(alpha)
149        summary[metric].append(results['Metrics'][alpha][metric])
150        summary['Number of Selected Groups'].append(num_selected)
151        summary['Top Group'].append(*top_group_names)
152    
153    summary = pd.DataFrame(summary)
154
155    return summary
156
157
158def read_files(dir : str, pattern : str | None = None) -> dict:
159    '''
160    This function takes a directory of scMKL results as pickle files 
161    and returns a dictionary with the file names as keys and the data 
162    from the respective files as the values.
163
164    Parameters
165    ----------
166    **dir** : *str*
167        > A string specifying the file path for the output scMKL runs.
168
169    **pattern** : *str*
170        > A regex string for filtering down to desired files. If 
171        `None`, all files in the directory with the pickle file 
172        extension will be added to the dictionary.
173
174    Returns
175    -------
176    **results** : *dict*
177        > a dictionary with the file names as keys and data as values.
178
179    Examples
180    --------
181    >>> filepath = 'scMKL_results/rna+atac/'
182    ...
183    >>> all_results = scmkl.read_files(filepath)
184    >>> all_results.keys()
185    dict_keys(['Rep_1.pkl', Rep_2.pkl, Rep_3.pkl, ...])
186    '''
187    # Reading all pickle files in patter is None
188    if pattern is None:
189        data = {file : np.load(f'{dir}/{file}', allow_pickle = True)
190                 for file in os.listdir(dir) if '.pkl' in file}
191    
192    # Reading only files matching pattern if not None
193    else:
194        pattern = repr(pattern)
195        data = {file : np.load(f'{dir}/{file}', allow_pickle = True)
196                 for file in os.listdir(dir) 
197                 if re.fullmatch(pattern, file) is not None}
198        
199    return data
200
201
def get_metrics(results : dict | None = None, rfiles : dict | None = None, 
                include_as : bool = False) -> pd.DataFrame:
    '''
    Takes either a single scMKL result or a dictionary where each 
    entry corresponds to one result. Returns a dataframe with cols 
    ['Alpha', 'Metric', 'Value']. If `include_as == True`, another 
    col of booleans will be added to indicate whether or not the run 
    respective to that alpha was chosen as optimal via CV. When 
    `rfiles` is given, a `'Key'` col is added with the key of the 
    result each row came from.

    Parameters
    ----------
    **results** : *None* | *dict*
        > A dictionary with the results of a single run from 
        `scmkl.run()`. Must be `None` if `rfiles is not None`.

    **rfiles** : *None* | *dict*
        > A dictionary of results dictionaries containing multiple 
        results from `scmkl.run()`. Must be `None` if 
        `results is not None`.

    **include_as** : *bool*
        > When `True`, will add a bool col to output pd.DataFrame 
        where rows with alphas corresponding to alpha_star will be 
        `True`.

    Returns
    -------
    **df** : *pd.DataFrame*
        > A pd.DataFrame containing all of the metrics present from 
        the runs input. With `rfiles`, the row index restarts for 
        each result.

    Examples
    --------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> metrics = scmkl.get_metrics(results = results)
    >>>
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> metrics = scmkl.get_metrics(rfiles = rfiles)
    '''
    # Checking which data is being worked with 
    multi_results = _parse_result_type(results = results, rfiles = rfiles)

    if not multi_results:
        return _parse_metrics(results = results, include_as = include_as)

    # Columns of the output, used when there is nothing to concatenate
    cols = ['Alpha', 'Metric', 'Value']
    if include_as:
        cols.append('Alpha Star')
    cols.append('Key')

    frames = [_parse_metrics(results = result, key = key, 
                             include_as = include_as)
              for key, result in rfiles.items()]

    # Empty frames (ignored entries) are dropped so they cannot 
    # influence the dtypes of the concatenated result
    frames = [frame for frame in frames if not frame.empty]

    if not frames:
        return pd.DataFrame(columns = cols)

    # Concatenating once instead of growing a dataframe in the loop
    return pd.concat(frames)
269
270
def get_weights(results : dict | None = None, rfiles : dict | None = None, 
                include_as : bool = False) -> pd.DataFrame:
    '''
    Takes either a single scMKL result or dictionary of results and 
    returns a pd.DataFrame with cols ['Alpha', 'Group', 
    'Kernel Weight']. If include_as == True, another col will be 
    added to indicate whether or not the run respective to that alpha 
    was chosen as optimal via CV. When `rfiles` is given, a `'Key'` 
    col is added with the key of the result each row came from.

    Parameters
    ----------
    **results** : *None* | *dict*
        > A dictionary with the results of a single run from 
        `scmkl.run()`. Must be `None` if `rfiles is not None`.

    **rfiles** : *None* | *dict*
        > A dictionary of results dictionaries containing multiple 
        results from `scmkl.run()`. Must be `None` if 
        `results is not None`.

    **include_as** : *bool*
        > When `True`, will add a bool col to output pd.DataFrame 
        where rows with alphas corresponding to alpha_star will be 
        `True`.

    Returns
    -------
    **df** : *pd.DataFrame*
        > A pd.DataFrame containing all of the groups from each alpha 
        and their corresponding kernel weights. With `rfiles`, the 
        row index restarts for each result.

    Examples
    --------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> weights = scmkl.get_weights(results = results)
    >>>
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> weights = scmkl.get_weights(rfiles = rfiles)
    '''
    # Checking which data is being worked with 
    multi_results = _parse_result_type(results = results, rfiles = rfiles)

    if not multi_results:
        # A single result is parsed for weights, not for metrics
        return _parse_weights(results = results, include_as = include_as)

    # Columns of the output, used when there is nothing to concatenate
    cols = ['Alpha', 'Group', 'Kernel Weight']
    if include_as:
        cols.append('Alpha Star')
    cols.append('Key')

    frames = [_parse_weights(results = result, key = key, 
                             include_as = include_as)
              for key, result in rfiles.items()]

    # Empty frames are dropped so they cannot influence the dtypes of 
    # the concatenated result
    frames = [frame for frame in frames if not frame.empty]

    if not frames:
        return pd.DataFrame(columns = cols)

    # Concatenating once instead of growing a dataframe in the loop
    return pd.concat(frames)
335
336
def get_selection(weights_df, order_groups : bool = False) -> pd.DataFrame:
    '''
    This function takes a pd.DataFrame created by 
    `scmkl.get_weights()` and returns a selection table. Selection 
    refers to how many times a group had a kernel weight greater 
    than zero. To calculate this, each row is flagged as selected or 
    not, the rows are grouped by alpha and group, and the flags are 
    summed. The input dataframe is left unchanged.

    Parameters
    ----------
    **weights_df** : *pd.DataFrame*
        > A dataframe output by `scmkl.get_weights()` with cols
        `['Alpha', 'Group', 'Kernel Weight']`.

    **order_groups** : *bool*
        > If `True`, the `'Group'` col of the output dataframe will be 
        made into a `pd.Categorical` col whose categories are listed 
        by number of times each group was selected in descending 
        order. Default is `False`.

    Returns
    -------
    **df** : *pd.DataFrame*
        > A dataframe with cols `['Alpha', 'Group', 'Selection']`.

    Example
    -------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> weights = scmkl.get_weights(results = results)
    >>> selection = scmkl.get_selection(weights)
    >>>
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> weights = scmkl.get_weights(rfiles = rfiles)
    >>> selection = scmkl.get_selection(weights)
    '''
    # Flagging rows with a positive weight; the values are passed 
    # positionally so a repeated row index cannot misalign them
    selected = (weights_df['Kernel Weight'] > 0).to_numpy()

    # assign() works on a copy so the caller's dataframe is not 
    # modified; summing the flags gives the selection counts
    df = (weights_df.assign(Selection = selected)
                    .groupby(['Alpha', 'Group'])['Selection']
                    .sum()
                    .reset_index())

    # Getting group order
    if order_groups:
        order = df.groupby('Group')['Selection'].sum()
        order = order.reset_index().sort_values(by = 'Selection', 
                                                ascending = False)
        df['Group'] = pd.Categorical(df['Group'], 
                                     categories = order['Group'])

    return df
394
395
def mean_groups_per_alpha(selection_df) -> dict:
    '''
    Takes a pd.DataFrame from `scmkl.get_selection()` and returns a 
    dictionary keyed by each unique alpha in the dataframe. The 
    value for an alpha is the mean of the `'Selection'` col over the 
    rows with that alpha.

    Parameters
    ----------
    **selection_df** : *pd.DataFrame*
        > A dataframe output by `scmkl.get_selection()` with cols 
        `['Alpha', 'Group', 'Selection']`.
    
    Returns
    -------
    **mean_groups** : *dict*
        > A dictionary with alphas as keys and the mean of 
        `'Selection'` for that alpha as values.

    Examples
    --------
    >>> weights = scmkl.get_weights(rfiles = rfiles)
    >>> selection = scmkl.get_selection(weights)
    >>> mean_groups = scmkl.mean_groups_per_alpha(selection)
    >>> mean_groups = {alpha : np.round(num_selected, 1)
    ...                for alpha, num_selected in mean_groups.items()}
    >>>
    >>> print(mean_groups)
    {0.05 : 50.0, 0.2 : 24.7, 1.1 : 5.3}
    '''
    alpha_col = selection_df['Alpha']

    # One entry per unique alpha, averaging that alpha's rows
    return {alpha : np.mean(selection_df.loc[alpha_col == alpha, 
                                             'Selection'])
            for alpha in np.unique(alpha_col)}
def get_summary(results: dict, metric='AUROC'):
 91def get_summary(results : dict, metric = 'AUROC'):
 92    '''
 93    Takes the results from either `scmkl.run()` and generates a 
 94    dataframe for each model containing columns for alpha, area under 
 95    the ROC, number of groups with nonzero weights, and highest 
 96    weighted group.
 97
 98    Parameters
 99    ----------
100    **results** : *dict*
101        > A dictionary of results from scMKL generated from either 
102        `scmkl.run()`.
103
104    **metric** : *str*
105        > Which metric to include in the summary. Default is AUROC. 
106        Options include `'AUROC'`, `'Recall'`, `'Precision'`, 
107        `'Accuracy'`, and `'F1-Score'`.
108
109    Returns
110    -------
111    **summary_df** : *pd.DataFrame*
112        > A table with columns:
113        `['Alpha', 'AUROC', 'Number of Selected Groups', 'Top Group']`.
114    
115    Examples
116    --------
117    >>> results = scmkl.run(adata, alpha_list)
118    >>> summary_df = scmkl.get_summary(results)
119    ...
120    >>> summary_df.head()
121        Alpha   AUROC  Number of Selected Groups 
122    0   2.20  0.8600                          3   
123    1   1.96  0.9123                          4   
124    2   1.72  0.9357                          5   
125    3   1.48  0.9524                          7   
126    4   1.24  0.9666                          9   
127        Top Group
128    0   RNA-HALLMARK_E2F_TARGETS
129    1   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
130    2   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
131    3   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
132    4   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
133    '''
134    summary = {'Alpha' : [],
135                'AUROC' : [],
136                'Number of Selected Groups' : [],
137                'Top Group' : []}
138    
139    alpha_list = list(results['Metrics'].keys())
140
141    # Creating summary DataFrame for each model
142    for alpha in alpha_list:
143        cur_alpha_rows = results['Norms'][alpha]
144        top_weight_rows = np.max(results['Norms'][alpha])
145        top_group_index = np.where(cur_alpha_rows == top_weight_rows)
146        num_selected = len(results['Selected_groups'][alpha])
147        top_group_names = np.array(results['Group_names'])[top_group_index]
148
149        summary['Alpha'].append(alpha)
150        summary[metric].append(results['Metrics'][alpha][metric])
151        summary['Number of Selected Groups'].append(num_selected)
152        summary['Top Group'].append(*top_group_names)
153    
154    summary = pd.DataFrame(summary)
155
156    return summary

Takes the results from scmkl.run and generates a dataframe with a row for each model containing columns for alpha, area under the ROC, number of groups with nonzero weights, and highest weighted group.

Parameters

results : dict

A dictionary of results from scMKL generated from scmkl.run.

metric : str

Which metric to include in the summary. Default is AUROC. Options include 'AUROC', 'Recall', 'Precision', 'Accuracy', and 'F1-Score'.

Returns

summary_df : pd.DataFrame

A table with columns: ['Alpha', 'AUROC', 'Number of Selected Groups', 'Top Group'].

Examples

>>> results = scmkl.run(adata, alpha_list)
>>> summary_df = scmkl.get_summary(results)
...
>>> summary_df.head()
    Alpha   AUROC  Number of Selected Groups 
0   2.20  0.8600                          3   
1   1.96  0.9123                          4   
2   1.72  0.9357                          5   
3   1.48  0.9524                          7   
4   1.24  0.9666                          9   
    Top Group
0   RNA-HALLMARK_E2F_TARGETS
1   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
2   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
3   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
4   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
def read_files(dir: str, pattern: str | None = None) -> dict:
159def read_files(dir : str, pattern : str | None = None) -> dict:
160    '''
161    This function takes a directory of scMKL results as pickle files 
162    and returns a dictionary with the file names as keys and the data 
163    from the respective files as the values.
164
165    Parameters
166    ----------
167    **dir** : *str*
168        > A string specifying the file path for the output scMKL runs.
169
170    **pattern** : *str*
171        > A regex string for filtering down to desired files. If 
172        `None`, all files in the directory with the pickle file 
173        extension will be added to the dictionary.
174
175    Returns
176    -------
177    **results** : *dict*
178        > a dictionary with the file names as keys and data as values.
179
180    Examples
181    --------
182    >>> filepath = 'scMKL_results/rna+atac/'
183    ...
184    >>> all_results = scmkl.read_files(filepath)
185    >>> all_results.keys()
186    dict_keys(['Rep_1.pkl', Rep_2.pkl, Rep_3.pkl, ...])
187    '''
188    # Reading all pickle files in patter is None
189    if pattern is None:
190        data = {file : np.load(f'{dir}/{file}', allow_pickle = True)
191                 for file in os.listdir(dir) if '.pkl' in file}
192    
193    # Reading only files matching pattern if not None
194    else:
195        pattern = repr(pattern)
196        data = {file : np.load(f'{dir}/{file}', allow_pickle = True)
197                 for file in os.listdir(dir) 
198                 if re.fullmatch(pattern, file) is not None}
199        
200    return data

This function takes a directory of scMKL results as pickle files and returns a dictionary with the file names as keys and the data from the respective files as the values.

Parameters

dir : str

A string specifying the file path for the output scMKL runs.

pattern : str

A regex string for filtering down to desired files. If None, all files in the directory with the pickle file extension will be added to the dictionary.

Returns

results : dict

a dictionary with the file names as keys and data as values.

Examples

>>> filepath = 'scMKL_results/rna+atac/'
...
>>> all_results = scmkl.read_files(filepath)
>>> all_results.keys()
dict_keys(['Rep_1.pkl', Rep_2.pkl, Rep_3.pkl, ...])
def get_metrics( results: dict | None = None, rfiles: dict | None = None, include_as: bool = False) -> pandas.core.frame.DataFrame:
def get_metrics(results : dict | None = None, rfiles : dict | None = None, 
                include_as : bool = False) -> pd.DataFrame:
    '''
    Takes either a single scMKL result or a dictionary where each 
    entry corresponds to one result. Returns a dataframe with cols 
    ['Alpha', 'Metric', 'Value']. If `include_as == True`, another 
    col of booleans will be added to indicate whether or not the run 
    respective to that alpha was chosen as optimal via CV. When 
    `rfiles` is given, a `'Key'` col is added with the key of the 
    result each row came from.

    Parameters
    ----------
    **results** : *None* | *dict*
        > A dictionary with the results of a single run from 
        `scmkl.run()`. Must be `None` if `rfiles is not None`.

    **rfiles** : *None* | *dict*
        > A dictionary of results dictionaries containing multiple 
        results from `scmkl.run()`. Must be `None` if 
        `results is not None`.

    **include_as** : *bool*
        > When `True`, will add a bool col to output pd.DataFrame 
        where rows with alphas corresponding to alpha_star will be 
        `True`.

    Returns
    -------
    **df** : *pd.DataFrame*
        > A pd.DataFrame containing all of the metrics present from 
        the runs input. With `rfiles`, the row index restarts for 
        each result.

    Examples
    --------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> metrics = scmkl.get_metrics(results = results)
    >>>
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> metrics = scmkl.get_metrics(rfiles = rfiles)
    '''
    # Checking which data is being worked with 
    multi_results = _parse_result_type(results = results, rfiles = rfiles)

    if not multi_results:
        return _parse_metrics(results = results, include_as = include_as)

    # Columns of the output, used when there is nothing to concatenate
    cols = ['Alpha', 'Metric', 'Value']
    if include_as:
        cols.append('Alpha Star')
    cols.append('Key')

    frames = [_parse_metrics(results = result, key = key, 
                             include_as = include_as)
              for key, result in rfiles.items()]

    # Empty frames (ignored entries) are dropped so they cannot 
    # influence the dtypes of the concatenated result
    frames = [frame for frame in frames if not frame.empty]

    if not frames:
        return pd.DataFrame(columns = cols)

    # Concatenating once instead of growing a dataframe in the loop
    return pd.concat(frames)

Takes either a single scMKL result or a dictionary where each entry corresponds to one result. Returns a dataframe with cols ['Alpha', 'Metric', 'Value']. If include_as == True, another col of booleans will be added to indicate whether or not the run respective to that alpha was chosen as optimal via CV. When multiple results are given through rfiles, a 'Key' column will be added with the name of the key to the respective result.

Parameters

results : None | dict

A dictionary with the results of a single run from scmkl.run. Must be None if rfiles is not None.

rfiles : None | dict

A dictionary of results dictionaries containing multiple results from scmkl.run. A 'Key' col will be added to the output pd.DataFrame with the keys as values corresponding to each row.

include_as : bool

When True, will add a bool col to output pd.DataFrame where rows with alphas corresponding to alpha_star will be True.

Returns

df : pd.DataFrame

A pd.DataFrame containing all of the metrics present from the runs input.

Examples

>>> # For a single file
>>> results = scmkl.run(adata)
>>> metrics = scmkl.get_metrics(results = results)
>>> # For multiple runs saved in a dict
>>> output_dir = 'scMKL_outputs/'
>>> rfiles = scmkl.read_files(output_dir)
>>> metrics = scmkl.get_metrics(rfiles = rfiles)
def get_weights( results: dict | None = None, rfiles: dict | None = None, include_as: bool = False) -> pandas.core.frame.DataFrame:
def get_weights(results : dict | None = None, rfiles : dict | None = None, 
                include_as : bool = False) -> pd.DataFrame:
    '''
    Takes either a single scMKL result or dictionary of results and 
    returns a pd.DataFrame with cols ['Alpha', 'Group', 
    'Kernel Weight']. If include_as == True, another col will be 
    added to indicate whether or not the run respective to that alpha 
    was chosen as optimal via CV. When `rfiles` is given, a `'Key'` 
    col is added with the key of the result each row came from.

    Parameters
    ----------
    **results** : *None* | *dict*
        > A dictionary with the results of a single run from 
        `scmkl.run()`. Must be `None` if `rfiles is not None`.

    **rfiles** : *None* | *dict*
        > A dictionary of results dictionaries containing multiple 
        results from `scmkl.run()`. Must be `None` if 
        `results is not None`.

    **include_as** : *bool*
        > When `True`, will add a bool col to output pd.DataFrame 
        where rows with alphas corresponding to alpha_star will be 
        `True`.

    Returns
    -------
    **df** : *pd.DataFrame*
        > A pd.DataFrame containing all of the groups from each alpha 
        and their corresponding kernel weights. With `rfiles`, the 
        row index restarts for each result.

    Examples
    --------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> weights = scmkl.get_weights(results = results)
    >>>
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> weights = scmkl.get_weights(rfiles = rfiles)
    '''
    # Checking which data is being worked with 
    multi_results = _parse_result_type(results = results, rfiles = rfiles)

    if not multi_results:
        # A single result is parsed for weights, not for metrics
        return _parse_weights(results = results, include_as = include_as)

    # Columns of the output, used when there is nothing to concatenate
    cols = ['Alpha', 'Group', 'Kernel Weight']
    if include_as:
        cols.append('Alpha Star')
    cols.append('Key')

    frames = [_parse_weights(results = result, key = key, 
                             include_as = include_as)
              for key, result in rfiles.items()]

    # Empty frames are dropped so they cannot influence the dtypes of 
    # the concatenated result
    frames = [frame for frame in frames if not frame.empty]

    if not frames:
        return pd.DataFrame(columns = cols)

    # Concatenating once instead of growing a dataframe in the loop
    return pd.concat(frames)

Takes either a single scMKL result or dictionary of results and returns a pd.DataFrame with cols ['Alpha', 'Group', 'Kernel Weight']. If include_as == True, a fourth col will be added to indicate whether or not the run respective to that alpha was chosen as optimal via CV.

Parameters

results : None | dict

A dictionary with the results of a single run from scmkl.run. Must be None if rfiles is not None.

rfiles : None | dict

A dictionary of results dictionaries containing multiple results from scmkl.run. A 'Key' col will be added to the output pd.DataFrame with the keys as values corresponding to each row.

include_as : bool

When True, will add a bool col to output pd.DataFrame where rows with alphas corresponding to alpha_star will be True.

Returns

df : pd.DataFrame

A pd.DataFrame containing all of the groups from each alpha and their corresponding kernel weights.

Examples

>>> # For a single file
>>> results = scmkl.run(adata)
>>> weights = scmkl.get_weights(results = results)
>>>
>>> # For multiple runs saved in a dict
>>> output_dir = 'scMKL_outputs/'
>>> rfiles = scmkl.read_files(output_dir)
>>> weights = scmkl.get_weights(rfiles = rfiles)
def get_selection(weights_df, order_groups: bool) -> pandas.core.frame.DataFrame:
def get_selection(weights_df, order_groups : bool = False) -> pd.DataFrame:
    '''
    This function takes a pd.DataFrame created by 
    `scmkl.get_weights()` and returns a selection table. Selection 
    refers to how many times a group had a kernel weight greater 
    than zero. To calculate this, each row is flagged as selected or 
    not, the rows are grouped by alpha and group, and the flags are 
    summed. The input dataframe is left unchanged.

    Parameters
    ----------
    **weights_df** : *pd.DataFrame*
        > A dataframe output by `scmkl.get_weights()` with cols
        `['Alpha', 'Group', 'Kernel Weight']`.

    **order_groups** : *bool*
        > If `True`, the `'Group'` col of the output dataframe will be 
        made into a `pd.Categorical` col whose categories are listed 
        by number of times each group was selected in descending 
        order. Default is `False`.

    Returns
    -------
    **df** : *pd.DataFrame*
        > A dataframe with cols `['Alpha', 'Group', 'Selection']`.

    Example
    -------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> weights = scmkl.get_weights(results = results)
    >>> selection = scmkl.get_selection(weights)
    >>>
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> weights = scmkl.get_weights(rfiles = rfiles)
    >>> selection = scmkl.get_selection(weights)
    '''
    # Flagging rows with a positive weight; the values are passed 
    # positionally so a repeated row index cannot misalign them
    selected = (weights_df['Kernel Weight'] > 0).to_numpy()

    # assign() works on a copy so the caller's dataframe is not 
    # modified; summing the flags gives the selection counts
    df = (weights_df.assign(Selection = selected)
                    .groupby(['Alpha', 'Group'])['Selection']
                    .sum()
                    .reset_index())

    # Getting group order
    if order_groups:
        order = df.groupby('Group')['Selection'].sum()
        order = order.reset_index().sort_values(by = 'Selection', 
                                                ascending = False)
        df['Group'] = pd.Categorical(df['Group'], 
                                     categories = order['Group'])

    return df

This function takes a pd.DataFrame created by scmkl.get_weights() and returns a selection table. Selection refers to how many times a group had a nonzero group weight. To calculate this, a col is added indicating whether the group was selected. Then, the dataframe is grouped by alpha and group. Selection can then be summed returning a dataframe with cols ['Alpha', 'Group', Selection].

Parameters

weights_df : pd.DataFrame

A dataframe output by scmkl.get_weights() with cols ['Alpha', 'Group', 'Kernel Weight'].

order_groups : bool

If True, the 'Group' col of the output dataframe will be made into a pd.Categorical col ordered by number of times each group was selected in descending order.

Returns

df : pd.DataFrame

A dataframe with cols ['Alpha', 'Group', Selection].

Example

>>> # For a single file
>>> results = scmkl.run(adata)
>>> weights = scmkl.get_weights(results = results)
>>> selection = scmkl.get_selection(weights)
>>>
>>> # For multiple runs saved in a dict
>>> output_dir = 'scMKL_outputs/'
>>> rfiles = scmkl.read_files(output_dir)
>>> weights = scmkl.get_weights(rfiles = rfiles)
>>> selection = scmkl.get_selection(weights)
def mean_groups_per_alpha(selection_df) -> dict:
def mean_groups_per_alpha(selection_df) -> dict:
    '''
    Takes a pd.DataFrame from `scmkl.get_selection()` and returns a 
    dictionary keyed by each unique alpha in the dataframe. The 
    value for an alpha is the mean of the `'Selection'` col over the 
    rows with that alpha.

    Parameters
    ----------
    **selection_df** : *pd.DataFrame*
        > A dataframe output by `scmkl.get_selection()` with cols 
        `['Alpha', 'Group', 'Selection']`.
    
    Returns
    -------
    **mean_groups** : *dict*
        > A dictionary with alphas as keys and the mean of 
        `'Selection'` for that alpha as values.

    Examples
    --------
    >>> weights = scmkl.get_weights(rfiles = rfiles)
    >>> selection = scmkl.get_selection(weights)
    >>> mean_groups = scmkl.mean_groups_per_alpha(selection)
    >>> mean_groups = {alpha : np.round(num_selected, 1)
    ...                for alpha, num_selected in mean_groups.items()}
    >>>
    >>> print(mean_groups)
    {0.05 : 50.0, 0.2 : 24.7, 1.1 : 5.3}
    '''
    alpha_col = selection_df['Alpha']

    # One entry per unique alpha, averaging that alpha's rows
    return {alpha : np.mean(selection_df.loc[alpha_col == alpha, 
                                             'Selection'])
            for alpha in np.unique(alpha_col)}

This function takes a pd.DataFrame from scmkl.get_selection() generated from multiple scMKL results and returns a dictionary with keys being alphas from the input dataframe and values being the mean number of selected groups for a given alpha across results.

Parameters

selection_df : pd.DataFrame

A dataframe output by scmkl.get_selection() with cols ['Alpha', 'Group', 'Selection'].

Returns

mean_groups : dict

A dictionary with alphas as keys and the mean number of selected groups for that alpha as values.

Examples

>>> weights = scmkl.get_weights(rfiles = rfiles)
>>> selection = scmkl.get_selection(weights)
>>> mean_groups = scmkl.mean_groups_per_alpha(selection)
>>> mean_groups = {alpha : np.round(num_selected, 1)
...                for alpha, num_selected in mean_groups.items()}
>>>
>>> print(mean_groups)
{0.05 : 50.0, 0.2 : 24.7, 1.1 : 5.3}