scmkl.dataframes

  1import os
  2import re
  3import numpy as np
  4import pandas as pd
  5
  6
  7def _parse_result_type(results : dict | None, rfiles : dict | None) -> bool:
  8    '''
  9    This function simply returns a bool for whether or not there are 
 10    multiple runs present while checking that There is one dict and 
 11    one Nonetype between `results` and `rfiles`.
 12    '''
 13    dtypes = (type(results), type(rfiles))
 14    none_in_dtypes = type(None) in dtypes
 15    dict_in_dtypes = dict in dtypes
 16    both_in_dtypes = none_in_dtypes and dict_in_dtypes
 17
 18    # Ensuring that at least one of dtypes is None
 19    assert both_in_dtypes, "Only `rfiles` or `results` can be provided"
 20
 21    if type(rfiles) is dict:
 22        mult_files = True
 23    else:
 24        mult_files = False
 25
 26    return mult_files
 27
 28
 29def _parse_metrics(results, key : str | None = None, 
 30                   include_as = False) -> pd.DataFrame:
 31    '''
 32    This function returns a pd.DataFrame for a single scMKL result.
 33    '''
 34    alpha_vals = []
 35    met_names = []
 36    met_vals = []
 37
 38    # If statement ensuring results is a scMKL results with metrics
 39    if 'Metrics' in results.keys():
 40        for alpha in results['Metrics'].keys():
 41            for metric, value in results['Metrics'][alpha].items():
 42                alpha_vals.append(alpha)
 43                met_names.append(metric)
 44                met_vals.append(value)
 45
 46        if include_as:
 47            assert 'Alpha_star' in results.keys(), "'Alpha_star' not in results"
 48            df['Alpha Star'] = df['Alpha'] == results['Alpha_star']
 49
 50    else:
 51        print(f"{key} is not a scMKL result and will be ignored.")
 52            
 53    df = pd.DataFrame({'Alpha' : alpha_vals,
 54                       'Metric' : met_names,
 55                       'Value' : met_vals})
 56
 57    if key is not None:
 58        df['Key'] = [key] * df.shape[0]
 59
 60    return df            
 61
 62
 63def _parse_weights(results : dict, include_as : bool = False, 
 64                   key : None | str = None) -> pd.DataFrame:
 65    '''
 66    '''
 67    alpha_vals = []
 68    group_names = []
 69    kernel_weights = []
 70
 71    for alpha in results['Norms'].keys():
 72        alpha_vals.extend([alpha] * len(results['Norms'][alpha]))
 73        group_names.extend(results['Group_names'])
 74        kernel_weights.extend(results['Norms'][alpha])
 75
 76    df = pd.DataFrame({'Alpha' : alpha_vals, 
 77                       'Group' : group_names, 
 78                       'Kernel Weight' : kernel_weights})
 79    
 80    if include_as:
 81        df['Alpha Star'] = df['Alpha'] == results['Alpha_star'] 
 82
 83    if key is not None:
 84        df['Key'] = [key] * df.shape[0]
 85
 86    return df
 87
 88
 89def get_summary(results : dict, metric = 'AUROC'):
 90    '''
 91    Takes the results from either `scmkl.run()` and generates a 
 92    dataframe for each model containing columns for alpha, area under 
 93    the ROC, number of groups with nonzero weights, and highest 
 94    weighted group.
 95
 96    Parameters
 97    ----------
 98    **results** : *dict*
 99        > A dictionary of results from scMKL generated from either 
100        `scmkl.run()`.
101
102    **metric** : *str*
103        > Which metric to include in the summary. Default is AUROC. 
104        Options include `'AUROC'`, `'Recall'`, `'Precision'`, 
105        `'Accuracy'`, and `'F1-Score'`.
106
107    Returns
108    -------
109    **summary_df** : *pd.DataFrame*
110        > A table with columns:
111        `['Alpha', 'AUROC', 'Number of Selected Groups', 'Top Group']`.
112    
113    Examples
114    --------
115    >>> results = scmkl.run(adata, alpha_list)
116    >>> summary_df = scmkl.get_summary(results)
117    ...
118    >>> summary_df.head()
119        Alpha   AUROC  Number of Selected Groups 
120    0   2.20  0.8600                          3   
121    1   1.96  0.9123                          4   
122    2   1.72  0.9357                          5   
123    3   1.48  0.9524                          7   
124    4   1.24  0.9666                          9   
125        Top Group
126    0   RNA-HALLMARK_E2F_TARGETS
127    1   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
128    2   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
129    3   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
130    4   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
131    '''
132    summary = {'Alpha' : [],
133                'AUROC' : [],
134                'Number of Selected Groups' : [],
135                'Top Group' : []}
136    
137    alpha_list = list(results['Metrics'].keys())
138
139    # Creating summary DataFrame for each model
140    for alpha in alpha_list:
141        cur_alpha_rows = results['Norms'][alpha]
142        top_weight_rows = np.max(results['Norms'][alpha])
143        top_group_index = np.where(cur_alpha_rows == top_weight_rows)
144        num_selected = len(results['Selected_groups'][alpha])
145        top_group_names = np.array(results['Group_names'])[top_group_index]
146
147        summary['Alpha'].append(alpha)
148        summary['AUROC'].append(results['Metrics'][alpha][metric])
149        summary['Number of Selected Groups'].append(num_selected)
150        summary['Top Group'].append(*top_group_names)
151    
152    summary = pd.DataFrame(summary)
153
154    return summary
155
156
157def read_files(dir : str, pattern : str | None = None) -> dict:
158    '''
159    This function takes a directory of scMKL results as pickle files 
160    and returns a dictionary with the file names as keys and the data 
161    from the respective files as the values.
162
163    Parameters
164    ----------
165    **dir** : *str*
166        > A string specifying the file path for the output scMKL runs.
167
168    **pattern** : *str*
169        > A regex string for filtering down to desired files. If 
170        `None`, all files in the directory with the pickle file 
171        extension will be added to the dictionary.
172
173    Returns
174    -------
175    **results** : *dict*
176        > a dictionary with the file names as keys and data as values.
177
178    Examples
179    --------
180    >>> filepath = 'scMKL_results/rna+atac/'
181    ...
182    >>> all_results = scmkl.read_files(filepath)
183    >>> all_results.keys()
184    dict_keys(['Rep_1.pkl', Rep_2.pkl, Rep_3.pkl, ...])
185    '''
186    # Reading all pickle files in patter is None
187    if pattern is None:
188        data = {file : np.load(f'{dir}/{file}', allow_pickle = True)
189                 for file in os.listdir(dir) if '.pkl' in file}
190    
191    # Reading only files matching pattern if not None
192    else:
193        pattern = repr(pattern)
194        data = {file : np.load(f'{dir}/{file}', allow_pickle = True)
195                 for file in os.listdir(dir) 
196                 if re.fullmatch(pattern, file) is not None}
197        
198    return data
199
200
def get_metrics(results : dict | None = None, rfiles : dict | None = None, 
                include_as : bool = False) -> pd.DataFrame:
    '''
    Takes either a single scMKL result or a dictionary where each 
    entry corresponds to one result. Returns a dataframe with cols 
    ['Alpha', 'Metric', 'Value']. If `include_as == True`, another 
    col of booleans will be added to indicate whether or not the run 
    respective to that alpha was chosen as optimal via CV. When 
    `rfiles` is given, a `'Key'` col is added holding the key of the 
    result each row came from.

    Parameters
    ----------
    **results** : *None* | *dict*
        > A dictionary with the results of a single run from 
        `scmkl.run()`. Must be `None` if `rfiles is not None`.

    **rfiles** : *None* | *dict*
        > A dictionary of results dictionaries containing multiple 
        results from `scmkl.run()`. Must be `None` if 
        `results is not None`.

    **include_as** : *bool*
        > When `True`, will add a bool col to output pd.DataFrame 
        where rows with alphas corresponding to alpha_star will be 
        `True`.

    Returns
    -------
    **df** : *pd.DataFrame*
        > A pd.DataFrame containing all of the metrics present from 
        the runs input.

    Examples
    --------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> metrics = scmkl.get_metrics(results = results)

    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> metrics = scmkl.get_metrics(rfiles = rfiles)
    '''
    # Single result: nothing to stack, hand it straight to the parser
    if not _parse_result_type(results = results, rfiles = rfiles):
        return _parse_metrics(results = results, include_as = include_as)

    header = ['Alpha', 'Metric', 'Value']
    if include_as:
        header.append('Alpha Star')
    header.append('Key')

    # Start from an empty frame so the cols exist even with no runs
    df = pd.DataFrame(columns = header)
    for run_key, run_result in rfiles.items():
        run_df = _parse_metrics(results = run_result, key = run_key, 
                                include_as = include_as)
        df = pd.concat([df, run_df])

    return df
268
269
def get_weights(results : dict | None = None, rfiles : dict | None = None, 
                include_as : bool = False) -> pd.DataFrame:
    '''
    Takes either a single scMKL result or dictionary of results and 
    returns a pd.DataFrame with cols ['Alpha', 'Group', 
    'Kernel Weight']. If include_as == True, another col will be 
    added to indicate whether or not the run respective to that alpha 
    was chosen as optimal via CV. When `rfiles` is given, a `'Key'` 
    col is added holding the key of the result each row came from.

    Parameters
    ----------
    **results** : *None* | *dict*
        > A dictionary with the results of a single run from 
        `scmkl.run()`. Must be `None` if `rfiles is not None`.

    **rfiles** : *None* | *dict*
        > A dictionary of results dictionaries containing multiple 
        results from `scmkl.run()`. Must be `None` if 
        `results is not None`.

    **include_as** : *bool*
        > When `True`, will add a bool col to output pd.DataFrame 
        where rows with alphas corresponding to alpha_star will be 
        `True`.

    Returns
    -------
    **df** : *pd.DataFrame*
        > A pd.DataFrame containing all of the groups from each alpha 
        and their corresponding kernel weights.

    Examples
    --------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> weights = scmkl.get_weights(results = results)
    >>>
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> weights = scmkl.get_weights(rfiles = rfiles)
    '''
    # Checking which data is being worked with 
    multi_results = _parse_result_type(results = results, rfiles = rfiles)

    if not multi_results:
        # A single result must go through the weights parser; the 
        # metrics parser returns the wrong table here
        return _parse_weights(results = results, include_as = include_as)

    # Initiating col list with minimal columns
    cols = ['Alpha', 'Group', 'Kernel Weight']

    if include_as:
        cols.append('Alpha Star')

    cols.append('Key')
    df = pd.DataFrame(columns = cols)
    for key, result in rfiles.items():
        cur_df = _parse_weights(results = result, key = key, 
                                include_as = include_as)
        df = pd.concat([df, cur_df])

    return df
334
335
def get_selection(weights_df, order_groups : bool = False) -> pd.DataFrame:
    '''
    This function takes a pd.DataFrame created by 
    `scmkl.get_weights()` and returns a selection table. Selection 
    refers to how many times a group had a kernel weight greater 
    than zero. Rows are flagged as selected, grouped by alpha and 
    group, and the flags are summed. The input dataframe is left 
    unmodified.

    Parameters
    ----------
    **weights_df** : *pd.DataFrame*
        > A dataframe output by `scmkl.get_weights()` with cols
        `['Alpha', 'Group', 'Kernel Weight']`.

    **order_groups** : *bool*
        > If `True`, the `'Group'` col of the output dataframe will be 
        made into a `pd.Categorical` col whose categories are listed 
        by number of times each group was selected, in descending 
        order. Default is `False`.

    Returns
    -------
    **df** : *pd.DataFrame*
        > A dataframe with cols `['Alpha', 'Group', 'Selection']`.

    Example
    -------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> weights = scmkl.get_weights(results = results)
    >>> selection = scmkl.get_selection(weights)
    >>>
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> weights = scmkl.get_weights(rfiles = rfiles)
    >>> selection = scmkl.get_selection(weights, order_groups = True)
    '''
    # assign() returns a new frame, so the caller's dataframe does 
    # not gain a 'Selection' col as a side effect
    flagged = weights_df.assign(Selection = weights_df['Kernel Weight'] > 0)

    # Summing selection across replications to get selection
    df = flagged.groupby(['Alpha', 'Group'])['Selection'].sum()
    df = df.reset_index()

    # Getting group order
    if order_groups:
        totals = df.groupby('Group')['Selection'].sum().reset_index()
        totals = totals.sort_values(by = 'Selection', ascending = False)
        df['Group'] = pd.Categorical(df['Group'], 
                                     categories = totals['Group'])

    return df
393
394
def mean_groups_per_alpha(selection_df) -> dict:
    '''
    This function takes a pd.DataFrame from `scmkl.get_selection()` 
    and returns a dictionary keyed by the alphas present in the 
    input. Each value is the mean of the `'Selection'` col over the 
    rows belonging to that alpha.

    Parameters
    ----------
    **selection_df** : *pd.DataFrame*
        > A dataframe output by `scmkl.get_selection()` with cols 
        `['Alpha', 'Group', 'Selection']`.

    Returns
    -------
    **mean_groups** : *dict*
        > A dictionary with alphas as keys and the mean `'Selection'` 
        for that alpha as values.

    Examples
    --------
    >>> weights = scmkl.get_weights(rfiles = rfiles)
    >>> selection = scmkl.get_selection(weights)
    >>> mean_groups = scmkl.mean_groups_per_alpha(selection)
    >>> mean_groups = {alpha : np.round(num_selected, 1)
    ...                for alpha, num_selected in mean_groups.items()}
    >>>
    >>> print(mean_groups)
    {0.05 : 50.0, 0.2 : 24.7, 1.1 : 5.3}
    '''
    alphas = selection_df['Alpha']

    # One entry per distinct alpha, averaged over that alpha's rows
    return {alpha : np.mean(selection_df.loc[alphas == alpha, 'Selection'])
            for alpha in np.unique(alphas)}
def get_summary(results : dict, metric = 'AUROC'):
    '''
    Takes the results from `scmkl.run()` and generates a dataframe 
    with one row per alpha containing the alpha, the requested 
    metric, the number of selected groups, and the highest weighted 
    group.

    Parameters
    ----------
    **results** : *dict*
        > A dictionary of results generated by `scmkl.run()`. The 
        entries `'Metrics'`, `'Norms'`, `'Selected_groups'`, and 
        `'Group_names'` are read.

    **metric** : *str*
        > Which metric to include in the summary. Default is AUROC. 
        Options include `'AUROC'`, `'Recall'`, `'Precision'`, 
        `'Accuracy'`, and `'F1-Score'`.

    Returns
    -------
    **summary_df** : *pd.DataFrame*
        > A table with columns:
        `['Alpha', 'AUROC', 'Number of Selected Groups', 'Top Group']`.
        The metric column is labelled `'AUROC'` whichever `metric` 
        is requested. When several groups tie for the largest 
        weight, the first one in `results['Group_names']` is 
        reported.

    Examples
    --------
    >>> results = scmkl.run(adata, alpha_list)
    >>> summary_df = scmkl.get_summary(results)
    ...
    >>> summary_df.head()
        Alpha   AUROC  Number of Selected Groups 
    0   2.20  0.8600                          3   
    1   1.96  0.9123                          4   
        Top Group
    0   RNA-HALLMARK_E2F_TARGETS
    1   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    '''
    summary = {'Alpha' : [],
               'AUROC' : [],
               'Number of Selected Groups' : [],
               'Top Group' : []}

    group_names = results['Group_names']

    for alpha, alpha_metrics in results['Metrics'].items():
        # assumes the weights for one alpha form a 1-D sequence 
        # aligned with `Group_names` -- TODO confirm
        weights = np.asarray(results['Norms'][alpha])

        # argmax always yields exactly one index, so tied maxima no 
        # longer break the append below
        top_index = int(np.argmax(weights))

        summary['Alpha'].append(alpha)
        summary['AUROC'].append(alpha_metrics[metric])
        summary['Number of Selected Groups'].append(
            len(results['Selected_groups'][alpha]))
        summary['Top Group'].append(group_names[top_index])

    return pd.DataFrame(summary)

Takes the results from scmkl.run and generates a dataframe with one row per model containing columns for alpha, area under the ROC, number of groups with nonzero weights, and highest weighted group.

Parameters

results : dict

A dictionary of results from scMKL generated by scmkl.run.

metric : str

Which metric to include in the summary. Default is AUROC. Options include 'AUROC', 'Recall', 'Precision', 'Accuracy', and 'F1-Score'.

Returns

summary_df : pd.DataFrame

A table with columns: ['Alpha', 'AUROC', 'Number of Selected Groups', 'Top Group'].

Examples

>>> results = scmkl.run(adata, alpha_list)
>>> summary_df = scmkl.get_summary(results)
...
>>> summary_df.head()
    Alpha   AUROC  Number of Selected Groups 
0   2.20  0.8600                          3   
1   1.96  0.9123                          4   
2   1.72  0.9357                          5   
3   1.48  0.9524                          7   
4   1.24  0.9666                          9   
    Top Group
0   RNA-HALLMARK_E2F_TARGETS
1   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
2   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
3   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
4   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
def read_files(dir: str, pattern: str | None = None) -> dict:
158def read_files(dir : str, pattern : str | None = None) -> dict:
159    '''
160    This function takes a directory of scMKL results as pickle files 
161    and returns a dictionary with the file names as keys and the data 
162    from the respective files as the values.
163
164    Parameters
165    ----------
166    **dir** : *str*
167        > A string specifying the file path for the output scMKL runs.
168
169    **pattern** : *str*
170        > A regex string for filtering down to desired files. If 
171        `None`, all files in the directory with the pickle file 
172        extension will be added to the dictionary.
173
174    Returns
175    -------
176    **results** : *dict*
177        > a dictionary with the file names as keys and data as values.
178
179    Examples
180    --------
181    >>> filepath = 'scMKL_results/rna+atac/'
182    ...
183    >>> all_results = scmkl.read_files(filepath)
184    >>> all_results.keys()
185    dict_keys(['Rep_1.pkl', Rep_2.pkl, Rep_3.pkl, ...])
186    '''
187    # Reading all pickle files in patter is None
188    if pattern is None:
189        data = {file : np.load(f'{dir}/{file}', allow_pickle = True)
190                 for file in os.listdir(dir) if '.pkl' in file}
191    
192    # Reading only files matching pattern if not None
193    else:
194        pattern = repr(pattern)
195        data = {file : np.load(f'{dir}/{file}', allow_pickle = True)
196                 for file in os.listdir(dir) 
197                 if re.fullmatch(pattern, file) is not None}
198        
199    return data

This function takes a directory of scMKL results as pickle files and returns a dictionary with the file names as keys and the data from the respective files as the values.

Parameters

dir : str

A string specifying the file path for the output scMKL runs.

pattern : str

A regex string for filtering down to desired files. If None, all files in the directory with the pickle file extension will be added to the dictionary.

Returns

results : dict

a dictionary with the file names as keys and data as values.

Examples

>>> filepath = 'scMKL_results/rna+atac/'
...
>>> all_results = scmkl.read_files(filepath)
>>> all_results.keys()
dict_keys(['Rep_1.pkl', Rep_2.pkl, Rep_3.pkl, ...])
def get_metrics(results : dict | None = None, rfiles : dict | None = None, 
                include_as : bool = False) -> pd.DataFrame:
    '''
    Takes either a single scMKL result or a dictionary where each 
    entry corresponds to one result. Returns a dataframe with cols 
    ['Alpha', 'Metric', 'Value']. If `include_as == True`, another 
    col of booleans will be added to indicate whether or not the run 
    respective to that alpha was chosen as optimal via CV. When 
    `rfiles` is given, a `'Key'` col is added holding the key of the 
    result each row came from.

    Parameters
    ----------
    **results** : *None* | *dict*
        > A dictionary with the results of a single run from 
        `scmkl.run()`. Must be `None` if `rfiles is not None`.

    **rfiles** : *None* | *dict*
        > A dictionary of results dictionaries containing multiple 
        results from `scmkl.run()`. Must be `None` if 
        `results is not None`.

    **include_as** : *bool*
        > When `True`, will add a bool col to output pd.DataFrame 
        where rows with alphas corresponding to alpha_star will be 
        `True`.

    Returns
    -------
    **df** : *pd.DataFrame*
        > A pd.DataFrame containing all of the metrics present from 
        the runs input.

    Examples
    --------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> metrics = scmkl.get_metrics(results = results)

    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> metrics = scmkl.get_metrics(rfiles = rfiles)
    '''
    # Single result: nothing to stack, hand it straight to the parser
    if not _parse_result_type(results = results, rfiles = rfiles):
        return _parse_metrics(results = results, include_as = include_as)

    header = ['Alpha', 'Metric', 'Value']
    if include_as:
        header.append('Alpha Star')
    header.append('Key')

    # Start from an empty frame so the cols exist even with no runs
    df = pd.DataFrame(columns = header)
    for run_key, run_result in rfiles.items():
        run_df = _parse_metrics(results = run_result, key = run_key, 
                                include_as = include_as)
        df = pd.concat([df, run_df])

    return df

Takes either a single scMKL result or a dictionary where each entry corresponds to one result. Returns a dataframe with cols ['Alpha', 'Metric', 'Value']. If include_as == True, another col of booleans will be added to indicate whether or not the run respective to that alpha was chosen as optimal via CV. When multiple results are passed via rfiles, another column named 'Key' will be added with the key of the respective result.

Parameters

results : None | dict

A dictionary with the results of a single run from scmkl.run. Must be None if rfiles is not None.

rfiles : None | dict

A dictionary of results dictionaries containing multiple results from scmkl.run. A 'Key' col will be added to the output pd.DataFrame with the keys as values corresponding to each row.

include_as : bool

When True, will add a bool col to output pd.DataFrame where rows with alphas corresponding to alpha_star will be True.

Returns

df : pd.DataFrame

A pd.DataFrame containing all of the metrics present from the runs input.

Examples

>>> # For a single file
>>> results = scmkl.run(adata)
>>> metrics = scmkl.get_metrics(results = results)
>>> # For multiple runs saved in a dict
>>> output_dir = 'scMKL_outputs/'
>>> rfiles = scmkl.read_files(output_dir)
>>> metrics = scmkl.get_metrics(rfiles)
def get_weights(results : dict | None = None, rfiles : dict | None = None, 
                include_as : bool = False) -> pd.DataFrame:
    '''
    Takes either a single scMKL result or dictionary of results and 
    returns a pd.DataFrame with cols ['Alpha', 'Group', 
    'Kernel Weight']. If include_as == True, another col will be 
    added to indicate whether or not the run respective to that alpha 
    was chosen as optimal via CV. When `rfiles` is given, a `'Key'` 
    col is added holding the key of the result each row came from.

    Parameters
    ----------
    **results** : *None* | *dict*
        > A dictionary with the results of a single run from 
        `scmkl.run()`. Must be `None` if `rfiles is not None`.

    **rfiles** : *None* | *dict*
        > A dictionary of results dictionaries containing multiple 
        results from `scmkl.run()`. Must be `None` if 
        `results is not None`.

    **include_as** : *bool*
        > When `True`, will add a bool col to output pd.DataFrame 
        where rows with alphas corresponding to alpha_star will be 
        `True`.

    Returns
    -------
    **df** : *pd.DataFrame*
        > A pd.DataFrame containing all of the groups from each alpha 
        and their corresponding kernel weights.

    Examples
    --------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> weights = scmkl.get_weights(results = results)
    >>>
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> weights = scmkl.get_weights(rfiles = rfiles)
    '''
    # Checking which data is being worked with 
    multi_results = _parse_result_type(results = results, rfiles = rfiles)

    if not multi_results:
        # A single result must go through the weights parser; the 
        # metrics parser returns the wrong table here
        return _parse_weights(results = results, include_as = include_as)

    # Initiating col list with minimal columns
    cols = ['Alpha', 'Group', 'Kernel Weight']

    if include_as:
        cols.append('Alpha Star')

    cols.append('Key')
    df = pd.DataFrame(columns = cols)
    for key, result in rfiles.items():
        cur_df = _parse_weights(results = result, key = key, 
                                include_as = include_as)
        df = pd.concat([df, cur_df])

    return df

Takes either a single scMKL result or dictionary of results and returns a pd.DataFrame with cols ['Alpha', 'Group', 'Kernel Weight']. If include_as == True, a fourth col will be added to indicate whether or not the run respective to that alpha was chosen as optimal via CV.

Parameters

results : None | dict

A dictionary with the results of a single run from scmkl.run. Must be None if rfiles is not None.

rfiles : None | dict

A dictionary of results dictionaries containing multiple results from scmkl.run. A 'Key' col will be added to the output pd.DataFrame with the keys as values corresponding to each row.

include_as : bool

When True, will add a bool col to output pd.DataFrame where rows with alphas corresponding to alpha_star will be True.

Returns

df : pd.DataFrame

A pd.DataFrame containing all of the groups from each alpha and their corresponding kernel weights.

Examples

>>> # For a single file
>>> results = scmkl.run(adata)
>>> weights = scmkl.get_weights(results = results)
>>>
>>> # For multiple runs saved in a dict
>>> output_dir = 'scMKL_outputs/'
>>> rfiles = scmkl.read_files(output_dir)
>>> weights = scmkl.get_weights(rfiles)
def get_selection(weights_df, order_groups : bool = False) -> pd.DataFrame:
    '''
    This function takes a pd.DataFrame created by 
    `scmkl.get_weights()` and returns a selection table. Selection 
    refers to how many times a group had a kernel weight greater 
    than zero. Rows are flagged as selected, grouped by alpha and 
    group, and the flags are summed. The input dataframe is left 
    unmodified.

    Parameters
    ----------
    **weights_df** : *pd.DataFrame*
        > A dataframe output by `scmkl.get_weights()` with cols
        `['Alpha', 'Group', 'Kernel Weight']`.

    **order_groups** : *bool*
        > If `True`, the `'Group'` col of the output dataframe will be 
        made into a `pd.Categorical` col whose categories are listed 
        by number of times each group was selected, in descending 
        order. Default is `False`.

    Returns
    -------
    **df** : *pd.DataFrame*
        > A dataframe with cols `['Alpha', 'Group', 'Selection']`.

    Example
    -------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> weights = scmkl.get_weights(results = results)
    >>> selection = scmkl.get_selection(weights)
    >>>
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> weights = scmkl.get_weights(rfiles = rfiles)
    >>> selection = scmkl.get_selection(weights, order_groups = True)
    '''
    # assign() returns a new frame, so the caller's dataframe does 
    # not gain a 'Selection' col as a side effect
    flagged = weights_df.assign(Selection = weights_df['Kernel Weight'] > 0)

    # Summing selection across replications to get selection
    df = flagged.groupby(['Alpha', 'Group'])['Selection'].sum()
    df = df.reset_index()

    # Getting group order
    if order_groups:
        totals = df.groupby('Group')['Selection'].sum().reset_index()
        totals = totals.sort_values(by = 'Selection', ascending = False)
        df['Group'] = pd.Categorical(df['Group'], 
                                     categories = totals['Group'])

    return df

This function takes a pd.DataFrame created by scmkl.get_weights() and returns a selection table. Selection refers to how many times a group had a nonzero group weight. To calculate this, a col is added indicating whether the group was selected. Then, the dataframe is grouped by alpha and group. Selection can then be summed returning a dataframe with cols ['Alpha', 'Group', Selection].

Parameters

weights_df : pd.DataFrame

A dataframe output by scmkl.get_weights() with cols ['Alpha', 'Group', 'Kernel Weight'].

order_groups : bool

If True, the 'Group' col of the output dataframe will be made into a pd.Categorical col ordered by number of times each group was selected in descending order.

Returns

df : pd.DataFrame

A dataframe with cols ['Alpha', 'Group', Selection].

Example

>>> # For a single file
>>> results = scmkl.run(adata)
>>> weights = scmkl.get_weights(results = results)
>>> selection = scmkl.get_selection(weights)
>>>
>>> # For multiple runs saved in a dict
>>> output_dir = 'scMKL_outputs/'
>>> rfiles = scmkl.read_files(output_dir)
>>> weights = scmkl.get_weights(rfiles)
>>> selection = scmkl.get_selection(weights)
def mean_groups_per_alpha(selection_df) -> dict:
    '''
    This function takes a pd.DataFrame from `scmkl.get_selection()` 
    and returns a dictionary keyed by the alphas present in the 
    input. Each value is the mean of the `'Selection'` col over the 
    rows belonging to that alpha.

    Parameters
    ----------
    **selection_df** : *pd.DataFrame*
        > A dataframe output by `scmkl.get_selection()` with cols 
        `['Alpha', 'Group', 'Selection']`.

    Returns
    -------
    **mean_groups** : *dict*
        > A dictionary with alphas as keys and the mean `'Selection'` 
        for that alpha as values.

    Examples
    --------
    >>> weights = scmkl.get_weights(rfiles = rfiles)
    >>> selection = scmkl.get_selection(weights)
    >>> mean_groups = scmkl.mean_groups_per_alpha(selection)
    >>> mean_groups = {alpha : np.round(num_selected, 1)
    ...                for alpha, num_selected in mean_groups.items()}
    >>>
    >>> print(mean_groups)
    {0.05 : 50.0, 0.2 : 24.7, 1.1 : 5.3}
    '''
    alphas = selection_df['Alpha']

    # One entry per distinct alpha, averaged over that alpha's rows
    return {alpha : np.mean(selection_df.loc[alphas == alpha, 'Selection'])
            for alpha in np.unique(alphas)}

This function takes a pd.DataFrame from scmkl.get_selection() generated from multiple scMKL results and returns a dictionary with keys being alphas from the input dataframe and values being the mean number of selected groups for a given alpha across results.

Parameters

selection_df : pd.DataFrame

A dataframe output by scmkl.get_selection() with cols ['Alpha', 'Group', 'Selection'].

Returns

mean_groups : dict

A dictionary with alphas as keys and the mean number of selected groups for that alpha as values.

Examples

>>> weights = scmkl.get_weights(rfiles)
>>> selection = scmkl.get_selection(weights)
>>> mean_groups = scmkl.mean_groups_per_alpha(selection)
>>> mean_groups = {alpha : np.round(num_selected, 1)
...                for alpha, num_selected in mean_groups.items()}
>>>
>>> print(mean_groups)
{0.05 : 50.0, 0.2 : 24.7, 1.1 : 5.3}