scmkl.run

  1import anndata as ad
  2import numpy as np
  3import time
  4import tracemalloc
  5
  6from scmkl.train_model import train_model
  7from scmkl.test import predict, find_selected_groups
  8
  9
 10def run(adata: ad.AnnData, alpha_list: np.ndarray, 
 11        metrics: list | None = None, 
 12        return_probs: bool=False) -> dict:
 13    """
 14    Wrapper function for training and test with multiple alpha values.
 15    Returns metrics, predictions, group weights, and resource usage.
 16
 17    Parameters
 18    ----------
 19    adata : ad.AnnData 
 20        A processed `ad.AnnData` with `'Z_train'`, `'Z_test'`, and 
 21        `'group_dict'` keys in `adata.uns`.
 22    
 23    alpha_list : np.ndarray 
 24        Sparsity values to create models with. Alpha refers to the 
 25        penalty parameter in Group Lasso. Larger alphas force group 
 26        weights to shrink towards zero while smaller alphas apply a 
 27        lesser penalty to kernal weights. Values too large will results 
 28        in models that weight all groups as zero.
 29
 30    metrics : list[str]
 31        Metrics that should be calculated on predictions. Options are 
 32        `['AUROC', 'F1-Score', 'Accuracy', 'Precision', 'Recall']`. 
 33        When set to `None`, all metrics are calculated.
 34    
 35    Returns
 36    -------
 37    results : dict
 38        Results with keys and values: 
 39
 40        `'Metrics'` (dict): 
 41        A nested dictionary as `[alpha][metric] = value`.
 42
 43        `'Group_names'` (np.ndarray): 
 44        Array of group names used in model(s).
 45    
 46        `'Selected_groups'` (dict): 
 47        A nested dictionary as `[alpha] = np.array([nonzero_groups])`.
 48        Nonzero groups are groups that had a kernel weight above zero.
 49
 50        `'Norms'` (dict): 
 51        A nested dictionary as `[alpha] = np.array([kernel_weights])`
 52        Order of `kernel_weights` is respective to `'Group_names'` 
 53        values.
 54
 55        `'Observed'` (np.nparray): 
 56        An array of ground truth cell labels from the test set.
 57
 58        `'Predictions'` (dict): 
 59        A nested dictionary as `[alpha] = predicted_class` respective 
 60        to `'Observations'` for `alpha`.
 61
 62        `'Test_indices'` (np.array: 
 63        Indices of samples respective to adata used in the training 
 64        set.
 65
 66        `'Model'` (dict): 
 67        A nested dictionary where `[alpha] = celer.GroupLasso` object 
 68        for `alpha`.
 69
 70        `'RAM_usage'` (dict): 
 71        A nested dictionary with memory usage in GB after 
 72        training models for each `alpha`.
 73
 74    Examples
 75    --------
 76    >>> results = scmkl.run(adata = adata, 
 77    ...                     alpha_list = np.array([0.05, 0.1, 0.5]))
 78    >>> results
 79    dict_keys(['Metrics', 'Selected_groups', 'Norms', 'Predictions', 
 80    ...        'Observed', 'Test_indices', 'Group_names', 'Models', 
 81    ...        'Train_time', 'RAM_usage'])
 82    >>>
 83    >>> alpha values
 84    >>> results['Metrics'].keys()
 85    dict_keys([0.05, 0.1, 0.5])
 86    >>>
 87    >>> results['Metrics'][0.05]
 88    {'AUROC': 0.9859,
 89    'Accuracy': 0.945,
 90    'F1-Score': 0.9452736318407959,
 91    'Precision': 0.9405940594059405,
 92    'Recall': 0.95}
 93    """
 94    if metrics is None:
 95        metrics = ['AUROC', 'F1-Score','Accuracy', 'Precision', 'Recall']
 96
 97    # Initializing variables to capture metrics
 98    group_names = list(adata.uns['group_dict'].keys())
 99    preds = {}
100    group_norms = {}
101    mets_dict = {}
102    selected_groups = {}
103    train_time = {}
104    models = {}
105    probs = {}
106
107    D = adata.uns['D']
108
109    # Generating models for each alpha and outputs
110    tracemalloc.start()
111    for alpha in alpha_list:
112        
113        print(f'  Evaluating model. Alpha: {alpha}', flush = True)
114
115        train_start = time.time()
116
117        adata = train_model(adata, group_size= 2*D, alpha = alpha)
118
119        if return_probs:
120            alpha_res = predict(adata, 
121                                metrics = metrics,
122                                return_probs = return_probs)
123            preds[alpha], mets_dict[alpha], probs[alpha] = alpha_res
124
125        else:
126            alpha_res = predict(adata, 
127                                metrics = metrics,
128                                return_probs = return_probs)
129            preds[alpha], mets_dict[alpha] = alpha_res
130
131        selected_groups[alpha] = find_selected_groups(adata)
132
133        kernel_weights = adata.uns['model'].coef_
134        group_norms[alpha] = [
135            np.linalg.norm(kernel_weights[i * 2 * D : (i + 1) * 2 * D - 1])
136            for i in np.arange(len(group_names))
137            ]
138        
139        models[alpha] = adata.uns['model']
140        
141        train_end = time.time()
142        train_time[alpha] = train_end - train_start
143
144    # Combining results into one object
145    results = {}
146    results['Metrics'] = mets_dict
147    results['Selected_groups'] = selected_groups
148    results['Norms'] = group_norms
149    results['Predictions'] = preds
150    results['Observed'] = adata.obs['labels'].iloc[adata.uns['test_indices']]
151    results['Test_indices'] = adata.uns['test_indices']
152    results['Group_names']= group_names
153    results['Models'] = models
154    results['Train_time'] = train_time
155    results['RAM_usage'] = f'{tracemalloc.get_traced_memory()[1]/1e9} GB'
156    results['Probabilities'] = probs
157
158    return results
def run( adata: anndata._core.anndata.AnnData, alpha_list: numpy.ndarray, metrics: list | None = None, return_probs: bool = False) -> dict:
 11def run(adata: ad.AnnData, alpha_list: np.ndarray, 
 12        metrics: list | None = None, 
 13        return_probs: bool=False) -> dict:
 14    """
 15    Wrapper function for training and test with multiple alpha values.
 16    Returns metrics, predictions, group weights, and resource usage.
 17
 18    Parameters
 19    ----------
 20    adata : ad.AnnData 
 21        A processed `ad.AnnData` with `'Z_train'`, `'Z_test'`, and 
 22        `'group_dict'` keys in `adata.uns`.
 23    
 24    alpha_list : np.ndarray 
 25        Sparsity values to create models with. Alpha refers to the 
 26        penalty parameter in Group Lasso. Larger alphas force group 
 27        weights to shrink towards zero while smaller alphas apply a 
 28        lesser penalty to kernal weights. Values too large will results 
 29        in models that weight all groups as zero.
 30
 31    metrics : list[str]
 32        Metrics that should be calculated on predictions. Options are 
 33        `['AUROC', 'F1-Score', 'Accuracy', 'Precision', 'Recall']`. 
 34        When set to `None`, all metrics are calculated.
 35    
 36    Returns
 37    -------
 38    results : dict
 39        Results with keys and values: 
 40
 41        `'Metrics'` (dict): 
 42        A nested dictionary as `[alpha][metric] = value`.
 43
 44        `'Group_names'` (np.ndarray): 
 45        Array of group names used in model(s).
 46    
 47        `'Selected_groups'` (dict): 
 48        A nested dictionary as `[alpha] = np.array([nonzero_groups])`.
 49        Nonzero groups are groups that had a kernel weight above zero.
 50
 51        `'Norms'` (dict): 
 52        A nested dictionary as `[alpha] = np.array([kernel_weights])`
 53        Order of `kernel_weights` is respective to `'Group_names'` 
 54        values.
 55
 56        `'Observed'` (np.nparray): 
 57        An array of ground truth cell labels from the test set.
 58
 59        `'Predictions'` (dict): 
 60        A nested dictionary as `[alpha] = predicted_class` respective 
 61        to `'Observations'` for `alpha`.
 62
 63        `'Test_indices'` (np.array: 
 64        Indices of samples respective to adata used in the training 
 65        set.
 66
 67        `'Model'` (dict): 
 68        A nested dictionary where `[alpha] = celer.GroupLasso` object 
 69        for `alpha`.
 70
 71        `'RAM_usage'` (dict): 
 72        A nested dictionary with memory usage in GB after 
 73        training models for each `alpha`.
 74
 75    Examples
 76    --------
 77    >>> results = scmkl.run(adata = adata, 
 78    ...                     alpha_list = np.array([0.05, 0.1, 0.5]))
 79    >>> results
 80    dict_keys(['Metrics', 'Selected_groups', 'Norms', 'Predictions', 
 81    ...        'Observed', 'Test_indices', 'Group_names', 'Models', 
 82    ...        'Train_time', 'RAM_usage'])
 83    >>>
 84    >>> alpha values
 85    >>> results['Metrics'].keys()
 86    dict_keys([0.05, 0.1, 0.5])
 87    >>>
 88    >>> results['Metrics'][0.05]
 89    {'AUROC': 0.9859,
 90    'Accuracy': 0.945,
 91    'F1-Score': 0.9452736318407959,
 92    'Precision': 0.9405940594059405,
 93    'Recall': 0.95}
 94    """
 95    if metrics is None:
 96        metrics = ['AUROC', 'F1-Score','Accuracy', 'Precision', 'Recall']
 97
 98    # Initializing variables to capture metrics
 99    group_names = list(adata.uns['group_dict'].keys())
100    preds = {}
101    group_norms = {}
102    mets_dict = {}
103    selected_groups = {}
104    train_time = {}
105    models = {}
106    probs = {}
107
108    D = adata.uns['D']
109
110    # Generating models for each alpha and outputs
111    tracemalloc.start()
112    for alpha in alpha_list:
113        
114        print(f'  Evaluating model. Alpha: {alpha}', flush = True)
115
116        train_start = time.time()
117
118        adata = train_model(adata, group_size= 2*D, alpha = alpha)
119
120        if return_probs:
121            alpha_res = predict(adata, 
122                                metrics = metrics,
123                                return_probs = return_probs)
124            preds[alpha], mets_dict[alpha], probs[alpha] = alpha_res
125
126        else:
127            alpha_res = predict(adata, 
128                                metrics = metrics,
129                                return_probs = return_probs)
130            preds[alpha], mets_dict[alpha] = alpha_res
131
132        selected_groups[alpha] = find_selected_groups(adata)
133
134        kernel_weights = adata.uns['model'].coef_
135        group_norms[alpha] = [
136            np.linalg.norm(kernel_weights[i * 2 * D : (i + 1) * 2 * D - 1])
137            for i in np.arange(len(group_names))
138            ]
139        
140        models[alpha] = adata.uns['model']
141        
142        train_end = time.time()
143        train_time[alpha] = train_end - train_start
144
145    # Combining results into one object
146    results = {}
147    results['Metrics'] = mets_dict
148    results['Selected_groups'] = selected_groups
149    results['Norms'] = group_norms
150    results['Predictions'] = preds
151    results['Observed'] = adata.obs['labels'].iloc[adata.uns['test_indices']]
152    results['Test_indices'] = adata.uns['test_indices']
153    results['Group_names']= group_names
154    results['Models'] = models
155    results['Train_time'] = train_time
156    results['RAM_usage'] = f'{tracemalloc.get_traced_memory()[1]/1e9} GB'
157    results['Probabilities'] = probs
158
159    return results

Wrapper function for training and test with multiple alpha values. Returns metrics, predictions, group weights, and resource usage.

Parameters
  • adata (ad.AnnData): A processed ad.AnnData with 'Z_train', 'Z_test', and 'group_dict' keys in adata.uns.
  • alpha_list (np.ndarray): Sparsity values to create models with. Alpha refers to the penalty parameter in Group Lasso. Larger alphas force group weights to shrink towards zero while smaller alphas apply a lesser penalty to kernel weights. Values too large will result in models that weight all groups as zero.
  • metrics (list[str]): Metrics that should be calculated on predictions. Options are ['AUROC', 'F1-Score', 'Accuracy', 'Precision', 'Recall']. When set to None, all metrics are calculated.
Returns
  • results (dict): Results with keys and values:

    'Metrics' (dict): A nested dictionary as [alpha][metric] = value.

    'Group_names' (list): List of group names used in model(s), taken from the keys of adata.uns['group_dict'].

    'Selected_groups' (dict): A nested dictionary as [alpha] = np.array([nonzero_groups]). Nonzero groups are groups that had a kernel weight above zero.

    'Norms' (dict): A nested dictionary as [alpha] = [group_norms], where each entry is the norm of one group's kernel weights. Order of the norms is respective to 'Group_names' values.

    'Observed' (array-like): Ground truth cell labels from the test set, taken from adata.obs['labels'].

    'Predictions' (dict): A nested dictionary as [alpha] = predicted_class respective to 'Observed' for alpha.

    'Test_indices' (np.array): Indices of samples respective to adata used in the test set.

    'Models' (dict): A nested dictionary where [alpha] = celer.GroupLasso object for alpha.

    'RAM_usage' (str): Peak memory traced by tracemalloc while training and evaluating the models, formatted as a string in GB.

Examples
>>> results = scmkl.run(adata = adata, 
...                     alpha_list = np.array([0.05, 0.1, 0.5]))
>>> results.keys()
dict_keys(['Metrics', 'Selected_groups', 'Norms', 'Predictions', 
...        'Observed', 'Test_indices', 'Group_names', 'Models', 
...        'Train_time', 'RAM_usage', 'Probabilities'])
>>>
>>> # alpha values
>>> results['Metrics'].keys()
dict_keys([0.05, 0.1, 0.5])
>>>
>>> results['Metrics'][0.05]
{'AUROC': 0.9859,
'Accuracy': 0.945,
'F1-Score': 0.9452736318407959,
'Precision': 0.9405940594059405,
'Recall': 0.95}