scmkl.run

import anndata as ad
import numpy as np
import time
import tracemalloc

from scmkl.train_model import train_model
from scmkl.test import predict, find_selected_groups


def run(adata: ad.AnnData, alpha_list: np.ndarray, 
        metrics: list | None = None, 
        return_probs: bool = False) -> dict:
    """
    Wrapper function for training and testing with multiple alpha values.
    Returns metrics, predictions, group weights, and resource usage.

    Parameters
    ----------
    adata : ad.AnnData 
        A processed `ad.AnnData` with `'Z_train'`, `'Z_test'`, and 
        `'group_dict'` keys in `adata.uns`.
    
    alpha_list : np.ndarray 
        Sparsity values to create models with. Alpha refers to the 
        penalty parameter in Group Lasso. Larger alphas force group 
        weights to shrink towards zero while smaller alphas apply a 
        lesser penalty to kernel weights. Values that are too large 
        will result in models that weight all groups as zero.

    metrics : list[str] | None
        Metrics that should be calculated on predictions. Options are 
        `['AUROC', 'F1-Score', 'Accuracy', 'Precision', 'Recall']`. 
        When set to `None`, all metrics are calculated.

    return_probs : bool
        When `True`, predicted class probabilities from `predict()` 
        are also captured for each alpha and returned under 
        `'Probabilities'`.
    
    Returns
    -------
    results : dict
        Results with keys and values: 

        `'Metrics'` (dict): 
        A nested dictionary as `[alpha][metric] = value`.

        `'Group_names'` (list): 
        List of group names used in the model(s).
    
        `'Selected_groups'` (dict): 
        A dictionary as `[alpha] = np.array([nonzero_groups])`.
        Nonzero groups are groups that had a kernel weight above zero.

        `'Norms'` (dict): 
        A dictionary as `[alpha] = [group_norms]`, where each element 
        is the norm of the kernel weights for one group. The order of 
        `group_norms` matches the order of `'Group_names'`.

        `'Observed'` (np.ndarray): 
        An array of ground truth cell labels from the test set.

        `'Predictions'` (dict): 
        A dictionary as `[alpha] = predicted_class`, aligned with 
        `'Observed'` for each `alpha`.

        `'Test_indices'` (np.ndarray): 
        Indices of samples in `adata` used in the test set.

        `'Models'` (dict): 
        A dictionary where `[alpha] = celer.GroupLasso` object 
        for `alpha`.

        `'Train_time'` (dict): 
        A dictionary as `[alpha] = seconds` recording how long 
        training and evaluation took for each `alpha`.

        `'RAM_usage'` (str): 
        Peak memory usage in GB captured after training the models.

        `'Probabilities'` (dict): 
        A dictionary as `[alpha] = class_probabilities`; only 
        populated when `return_probs` is `True`.

    Examples
    --------
    >>> results = scmkl.run(adata = adata, 
    ...                     alpha_list = np.array([0.05, 0.1, 0.5]))
    >>> results.keys()
    dict_keys(['Metrics', 'Selected_groups', 'Norms', 'Predictions', 
    ...        'Observed', 'Test_indices', 'Group_names', 'Models', 
    ...        'Train_time', 'RAM_usage'])
    >>>
    >>> # Alpha values are the keys of each nested dictionary
    >>> results['Metrics'].keys()
    dict_keys([0.05, 0.1, 0.5])
    >>>
    >>> results['Metrics'][0.05]
    {'AUROC': 0.9859,
    'Accuracy': 0.945,
    'F1-Score': 0.9452736318407959,
    'Precision': 0.9405940594059405,
    'Recall': 0.95}
    """
    if metrics is None:
        metrics = ['AUROC', 'F1-Score', 'Accuracy', 'Precision', 'Recall']

    # Initializing variables to capture metrics
    group_names = list(adata.uns['group_dict'].keys())
    preds = {}
    group_norms = {}
    mets_dict = {}
    selected_groups = {}
    train_time = {}
    models = {}
    probs = {}

    D = adata.uns['D']

    # Generating models for each alpha and outputs
    tracemalloc.start()
    for alpha in alpha_list:

        print(f'  Evaluating model. Alpha: {alpha}', flush = True)

        train_start = time.time()

        adata = train_model(adata, group_size = 2 * D, alpha = alpha)
        if return_probs:
            alpha_res = predict(adata, 
                                metrics = metrics,
                                return_probs = return_probs)
            preds[alpha], mets_dict[alpha], probs[alpha] = alpha_res

        else:
            alpha_res = predict(adata, 
                                metrics = metrics,
                                return_probs = return_probs)
            preds[alpha], mets_dict[alpha] = alpha_res

        selected_groups[alpha] = find_selected_groups(adata)

        kernel_weights = adata.uns['model'].coef_
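        # Each group's kernel weights occupy a contiguous block of 2 * D
        # entries in coef_; summarize each group by the norm of its block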
        group_norms[alpha] = [
            np.linalg.norm(kernel_weights[i * 2 * D : (i + 1) * 2 * D])
            for i in np.arange(len(group_names))
            ]

        models[alpha] = adata.uns['model']

        train_end = time.time()
        train_time[alpha] = train_end - train_start

    # Combining results into one object
    results = {}
    results['Metrics'] = mets_dict
    results['Selected_groups'] = selected_groups
    results['Norms'] = group_norms
    results['Predictions'] = preds
    results['Observed'] = adata.obs['labels'].iloc[adata.uns['test_indices']]
    results['Test_indices'] = adata.uns['test_indices']
    results['Group_names'] = group_names
    results['Models'] = models
    results['Train_time'] = train_time
    results['RAM_usage'] = f'{tracemalloc.get_traced_memory()[1]/1e9} GB'
    results['Probabilities'] = probs

    return results

def run(adata: anndata.AnnData, alpha_list: numpy.ndarray, metrics: list | None = None, return_probs: bool = False) -> dict:

Wrapper function for training and testing with multiple alpha values. Returns metrics, predictions, group weights, and resource usage.

Parameters
  • adata (ad.AnnData): A processed ad.AnnData with 'Z_train', 'Z_test', and 'group_dict' keys in adata.uns.
  • alpha_list (np.ndarray): Sparsity values to create models with. Alpha refers to the penalty parameter in Group Lasso. Larger alphas force group weights to shrink towards zero while smaller alphas apply a lesser penalty to kernel weights. Values that are too large will result in models that weight all groups as zero. A short sketch for constructing candidate alphas follows this list.
  • metrics (list[str] | None): Metrics that should be calculated on predictions. Options are ['AUROC', 'F1-Score', 'Accuracy', 'Precision', 'Recall']. When set to None, all metrics are calculated.
  • return_probs (bool): When True, predicted class probabilities from predict() are also captured for each alpha and returned under 'Probabilities'.
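
A minimal sketch of one way to build a candidate alpha_list; the log-spaced values below are an illustrative assumption, not an scmkl default:

>>> import numpy as np
>>> # Log-spaced sparsity values from a strong penalty (1.0) to a weak one (0.01)
>>> alpha_list = np.round(np.geomspace(1.0, 0.01, num = 5), 3)

Larger alphas give sparser models (fewer selected groups); smaller alphas keep more groups with nonzero kernel weights.
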
Returns
  • results (dict): Results with keys and values:

    'Metrics' (dict): A nested dictionary as [alpha][metric] = value.

    'Group_names' (list): List of group names used in the model(s).

    'Selected_groups' (dict): A dictionary as [alpha] = np.array([nonzero_groups]). Nonzero groups are groups that had a kernel weight above zero.

    'Norms' (dict): A dictionary as [alpha] = [group_norms], where each element is the norm of the kernel weights for one group. The order of group_norms matches the order of 'Group_names'.

    'Observed' (np.ndarray): An array of ground truth cell labels from the test set.

    'Predictions' (dict): A dictionary as [alpha] = predicted_class, aligned with 'Observed' for each alpha.

    'Test_indices' (np.ndarray): Indices of samples in adata used in the test set.

    'Models' (dict): A dictionary where [alpha] = celer.GroupLasso object for alpha.

    'Train_time' (dict): A dictionary as [alpha] = seconds recording how long training and evaluation took for each alpha.

    'RAM_usage' (str): Peak memory usage in GB captured after training the models.

    'Probabilities' (dict): A dictionary as [alpha] = class_probabilities; only populated when return_probs is True. See the sketch below for one way these entries can be used.
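
A minimal sketch of consuming these entries, assuming results was produced by scmkl.run as in the Examples below; selecting the model by AUROC is an illustrative choice, not part of scmkl:

>>> # Pick the alpha whose model scored the highest AUROC
>>> best_alpha = max(results['Metrics'],
...                  key = lambda a: results['Metrics'][a]['AUROC'])
>>> # Groups with nonzero kernel weights for that model
>>> top_groups = results['Selected_groups'][best_alpha]
>>> # Per-group weight norms, ordered like results['Group_names']
>>> group_norms = dict(zip(results['Group_names'],
...                        results['Norms'][best_alpha]))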

Examples
>>> results = scmkl.run(adata = adata, 
...                     alpha_list = np.array([0.05, 0.1, 0.5]))
>>> results.keys()
dict_keys(['Metrics', 'Selected_groups', 'Norms', 'Predictions', 
...        'Observed', 'Test_indices', 'Group_names', 'Models', 
...        'Train_time', 'RAM_usage'])
>>>
>>> # Alpha values are the keys of each nested dictionary
>>> results['Metrics'].keys()
dict_keys([0.05, 0.1, 0.5])
>>>
>>> results['Metrics'][0.05]
{'AUROC': 0.9859,
'Accuracy': 0.945,
'F1-Score': 0.9452736318407959,
'Precision': 0.9405940594059405,
'Recall': 0.95}
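
A minimal sketch of the return_probs option; the exact type and shape of the stored probabilities come from scmkl's predict function and are not specified here:

>>> results = scmkl.run(adata = adata,
...                     alpha_list = np.array([0.05, 0.1, 0.5]),
...                     return_probs = True)
>>> probs = results['Probabilities'][0.05]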