scmkl.optimize_alpha

  1import numpy as np
  2import anndata as ad
  3import gc
  4
  5from scmkl.tfidf_normalize import tfidf_normalize
  6from scmkl.calculate_z import calculate_z
  7from scmkl.train_model import train_model
  8from scmkl.multimodal_processing import multimodal_processing
  9from scmkl.test import predict
 10from scmkl.one_v_rest import get_class_train
 11
 12
 13# Array of alphas to be used if not provided
 14default_alphas = np.round(np.linspace(1.9, 0.05, 10),2)
 15
 16
 17def stop_early(metric_array, alpha_idx, fold_idx):
 18    """
 19    Assumes smallest alpha comes first.
 20    """
 21    # Must be at least two metrics from two alphas to compare
 22    if alpha_idx <= 0:
 23        return False
 24    
 25    cur_met = metric_array[alpha_idx, fold_idx]
 26    last_met = metric_array[alpha_idx - 1, fold_idx]
 27
 28    if cur_met < last_met:
 29        return True
 30    else:
 31        return False
 32
 33
 34def sort_alphas(alpha_array: np.ndarray):
 35    """
 36    Sorts alphas from smallest to largest.
 37    """
 38    order = np.argsort(alpha_array)
 39    alpha_array = alpha_array[order]
 40
 41    return alpha_array
 42
 43
 44def get_labels(adata: list | ad.AnnData):
 45    """
 46    Copies labels and train indices from `ad.AnnData` object for k-fold 
 47    cross validation. Mostly present for readability.
 48    """
 49    train_indices = adata[0].uns['train_indices'].copy()
 50    y = adata[0].obs['labels'].iloc[train_indices].to_numpy()
 51
 52    return train_indices, y
 53
 54
 55def get_folds(y: np.array, k: int):
 56    """
 57    With labels of training data and number of folds, returns the 
 58    indices and label for each k-folds.
 59    """
 60    # Splits the labels evenly between folds
 61    pos_idcs = np.where(y == np.unique(y)[0])[0]
 62    neg_idcs = np.setdiff1d(np.arange(len(y)), pos_idcs)
 63    
 64    pos_anno = np.arange(len(pos_idcs)) % k
 65    neg_anno = np.arange(len(neg_idcs)) % k
 66
 67    return pos_idcs, neg_idcs, pos_anno, neg_anno
 68
 69
 70def bin_optimize_alpha(adata: list[ad.AnnData], 
 71                       group_size: int | None=None, 
 72                       tfidf: list[bool]=False, 
 73                       alpha_array: np.ndarray=default_alphas, 
 74                       k: int=4, metric: str='AUROC', 
 75                       early_stopping: bool=False,
 76                       batches: int=10, batch_size: int=100,
 77                       combination: str='concatenate'):
 78    """
 79    Iteratively train a grouplasso model and update alpha to find the 
 80    parameter yielding best performing sparsity via k-fold cross 
 81    validation. This function currently only works for binary 
 82    experiments. Called by `scmkl.optimize_alpha()`.
 83
 84    Parameters
 85    ----------
 86    adata : list[ad.AnnData]
 87        List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 
 88        `adata.uns.keys()`.
 89
 90    group_size : None | int
 91        Argument describing how the features are grouped. If `None`, 
 92        `2 * adata.uns['D']` will be used. For more information see 
 93        [celer documentation](https://mathurinm.github.io/celer/
 94        generated/celer.GroupLasso.html).
 95
 96    tfidf : list | bool
 97        If `False`, no data will be TF-IDF transformed. If 
 98        `type(adata) is list` and TF-IDF transformation is desired for 
 99        all or some of the data, a bool list corresponding to `adata` 
100        must be provided. To simply TF-IDF transform `adata` when 
101        `type(adata) is ad.AnnData`, use `True`.
102    
103    alpha_array : np.ndarray
104        Array of all alpha values to be tested.
105
106    k : int
107        Number of folds to perform cross validation over.
108            
109    metric : str
110        Which metric to use to optimize alpha. Options are `'AUROC'`, 
111        `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`.
112
113    batches : int
114        The number of batches to use for the distance calculation.
115        This will average the result of `batches` distance calculations
116        of `batch_size` randomly sampled cells. More batches will converge
117        to population distance values at the cost of scalability.
118
119    batch_size : int
120        The number of cells to include per batch for distance
121        calculations. Higher batch size will converge to population
122        distance values at the cost of scalability. If 
123        `batches*batch_size > num_training_cells`, `batch_size` will be 
124        reduced to `int(num_training_cells/batches)`.
125
126    Returns
127    -------
128    alpha_star : float
129        The best performing alpha value from cross validation on 
130        training data.
131
132    Examples
133    --------
134    >>> alpha_star = scmkl.optimize_alpha(adata)
135    >>> alpha_star
136    0.1
137    """    
138    assert isinstance(k, int) and k > 0, "'k' must be positive"
139
140    import warnings 
141    warnings.filterwarnings('ignore')
142
143    alpha_array = sort_alphas(alpha_array)
144
145    # Need even folds for cross validation 
146    train_indices, y = get_labels(adata)
147    pos_idcs, neg_idcs, pos_anno, neg_anno = get_folds(y, k)
148
149    metric_array = np.zeros((len(alpha_array), k))
150
151    gc.collect()
152
153    for fold in np.arange(k):
154        cv_adata = [adata[i][train_indices, :] 
155                    for i in range(len(adata))]
156        
157        for i in range(len(cv_adata)):
158            if 'sigma' in cv_adata[i].uns_keys():
159                del cv_adata[i].uns['sigma']
160
161        # Create CV train/test indices
162        fold_train = np.concatenate((pos_idcs[np.where(pos_anno != fold)[0]], 
163                                     neg_idcs[np.where(neg_anno != fold)[0]]))
164        fold_test = np.concatenate((pos_idcs[np.where(pos_anno == fold)[0]], 
165                                    neg_idcs[np.where(neg_anno == fold)[0]]))
166
167
168        for i in range(len(cv_adata)):
169            cv_adata[i].uns['train_indices'] = fold_train
170            cv_adata[i].uns['test_indices'] = fold_test
171            if tfidf[i]:
172                cv_adata[i] = tfidf_normalize(cv_adata[i], binarize=True)
173
174            cv_adata[i] = calculate_z(cv_adata[i], n_features= 5000, 
175                            batches = batches, batch_size = batch_size)
176            
177        names = ['Adata ' + str(i + 1) for i in range(len(cv_adata))]
178        cv_adata = multimodal_processing(adata, names, tfidf, 
179                                            combination, batches, 
180                                            batch_size)
181                    
182            
183        # In train_model we index Z_train for balancing multiclass labels. We just recreate
184        # dummy indices here that are unused for use in the binary case
185        cv_adata.uns['train_indices'] = np.arange(0, len(fold_train))
186
187        gc.collect()
188
189        for i, alpha in enumerate(alpha_array):
190
191            cv_adata = train_model(cv_adata, group_size, alpha = alpha)
192            _, metrics = predict(cv_adata, metrics = [metric])
193            metric_array[i, fold] = metrics[metric]
194
195            # If metrics are decreasing, cv stopped and moving to next fold
196            end_fold = stop_early(metric_array, alpha_idx=i, fold_idx=fold)
197            if end_fold and early_stopping:
198                break
199
200        del cv_adata
201        gc.collect()
202
203    # Take AUROC mean across the k folds to find alpha yielding highest AUROC
204    alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))]
205    gc.collect()
206
207    return alpha_star
208
209
210def multi_optimize_alpha(adata: list[ad.AnnData], group_size: int, 
211                         tfidf: list[bool]=[False], 
212                         alpha_array: np.ndarray=default_alphas, k: int=4, 
213                         metric: str='AUROC', early_stopping: bool=False,
214                         batches: int=10, batch_size: int=100, 
215                         force_balance: bool=True, combination: str='concatenate',
216                         train_dict: dict=None):
217    """
218    Wrapper function for running k-fold cross validation for every 
219    label in a multiclass experiment. Called by 
220    `scmkl.optimize_alpha()`. 
221
222    Parameters
223    ----------
224    adata : list[ad.AnnData]
225        List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 
226        `adata.uns.keys()`.
227
228    group_size : None | int
229        Argument describing how the features are grouped. If `None`, 
230        `2 * adata.uns['D']` will be used. For more information see 
231        [celer documentation](https://mathurinm.github.io/celer/
232        generated/celer.GroupLasso.html).
233
234    tfidf : list[bool]
235        If `False`, no data will be TF-IDF transformed. If 
236        `type(adata) is list` and TF-IDF transformation is desired for 
237        all or some of the data, a bool list corresponding to `adata` 
238        must be provided. To simply TF-IDF transform `adata` when 
239        `type(adata) is ad.AnnData`, use `True`.
240    
241    alpha_array : np.ndarray
242        Array of all alpha values to be tested.
243
244    k : int
245        Number of folds to perform cross validation over.
246            
247    metric : str
248        Which metric to use to optimize alpha. Options are `'AUROC'`, 
249        `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`.
250
251    batches : int
252        The number of batches to use for the distance calculation.
253        This will average the result of `batches` distance calculations
254        of `batch_size` randomly sampled cells. More batches will converge
255        to population distance values at the cost of scalability.
256
257    batch_size : int
258        The number of cells to include per batch for distance
259        calculations. Higher batch size will converge to population
260        distance values at the cost of scalability. If 
261        `batches*batch_size > num_training_cells`, `batch_size` will be 
262        reduced to `int(num_training_cells/batches)`.
263
264    force_balance: bool
265        If `True`, training sets will be balanced to reduce class label 
266        imbalance for each iteration. Defaults to `False`.
267
268    other_factor : float
269        The ratio of cells to sample for the other class for each 
270        model. For example, if classifying B cells with 100 B cells in 
271        training, if `other_factor=1`, 100 cells that are not B cells 
272        will be trained on with the B cells. This will be done for each 
273        fold for each class if `force_balance` is `True`. 
274
275    combination: str
276        How should multiple views of data be combined. For more details 
277        see ad.concat.
278
279    train_dict: dict
280        A `dict` where each key is a class label and values are are the 
281        indices to be trained with for that class for class balance. 
282        All values must be present in each adata.uns['train_indices'].
283
284    Returns
285    -------
286    alpha_star : dict
287        A dictionary with keys being class labels and values being the 
288        best performing alpha parameter for that class as a float.
289    """
290    classes = np.unique(adata[0].obs['labels'])
291    orig_labels = adata[0].obs['labels'].to_numpy().copy()
292    orig_train = adata[0].uns['train_indices'].copy()
293
294    if train_dict:
295        train_idcs = train_dict
296    else:
297        if force_balance:
298            train_idcs = get_class_train(adata[0].uns['train_indices'], 
299                                         adata[0].obs['labels'], 
300                                         adata[0].uns['seed_obj'])
301        else:
302            train_idcs = {ct: adata[0].uns['train_indices'].copy()
303                          for ct in classes}
304
305    opt_alpha_dict = dict()
306
307    for cl in classes:
308        temp_classes = orig_labels.copy()
309        temp_classes[temp_classes != cl] = 'other'
310
311        for i in range(len(adata)):
312            adata[i].obs['labels'] = temp_classes.copy()
313            adata[i].uns['train_indices'] = train_idcs[cl]
314
315        opt_alpha_dict[cl] = bin_optimize_alpha(adata, group_size, tfidf, 
316                                                alpha_array, k, metric, 
317                                                early_stopping, batches, 
318                                                batch_size, combination)     
319    
320    # Global adata obj will be permanently changed if not reset
321    for i in range(len(adata)):
322        adata[i].obs['labels'] = orig_labels
323        adata[i].uns['train_indices'] = orig_train
324
325    return opt_alpha_dict
326
327
328def optimize_alpha(adata: ad.AnnData | list[ad.AnnData], 
329                   group_size: int | None=None, tfidf: None | list[bool]=None, 
330                   alpha_array: np.ndarray=default_alphas, k: int=4, 
331                   metric: str='AUROC', early_stopping: bool=False,
332                   batches: int=10, batch_size: int=100, 
333                   combination: str='concatenate', force_balance: bool=True,
334                   train_dict: dict=None):
335    """
336    K-fold cross validation for optimizing alpha hyperparameter using 
337    training indices. 
338
339    Parameters
340    ----------
341    adata : list[ad.AnnData]
342        List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 
343        `adata.uns.keys()`.
344
345    group_size : None | int
346        Argument describing how the features are grouped. If `None`, 
347        `2 * adata.uns['D']` will be used. For more information see 
348        [celer documentation](https://mathurinm.github.io/celer/
349        generated/celer.GroupLasso.html).
350
351    tfidf : list[bool]
352        If `False`, no data will be TF-IDF transformed. If 
353        `type(adata) is list` and TF-IDF transformation is desired for 
354        all or some of the data, a bool list corresponding to `adata` 
355        must be provided. To simply TF-IDF transform `adata` when 
356        `type(adata) is ad.AnnData`, use `True`.
357    
358    alpha_array : np.ndarray
359        Array of all alpha values to be tested.
360
361    k : int
362        Number of folds to perform cross validation over.
363            
364    metric : str
365        Which metric to use to optimize alpha. Options are `'AUROC'`, 
366        `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`.
367
368    batches : int
369        The number of batches to use for the distance calculation.
370        This will average the result of `batches` distance calculations
371        of `batch_size` randomly sampled cells. More batches will converge
372        to population distance values at the cost of scalability.
373
374    batch_size : int
375        The number of cells to include per batch for distance
376        calculations. Higher batch size will converge to population
377        distance values at the cost of scalability. If 
378        `batches*batch_size > num_training_cells`, `batch_size` will be 
379        reduced to `int(num_training_cells/batches)`.
380
381    force_balance: bool
382        If `True`, training sets will be balanced to reduce class label 
383        imbalance for each iteration. Defaults to `False`.
384
385    other_factor : float
386        The ratio of cells to sample for the other class for each 
387        model. For example, if classifying B cells with 100 B cells in 
388        training, if `other_factor=1`, 100 cells that are not B cells 
389        will be trained on with the B cells. This will be done for each 
390        fold for each class if `force_balance` is `True`. 
391
392    combination: str
393        How should multiple views of data be combined. For more details 
394        see ad.concat.
395
396    train_dict: dict
397        A `dict` where each key is a class label and values are are the 
398        indices to be trained with for that class for class balance. 
399        All values must be present in each adata.uns['train_indices'].
400
401    Returns
402    -------
403    alpha_star : float | dict
404        If number of classes is more than 2, a dictionary with keys 
405        being class labels and values being the best performing alpha 
406        parameter for that class as a float. Else, a float for 
407        comparing the two classes.
408    """
409    # Need singe-view runs to be iterable
410    if isinstance(adata, ad.AnnData):
411        adata = [adata.copy()]
412
413    if isinstance(tfidf, type(None)):
414        tfidf = len(adata)*[False]
415
416    is_multi = 2 < len(set(adata[0].obs['labels']))
417    
418    if isinstance(group_size, type(None)):
419        group_size = 2*adata[0].uns['D']
420
421
422    if is_multi:
423        alpha_star = multi_optimize_alpha(adata, group_size, tfidf, 
424                                          alpha_array, k, metric, 
425                                          early_stopping, batches, 
426                                          batch_size, force_balance, 
427                                          combination, train_dict)
428        
429    else:
430        alpha_star = bin_optimize_alpha(adata, group_size, tfidf, 
431                                        alpha_array, k, metric, 
432                                        early_stopping, batches, 
433                                        batch_size, combination)
434        
435    return alpha_star
default_alphas = array([1.9 , 1.69, 1.49, 1.28, 1.08, 0.87, 0.67, 0.46, 0.26, 0.05])
def stop_early(metric_array, alpha_idx, fold_idx):
18def stop_early(metric_array, alpha_idx, fold_idx):
19    """
20    Assumes smallest alpha comes first.
21    """
22    # Must be at least two metrics from two alphas to compare
23    if alpha_idx <= 0:
24        return False
25    
26    cur_met = metric_array[alpha_idx, fold_idx]
27    last_met = metric_array[alpha_idx - 1, fold_idx]
28
29    if cur_met < last_met:
30        return True
31    else:
32        return False

Assumes smallest alpha comes first.

def sort_alphas(alpha_array: numpy.ndarray):
35def sort_alphas(alpha_array: np.ndarray):
36    """
37    Sorts alphas from smallest to largest.
38    """
39    order = np.argsort(alpha_array)
40    alpha_array = alpha_array[order]
41
42    return alpha_array

Sorts alphas from smallest to largest.

def get_labels(adata: list | anndata._core.anndata.AnnData):
45def get_labels(adata: list | ad.AnnData):
46    """
47    Copies labels and train indices from `ad.AnnData` object for k-fold 
48    cross validation. Mostly present for readability.
49    """
50    train_indices = adata[0].uns['train_indices'].copy()
51    y = adata[0].obs['labels'].iloc[train_indices].to_numpy()
52
53    return train_indices, y

Copies labels and train indices from ad.AnnData object for k-fold cross validation. Mostly present for readability.

def get_folds(y: <built-in function array>, k: int):
56def get_folds(y: np.array, k: int):
57    """
58    With labels of training data and number of folds, returns the 
59    indices and label for each k-folds.
60    """
61    # Splits the labels evenly between folds
62    pos_idcs = np.where(y == np.unique(y)[0])[0]
63    neg_idcs = np.setdiff1d(np.arange(len(y)), pos_idcs)
64    
65    pos_anno = np.arange(len(pos_idcs)) % k
66    neg_anno = np.arange(len(neg_idcs)) % k
67
68    return pos_idcs, neg_idcs, pos_anno, neg_anno

With labels of training data and number of folds, returns the indices and label for each k-folds.

def bin_optimize_alpha( adata: list[anndata._core.anndata.AnnData], group_size: int | None = None, tfidf: list[bool] = False, alpha_array: numpy.ndarray = array([1.9 , 1.69, 1.49, 1.28, 1.08, 0.87, 0.67, 0.46, 0.26, 0.05]), k: int = 4, metric: str = 'AUROC', early_stopping: bool = False, batches: int = 10, batch_size: int = 100, combination: str = 'concatenate'):
 71def bin_optimize_alpha(adata: list[ad.AnnData], 
 72                       group_size: int | None=None, 
 73                       tfidf: list[bool]=False, 
 74                       alpha_array: np.ndarray=default_alphas, 
 75                       k: int=4, metric: str='AUROC', 
 76                       early_stopping: bool=False,
 77                       batches: int=10, batch_size: int=100,
 78                       combination: str='concatenate'):
 79    """
 80    Iteratively train a grouplasso model and update alpha to find the 
 81    parameter yielding best performing sparsity via k-fold cross 
 82    validation. This function currently only works for binary 
 83    experiments. Called by `scmkl.optimize_alpha()`.
 84
 85    Parameters
 86    ----------
 87    adata : list[ad.AnnData]
 88        List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 
 89        `adata.uns.keys()`.
 90
 91    group_size : None | int
 92        Argument describing how the features are grouped. If `None`, 
 93        `2 * adata.uns['D']` will be used. For more information see 
 94        [celer documentation](https://mathurinm.github.io/celer/
 95        generated/celer.GroupLasso.html).
 96
 97    tfidf : list | bool
 98        If `False`, no data will be TF-IDF transformed. If 
 99        `type(adata) is list` and TF-IDF transformation is desired for 
100        all or some of the data, a bool list corresponding to `adata` 
101        must be provided. To simply TF-IDF transform `adata` when 
102        `type(adata) is ad.AnnData`, use `True`.
103    
104    alpha_array : np.ndarray
105        Array of all alpha values to be tested.
106
107    k : int
108        Number of folds to perform cross validation over.
109            
110    metric : str
111        Which metric to use to optimize alpha. Options are `'AUROC'`, 
112        `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`.
113
114    batches : int
115        The number of batches to use for the distance calculation.
116        This will average the result of `batches` distance calculations
117        of `batch_size` randomly sampled cells. More batches will converge
118        to population distance values at the cost of scalability.
119
120    batch_size : int
121        The number of cells to include per batch for distance
122        calculations. Higher batch size will converge to population
123        distance values at the cost of scalability. If 
124        `batches*batch_size > num_training_cells`, `batch_size` will be 
125        reduced to `int(num_training_cells/batches)`.
126
127    Returns
128    -------
129    alpha_star : float
130        The best performing alpha value from cross validation on 
131        training data.
132
133    Examples
134    --------
135    >>> alpha_star = scmkl.optimize_alpha(adata)
136    >>> alpha_star
137    0.1
138    """    
139    assert isinstance(k, int) and k > 0, "'k' must be positive"
140
141    import warnings 
142    warnings.filterwarnings('ignore')
143
144    alpha_array = sort_alphas(alpha_array)
145
146    # Need even folds for cross validation 
147    train_indices, y = get_labels(adata)
148    pos_idcs, neg_idcs, pos_anno, neg_anno = get_folds(y, k)
149
150    metric_array = np.zeros((len(alpha_array), k))
151
152    gc.collect()
153
154    for fold in np.arange(k):
155        cv_adata = [adata[i][train_indices, :] 
156                    for i in range(len(adata))]
157        
158        for i in range(len(cv_adata)):
159            if 'sigma' in cv_adata[i].uns_keys():
160                del cv_adata[i].uns['sigma']
161
162        # Create CV train/test indices
163        fold_train = np.concatenate((pos_idcs[np.where(pos_anno != fold)[0]], 
164                                     neg_idcs[np.where(neg_anno != fold)[0]]))
165        fold_test = np.concatenate((pos_idcs[np.where(pos_anno == fold)[0]], 
166                                    neg_idcs[np.where(neg_anno == fold)[0]]))
167
168
169        for i in range(len(cv_adata)):
170            cv_adata[i].uns['train_indices'] = fold_train
171            cv_adata[i].uns['test_indices'] = fold_test
172            if tfidf[i]:
173                cv_adata[i] = tfidf_normalize(cv_adata[i], binarize=True)
174
175            cv_adata[i] = calculate_z(cv_adata[i], n_features= 5000, 
176                            batches = batches, batch_size = batch_size)
177            
178        names = ['Adata ' + str(i + 1) for i in range(len(cv_adata))]
179        cv_adata = multimodal_processing(adata, names, tfidf, 
180                                            combination, batches, 
181                                            batch_size)
182                    
183            
184        # In train_model we index Z_train for balancing multiclass labels. We just recreate
185        # dummy indices here that are unused for use in the binary case
186        cv_adata.uns['train_indices'] = np.arange(0, len(fold_train))
187
188        gc.collect()
189
190        for i, alpha in enumerate(alpha_array):
191
192            cv_adata = train_model(cv_adata, group_size, alpha = alpha)
193            _, metrics = predict(cv_adata, metrics = [metric])
194            metric_array[i, fold] = metrics[metric]
195
196            # If metrics are decreasing, cv stopped and moving to next fold
197            end_fold = stop_early(metric_array, alpha_idx=i, fold_idx=fold)
198            if end_fold and early_stopping:
199                break
200
201        del cv_adata
202        gc.collect()
203
204    # Take AUROC mean across the k folds to find alpha yielding highest AUROC
205    alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))]
206    gc.collect()
207
208    return alpha_star

Iteratively train a grouplasso model and update alpha to find the parameter yielding best performing sparsity via k-fold cross validation. This function currently only works for binary experiments. Called by scmkl.optimize_alpha.

Parameters
  • adata (list[ad.AnnData]): List of ad.AnnData(s) with 'Z_train' and 'Z_test' in adata.uns.keys().
  • group_size (None | int): Argument describing how the features are grouped. If None, 2 * adata.uns['D'] will be used. For more information see celer documentation.
  • tfidf (list | bool): If False, no data will be TF-IDF transformed. If type(adata) is list and TF-IDF transformation is desired for all or some of the data, a bool list corresponding to adata must be provided. To simply TF-IDF transform adata when type(adata) is ad.AnnData, use True.
  • alpha_array (np.ndarray): Array of all alpha values to be tested.
  • k (int): Number of folds to perform cross validation over.
  • metric (str): Which metric to use to optimize alpha. Options are 'AUROC', 'Accuracy', 'F1-Score', 'Precision', and 'Recall'.
  • batches (int): The number of batches to use for the distance calculation. This will average the result of batches distance calculations of batch_size randomly sampled cells. More batches will converge to population distance values at the cost of scalability.
  • batch_size (int): The number of cells to include per batch for distance calculations. Higher batch size will converge to population distance values at the cost of scalability. If batches*batch_size > num_training_cells, batch_size will be reduced to int(num_training_cells/batches).
Returns
  • alpha_star (float): The best performing alpha value from cross validation on training data.
Examples
>>> alpha_star = scmkl.optimize_alpha(adata)
>>> alpha_star
0.1
def multi_optimize_alpha( adata: list[anndata._core.anndata.AnnData], group_size: int, tfidf: list[bool] = [False], alpha_array: numpy.ndarray = array([1.9 , 1.69, 1.49, 1.28, 1.08, 0.87, 0.67, 0.46, 0.26, 0.05]), k: int = 4, metric: str = 'AUROC', early_stopping: bool = False, batches: int = 10, batch_size: int = 100, force_balance: bool = True, combination: str = 'concatenate', train_dict: dict = None):
211def multi_optimize_alpha(adata: list[ad.AnnData], group_size: int, 
212                         tfidf: list[bool]=[False], 
213                         alpha_array: np.ndarray=default_alphas, k: int=4, 
214                         metric: str='AUROC', early_stopping: bool=False,
215                         batches: int=10, batch_size: int=100, 
216                         force_balance: bool=True, combination: str='concatenate',
217                         train_dict: dict=None):
218    """
219    Wrapper function for running k-fold cross validation for every 
220    label in a multiclass experiment. Called by 
221    `scmkl.optimize_alpha()`. 
222
223    Parameters
224    ----------
225    adata : list[ad.AnnData]
226        List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 
227        `adata.uns.keys()`.
228
229    group_size : None | int
230        Argument describing how the features are grouped. If `None`, 
231        `2 * adata.uns['D']` will be used. For more information see 
232        [celer documentation](https://mathurinm.github.io/celer/
233        generated/celer.GroupLasso.html).
234
235    tfidf : list[bool]
236        If `False`, no data will be TF-IDF transformed. If 
237        `type(adata) is list` and TF-IDF transformation is desired for 
238        all or some of the data, a bool list corresponding to `adata` 
239        must be provided. To simply TF-IDF transform `adata` when 
240        `type(adata) is ad.AnnData`, use `True`.
241    
242    alpha_array : np.ndarray
243        Array of all alpha values to be tested.
244
245    k : int
246        Number of folds to perform cross validation over.
247            
248    metric : str
249        Which metric to use to optimize alpha. Options are `'AUROC'`, 
250        `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`.
251
252    batches : int
253        The number of batches to use for the distance calculation.
254        This will average the result of `batches` distance calculations
255        of `batch_size` randomly sampled cells. More batches will converge
256        to population distance values at the cost of scalability.
257
258    batch_size : int
259        The number of cells to include per batch for distance
260        calculations. Higher batch size will converge to population
261        distance values at the cost of scalability. If 
262        `batches*batch_size > num_training_cells`, `batch_size` will be 
263        reduced to `int(num_training_cells/batches)`.
264
265    force_balance: bool
266        If `True`, training sets will be balanced to reduce class label 
267        imbalance for each iteration. Defaults to `False`.
268
269    other_factor : float
270        The ratio of cells to sample for the other class for each 
271        model. For example, if classifying B cells with 100 B cells in 
272        training, if `other_factor=1`, 100 cells that are not B cells 
273        will be trained on with the B cells. This will be done for each 
274        fold for each class if `force_balance` is `True`. 
275
276    combination: str
277        How should multiple views of data be combined. For more details 
278        see ad.concat.
279
280    train_dict: dict
281        A `dict` where each key is a class label and values are are the 
282        indices to be trained with for that class for class balance. 
283        All values must be present in each adata.uns['train_indices'].
284
285    Returns
286    -------
287    alpha_star : dict
288        A dictionary with keys being class labels and values being the 
289        best performing alpha parameter for that class as a float.
290    """
291    classes = np.unique(adata[0].obs['labels'])
292    orig_labels = adata[0].obs['labels'].to_numpy().copy()
293    orig_train = adata[0].uns['train_indices'].copy()
294
295    if train_dict:
296        train_idcs = train_dict
297    else:
298        if force_balance:
299            train_idcs = get_class_train(adata[0].uns['train_indices'], 
300                                         adata[0].obs['labels'], 
301                                         adata[0].uns['seed_obj'])
302        else:
303            train_idcs = {ct: adata[0].uns['train_indices'].copy()
304                          for ct in classes}
305
306    opt_alpha_dict = dict()
307
308    for cl in classes:
309        temp_classes = orig_labels.copy()
310        temp_classes[temp_classes != cl] = 'other'
311
312        for i in range(len(adata)):
313            adata[i].obs['labels'] = temp_classes.copy()
314            adata[i].uns['train_indices'] = train_idcs[cl]
315
316        opt_alpha_dict[cl] = bin_optimize_alpha(adata, group_size, tfidf, 
317                                                alpha_array, k, metric, 
318                                                early_stopping, batches, 
319                                                batch_size, combination)     
320    
321    # Global adata obj will be permanently changed if not reset
322    for i in range(len(adata)):
323        adata[i].obs['labels'] = orig_labels
324        adata[i].uns['train_indices'] = orig_train
325
326    return opt_alpha_dict

Wrapper function for running k-fold cross validation for every label in a multiclass experiment. Called by scmkl.optimize_alpha.

Parameters
  • adata (list[ad.AnnData]): List of ad.AnnData(s) with 'Z_train' and 'Z_test' in adata.uns.keys().
  • group_size (None | int): Argument describing how the features are grouped. If None, 2 * adata.uns['D'] will be used. For more information see celer documentation.
  • tfidf (list[bool]): If False, no data will be TF-IDF transformed. If type(adata) is list and TF-IDF transformation is desired for all or some of the data, a bool list corresponding to adata must be provided. To simply TF-IDF transform adata when type(adata) is ad.AnnData, use True.
  • alpha_array (np.ndarray): Array of all alpha values to be tested.
  • k (int): Number of folds to perform cross validation over.
  • metric (str): Which metric to use to optimize alpha. Options are 'AUROC', 'Accuracy', 'F1-Score', 'Precision', and 'Recall'.
  • batches (int): The number of batches to use for the distance calculation. This will average the result of batches distance calculations of batch_size randomly sampled cells. More batches will converge to population distance values at the cost of scalability.
  • batch_size (int): The number of cells to include per batch for distance calculations. Higher batch size will converge to population distance values at the cost of scalability. If batches*batch_size > num_training_cells, batch_size will be reduced to int(num_training_cells/batches).
  • force_balance (bool): If True, training sets will be balanced to reduce class label imbalance for each iteration. Defaults to False.
  • other_factor (float): The ratio of cells to sample for the other class for each model. For example, if classifying B cells with 100 B cells in training, if other_factor=1, 100 cells that are not B cells will be trained on with the B cells. This will be done for each fold for each class if force_balance is True.
  • combination (str): How should multiple views of data be combined. For more details see ad.concat.
  • train_dict (dict): A dict where each key is a class label and values are are the indices to be trained with for that class for class balance. All values must be present in each adata.uns['train_indices'].
Returns
  • alpha_star (dict): A dictionary with keys being class labels and values being the best performing alpha parameter for that class as a float.
def optimize_alpha( adata: anndata._core.anndata.AnnData | list[anndata._core.anndata.AnnData], group_size: int | None = None, tfidf: None | list[bool] = None, alpha_array: numpy.ndarray = array([1.9 , 1.69, 1.49, 1.28, 1.08, 0.87, 0.67, 0.46, 0.26, 0.05]), k: int = 4, metric: str = 'AUROC', early_stopping: bool = False, batches: int = 10, batch_size: int = 100, combination: str = 'concatenate', force_balance: bool = True, train_dict: dict = None):
329def optimize_alpha(adata: ad.AnnData | list[ad.AnnData], 
330                   group_size: int | None=None, tfidf: None | list[bool]=None, 
331                   alpha_array: np.ndarray=default_alphas, k: int=4, 
332                   metric: str='AUROC', early_stopping: bool=False,
333                   batches: int=10, batch_size: int=100, 
334                   combination: str='concatenate', force_balance: bool=True,
335                   train_dict: dict=None):
336    """
337    K-fold cross validation for optimizing alpha hyperparameter using 
338    training indices. 
339
340    Parameters
341    ----------
342    adata : list[ad.AnnData]
343        List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 
344        `adata.uns.keys()`.
345
346    group_size : None | int
347        Argument describing how the features are grouped. If `None`, 
348        `2 * adata.uns['D']` will be used. For more information see 
349        [celer documentation](https://mathurinm.github.io/celer/
350        generated/celer.GroupLasso.html).
351
352    tfidf : list[bool]
353        If `False`, no data will be TF-IDF transformed. If 
354        `type(adata) is list` and TF-IDF transformation is desired for 
355        all or some of the data, a bool list corresponding to `adata` 
356        must be provided. To simply TF-IDF transform `adata` when 
357        `type(adata) is ad.AnnData`, use `True`.
358    
359    alpha_array : np.ndarray
360        Array of all alpha values to be tested.
361
362    k : int
363        Number of folds to perform cross validation over.
364            
365    metric : str
366        Which metric to use to optimize alpha. Options are `'AUROC'`, 
367        `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`.
368
369    batches : int
370        The number of batches to use for the distance calculation.
371        This will average the result of `batches` distance calculations
372        of `batch_size` randomly sampled cells. More batches will converge
373        to population distance values at the cost of scalability.
374
375    batch_size : int
376        The number of cells to include per batch for distance
377        calculations. Higher batch size will converge to population
378        distance values at the cost of scalability. If 
379        `batches*batch_size > num_training_cells`, `batch_size` will be 
380        reduced to `int(num_training_cells/batches)`.
381
382    force_balance: bool
383        If `True`, training sets will be balanced to reduce class label 
384        imbalance for each iteration. Defaults to `False`.
385
386    other_factor : float
387        The ratio of cells to sample for the other class for each 
388        model. For example, if classifying B cells with 100 B cells in 
389        training, if `other_factor=1`, 100 cells that are not B cells 
390        will be trained on with the B cells. This will be done for each 
391        fold for each class if `force_balance` is `True`. 
392
393    combination: str
394        How should multiple views of data be combined. For more details 
395        see ad.concat.
396
397    train_dict: dict
398        A `dict` where each key is a class label and values are are the 
399        indices to be trained with for that class for class balance. 
400        All values must be present in each adata.uns['train_indices'].
401
402    Returns
403    -------
404    alpha_star : float | dict
405        If number of classes is more than 2, a dictionary with keys 
406        being class labels and values being the best performing alpha 
407        parameter for that class as a float. Else, a float for 
408        comparing the two classes.
409    """
410    # Need singe-view runs to be iterable
411    if isinstance(adata, ad.AnnData):
412        adata = [adata.copy()]
413
414    if isinstance(tfidf, type(None)):
415        tfidf = len(adata)*[False]
416
417    is_multi = 2 < len(set(adata[0].obs['labels']))
418    
419    if isinstance(group_size, type(None)):
420        group_size = 2*adata[0].uns['D']
421
422
423    if is_multi:
424        alpha_star = multi_optimize_alpha(adata, group_size, tfidf, 
425                                          alpha_array, k, metric, 
426                                          early_stopping, batches, 
427                                          batch_size, force_balance, 
428                                          combination, train_dict)
429        
430    else:
431        alpha_star = bin_optimize_alpha(adata, group_size, tfidf, 
432                                        alpha_array, k, metric, 
433                                        early_stopping, batches, 
434                                        batch_size, combination)
435        
436    return alpha_star

K-fold cross validation for optimizing alpha hyperparameter using training indices.

Parameters
  • adata (list[ad.AnnData]): List of ad.AnnData(s) with 'Z_train' and 'Z_test' in adata.uns.keys().
  • group_size (None | int): Argument describing how the features are grouped. If None, 2 * adata.uns['D'] will be used. For more information see celer documentation.
  • tfidf (list[bool]): If False, no data will be TF-IDF transformed. If type(adata) is list and TF-IDF transformation is desired for all or some of the data, a bool list corresponding to adata must be provided. To simply TF-IDF transform adata when type(adata) is ad.AnnData, use True.
  • alpha_array (np.ndarray): Array of all alpha values to be tested.
  • k (int): Number of folds to perform cross validation over.
  • metric (str): Which metric to use to optimize alpha. Options are 'AUROC', 'Accuracy', 'F1-Score', 'Precision', and 'Recall'.
  • batches (int): The number of batches to use for the distance calculation. This will average the result of batches distance calculations of batch_size randomly sampled cells. More batches will converge to population distance values at the cost of scalability.
  • batch_size (int): The number of cells to include per batch for distance calculations. Higher batch size will converge to population distance values at the cost of scalability. If batches*batch_size > num_training_cells, batch_size will be reduced to int(num_training_cells/batches).
  • force_balance (bool): If True, training sets will be balanced to reduce class label imbalance for each iteration. Defaults to False.
  • other_factor (float): The ratio of cells to sample for the other class for each model. For example, if classifying B cells with 100 B cells in training, if other_factor=1, 100 cells that are not B cells will be trained on with the B cells. This will be done for each fold for each class if force_balance is True.
  • combination (str): How should multiple views of data be combined. For more details see ad.concat.
  • train_dict (dict): A dict where each key is a class label and values are are the indices to be trained with for that class for class balance. All values must be present in each adata.uns['train_indices'].
Returns
  • alpha_star (float | dict): If number of classes is more than 2, a dictionary with keys being class labels and values being the best performing alpha parameter for that class as a float. Else, a float for comparing the two classes.