scmkl.optimize_alpha

  1import gc
  2import numpy as np
  3import anndata as ad
  4from sklearn.model_selection import StratifiedKFold
  5
  6from scmkl.tfidf_normalize import tfidf_normalize
  7from scmkl.calculate_z import calculate_z
  8from scmkl.train_model import train_model
  9from scmkl.multimodal_processing import multimodal_processing
 10from scmkl.test import predict
 11from scmkl.one_v_rest import get_class_train
 12from scmkl.create_adata import sort_samples
 13
 14
 15# Array of alphas to be used if not provided
 16default_alphas = np.round(np.linspace(1.9, 0.05, 10),2)
 17
 18
 19def stop_early(metric_array, alpha_idx, fold_idx):
 20    """
 21    Assumes smallest alpha comes first.
 22    """
 23    # Must be at least two metrics from two alphas to compare
 24    if alpha_idx <= 0:
 25        return False
 26    
 27    cur_met = metric_array[alpha_idx, fold_idx]
 28    last_met = metric_array[alpha_idx - 1, fold_idx]
 29
 30    if cur_met < last_met:
 31        return True
 32    else:
 33        return False
 34
 35
 36def sort_alphas(alpha_array: np.ndarray):
 37    """
 38    Sorts alphas from smallest to largest.
 39    """
 40    order = np.argsort(alpha_array)
 41    alpha_array = alpha_array[order]
 42
 43    return alpha_array
 44
 45
 46def get_folds(adata: ad.AnnData, k: int):
 47    """
 48    With labels of samples for cross validation and number of folds, 
 49    returns the indices and label for each k-folds.
 50
 51    Parameters
 52    ----------
 53    adata : ad.AnnData
 54        `AnnData` object containing `'labels'` column in `.obs` and 
 55        `'train_indices'` in `.uns`.
 56
 57    k : int
 58        The number of folds to perform cross validation over.
 59
 60    Returns
 61    -------
 62    folds : dict
 63        A dictionary with keys being [0, k) and values being a tuple 
 64        with first element being training sample indices and the second 
 65        being testing value indices.
 66
 67    Examples
 68    --------
 69    >>> adata = scmkl.create_adata(...)
 70    >>> folds = scmkl.get_folds(adata, k=4)
 71    >>>
 72    >>> train_fold_0, test_fold_0 = folds[0]
 73    >>>
 74    >>> train_fold_0
 75    array([  0,   3,   5,   6,   8,   9,  10,  11,  12,  13])
 76    >>> test_fold_0
 77    array([  1,   2,  4,  7])
 78    """
 79    y = adata.obs['labels'][adata.uns['train_indices']].copy()
 80
 81    # Creating dummy x prevents holding more views in memory
 82    x = y.copy()
 83
 84    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=100)
 85
 86    folds = dict()
 87    for fold, (fold_train, fold_test) in enumerate(skf.split(x, y)):
 88        folds[fold] = (fold_train, fold_test)
 89
 90    return folds
 91
 92
 93def prepare_fold(fold_adata, sort_idcs, fold_train, fold_test):
 94    """
 95    Reorders adata samples, reassigns each `adata.uns['train_indices' & 
 96    'test_indices']`, and removes sigmas if present.
 97
 98    Parameters
 99    ----------
100    fold_adata : list[ad.AnnData]
101        A `list` of `AnnData` objects to be reordered based on fold 
102        indices.
103
104    sort_idcs: np.ndarray
105        The indices that will sort `AnnData`s as all train samples then 
106        all test.
107
108    fold_train : np.ndarray
109        The indices of training sample for respective fold.
110
111    fold_test : np.ndarray
112        The indices of testing samples for respective fold.
113
114    Returns
115    -------
116    fold_adata : list[ad.AnnData]
117        The `list` of `AnnData`s with each `AnnData` sorted as all 
118        training samples then all testing samples. 
119        `adata.uns['train_indices' & 'test_indices']` are also updated 
120        to reflect new sample positions.
121
122    Examples
123    --------
124    >>> adata = [scmkl.create_adata(...)]
125    >>> folds = scmkl.get_folds(adata[0], k=4)
126    >>> sort_idcs, fold_train, fold_test = sort_samples(fold_train, 
127    ...                                                 fold_test)
128    >>> fold_adata = scmkl.prepare_fold(adata, sort_idcs, 
129    ...                                 fold_train, fold_test)
130    """
131    for i in range(len(fold_adata)):
132        fold_adata[i] = fold_adata[i][sort_idcs]
133        fold_adata[i].uns['train_indices'] = fold_train
134        fold_adata[i].uns['test_indices'] = fold_test
135    
136    # Need to recalculate sigmas for new fold train
137    for i in range(len(fold_adata)):
138        if 'sigma' in fold_adata[i].uns_keys():
139            del fold_adata[i].uns['sigma']
140
141    return fold_adata
142
143
144def process_fold(fold_adata, names, tfidf, combination, batches, batch_size):
145    """
146    Combines adata if needed, estimates sigmas, and calculates kernels 
147    for model training and evaluation.
148
149    Parameters
150    ----------
151    fold_adata : list[ad.AnnData]
152        A `list` of `AnnData` objects to combine and calculate kernels 
153        for.
154
155    names : list
156        A list of names respective to each `AnnData` in `fold_adata` 
157        for verbose outputs.
158
159    tfidf : list[bool]
160        A boolean list indicating whether or not to perform TF-IDF 
161        transformation for each adata respective to `fold_adata`.
162
163    combination : str
164        The method of combining `AnnData` objects passed to 
165        `ad.concatenate()`. Ignored if `len(fold_adata) == 1`.
166
167    batches : int
168        The number of batches for kernel width (sigma) estimation.
169
170    batch_size : int
171        The number of samples to include in each batch of kernel width 
172        (sigma) estimation.
173
174    Examples
175    --------
176    >>> adata = [scmkl.create_adata(...)]
177    >>> tfidf = [False]
178    >>> names = ['adata1']
179    >>> folds = scmkl.get_folds(adata[0], k=4)
180    >>> sort_idcs, fold_train, fold_test = sort_samples(fold_train, 
181    ...                                                 fold_test)
182    >>> fold_adata = scmkl.prepare_fold(adata, sort_idcs, 
183    ...                                 fold_train, fold_test)
184    >>> fold_adata = scmkl.process_fold(fold_adata, names, tfidf)
185    """
186    if 1 < len(fold_adata):
187        fold_adata = multimodal_processing(fold_adata, names, tfidf, 
188                                           combination, batches, 
189                                           batch_size, False)
190    else:
191        fold_adata = fold_adata[0]
192        if tfidf[0]:
193            fold_adata = tfidf_normalize(fold_adata, binarize=True)
194        fold_adata = calculate_z(fold_adata, n_features= 5000, 
195                                 batches=batches, batch_size=batch_size)
196        
197    return fold_adata
198
199
200def bin_optimize_alpha(adata: list[ad.AnnData], 
201                       group_size: int | None=None, 
202                       tfidf: list[bool]=False, 
203                       alpha_array: np.ndarray=default_alphas, 
204                       k: int=4, metric: str='AUROC', 
205                       early_stopping: bool=False,
206                       batches: int=10, batch_size: int=100,
207                       combination: str='concatenate'):
208    """
209    Iteratively train a grouplasso model and update alpha to find the 
210    parameter yielding best performing sparsity via k-fold cross 
211    validation. This function currently only works for binary 
212    experiments. Called by `scmkl.optimize_alpha()`.
213
214    Parameters
215    ----------
216    adata : list[ad.AnnData]
217        List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 
218        `adata.uns.keys()`.
219
220    group_size : None | int
221        Argument describing how the features are grouped. If `None`, 
222        `2 * adata.uns['D']` will be used. For more information see 
223        [celer documentation](https://mathurinm.github.io/celer/
224        generated/celer.GroupLasso.html).
225
226    tfidf : list | bool
227        If `False`, no data will be TF-IDF transformed. If 
228        `type(adata) is list` and TF-IDF transformation is desired for 
229        all or some of the data, a bool list corresponding to `adata` 
230        must be provided. To simply TF-IDF transform `adata` when 
231        `type(adata) is ad.AnnData`, use `True`.
232    
233    alpha_array : np.ndarray
234        Array of all alpha values to be tested.
235
236    k : int
237        Number of folds to perform cross validation over.
238            
239    metric : str
240        Which metric to use to optimize alpha. Options are `'AUROC'`, 
241        `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`.
242
243    batches : int
244        The number of batches to use for the distance calculation.
245        This will average the result of `batches` distance calculations
246        of `batch_size` randomly sampled cells. More batches will converge
247        to population distance values at the cost of scalability.
248
249    batch_size : int
250        The number of cells to include per batch for distance
251        calculations. Higher batch size will converge to population
252        distance values at the cost of scalability. If 
253        `batches*batch_size > num_training_cells`, `batch_size` will be 
254        reduced to `int(num_training_cells/batches)`.
255
256    Returns
257    -------
258    alpha_star : float
259        The best performing alpha value from cross validation on 
260        training data.
261
262    Examples
263    --------
264    >>> alpha_star = scmkl.optimize_alpha(adata)
265    >>> alpha_star
266    0.1
267    """    
268    assert isinstance(k, int) and k > 0, "'k' must be positive"
269
270    import warnings 
271    warnings.filterwarnings('ignore')
272
273    alpha_array = sort_alphas(alpha_array)
274
275    # Only want folds for training samples
276    train_indices = adata[0].uns['train_indices'].copy()
277    cv_adata = [adata[i][train_indices, :].copy()
278                for i in range(len(adata))]
279
280    folds = get_folds(adata[0], k)
281
282    metric_array = np.zeros((len(alpha_array), k))
283
284    for fold in range(k):
285
286        fold_train, fold_test = folds[fold]
287        fold_adata = cv_adata.copy()
288
289        # Downstream functions expect train then test samples in adata(s)
290        sort_idcs, fold_train, fold_test = sort_samples(fold_train, fold_test)
291        fold_adata = prepare_fold(fold_adata, sort_idcs, 
292                                  fold_train, fold_test)
293            
294        names = ['Adata ' + str(i + 1) for i in range(len(cv_adata))]
295
296        # Adatas need combined if applicable and kernels calculated 
297        fold_adata = process_fold(fold_adata, names, tfidf, combination, 
298                                  batches, batch_size)
299
300        for i, alpha in enumerate(alpha_array):
301
302            fold_adata = train_model(fold_adata, group_size, alpha=alpha)
303            _, metrics = predict(fold_adata, metrics=[metric])
304            metric_array[i, fold] = metrics[metric]
305
306            # If metrics are decreasing, cv stopped and moving to next fold
307            end_fold = stop_early(metric_array, alpha_idx=i, fold_idx=fold)
308            if end_fold and early_stopping:
309                break
310
311        del fold_adata
312        gc.collect()
313
314    del cv_adata
315    gc.collect()
316
317    # Need highest performing alpha for given metric
318    alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))]
319
320    return alpha_star
321
322
323def multi_optimize_alpha(adata: list[ad.AnnData], group_size: int, 
324                         tfidf: list[bool]=[False], 
325                         alpha_array: np.ndarray=default_alphas, k: int=4, 
326                         metric: str='AUROC', early_stopping: bool=False,
327                         batches: int=10, batch_size: int=100, 
328                         force_balance: bool=True, combination: str='concatenate',
329                         train_dict: dict=None):
330    """
331    Wrapper function for running k-fold cross validation for every 
332    label in a multiclass experiment. Called by 
333    `scmkl.optimize_alpha()`. 
334
335    Parameters
336    ----------
337    adata : list[ad.AnnData]
338        List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 
339        `adata.uns.keys()`.
340
341    group_size : None | int
342        Argument describing how the features are grouped. If `None`, 
343        `2 * adata.uns['D']` will be used. For more information see 
344        [celer documentation](https://mathurinm.github.io/celer/
345        generated/celer.GroupLasso.html).
346
347    tfidf : list[bool]
348        If `False`, no data will be TF-IDF transformed. If 
349        `type(adata) is list` and TF-IDF transformation is desired for 
350        all or some of the data, a bool list corresponding to `adata` 
351        must be provided. To simply TF-IDF transform `adata` when 
352        `type(adata) is ad.AnnData`, use `True`.
353    
354    alpha_array : np.ndarray
355        Array of all alpha values to be tested.
356
357    k : int
358        Number of folds to perform cross validation over.
359            
360    metric : str
361        Which metric to use to optimize alpha. Options are `'AUROC'`, 
362        `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`.
363
364    batches : int
365        The number of batches to use for the distance calculation.
366        This will average the result of `batches` distance calculations
367        of `batch_size` randomly sampled cells. More batches will converge
368        to population distance values at the cost of scalability.
369
370    batch_size : int
371        The number of cells to include per batch for distance
372        calculations. Higher batch size will converge to population
373        distance values at the cost of scalability. If 
374        `batches*batch_size > num_training_cells`, `batch_size` will be 
375        reduced to `int(num_training_cells/batches)`.
376
377    force_balance: bool
378        If `True`, training sets will be balanced to reduce class label 
379        imbalance for each iteration. Defaults to `False`.
380
381    other_factor : float
382        The ratio of cells to sample for the other class for each 
383        model. For example, if classifying B cells with 100 B cells in 
384        training, if `other_factor=1`, 100 cells that are not B cells 
385        will be trained on with the B cells. This will be done for each 
386        fold for each class if `force_balance` is `True`. 
387
388    combination: str
389        How should multiple views of data be combined. For more details 
390        see ad.concat.
391
392    train_dict: dict
393        A `dict` where each key is a class label and values are are the 
394        indices to be trained with for that class for class balance. 
395        All values must be present in each adata.uns['train_indices'].
396
397    Returns
398    -------
399    alpha_star : dict
400        A dictionary with keys being class labels and values being the 
401        best performing alpha parameter for that class as a float.
402    """
403    classes = np.unique(adata[0].obs['labels'])
404    orig_labels = adata[0].obs['labels'].to_numpy().copy()
405    orig_train = adata[0].uns['train_indices'].copy()
406
407    if train_dict:
408        train_idcs = train_dict
409    else:
410        if force_balance:
411            train_idcs = get_class_train(adata[0].uns['train_indices'], 
412                                         adata[0].obs['labels'], 
413                                         adata[0].uns['seed_obj'])
414        else:
415            train_idcs = {ct: adata[0].uns['train_indices'].copy()
416                          for ct in classes}
417
418    opt_alpha_dict = dict()
419
420    for cl in classes:
421        temp_classes = orig_labels.copy()
422        temp_classes[temp_classes != cl] = 'other'
423
424        for i in range(len(adata)):
425            adata[i].obs['labels'] = temp_classes.copy()
426            adata[i].uns['train_indices'] = train_idcs[cl]
427        
428        opt_alpha_dict[cl] = bin_optimize_alpha(adata, group_size, tfidf, 
429                                                alpha_array, k, metric, 
430                                                early_stopping, batches, 
431                                                batch_size, combination)     
432    
433    # Global adata obj will be permanently changed if not reset
434    for i in range(len(adata)):
435        adata[i].obs['labels'] = orig_labels
436        adata[i].uns['train_indices'] = orig_train
437
438    return opt_alpha_dict
439
440
441def optimize_alpha(adata: ad.AnnData | list[ad.AnnData], 
442                   group_size: int | None=None, tfidf: None | list[bool]=None, 
443                   alpha_array: np.ndarray=default_alphas, k: int=4, 
444                   metric: str='AUROC', early_stopping: bool=False,
445                   batches: int=10, batch_size: int=100, 
446                   combination: str='concatenate', force_balance: bool=True,
447                   train_dict: dict=None):
448    """
449    K-fold cross validation for optimizing alpha hyperparameter using 
450    training indices. 
451
452    Parameters
453    ----------
454    adata : list[ad.AnnData]
455        List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 
456        `adata.uns.keys()`.
457
458    group_size : None | int
459        Argument describing how the features are grouped. If `None`, 
460        `2 * adata.uns['D']` will be used. For more information see 
461        [celer documentation](https://mathurinm.github.io/celer/
462        generated/celer.GroupLasso.html).
463
464    tfidf : list[bool]
465        If `False`, no data will be TF-IDF transformed. If 
466        `type(adata) is list` and TF-IDF transformation is desired for 
467        all or some of the data, a bool list corresponding to `adata` 
468        must be provided. To simply TF-IDF transform `adata` when 
469        `type(adata) is ad.AnnData`, use `True`.
470    
471    alpha_array : np.ndarray
472        Array of all alpha values to be tested.
473
474    k : int
475        Number of folds to perform cross validation over.
476            
477    metric : str
478        Which metric to use to optimize alpha. Options are `'AUROC'`, 
479        `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`.
480
481    batches : int
482        The number of batches to use for the distance calculation.
483        This will average the result of `batches` distance calculations
484        of `batch_size` randomly sampled cells. More batches will converge
485        to population distance values at the cost of scalability.
486
487    batch_size : int
488        The number of cells to include per batch for distance
489        calculations. Higher batch size will converge to population
490        distance values at the cost of scalability. If 
491        `batches*batch_size > num_training_cells`, `batch_size` will be 
492        reduced to `int(num_training_cells/batches)`.
493
494    force_balance: bool
495        If `True`, training sets will be balanced to reduce class label 
496        imbalance for each iteration. Defaults to `False`.
497
498    other_factor : float
499        The ratio of cells to sample for the other class for each 
500        model. For example, if classifying B cells with 100 B cells in 
501        training, if `other_factor=1`, 100 cells that are not B cells 
502        will be trained on with the B cells. This will be done for each 
503        fold for each class if `force_balance` is `True`. 
504
505    combination: str
506        How should multiple views of data be combined. For more details 
507        see ad.concat.
508
509    train_dict: dict
510        A `dict` where each key is a class label and values are are the 
511        indices to be trained with for that class for class balance. 
512        All values must be present in each adata.uns['train_indices'].
513
514    Returns
515    -------
516    alpha_star : float | dict
517        If number of classes is more than 2, a dictionary with keys 
518        being class labels and values being the best performing alpha 
519        parameter for that class as a float. Else, a float for 
520        comparing the two classes.
521    """
522    # Need singe-view runs to be iterable
523    if isinstance(adata, ad.AnnData):
524        adata = [adata.copy()]
525
526    if isinstance(tfidf, type(None)):
527        tfidf = len(adata)*[False]
528
529    is_multi = 2 < len(set(adata[0].obs['labels']))
530    
531    if isinstance(group_size, type(None)):
532        group_size = 2*adata[0].uns['D']
533
534
535    if is_multi:
536        alpha_star = multi_optimize_alpha(adata, group_size, tfidf, 
537                                          alpha_array, k, metric, 
538                                          early_stopping, batches, 
539                                          batch_size, force_balance, 
540                                          combination, train_dict)
541        
542    else:
543        alpha_star = bin_optimize_alpha(adata, group_size, tfidf, 
544                                        alpha_array, k, metric, 
545                                        early_stopping, batches, 
546                                        batch_size, combination)
547        
548    gc.collect()
549
550    return alpha_star
default_alphas = array([1.9 , 1.69, 1.49, 1.28, 1.08, 0.87, 0.67, 0.46, 0.26, 0.05])
def stop_early(metric_array, alpha_idx, fold_idx):
20def stop_early(metric_array, alpha_idx, fold_idx):
21    """
22    Assumes smallest alpha comes first.
23    """
24    # Must be at least two metrics from two alphas to compare
25    if alpha_idx <= 0:
26        return False
27    
28    cur_met = metric_array[alpha_idx, fold_idx]
29    last_met = metric_array[alpha_idx - 1, fold_idx]
30
31    if cur_met < last_met:
32        return True
33    else:
34        return False

Assumes smallest alpha comes first.

def sort_alphas(alpha_array: numpy.ndarray):
37def sort_alphas(alpha_array: np.ndarray):
38    """
39    Sorts alphas from smallest to largest.
40    """
41    order = np.argsort(alpha_array)
42    alpha_array = alpha_array[order]
43
44    return alpha_array

Sorts alphas from smallest to largest.

def get_folds(adata: anndata._core.anndata.AnnData, k: int):
47def get_folds(adata: ad.AnnData, k: int):
48    """
49    With labels of samples for cross validation and number of folds, 
50    returns the indices and label for each k-folds.
51
52    Parameters
53    ----------
54    adata : ad.AnnData
55        `AnnData` object containing `'labels'` column in `.obs` and 
56        `'train_indices'` in `.uns`.
57
58    k : int
59        The number of folds to perform cross validation over.
60
61    Returns
62    -------
63    folds : dict
64        A dictionary with keys being [0, k) and values being a tuple 
65        with first element being training sample indices and the second 
66        being testing value indices.
67
68    Examples
69    --------
70    >>> adata = scmkl.create_adata(...)
71    >>> folds = scmkl.get_folds(adata, k=4)
72    >>>
73    >>> train_fold_0, test_fold_0 = folds[0]
74    >>>
75    >>> train_fold_0
76    array([  0,   3,   5,   6,   8,   9,  10,  11,  12,  13])
77    >>> test_fold_0
78    array([  1,   2,  4,  7])
79    """
80    y = adata.obs['labels'][adata.uns['train_indices']].copy()
81
82    # Creating dummy x prevents holding more views in memory
83    x = y.copy()
84
85    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=100)
86
87    folds = dict()
88    for fold, (fold_train, fold_test) in enumerate(skf.split(x, y)):
89        folds[fold] = (fold_train, fold_test)
90
91    return folds

With labels of samples for cross validation and number of folds, returns the indices and label for each k-folds.

Parameters
  • adata (ad.AnnData): AnnData object containing 'labels' column in .obs and 'train_indices' in .uns.
  • k (int): The number of folds to perform cross validation over.
Returns
  • folds (dict): A dictionary with keys being [0, k) and values being a tuple with first element being training sample indices and the second being testing value indices.
Examples
>>> adata = scmkl.create_adata(...)
>>> folds = scmkl.get_folds(adata, k=4)
>>>
>>> train_fold_0, test_fold_0 = folds[0]
>>>
>>> train_fold_0
array([  0,   3,   5,   6,   8,   9,  10,  11,  12,  13])
>>> test_fold_0
array([  1,   2,  4,  7])
def prepare_fold(fold_adata, sort_idcs, fold_train, fold_test):
 94def prepare_fold(fold_adata, sort_idcs, fold_train, fold_test):
 95    """
 96    Reorders adata samples, reassigns each `adata.uns['train_indices' & 
 97    'test_indices']`, and removes sigmas if present.
 98
 99    Parameters
100    ----------
101    fold_adata : list[ad.AnnData]
102        A `list` of `AnnData` objects to be reordered based on fold 
103        indices.
104
105    sort_idcs: np.ndarray
106        The indices that will sort `AnnData`s as all train samples then 
107        all test.
108
109    fold_train : np.ndarray
110        The indices of training sample for respective fold.
111
112    fold_test : np.ndarray
113        The indices of testing samples for respective fold.
114
115    Returns
116    -------
117    fold_adata : list[ad.AnnData]
118        The `list` of `AnnData`s with each `AnnData` sorted as all 
119        training samples then all testing samples. 
120        `adata.uns['train_indices' & 'test_indices']` are also updated 
121        to reflect new sample positions.
122
123    Examples
124    --------
125    >>> adata = [scmkl.create_adata(...)]
126    >>> folds = scmkl.get_folds(adata[0], k=4)
127    >>> sort_idcs, fold_train, fold_test = sort_samples(fold_train, 
128    ...                                                 fold_test)
129    >>> fold_adata = scmkl.prepare_fold(adata, sort_idcs, 
130    ...                                 fold_train, fold_test)
131    """
132    for i in range(len(fold_adata)):
133        fold_adata[i] = fold_adata[i][sort_idcs]
134        fold_adata[i].uns['train_indices'] = fold_train
135        fold_adata[i].uns['test_indices'] = fold_test
136    
137    # Need to recalculate sigmas for new fold train
138    for i in range(len(fold_adata)):
139        if 'sigma' in fold_adata[i].uns_keys():
140            del fold_adata[i].uns['sigma']
141
142    return fold_adata

Reorders adata samples, reassigns each adata.uns['train_indices' & 'test_indices'], and removes sigmas if present.

Parameters
  • fold_adata (list[ad.AnnData]): A list of AnnData objects to be reordered based on fold indices.
  • sort_idcs (np.ndarray): The indices that will sort AnnDatas as all train samples then all test.
  • fold_train (np.ndarray): The indices of training sample for respective fold.
  • fold_test (np.ndarray): The indices of testing samples for respective fold.
Returns
  • fold_adata (list[ad.AnnData]): The list of AnnDatas with each AnnData sorted as all training samples then all testing samples. adata.uns['train_indices' & 'test_indices'] are also updated to reflect new sample positions.
Examples
>>> adata = [scmkl.create_adata(...)]
>>> folds = scmkl.get_folds(adata[0], k=4)
>>> sort_idcs, fold_train, fold_test = sort_samples(fold_train, 
...                                                 fold_test)
>>> fold_adata = scmkl.prepare_fold(adata, sort_idcs, 
...                                 fold_train, fold_test)
def process_fold(fold_adata, names, tfidf, combination, batches, batch_size):
145def process_fold(fold_adata, names, tfidf, combination, batches, batch_size):
146    """
147    Combines adata if needed, estimates sigmas, and calculates kernels 
148    for model training and evaluation.
149
150    Parameters
151    ----------
152    fold_adata : list[ad.AnnData]
153        A `list` of `AnnData` objects to combine and calculate kernels 
154        for.
155
156    names : list
157        A list of names respective to each `AnnData` in `fold_adata` 
158        for verbose outputs.
159
160    tfidf : list[bool]
161        A boolean list indicating whether or not to perform TF-IDF 
162        transformation for each adata respective to `fold_adata`.
163
164    combination : str
165        The method of combining `AnnData` objects passed to 
166        `ad.concatenate()`. Ignored if `len(fold_adata) == 1`.
167
168    batches : int
169        The number of batches for kernel width (sigma) estimation.
170
171    batch_size : int
172        The number of samples to include in each batch of kernel width 
173        (sigma) estimation.
174
175    Examples
176    --------
177    >>> adata = [scmkl.create_adata(...)]
178    >>> tfidf = [False]
179    >>> names = ['adata1']
180    >>> folds = scmkl.get_folds(adata[0], k=4)
181    >>> sort_idcs, fold_train, fold_test = sort_samples(fold_train, 
182    ...                                                 fold_test)
183    >>> fold_adata = scmkl.prepare_fold(adata, sort_idcs, 
184    ...                                 fold_train, fold_test)
185    >>> fold_adata = scmkl.process_fold(fold_adata, names, tfidf)
186    """
187    if 1 < len(fold_adata):
188        fold_adata = multimodal_processing(fold_adata, names, tfidf, 
189                                           combination, batches, 
190                                           batch_size, False)
191    else:
192        fold_adata = fold_adata[0]
193        if tfidf[0]:
194            fold_adata = tfidf_normalize(fold_adata, binarize=True)
195        fold_adata = calculate_z(fold_adata, n_features= 5000, 
196                                 batches=batches, batch_size=batch_size)
197        
198    return fold_adata

Combines adata if needed, estimates sigmas, and calculates kernels for model training and evaluation.

Parameters
  • fold_adata (list[ad.AnnData]): A list of AnnData objects to combine and calculate kernels for.
  • names (list): A list of names respective to each AnnData in fold_adata for verbose outputs.
  • tfidf (list[bool]): A boolean list indicating whether or not to perform TF-IDF transformation for each adata respective to fold_adata.
  • combination (str): The method of combining AnnData objects passed to ad.concatenate(). Ignored if len(fold_adata) == 1.
  • batches (int): The number of batches for kernel width (sigma) estimation.
  • batch_size (int): The number of samples to include in each batch of kernel width (sigma) estimation.
Examples
>>> adata = [scmkl.create_adata(...)]
>>> tfidf = [False]
>>> names = ['adata1']
>>> folds = scmkl.get_folds(adata[0], k=4)
>>> sort_idcs, fold_train, fold_test = sort_samples(fold_train, 
...                                                 fold_test)
>>> fold_adata = scmkl.prepare_fold(adata, sort_idcs, 
...                                 fold_train, fold_test)
>>> fold_adata = scmkl.process_fold(fold_adata, names, tfidf)
def bin_optimize_alpha( adata: list[anndata._core.anndata.AnnData], group_size: int | None = None, tfidf: list[bool] = False, alpha_array: numpy.ndarray = array([1.9 , 1.69, 1.49, 1.28, 1.08, 0.87, 0.67, 0.46, 0.26, 0.05]), k: int = 4, metric: str = 'AUROC', early_stopping: bool = False, batches: int = 10, batch_size: int = 100, combination: str = 'concatenate'):
201def bin_optimize_alpha(adata: list[ad.AnnData], 
202                       group_size: int | None=None, 
203                       tfidf: list[bool]=False, 
204                       alpha_array: np.ndarray=default_alphas, 
205                       k: int=4, metric: str='AUROC', 
206                       early_stopping: bool=False,
207                       batches: int=10, batch_size: int=100,
208                       combination: str='concatenate'):
209    """
210    Iteratively train a grouplasso model and update alpha to find the 
211    parameter yielding best performing sparsity via k-fold cross 
212    validation. This function currently only works for binary 
213    experiments. Called by `scmkl.optimize_alpha()`.
214
215    Parameters
216    ----------
217    adata : list[ad.AnnData]
218        List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 
219        `adata.uns.keys()`.
220
221    group_size : None | int
222        Argument describing how the features are grouped. If `None`, 
223        `2 * adata.uns['D']` will be used. For more information see 
224        [celer documentation](https://mathurinm.github.io/celer/
225        generated/celer.GroupLasso.html).
226
227    tfidf : list | bool
228        If `False`, no data will be TF-IDF transformed. If 
229        `type(adata) is list` and TF-IDF transformation is desired for 
230        all or some of the data, a bool list corresponding to `adata` 
231        must be provided. To simply TF-IDF transform `adata` when 
232        `type(adata) is ad.AnnData`, use `True`.
233    
234    alpha_array : np.ndarray
235        Array of all alpha values to be tested.
236
237    k : int
238        Number of folds to perform cross validation over.
239            
240    metric : str
241        Which metric to use to optimize alpha. Options are `'AUROC'`, 
242        `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`.
243
244    batches : int
245        The number of batches to use for the distance calculation.
246        This will average the result of `batches` distance calculations
247        of `batch_size` randomly sampled cells. More batches will converge
248        to population distance values at the cost of scalability.
249
250    batch_size : int
251        The number of cells to include per batch for distance
252        calculations. Higher batch size will converge to population
253        distance values at the cost of scalability. If 
254        `batches*batch_size > num_training_cells`, `batch_size` will be 
255        reduced to `int(num_training_cells/batches)`.
256
257    Returns
258    -------
259    alpha_star : float
260        The best performing alpha value from cross validation on 
261        training data.
262
263    Examples
264    --------
265    >>> alpha_star = scmkl.optimize_alpha(adata)
266    >>> alpha_star
267    0.1
268    """    
269    assert isinstance(k, int) and k > 0, "'k' must be positive"
270
271    import warnings 
272    warnings.filterwarnings('ignore')
273
274    alpha_array = sort_alphas(alpha_array)
275
276    # Only want folds for training samples
277    train_indices = adata[0].uns['train_indices'].copy()
278    cv_adata = [adata[i][train_indices, :].copy()
279                for i in range(len(adata))]
280
281    folds = get_folds(adata[0], k)
282
283    metric_array = np.zeros((len(alpha_array), k))
284
285    for fold in range(k):
286
287        fold_train, fold_test = folds[fold]
288        fold_adata = cv_adata.copy()
289
290        # Downstream functions expect train then test samples in adata(s)
291        sort_idcs, fold_train, fold_test = sort_samples(fold_train, fold_test)
292        fold_adata = prepare_fold(fold_adata, sort_idcs, 
293                                  fold_train, fold_test)
294            
295        names = ['Adata ' + str(i + 1) for i in range(len(cv_adata))]
296
297        # Adatas need combined if applicable and kernels calculated 
298        fold_adata = process_fold(fold_adata, names, tfidf, combination, 
299                                  batches, batch_size)
300
301        for i, alpha in enumerate(alpha_array):
302
303            fold_adata = train_model(fold_adata, group_size, alpha=alpha)
304            _, metrics = predict(fold_adata, metrics=[metric])
305            metric_array[i, fold] = metrics[metric]
306
307            # If metrics are decreasing, cv stopped and moving to next fold
308            end_fold = stop_early(metric_array, alpha_idx=i, fold_idx=fold)
309            if end_fold and early_stopping:
310                break
311
312        del fold_adata
313        gc.collect()
314
315    del cv_adata
316    gc.collect()
317
318    # Need highest performing alpha for given metric
319    alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))]
320
321    return alpha_star

Iteratively train a grouplasso model and update alpha to find the parameter yielding best performing sparsity via k-fold cross validation. This function currently only works for binary experiments. Called by scmkl.optimize_alpha.

Parameters
  • adata (list[ad.AnnData]): List of ad.AnnData(s) with 'Z_train' and 'Z_test' in adata.uns.keys().
  • group_size (None | int): Argument describing how the features are grouped. If None, 2 * adata.uns['D'] will be used. For more information see celer documentation.
  • tfidf (list | bool): If False, no data will be TF-IDF transformed. If type(adata) is list and TF-IDF transformation is desired for all or some of the data, a bool list corresponding to adata must be provided. To simply TF-IDF transform adata when type(adata) is ad.AnnData, use True.
  • alpha_array (np.ndarray): Array of all alpha values to be tested.
  • k (int): Number of folds to perform cross validation over.
  • metric (str): Which metric to use to optimize alpha. Options are 'AUROC', 'Accuracy', 'F1-Score', 'Precision', and 'Recall'.
  • batches (int): The number of batches to use for the distance calculation. This will average the result of batches distance calculations of batch_size randomly sampled cells. More batches will converge to population distance values at the cost of scalability.
  • batch_size (int): The number of cells to include per batch for distance calculations. Higher batch size will converge to population distance values at the cost of scalability. If batches*batch_size > num_training_cells, batch_size will be reduced to int(num_training_cells/batches).
Returns
  • alpha_star (float): The best performing alpha value from cross validation on training data.
Examples
>>> alpha_star = scmkl.optimize_alpha(adata)
>>> alpha_star
0.1
def multi_optimize_alpha( adata: list[anndata._core.anndata.AnnData], group_size: int, tfidf: list[bool] = [False], alpha_array: numpy.ndarray = array([1.9 , 1.69, 1.49, 1.28, 1.08, 0.87, 0.67, 0.46, 0.26, 0.05]), k: int = 4, metric: str = 'AUROC', early_stopping: bool = False, batches: int = 10, batch_size: int = 100, force_balance: bool = True, combination: str = 'concatenate', train_dict: dict = None):
324def multi_optimize_alpha(adata: list[ad.AnnData], group_size: int, 
325                         tfidf: list[bool]=[False], 
326                         alpha_array: np.ndarray=default_alphas, k: int=4, 
327                         metric: str='AUROC', early_stopping: bool=False,
328                         batches: int=10, batch_size: int=100, 
329                         force_balance: bool=True, combination: str='concatenate',
330                         train_dict: dict=None):
331    """
332    Wrapper function for running k-fold cross validation for every 
333    label in a multiclass experiment. Called by 
334    `scmkl.optimize_alpha()`. 
335
336    Parameters
337    ----------
338    adata : list[ad.AnnData]
339        List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 
340        `adata.uns.keys()`.
341
342    group_size : None | int
343        Argument describing how the features are grouped. If `None`, 
344        `2 * adata.uns['D']` will be used. For more information see 
345        [celer documentation](https://mathurinm.github.io/celer/
346        generated/celer.GroupLasso.html).
347
348    tfidf : list[bool]
349        If `False`, no data will be TF-IDF transformed. If 
350        `type(adata) is list` and TF-IDF transformation is desired for 
351        all or some of the data, a bool list corresponding to `adata` 
352        must be provided. To simply TF-IDF transform `adata` when 
353        `type(adata) is ad.AnnData`, use `True`.
354    
355    alpha_array : np.ndarray
356        Array of all alpha values to be tested.
357
358    k : int
359        Number of folds to perform cross validation over.
360            
361    metric : str
362        Which metric to use to optimize alpha. Options are `'AUROC'`, 
363        `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`.
364
365    batches : int
366        The number of batches to use for the distance calculation.
367        This will average the result of `batches` distance calculations
368        of `batch_size` randomly sampled cells. More batches will converge
369        to population distance values at the cost of scalability.
370
371    batch_size : int
372        The number of cells to include per batch for distance
373        calculations. Higher batch size will converge to population
374        distance values at the cost of scalability. If 
375        `batches*batch_size > num_training_cells`, `batch_size` will be 
376        reduced to `int(num_training_cells/batches)`.
377
378    force_balance: bool
379        If `True`, training sets will be balanced to reduce class label 
380        imbalance for each iteration. Defaults to `False`.
381
382    other_factor : float
383        The ratio of cells to sample for the other class for each 
384        model. For example, if classifying B cells with 100 B cells in 
385        training, if `other_factor=1`, 100 cells that are not B cells 
386        will be trained on with the B cells. This will be done for each 
387        fold for each class if `force_balance` is `True`. 
388
389    combination: str
390        How should multiple views of data be combined. For more details 
391        see ad.concat.
392
393    train_dict: dict
394        A `dict` where each key is a class label and values are are the 
395        indices to be trained with for that class for class balance. 
396        All values must be present in each adata.uns['train_indices'].
397
398    Returns
399    -------
400    alpha_star : dict
401        A dictionary with keys being class labels and values being the 
402        best performing alpha parameter for that class as a float.
403    """
404    classes = np.unique(adata[0].obs['labels'])
405    orig_labels = adata[0].obs['labels'].to_numpy().copy()
406    orig_train = adata[0].uns['train_indices'].copy()
407
408    if train_dict:
409        train_idcs = train_dict
410    else:
411        if force_balance:
412            train_idcs = get_class_train(adata[0].uns['train_indices'], 
413                                         adata[0].obs['labels'], 
414                                         adata[0].uns['seed_obj'])
415        else:
416            train_idcs = {ct: adata[0].uns['train_indices'].copy()
417                          for ct in classes}
418
419    opt_alpha_dict = dict()
420
421    for cl in classes:
422        temp_classes = orig_labels.copy()
423        temp_classes[temp_classes != cl] = 'other'
424
425        for i in range(len(adata)):
426            adata[i].obs['labels'] = temp_classes.copy()
427            adata[i].uns['train_indices'] = train_idcs[cl]
428        
429        opt_alpha_dict[cl] = bin_optimize_alpha(adata, group_size, tfidf, 
430                                                alpha_array, k, metric, 
431                                                early_stopping, batches, 
432                                                batch_size, combination)     
433    
434    # Global adata obj will be permanently changed if not reset
435    for i in range(len(adata)):
436        adata[i].obs['labels'] = orig_labels
437        adata[i].uns['train_indices'] = orig_train
438
439    return opt_alpha_dict

Wrapper function for running k-fold cross validation for every label in a multiclass experiment. Called by scmkl.optimize_alpha.

Parameters
  • adata (list[ad.AnnData]): List of ad.AnnData(s) with 'Z_train' and 'Z_test' in adata.uns.keys().
  • group_size (None | int): Argument describing how the features are grouped. If None, 2 * adata.uns['D'] will be used. For more information see celer documentation.
  • tfidf (list[bool]): If False, no data will be TF-IDF transformed. If type(adata) is list and TF-IDF transformation is desired for all or some of the data, a bool list corresponding to adata must be provided. To simply TF-IDF transform adata when type(adata) is ad.AnnData, use True.
  • alpha_array (np.ndarray): Array of all alpha values to be tested.
  • k (int): Number of folds to perform cross validation over.
  • metric (str): Which metric to use to optimize alpha. Options are 'AUROC', 'Accuracy', 'F1-Score', 'Precision', and 'Recall'.
  • batches (int): The number of batches to use for the distance calculation. This will average the result of batches distance calculations of batch_size randomly sampled cells. More batches will converge to population distance values at the cost of scalability.
  • batch_size (int): The number of cells to include per batch for distance calculations. Higher batch size will converge to population distance values at the cost of scalability. If batches*batch_size > num_training_cells, batch_size will be reduced to int(num_training_cells/batches).
  • force_balance (bool): If True, training sets will be balanced to reduce class label imbalance for each iteration. Defaults to False.
  • other_factor (float): The ratio of cells to sample for the other class for each model. For example, if classifying B cells with 100 B cells in training, if other_factor=1, 100 cells that are not B cells will be trained on with the B cells. This will be done for each fold for each class if force_balance is True.
  • combination (str): How should multiple views of data be combined. For more details see ad.concat.
  • train_dict (dict): A dict where each key is a class label and values are are the indices to be trained with for that class for class balance. All values must be present in each adata.uns['train_indices'].
Returns
  • alpha_star (dict): A dictionary with keys being class labels and values being the best performing alpha parameter for that class as a float.
def optimize_alpha( adata: anndata._core.anndata.AnnData | list[anndata._core.anndata.AnnData], group_size: int | None = None, tfidf: None | list[bool] = None, alpha_array: numpy.ndarray = array([1.9 , 1.69, 1.49, 1.28, 1.08, 0.87, 0.67, 0.46, 0.26, 0.05]), k: int = 4, metric: str = 'AUROC', early_stopping: bool = False, batches: int = 10, batch_size: int = 100, combination: str = 'concatenate', force_balance: bool = True, train_dict: dict = None):
442def optimize_alpha(adata: ad.AnnData | list[ad.AnnData], 
443                   group_size: int | None=None, tfidf: None | list[bool]=None, 
444                   alpha_array: np.ndarray=default_alphas, k: int=4, 
445                   metric: str='AUROC', early_stopping: bool=False,
446                   batches: int=10, batch_size: int=100, 
447                   combination: str='concatenate', force_balance: bool=True,
448                   train_dict: dict=None):
449    """
450    K-fold cross validation for optimizing alpha hyperparameter using 
451    training indices. 
452
453    Parameters
454    ----------
455    adata : list[ad.AnnData]
456        List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 
457        `adata.uns.keys()`.
458
459    group_size : None | int
460        Argument describing how the features are grouped. If `None`, 
461        `2 * adata.uns['D']` will be used. For more information see 
462        [celer documentation](https://mathurinm.github.io/celer/
463        generated/celer.GroupLasso.html).
464
465    tfidf : list[bool]
466        If `False`, no data will be TF-IDF transformed. If 
467        `type(adata) is list` and TF-IDF transformation is desired for 
468        all or some of the data, a bool list corresponding to `adata` 
469        must be provided. To simply TF-IDF transform `adata` when 
470        `type(adata) is ad.AnnData`, use `True`.
471    
472    alpha_array : np.ndarray
473        Array of all alpha values to be tested.
474
475    k : int
476        Number of folds to perform cross validation over.
477            
478    metric : str
479        Which metric to use to optimize alpha. Options are `'AUROC'`, 
480        `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`.
481
482    batches : int
483        The number of batches to use for the distance calculation.
484        This will average the result of `batches` distance calculations
485        of `batch_size` randomly sampled cells. More batches will converge
486        to population distance values at the cost of scalability.
487
488    batch_size : int
489        The number of cells to include per batch for distance
490        calculations. Higher batch size will converge to population
491        distance values at the cost of scalability. If 
492        `batches*batch_size > num_training_cells`, `batch_size` will be 
493        reduced to `int(num_training_cells/batches)`.
494
495    force_balance: bool
496        If `True`, training sets will be balanced to reduce class label 
497        imbalance for each iteration. Defaults to `False`.
498
499    other_factor : float
500        The ratio of cells to sample for the other class for each 
501        model. For example, if classifying B cells with 100 B cells in 
502        training, if `other_factor=1`, 100 cells that are not B cells 
503        will be trained on with the B cells. This will be done for each 
504        fold for each class if `force_balance` is `True`. 
505
506    combination: str
507        How should multiple views of data be combined. For more details 
508        see ad.concat.
509
510    train_dict: dict
511        A `dict` where each key is a class label and values are are the 
512        indices to be trained with for that class for class balance. 
513        All values must be present in each adata.uns['train_indices'].
514
515    Returns
516    -------
517    alpha_star : float | dict
518        If number of classes is more than 2, a dictionary with keys 
519        being class labels and values being the best performing alpha 
520        parameter for that class as a float. Else, a float for 
521        comparing the two classes.
522    """
523    # Need singe-view runs to be iterable
524    if isinstance(adata, ad.AnnData):
525        adata = [adata.copy()]
526
527    if isinstance(tfidf, type(None)):
528        tfidf = len(adata)*[False]
529
530    is_multi = 2 < len(set(adata[0].obs['labels']))
531    
532    if isinstance(group_size, type(None)):
533        group_size = 2*adata[0].uns['D']
534
535
536    if is_multi:
537        alpha_star = multi_optimize_alpha(adata, group_size, tfidf, 
538                                          alpha_array, k, metric, 
539                                          early_stopping, batches, 
540                                          batch_size, force_balance, 
541                                          combination, train_dict)
542        
543    else:
544        alpha_star = bin_optimize_alpha(adata, group_size, tfidf, 
545                                        alpha_array, k, metric, 
546                                        early_stopping, batches, 
547                                        batch_size, combination)
548        
549    gc.collect()
550
551    return alpha_star

K-fold cross validation for optimizing alpha hyperparameter using training indices.

Parameters
  • adata (list[ad.AnnData]): List of ad.AnnData(s) with 'Z_train' and 'Z_test' in adata.uns.keys().
  • group_size (None | int): Argument describing how the features are grouped. If None, 2 * adata.uns['D'] will be used. For more information see celer documentation.
  • tfidf (list[bool]): If False, no data will be TF-IDF transformed. If type(adata) is list and TF-IDF transformation is desired for all or some of the data, a bool list corresponding to adata must be provided. To simply TF-IDF transform adata when type(adata) is ad.AnnData, use True.
  • alpha_array (np.ndarray): Array of all alpha values to be tested.
  • k (int): Number of folds to perform cross validation over.
  • metric (str): Which metric to use to optimize alpha. Options are 'AUROC', 'Accuracy', 'F1-Score', 'Precision', and 'Recall'.
  • batches (int): The number of batches to use for the distance calculation. This will average the result of batches distance calculations of batch_size randomly sampled cells. More batches will converge to population distance values at the cost of scalability.
  • batch_size (int): The number of cells to include per batch for distance calculations. Higher batch size will converge to population distance values at the cost of scalability. If batches*batch_size > num_training_cells, batch_size will be reduced to int(num_training_cells/batches).
  • force_balance (bool): If True, training sets will be balanced to reduce class label imbalance for each iteration. Defaults to False.
  • other_factor (float): The ratio of cells to sample for the other class for each model. For example, if classifying B cells with 100 B cells in training, if other_factor=1, 100 cells that are not B cells will be trained on with the B cells. This will be done for each fold for each class if force_balance is True.
  • combination (str): How should multiple views of data be combined. For more details see ad.concat.
  • train_dict (dict): A dict where each key is a class label and values are are the indices to be trained with for that class for class balance. All values must be present in each adata.uns['train_indices'].
Returns
  • alpha_star (float | dict): If number of classes is more than 2, a dictionary with keys being class labels and values being the best performing alpha parameter for that class as a float. Else, a float for comparing the two classes.