scmkl.optimize_alpha

  1import numpy as np
  2import anndata as ad
  3import gc
  4import tracemalloc
  5
  6from scmkl.tfidf_normalize import tfidf_normalize
  7from scmkl.estimate_sigma import estimate_sigma
  8from scmkl.calculate_z import calculate_z
  9from scmkl.train_model import train_model
 10from scmkl.multimodal_processing import multimodal_processing
 11from scmkl.test import predict
 12
 13
 14def multimodal_optimize_alpha(adatas: list[ad.AnnData], group_size: int, 
 15                              tfidf_list: list | bool=False,
 16                              alpha_array: np.ndarray=np.round(np.linspace(1.9,0.1, 10),2), 
 17                              k: int=4, metric: str='AUROC',
 18                              batches: int=10, batch_size: int=100):
 19    """
 20    Iteratively train a grouplasso model and update alpha to find the 
 21    parameter yielding the desired sparsity. Meant to find a good 
 22    starting point for your model, and the alpha may need further fine 
 23    tuning.
 24    
 25    Parameters
 26    ----------
 27    adatas : list[ad.AnnData]
 28        Objects of type `ad.AnnData` where each object is one modality 
 29        and Z_train and Z_test are calculated
 30
 31    group_size : None | int
 32        Argument describing how the features are grouped. If `None`, 
 33        `2 * adata.uns['D']` will be used. For more information see 
 34        [celer documentation](https://mathurinm.github.io/celer/
 35        generated/celer.GroupLasso.html).
 36
 37    tfidf_list : list | None
 38        A boolean mask where `tfidf_list[i]` is respective to 
 39        `adatas[i]`. If `True`, TF-IDF normalization will be applied to 
 40        the respective `ad.AnnData` during cross validation
 41    
 42    alpha_array : np.ndarray
 43        All alpha values to be tested.
 44
 45    k : int
 46        Number of folds to perform cross validation over.
 47
 48    metric : str
 49        Which metric to use to optimize alpha. Options are `'AUROC'`, 
 50        `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`.
 51
 52    batches : int
 53        The number of batches to use for the distance calculation.
 54        This will average the result of `batches` distance calculations
 55        of `batch_size` randomly sampled cells. More batches will converge
 56        to population distance values at the cost of scalability.
 57
 58    batch_size : int
 59        The number of cells to include per batch for distance
 60        calculations. Higher batch size will converge to population
 61        distance values at the cost of scalability. If 
 62        `batches*batch_size > num_training_cells`, `batch_size` will be 
 63        reduced to `int(num_training_cells/batches)`.
 64            
 65    Returns
 66    -------
 67    alpha_star : float
 68        The alpha value yielding the best performing model from cross 
 69        validation.
 70    """
 71    assert isinstance(k, int) and k > 0, 'Must be a positive integer number of folds'
 72
 73    import warnings 
 74    warnings.filterwarnings('ignore')
 75
 76    if not tfidf_list:
 77        tfidf_list = [False]*len(adatas)
 78
 79    y = adatas[0].obs['labels'].iloc[adatas[0].uns['train_indices']].to_numpy()
 80    
 81    # Splits the labels evenly between folds
 82    positive_indices = np.where(y == np.unique(y)[0])[0]
 83    negative_indices = np.setdiff1d(np.arange(len(y)), positive_indices)
 84
 85    positive_annotations = np.arange(len(positive_indices)) % k
 86    negative_annotations = np.arange(len(negative_indices)) % k
 87
 88    metric_array = np.zeros((len(alpha_array), k))
 89
 90    cv_adatas = []
 91
 92    for adata in adatas:
 93        cv_adatas.append(adata[adata.uns['train_indices'],:].copy())
 94
 95    del adatas
 96    gc.collect()
 97
 98    for fold in np.arange(k):
 99
100        fold_train = np.concatenate((positive_indices[np.where(positive_annotations != fold)[0]], 
101                                     negative_indices[np.where(negative_annotations != fold)[0]]))
102        fold_test = np.concatenate((positive_indices[np.where(positive_annotations == fold)[0]], 
103                                    negative_indices[np.where(negative_annotations == fold)[0]]))
104
105        for i in range(len(cv_adatas)):
106            cv_adatas[i].uns['train_indices'] = fold_train
107            cv_adatas[i].uns['test_indices'] = fold_test
108
109        # Creating dummy names for cv. 
110        # Necessary for interpretability but not for AUROC cv
111        dummy_names = [f'adata {i}' for i in range(len(cv_adatas))]
112
113        # Calculate the Z's for each modality independently
114        fold_cv_adata = multimodal_processing(adatas = cv_adatas, 
115                                              names = dummy_names, 
116                                              tfidf = tfidf_list, 
117                                              batch_size= batch_size, 
118                                              batches = batches)
119
120        fold_cv_adata.uns['seed_obj'] = cv_adatas[0].uns['seed_obj']
121
122        if 'sigma' in fold_cv_adata.uns_keys():
123            del fold_cv_adata.uns['sigma']
124
125        # In train_model we index Z_train for balancing multiclass labels. We just recreate
126        # dummy indices here that are unused for use in the binary case
127        fold_cv_adata.uns['train_indices'] = np.arange(0, len(fold_train))
128
129        gc.collect()
130
131        for j, alpha in enumerate(alpha_array):
132
133            fold_cv_adata = train_model(fold_cv_adata, group_size, alpha = alpha)
134
135            _, metrics = predict(fold_cv_adata, metrics = [metric])
136            metric_array[j, fold] = metrics[metric]
137
138        del fold_cv_adata
139        gc.collect()
140
141    # Take AUROC mean across the k folds and select the alpha resulting in highest AUROC
142    alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))]
143    del cv_adatas
144    gc.collect()
145    
146    return alpha_star
147
148
149def optimize_alpha(adata: ad.AnnData | list[ad.AnnData], 
150                   group_size: int | None=None, 
151                   tfidf: bool | list[bool]=False, 
152                   alpha_array: np.ndarray=np.round(np.linspace(1.9,0.1, 10),2), 
153                   k: int=4, metric: str='AUROC', 
154                   batches: int=10, batch_size: int=100):
155    """
156    Iteratively train a grouplasso model and update alpha to find the 
157    parameter yielding best performing sparsity. This function 
158    currently only works for binary experiments.
159
160    Parameters
161    ----------
162    adata : ad.AnnData | list[ad.AnnData]
163        `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 
164        `adata.uns.keys()`.
165
166    group_size : None | int
167        Argument describing how the features are grouped. If `None`, 
168        `2 * adata.uns['D']` will be used. For more information see 
169        [celer documentation](https://mathurinm.github.io/celer/
170        generated/celer.GroupLasso.html).
171
172    tfidf : list | bool
173        If `False`, no data will be TF-IDF transformed. If 
174        `type(adata) is list` and TF-IDF transformation is desired for 
175        all or some of the data, a bool list corresponding to `adata` 
176        must be provided. To simply TF-IDF transform `adata` when 
177        `type(adata) is ad.AnnData`, use `True`.
178    
179    alpha_array : np.ndarray
180        Array of all alpha values to be tested.
181
182    k : int
183        Number of folds to perform cross validation over.
184            
185    metric : str
186        Which metric to use to optimize alpha. Options are `'AUROC'`, 
187        `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`.
188
189    batches : int
190        The number of batches to use for the distance calculation.
191        This will average the result of `batches` distance calculations
192        of `batch_size` randomly sampled cells. More batches will converge
193        to population distance values at the cost of scalability.
194
195    batch_size : int
196        The number of cells to include per batch for distance
197        calculations. Higher batch size will converge to population
198        distance values at the cost of scalability. If 
199        `batches*batch_size > num_training_cells`, `batch_size` will be 
200        reduced to `int(num_training_cells/batches)`.
201
202    Returns
203    -------
204    alpha_star : float
205        The best performing alpha value from cross validation on 
206        training data.
207
208    Examples
209    --------
210    >>> alpha_star = scmkl.optimize_alpha(adata)
211    >>> alpha_star
212    0.1
213    """    
214    assert isinstance(k, int) and k > 0, "'k' must be positive"
215
216    import warnings 
217    warnings.filterwarnings('ignore')
218
219    if group_size == None:
220        group_size = adata.uns['D']*2
221
222    if type(adata) == list:
223        alpha_star = multimodal_optimize_alpha(adatas = adata, 
224                                               group_size = group_size,
225                                               tfidf_list = tfidf, 
226                                               alpha_array = alpha_array, 
227                                               metric = metric,
228                                               batch_size = batch_size,
229                                               batches = batches)
230        return alpha_star
231
232    y = adata.obs['labels'].iloc[adata.uns['train_indices']].to_numpy()
233    
234    # Splits the labels evenly between folds
235    positive_indices = np.where(y == np.unique(y)[0])[0]
236    negative_indices = np.setdiff1d(np.arange(len(y)), positive_indices)
237
238    positive_annotations = np.arange(len(positive_indices)) % k
239    negative_annotations = np.arange(len(negative_indices)) % k
240
241    metric_array = np.zeros((len(alpha_array), k))
242
243    gc.collect()
244
245    for fold in np.arange(k):
246
247        cv_adata = adata[adata.uns['train_indices'],:]
248
249        if 'sigma' in cv_adata.uns_keys():
250            del cv_adata.uns['sigma']
251
252        # Create CV train/test indices
253        fold_train = np.concatenate((positive_indices[np.where(positive_annotations != fold)[0]], 
254                                     negative_indices[np.where(negative_annotations != fold)[0]]))
255        fold_test = np.concatenate((positive_indices[np.where(positive_annotations == fold)[0]], 
256                                    negative_indices[np.where(negative_annotations == fold)[0]]))
257
258        cv_adata.uns['train_indices'] = fold_train
259        cv_adata.uns['test_indices'] = fold_test
260
261        if tfidf:
262            cv_adata = tfidf_normalize(cv_adata, binarize= True)
263
264        # Estimating kernel widths and calculating Zs
265        cv_adata = calculate_z(cv_adata, n_features= 5000, 
266                               batches = batches, batch_size = batch_size)
267
268        # In train_model we index Z_train for balancing multiclass labels. We just recreate
269        # dummy indices here that are unused for use in the binary case
270        cv_adata.uns['train_indices'] = np.arange(0, len(fold_train))
271
272        gc.collect()
273
274        for i, alpha in enumerate(alpha_array):
275
276            cv_adata = train_model(cv_adata, group_size, alpha = alpha)
277            _, metrics = predict(cv_adata, metrics = [metric])
278            metric_array[i, fold] = metrics[metric]
279
280            gc.collect()
281
282        del cv_adata
283        gc.collect()
284        
285    # Take AUROC mean across the k folds to find alpha yielding highest AUROC
286    alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))]
287    gc.collect()
288    
289
290    return alpha_star
def multimodal_optimize_alpha( adatas: list[anndata._core.anndata.AnnData], group_size: int, tfidf_list: list | bool = False, alpha_array: numpy.ndarray = array([1.9, 1.7, 1.5, 1.3, 1.1, 0.9, 0.7, 0.5, 0.3, 0.1]), k: int = 4, metric: str = 'AUROC', batches: int = 10, batch_size: int = 100):
 15def multimodal_optimize_alpha(adatas: list[ad.AnnData], group_size: int, 
 16                              tfidf_list: list | bool=False,
 17                              alpha_array: np.ndarray=np.round(np.linspace(1.9,0.1, 10),2), 
 18                              k: int=4, metric: str='AUROC',
 19                              batches: int=10, batch_size: int=100):
 20    """
 21    Iteratively train a grouplasso model and update alpha to find the 
 22    parameter yielding the desired sparsity. Meant to find a good 
 23    starting point for your model, and the alpha may need further fine 
 24    tuning.
 25    
 26    Parameters
 27    ----------
 28    adatas : list[ad.AnnData]
 29        Objects of type `ad.AnnData` where each object is one modality 
 30        and Z_train and Z_test are calculated
 31
 32    group_size : None | int
 33        Argument describing how the features are grouped. If `None`, 
 34        `2 * adata.uns['D']` will be used. For more information see 
 35        [celer documentation](https://mathurinm.github.io/celer/
 36        generated/celer.GroupLasso.html).
 37
 38    tfidf_list : list | None
 39        A boolean mask where `tfidf_list[i]` is respective to 
 40        `adatas[i]`. If `True`, TF-IDF normalization will be applied to 
 41        the respective `ad.AnnData` during cross validation
 42    
 43    alpha_array : np.ndarray
 44        All alpha values to be tested.
 45
 46    k : int
 47        Number of folds to perform cross validation over.
 48
 49    metric : str
 50        Which metric to use to optimize alpha. Options are `'AUROC'`, 
 51        `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`.
 52
 53    batches : int
 54        The number of batches to use for the distance calculation.
 55        This will average the result of `batches` distance calculations
 56        of `batch_size` randomly sampled cells. More batches will converge
 57        to population distance values at the cost of scalability.
 58
 59    batch_size : int
 60        The number of cells to include per batch for distance
 61        calculations. Higher batch size will converge to population
 62        distance values at the cost of scalability. If 
 63        `batches*batch_size > num_training_cells`, `batch_size` will be 
 64        reduced to `int(num_training_cells/batches)`.
 65            
 66    Returns
 67    -------
 68    alpha_star : float
 69        The alpha value yielding the best performing model from cross 
 70        validation.
 71    """
 72    assert isinstance(k, int) and k > 0, 'Must be a positive integer number of folds'
 73
 74    import warnings 
 75    warnings.filterwarnings('ignore')
 76
 77    if not tfidf_list:
 78        tfidf_list = [False]*len(adatas)
 79
 80    y = adatas[0].obs['labels'].iloc[adatas[0].uns['train_indices']].to_numpy()
 81    
 82    # Splits the labels evenly between folds
 83    positive_indices = np.where(y == np.unique(y)[0])[0]
 84    negative_indices = np.setdiff1d(np.arange(len(y)), positive_indices)
 85
 86    positive_annotations = np.arange(len(positive_indices)) % k
 87    negative_annotations = np.arange(len(negative_indices)) % k
 88
 89    metric_array = np.zeros((len(alpha_array), k))
 90
 91    cv_adatas = []
 92
 93    for adata in adatas:
 94        cv_adatas.append(adata[adata.uns['train_indices'],:].copy())
 95
 96    del adatas
 97    gc.collect()
 98
 99    for fold in np.arange(k):
100
101        fold_train = np.concatenate((positive_indices[np.where(positive_annotations != fold)[0]], 
102                                     negative_indices[np.where(negative_annotations != fold)[0]]))
103        fold_test = np.concatenate((positive_indices[np.where(positive_annotations == fold)[0]], 
104                                    negative_indices[np.where(negative_annotations == fold)[0]]))
105
106        for i in range(len(cv_adatas)):
107            cv_adatas[i].uns['train_indices'] = fold_train
108            cv_adatas[i].uns['test_indices'] = fold_test
109
110        # Creating dummy names for cv. 
111        # Necessary for interpretability but not for AUROC cv
112        dummy_names = [f'adata {i}' for i in range(len(cv_adatas))]
113
114        # Calculate the Z's for each modality independently
115        fold_cv_adata = multimodal_processing(adatas = cv_adatas, 
116                                              names = dummy_names, 
117                                              tfidf = tfidf_list, 
118                                              batch_size= batch_size, 
119                                              batches = batches)
120
121        fold_cv_adata.uns['seed_obj'] = cv_adatas[0].uns['seed_obj']
122
123        if 'sigma' in fold_cv_adata.uns_keys():
124            del fold_cv_adata.uns['sigma']
125
126        # In train_model we index Z_train for balancing multiclass labels. We just recreate
127        # dummy indices here that are unused for use in the binary case
128        fold_cv_adata.uns['train_indices'] = np.arange(0, len(fold_train))
129
130        gc.collect()
131
132        for j, alpha in enumerate(alpha_array):
133
134            fold_cv_adata = train_model(fold_cv_adata, group_size, alpha = alpha)
135
136            _, metrics = predict(fold_cv_adata, metrics = [metric])
137            metric_array[j, fold] = metrics[metric]
138
139        del fold_cv_adata
140        gc.collect()
141
142    # Take AUROC mean across the k folds and select the alpha resulting in highest AUROC
143    alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))]
144    del cv_adatas
145    gc.collect()
146    
147    return alpha_star

Iteratively train a grouplasso model and update alpha to find the parameter yielding the desired sparsity. Meant to find a good starting point for your model, and the alpha may need further fine tuning.

Parameters
  • adatas (list[ad.AnnData]): Objects of type ad.AnnData where each object is one modality and Z_train and Z_test are calculated
  • group_size (None | int): Argument describing how the features are grouped. If None, 2 * adata.uns['D'] will be used. For more information see celer documentation.
  • tfidf_list (list | None): A boolean mask where tfidf_list[i] is respective to adatas[i]. If True, TF-IDF normalization will be applied to the respective ad.AnnData during cross validation
  • alpha_array (np.ndarray): All alpha values to be tested.
  • k (int): Number of folds to perform cross validation over.
  • metric (str): Which metric to use to optimize alpha. Options are 'AUROC', 'Accuracy', 'F1-Score', 'Precision', and 'Recall'.
  • batches (int): The number of batches to use for the distance calculation. This will average the result of batches distance calculations of batch_size randomly sampled cells. More batches will converge to population distance values at the cost of scalability.
  • batch_size (int): The number of cells to include per batch for distance calculations. Higher batch size will converge to population distance values at the cost of scalability. If batches*batch_size > num_training_cells, batch_size will be reduced to int(num_training_cells/batches).
Returns
  • alpha_star (float): The alpha value yielding the best performing model from cross validation.
def optimize_alpha( adata: anndata._core.anndata.AnnData | list[anndata._core.anndata.AnnData], group_size: int | None = None, tfidf: bool | list[bool] = False, alpha_array: numpy.ndarray = array([1.9, 1.7, 1.5, 1.3, 1.1, 0.9, 0.7, 0.5, 0.3, 0.1]), k: int = 4, metric: str = 'AUROC', batches: int = 10, batch_size: int = 100):
150def optimize_alpha(adata: ad.AnnData | list[ad.AnnData], 
151                   group_size: int | None=None, 
152                   tfidf: bool | list[bool]=False, 
153                   alpha_array: np.ndarray=np.round(np.linspace(1.9,0.1, 10),2), 
154                   k: int=4, metric: str='AUROC', 
155                   batches: int=10, batch_size: int=100):
156    """
157    Iteratively train a grouplasso model and update alpha to find the 
158    parameter yielding best performing sparsity. This function 
159    currently only works for binary experiments.
160
161    Parameters
162    ----------
163    adata : ad.AnnData | list[ad.AnnData]
164        `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 
165        `adata.uns.keys()`.
166
167    group_size : None | int
168        Argument describing how the features are grouped. If `None`, 
169        `2 * adata.uns['D']` will be used. For more information see 
170        [celer documentation](https://mathurinm.github.io/celer/
171        generated/celer.GroupLasso.html).
172
173    tfidf : list | bool
174        If `False`, no data will be TF-IDF transformed. If 
175        `type(adata) is list` and TF-IDF transformation is desired for 
176        all or some of the data, a bool list corresponding to `adata` 
177        must be provided. To simply TF-IDF transform `adata` when 
178        `type(adata) is ad.AnnData`, use `True`.
179    
180    alpha_array : np.ndarray
181        Array of all alpha values to be tested.
182
183    k : int
184        Number of folds to perform cross validation over.
185            
186    metric : str
187        Which metric to use to optimize alpha. Options are `'AUROC'`, 
188        `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`.
189
190    batches : int
191        The number of batches to use for the distance calculation.
192        This will average the result of `batches` distance calculations
193        of `batch_size` randomly sampled cells. More batches will converge
194        to population distance values at the cost of scalability.
195
196    batch_size : int
197        The number of cells to include per batch for distance
198        calculations. Higher batch size will converge to population
199        distance values at the cost of scalability. If 
200        `batches*batch_size > num_training_cells`, `batch_size` will be 
201        reduced to `int(num_training_cells/batches)`.
202
203    Returns
204    -------
205    alpha_star : float
206        The best performing alpha value from cross validation on 
207        training data.
208
209    Examples
210    --------
211    >>> alpha_star = scmkl.optimize_alpha(adata)
212    >>> alpha_star
213    0.1
214    """    
215    assert isinstance(k, int) and k > 0, "'k' must be positive"
216
217    import warnings 
218    warnings.filterwarnings('ignore')
219
220    if group_size == None:
221        group_size = adata.uns['D']*2
222
223    if type(adata) == list:
224        alpha_star = multimodal_optimize_alpha(adatas = adata, 
225                                               group_size = group_size,
226                                               tfidf_list = tfidf, 
227                                               alpha_array = alpha_array, 
228                                               metric = metric,
229                                               batch_size = batch_size,
230                                               batches = batches)
231        return alpha_star
232
233    y = adata.obs['labels'].iloc[adata.uns['train_indices']].to_numpy()
234    
235    # Splits the labels evenly between folds
236    positive_indices = np.where(y == np.unique(y)[0])[0]
237    negative_indices = np.setdiff1d(np.arange(len(y)), positive_indices)
238
239    positive_annotations = np.arange(len(positive_indices)) % k
240    negative_annotations = np.arange(len(negative_indices)) % k
241
242    metric_array = np.zeros((len(alpha_array), k))
243
244    gc.collect()
245
246    for fold in np.arange(k):
247
248        cv_adata = adata[adata.uns['train_indices'],:]
249
250        if 'sigma' in cv_adata.uns_keys():
251            del cv_adata.uns['sigma']
252
253        # Create CV train/test indices
254        fold_train = np.concatenate((positive_indices[np.where(positive_annotations != fold)[0]], 
255                                     negative_indices[np.where(negative_annotations != fold)[0]]))
256        fold_test = np.concatenate((positive_indices[np.where(positive_annotations == fold)[0]], 
257                                    negative_indices[np.where(negative_annotations == fold)[0]]))
258
259        cv_adata.uns['train_indices'] = fold_train
260        cv_adata.uns['test_indices'] = fold_test
261
262        if tfidf:
263            cv_adata = tfidf_normalize(cv_adata, binarize= True)
264
265        # Estimating kernel widths and calculating Zs
266        cv_adata = calculate_z(cv_adata, n_features= 5000, 
267                               batches = batches, batch_size = batch_size)
268
269        # In train_model we index Z_train for balancing multiclass labels. We just recreate
270        # dummy indices here that are unused for use in the binary case
271        cv_adata.uns['train_indices'] = np.arange(0, len(fold_train))
272
273        gc.collect()
274
275        for i, alpha in enumerate(alpha_array):
276
277            cv_adata = train_model(cv_adata, group_size, alpha = alpha)
278            _, metrics = predict(cv_adata, metrics = [metric])
279            metric_array[i, fold] = metrics[metric]
280
281            gc.collect()
282
283        del cv_adata
284        gc.collect()
285        
286    # Take AUROC mean across the k folds to find alpha yielding highest AUROC
287    alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))]
288    gc.collect()
289    
290
291    return alpha_star

Iteratively train a grouplasso model and update alpha to find the parameter yielding best performing sparsity. This function currently only works for binary experiments.

Parameters
  • adata (ad.AnnData | list[ad.AnnData]): ad.AnnData(s) with 'Z_train' and 'Z_test' in adata.uns.keys().
  • group_size (None | int): Argument describing how the features are grouped. If None, 2 * adata.uns['D'] will be used. For more information see celer documentation.
  • tfidf (list | bool): If False, no data will be TF-IDF transformed. If type(adata) is list and TF-IDF transformation is desired for all or some of the data, a bool list corresponding to adata must be provided. To simply TF-IDF transform adata when type(adata) is ad.AnnData, use True.
  • alpha_array (np.ndarray): Array of all alpha values to be tested.
  • k (int): Number of folds to perform cross validation over.
  • metric (str): Which metric to use to optimize alpha. Options are 'AUROC', 'Accuracy', 'F1-Score', 'Precision', and 'Recall'.
  • batches (int): The number of batches to use for the distance calculation. This will average the result of batches distance calculations of batch_size randomly sampled cells. More batches will converge to population distance values at the cost of scalability.
  • batch_size (int): The number of cells to include per batch for distance calculations. Higher batch size will converge to population distance values at the cost of scalability. If batches*batch_size > num_training_cells, batch_size will be reduced to int(num_training_cells/batches).
Returns
  • alpha_star (float): The best performing alpha value from cross validation on training data.
Examples
>>> alpha_star = scmkl.optimize_alpha(adata)
>>> alpha_star
0.1