scmkl.optimize_alpha
1import numpy as np 2import anndata as ad 3import gc 4 5from scmkl.tfidf_normalize import tfidf_normalize 6from scmkl.calculate_z import calculate_z 7from scmkl.train_model import train_model 8from scmkl.multimodal_processing import multimodal_processing 9from scmkl.test import predict 10from scmkl.one_v_rest import get_class_train 11 12 13# Array of alphas to be used if not provided 14default_alphas = np.round(np.linspace(1.9, 0.05, 10),2) 15 16 17def stop_early(metric_array, alpha_idx, fold_idx): 18 """ 19 Assumes smallest alpha comes first. 20 """ 21 # Must be at least two metrics from two alphas to compare 22 if alpha_idx <= 0: 23 return False 24 25 cur_met = metric_array[alpha_idx, fold_idx] 26 last_met = metric_array[alpha_idx - 1, fold_idx] 27 28 if cur_met < last_met: 29 return True 30 else: 31 return False 32 33 34def sort_alphas(alpha_array: np.ndarray): 35 """ 36 Sorts alphas from smallest to largest. 37 """ 38 order = np.argsort(alpha_array) 39 alpha_array = alpha_array[order] 40 41 return alpha_array 42 43 44def get_labels(adata: list | ad.AnnData): 45 """ 46 Copies labels and train indices from `ad.AnnData` object for k-fold 47 cross validation. Mostly present for readability. 48 """ 49 train_indices = adata[0].uns['train_indices'].copy() 50 y = adata[0].obs['labels'].iloc[train_indices].to_numpy() 51 52 return train_indices, y 53 54 55def get_folds(y: np.array, k: int): 56 """ 57 With labels of training data and number of folds, returns the 58 indices and label for each k-folds. 59 """ 60 # Splits the labels evenly between folds 61 pos_idcs = np.where(y == np.unique(y)[0])[0] 62 neg_idcs = np.setdiff1d(np.arange(len(y)), pos_idcs) 63 64 pos_anno = np.arange(len(pos_idcs)) % k 65 neg_anno = np.arange(len(neg_idcs)) % k 66 67 return pos_idcs, neg_idcs, pos_anno, neg_anno 68 69 70def bin_optimize_alpha(adata: list[ad.AnnData], 71 group_size: int | None=None, 72 tfidf: list[bool]=False, 73 alpha_array: np.ndarray=default_alphas, 74 k: int=4, metric: str='AUROC', 75 early_stopping: bool=False, 76 batches: int=10, batch_size: int=100, 77 combination: str='concatenate'): 78 """ 79 Iteratively train a grouplasso model and update alpha to find the 80 parameter yielding best performing sparsity via k-fold cross 81 validation. This function currently only works for binary 82 experiments. Called by `scmkl.optimize_alpha()`. 83 84 Parameters 85 ---------- 86 adata : list[ad.AnnData] 87 List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 88 `adata.uns.keys()`. 89 90 group_size : None | int 91 Argument describing how the features are grouped. If `None`, 92 `2 * adata.uns['D']` will be used. For more information see 93 [celer documentation](https://mathurinm.github.io/celer/ 94 generated/celer.GroupLasso.html). 95 96 tfidf : list | bool 97 If `False`, no data will be TF-IDF transformed. If 98 `type(adata) is list` and TF-IDF transformation is desired for 99 all or some of the data, a bool list corresponding to `adata` 100 must be provided. To simply TF-IDF transform `adata` when 101 `type(adata) is ad.AnnData`, use `True`. 102 103 alpha_array : np.ndarray 104 Array of all alpha values to be tested. 105 106 k : int 107 Number of folds to perform cross validation over. 108 109 metric : str 110 Which metric to use to optimize alpha. Options are `'AUROC'`, 111 `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`. 112 113 batches : int 114 The number of batches to use for the distance calculation. 115 This will average the result of `batches` distance calculations 116 of `batch_size` randomly sampled cells. More batches will converge 117 to population distance values at the cost of scalability. 118 119 batch_size : int 120 The number of cells to include per batch for distance 121 calculations. Higher batch size will converge to population 122 distance values at the cost of scalability. If 123 `batches*batch_size > num_training_cells`, `batch_size` will be 124 reduced to `int(num_training_cells/batches)`. 125 126 Returns 127 ------- 128 alpha_star : float 129 The best performing alpha value from cross validation on 130 training data. 131 132 Examples 133 -------- 134 >>> alpha_star = scmkl.optimize_alpha(adata) 135 >>> alpha_star 136 0.1 137 """ 138 assert isinstance(k, int) and k > 0, "'k' must be positive" 139 140 import warnings 141 warnings.filterwarnings('ignore') 142 143 alpha_array = sort_alphas(alpha_array) 144 145 # Need even folds for cross validation 146 train_indices, y = get_labels(adata) 147 pos_idcs, neg_idcs, pos_anno, neg_anno = get_folds(y, k) 148 149 metric_array = np.zeros((len(alpha_array), k)) 150 151 gc.collect() 152 153 for fold in np.arange(k): 154 cv_adata = [adata[i][train_indices, :] 155 for i in range(len(adata))] 156 157 for i in range(len(cv_adata)): 158 if 'sigma' in cv_adata[i].uns_keys(): 159 del cv_adata[i].uns['sigma'] 160 161 # Create CV train/test indices 162 fold_train = np.concatenate((pos_idcs[np.where(pos_anno != fold)[0]], 163 neg_idcs[np.where(neg_anno != fold)[0]])) 164 fold_test = np.concatenate((pos_idcs[np.where(pos_anno == fold)[0]], 165 neg_idcs[np.where(neg_anno == fold)[0]])) 166 167 168 for i in range(len(cv_adata)): 169 cv_adata[i].uns['train_indices'] = fold_train 170 cv_adata[i].uns['test_indices'] = fold_test 171 if tfidf[i]: 172 cv_adata[i] = tfidf_normalize(cv_adata[i], binarize=True) 173 174 cv_adata[i] = calculate_z(cv_adata[i], n_features= 5000, 175 batches = batches, batch_size = batch_size) 176 177 names = ['Adata ' + str(i + 1) for i in range(len(cv_adata))] 178 cv_adata = multimodal_processing(adata, names, tfidf, 179 combination, batches, 180 batch_size) 181 182 183 # In train_model we index Z_train for balancing multiclass labels. We just recreate 184 # dummy indices here that are unused for use in the binary case 185 cv_adata.uns['train_indices'] = np.arange(0, len(fold_train)) 186 187 gc.collect() 188 189 for i, alpha in enumerate(alpha_array): 190 191 cv_adata = train_model(cv_adata, group_size, alpha = alpha) 192 _, metrics = predict(cv_adata, metrics = [metric]) 193 metric_array[i, fold] = metrics[metric] 194 195 # If metrics are decreasing, cv stopped and moving to next fold 196 end_fold = stop_early(metric_array, alpha_idx=i, fold_idx=fold) 197 if end_fold and early_stopping: 198 break 199 200 del cv_adata 201 gc.collect() 202 203 # Take AUROC mean across the k folds to find alpha yielding highest AUROC 204 alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))] 205 gc.collect() 206 207 return alpha_star 208 209 210def multi_optimize_alpha(adata: list[ad.AnnData], group_size: int, 211 tfidf: list[bool]=[False], 212 alpha_array: np.ndarray=default_alphas, k: int=4, 213 metric: str='AUROC', early_stopping: bool=False, 214 batches: int=10, batch_size: int=100, 215 force_balance: bool=True, combination: str='concatenate', 216 train_dict: dict=None): 217 """ 218 Wrapper function for running k-fold cross validation for every 219 label in a multiclass experiment. Called by 220 `scmkl.optimize_alpha()`. 221 222 Parameters 223 ---------- 224 adata : list[ad.AnnData] 225 List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 226 `adata.uns.keys()`. 227 228 group_size : None | int 229 Argument describing how the features are grouped. If `None`, 230 `2 * adata.uns['D']` will be used. For more information see 231 [celer documentation](https://mathurinm.github.io/celer/ 232 generated/celer.GroupLasso.html). 233 234 tfidf : list[bool] 235 If `False`, no data will be TF-IDF transformed. If 236 `type(adata) is list` and TF-IDF transformation is desired for 237 all or some of the data, a bool list corresponding to `adata` 238 must be provided. To simply TF-IDF transform `adata` when 239 `type(adata) is ad.AnnData`, use `True`. 240 241 alpha_array : np.ndarray 242 Array of all alpha values to be tested. 243 244 k : int 245 Number of folds to perform cross validation over. 246 247 metric : str 248 Which metric to use to optimize alpha. Options are `'AUROC'`, 249 `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`. 250 251 batches : int 252 The number of batches to use for the distance calculation. 253 This will average the result of `batches` distance calculations 254 of `batch_size` randomly sampled cells. More batches will converge 255 to population distance values at the cost of scalability. 256 257 batch_size : int 258 The number of cells to include per batch for distance 259 calculations. Higher batch size will converge to population 260 distance values at the cost of scalability. If 261 `batches*batch_size > num_training_cells`, `batch_size` will be 262 reduced to `int(num_training_cells/batches)`. 263 264 force_balance: bool 265 If `True`, training sets will be balanced to reduce class label 266 imbalance for each iteration. Defaults to `False`. 267 268 other_factor : float 269 The ratio of cells to sample for the other class for each 270 model. For example, if classifying B cells with 100 B cells in 271 training, if `other_factor=1`, 100 cells that are not B cells 272 will be trained on with the B cells. This will be done for each 273 fold for each class if `force_balance` is `True`. 274 275 combination: str 276 How should multiple views of data be combined. For more details 277 see ad.concat. 278 279 train_dict: dict 280 A `dict` where each key is a class label and values are are the 281 indices to be trained with for that class for class balance. 282 All values must be present in each adata.uns['train_indices']. 283 284 Returns 285 ------- 286 alpha_star : dict 287 A dictionary with keys being class labels and values being the 288 best performing alpha parameter for that class as a float. 289 """ 290 classes = np.unique(adata[0].obs['labels']) 291 orig_labels = adata[0].obs['labels'].to_numpy().copy() 292 orig_train = adata[0].uns['train_indices'].copy() 293 294 if train_dict: 295 train_idcs = train_dict 296 else: 297 if force_balance: 298 train_idcs = get_class_train(adata[0].uns['train_indices'], 299 adata[0].obs['labels'], 300 adata[0].uns['seed_obj']) 301 else: 302 train_idcs = {ct: adata[0].uns['train_indices'].copy() 303 for ct in classes} 304 305 opt_alpha_dict = dict() 306 307 for cl in classes: 308 temp_classes = orig_labels.copy() 309 temp_classes[temp_classes != cl] = 'other' 310 311 for i in range(len(adata)): 312 adata[i].obs['labels'] = temp_classes.copy() 313 adata[i].uns['train_indices'] = train_idcs[cl] 314 315 opt_alpha_dict[cl] = bin_optimize_alpha(adata, group_size, tfidf, 316 alpha_array, k, metric, 317 early_stopping, batches, 318 batch_size, combination) 319 320 # Global adata obj will be permanently changed if not reset 321 for i in range(len(adata)): 322 adata[i].obs['labels'] = orig_labels 323 adata[i].uns['train_indices'] = orig_train 324 325 return opt_alpha_dict 326 327 328def optimize_alpha(adata: ad.AnnData | list[ad.AnnData], 329 group_size: int | None=None, tfidf: None | list[bool]=None, 330 alpha_array: np.ndarray=default_alphas, k: int=4, 331 metric: str='AUROC', early_stopping: bool=False, 332 batches: int=10, batch_size: int=100, 333 combination: str='concatenate', force_balance: bool=True, 334 train_dict: dict=None): 335 """ 336 K-fold cross validation for optimizing alpha hyperparameter using 337 training indices. 338 339 Parameters 340 ---------- 341 adata : list[ad.AnnData] 342 List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 343 `adata.uns.keys()`. 344 345 group_size : None | int 346 Argument describing how the features are grouped. If `None`, 347 `2 * adata.uns['D']` will be used. For more information see 348 [celer documentation](https://mathurinm.github.io/celer/ 349 generated/celer.GroupLasso.html). 350 351 tfidf : list[bool] 352 If `False`, no data will be TF-IDF transformed. If 353 `type(adata) is list` and TF-IDF transformation is desired for 354 all or some of the data, a bool list corresponding to `adata` 355 must be provided. To simply TF-IDF transform `adata` when 356 `type(adata) is ad.AnnData`, use `True`. 357 358 alpha_array : np.ndarray 359 Array of all alpha values to be tested. 360 361 k : int 362 Number of folds to perform cross validation over. 363 364 metric : str 365 Which metric to use to optimize alpha. Options are `'AUROC'`, 366 `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`. 367 368 batches : int 369 The number of batches to use for the distance calculation. 370 This will average the result of `batches` distance calculations 371 of `batch_size` randomly sampled cells. More batches will converge 372 to population distance values at the cost of scalability. 373 374 batch_size : int 375 The number of cells to include per batch for distance 376 calculations. Higher batch size will converge to population 377 distance values at the cost of scalability. If 378 `batches*batch_size > num_training_cells`, `batch_size` will be 379 reduced to `int(num_training_cells/batches)`. 380 381 force_balance: bool 382 If `True`, training sets will be balanced to reduce class label 383 imbalance for each iteration. Defaults to `False`. 384 385 other_factor : float 386 The ratio of cells to sample for the other class for each 387 model. For example, if classifying B cells with 100 B cells in 388 training, if `other_factor=1`, 100 cells that are not B cells 389 will be trained on with the B cells. This will be done for each 390 fold for each class if `force_balance` is `True`. 391 392 combination: str 393 How should multiple views of data be combined. For more details 394 see ad.concat. 395 396 train_dict: dict 397 A `dict` where each key is a class label and values are are the 398 indices to be trained with for that class for class balance. 399 All values must be present in each adata.uns['train_indices']. 400 401 Returns 402 ------- 403 alpha_star : float | dict 404 If number of classes is more than 2, a dictionary with keys 405 being class labels and values being the best performing alpha 406 parameter for that class as a float. Else, a float for 407 comparing the two classes. 408 """ 409 # Need singe-view runs to be iterable 410 if isinstance(adata, ad.AnnData): 411 adata = [adata.copy()] 412 413 if isinstance(tfidf, type(None)): 414 tfidf = len(adata)*[False] 415 416 is_multi = 2 < len(set(adata[0].obs['labels'])) 417 418 if isinstance(group_size, type(None)): 419 group_size = 2*adata[0].uns['D'] 420 421 422 if is_multi: 423 alpha_star = multi_optimize_alpha(adata, group_size, tfidf, 424 alpha_array, k, metric, 425 early_stopping, batches, 426 batch_size, force_balance, 427 combination, train_dict) 428 429 else: 430 alpha_star = bin_optimize_alpha(adata, group_size, tfidf, 431 alpha_array, k, metric, 432 early_stopping, batches, 433 batch_size, combination) 434 435 return alpha_star
default_alphas =
array([1.9 , 1.69, 1.49, 1.28, 1.08, 0.87, 0.67, 0.46, 0.26, 0.05])
def
stop_early(metric_array, alpha_idx, fold_idx):
18def stop_early(metric_array, alpha_idx, fold_idx): 19 """ 20 Assumes smallest alpha comes first. 21 """ 22 # Must be at least two metrics from two alphas to compare 23 if alpha_idx <= 0: 24 return False 25 26 cur_met = metric_array[alpha_idx, fold_idx] 27 last_met = metric_array[alpha_idx - 1, fold_idx] 28 29 if cur_met < last_met: 30 return True 31 else: 32 return False
Assumes smallest alpha comes first.
def
sort_alphas(alpha_array: numpy.ndarray):
35def sort_alphas(alpha_array: np.ndarray): 36 """ 37 Sorts alphas from smallest to largest. 38 """ 39 order = np.argsort(alpha_array) 40 alpha_array = alpha_array[order] 41 42 return alpha_array
Sorts alphas from smallest to largest.
def
get_labels(adata: list | anndata._core.anndata.AnnData):
45def get_labels(adata: list | ad.AnnData): 46 """ 47 Copies labels and train indices from `ad.AnnData` object for k-fold 48 cross validation. Mostly present for readability. 49 """ 50 train_indices = adata[0].uns['train_indices'].copy() 51 y = adata[0].obs['labels'].iloc[train_indices].to_numpy() 52 53 return train_indices, y
Copies labels and train indices from ad.AnnData object for k-fold
cross validation. Mostly present for readability.
def
get_folds(y: <built-in function array>, k: int):
56def get_folds(y: np.array, k: int): 57 """ 58 With labels of training data and number of folds, returns the 59 indices and label for each k-folds. 60 """ 61 # Splits the labels evenly between folds 62 pos_idcs = np.where(y == np.unique(y)[0])[0] 63 neg_idcs = np.setdiff1d(np.arange(len(y)), pos_idcs) 64 65 pos_anno = np.arange(len(pos_idcs)) % k 66 neg_anno = np.arange(len(neg_idcs)) % k 67 68 return pos_idcs, neg_idcs, pos_anno, neg_anno
With labels of training data and number of folds, returns the indices and label for each k-folds.
def
bin_optimize_alpha( adata: list[anndata._core.anndata.AnnData], group_size: int | None = None, tfidf: list[bool] = False, alpha_array: numpy.ndarray = array([1.9 , 1.69, 1.49, 1.28, 1.08, 0.87, 0.67, 0.46, 0.26, 0.05]), k: int = 4, metric: str = 'AUROC', early_stopping: bool = False, batches: int = 10, batch_size: int = 100, combination: str = 'concatenate'):
71def bin_optimize_alpha(adata: list[ad.AnnData], 72 group_size: int | None=None, 73 tfidf: list[bool]=False, 74 alpha_array: np.ndarray=default_alphas, 75 k: int=4, metric: str='AUROC', 76 early_stopping: bool=False, 77 batches: int=10, batch_size: int=100, 78 combination: str='concatenate'): 79 """ 80 Iteratively train a grouplasso model and update alpha to find the 81 parameter yielding best performing sparsity via k-fold cross 82 validation. This function currently only works for binary 83 experiments. Called by `scmkl.optimize_alpha()`. 84 85 Parameters 86 ---------- 87 adata : list[ad.AnnData] 88 List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 89 `adata.uns.keys()`. 90 91 group_size : None | int 92 Argument describing how the features are grouped. If `None`, 93 `2 * adata.uns['D']` will be used. For more information see 94 [celer documentation](https://mathurinm.github.io/celer/ 95 generated/celer.GroupLasso.html). 96 97 tfidf : list | bool 98 If `False`, no data will be TF-IDF transformed. If 99 `type(adata) is list` and TF-IDF transformation is desired for 100 all or some of the data, a bool list corresponding to `adata` 101 must be provided. To simply TF-IDF transform `adata` when 102 `type(adata) is ad.AnnData`, use `True`. 103 104 alpha_array : np.ndarray 105 Array of all alpha values to be tested. 106 107 k : int 108 Number of folds to perform cross validation over. 109 110 metric : str 111 Which metric to use to optimize alpha. Options are `'AUROC'`, 112 `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`. 113 114 batches : int 115 The number of batches to use for the distance calculation. 116 This will average the result of `batches` distance calculations 117 of `batch_size` randomly sampled cells. More batches will converge 118 to population distance values at the cost of scalability. 119 120 batch_size : int 121 The number of cells to include per batch for distance 122 calculations. Higher batch size will converge to population 123 distance values at the cost of scalability. If 124 `batches*batch_size > num_training_cells`, `batch_size` will be 125 reduced to `int(num_training_cells/batches)`. 126 127 Returns 128 ------- 129 alpha_star : float 130 The best performing alpha value from cross validation on 131 training data. 132 133 Examples 134 -------- 135 >>> alpha_star = scmkl.optimize_alpha(adata) 136 >>> alpha_star 137 0.1 138 """ 139 assert isinstance(k, int) and k > 0, "'k' must be positive" 140 141 import warnings 142 warnings.filterwarnings('ignore') 143 144 alpha_array = sort_alphas(alpha_array) 145 146 # Need even folds for cross validation 147 train_indices, y = get_labels(adata) 148 pos_idcs, neg_idcs, pos_anno, neg_anno = get_folds(y, k) 149 150 metric_array = np.zeros((len(alpha_array), k)) 151 152 gc.collect() 153 154 for fold in np.arange(k): 155 cv_adata = [adata[i][train_indices, :] 156 for i in range(len(adata))] 157 158 for i in range(len(cv_adata)): 159 if 'sigma' in cv_adata[i].uns_keys(): 160 del cv_adata[i].uns['sigma'] 161 162 # Create CV train/test indices 163 fold_train = np.concatenate((pos_idcs[np.where(pos_anno != fold)[0]], 164 neg_idcs[np.where(neg_anno != fold)[0]])) 165 fold_test = np.concatenate((pos_idcs[np.where(pos_anno == fold)[0]], 166 neg_idcs[np.where(neg_anno == fold)[0]])) 167 168 169 for i in range(len(cv_adata)): 170 cv_adata[i].uns['train_indices'] = fold_train 171 cv_adata[i].uns['test_indices'] = fold_test 172 if tfidf[i]: 173 cv_adata[i] = tfidf_normalize(cv_adata[i], binarize=True) 174 175 cv_adata[i] = calculate_z(cv_adata[i], n_features= 5000, 176 batches = batches, batch_size = batch_size) 177 178 names = ['Adata ' + str(i + 1) for i in range(len(cv_adata))] 179 cv_adata = multimodal_processing(adata, names, tfidf, 180 combination, batches, 181 batch_size) 182 183 184 # In train_model we index Z_train for balancing multiclass labels. We just recreate 185 # dummy indices here that are unused for use in the binary case 186 cv_adata.uns['train_indices'] = np.arange(0, len(fold_train)) 187 188 gc.collect() 189 190 for i, alpha in enumerate(alpha_array): 191 192 cv_adata = train_model(cv_adata, group_size, alpha = alpha) 193 _, metrics = predict(cv_adata, metrics = [metric]) 194 metric_array[i, fold] = metrics[metric] 195 196 # If metrics are decreasing, cv stopped and moving to next fold 197 end_fold = stop_early(metric_array, alpha_idx=i, fold_idx=fold) 198 if end_fold and early_stopping: 199 break 200 201 del cv_adata 202 gc.collect() 203 204 # Take AUROC mean across the k folds to find alpha yielding highest AUROC 205 alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))] 206 gc.collect() 207 208 return alpha_star
Iteratively train a grouplasso model and update alpha to find the
parameter yielding best performing sparsity via k-fold cross
validation. This function currently only works for binary
experiments. Called by scmkl.optimize_alpha.
Parameters
- adata (list[ad.AnnData]):
List of
ad.AnnData(s) with'Z_train'and'Z_test'inadata.uns.keys(). - group_size (None | int):
Argument describing how the features are grouped. If
None,2 * adata.uns['D']will be used. For more information see celer documentation. - tfidf (list | bool):
If
False, no data will be TF-IDF transformed. Iftype(adata) is listand TF-IDF transformation is desired for all or some of the data, a bool list corresponding toadatamust be provided. To simply TF-IDF transformadatawhentype(adata) is ad.AnnData, useTrue. - alpha_array (np.ndarray): Array of all alpha values to be tested.
- k (int): Number of folds to perform cross validation over.
- metric (str):
Which metric to use to optimize alpha. Options are
'AUROC','Accuracy','F1-Score','Precision', and'Recall'. - batches (int):
The number of batches to use for the distance calculation.
This will average the result of
batchesdistance calculations ofbatch_sizerandomly sampled cells. More batches will converge to population distance values at the cost of scalability. - batch_size (int):
The number of cells to include per batch for distance
calculations. Higher batch size will converge to population
distance values at the cost of scalability. If
batches*batch_size > num_training_cells,batch_sizewill be reduced toint(num_training_cells/batches).
Returns
- alpha_star (float): The best performing alpha value from cross validation on training data.
Examples
>>> alpha_star = scmkl.optimize_alpha(adata)
>>> alpha_star
0.1
def
multi_optimize_alpha( adata: list[anndata._core.anndata.AnnData], group_size: int, tfidf: list[bool] = [False], alpha_array: numpy.ndarray = array([1.9 , 1.69, 1.49, 1.28, 1.08, 0.87, 0.67, 0.46, 0.26, 0.05]), k: int = 4, metric: str = 'AUROC', early_stopping: bool = False, batches: int = 10, batch_size: int = 100, force_balance: bool = True, combination: str = 'concatenate', train_dict: dict = None):
211def multi_optimize_alpha(adata: list[ad.AnnData], group_size: int, 212 tfidf: list[bool]=[False], 213 alpha_array: np.ndarray=default_alphas, k: int=4, 214 metric: str='AUROC', early_stopping: bool=False, 215 batches: int=10, batch_size: int=100, 216 force_balance: bool=True, combination: str='concatenate', 217 train_dict: dict=None): 218 """ 219 Wrapper function for running k-fold cross validation for every 220 label in a multiclass experiment. Called by 221 `scmkl.optimize_alpha()`. 222 223 Parameters 224 ---------- 225 adata : list[ad.AnnData] 226 List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 227 `adata.uns.keys()`. 228 229 group_size : None | int 230 Argument describing how the features are grouped. If `None`, 231 `2 * adata.uns['D']` will be used. For more information see 232 [celer documentation](https://mathurinm.github.io/celer/ 233 generated/celer.GroupLasso.html). 234 235 tfidf : list[bool] 236 If `False`, no data will be TF-IDF transformed. If 237 `type(adata) is list` and TF-IDF transformation is desired for 238 all or some of the data, a bool list corresponding to `adata` 239 must be provided. To simply TF-IDF transform `adata` when 240 `type(adata) is ad.AnnData`, use `True`. 241 242 alpha_array : np.ndarray 243 Array of all alpha values to be tested. 244 245 k : int 246 Number of folds to perform cross validation over. 247 248 metric : str 249 Which metric to use to optimize alpha. Options are `'AUROC'`, 250 `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`. 251 252 batches : int 253 The number of batches to use for the distance calculation. 254 This will average the result of `batches` distance calculations 255 of `batch_size` randomly sampled cells. More batches will converge 256 to population distance values at the cost of scalability. 257 258 batch_size : int 259 The number of cells to include per batch for distance 260 calculations. Higher batch size will converge to population 261 distance values at the cost of scalability. If 262 `batches*batch_size > num_training_cells`, `batch_size` will be 263 reduced to `int(num_training_cells/batches)`. 264 265 force_balance: bool 266 If `True`, training sets will be balanced to reduce class label 267 imbalance for each iteration. Defaults to `False`. 268 269 other_factor : float 270 The ratio of cells to sample for the other class for each 271 model. For example, if classifying B cells with 100 B cells in 272 training, if `other_factor=1`, 100 cells that are not B cells 273 will be trained on with the B cells. This will be done for each 274 fold for each class if `force_balance` is `True`. 275 276 combination: str 277 How should multiple views of data be combined. For more details 278 see ad.concat. 279 280 train_dict: dict 281 A `dict` where each key is a class label and values are are the 282 indices to be trained with for that class for class balance. 283 All values must be present in each adata.uns['train_indices']. 284 285 Returns 286 ------- 287 alpha_star : dict 288 A dictionary with keys being class labels and values being the 289 best performing alpha parameter for that class as a float. 290 """ 291 classes = np.unique(adata[0].obs['labels']) 292 orig_labels = adata[0].obs['labels'].to_numpy().copy() 293 orig_train = adata[0].uns['train_indices'].copy() 294 295 if train_dict: 296 train_idcs = train_dict 297 else: 298 if force_balance: 299 train_idcs = get_class_train(adata[0].uns['train_indices'], 300 adata[0].obs['labels'], 301 adata[0].uns['seed_obj']) 302 else: 303 train_idcs = {ct: adata[0].uns['train_indices'].copy() 304 for ct in classes} 305 306 opt_alpha_dict = dict() 307 308 for cl in classes: 309 temp_classes = orig_labels.copy() 310 temp_classes[temp_classes != cl] = 'other' 311 312 for i in range(len(adata)): 313 adata[i].obs['labels'] = temp_classes.copy() 314 adata[i].uns['train_indices'] = train_idcs[cl] 315 316 opt_alpha_dict[cl] = bin_optimize_alpha(adata, group_size, tfidf, 317 alpha_array, k, metric, 318 early_stopping, batches, 319 batch_size, combination) 320 321 # Global adata obj will be permanently changed if not reset 322 for i in range(len(adata)): 323 adata[i].obs['labels'] = orig_labels 324 adata[i].uns['train_indices'] = orig_train 325 326 return opt_alpha_dict
Wrapper function for running k-fold cross validation for every
label in a multiclass experiment. Called by
scmkl.optimize_alpha.
Parameters
- adata (list[ad.AnnData]):
List of
ad.AnnData(s) with'Z_train'and'Z_test'inadata.uns.keys(). - group_size (None | int):
Argument describing how the features are grouped. If
None,2 * adata.uns['D']will be used. For more information see celer documentation. - tfidf (list[bool]):
If
False, no data will be TF-IDF transformed. Iftype(adata) is listand TF-IDF transformation is desired for all or some of the data, a bool list corresponding toadatamust be provided. To simply TF-IDF transformadatawhentype(adata) is ad.AnnData, useTrue. - alpha_array (np.ndarray): Array of all alpha values to be tested.
- k (int): Number of folds to perform cross validation over.
- metric (str):
Which metric to use to optimize alpha. Options are
'AUROC','Accuracy','F1-Score','Precision', and'Recall'. - batches (int):
The number of batches to use for the distance calculation.
This will average the result of
batchesdistance calculations ofbatch_sizerandomly sampled cells. More batches will converge to population distance values at the cost of scalability. - batch_size (int):
The number of cells to include per batch for distance
calculations. Higher batch size will converge to population
distance values at the cost of scalability. If
batches*batch_size > num_training_cells,batch_sizewill be reduced toint(num_training_cells/batches). - force_balance (bool):
If
True, training sets will be balanced to reduce class label imbalance for each iteration. Defaults toFalse. - other_factor (float):
The ratio of cells to sample for the other class for each
model. For example, if classifying B cells with 100 B cells in
training, if
other_factor=1, 100 cells that are not B cells will be trained on with the B cells. This will be done for each fold for each class ifforce_balanceisTrue. - combination (str): How should multiple views of data be combined. For more details see ad.concat.
- train_dict (dict):
A
dictwhere each key is a class label and values are are the indices to be trained with for that class for class balance. All values must be present in each adata.uns['train_indices'].
Returns
- alpha_star (dict): A dictionary with keys being class labels and values being the best performing alpha parameter for that class as a float.
def
optimize_alpha( adata: anndata._core.anndata.AnnData | list[anndata._core.anndata.AnnData], group_size: int | None = None, tfidf: None | list[bool] = None, alpha_array: numpy.ndarray = array([1.9 , 1.69, 1.49, 1.28, 1.08, 0.87, 0.67, 0.46, 0.26, 0.05]), k: int = 4, metric: str = 'AUROC', early_stopping: bool = False, batches: int = 10, batch_size: int = 100, combination: str = 'concatenate', force_balance: bool = True, train_dict: dict = None):
329def optimize_alpha(adata: ad.AnnData | list[ad.AnnData], 330 group_size: int | None=None, tfidf: None | list[bool]=None, 331 alpha_array: np.ndarray=default_alphas, k: int=4, 332 metric: str='AUROC', early_stopping: bool=False, 333 batches: int=10, batch_size: int=100, 334 combination: str='concatenate', force_balance: bool=True, 335 train_dict: dict=None): 336 """ 337 K-fold cross validation for optimizing alpha hyperparameter using 338 training indices. 339 340 Parameters 341 ---------- 342 adata : list[ad.AnnData] 343 List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 344 `adata.uns.keys()`. 345 346 group_size : None | int 347 Argument describing how the features are grouped. If `None`, 348 `2 * adata.uns['D']` will be used. For more information see 349 [celer documentation](https://mathurinm.github.io/celer/ 350 generated/celer.GroupLasso.html). 351 352 tfidf : list[bool] 353 If `False`, no data will be TF-IDF transformed. If 354 `type(adata) is list` and TF-IDF transformation is desired for 355 all or some of the data, a bool list corresponding to `adata` 356 must be provided. To simply TF-IDF transform `adata` when 357 `type(adata) is ad.AnnData`, use `True`. 358 359 alpha_array : np.ndarray 360 Array of all alpha values to be tested. 361 362 k : int 363 Number of folds to perform cross validation over. 364 365 metric : str 366 Which metric to use to optimize alpha. Options are `'AUROC'`, 367 `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`. 368 369 batches : int 370 The number of batches to use for the distance calculation. 371 This will average the result of `batches` distance calculations 372 of `batch_size` randomly sampled cells. More batches will converge 373 to population distance values at the cost of scalability. 374 375 batch_size : int 376 The number of cells to include per batch for distance 377 calculations. Higher batch size will converge to population 378 distance values at the cost of scalability. If 379 `batches*batch_size > num_training_cells`, `batch_size` will be 380 reduced to `int(num_training_cells/batches)`. 381 382 force_balance: bool 383 If `True`, training sets will be balanced to reduce class label 384 imbalance for each iteration. Defaults to `False`. 385 386 other_factor : float 387 The ratio of cells to sample for the other class for each 388 model. For example, if classifying B cells with 100 B cells in 389 training, if `other_factor=1`, 100 cells that are not B cells 390 will be trained on with the B cells. This will be done for each 391 fold for each class if `force_balance` is `True`. 392 393 combination: str 394 How should multiple views of data be combined. For more details 395 see ad.concat. 396 397 train_dict: dict 398 A `dict` where each key is a class label and values are are the 399 indices to be trained with for that class for class balance. 400 All values must be present in each adata.uns['train_indices']. 401 402 Returns 403 ------- 404 alpha_star : float | dict 405 If number of classes is more than 2, a dictionary with keys 406 being class labels and values being the best performing alpha 407 parameter for that class as a float. Else, a float for 408 comparing the two classes. 409 """ 410 # Need singe-view runs to be iterable 411 if isinstance(adata, ad.AnnData): 412 adata = [adata.copy()] 413 414 if isinstance(tfidf, type(None)): 415 tfidf = len(adata)*[False] 416 417 is_multi = 2 < len(set(adata[0].obs['labels'])) 418 419 if isinstance(group_size, type(None)): 420 group_size = 2*adata[0].uns['D'] 421 422 423 if is_multi: 424 alpha_star = multi_optimize_alpha(adata, group_size, tfidf, 425 alpha_array, k, metric, 426 early_stopping, batches, 427 batch_size, force_balance, 428 combination, train_dict) 429 430 else: 431 alpha_star = bin_optimize_alpha(adata, group_size, tfidf, 432 alpha_array, k, metric, 433 early_stopping, batches, 434 batch_size, combination) 435 436 return alpha_star
K-fold cross validation for optimizing alpha hyperparameter using training indices.
Parameters
- adata (list[ad.AnnData]):
List of
ad.AnnData(s) with'Z_train'and'Z_test'inadata.uns.keys(). - group_size (None | int):
Argument describing how the features are grouped. If
None,2 * adata.uns['D']will be used. For more information see celer documentation. - tfidf (list[bool]):
If
False, no data will be TF-IDF transformed. Iftype(adata) is listand TF-IDF transformation is desired for all or some of the data, a bool list corresponding toadatamust be provided. To simply TF-IDF transformadatawhentype(adata) is ad.AnnData, useTrue. - alpha_array (np.ndarray): Array of all alpha values to be tested.
- k (int): Number of folds to perform cross validation over.
- metric (str):
Which metric to use to optimize alpha. Options are
'AUROC','Accuracy','F1-Score','Precision', and'Recall'. - batches (int):
The number of batches to use for the distance calculation.
This will average the result of
batchesdistance calculations ofbatch_sizerandomly sampled cells. More batches will converge to population distance values at the cost of scalability. - batch_size (int):
The number of cells to include per batch for distance
calculations. Higher batch size will converge to population
distance values at the cost of scalability. If
batches*batch_size > num_training_cells,batch_sizewill be reduced toint(num_training_cells/batches). - force_balance (bool):
If
True, training sets will be balanced to reduce class label imbalance for each iteration. Defaults toFalse. - other_factor (float):
The ratio of cells to sample for the other class for each
model. For example, if classifying B cells with 100 B cells in
training, if
other_factor=1, 100 cells that are not B cells will be trained on with the B cells. This will be done for each fold for each class ifforce_balanceisTrue. - combination (str): How should multiple views of data be combined. For more details see ad.concat.
- train_dict (dict):
A
dictwhere each key is a class label and values are are the indices to be trained with for that class for class balance. All values must be present in each adata.uns['train_indices'].
Returns
- alpha_star (float | dict): If number of classes is more than 2, a dictionary with keys being class labels and values being the best performing alpha parameter for that class as a float. Else, a float for comparing the two classes.